Skip to content

Commit 8aed616

Browse files
Small code cleanup
1 parent a47dbaf commit 8aed616

File tree

7 files changed

+118
-219
lines changed

7 files changed

+118
-219
lines changed

samples/python/text_generation/speculative_decoding_lm.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,15 @@ def main():
2121
# User can run main and draft model on different devices.
2222
# Please, set device for main model in `openvino_genai.LLMPipeline` constructor and in `openvino_genai.draft_model` for draft.
2323
# CPU, GPU and NPU can be used. For NPU, the preferred configuration is when both the main and draft models use NPU.
24-
main_device = 'NPU'
25-
draft_device = 'NPU'
24+
main_device = 'CPU'
25+
draft_device = 'CPU'
2626

2727
draft_model = openvino_genai.draft_model(args.draft_model_dir, draft_device)
2828

2929
pipe = openvino_genai.LLMPipeline(args.model_dir, main_device, draft_model=draft_model)
3030

3131
config = openvino_genai.GenerationConfig()
32-
config.max_new_tokens = 20
32+
config.max_new_tokens = 100
3333
# Speculative decoding generation parameters like `num_assistant_tokens` and `assistant_confidence_threshold` are mutually exclusive.
3434
# Add parameter to enable speculative decoding to generate `num_assistant_tokens` candidates by draft_model per iteration.
3535
# NOTE: ContinuousBatching backend uses `num_assistant_tokens` as is. Stateful backend uses `num_assistant_tokens`'s copy as initial

src/cpp/src/continuous_batching/pipeline.cpp

Lines changed: 0 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -22,43 +22,6 @@
2222
using namespace ov::genai;
2323

2424
namespace {
25-
struct Eagle3RTInfo {
26-
bool eagle3_mode = false;
27-
std::vector<int> hidden_layers_list;
28-
std::filesystem::path dt_mapping_table;
29-
};
30-
31-
Eagle3RTInfo
32-
extract_eagle_mode_from_config(ov::AnyMap& config, const std::filesystem::path& models_path) {
33-
Eagle3RTInfo eagle_rt_info;
34-
if (config.find("eagle3_mode") != config.end()) {
35-
eagle_rt_info.eagle3_mode = config.at("eagle3_mode").as<bool>();
36-
config.erase("eagle3_mode");
37-
if (config.find("hidden_layers_list") != config.end()) {
38-
eagle_rt_info.hidden_layers_list = config.at("hidden_layers_list").as<std::vector<int>>();
39-
config.erase("hidden_layers_list");
40-
} else {
41-
// compute the layers from number of hidden layers
42-
auto config_file_path = models_path / "config.json";
43-
if (!std::filesystem::exists(config_file_path))
44-
OPENVINO_THROW("cannot deduce layers for hidden layer extraction");
45-
std::ifstream file(config_file_path);
46-
47-
nlohmann::json data = nlohmann::json::parse(file);
48-
using ov::genai::utils::read_json_param;
49-
int num_decoder_layers = 0;
50-
read_json_param(data, "num_hidden_layers", num_decoder_layers);
51-
OPENVINO_ASSERT(num_decoder_layers > 3, "num_decoder_layers is too small to deduce hidden layers for extraction");
52-
// The following default hidden layer selection corresponds to the EAGLE reference implementation:
53-
// https://github.com/SafeAILab/EAGLE/blob/0ea94696/eagle/model/modeling_llama_kv.py#L1138
54-
// These layers (2, num_decoder_layers / 2, num_decoder_layers - 3) are chosen to capture features from
55-
// early, middle, and late stages of the decoder, as recommended by the EAGLE authors.
56-
// If you wish to use different layers, provide the "hidden_layers_list" parameter in the config.
57-
eagle_rt_info.hidden_layers_list = { 2, num_decoder_layers / 2, num_decoder_layers - 3 };
58-
}
59-
}
60-
return eagle_rt_info;
61-
}
6225

6326
bool
6427
extract_prompt_lookup_from_config(ov::AnyMap& config) {

src/cpp/src/speculative_decoding/speculative_decoding_eagle3_impl.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919

2020
namespace ov::genai {
2121

22-
// Forward declarations for Eagle3 transformation functions
2322
void share_embedding_weights(std::shared_ptr<ov::Model>& main_model, std::shared_ptr<ov::Model>& draft_model);
2423
void extract_hidden_state_generic(std::shared_ptr<ov::Model>& model, const std::vector<int>& hidden_layers_to_abstract, const std::string& device = "");
2524
std::shared_ptr<ov::op::v0::Constant> extract_d2t_mapping_table(const std::shared_ptr<ov::Model>& model);

src/cpp/src/speculative_decoding/speculative_decoding_stateful.cpp

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// SPDX-License-Identifier: Apache-2.0
33

44
#include "speculative_decoding_stateful.hpp"
5+
#include "speculative_decoding_utils.hpp"
56
#include "continuous_batching/timer.hpp"
67
#include "openvino/runtime/core.hpp"
78
#include "openvino/core/parallel.hpp"
@@ -42,18 +43,6 @@ void update_perf_stat_by_infer_duration(ov::genai::RawPerfMetrics& raw_perf_coun
4243
raw_perf_counters.m_batch_sizes.emplace_back(num_generated_tokens);
4344
}
4445

45-
void ensure_num_assistant_tokens_is_set(ov::genai::GenerationConfig& generation_config) {
46-
auto assistant_confidence_threshold = generation_config.assistant_confidence_threshold;
47-
OPENVINO_ASSERT(assistant_confidence_threshold == 0.f,
48-
"Stateful (non Continuous Batching) Speculative Decoding pipeline only supports `num_assistant_tokens` "
49-
"as parameter in GenerationConfig and doesn't work with `assistant_confidence_threshold`.\nPlease "
50-
"remove its specification or set it to 0.f.");
51-
52-
constexpr std::size_t default_num_assistant_tokens = 5;
53-
if (generation_config.num_assistant_tokens == 0) {
54-
generation_config.num_assistant_tokens = default_num_assistant_tokens;
55-
}
56-
}
5746
}// anonymous namespace
5847

5948
namespace ov {
@@ -392,7 +381,7 @@ StatefulSpeculativeLLMPipeline::StatefulSpeculativeLLMPipeline(
392381
OPENVINO_ASSERT(m_draft_request != nullptr, "Failed to create draft model inference wrapper");
393382

394383
// Specifying number candidates to generate
395-
ensure_num_assistant_tokens_is_set(m_generation_config);
384+
ov::genai::speculative_decoding::ensure_num_assistant_tokens_is_set(m_generation_config);
396385
m_candidates_num = m_generation_config.num_assistant_tokens;
397386
// We set the upper limit for candidates number as two times the number requested
398387
// by user.
@@ -412,7 +401,7 @@ StatefulSpeculativeLLMPipeline::StatefulSpeculativeLLMPipeline(
412401
GenerationConfig StatefulSpeculativeLLMPipeline::resolve_generation_config(OptionalGenerationConfig generation_config) {
413402
GenerationConfig config = generation_config.value_or(m_generation_config);
414403

415-
ensure_num_assistant_tokens_is_set(config);
404+
ov::genai::speculative_decoding::ensure_num_assistant_tokens_is_set(config);
416405
m_candidates_num = config.num_assistant_tokens;
417406
// We set the upper limit for candidates number as two times the number
418407
// requested by user.

0 commit comments

Comments
 (0)