Skip to content

Commit 8aed616

Browse files
Small code cleanup
1 parent a47dbaf commit 8aed616

File tree

7 files changed

+118
-219
lines changed

7 files changed

+118
-219
lines changed

samples/python/text_generation/speculative_decoding_lm.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,15 @@ def main():
2121
# User can run main and draft model on different devices.
2222
# Please, set device for main model in `openvino_genai.LLMPipeline` constructor and in `openvino_genai.draft_model` for draft.
2323
# CPU, GPU and NPU can be used. For NPU, the preferred configuration is when both the main and draft models use NPU.
24-
main_device = 'NPU'
25-
draft_device = 'NPU'
24+
main_device = 'CPU'
25+
draft_device = 'CPU'
2626

2727
draft_model = openvino_genai.draft_model(args.draft_model_dir, draft_device)
2828

2929
pipe = openvino_genai.LLMPipeline(args.model_dir, main_device, draft_model=draft_model)
3030

3131
config = openvino_genai.GenerationConfig()
32-
config.max_new_tokens = 20
32+
config.max_new_tokens = 100
3333
# Speculative decoding generation parameters like `num_assistant_tokens` and `assistant_confidence_threshold` are mutually exclusive.
3434
# Add parameter to enable speculative decoding to generate `num_assistant_tokens` candidates by draft_model per iteration.
3535
# NOTE: ContinuousBatching backend uses `num_assistant_tokens` as is. Stateful backend uses `num_assistant_tokens`'s copy as initial

src/cpp/src/continuous_batching/pipeline.cpp

Lines changed: 0 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -22,43 +22,6 @@
2222
using namespace ov::genai;
2323

2424
namespace {
25-
struct Eagle3RTInfo {
26-
bool eagle3_mode = false;
27-
std::vector<int> hidden_layers_list;
28-
std::filesystem::path dt_mapping_table;
29-
};
30-
31-
Eagle3RTInfo
32-
extract_eagle_mode_from_config(ov::AnyMap& config, const std::filesystem::path& models_path) {
33-
Eagle3RTInfo eagle_rt_info;
34-
if (config.find("eagle3_mode") != config.end()) {
35-
eagle_rt_info.eagle3_mode = config.at("eagle3_mode").as<bool>();
36-
config.erase("eagle3_mode");
37-
if (config.find("hidden_layers_list") != config.end()) {
38-
eagle_rt_info.hidden_layers_list = config.at("hidden_layers_list").as<std::vector<int>>();
39-
config.erase("hidden_layers_list");
40-
} else {
41-
// compute the layers from number of hidden layers
42-
auto config_file_path = models_path / "config.json";
43-
if (!std::filesystem::exists(config_file_path))
44-
OPENVINO_THROW("cannot deduce layers for hidden layer extraction");
45-
std::ifstream file(config_file_path);
46-
47-
nlohmann::json data = nlohmann::json::parse(file);
48-
using ov::genai::utils::read_json_param;
49-
int num_decoder_layers = 0;
50-
read_json_param(data, "num_hidden_layers", num_decoder_layers);
51-
OPENVINO_ASSERT(num_decoder_layers > 3, "num_decoder_layers is too small to deduce hidden layers for extraction");
52-
// The following default hidden layer selection corresponds to the EAGLE reference implementation:
53-
// https://github.com/SafeAILab/EAGLE/blob/0ea94696/eagle/model/modeling_llama_kv.py#L1138
54-
// These layers (2, num_decoder_layers / 2, num_decoder_layers - 3) are chosen to capture features from
55-
// early, middle, and late stages of the decoder, as recommended by the EAGLE authors.
56-
// If you wish to use different layers, provide the "hidden_layers_list" parameter in the config.
57-
eagle_rt_info.hidden_layers_list = { 2, num_decoder_layers / 2, num_decoder_layers - 3 };
58-
}
59-
}
60-
return eagle_rt_info;
61-
}
6225

6326
bool
6427
extract_prompt_lookup_from_config(ov::AnyMap& config) {

src/cpp/src/speculative_decoding/speculative_decoding_eagle3_impl.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919

2020
namespace ov::genai {
2121

22-
// Forward declarations for Eagle3 transformation functions
2322
void share_embedding_weights(std::shared_ptr<ov::Model>& main_model, std::shared_ptr<ov::Model>& draft_model);
2423
void extract_hidden_state_generic(std::shared_ptr<ov::Model>& model, const std::vector<int>& hidden_layers_to_abstract, const std::string& device = "");
2524
std::shared_ptr<ov::op::v0::Constant> extract_d2t_mapping_table(const std::shared_ptr<ov::Model>& model);

src/cpp/src/speculative_decoding/speculative_decoding_stateful.cpp

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// SPDX-License-Identifier: Apache-2.0
33

44
#include "speculative_decoding_stateful.hpp"
5+
#include "speculative_decoding_utils.hpp"
56
#include "continuous_batching/timer.hpp"
67
#include "openvino/runtime/core.hpp"
78
#include "openvino/core/parallel.hpp"
@@ -42,18 +43,6 @@ void update_perf_stat_by_infer_duration(ov::genai::RawPerfMetrics& raw_perf_coun
4243
raw_perf_counters.m_batch_sizes.emplace_back(num_generated_tokens);
4344
}
4445

45-
void ensure_num_assistant_tokens_is_set(ov::genai::GenerationConfig& generation_config) {
46-
auto assistant_confidence_threshold = generation_config.assistant_confidence_threshold;
47-
OPENVINO_ASSERT(assistant_confidence_threshold == 0.f,
48-
"Stateful (non Continuous Batching) Speculative Decoding pipeline only supports `num_assistant_tokens` "
49-
"as parameter in GenerationConfig and doesn't work with `assistant_confidence_threshold`.\nPlease "
50-
"remove its specification or set it to 0.f.");
51-
52-
constexpr std::size_t default_num_assistant_tokens = 5;
53-
if (generation_config.num_assistant_tokens == 0) {
54-
generation_config.num_assistant_tokens = default_num_assistant_tokens;
55-
}
56-
}
5746
}// anonymous namespace
5847

5948
namespace ov {
@@ -392,7 +381,7 @@ StatefulSpeculativeLLMPipeline::StatefulSpeculativeLLMPipeline(
392381
OPENVINO_ASSERT(m_draft_request != nullptr, "Failed to create draft model inference wrapper");
393382

394383
// Specifying number candidates to generate
395-
ensure_num_assistant_tokens_is_set(m_generation_config);
384+
ov::genai::speculative_decoding::ensure_num_assistant_tokens_is_set(m_generation_config);
396385
m_candidates_num = m_generation_config.num_assistant_tokens;
397386
// We set the upper limit for candidates number as two times the number requested
398387
// by user.
@@ -412,7 +401,7 @@ StatefulSpeculativeLLMPipeline::StatefulSpeculativeLLMPipeline(
412401
GenerationConfig StatefulSpeculativeLLMPipeline::resolve_generation_config(OptionalGenerationConfig generation_config) {
413402
GenerationConfig config = generation_config.value_or(m_generation_config);
414403

415-
ensure_num_assistant_tokens_is_set(config);
404+
ov::genai::speculative_decoding::ensure_num_assistant_tokens_is_set(config);
416405
m_candidates_num = config.num_assistant_tokens;
417406
// We set the upper limit for candidates number as two times the number
418407
// requested by user.

0 commit comments

Comments
 (0)