
Commit a47dbaf

Support eagle 3 top 1 for NPU
1 parent 257978f commit a47dbaf

File tree: 9 files changed (+1966, -26 lines)

samples/python/text_generation/speculative_decoding_lm.py
Lines changed: 3 additions & 3 deletions

@@ -21,15 +21,15 @@ def main():
     # User can run main and draft model on different devices.
     # Please, set device for main model in `openvino_genai.LLMPipeline` constructor and in `openvino_genai.draft_model` for draft.
     # CPU, GPU and NPU can be used. For NPU, the preferred configuration is when both the main and draft models use NPU.
-    main_device = 'CPU'
-    draft_device = 'CPU'
+    main_device = 'NPU'
+    draft_device = 'NPU'
 
     draft_model = openvino_genai.draft_model(args.draft_model_dir, draft_device)
 
     pipe = openvino_genai.LLMPipeline(args.model_dir, main_device, draft_model=draft_model)
 
     config = openvino_genai.GenerationConfig()
-    config.max_new_tokens = 100
+    config.max_new_tokens = 20
     # Speculative decoding generation parameters like `num_assistant_tokens` and `assistant_confidence_threshold` are mutually excluded.
     # Add parameter to enable speculative decoding to generate `num_assistant_tokens` candidates by draft_model per iteration.
     # NOTE: ContinuousBatching backend uses `num_assistant_tokens` as is. Stateful backend uses `num_assistant_tokens`'s copy as initial
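
Note: the C++ text_generation sample follows the same pattern. For reference, a minimal sketch of the equivalent flow through the public ov::genai C++ API; the model paths and prompt are placeholders, not part of this commit:

```cpp
// Minimal sketch, not from this commit: speculative decoding with both models
// on NPU via the public ov::genai C++ API. Paths and prompt are placeholders.
#include <iostream>
#include <string>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    const std::string main_device = "NPU";   // preferred NPU setup: main and
    const std::string draft_device = "NPU";  // draft model both on NPU

    ov::genai::LLMPipeline pipe(
        "main_model_dir",                    // placeholder model path
        main_device,
        ov::genai::draft_model("draft_model_dir", draft_device));

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 20;
    config.num_assistant_tokens = 5;  // draft proposes 5 candidate tokens per step

    std::cout << pipe.generate("What is OpenVINO?", config) << std::endl;
    return 0;
}
```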

src/cpp/src/continuous_batching/pipeline.cpp
Lines changed: 4 additions & 3 deletions

@@ -12,6 +12,7 @@
 #include "continuous_batching/pipeline_impl.hpp"
 #include "speculative_decoding/speculative_decoding_impl.hpp"
 #include "speculative_decoding/speculative_decoding_eagle3_impl.hpp"
+#include "speculative_decoding/speculative_decoding_utils.hpp"
 #include "prompt_lookup/prompt_lookup_impl.hpp"
 #include "continuous_batching/timer.hpp"
 #include "utils.hpp"
@@ -85,7 +86,7 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::p
     auto properties_without_draft_model = properties;
     auto draft_model_desr = utils::extract_draft_model_from_config(properties_without_draft_model);
     auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
-    auto eagle_rt_info = extract_eagle_mode_from_config(draft_model_desr.properties, models_path);
+    auto eagle_rt_info = speculative_decoding::extract_eagle_mode_from_config(draft_model_desr.properties, models_path);
 
     auto model = utils::read_model(models_path, properties);
     auto [properties_without_draft_model_without_gguf, enable_save_ov_model] = utils::extract_gguf_properties(properties_without_draft_model);
@@ -132,7 +133,7 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
     auto properties_without_draft_model = properties;
     auto draft_model_desr = utils::extract_draft_model_from_config(properties_without_draft_model);
     auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
-    auto eagle_rt_info = extract_eagle_mode_from_config(draft_model_desr.properties, models_path);
+    auto eagle_rt_info = speculative_decoding::extract_eagle_mode_from_config(draft_model_desr.properties, models_path);
     auto model = utils::read_model(models_path, properties_without_draft_model);
     auto [properties_without_draft_model_without_gguf, enable_save_ov_model] = utils::extract_gguf_properties(properties_without_draft_model);
     properties_without_draft_model_without_gguf[ov::cache_model_path.name()] = models_path;
@@ -182,7 +183,7 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
     auto properties_without_draft_model = properties;
     auto draft_model_desr = utils::extract_draft_model_from_config(properties_without_draft_model);
     auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
-    auto eagle_rt_info = extract_eagle_mode_from_config(draft_model_desr.properties, std::filesystem::path(model_str));
+    auto eagle_rt_info = speculative_decoding::extract_eagle_mode_from_config(draft_model_desr.properties, std::filesystem::path(model_str));
     auto model = utils::singleton_core().read_model(model_str, weights_tensor);
 
     auto rt_info = model->get_rt_info();
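
All three call sites now reach the helper through the speculative_decoding namespace. Its definition lives in speculative_decoding_utils.hpp, which is not shown in this diff; below is a hypothetical sketch of its interface, inferred purely from the call sites here and in llm/pipeline.cpp (the struct name EagleRTInfo is made up for illustration):

```cpp
// Hypothetical sketch inferred from the call sites; the actual declaration is
// in speculative_decoding/speculative_decoding_utils.hpp (not in this diff).
#include <filesystem>
#include <vector>
#include "openvino/core/any.hpp"

namespace ov::genai::speculative_decoding {

struct EagleRTInfo {
    bool eagle3_mode = false;             // set when the draft model requests EAGLE-3 decoding
    std::vector<int> hidden_layers_list;  // main-model layers whose hidden states are exported
};

// Reads EAGLE-related entries from the draft model's properties; models_path
// lets hidden_layers_list be auto-deduced from <models_path>/config.json.
EagleRTInfo extract_eagle_mode_from_config(ov::AnyMap& properties,
                                           const std::filesystem::path& models_path);

}  // namespace ov::genai::speculative_decoding
```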

src/cpp/src/llm/pipeline.cpp
Lines changed: 37 additions & 8 deletions

@@ -13,6 +13,8 @@
 #include "llm/pipeline_continuous_batching_adapter.hpp"
 #include "speculative_decoding/speculative_decoding_impl.hpp"
 #include "speculative_decoding/speculative_decoding_stateful.hpp"
+#include "speculative_decoding/speculative_decoding_stateful_eagle3.hpp"
+#include "speculative_decoding/speculative_decoding_utils.hpp"
 #include "utils.hpp"
 
 namespace {
@@ -142,7 +144,8 @@ static std::unique_ptr<LLMPipelineImplBase> create(
         tokenizer,
         device,
         properties,
-        utils::from_config_json_if_exists(models_path));
+        utils::from_config_json_if_exists(models_path),
+        models_path);
 }
 
 static std::unique_ptr<LLMPipelineImplBase> create(
@@ -157,17 +160,43 @@ static std::unique_ptr<LLMPipelineImplBase> create(
     const ov::genai::Tokenizer& tokenizer,
     const std::string& device,
     const ov::AnyMap& properties,
-    const ov::genai::GenerationConfig& generation_config) {
+    const ov::genai::GenerationConfig& generation_config,
+    const std::filesystem::path& models_path = {}) {
 
     auto properties_without_draft_model = properties;
     auto draft_model_descr = ov::genai::utils::extract_draft_model_from_config(properties_without_draft_model);
+
     if (draft_model_descr.model != nullptr) {
-        // FIXME: Add support for StatefulSpeculativeLLMPipeline for non-NPU devices for both models.
-        OPENVINO_ASSERT(device == "NPU" || draft_model_descr.device == "NPU",
-                        "Stateful Speculative Decoding is expected to be launched when NPU is requested as "
-                        "execution device for one or both models.");
-        auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device, properties_without_draft_model, {}, generation_config);
-        return std::make_unique<StatefulSpeculativeLLMPipeline>(main_model_descr, draft_model_descr);
+        // Extract Eagle3 configuration from draft model properties
+        // Pass models_path for auto-deducing hidden_layers_list from config.json
+        auto eagle_rt_info = ov::genai::speculative_decoding::extract_eagle_mode_from_config(
+            draft_model_descr.properties,
+            models_path
+        );
+
+        if (eagle_rt_info.eagle3_mode) {
+            // Eagle3 Speculative Decoding mode
+            OPENVINO_ASSERT(device == "NPU" || draft_model_descr.device == "NPU",
+                            "Stateful Eagle3 Speculative Decoding is expected to be launched when NPU is requested as "
+                            "execution device for one or both models.");
+
+            auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device,
+                                                         properties_without_draft_model, {}, generation_config);
+            return std::make_unique<StatefulEagle3LLMPipeline>(
+                main_model_descr,
+                draft_model_descr,
+                eagle_rt_info.hidden_layers_list
+            );
+        } else {
+            // Standard Speculative Decoding mode
+            OPENVINO_ASSERT(device == "NPU" || draft_model_descr.device == "NPU",
+                            "Stateful Speculative Decoding is expected to be launched when NPU is requested as "
+                            "execution device for one or both models.");
+
+            auto main_model_descr = ov::genai::ModelDesc(model, tokenizer, device,
+                                                         properties_without_draft_model, {}, generation_config);
+            return std::make_unique<StatefulSpeculativeLLMPipeline>(main_model_descr, draft_model_descr);
+        }
    }
 
     return std::make_unique<StatefulLLMPipeline>(model, tokenizer, device,
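
The Eagle3 branch constructs a new pipeline type declared in speculative_decoding_stateful_eagle3.hpp, which is not shown in this diff. A hypothetical declaration, inferred only from the make_unique call site and the factory's return type:

```cpp
// Hypothetical: shape inferred from the call site above and from create()
// returning std::unique_ptr<LLMPipelineImplBase>; see the real header for details.
class StatefulEagle3LLMPipeline : public LLMPipelineImplBase {
public:
    StatefulEagle3LLMPipeline(const ov::genai::ModelDesc& main_model_descr,
                              const ov::genai::ModelDesc& draft_model_descr,
                              const std::vector<int>& hidden_layers_list);
};
```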

src/cpp/src/speculative_decoding/speculative_decoding_eagle3_impl.cpp
Lines changed: 74 additions & 10 deletions

@@ -52,7 +52,7 @@ void share_embedding_weights(std::shared_ptr<ov::Model>& main_model, std::shared
     }
 }
 
-std::shared_ptr<ov::op::v0::Constant> extract_d2t_mapping_table(std::shared_ptr<ov::Model>& model) {
+std::shared_ptr<ov::op::v0::Constant> extract_d2t_mapping_table(const std::shared_ptr<ov::Model>& model) {
     // extract result nodes from model
     for (const auto& result : model->get_results()) {
         auto input_node = result->input_value(0).get_node_shared_ptr();
@@ -62,14 +62,36 @@ std::shared_ptr<ov::op::v0::Constant> extract_d2t_mapping_table(std::shared_ptr<
     }
     return nullptr;
 }
+
+void remove_d2t_result_node(std::shared_ptr<ov::Model>& model) {
+    // Find and remove the d2t Result node
+    std::shared_ptr<ov::op::v0::Result> d2t_result_to_remove = nullptr;
+
+    for (const auto& result : model->get_results()) {
+        auto input_node = result->input_value(0).get_node_shared_ptr();
+        if (ov::is_type<ov::op::v0::Constant>(input_node) &&
+            input_node->get_friendly_name().find("d2t") != std::string::npos) {
+            d2t_result_to_remove = result;
+            break;
+        }
+    }
+
+    if (d2t_result_to_remove) {
+        model->remove_result(d2t_result_to_remove);
+        model->validate_nodes_and_infer_types();
+    }
+}
+
 void extract_hidden_state_generic(std::shared_ptr<ov::Model>& model,
-                                  const std::vector<int>& hidden_layers_to_abstract) {
+                                  const std::vector<int>& hidden_layers_to_abstract,
+                                  const std::string& device) {
     ov::pass::Manager pm;
-    pm.register_pass<EagleModelTransform>(hidden_layers_to_abstract);
+    pm.register_pass<EagleModelTransform>(hidden_layers_to_abstract, device);
     pm.run_passes(model);
 }
 
-EagleModelTransform::EagleModelTransform(const std::vector<int>& layers) : m_layer_ids(layers) {
+EagleModelTransform::EagleModelTransform(const std::vector<int>& layers, const std::string& device)
+    : m_layer_ids(layers), m_device(device) {
 }
 
 bool EagleModelTransform::run_on_model(const std::shared_ptr<ov::Model>& model) {
@@ -82,7 +104,7 @@ bool EagleModelTransform::run_on_model(const std::shared_ptr<ov::Model>& model)
     manager.register_pass<EagleBaseTransform>(m_new_results);
     // input transform for draft
     // here we apply a trick for the fc layer in draft model
-    manager.register_pass<EagleInputTransform>(m_new_parameters);
+    manager.register_pass<EagleInputTransform>(m_new_parameters, m_device);
     manager.run_passes(model);
 
     model->add_parameters(m_new_parameters);
@@ -109,7 +131,8 @@ bool EagleModelTransform::run_on_model(const std::shared_ptr<ov::Model>& model)
     return false;
 }
 
-EagleInputTransform::EagleInputTransform(std::vector<std::shared_ptr<v0::Parameter>>& params) {
+EagleInputTransform::EagleInputTransform(std::vector<std::shared_ptr<v0::Parameter>>& params, const std::string& device)
+    : m_device(device) {
     register_matcher(
         std::make_shared<ov::pass::pattern::Matcher>(ov::pass::pattern::wrap_type<v0::MatMul>(), this->get_type_info().name),
         ([&params, this](ov::pass::pattern::Matcher& m) {
@@ -126,6 +149,7 @@ EagleInputTransform::EagleInputTransform(std::vector<std::shared_ptr<v0::Paramet
         })
     );
 }
+
 bool EagleInputTransform::apply(NodePtr node, std::vector<std::shared_ptr<v0::Parameter>>& params) {
     if (ov::is_type<v0::MatMul>(node)) {
         auto matmul_node = ov::as_type_ptr<v0::MatMul>(node);
@@ -135,16 +159,56 @@ bool EagleInputTransform::apply(NodePtr node, std::vector<std::shared_ptr<v0::Pa
             return false;
         }
 
+        auto matmul_input0 = matmul_node->input_value(0);
+        auto matmul_input1 = matmul_node->input_value(1);
+
+        std::shared_ptr<ov::Node> matmul_output_node;
+
+        // Apply scaling optimization for NPU devices to prevent FP16 overflow
+        if (m_device.find("NPU") != std::string::npos) {
+            // Scale input down by 100x before MatMul to avoid FP16 overflow, then scale result back up
+            // The factor 100 (0.01 and 100.0) is an empirical value
+            auto scale_down_const = std::make_shared<v0::Constant>(matmul_input0.get_element_type(), ov::Shape{}, 0.01f);
+            auto multiply_scale_down = std::make_shared<v1::Multiply>(matmul_input0, scale_down_const);
+            multiply_scale_down->set_friendly_name(matmul_node->get_friendly_name() + "/multiply_scale_down");
+
+            // Create new MatMul with scaled input
+            auto new_matmul = std::make_shared<v0::MatMul>(multiply_scale_down, matmul_input1,
+                                                           matmul_node->get_transpose_a(),
+                                                           matmul_node->get_transpose_b());
+            new_matmul->set_friendly_name(matmul_node->get_friendly_name() + "/matmul");
+
+            // Scale result back up to maintain numerical equivalence
+            auto scale_up_const = std::make_shared<v0::Constant>(new_matmul->get_element_type(), ov::Shape{}, 100.0f);
+            auto multiply_scale_up = std::make_shared<v1::Multiply>(new_matmul->output(0), scale_up_const);
+            multiply_scale_up->set_friendly_name(matmul_node->get_friendly_name() + "/multiply_scale_up");
+
+            matmul_output_node = multiply_scale_up;
+        } else {
+            // Default behavior: Use MatMul directly without scaling
+            auto new_matmul = std::make_shared<v0::MatMul>(matmul_input0, matmul_input1,
+                                                           matmul_node->get_transpose_a(),
+                                                           matmul_node->get_transpose_b());
+            new_matmul->set_friendly_name(matmul_node->get_friendly_name() + "/matmul");
+
+            matmul_output_node = new_matmul;
+        }
+
         auto shape = node->get_output_partial_shape(0);
         auto internal_hidden_state = std::make_shared<v0::Parameter>(node->get_element_type(), node->get_output_partial_shape(0));
         internal_hidden_state->output(0).set_names({"internal_hidden_states"});
         internal_hidden_state->set_friendly_name("internal_hidden_states");
-        // create new eltwise node to add output of MatMul node and internal hidden state input from last cycle of itself
-        auto new_eltwise = std::make_shared<v1::Add>(internal_hidden_state, matmul_node->output(0));
+
+        // Create new Add node (MatMul output + internal_hidden_state)
+        auto new_eltwise = std::make_shared<v1::Add>(internal_hidden_state, matmul_output_node->output(0));
+        new_eltwise->set_friendly_name(matmul_node->get_friendly_name() + "/add");
+
+        // Replace the original MatMul node with the new Add
         ov::replace_node(matmul_node, new_eltwise);
         params.push_back(internal_hidden_state);
         return true;
     }
+    return false;
 }
 
 EagleBaseTransform::EagleBaseTransform(std::vector<std::shared_ptr<v0::Result>>& results) {
@@ -303,8 +367,8 @@ ContinuousBatchingPipeline::Eagle3DecodingImpl::Eagle3DecodingImpl(const ov::gen
     // target model: hidden state extraction, draft model: hidden state import , hidden state extraction
     // eagle3 specific : dt importing
     share_embedding_weights(main_model, draft_model);
-    extract_hidden_state_generic(main_model, hidden_layers);
-    extract_hidden_state_generic(draft_model, { -1 });
+    extract_hidden_state_generic(main_model, hidden_layers, main_device);
+    extract_hidden_state_generic(draft_model, { -1 }, draft_device);
 
     // to create `main_pipeline` with enabled validation_mode and `draft_pipeline` with disabled validation mode
     m_main_pipeline = std::make_shared<ContinuousBatchingForEagle3DecodingImpl>(main_model,
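
Reviewer note on the NPU-only rewrite above: MatMul is linear in its activation input, so the scale-down/scale-up pair is an identity in exact arithmetic while the intermediate product shrinks 100x, keeping it under the FP16 maximum normal value of 65504. With activations $x$ and fc weights $W$:

$$100\,\bigl((0.01\,x)\,W\bigr) = x\,W, \qquad \bigl\|(0.01\,x)\,W\bigr\|_\infty = \tfrac{1}{100}\,\bigl\|x\,W\bigr\|_\infty.$$

The rewrite therefore tolerates $\|x\,W\|_\infty$ up to roughly $100 \times 65504 \approx 6.55\times 10^{6}$ before overflow, at the cost of extra rounding on the scaled-down side, which is presumably why the commit labels the factor 100 empirical.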

src/cpp/src/speculative_decoding/speculative_decoding_eagle3_impl.hpp
Lines changed: 11 additions & 2 deletions

@@ -18,6 +18,13 @@
 #include "openvino/pass/manager.hpp"
 
 namespace ov::genai {
+
+// Forward declarations for Eagle3 transformation functions
+void share_embedding_weights(std::shared_ptr<ov::Model>& main_model, std::shared_ptr<ov::Model>& draft_model);
+void extract_hidden_state_generic(std::shared_ptr<ov::Model>& model, const std::vector<int>& hidden_layers_to_abstract, const std::string& device = "");
+std::shared_ptr<ov::op::v0::Constant> extract_d2t_mapping_table(const std::shared_ptr<ov::Model>& model);
+void remove_d2t_result_node(std::shared_ptr<ov::Model>& model);
+
 class ContinuousBatchingPipeline::Eagle3DecodingImpl : public ContinuousBatchingPipeline::SpeculativeDecodingImpl {
 public:
     template<class Impl>
@@ -73,13 +80,14 @@ class EagleInputTransform : public ov::pass::MatcherPass { // eagle3 specific fo
 public:
     using NodePtr = std::shared_ptr<ov::Node>;
     OPENVINO_MATCHER_PASS_RTTI("EagleInputTransform");
-    EagleInputTransform(std::vector<std::shared_ptr<ov::op::v0::Parameter>>& params);
+    EagleInputTransform(std::vector<std::shared_ptr<ov::op::v0::Parameter>>& params, const std::string& device = "");
 
     ~EagleInputTransform() = default;
 
 private:
     bool apply(NodePtr node, std::vector<std::shared_ptr<ov::op::v0::Parameter>>& params);
     size_t applied = 0;
+    std::string m_device;
 };
 class Eagle3Transform : public ov::pass::MatcherPass {
 public:
@@ -95,11 +103,12 @@ class Eagle3Transform : public ov::pass::MatcherPass {
 
 class EagleModelTransform : public ov::pass::ModelPass {
 public:
-    EagleModelTransform(const std::vector<int>& layer_ids);
+    EagleModelTransform(const std::vector<int>& layer_ids, const std::string& device = "");
     bool run_on_model(const std::shared_ptr<Model>& model) override;
 
 private:
     const std::vector<int> m_layer_ids;
+    std::string m_device;
     std::vector<std::shared_ptr<ov::op::v0::Result>> m_new_results;
     std::vector<std::shared_ptr<ov::op::v0::Parameter>> m_new_parameters;
     std::vector<Output<Node>> m_hidden_layer_outputs;
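
For orientation, the newly exposed helpers compose in the order used by Eagle3DecodingImpl's constructor (see the .cpp diff above). A minimal sketch; that the d2t steps run on the draft model is an assumption based on the helper names and the "dt importing" comment:

```cpp
// Sketch only: mirrors the call order in speculative_decoding_eagle3_impl.cpp.
// The d2t handling on the draft model is an assumption, not shown in this diff.
#include <memory>
#include <string>
#include <vector>
#include "speculative_decoding/speculative_decoding_eagle3_impl.hpp"

void prepare_eagle3_models(std::shared_ptr<ov::Model>& main_model,
                           std::shared_ptr<ov::Model>& draft_model,
                           const std::vector<int>& hidden_layers,
                           const std::string& main_device,
                           const std::string& draft_device) {
    // Draft model reuses the main model's embedding weights.
    ov::genai::share_embedding_weights(main_model, draft_model);
    // Expose the selected hidden states as extra outputs (main) and wire them in
    // as inputs (draft); the device string enables the NPU FP16-scaling rewrite.
    ov::genai::extract_hidden_state_generic(main_model, hidden_layers, main_device);
    ov::genai::extract_hidden_state_generic(draft_model, { -1 }, draft_device);
    // Read the draft-to-target token mapping table, then drop the Result node
    // that carried it so it is no longer treated as a model output.
    auto d2t = ov::genai::extract_d2t_mapping_table(draft_model);
    ov::genai::remove_d2t_result_node(draft_model);
    (void)d2t;  // consumed by the decoding pipeline in the real implementation
}
```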
