22// SPDX-License-Identifier: Apache-2.0
33
44#include " speculative_decoding_stateful.hpp"
5+ #include " speculative_decoding_utils.hpp"
56#include " continuous_batching/timer.hpp"
67#include " openvino/runtime/core.hpp"
78#include " openvino/core/parallel.hpp"
@@ -42,18 +43,6 @@ void update_perf_stat_by_infer_duration(ov::genai::RawPerfMetrics& raw_perf_coun
4243 raw_perf_counters.m_batch_sizes .emplace_back (num_generated_tokens);
4344}
4445
45- void ensure_num_assistant_tokens_is_set (ov::genai::GenerationConfig& generation_config) {
46- auto assistant_confidence_threshold = generation_config.assistant_confidence_threshold ;
47- OPENVINO_ASSERT (assistant_confidence_threshold == 0 .f ,
48- " Stateful (non Continuous Batching) Speculative Decoding pipeline only supports `num_assistant_tokens` "
49- " as parameter in GenerationConfig and doesn't work with `assistant_confidence_threshold`.\n Please "
50- " remove its specification or set it to 0.f." );
51-
52- constexpr std::size_t default_num_assistant_tokens = 5 ;
53- if (generation_config.num_assistant_tokens == 0 ) {
54- generation_config.num_assistant_tokens = default_num_assistant_tokens;
55- }
56- }
5746}// anonymous namespace
5847
5948namespace ov {
@@ -392,7 +381,7 @@ StatefulSpeculativeLLMPipeline::StatefulSpeculativeLLMPipeline(
392381 OPENVINO_ASSERT (m_draft_request != nullptr , " Failed to create draft model inference wrapper" );
393382
394383 // Specifying number candidates to generate
395- ensure_num_assistant_tokens_is_set (m_generation_config);
384+ ov::genai::speculative_decoding:: ensure_num_assistant_tokens_is_set (m_generation_config);
396385 m_candidates_num = m_generation_config.num_assistant_tokens ;
397386 // We set the upper limit for candidates number as two times the number requested
398387 // by user.
@@ -412,7 +401,7 @@ StatefulSpeculativeLLMPipeline::StatefulSpeculativeLLMPipeline(
412401GenerationConfig StatefulSpeculativeLLMPipeline::resolve_generation_config (OptionalGenerationConfig generation_config) {
413402 GenerationConfig config = generation_config.value_or (m_generation_config);
414403
415- ensure_num_assistant_tokens_is_set (config);
404+ ov::genai::speculative_decoding:: ensure_num_assistant_tokens_is_set (config);
416405 m_candidates_num = config.num_assistant_tokens ;
417406 // We set the upper limit for candidates number as two times the number
418407 // requested by user.
0 commit comments