Skip to content
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion .github/workflows/linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -534,7 +534,7 @@ jobs:
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).tokenizers.test }}
timeout: 60
- name: 'API tests'
cmd: 'python -m pytest -v ./tests/python_tests/test_continuous_batching.py ./tests/python_tests/test_generation_config.py ./tests/python_tests/test_sampling.py ./tests/python_tests/test_text_streamer.py'
cmd: 'python -m pytest -v ./tests/python_tests/test_continuous_batching.py -k "not eagle3" ./tests/python_tests/test_generation_config.py ./tests/python_tests/test_sampling.py ./tests/python_tests/test_text_streamer.py'
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).continuous_batching.test || fromJSON(needs.smart_ci.outputs.affected_components).sampling.test || fromJSON(needs.smart_ci.outputs.affected_components).text_streamer.test }}
timeout: 60
- name: 'Rag tests'
Expand All @@ -551,6 +551,12 @@ jobs:
python -m pytest -v ./tools/who_what_benchmark/tests -m nanollava
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).WWB.test }}
timeout: 90
- name: 'EAGLE3 speculative decoding tests'
cmd: |
python -m pip install git+https://github.com/xufang-lisa/optimum-intel.git@ea9607daf32919024cdd4390deec9693a7b64d23
python -m pytest -v ./tests/python_tests/test_continuous_batching.py -k "eagle3"
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).speculative_decoding.test }}
timeout: 90
defaults:
run:
shell: bash
Expand Down
8 changes: 7 additions & 1 deletion .github/workflows/manylinux_2_28.yml
Original file line number Diff line number Diff line change
Expand Up @@ -472,7 +472,7 @@ jobs:
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).tokenizers.test }}
timeout: 60
- name: 'API tests'
cmd: 'python -m pytest -v ./tests/python_tests/test_continuous_batching.py ./tests/python_tests/test_generation_config.py ./tests/python_tests/test_sampling.py ./tests/python_tests/test_text_streamer.py'
cmd: 'python -m pytest -v ./tests/python_tests/test_continuous_batching.py -k "not eagle3" ./tests/python_tests/test_generation_config.py ./tests/python_tests/test_sampling.py ./tests/python_tests/test_text_streamer.py'
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).continuous_batching.test || fromJSON(needs.smart_ci.outputs.affected_components).sampling.test || fromJSON(needs.smart_ci.outputs.affected_components).text_streamer.test }}
timeout: 60
- name: 'Rag tests'
Expand All @@ -489,6 +489,12 @@ jobs:
python -m pytest -v ./tools/who_what_benchmark/tests -m nanollava
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).WWB.test }}
timeout: 90
- name: 'EAGLE3 speculative decoding tests'
cmd: |
python -m pip install git+https://github.com/xufang-lisa/optimum-intel.git@ea9607daf32919024cdd4390deec9693a7b64d23
python -m pytest -v ./tests/python_tests/test_continuous_batching.py -k "eagle3"
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).speculative_decoding.test }}
timeout: 90
defaults:
run:
shell: bash
Expand Down
8 changes: 7 additions & 1 deletion .github/workflows/windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -623,7 +623,7 @@ jobs:
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).tokenizers.test }}
timeout: 60
- name: 'API tests'
cmd: 'python -m pytest -s -v tests/python_tests/test_continuous_batching.py tests/python_tests/test_generation_config.py tests/python_tests/test_sampling.py tests/python_tests/test_text_streamer.py'
cmd: 'python -m pytest -s -v tests/python_tests/test_continuous_batching.py -k "not eagle3" tests/python_tests/test_generation_config.py tests/python_tests/test_sampling.py tests/python_tests/test_text_streamer.py'
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).continuous_batching.test || fromJSON(needs.smart_ci.outputs.affected_components).sampling.test || fromJSON(needs.smart_ci.outputs.affected_components).text_streamer.test }}
timeout: 60
- name: 'Rag tests'
Expand All @@ -640,6 +640,12 @@ jobs:
python -m pytest -v ./tools/who_what_benchmark/tests -m nanollava
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).WWB.test }}
timeout: 90
- name: 'EAGLE3 speculative decoding tests'
cmd: |
python -m pip install git+https://github.com/xufang-lisa/optimum-intel.git@ea9607daf32919024cdd4390deec9693a7b64d23
Copy link

Copilot AI Nov 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Installing from a commit (ea9607daf32919024cdd4390deec9693a7b64d23) that lives in a personal GitHub fork is fragile: the hash pins the exact code, but the fork can be force-pushed, made private, or deleted at any time, so the install is not reproducible long-term and CI would break. Consider using a tagged release from the official repository, or document why this specific commit is required.

Suggested change
python -m pip install git+https://github.com/xufang-lisa/optimum-intel.git@ea9607daf32919024cdd4390deec9693a7b64d23
# Install optimum-intel from the official repository for reproducibility.
python -m pip install optimum-intel

Copilot uses AI. Check for mistakes.
python -m pytest -v ./tests/python_tests/test_continuous_batching.py -k "eagle3"
run_condition: ${{ fromJSON(needs.smart_ci.outputs.affected_components).speculative_decoding.test }}
timeout: 90
defaults:
run:
shell: pwsh
Expand Down
6 changes: 3 additions & 3 deletions samples/python/text_generation/speculative_decoding_lm.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,15 @@ def main():
# The main and draft models can run on different devices.
# Set the device for the main model in the `openvino_genai.LLMPipeline` constructor and the device for the draft model in `openvino_genai.draft_model`.
# CPU, GPU and NPU can be used. For NPU, the preferred configuration is when both the main and draft models use NPU.
main_device = 'CPU'
draft_device = 'CPU'
main_device = 'NPU'
draft_device = 'NPU'

draft_model = openvino_genai.draft_model(args.draft_model_dir, draft_device)

pipe = openvino_genai.LLMPipeline(args.model_dir, main_device, draft_model=draft_model)

config = openvino_genai.GenerationConfig()
config.max_new_tokens = 100
config.max_new_tokens = 20
# Speculative decoding generation parameters like `num_assistant_tokens` and `assistant_confidence_threshold` are mutually exclusive.
# Add parameter to enable speculative decoding to generate `num_assistant_tokens` candidates by draft_model per iteration.
# NOTE: ContinuousBatching backend uses `num_assistant_tokens` as is. Stateful backend uses `num_assistant_tokens`'s copy as initial
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,13 +65,18 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {
class ContinuousBatchingImpl;

class ContinuousBatchingForSpeculativeDecodingImpl;
class ContinuousBatchingForEagle3DecodingImpl;
class ContinuousBatchingForPromptLookupImpl;
class SpeculativeDecodingImpl;
class Eagle3DecodingImpl;
class PromptLookupImpl;

friend class ContinuousBatchingForSpeculativeDecodingImpl;

friend class ContinuousBatchingForPromptLookupImpl;
friend class ContinuousBatchingForEagle3DecodingImpl;
friend class SpeculativeDecodingImpl;
friend class Eagle3DecodingImpl;
friend class PromptLookupImpl;

std::shared_ptr<IContinuousBatchingPipeline> m_impl;
Expand Down
Loading