Draft

Changes from all commits — 36 commits
7b3ad31
Introduce add_extension to genai.
xipingyan Nov 3, 2025
2f60301
add draft test.
xipingyan Nov 5, 2025
80efe54
Update src/python/openvino_genai/py_openvino_genai.pyi
xipingyan Nov 5, 2025
4a02c8e
Update src/python/openvino_genai/__init__.pyi
xipingyan Nov 5, 2025
31078ab
Update tests/python_tests/test_add_extension.py
xipingyan Nov 5, 2025
281fe8c
Update tests/python_tests/test_add_extension.py
xipingyan Nov 5, 2025
ad64976
Merge branch 'master' into xp/introduce_add_extention_to_genai
xipingyan Nov 5, 2025
e449070
Update src/python/openvino_genai/py_openvino_genai.pyi
xipingyan Nov 5, 2025
4f60a60
Update src/cpp/include/openvino/genai/generation_config.hpp
xipingyan Nov 5, 2025
698e5ec
Update generation_config.hpp
xipingyan Nov 5, 2025
8c56ca8
Update tests/python_tests/test_add_extension.py
xipingyan Nov 5, 2025
c1e0a7c
Update src/python/py_openvino_genai.cpp
xipingyan Nov 5, 2025
57bd6a1
Update tests/python_tests/test_add_extension.py
xipingyan Nov 5, 2025
a4ef9c8
Update tests/python_tests/test_add_extension.py
xipingyan Nov 5, 2025
7260493
Update src/cpp/include/openvino/genai/generation_config.hpp
xipingyan Nov 10, 2025
689b08c
Get tokenizer .so path.
xipingyan Nov 10, 2025
12e53a1
Merge branch 'master' into xp/introduce_add_extention_to_genai
xipingyan Nov 10, 2025
072b0b0
Update tests/python_tests/test_add_extension.py
xipingyan Nov 10, 2025
c05b092
Update tests/python_tests/test_add_extension.py
xipingyan Nov 10, 2025
7e1ac0f
Update tests/python_tests/test_add_extension.py
xipingyan Nov 10, 2025
ba86cc4
enable add_extension in VLM pipeline with properties.
xipingyan Nov 11, 2025
4705288
Merge remote-tracking branch 'origin/xp/introduce_add_extention_to_ge…
xipingyan Nov 11, 2025
323e4c2
remove unnecessary file
xipingyan Nov 11, 2025
82b65bc
Update src/cpp/src/utils.cpp
xipingyan Nov 11, 2025
3b6391d
Update src/cpp/src/visual_language/pipeline.cpp
xipingyan Nov 11, 2025
0542ae3
Update tests/python_tests/test_vlm_pipeline.py
xipingyan Nov 11, 2025
02d5310
Update tests/python_tests/test_vlm_pipeline.py
xipingyan Nov 11, 2025
97a88fc
Update src/cpp/src/visual_language/pipeline.cpp
xipingyan Nov 11, 2025
200fb05
Remove print.
xipingyan Nov 11, 2025
f49ebb9
Update src/cpp/include/openvino/genai/generation_config.hpp
xipingyan Nov 11, 2025
611c1fb
Merge branch 'xp/introduce_add_extention_to_genai' of https:/…
xipingyan Nov 11, 2025
34499ec
enable LLM pipeline; lots of interleaved calls involving ContinuousBatchingP…
xipingyan Nov 11, 2025
a92fe1f
add test cases for CB and LLM
sunxiaoxia2022 Nov 13, 2025
75900d0
change extract_extensions to add_extensions_to_core
sunxiaoxia2022 Nov 14, 2025
bf6bd7f
fix conflict
sunxiaoxia2022 Nov 14, 2025
8a80895
revert some changes
sunxiaoxia2022 Nov 14, 2025
6 changes: 6 additions & 0 deletions src/cpp/src/continuous_batching/pipeline.cpp
@@ -46,6 +46,7 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline( const std::filesystem::p
auto draft_model_desr = utils::extract_draft_model_from_config(properties_without_draft_model);
auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);

utils::add_extensions_to_core(properties_without_draft_model);
auto model = utils::read_model(models_path, properties);
Copilot AI Nov 14, 2025

Inconsistent use of properties vs properties_without_draft_model. At line 49, add_extensions_to_core is called on properties_without_draft_model, but at line 50, utils::read_model is called with the original properties instead of properties_without_draft_model. While this may work because read_model only extracts GGUF properties and ignores EXTENSIONS, it's inconsistent with the pattern used in other constructors (see lines 93 and 138 where the same variable is used for both). Consider changing line 50 to use properties_without_draft_model for consistency.

Suggested change
auto model = utils::read_model(models_path, properties);
auto model = utils::read_model(models_path, properties_without_draft_model);

auto [properties_without_draft_model_without_gguf, enable_save_ov_model] = utils::extract_gguf_properties(properties_without_draft_model);
properties_without_draft_model_without_gguf[ov::cache_model_path.name()] = models_path;
@@ -88,6 +89,7 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
auto draft_model_desr = utils::extract_draft_model_from_config(properties_without_draft_model);
auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);

utils::add_extensions_to_core(properties_without_draft_model);
auto model = utils::read_model(models_path, properties_without_draft_model);
auto [properties_without_draft_model_without_gguf, enable_save_ov_model] = utils::extract_gguf_properties(properties_without_draft_model);
properties_without_draft_model_without_gguf[ov::cache_model_path.name()] = models_path;
@@ -131,6 +133,8 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
auto properties_without_draft_model = properties;
auto draft_model_desr = utils::extract_draft_model_from_config(properties_without_draft_model);
auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);

utils::add_extensions_to_core(properties_without_draft_model);
auto model = utils::singleton_core().read_model(model_str, weights_tensor);

auto rt_info = model->get_rt_info();
@@ -177,6 +181,8 @@ ContinuousBatchingPipeline::ContinuousBatchingPipeline(
auto draft_model_desr = utils::extract_draft_model_from_config(properties_without_draft_model);
auto is_prompt_lookup_enabled = extract_prompt_lookup_from_config(properties_without_draft_model);
auto model_pair = utils::get_model_weights_pair(models_map, "language");

utils::add_extensions_to_core(properties_without_draft_model);
auto model = utils::singleton_core().read_model(model_pair.first, model_pair.second);

auto rt_info = model->get_rt_info();
1 change: 1 addition & 0 deletions src/cpp/src/llm/pipeline.cpp
@@ -259,6 +259,7 @@ ov::genai::LLMPipeline::LLMPipeline(

bool is_npu_requested = ov::genai::utils::is_npu_requested(device, user_properties);
auto [properties, attention_backend] = utils::extract_attention_backend(user_properties, is_npu_requested);
utils::add_extensions_to_core(properties);
Copilot AI Nov 14, 2025

The add_extensions_to_core call is missing in the path-based LLMPipeline constructors (lines 173-210 and 212-248). While the tensor-based constructor at line 262 includes this call, the path-based ones don't. This is inconsistent and could cause issues if users try to use extensions with path-based initialization. Consider adding utils::add_extensions_to_core(properties); after line 220 (and similarly after line 182 in the other constructor) to ensure consistent behavior across all constructor overloads.


if (is_npu_requested) {
m_pimpl = StatefulPipeline::create(
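To make the suggestion above concrete, here is a minimal sketch of the proposed addition to the path-based constructors, assuming they obtain user_properties and is_npu_requested the same way the tensor-based overload does (the placement is the reviewer's suggestion, not part of this diff):

// Hypothetical addition for each path-based LLMPipeline constructor:
// extract the backend properties as the tensor-based overload does, then
// register any EXTENSIONS with the shared ov::Core before the model is read.
auto [properties, attention_backend] = utils::extract_attention_backend(user_properties, is_npu_requested);
utils::add_extensions_to_core(properties);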
4 changes: 3 additions & 1 deletion src/cpp/src/llm/pipeline_stateful.cpp
@@ -78,7 +78,9 @@ StatefulLLMPipeline::StatefulLLMPipeline(
m_max_prompt_len = kv_desc.max_prompt_len;
m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len;
} else {
compiled_model = utils::singleton_core().compile_model(model, device, *filtered_properties);
auto properties_without_extensions = *filtered_properties;
utils::add_extensions_to_core(properties_without_extensions);
Comment on lines +81 to +82
Copilot AI Nov 14, 2025

Extensions are not processed for the NPU case. The add_extensions_to_core call is only in the else block (line 82), but for NPU devices (line 75-79), utils::compile_decoder_for_npu is called with *filtered_properties that still contains the EXTENSIONS key. This means extensions won't be loaded for NPU devices. Consider moving the extension handling before the NPU check, e.g., add auto properties_without_extensions = *filtered_properties; utils::add_extensions_to_core(properties_without_extensions); before line 74 and use properties_without_extensions in both branches.

compiled_model = utils::singleton_core().compile_model(model, device, properties_without_extensions);
}
m_model_runner = compiled_model.create_infer_request();
ov::genai::utils::print_compiled_model_properties(compiled_model, "Stateful LLM model");
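A minimal sketch of the restructuring the comment above proposes, reusing the member names visible in this diff; the compile_decoder_for_npu signature and the is_npu condition are approximated, not verified:

// Hypothetical rework: strip EXTENSIONS once, before the device check,
// so the listed libraries are registered for the NPU path as well and
// neither branch passes the EXTENSIONS key on to compilation.
auto properties_without_extensions = *filtered_properties;
utils::add_extensions_to_core(properties_without_extensions);
if (is_npu) {
    auto [npu_compiled, kv_desc] = utils::compile_decoder_for_npu(model, properties_without_extensions);
    compiled_model = npu_compiled;
    m_max_prompt_len = kv_desc.max_prompt_len;
    m_max_kv_cache_size = kv_desc.max_prompt_len + kv_desc.min_response_len;
} else {
    compiled_model = utils::singleton_core().compile_model(model, device, properties_without_extensions);
}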
11 changes: 11 additions & 0 deletions src/cpp/src/utils.cpp
@@ -718,6 +718,17 @@ std::pair<ov::AnyMap, std::string> extract_attention_backend(const ov::AnyMap& e
return {properties, attention_backend};
};

void add_extensions_to_core(ov::AnyMap& properties) {
auto it = properties.find(EXTENSIONS_ARG_NAME);
if (it != properties.end()) {
auto extensions = it->second.as<std::vector<std::string>>();
for (const auto& extension : extensions) {
singleton_core().add_extension(extension);
}
properties.erase(it);
}
}

void release_core_plugin(const std::string& device) {
try {
singleton_core().unload_plugin(device);
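For context, a minimal caller-side sketch of the new property: the "EXTENSIONS" key matches the EXTENSIONS_ARG_NAME constant declared in utils.hpp below, and both paths are placeholders.

#include <string>
#include <vector>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    // add_extensions_to_core reads the "EXTENSIONS" entry back as a
    // std::vector<std::string>, registers each path via ov::Core::add_extension,
    // and erases the key so later read_model/compile_model calls never see it.
    ov::AnyMap properties;
    properties["EXTENSIONS"] = std::vector<std::string>{"/path/to/libcustom_ops.so"};
    ov::genai::LLMPipeline pipe("/path/to/model_dir", "CPU", properties);
    return 0;
}

The new tests below exercise exactly this path with a deliberately invalid library name and assert on the resulting RuntimeError.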
3 changes: 3 additions & 0 deletions src/cpp/src/utils.hpp
@@ -103,6 +103,7 @@ void read_anymap_param(const ov::AnyMap& config_map, const std::string& name, T&
const std::string STREAMER_ARG_NAME = "streamer";
const std::string CONFIG_ARG_NAME = "generation_config";
const std::string DRAFT_MODEL_ARG_NAME = "draft_model";
const std::string EXTENSIONS_ARG_NAME = "EXTENSIONS";

template<typename Config = ov::genai::GenerationConfig>
Config from_config_json_if_exists(const std::filesystem::path& models_path, const char config_name[] = "generation_config.json") {
@@ -286,6 +287,8 @@ bool explicitly_requires_paged_attention(const ov::AnyMap& properties, bool is_n

std::pair<ov::AnyMap, std::string> extract_attention_backend(const ov::AnyMap& external_properties, bool is_npu_requested = false);

void add_extensions_to_core(ov::AnyMap& properties);

void save_openvino_model(const std::shared_ptr<ov::Model>& model, const std::string& save_path, bool compress_to_fp16);

ov::Tensor merge_text_and_image_embeddings_llava(const ov::Tensor& input_ids, ov::Tensor& text_embeds, const std::vector<ov::Tensor>& image_embeds, int64_t image_token_id);
10 changes: 7 additions & 3 deletions src/cpp/src/visual_language/embedding_model.cpp
@@ -45,11 +45,13 @@ EmbeddingsModel::EmbeddingsModel(const std::filesystem::path& model_dir,
const std::string& device,
const ov::AnyMap& properties) {
ov::Core core = utils::singleton_core();
std::shared_ptr<ov::Model> m_model = core.read_model(model_dir / "openvino_text_embeddings_model.xml", {}, properties);
auto properties_copy = properties;
utils::add_extensions_to_core(properties_copy);
std::shared_ptr<ov::Model> m_model = core.read_model(model_dir / "openvino_text_embeddings_model.xml", {}, properties_copy);
// apply embedding postprocessing step by merging them into the model
merge_postprocess(m_model, scale_emb);

ov::CompiledModel compiled_model = core.compile_model(m_model, device, properties);
ov::CompiledModel compiled_model = core.compile_model(m_model, device, properties_copy);
ov::genai::utils::print_compiled_model_properties(compiled_model, "text embeddings model");
m_embeddings_requests_queue = init(compiled_model);
}
Expand All @@ -60,11 +62,13 @@ EmbeddingsModel::EmbeddingsModel(const std::string& model,
const std::string& device,
const ov::AnyMap& properties) {
ov::Core core = utils::singleton_core();
auto properties_copy = properties;
utils::add_extensions_to_core(properties_copy);
std::shared_ptr<ov::Model> m_model = core.read_model(model, weights);
// apply embedding postprocessing step by merging them into the model
merge_postprocess(m_model, scale_emb);

ov::CompiledModel compiled_model = core.compile_model(m_model, device, properties);
ov::CompiledModel compiled_model = core.compile_model(m_model, device, properties_copy);
m_embeddings_requests_queue = init(compiled_model);
}

8 changes: 6 additions & 2 deletions src/cpp/src/visual_language/pipeline.cpp
@@ -82,7 +82,8 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{

auto properties_copy = properties;
auto language_model_path = models_dir / "openvino_language_model.xml";
auto language_model = utils::singleton_core().read_model(language_model_path, {}, properties_copy);
utils::add_extensions_to_core(properties_copy);
auto language_model = utils::singleton_core().read_model(language_model_path, {}, properties_copy);
auto kv_pos = ov::genai::utils::get_kv_axes_pos(language_model);

// In case user provided properties per-device
@@ -157,8 +158,11 @@ class VLMPipeline::VLMPipelineImpl : public VLMPipelineBase{
m_embedding = m_inputs_embedder->get_embedding_model();

auto m_language_pair = utils::get_model_weights_pair(models_map, "language");
auto properties_without_extensions = properties;
utils::add_extensions_to_core(properties_without_extensions);

m_language = utils::singleton_core().compile_model(
m_language_pair.first, m_language_pair.second, device, properties
m_language_pair.first, m_language_pair.second, device, properties_without_extensions
).create_infer_request();

m_language.get_tensor("attention_mask").set_shape({1, 0});
16 changes: 12 additions & 4 deletions src/cpp/src/visual_language/qwen2vl/classes.cpp
@@ -660,8 +660,10 @@ VisionEncoderQwen2VL::VisionEncoderQwen2VL(const std::filesystem::path& model_di
: VisionEncoder(model_dir, device, properties),
use_ov_image_preprocess(check_image_preprocess_env()) {
if (use_ov_image_preprocess) {
auto properties_without_extensions = properties;
utils::add_extensions_to_core(properties_without_extensions);
auto model_org = utils::singleton_core().read_model(model_dir / "openvino_vision_embeddings_model.xml");
m_ireq_queue_vision_encoder = create_vision_encoder_ireq(model_org, m_processor_config, device, properties);
m_ireq_queue_vision_encoder = create_vision_encoder_ireq(model_org, m_processor_config, device, properties_without_extensions);
}
}

Expand All @@ -674,8 +676,10 @@ VisionEncoderQwen2VL::VisionEncoderQwen2VL(const ModelsMap& models_map,
if (use_ov_image_preprocess) {
const auto& [vision_encoder_model, vision_encoder_weights] =
utils::get_model_weights_pair(models_map, "vision_embeddings");
auto properties_without_extensions = properties;
utils::add_extensions_to_core(properties_without_extensions);
auto model_org = utils::singleton_core().read_model(vision_encoder_model, vision_encoder_weights);
m_ireq_queue_vision_encoder = create_vision_encoder_ireq(model_org, m_processor_config, device, properties);
m_ireq_queue_vision_encoder = create_vision_encoder_ireq(model_org, m_processor_config, device, properties_without_extensions);
}
}

@@ -923,10 +927,12 @@ InputsEmbedderQwen2VL::InputsEmbedderQwen2VL(
const std::string& device,
const ov::AnyMap device_config) :
IInputsEmbedder(vlm_config, model_dir, device, device_config) {
auto properties_without_extensions = device_config;
utils::add_extensions_to_core(properties_without_extensions);
auto model = utils::singleton_core().read_model(model_dir / "openvino_vision_embeddings_merger_model.xml");
utils::request_vl_sdpa_transformations(model);

auto compiled_model = utils::singleton_core().compile_model(model, device, device_config);
auto compiled_model = utils::singleton_core().compile_model(model, device, properties_without_extensions);

m_with_cu_seqlens_input = utils::check_vl_sdpa_transformations(compiled_model);
ov::genai::utils::print_compiled_model_properties(compiled_model,
Expand All @@ -952,14 +958,16 @@ InputsEmbedderQwen2VL::InputsEmbedderQwen2VL(
const std::string& device,
const ov::AnyMap device_config) :
IInputsEmbedder(vlm_config, models_map, tokenizer, config_dir_path, device, device_config) {
auto properties_without_extensions = device_config;
utils::add_extensions_to_core(properties_without_extensions);
auto model = utils::singleton_core().read_model(
utils::get_model_weights_pair(models_map, "vision_embeddings_merger").first,
utils::get_model_weights_pair(models_map, "vision_embeddings_merger").second);
utils::request_vl_sdpa_transformations(model);

auto compiled_model = utils::singleton_core().compile_model(model,
device,
device_config
properties_without_extensions
);

m_with_cu_seqlens_input = utils::check_vl_sdpa_transformations(compiled_model);
2 changes: 1 addition & 1 deletion src/python/openvino_genai/py_openvino_genai.pyi
@@ -4465,4 +4465,4 @@ def draft_model(models_path: os.PathLike | str | bytes, device: str = '', **kwar
def get_version() -> str:
"""
OpenVINO GenAI version
"""
"""
14 changes: 14 additions & 0 deletions tests/python_tests/test_continuous_batching.py
@@ -530,3 +530,17 @@ def test_speculative_decoding_extended_perf_metrics(pipeline_type):
assert std_gen_duration == 0
else:
assert extended_perf_metrics is None


@pytest.mark.precommit
def test_continuous_batching_add_extension():
model_id = 'katuni4ka/tiny-random-phi3'
_, _, models_path = download_and_convert_model(model_id)

scheduler_config = SchedulerConfig()

properties = {"EXTENSIONS": ["fake_path"]}

with pytest.raises(RuntimeError) as exc_info:
ContinuousBatchingPipeline(models_path, scheduler_config, "CPU", properties)
assert "Cannot find entry point to the extension library" in str(exc_info.value)
11 changes: 11 additions & 0 deletions tests/python_tests/test_llm_pipeline.py
@@ -809,3 +809,14 @@ def py_streamer(py_str: str):
else:
assert it_cnt > 0


@pytest.mark.precommit
def test_llm_pipeline_add_extension():
model_id = "katuni4ka/tiny-random-phi3"
_, _, models_path = download_and_convert_model(model_id)

properties = {"EXTENSIONS": ["fake_path"]}

with pytest.raises(RuntimeError) as exc_info:
ov_genai.LLMPipeline(models_path, "CPU", **properties)
assert "Cannot find entry point to the extension library" in str(exc_info.value)
17 changes: 14 additions & 3 deletions tests/python_tests/test_vlm_pipeline.py
@@ -304,12 +304,12 @@ def ov_pipe_model(request: pytest.FixtureRequest) -> VlmModelInfo:
ids=lambda p: f"{p[0]}/{p[1]}",
indirect=["ov_pipe_model"],
)

@pytest.fixture(scope="module")
def ov_continious_batching_pipe() -> ContinuousBatchingPipeline:
models_path = _get_ov_model(MODEL_IDS[0])
return ContinuousBatchingPipeline(models_path, SchedulerConfig(), "CPU")

@pytest.fixture(scope="module")
def ov_continious_batching_pipe_gemma() -> ContinuousBatchingPipeline:
models_path = _get_ov_model(MODEL_IDS[8])
@@ -1366,7 +1366,7 @@ def test_model_tags_missing_native(ov_pipe_model: VlmModelInfo):

with pytest.raises(RuntimeError):
ov_pipe.generate(image_tag(0))


@pytest.mark.parametrize(
"ov_pipe_model,has_image,has_video",
@@ -1508,3 +1508,14 @@ def get_nanollava_processor():
genai_text = genai_output.texts[0]

assert optimum_text == genai_text

@pytest.mark.precommit
def test_vlm_pipeline_add_extension():
model_id = VIDEO_MODEL_IDS[1]
models_path = _get_ov_model(model_id)

properties = {"EXTENSIONS": ["fake_path"]}

with pytest.raises(RuntimeError) as exc_info:
VLMPipeline(models_path, "CPU", config=properties)
assert "Cannot find entry point to the extension library" in str(exc_info.value)