From 75996ff8e1dbecc9a8b5869ddd7bbbc2e21ac59c Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 7 Nov 2025 13:55:48 +0400 Subject: [PATCH 01/12] Remove lora from vlm models list --- .../_components/vlm-models-table/models.ts | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/site/docs/supported-models/_components/vlm-models-table/models.ts b/site/docs/supported-models/_components/vlm-models-table/models.ts index 347fb02b0f..423369a7ba 100644 --- a/site/docs/supported-models/_components/vlm-models-table/models.ts +++ b/site/docs/supported-models/_components/vlm-models-table/models.ts @@ -2,7 +2,6 @@ type VLMModelType = { architecture: string; models: Array<{ name: string; - loraSupport: boolean; links: string[]; notesLink?: string; }>; @@ -14,7 +13,6 @@ export const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'InternVLChatModel', - loraSupport: false, links: [ 'https://huggingface.co/OpenGVLab/InternVL2-1B', 'https://huggingface.co/OpenGVLab/InternVL2-2B', @@ -39,7 +37,6 @@ export const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'LLaVA-v1.5', - loraSupport: false, links: ['https://huggingface.co/llava-hf/llava-1.5-7b-hf'], }, ], @@ -49,12 +46,10 @@ export const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'nanoLLaVA', - loraSupport: false, links: ['https://huggingface.co/qnguyen3/nanoLLaVA'], }, { name: 'nanoLLaVA-1.5', - loraSupport: false, links: ['https://huggingface.co/qnguyen3/nanoLLaVA-1.5'], }, ], @@ -64,7 +59,6 @@ export const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'LLaVA-v1.6', - loraSupport: false, links: [ 'https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf', 'https://huggingface.co/llava-hf/llava-v1.6-vicuna-7b-hf', @@ -78,7 +72,6 @@ export const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'LLaVA-Next-Video', - loraSupport: false, links: [ 'https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf', ], @@ -90,7 +83,6 @@ export const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'MiniCPM-o-2_6', - loraSupport: false, links: ['https://huggingface.co/openbmb/MiniCPM-o-2_6'], notesLink: '#minicpm-o-notes', }, @@ -101,7 +93,6 @@ export const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'MiniCPM-V-2_6', - loraSupport: false, links: ['https://huggingface.co/openbmb/MiniCPM-V-2_6'], }, ], @@ -111,7 +102,6 @@ export const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'phi3_v', - loraSupport: false, links: [ 'https://huggingface.co/microsoft/Phi-3-vision-128k-instruct', 'https://huggingface.co/microsoft/Phi-3.5-vision-instruct', @@ -125,7 +115,6 @@ export const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'phi4mm', - loraSupport: false, links: [ 'https://huggingface.co/microsoft/Phi-4-multimodal-instruct', ], @@ -138,7 +127,6 @@ export const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'Qwen2-VL', - loraSupport: false, links: [ 'https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct', 'https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct', @@ -153,7 +141,6 @@ export const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'Qwen2.5-VL', - loraSupport: false, links: [ 'https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct', 'https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct', @@ -166,7 +153,6 @@ export const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'gemma3', - loraSupport: false, links: [ 'https://huggingface.co/google/gemma-3-4b-it', 'https://huggingface.co/google/gemma-3-12b-it', From c81a9238b62306d752c08831a7507725ac1d503b Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 7 Nov 2025 13:56:40 +0400 Subject: [PATCH 02/12] Remove lora from whisper models list --- .../_components/whisper-models-table/models.ts | 3 --- 1 file changed, 3 deletions(-) diff --git a/site/docs/supported-models/_components/whisper-models-table/models.ts b/site/docs/supported-models/_components/whisper-models-table/models.ts index 44177796b7..ac84eaeb98 100644 --- a/site/docs/supported-models/_components/whisper-models-table/models.ts +++ b/site/docs/supported-models/_components/whisper-models-table/models.ts @@ -2,7 +2,6 @@ type WhisperModelType = { architecture: string; models: Array<{ name: string; - loraSupport: boolean; links: string[]; }>; }; @@ -13,7 +12,6 @@ export const WHISPER_MODELS: WhisperModelType[] = [ models: [ { name: 'Whisper', - loraSupport: false, links: [ 'https://huggingface.co/openai/whisper-tiny', 'https://huggingface.co/openai/whisper-tiny.en', @@ -28,7 +26,6 @@ export const WHISPER_MODELS: WhisperModelType[] = [ }, { name: 'Distil-Whisper', - loraSupport: false, links: [ 'https://huggingface.co/distil-whisper/distil-small.en', 'https://huggingface.co/distil-whisper/distil-medium.en', From 121b9cdf7895db339135f74f107c8453d1ad6bd6 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 7 Nov 2025 13:57:19 +0400 Subject: [PATCH 03/12] Remove lora from speech generation models list --- .../_components/speech-generation-models-table/models.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/site/docs/supported-models/_components/speech-generation-models-table/models.ts b/site/docs/supported-models/_components/speech-generation-models-table/models.ts index 4e415bb473..285f8c613a 100644 --- a/site/docs/supported-models/_components/speech-generation-models-table/models.ts +++ b/site/docs/supported-models/_components/speech-generation-models-table/models.ts @@ -2,7 +2,6 @@ type SpeechGenerationModelType = { architecture: string; models: Array<{ name: string; - loraSupport: boolean; links: string[]; }>; }; @@ -13,7 +12,6 @@ export const SPEECH_GENERATION_MODELS: SpeechGenerationModelType[] = [ models: [ { name: 'SpeechT5 TTS', - loraSupport: false, links: ['https://huggingface.co/microsoft/speecht5_tts'], }, ], From 6629ea518708ca4c6e31ba85870dbceffebfb81a Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 7 Nov 2025 14:11:04 +0400 Subject: [PATCH 04/12] Fix links relative to repo root --- .github/CONTRIBUTING.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 55d6e655db..407f422029 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -1,5 +1,5 @@ -1. See [pull_request_template.md](./pull_request_template.md) for pull request (PR) requirements. -2. See [BUILD.md](../src/docs/BUILD.md) for instructions on how to build `OpenVINO™ GenAI`. +1. See [pull_request_template.md](/.github/pull_request_template.md) for pull request (PR) requirements. +2. See [BUILD.md](/src/docs/BUILD.md) for instructions on how to build `OpenVINO™ GenAI`. 3. Code style is determined by the file the change is made in. If ambiguous, look into the neighboring files of the same type. In case of contradiction, pick any of the options but stay consistent in your choice. 4. Don't push branches directly to the upstream repository. Once a branch is pushed to upstream, non-admins lose push access to it, preventing you from updating your changes. Instead, push to your fork and open PRs from there. 5. Your PR will be tested after one of the developers approves the tests run. From e6e7a4c4cb052ba30bdee0db396a696cb465308e Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 7 Nov 2025 14:14:49 +0400 Subject: [PATCH 05/12] Remove outdated docker document --- src/docs/DOCKER.md | 88 ---------------------------------------------- 1 file changed, 88 deletions(-) delete mode 100644 src/docs/DOCKER.md diff --git a/src/docs/DOCKER.md b/src/docs/DOCKER.md deleted file mode 100644 index 3d27eba19e..0000000000 --- a/src/docs/DOCKER.md +++ /dev/null @@ -1,88 +0,0 @@ -# Building openvino_llm:latest genai docker image -```Bash -git clone --branch ct-beam-search https://github.com/ilya-lavrenov/openvino.genai.git -git submodule update --remote --init -cd text_generation/causal_lm/cpp/continuous_batching/ -make -``` - -```Bash -cd ../../../.. -docker run -it -v `pwd`:/workspace/openvino.genai/ openvino_llm:latest -cd /workspace/openvino.genai/ -cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ && cmake --build ./build/ -j -``` - -# Downloading LLM models -```Bash -cd /workspace/openvino.genai/text_generation/causal_lm/cpp/continuous_batching/ -optimum-cli export openvino --model facebook/opt-125m ./ov_model -``` - -# Running throuput benchmark application -```Bash -cd /workspace/openvino.genai/ -./build/text_generation/causal_lm/cpp/continuous_batching/apps/throughput_benchmark --model /workspace/openvino.genai/text_generation/causal_lm/cpp/continuous_batching/ov_model --dataset /workspace/ShareGPT_V3_unfiltered_cleaned_split.json --dynamic_split_fuse --num_prompts 100 --device CPU --plugin_config {/"ENABLE_PROFILING/":true} -``` - - -# How to create environment to debug and develop continuous batching project with OpenVINO: - -1. Build OpenVINO with python bindings: -``` -cd /path/to/openvino -mkdir build -cd build -cmake -DCMAKE_BUILD_TYPE={ov_build_type} .. -make -j24 -``` -2. Set PYTHONPATH, LD_LIBRARY_PATH and OpenVINO_DIR environment variables: -``` -export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/path/to/openvino/bin/intel64/{ov_build_type} -export PYTHONPATH=${PYTHONPATH}:/path/to/openvino/bin/intel64/Release/python:/path/to/openvino/tools/ovc -export OpenVINO_DIR=/path/to/openvino/{ov_build_type} -``` -3. Build OpenVINO tokenizers: -``` -cd /path/to/openvino.genai/thirdparty/openvino_tokenizers -mkdir build -cd build -cmake -DCMAKE_BUILD_TYPE={ov_build_type} .. -make -j24 -``` -4. Create virtual environment to generate models and run python tests: -> NOTE: Comment installation of `openvino` and `openvino_tokenizers` to your env in `/path/to/openvino.genai/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt -``` -cd /path/to/openvino.genai/text_generation/causal_lm/cpp/continuous_batching -python3 -m venv .env -source .env/bin/activate -pip3 install -r python/tests/requirements.txt -``` -5. Install `openvino_tokenizers` to your virtual environment: -``` -cd /path/to/openvino.genai/thirdparty/openvino_tokenizers -export OpenVINO_DIR=/path/to/openvino/build -pip install --no-deps . -``` -6. Create build directory in `continuous batching` project: -``` -mkdir /path/to/openvino.genai/text_generation/causal_lm/cpp/continuous_batching/build -``` -7. Generate cmake project: -``` -cd build -cmake -DCMAKE_BUILD_TYPE=Debug -DOpenVINO_DIR=/path/to/openvino/build .. -``` -8. Build the project -``` -make -j24 -``` -9. Extend `PYTHONPATH` by `continuous batching`: -``` -export PYTHONPATH=${PYTHONPATH}:/path/to/openvino.genai/text_generation/causal_lm/cpp/continuous_batching/build/python -``` -10. Run python tests: -``` -cd python/tests -pytest . -``` From 8fcd6cae50b69ae5582d94f3286f1b8c040c17c1 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 7 Nov 2025 14:21:44 +0400 Subject: [PATCH 06/12] Remove duplicated how it works document and assets, update link to docs --- src/README.md | 2 +- src/docs/HOW_IT_WORKS.md | 25 ------------------------- src/docs/beam_idx-drop.gif | 3 --- src/docs/beam_idx-fork.gif | 3 --- src/docs/stateful.jpg | 3 --- src/docs/stateless.jpg | 3 --- 6 files changed, 1 insertion(+), 38 deletions(-) delete mode 100644 src/docs/HOW_IT_WORKS.md delete mode 100644 src/docs/beam_idx-drop.gif delete mode 100644 src/docs/beam_idx-fork.gif delete mode 100644 src/docs/stateful.jpg delete mode 100644 src/docs/stateless.jpg diff --git a/src/README.md b/src/README.md index d54ed69da0..6651ef6291 100644 --- a/src/README.md +++ b/src/README.md @@ -539,7 +539,7 @@ print(tokens.input_ids.shape) ## How It Works -For information on how OpenVINO™ GenAI works, refer to the [How It Works Section](./docs/HOW_IT_WORKS.md). +For information on how OpenVINO™ GenAI works, refer to the [How It Works](https://openvinotoolkit.github.io/openvino.genai/docs/concepts/how-it-works) page. ## Supported Models diff --git a/src/docs/HOW_IT_WORKS.md b/src/docs/HOW_IT_WORKS.md deleted file mode 100644 index 075ab1e9c2..0000000000 --- a/src/docs/HOW_IT_WORKS.md +++ /dev/null @@ -1,25 +0,0 @@ -# OpenVINO™ GenAI: How it works - -## Stateful LLM - -A common optimization for LLM inference is using a past KV (key/value)-cache. This cache is represented by the corresponding inputs and outputs in a model originally implemented in a DL framework (e.g. PyTorch models from Hugging Face). For further optimization and easier use, the model is transformed to a stateful form. This transformation improves inference performance and decreases the allocated runtime memory in long-running text generation scenarios. It is achieved by hiding inputs and outputs of the model that represent past KV-cache tensors and handling them inside the model in a more efficient way. Although the cache is still accessible with state API. It is opposed to stateless model approach requiring manipulating these inputs and outputs explicitly. An introduction to the stateful models can be found in the [Stateful Models article](https://docs.openvino.ai/2025/openvino-workflow/running-inference/stateful-models.html). - -Hiding KV-cache introduces a peculiarity for beam search algorithm. Beam search suggests batched inference of multiple beams. The design described here so far would result in generating multiple independent sequences of tokens. Beam search algorithm, on the other hand, requires removing some of the ongoing beams and splitting other beams to multiple branches. Beam removal requires deleting corresponding KV-cache entry and beam splitting requires copying corresponding KV-cache values. - -To provide the possibility to implement beam search without accessing model's internal state, a stateful LLM converted with `optimum-intel` or [llm_bench](../../tools/llm_bench) introduces an additional 1-dimentional `beam_idx` input. `beam_idx` must contain indexes of elements in a batch which are intended to be selected and will evolve during the next beam search iteration. There's only one beam when the generation starts. That beam corresponds to the initial prompt. `beam_idx` must have values: `[0, 0]` to keep the initial beam and introduce its copy. The dynamic batch size enables to change the number of beams dynamically. `beam_idx` must have `[1]` as the value to remove zeroth sequence and keep the second beam only. - -Assume there are two running beams. To proceed with generating both beams at the next iteration, `beam_idx` values must be `[0, 1]`, pointing to batch elements `0` and `1`. To drop the last beam and split the other beam in two, `beam_idx` must be set to `[0, 0]`. This results in utilizing only the part of KV cache corresponding to the zeroth element in the batch. The process of selecting proper entries in cache is called Cache Reorder. - -![](beam_idx-fork.gif) -![](beam_idx-drop.gif) - -The images below represent stateless and stateful LLM pipelines. The model has 4 inputs: -1. `input_ids` contains the next selected token -2. `attention_mask` is filled with `1` -3. `position_ids` encodes a position of currently generating token in the sequence -4. `beam_idx` selects beams - -The model has 1 output `logits` describing the predicted distribution over the next tokens. And there's KV cache state. - -![](stateless.jpg) -![](stateful.jpg) diff --git a/src/docs/beam_idx-drop.gif b/src/docs/beam_idx-drop.gif deleted file mode 100644 index 1c0f596d06..0000000000 --- a/src/docs/beam_idx-drop.gif +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:701839c28ac1e05c1c9e23823c74a10149a343210192e51df36e563ff6e257e4 -size 5700875 diff --git a/src/docs/beam_idx-fork.gif b/src/docs/beam_idx-fork.gif deleted file mode 100644 index 6255595bfd..0000000000 --- a/src/docs/beam_idx-fork.gif +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:292753b30a2153c92cedf16672ba182a851ec30c95c309cdaca13173f00fe700 -size 6062552 diff --git a/src/docs/stateful.jpg b/src/docs/stateful.jpg deleted file mode 100644 index 11e7f68e23..0000000000 --- a/src/docs/stateful.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a6fb5ab9990c845eef8847bdf76799fcaefe0a9afa10fb9d07f6df4394a9e2ad -size 129471 diff --git a/src/docs/stateless.jpg b/src/docs/stateless.jpg deleted file mode 100644 index 0e8823e77e..0000000000 --- a/src/docs/stateless.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:20904ff7a8793359b978cfcdc85c482e0764291af17b572936955f586e202ea9 -size 113440 From dac198e3cd22951388a84fe1db05ceacab511364 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 7 Nov 2025 14:27:59 +0400 Subject: [PATCH 07/12] Remove duplicated workflow image, update link --- README.md | 2 +- src/docs/openvino_genai.svg | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) delete mode 100644 src/docs/openvino_genai.svg diff --git a/README.md b/README.md index 19b060d4b2..0d72465ab9 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ ![Python](https://img.shields.io/badge/python-3.10+-green) ![OS](https://img.shields.io/badge/OS-Linux_|_Windows_|_MacOS-blue) -![](src/docs/openvino_genai.svg) +![](site/static/img/openvino-genai-workflow.svg) diff --git a/src/docs/openvino_genai.svg b/src/docs/openvino_genai.svg deleted file mode 100644 index 1517985d4c..0000000000 --- a/src/docs/openvino_genai.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:703732cd6a85f2cbcfd0915d63c10483114f05b71b834d2228501700074d0053 -size 1053573 From 7f1b27774b0e60946dcb21e3490f6273e5a82c4f Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 7 Nov 2025 14:37:55 +0400 Subject: [PATCH 08/12] Remove duplicated debug log document, update link --- site/docs/guides/debug-logging.mdx | 10 ++++ src/README.md | 2 +- src/docs/DEBUG_LOG.md | 79 ------------------------------ 3 files changed, 11 insertions(+), 80 deletions(-) delete mode 100644 src/docs/DEBUG_LOG.md diff --git a/site/docs/guides/debug-logging.mdx b/site/docs/guides/debug-logging.mdx index 9a774b9fd0..fd91dd8d82 100644 --- a/site/docs/guides/debug-logging.mdx +++ b/site/docs/guides/debug-logging.mdx @@ -76,3 +76,13 @@ Accepted token rate, %: 51 =============================== Request_id: 0 ||| 40 0 40 20 0 0 40 40 0 20 20 20 0 40 0 0 20 80 0 80 20 0 0 0 40 80 0 40 60 40 80 0 0 0 0 40 20 20 0 40 20 40 0 20 0 0 0 ``` + +When GGUF model passed to pipeline, the details debug info will be also printed. + +```sh title="Output:" +[GGUF Reader]: Loading and unpacking model from: gguf_models/qwen2.5-0.5b-instruct-q4_0.gguf +[GGUF Reader]: Loading and unpacking model done. Time: 196ms +[GGUF Reader]: Start generating OpenVINO model... +[GGUF Reader]: Save generated OpenVINO model to: gguf_models/openvino_model.xml done. Time: 466 ms +[GGUF Reader]: Model generation done. Time: 757ms +``` diff --git a/src/README.md b/src/README.md index 6651ef6291..d43cd35352 100644 --- a/src/README.md +++ b/src/README.md @@ -547,4 +547,4 @@ For a list of supported models, refer to the [Supported Models](https://openvino ## Debug Log -For using debug log, refer to [DEBUG Log](./doc/DEBUG_LOG.md). +For using debug log, refer to the [Debug Logging](https://openvinotoolkit.github.io/openvino.genai/docs/guides/debug-logging) page. diff --git a/src/docs/DEBUG_LOG.md b/src/docs/DEBUG_LOG.md deleted file mode 100644 index 50b59b6eae..0000000000 --- a/src/docs/DEBUG_LOG.md +++ /dev/null @@ -1,79 +0,0 @@ -## 1. Using Debug Log - -There are six levels of logs, which can be called explicitly or set via the ``OPENVINO_LOG_LEVEL`` environment variable: - -0 - ``ov::log::Level::NO`` -1 - ``ov::log::Level::ERR`` -2 - ``ov::log::Level::WARNING`` -3 - ``ov::log::Level::INFO`` -4 - ``ov::log::Level::DEBUG`` -5 - ``ov::log::Level::TRACE`` - -When setting the environment variable OPENVINO_LOG_LEVEL > ov::log::Level::WARNING, the properties of the compiled model can be printed. - -For example: - -Linux - export OPENVINO_LOG_LEVEL=3 -Windows - set OPENVINO_LOG_LEVEL=3 - -the properties of the compiled model are printed as follows: -```sh - NETWORK_NAME: Model0 - OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1 - NUM_STREAMS: 1 - INFERENCE_NUM_THREADS: 48 - PERF_COUNT: NO - INFERENCE_PRECISION_HINT: bf16 - PERFORMANCE_HINT: LATENCY - EXECUTION_MODE_HINT: PERFORMANCE - PERFORMANCE_HINT_NUM_REQUESTS: 0 - ENABLE_CPU_PINNING: YES - SCHEDULING_CORE_TYPE: ANY_CORE - MODEL_DISTRIBUTION_POLICY: - ENABLE_HYPER_THREADING: NO - EXECUTION_DEVICES: CPU - CPU_DENORMALS_OPTIMIZATION: NO - LOG_LEVEL: LOG_NONE - CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1 - DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - KV_CACHE_PRECISION: f16 - AFFINITY: CORE - EXECUTION_DEVICES: - CPU: Intel(R) Xeon(R) Platinum 8468 -``` - -When Speculative Decoding or Prompt Lookup pipeline is executed, performance metrics will be also printed. - -For example: - -``` -=============================== -Total duration, sec: 26.6217 -Draft model duration, sec: 1.60329 -Main model duration, sec: 25.0184 -Draft model duration, %: 6.02248 -Main model duration, %: 93.9775 -AVG acceptance rate, %: 21.6809 -=============================== -REQUEST_ID: 0 -Main model iterations: 47 -Token per sec: 3.75633 -AVG acceptance rate, %: 21.6809 -Accepted tokens by draft model: 51 -Generated tokens: 100 -Accepted token rate, %: 51 -=============================== -Request_id: 0 ||| 40 0 40 20 0 0 40 40 0 20 20 20 0 40 0 0 20 80 0 80 20 0 0 0 40 80 0 40 60 40 80 0 0 0 0 40 20 20 0 40 20 40 0 20 0 0 0 -``` - - -When GGUF model passed to LLMPipeline, the details debug info will be also printed. - -For example: -``` -[GGUF Reader]: Loading and unpacking model from: gguf_models/qwen2.5-0.5b-instruct-q4_0.gguf -[GGUF Reader]: Loading and unpacking model done. Time: 196ms -[GGUF Reader]: Start generating OpenVINO model... -[GGUF Reader]: Save generated OpenVINO model to: gguf_models/openvino_model.xml done. Time: 466 ms -[GGUF Reader]: Model generation done. Time: 757ms -``` From 759d0cea20a9c288b674401d2ac20acebf1a261d Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 7 Nov 2025 15:59:35 +0400 Subject: [PATCH 09/12] Update model table components --- .../_components/speech-generation-models-table/index.tsx | 8 +++----- .../_components/vlm-models-table/index.tsx | 8 +++----- .../_components/whisper-models-table/index.tsx | 8 +++----- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/site/docs/supported-models/_components/speech-generation-models-table/index.tsx b/site/docs/supported-models/_components/speech-generation-models-table/index.tsx index f6d5d87b55..a9e6a0bf99 100644 --- a/site/docs/supported-models/_components/speech-generation-models-table/index.tsx +++ b/site/docs/supported-models/_components/speech-generation-models-table/index.tsx @@ -1,9 +1,9 @@ import React from 'react'; -import { BaseModelsTable, LinksCell, StatusCell } from '../base-models-table'; +import { BaseModelsTable, LinksCell } from '../base-models-table'; import { SPEECH_GENERATION_MODELS } from './models'; export default function SpeechGenerationModelsTable(): React.JSX.Element { - const headers = ['Architecture', 'Models', 'LoRA Support', 'Example HuggingFace Models']; + const headers = ['Architecture', 'Models', 'Example HuggingFace Models']; const rows = SPEECH_GENERATION_MODELS.map(({ architecture, models }) => ( <> @@ -12,13 +12,11 @@ export default function SpeechGenerationModelsTable(): React.JSX.Element { {architecture} {models[0].name} - - {models.slice(1).map(({ name, loraSupport, links }) => ( + {models.slice(1).map(({ name, links }) => ( {name} - ))} diff --git a/site/docs/supported-models/_components/vlm-models-table/index.tsx b/site/docs/supported-models/_components/vlm-models-table/index.tsx index 765a3fd8fb..210d4afd12 100644 --- a/site/docs/supported-models/_components/vlm-models-table/index.tsx +++ b/site/docs/supported-models/_components/vlm-models-table/index.tsx @@ -1,10 +1,10 @@ import Link from '@docusaurus/Link'; import React from 'react'; -import { BaseModelsTable, LinksCell, StatusCell } from '../base-models-table'; +import { BaseModelsTable, LinksCell } from '../base-models-table'; import { VLM_MODELS } from './models'; export default function VLMModelsTable(): React.JSX.Element { - const headers = ['Architecture', 'Models', 'LoRA Support', 'Example HuggingFace Models']; + const headers = ['Architecture', 'Models', 'Example HuggingFace Models']; const rows = VLM_MODELS.map(({ architecture, models }) => ( <> @@ -20,13 +20,11 @@ export default function VLMModelsTable(): React.JSX.Element { )} - - {models.slice(1).map(({ name, loraSupport, links }) => ( + {models.slice(1).map(({ name, links }) => ( {name} - ))} diff --git a/site/docs/supported-models/_components/whisper-models-table/index.tsx b/site/docs/supported-models/_components/whisper-models-table/index.tsx index b48b1dfb54..65a1ae5fd4 100644 --- a/site/docs/supported-models/_components/whisper-models-table/index.tsx +++ b/site/docs/supported-models/_components/whisper-models-table/index.tsx @@ -1,9 +1,9 @@ import React from 'react'; -import { BaseModelsTable, LinksCell, StatusCell } from '../base-models-table'; +import { BaseModelsTable, LinksCell } from '../base-models-table'; import { WHISPER_MODELS } from './models'; export default function WhisperModelsTable(): React.JSX.Element { - const headers = ['Architecture', 'Models', 'LoRA Support', 'Example HuggingFace Models']; + const headers = ['Architecture', 'Models', 'Example HuggingFace Models']; const rows = WHISPER_MODELS.map(({ architecture, models }) => ( <> @@ -12,13 +12,11 @@ export default function WhisperModelsTable(): React.JSX.Element { {architecture} {models[0].name} - - {models.slice(1).map(({ name, loraSupport, links }) => ( + {models.slice(1).map(({ name, links }) => ( {name} - ))} From 204c3088c8f5f2a4aa176cd2b098ba283edd6d19 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 7 Nov 2025 18:52:15 +0400 Subject: [PATCH 10/12] Add lora support notes for each use case in models list --- site/docs/supported-models/index.mdx | 56 ++++++++++++++++------------ 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/site/docs/supported-models/index.mdx b/site/docs/supported-models/index.mdx index a76ad62732..7090113c8c 100644 --- a/site/docs/supported-models/index.mdx +++ b/site/docs/supported-models/index.mdx @@ -9,26 +9,22 @@ import TextRerankModelsTable from './_components/text-rerank-models-table'; # Supported Models -:::info - +:::info Models Compatibility Other models with similar architectures may also work successfully even if not explicitly validated. Consider testing any unlisted models to verify compatibility with your specific use case. - ::: ## Large Language Models (LLMs) - - -:::info - -LoRA adapters are supported. - +:::tip LoRA Support +LLM pipeline supports LoRA adapters. ::: + + ::::info -The pipeline can work with other similar topologies produced by `optimum-intel` with the same model signature. +The LLM pipeline can work with other similar topologies produced by `optimum-intel` with the same model signature. The model is required to have the following inputs after the conversion: 1. `input_ids` contains the tokens. @@ -50,6 +46,10 @@ Models should belong to the same family and have the same tokenizers. ## Visual Language Models (VLMs) +:::info LoRA Support +VLM pipeline does **not** support LoRA adapters. +::: + :::warning VLM Models Notes @@ -62,7 +62,7 @@ pip install timm einops ``` #### MiniCPMO {#minicpm-o-notes} -1. `openbmb/MiniCPM-o-2_6` doesn't support transformers>=4.52 which is required for `optimum-cli` export. +1. `openbmb/MiniCPM-o-2_6` doesn't support `transformers>=4.52` which is required for `optimum-cli` export. 2. `--task image-text-to-text` is required for `optimum-cli export openvino --trust-remote-code` because `image-text-to-text` isn't `MiniCPM-o-2_6`'s native task. #### phi3_v {#phi3_v-notes} @@ -73,42 +73,52 @@ generation_config.set_eos_token_id(pipe.get_tokenizer().get_eos_token_id()) ``` #### phi4mm {#phi4mm-notes} -Apply https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/78/files to fix the model export for transformers>=4.50 +Apply https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/78/files to fix the model export for `transformers>=4.50` ::: ## Speech Recognition Models (Whisper-based) +:::info LoRA Support +Speech recognition pipeline does **not** support LoRA adapters. +::: + ## Speech Generation Models +:::info LoRA Support +Speech generation pipeline does **not** support LoRA adapters. +::: + ## Text Embeddings Models - - -:::info -LoRA adapters are not supported. +:::info LoRA Support +Text embeddings pipeline does **not** support LoRA adapters. ::: -:::info + + +:::warning Text Embeddings Models Notes Qwen3 Embedding models require `--task feature-extraction` during the conversion with `optimum-cli`. ::: ## Text Rerank Models - - -:::info -LoRA adapters are not supported. +:::info LoRA Support +Text rerank pipeline does **not** support LoRA adapters. ::: -:::info + + +:::warning Text Rerank Models Notes Text Rerank models require appropriate `--task` provided during the conversion with `optimum-cli`. Task can be found in the table above. ::: -:::info +___ + +:::info Hugging Face Notes Some models may require access request submission on the Hugging Face page to be downloaded. If https://huggingface.co/ is down, the conversion step won't be able to download the models. From 508bc8523c200073c945b1ee61df0b0c8128078c Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 7 Nov 2025 18:52:57 +0400 Subject: [PATCH 11/12] Apply copilot wording suggestion --- site/docs/guides/debug-logging.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/site/docs/guides/debug-logging.mdx b/site/docs/guides/debug-logging.mdx index fd91dd8d82..3e8a32a4f3 100644 --- a/site/docs/guides/debug-logging.mdx +++ b/site/docs/guides/debug-logging.mdx @@ -77,7 +77,7 @@ Accepted token rate, %: 51 Request_id: 0 ||| 40 0 40 20 0 0 40 40 0 20 20 20 0 40 0 0 20 80 0 80 20 0 0 0 40 80 0 40 60 40 80 0 0 0 0 40 20 20 0 40 20 40 0 20 0 0 0 ``` -When GGUF model passed to pipeline, the details debug info will be also printed. +When a GGUF model is passed to the pipeline, the detailed debug info will also be printed. ```sh title="Output:" [GGUF Reader]: Loading and unpacking model from: gguf_models/qwen2.5-0.5b-instruct-q4_0.gguf From 7ee2c735785a736ad5b553731d84f29fa472d385 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 7 Nov 2025 18:58:07 +0400 Subject: [PATCH 12/12] Replace duplicated tokenization section in readme with docs link --- src/README.md | 95 +-------------------------------------------------- 1 file changed, 1 insertion(+), 94 deletions(-) diff --git a/src/README.md b/src/README.md index d43cd35352..6026af738b 100644 --- a/src/README.md +++ b/src/README.md @@ -442,100 +442,7 @@ Structured output enforcement guarantees correct JSON formatting, but does not e ### Tokenization -OpenVINO™ GenAI provides a way to tokenize and detokenize text using the `ov::genai::Tokenizer` class. The `Tokenizer` is a high level abstraction over the OpenVINO Tokenizers library. - -It can be initialized from the path, in-memory IR representation or obtained from the `ov::genai::LLMPipeline` object. - -```cpp -// Initialize from the path -#include "openvino/genai/llm_pipeline.hpp" -auto tokenizer = ov::genai::Tokenizer(models_path); - -// Get instance of Tokenizer from LLMPipeline. -auto pipe = ov::genai::LLMPipeline pipe(models_path, "CPU"); -auto tokenzier = pipe.get_tokenizer(); -```` - -```python -import openvino_genai as ov_genai -tokenizer = ov_genai.Tokenizer(models_path) - -# Or from LLMPipeline. -pipe = ov_genai.LLMPipeline(models_path, "CPU") -tokenizer = pipe.get_tokenizer() -``` - -`Tokenizer` has `encode` and `decode` methods which support the following arguments: `add_special_tokens`, `skip_special_tokens`, `pad_to_max_length`, `max_length` arguments. - -In order to disable adding special tokens do the following, in C++: -```cpp -auto tokens = tokenizer.encode("The Sun is yellow because", ov::genai::add_special_tokens(false)); -``` - -In Python: -```python -tokens = tokenizer.encode("The Sun is yellow because", add_special_tokens=False) -``` -The `encode` method returns a `TokenizedInputs` object containing `input_ids` and `attention_mask`, both stored as ov::Tensor. Since ov::Tensor requires fixed-length sequences, padding is applied to match the longest sequence in a batch, ensuring a uniform shape. Also resulting sequence is truncated by `max_length`. If this value is not defined by used, it's is taken from the IR. - -Both padding and `max_length` can be controlled by the user. If `pad_to_max_length` is set to true, then instead of padding to the longest sequence it will be padded to the `max_length`. - -Below are example how padding can be controlled, in C++: -```cpp -#include "openvino/genai/llm_pipeline.hpp" -auto tokenizer = ov::genai::Tokenizer(models_path); -std::vector prompts = {"The Sun is yellow because", "The"}; - -// Since prompt is definitely shorter than maximal length (which is taken from IR) will not affect shape. -// Resulting shape is defined by length of the longest tokens sequence. -// Equivalent of HuggingFace hf_tokenizer.encode(prompt, padding="longest", truncation=True) -tokens = tokenizer.encode({"The Sun is yellow because", "The"}) -// or is equivalent to -tokens = tokenizer.encode({"The Sun is yellow because", "The"}, ov::genai::pad_to_max_length(False)) -// out_shape: [2, 6] - -// Resulting tokens tensor will be padded to 1024. -// Equivalent of HuggingFace hf_tokenizer.encode(prompt, padding="max_length", truncation=True, max_length=1024) -tokens = tokenizer.encode({"The Sun is yellow because", - "The", - std::string(2000, 'n')}, ov::genai::pad_to_max_length(True), ov::genai::max_length(1024)) -// out_shape: [3, 1024] - -// For single string prompts truncation and padding are also applied. -tokens = tokenizer.encode({"The Sun is yellow because"}, ov::genai::pad_to_max_length(True), ov::genai::max_length(1024)) -// out_shape: [1, 128] -``` - -In Python: -```python -import openvino_genai as ov_genai - -tokenizer = ov_genai.Tokenizer(models_path) -prompts = ["The Sun is yellow because", "The"] - -# Since prompt is definitely shorter than maximal length (which is taken from IR) will not affect shape. -# Resulting shape is defined by length of the longest tokens sequence. -# Equivalent of HuggingFace hf_tokenizer.encode(prompt, padding="longest", truncation=True) -tokens = tokenizer.encode(["The Sun is yellow because", "The"]) -# or is equivalent to -tokens = tokenizer.encode(["The Sun is yellow because", "The"], pad_to_max_length=False) -print(tokens.input_ids.shape) -# out_shape: [2, 6] - -# Resulting tokens tensor will be padded to 1024, sequences which exceed this length will be truncated. -# Equivalent of HuggingFace hf_tokenizer.encode(prompt, padding="max_length", truncation=True, max_length=1024) -tokens = tokenizer.encode(["The Sun is yellow because", - "The" - "The longest string ever" * 2000], pad_to_max_length=True, max_length=1024) -print(tokens.input_ids.shape) -# out_shape: [3, 1024] - -# For single string prompts truncation and padding are also applied. -tokens = tokenizer.encode("The Sun is yellow because", pad_to_max_length=True, max_length=128) -print(tokens.input_ids.shape) -# out_shape: [1, 128] - -``` +Refer to the [Tokenization](https://openvinotoolkit.github.io/openvino.genai/docs/guides/tokenization) page for details and usage examples. ## How It Works