diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 55d6e655db..407f422029 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -1,5 +1,5 @@ -1. See [pull_request_template.md](./pull_request_template.md) for pull request (PR) requirements. -2. See [BUILD.md](../src/docs/BUILD.md) for instructions on how to build `OpenVINO™ GenAI`. +1. See [pull_request_template.md](/.github/pull_request_template.md) for pull request (PR) requirements. +2. See [BUILD.md](/src/docs/BUILD.md) for instructions on how to build `OpenVINO™ GenAI`. 3. Code style is determined by the file the change is made in. If ambiguous, look into the neighboring files of the same type. In case of contradiction, pick any of the options but stay consistent in your choice. 4. Don't push branches directly to the upstream repository. Once a branch is pushed to upstream, non-admins lose push access to it, preventing you from updating your changes. Instead, push to your fork and open PRs from there. 5. Your PR will be tested after one of the developers approves the tests run. diff --git a/README.md b/README.md index 19b060d4b2..0d72465ab9 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ ![Python](https://img.shields.io/badge/python-3.10+-green) ![OS](https://img.shields.io/badge/OS-Linux_|_Windows_|_MacOS-blue) -![](src/docs/openvino_genai.svg) +![](site/static/img/openvino-genai-workflow.svg) diff --git a/site/docs/guides/debug-logging.mdx b/site/docs/guides/debug-logging.mdx index 9a774b9fd0..3e8a32a4f3 100644 --- a/site/docs/guides/debug-logging.mdx +++ b/site/docs/guides/debug-logging.mdx @@ -76,3 +76,13 @@ Accepted token rate, %: 51 =============================== Request_id: 0 ||| 40 0 40 20 0 0 40 40 0 20 20 20 0 40 0 0 20 80 0 80 20 0 0 0 40 80 0 40 60 40 80 0 0 0 0 40 20 20 0 40 20 40 0 20 0 0 0 ``` + +When a GGUF model is passed to the pipeline, detailed debug information is also printed. + +```sh title="Output:" +[GGUF Reader]: Loading and unpacking model from: gguf_models/qwen2.5-0.5b-instruct-q4_0.gguf +[GGUF Reader]: Loading and unpacking model done. Time: 196ms +[GGUF Reader]: Start generating OpenVINO model... +[GGUF Reader]: Save generated OpenVINO model to: gguf_models/openvino_model.xml done. Time: 466 ms +[GGUF Reader]: Model generation done. 
Time: 757ms +``` diff --git a/site/docs/supported-models/_components/speech-generation-models-table/index.tsx b/site/docs/supported-models/_components/speech-generation-models-table/index.tsx index f6d5d87b55..a9e6a0bf99 100644 --- a/site/docs/supported-models/_components/speech-generation-models-table/index.tsx +++ b/site/docs/supported-models/_components/speech-generation-models-table/index.tsx @@ -1,9 +1,9 @@ import React from 'react'; -import { BaseModelsTable, LinksCell, StatusCell } from '../base-models-table'; +import { BaseModelsTable, LinksCell } from '../base-models-table'; import { SPEECH_GENERATION_MODELS } from './models'; export default function SpeechGenerationModelsTable(): React.JSX.Element { - const headers = ['Architecture', 'Models', 'LoRA Support', 'Example HuggingFace Models']; + const headers = ['Architecture', 'Models', 'Example HuggingFace Models']; const rows = SPEECH_GENERATION_MODELS.map(({ architecture, models }) => ( <> @@ -12,13 +12,11 @@ export default function SpeechGenerationModelsTable(): React.JSX.Element { {architecture} {models[0].name} - - {models.slice(1).map(({ name, loraSupport, links }) => ( + {models.slice(1).map(({ name, links }) => ( {name} - ))} diff --git a/site/docs/supported-models/_components/speech-generation-models-table/models.ts b/site/docs/supported-models/_components/speech-generation-models-table/models.ts index 4e415bb473..285f8c613a 100644 --- a/site/docs/supported-models/_components/speech-generation-models-table/models.ts +++ b/site/docs/supported-models/_components/speech-generation-models-table/models.ts @@ -2,7 +2,6 @@ type SpeechGenerationModelType = { architecture: string; models: Array<{ name: string; - loraSupport: boolean; links: string[]; }>; }; @@ -13,7 +12,6 @@ export const SPEECH_GENERATION_MODELS: SpeechGenerationModelType[] = [ models: [ { name: 'SpeechT5 TTS', - loraSupport: false, links: ['https://huggingface.co/microsoft/speecht5_tts'], }, ], diff --git a/site/docs/supported-models/_components/vlm-models-table/index.tsx b/site/docs/supported-models/_components/vlm-models-table/index.tsx index 765a3fd8fb..210d4afd12 100644 --- a/site/docs/supported-models/_components/vlm-models-table/index.tsx +++ b/site/docs/supported-models/_components/vlm-models-table/index.tsx @@ -1,10 +1,10 @@ import Link from '@docusaurus/Link'; import React from 'react'; -import { BaseModelsTable, LinksCell, StatusCell } from '../base-models-table'; +import { BaseModelsTable, LinksCell } from '../base-models-table'; import { VLM_MODELS } from './models'; export default function VLMModelsTable(): React.JSX.Element { - const headers = ['Architecture', 'Models', 'LoRA Support', 'Example HuggingFace Models']; + const headers = ['Architecture', 'Models', 'Example HuggingFace Models']; const rows = VLM_MODELS.map(({ architecture, models }) => ( <> @@ -20,13 +20,11 @@ export default function VLMModelsTable(): React.JSX.Element { )} - - {models.slice(1).map(({ name, loraSupport, links }) => ( + {models.slice(1).map(({ name, links }) => ( {name} - ))} diff --git a/site/docs/supported-models/_components/vlm-models-table/models.ts b/site/docs/supported-models/_components/vlm-models-table/models.ts index 347fb02b0f..423369a7ba 100644 --- a/site/docs/supported-models/_components/vlm-models-table/models.ts +++ b/site/docs/supported-models/_components/vlm-models-table/models.ts @@ -2,7 +2,6 @@ type VLMModelType = { architecture: string; models: Array<{ name: string; - loraSupport: boolean; links: string[]; notesLink?: string; }>; @@ -14,7 +13,6 @@ export 
const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'InternVLChatModel', - loraSupport: false, links: [ 'https://huggingface.co/OpenGVLab/InternVL2-1B', 'https://huggingface.co/OpenGVLab/InternVL2-2B', @@ -39,7 +37,6 @@ export const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'LLaVA-v1.5', - loraSupport: false, links: ['https://huggingface.co/llava-hf/llava-1.5-7b-hf'], }, ], @@ -49,12 +46,10 @@ export const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'nanoLLaVA', - loraSupport: false, links: ['https://huggingface.co/qnguyen3/nanoLLaVA'], }, { name: 'nanoLLaVA-1.5', - loraSupport: false, links: ['https://huggingface.co/qnguyen3/nanoLLaVA-1.5'], }, ], @@ -64,7 +59,6 @@ export const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'LLaVA-v1.6', - loraSupport: false, links: [ 'https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf', 'https://huggingface.co/llava-hf/llava-v1.6-vicuna-7b-hf', @@ -78,7 +72,6 @@ export const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'LLaVA-Next-Video', - loraSupport: false, links: [ 'https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf', ], @@ -90,7 +83,6 @@ export const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'MiniCPM-o-2_6', - loraSupport: false, links: ['https://huggingface.co/openbmb/MiniCPM-o-2_6'], notesLink: '#minicpm-o-notes', }, @@ -101,7 +93,6 @@ export const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'MiniCPM-V-2_6', - loraSupport: false, links: ['https://huggingface.co/openbmb/MiniCPM-V-2_6'], }, ], @@ -111,7 +102,6 @@ export const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'phi3_v', - loraSupport: false, links: [ 'https://huggingface.co/microsoft/Phi-3-vision-128k-instruct', 'https://huggingface.co/microsoft/Phi-3.5-vision-instruct', @@ -125,7 +115,6 @@ export const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'phi4mm', - loraSupport: false, links: [ 'https://huggingface.co/microsoft/Phi-4-multimodal-instruct', ], @@ -138,7 +127,6 @@ export const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'Qwen2-VL', - loraSupport: false, links: [ 'https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct', 'https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct', @@ -153,7 +141,6 @@ export const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'Qwen2.5-VL', - loraSupport: false, links: [ 'https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct', 'https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct', @@ -166,7 +153,6 @@ export const VLM_MODELS: VLMModelType[] = [ models: [ { name: 'gemma3', - loraSupport: false, links: [ 'https://huggingface.co/google/gemma-3-4b-it', 'https://huggingface.co/google/gemma-3-12b-it', diff --git a/site/docs/supported-models/_components/whisper-models-table/index.tsx b/site/docs/supported-models/_components/whisper-models-table/index.tsx index b48b1dfb54..65a1ae5fd4 100644 --- a/site/docs/supported-models/_components/whisper-models-table/index.tsx +++ b/site/docs/supported-models/_components/whisper-models-table/index.tsx @@ -1,9 +1,9 @@ import React from 'react'; -import { BaseModelsTable, LinksCell, StatusCell } from '../base-models-table'; +import { BaseModelsTable, LinksCell } from '../base-models-table'; import { WHISPER_MODELS } from './models'; export default function WhisperModelsTable(): React.JSX.Element { - const headers = ['Architecture', 'Models', 'LoRA Support', 'Example HuggingFace Models']; + const headers = ['Architecture', 'Models', 'Example HuggingFace Models']; const rows = WHISPER_MODELS.map(({ architecture, models }) => ( <> @@ -12,13 +12,11 @@ export default function 
WhisperModelsTable(): React.JSX.Element { {architecture} {models[0].name} - - {models.slice(1).map(({ name, loraSupport, links }) => ( + {models.slice(1).map(({ name, links }) => ( {name} - ))} diff --git a/site/docs/supported-models/_components/whisper-models-table/models.ts b/site/docs/supported-models/_components/whisper-models-table/models.ts index 44177796b7..ac84eaeb98 100644 --- a/site/docs/supported-models/_components/whisper-models-table/models.ts +++ b/site/docs/supported-models/_components/whisper-models-table/models.ts @@ -2,7 +2,6 @@ type WhisperModelType = { architecture: string; models: Array<{ name: string; - loraSupport: boolean; links: string[]; }>; }; @@ -13,7 +12,6 @@ export const WHISPER_MODELS: WhisperModelType[] = [ models: [ { name: 'Whisper', - loraSupport: false, links: [ 'https://huggingface.co/openai/whisper-tiny', 'https://huggingface.co/openai/whisper-tiny.en', @@ -28,7 +26,6 @@ export const WHISPER_MODELS: WhisperModelType[] = [ }, { name: 'Distil-Whisper', - loraSupport: false, links: [ 'https://huggingface.co/distil-whisper/distil-small.en', 'https://huggingface.co/distil-whisper/distil-medium.en', diff --git a/site/docs/supported-models/index.mdx b/site/docs/supported-models/index.mdx index a76ad62732..7090113c8c 100644 --- a/site/docs/supported-models/index.mdx +++ b/site/docs/supported-models/index.mdx @@ -9,26 +9,22 @@ import TextRerankModelsTable from './_components/text-rerank-models-table'; # Supported Models -:::info - +:::info Models Compatibility Other models with similar architectures may also work successfully even if not explicitly validated. Consider testing any unlisted models to verify compatibility with your specific use case. - ::: ## Large Language Models (LLMs) - - -:::info - -LoRA adapters are supported. - +:::tip LoRA Support +LLM pipeline supports LoRA adapters. ::: + + ::::info -The pipeline can work with other similar topologies produced by `optimum-intel` with the same model signature. +The LLM pipeline can work with other similar topologies produced by `optimum-intel` with the same model signature. The model is required to have the following inputs after the conversion: 1. `input_ids` contains the tokens. @@ -50,6 +46,10 @@ Models should belong to the same family and have the same tokenizers. ## Visual Language Models (VLMs) +:::info LoRA Support +VLM pipeline does **not** support LoRA adapters. +::: + :::warning VLM Models Notes @@ -62,7 +62,7 @@ pip install timm einops ``` #### MiniCPMO {#minicpm-o-notes} -1. `openbmb/MiniCPM-o-2_6` doesn't support transformers>=4.52 which is required for `optimum-cli` export. +1. `openbmb/MiniCPM-o-2_6` doesn't support `transformers>=4.52`, which is required for `optimum-cli` export. 2. `--task image-text-to-text` is required for `optimum-cli export openvino --trust-remote-code` because `image-text-to-text` isn't `MiniCPM-o-2_6`'s native task. #### phi3_v {#phi3_v-notes} @@ -73,42 +73,52 @@ generation_config.set_eos_token_id(pipe.get_tokenizer().get_eos_token_id()) ``` #### phi4mm {#phi4mm-notes} -Apply https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/78/files to fix the model export for transformers>=4.50 +Apply https://huggingface.co/microsoft/Phi-4-multimodal-instruct/discussions/78/files to fix the model export for `transformers>=4.50`. ::: ## Speech Recognition Models (Whisper-based) +:::info LoRA Support +Speech recognition pipeline does **not** support LoRA adapters. 
+::: + ## Speech Generation Models +:::info LoRA Support +Speech generation pipeline does **not** support LoRA adapters. +::: + ## Text Embeddings Models - - -:::info -LoRA adapters are not supported. +:::info LoRA Support +Text embeddings pipeline does **not** support LoRA adapters. ::: -:::info + + +:::warning Text Embeddings Models Notes Qwen3 Embedding models require `--task feature-extraction` during the conversion with `optimum-cli`. ::: ## Text Rerank Models - - -:::info -LoRA adapters are not supported. +:::info LoRA Support +Text rerank pipeline does **not** support LoRA adapters. ::: -:::info + + +:::warning Text Rerank Models Notes Text Rerank models require appropriate `--task` provided during the conversion with `optimum-cli`. Task can be found in the table above. ::: -:::info +___ + +:::info Hugging Face Notes Some models may require access request submission on the Hugging Face page to be downloaded. If https://huggingface.co/ is down, the conversion step won't be able to download the models. diff --git a/src/README.md b/src/README.md index d54ed69da0..6026af738b 100644 --- a/src/README.md +++ b/src/README.md @@ -442,104 +442,11 @@ Structured output enforcement guarantees correct JSON formatting, but does not e ### Tokenization -OpenVINO™ GenAI provides a way to tokenize and detokenize text using the `ov::genai::Tokenizer` class. The `Tokenizer` is a high level abstraction over the OpenVINO Tokenizers library. - -It can be initialized from the path, in-memory IR representation or obtained from the `ov::genai::LLMPipeline` object. - -```cpp -// Initialize from the path -#include "openvino/genai/llm_pipeline.hpp" -auto tokenizer = ov::genai::Tokenizer(models_path); - -// Get instance of Tokenizer from LLMPipeline. -auto pipe = ov::genai::LLMPipeline pipe(models_path, "CPU"); -auto tokenzier = pipe.get_tokenizer(); -```` - -```python -import openvino_genai as ov_genai -tokenizer = ov_genai.Tokenizer(models_path) - -# Or from LLMPipeline. -pipe = ov_genai.LLMPipeline(models_path, "CPU") -tokenizer = pipe.get_tokenizer() -``` - -`Tokenizer` has `encode` and `decode` methods which support the following arguments: `add_special_tokens`, `skip_special_tokens`, `pad_to_max_length`, `max_length` arguments. - -In order to disable adding special tokens do the following, in C++: -```cpp -auto tokens = tokenizer.encode("The Sun is yellow because", ov::genai::add_special_tokens(false)); -``` - -In Python: -```python -tokens = tokenizer.encode("The Sun is yellow because", add_special_tokens=False) -``` -The `encode` method returns a `TokenizedInputs` object containing `input_ids` and `attention_mask`, both stored as ov::Tensor. Since ov::Tensor requires fixed-length sequences, padding is applied to match the longest sequence in a batch, ensuring a uniform shape. Also resulting sequence is truncated by `max_length`. If this value is not defined by used, it's is taken from the IR. - -Both padding and `max_length` can be controlled by the user. If `pad_to_max_length` is set to true, then instead of padding to the longest sequence it will be padded to the `max_length`. - -Below are example how padding can be controlled, in C++: -```cpp -#include "openvino/genai/llm_pipeline.hpp" -auto tokenizer = ov::genai::Tokenizer(models_path); -std::vector prompts = {"The Sun is yellow because", "The"}; - -// Since prompt is definitely shorter than maximal length (which is taken from IR) will not affect shape. -// Resulting shape is defined by length of the longest tokens sequence. 
-// Equivalent of HuggingFace hf_tokenizer.encode(prompt, padding="longest", truncation=True) -tokens = tokenizer.encode({"The Sun is yellow because", "The"}) -// or is equivalent to -tokens = tokenizer.encode({"The Sun is yellow because", "The"}, ov::genai::pad_to_max_length(False)) -// out_shape: [2, 6] - -// Resulting tokens tensor will be padded to 1024. -// Equivalent of HuggingFace hf_tokenizer.encode(prompt, padding="max_length", truncation=True, max_length=1024) -tokens = tokenizer.encode({"The Sun is yellow because", - "The", - std::string(2000, 'n')}, ov::genai::pad_to_max_length(True), ov::genai::max_length(1024)) -// out_shape: [3, 1024] - -// For single string prompts truncation and padding are also applied. -tokens = tokenizer.encode({"The Sun is yellow because"}, ov::genai::pad_to_max_length(True), ov::genai::max_length(1024)) -// out_shape: [1, 128] -``` - -In Python: -```python -import openvino_genai as ov_genai - -tokenizer = ov_genai.Tokenizer(models_path) -prompts = ["The Sun is yellow because", "The"] - -# Since prompt is definitely shorter than maximal length (which is taken from IR) will not affect shape. -# Resulting shape is defined by length of the longest tokens sequence. -# Equivalent of HuggingFace hf_tokenizer.encode(prompt, padding="longest", truncation=True) -tokens = tokenizer.encode(["The Sun is yellow because", "The"]) -# or is equivalent to -tokens = tokenizer.encode(["The Sun is yellow because", "The"], pad_to_max_length=False) -print(tokens.input_ids.shape) -# out_shape: [2, 6] - -# Resulting tokens tensor will be padded to 1024, sequences which exceed this length will be truncated. -# Equivalent of HuggingFace hf_tokenizer.encode(prompt, padding="max_length", truncation=True, max_length=1024) -tokens = tokenizer.encode(["The Sun is yellow because", - "The" - "The longest string ever" * 2000], pad_to_max_length=True, max_length=1024) -print(tokens.input_ids.shape) -# out_shape: [3, 1024] - -# For single string prompts truncation and padding are also applied. -tokens = tokenizer.encode("The Sun is yellow because", pad_to_max_length=True, max_length=128) -print(tokens.input_ids.shape) -# out_shape: [1, 128] - -``` +Refer to the [Tokenization](https://openvinotoolkit.github.io/openvino.genai/docs/guides/tokenization) page for details and usage examples. ## How It Works -For information on how OpenVINO™ GenAI works, refer to the [How It Works Section](./docs/HOW_IT_WORKS.md). +For information on how OpenVINO™ GenAI works, refer to the [How It Works](https://openvinotoolkit.github.io/openvino.genai/docs/concepts/how-it-works) page. ## Supported Models @@ -547,4 +454,4 @@ For a list of supported models, refer to the [Supported Models](https://openvino ## Debug Log -For using debug log, refer to [DEBUG Log](./doc/DEBUG_LOG.md). +For information on using debug logs, refer to the [Debug Logging](https://openvinotoolkit.github.io/openvino.genai/docs/guides/debug-logging) page. diff --git a/src/docs/DEBUG_LOG.md b/src/docs/DEBUG_LOG.md deleted file mode 100644 index 50b59b6eae..0000000000 --- a/src/docs/DEBUG_LOG.md +++ /dev/null @@ -1,79 +0,0 @@ -## 1. 
Using Debug Log - -There are six levels of logs, which can be called explicitly or set via the ``OPENVINO_LOG_LEVEL`` environment variable: - -0 - ``ov::log::Level::NO`` -1 - ``ov::log::Level::ERR`` -2 - ``ov::log::Level::WARNING`` -3 - ``ov::log::Level::INFO`` -4 - ``ov::log::Level::DEBUG`` -5 - ``ov::log::Level::TRACE`` - -When setting the environment variable OPENVINO_LOG_LEVEL > ov::log::Level::WARNING, the properties of the compiled model can be printed. - -For example: - -Linux - export OPENVINO_LOG_LEVEL=3 -Windows - set OPENVINO_LOG_LEVEL=3 - -the properties of the compiled model are printed as follows: -```sh - NETWORK_NAME: Model0 - OPTIMAL_NUMBER_OF_INFER_REQUESTS: 1 - NUM_STREAMS: 1 - INFERENCE_NUM_THREADS: 48 - PERF_COUNT: NO - INFERENCE_PRECISION_HINT: bf16 - PERFORMANCE_HINT: LATENCY - EXECUTION_MODE_HINT: PERFORMANCE - PERFORMANCE_HINT_NUM_REQUESTS: 0 - ENABLE_CPU_PINNING: YES - SCHEDULING_CORE_TYPE: ANY_CORE - MODEL_DISTRIBUTION_POLICY: - ENABLE_HYPER_THREADING: NO - EXECUTION_DEVICES: CPU - CPU_DENORMALS_OPTIMIZATION: NO - LOG_LEVEL: LOG_NONE - CPU_SPARSE_WEIGHTS_DECOMPRESSION_RATE: 1 - DYNAMIC_QUANTIZATION_GROUP_SIZE: 32 - KV_CACHE_PRECISION: f16 - AFFINITY: CORE - EXECUTION_DEVICES: - CPU: Intel(R) Xeon(R) Platinum 8468 -``` - -When Speculative Decoding or Prompt Lookup pipeline is executed, performance metrics will be also printed. - -For example: - -``` -=============================== -Total duration, sec: 26.6217 -Draft model duration, sec: 1.60329 -Main model duration, sec: 25.0184 -Draft model duration, %: 6.02248 -Main model duration, %: 93.9775 -AVG acceptance rate, %: 21.6809 -=============================== -REQUEST_ID: 0 -Main model iterations: 47 -Token per sec: 3.75633 -AVG acceptance rate, %: 21.6809 -Accepted tokens by draft model: 51 -Generated tokens: 100 -Accepted token rate, %: 51 -=============================== -Request_id: 0 ||| 40 0 40 20 0 0 40 40 0 20 20 20 0 40 0 0 20 80 0 80 20 0 0 0 40 80 0 40 60 40 80 0 0 0 0 40 20 20 0 40 20 40 0 20 0 0 0 -``` - - -When GGUF model passed to LLMPipeline, the details debug info will be also printed. - -For example: -``` -[GGUF Reader]: Loading and unpacking model from: gguf_models/qwen2.5-0.5b-instruct-q4_0.gguf -[GGUF Reader]: Loading and unpacking model done. Time: 196ms -[GGUF Reader]: Start generating OpenVINO model... -[GGUF Reader]: Save generated OpenVINO model to: gguf_models/openvino_model.xml done. Time: 466 ms -[GGUF Reader]: Model generation done. Time: 757ms -``` diff --git a/src/docs/DOCKER.md b/src/docs/DOCKER.md deleted file mode 100644 index 3d27eba19e..0000000000 --- a/src/docs/DOCKER.md +++ /dev/null @@ -1,88 +0,0 @@ -# Building openvino_llm:latest genai docker image -```Bash -git clone --branch ct-beam-search https://github.com/ilya-lavrenov/openvino.genai.git -git submodule update --remote --init -cd text_generation/causal_lm/cpp/continuous_batching/ -make -``` - -```Bash -cd ../../../.. 
-docker run -it -v `pwd`:/workspace/openvino.genai/ openvino_llm:latest -cd /workspace/openvino.genai/ -cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ && cmake --build ./build/ -j -``` - -# Downloading LLM models -```Bash -cd /workspace/openvino.genai/text_generation/causal_lm/cpp/continuous_batching/ -optimum-cli export openvino --model facebook/opt-125m ./ov_model -``` - -# Running throuput benchmark application -```Bash -cd /workspace/openvino.genai/ -./build/text_generation/causal_lm/cpp/continuous_batching/apps/throughput_benchmark --model /workspace/openvino.genai/text_generation/causal_lm/cpp/continuous_batching/ov_model --dataset /workspace/ShareGPT_V3_unfiltered_cleaned_split.json --dynamic_split_fuse --num_prompts 100 --device CPU --plugin_config {/"ENABLE_PROFILING/":true} -``` - - -# How to create environment to debug and develop continuous batching project with OpenVINO: - -1. Build OpenVINO with python bindings: -``` -cd /path/to/openvino -mkdir build -cd build -cmake -DCMAKE_BUILD_TYPE={ov_build_type} .. -make -j24 -``` -2. Set PYTHONPATH, LD_LIBRARY_PATH and OpenVINO_DIR environment variables: -``` -export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/path/to/openvino/bin/intel64/{ov_build_type} -export PYTHONPATH=${PYTHONPATH}:/path/to/openvino/bin/intel64/Release/python:/path/to/openvino/tools/ovc -export OpenVINO_DIR=/path/to/openvino/{ov_build_type} -``` -3. Build OpenVINO tokenizers: -``` -cd /path/to/openvino.genai/thirdparty/openvino_tokenizers -mkdir build -cd build -cmake -DCMAKE_BUILD_TYPE={ov_build_type} .. -make -j24 -``` -4. Create virtual environment to generate models and run python tests: -> NOTE: Comment installation of `openvino` and `openvino_tokenizers` to your env in `/path/to/openvino.genai/text_generation/causal_lm/cpp/continuous_batching/python/tests/requirements.txt -``` -cd /path/to/openvino.genai/text_generation/causal_lm/cpp/continuous_batching -python3 -m venv .env -source .env/bin/activate -pip3 install -r python/tests/requirements.txt -``` -5. Install `openvino_tokenizers` to your virtual environment: -``` -cd /path/to/openvino.genai/thirdparty/openvino_tokenizers -export OpenVINO_DIR=/path/to/openvino/build -pip install --no-deps . -``` -6. Create build directory in `continuous batching` project: -``` -mkdir /path/to/openvino.genai/text_generation/causal_lm/cpp/continuous_batching/build -``` -7. Generate cmake project: -``` -cd build -cmake -DCMAKE_BUILD_TYPE=Debug -DOpenVINO_DIR=/path/to/openvino/build .. -``` -8. Build the project -``` -make -j24 -``` -9. Extend `PYTHONPATH` by `continuous batching`: -``` -export PYTHONPATH=${PYTHONPATH}:/path/to/openvino.genai/text_generation/causal_lm/cpp/continuous_batching/build/python -``` -10. Run python tests: -``` -cd python/tests -pytest . -``` diff --git a/src/docs/HOW_IT_WORKS.md b/src/docs/HOW_IT_WORKS.md deleted file mode 100644 index 075ab1e9c2..0000000000 --- a/src/docs/HOW_IT_WORKS.md +++ /dev/null @@ -1,25 +0,0 @@ -# OpenVINO™ GenAI: How it works - -## Stateful LLM - -A common optimization for LLM inference is using a past KV (key/value)-cache. This cache is represented by the corresponding inputs and outputs in a model originally implemented in a DL framework (e.g. PyTorch models from Hugging Face). For further optimization and easier use, the model is transformed to a stateful form. This transformation improves inference performance and decreases the allocated runtime memory in long-running text generation scenarios. 
It is achieved by hiding inputs and outputs of the model that represent past KV-cache tensors and handling them inside the model in a more efficient way. Although the cache is still accessible with state API. It is opposed to stateless model approach requiring manipulating these inputs and outputs explicitly. An introduction to the stateful models can be found in the [Stateful Models article](https://docs.openvino.ai/2025/openvino-workflow/running-inference/stateful-models.html). - -Hiding KV-cache introduces a peculiarity for beam search algorithm. Beam search suggests batched inference of multiple beams. The design described here so far would result in generating multiple independent sequences of tokens. Beam search algorithm, on the other hand, requires removing some of the ongoing beams and splitting other beams to multiple branches. Beam removal requires deleting corresponding KV-cache entry and beam splitting requires copying corresponding KV-cache values. - -To provide the possibility to implement beam search without accessing model's internal state, a stateful LLM converted with `optimum-intel` or [llm_bench](../../tools/llm_bench) introduces an additional 1-dimentional `beam_idx` input. `beam_idx` must contain indexes of elements in a batch which are intended to be selected and will evolve during the next beam search iteration. There's only one beam when the generation starts. That beam corresponds to the initial prompt. `beam_idx` must have values: `[0, 0]` to keep the initial beam and introduce its copy. The dynamic batch size enables to change the number of beams dynamically. `beam_idx` must have `[1]` as the value to remove zeroth sequence and keep the second beam only. - -Assume there are two running beams. To proceed with generating both beams at the next iteration, `beam_idx` values must be `[0, 1]`, pointing to batch elements `0` and `1`. To drop the last beam and split the other beam in two, `beam_idx` must be set to `[0, 0]`. This results in utilizing only the part of KV cache corresponding to the zeroth element in the batch. The process of selecting proper entries in cache is called Cache Reorder. - -![](beam_idx-fork.gif) -![](beam_idx-drop.gif) - -The images below represent stateless and stateful LLM pipelines. The model has 4 inputs: -1. `input_ids` contains the next selected token -2. `attention_mask` is filled with `1` -3. `position_ids` encodes a position of currently generating token in the sequence -4. `beam_idx` selects beams - -The model has 1 output `logits` describing the predicted distribution over the next tokens. And there's KV cache state. 
- -![](stateless.jpg) -![](stateful.jpg) diff --git a/src/docs/beam_idx-drop.gif b/src/docs/beam_idx-drop.gif deleted file mode 100644 index 1c0f596d06..0000000000 --- a/src/docs/beam_idx-drop.gif +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:701839c28ac1e05c1c9e23823c74a10149a343210192e51df36e563ff6e257e4 -size 5700875 diff --git a/src/docs/beam_idx-fork.gif b/src/docs/beam_idx-fork.gif deleted file mode 100644 index 6255595bfd..0000000000 --- a/src/docs/beam_idx-fork.gif +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:292753b30a2153c92cedf16672ba182a851ec30c95c309cdaca13173f00fe700 -size 6062552 diff --git a/src/docs/openvino_genai.svg b/src/docs/openvino_genai.svg deleted file mode 100644 index 1517985d4c..0000000000 --- a/src/docs/openvino_genai.svg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:703732cd6a85f2cbcfd0915d63c10483114f05b71b834d2228501700074d0053 -size 1053573 diff --git a/src/docs/stateful.jpg b/src/docs/stateful.jpg deleted file mode 100644 index 11e7f68e23..0000000000 --- a/src/docs/stateful.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a6fb5ab9990c845eef8847bdf76799fcaefe0a9afa10fb9d07f6df4394a9e2ad -size 129471 diff --git a/src/docs/stateless.jpg b/src/docs/stateless.jpg deleted file mode 100644 index 0e8823e77e..0000000000 --- a/src/docs/stateless.jpg +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:20904ff7a8793359b978cfcdc85c482e0764291af17b572936955f586e202ea9 -size 113440