
Commit 2853346

init

1 parent 4236db5

8 files changed: +151 -416 lines changed

prepare_llm_models.sh

Lines changed: 16 additions & 7 deletions

@@ -20,7 +20,7 @@ if [ -z "$1" ]; then
     exit 1
 fi
 
-CB_MODEL="facebook/opt-125m"
+TEXT_GENERATION_MODEL="facebook/opt-125m"
 TOKENIZER_FILE="openvino_tokenizer.bin"
 LEGACY_MODEL_FILE="1/model.bin"
 EMBEDDING_MODEL="thenlper/gte-small"
@@ -35,7 +35,7 @@ PHI4_MODEL="microsoft/Phi-4-mini-instruct"
 MISTRAL_MODEL="mistralai/Mistral-7B-Instruct-v0.3"
 GPT_OSS="openai/gpt-oss-20b"
 
-MODELS=("$CB_MODEL/$TOKENIZER_FILE" "$EMBEDDING_MODEL/embeddings/$LEGACY_MODEL_FILE" "$RERANK_MODEL/rerank/$LEGACY_MODEL_FILE" "$VLM_MODEL/$TOKENIZER_FILE" "$QWEN3_MODEL/$TOKENIZER_FILE" "$LLAMA3_MODEL/$TOKENIZER_FILE" "$HERMES3_MODEL/$TOKENIZER_FILE" "$PHI4_MODEL/$TOKENIZER_FILE" "$MISTRAL_MODEL/$TOKENIZER_FILE" "$GPT_OSS/$TOKENIZER_FILE" "$EMBEDDING_MODEL/ov/$TOKENIZER_FILE" "$RERANK_MODEL/ov/$TOKENIZER_FILE")
+MODELS=("$TEXT_GENERATION_MODEL/$TOKENIZER_FILE" "$EMBEDDING_MODEL/embeddings/$LEGACY_MODEL_FILE" "$RERANK_MODEL/rerank/$LEGACY_MODEL_FILE" "$VLM_MODEL/$TOKENIZER_FILE" "$QWEN3_MODEL/$TOKENIZER_FILE" "$LLAMA3_MODEL/$TOKENIZER_FILE" "$HERMES3_MODEL/$TOKENIZER_FILE" "$PHI4_MODEL/$TOKENIZER_FILE" "$MISTRAL_MODEL/$TOKENIZER_FILE" "$GPT_OSS/$TOKENIZER_FILE" "$EMBEDDING_MODEL/ov/$TOKENIZER_FILE" "$RERANK_MODEL/ov/$TOKENIZER_FILE")
 
 all_exist=true
 for model in "${MODELS[@]}"; do
@@ -69,13 +69,22 @@ else
 fi
 mkdir -p $1
 
-if [ -f "$1/$CB_MODEL/$TOKENIZER_FILE" ]; then
-    echo "Models file $1/$CB_MODEL/$TOKENIZER_FILE exists. Skipping downloading models."
+if [ -f "$1/$TEXT_GENERATION_MODEL/$TOKENIZER_FILE" ]; then
+    echo "Models file $1/$TEXT_GENERATION_MODEL/$TOKENIZER_FILE exists. Skipping downloading models."
 else
-    python3 demos/common/export_models/export_model.py text_generation --source_model "$CB_MODEL" --weight-format int8 --model_repository_path $1
+    python3 demos/common/export_models/export_model.py text_generation --source_model "$TEXT_GENERATION_MODEL" --weight-format int8 --model_repository_path $1
+    if [ ! -f "$1/$TEXT_GENERATION_MODEL/chat_template.jinja" ]; then
+        dummy_chat_template="{% for message in messages %}\
+{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}\
+{% elif message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + eos_token }}\
+{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}\
+{% endif %}\
+{% endfor %}"
+        echo "$dummy_chat_template" > "$1/$TEXT_GENERATION_MODEL/chat_template.jinja"
+    fi
 fi
-if [ ! -f "$1/$CB_MODEL/$TOKENIZER_FILE" ]; then
-    echo "[ERROR] Models file $1/$CB_MODEL/$TOKENIZER_FILE does not exist."
+if [ ! -f "$1/$TEXT_GENERATION_MODEL/$TOKENIZER_FILE" ]; then
+    echo "[ERROR] Models file $1/$TEXT_GENERATION_MODEL/$TOKENIZER_FILE does not exist."
     exit 1
 fi
 
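Note: the dummy chat template written above is plain Jinja, so its effect can be sanity-checked outside the server. A minimal sketch in Python with jinja2; the example messages and the "</s>" eos_token value are illustrative and not taken from the facebook/opt-125m tokenizer:

# Render the dummy chat template that prepare_llm_models.sh writes when the
# exported model has no chat_template.jinja of its own.
from jinja2 import Environment

DUMMY_TEMPLATE = (
    "{% for message in messages %}"
    "{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}"
    "{% elif message['role'] == 'system' %}{{ '<|system|>\\n' + message['content'] + eos_token }}"
    "{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}"
    "{% endif %}"
    "{% endfor %}"
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is OpenVINO?"},
]
rendered = Environment().from_string(DUMMY_TEMPLATE).render(messages=messages, eos_token="</s>")
print(rendered)
# Prints something like:
# <|system|>
# You are a helpful assistant.</s>User: What is OpenVINO?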

src/llm/servable.cpp

Lines changed: 6 additions & 1 deletion

@@ -124,7 +124,12 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptr<GenAiServableExecution
 #else
     ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory();
     constexpr bool add_generation_prompt = true;  // confirm it should be hardcoded
-    inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
+    try {
+        inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
+    } catch (const std::exception& e) {
+        SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what());
+        return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one.");
+    }
 #endif
     if (inputText.size() == 0) {
         return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty");

src/llm/servable_initializer.cpp

Lines changed: 42 additions & 71 deletions

@@ -51,15 +51,17 @@
 namespace ovms {
 
 static const std::string CHAT_TEMPLATE_WARNING_MESSAGE = "Warning: Chat template has not been loaded properly. Servable will not respond to /chat/completions endpoint.";
-static const std::string DEFAULT_CHAT_TEMPLATE = R"({% if messages|length != 1 %} {{ raise_exception('This servable accepts only single message requests') }}{% endif %}{{ messages[0]['content'] }})";
 
 void GenAiServableInitializer::loadChatTemplate(std::shared_ptr<GenAiServableProperties> properties, const std::string& chatTemplateDirectory) {
 #if (PYTHON_DISABLE == 0)
     ExtraGenerationInfo extraGenInfo = readExtraGenerationInfo(properties, chatTemplateDirectory);
     loadPyTemplateProcessor(properties, extraGenInfo);
 #else
-    loadDefaultTemplateProcessorIfNeeded(properties);
+    if (properties->tokenizer.get_chat_template().empty()) {
+        SPDLOG_LOGGER_DEBUG(modelmanager_logger, CHAT_TEMPLATE_WARNING_MESSAGE);
+    }
 #endif
+    // In non-python build, GenAI handles chat template loading
 }
 
 #if (PYTHON_DISABLE == 0)
@@ -123,29 +125,34 @@ ExtraGenerationInfo GenAiServableInitializer::readExtraGenerationInfo(std::share
 }
 
 void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServableProperties> properties, const ExtraGenerationInfo& extraGenInfo) {
-    // GGUF models specific validation
-    if (extraGenInfo.isGgufModel) {
-        bool errorFound = false;
-        if (extraGenInfo.eosTokenFromTokenizer.empty()) {
-            SPDLOG_ERROR("Tokenizer eos token not found in tokenizer nor in vocabulary but required for GGUF models.");
-            errorFound = true;
-        }
-        if (extraGenInfo.bosTokenFromTokenizer.empty()) {
-            SPDLOG_ERROR("Tokenizer bos token not found in tokenizer nor in vocabulary but required for GGUF models.");
-            errorFound = true;
-        }
-        if (extraGenInfo.chatTemplateFromTokenizer.empty()) {
-            SPDLOG_ERROR("Tokenizer chat template not found in tokenizer but required for GGUF models.");
-            errorFound = true;
-        }
-        if (errorFound)
-            return;
+    // At this point tokenizer cannot be uninitialized as we need to access its methods for prepare for chat template processing
+    if (properties->tokenizer == ov::genai::Tokenizer()) {
+        SPDLOG_LOGGER_ERROR(modelmanager_logger, "Tokenizer is not initialized. Cannot load chat template processor.");
+        return;
+    }
+    std::string chatTemplate = properties->tokenizer.get_original_chat_template();
+    std::string bosToken = properties->tokenizer.get_bos_token();
+    std::string eosToken = properties->tokenizer.get_eos_token();
+    if (bosToken.empty()) {
+        SPDLOG_ERROR("BOS token was not found in model files.");
+        return;
     }
+    if (eosToken.empty()) {
+        SPDLOG_ERROR("EOS token was not found in model files.");
+        return;
+    }
+    if (chatTemplate.empty()) {
+        SPDLOG_ERROR("Chat template was not found in model files.");
+        return;
+    }
+
+    properties->templateProcessor.bosToken = bosToken;
+    properties->templateProcessor.eosToken = eosToken;
+
     py::gil_scoped_acquire acquire;
     try {
-        auto locals = py::dict("tokenizer_template"_a = extraGenInfo.chatTemplateFromTokenizer,
-            "templates_directory"_a = extraGenInfo.chatTemplateDirectory,
-            "is_gguf_model"_a = extraGenInfo.isGgufModel);
+        auto locals = py::dict("chat_template"_a = chatTemplate,
+            "templates_directory"_a = extraGenInfo.chatTemplateDirectory);
         py::exec(R"(
             # Following the logic from:
             # https://github.com/huggingface/transformers/blob/25245ec26dc29bcf6102e1b4ddd0dfd02e720cf5/src/transformers/tokenization_utils_base.py#L1837
@@ -214,71 +221,44 @@ void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServ
                 self._rendered_blocks = None
                 self._generation_indices = None
 
-
-            # Default chat template accepts only single message and outputs only it's 'content'
-            # effectively turning it into a regular prompt.
-            default_chat_template = "{% if messages|length != 1 %} {{ raise_exception('This servable accepts only single message requests') }}{% endif %}{{ messages[0]['content'] }}"
-
-            bos_token = ""
-            eos_token = ""
-            chat_template = default_chat_template
+
+            # Optional dedicated tool chat template (might not be present)
             tool_chat_template = None
 
+            # Variables needed to be set at the end of this script execution
             template = None
             tool_template = None
 
-            # Try to read template from template.jinja file
-            jinja_file = Path(templates_directory + "/chat_template.jinja")
-            jinja_file_legacy = Path(templates_directory + "/template.jinja")
+            # Load Jinja2 environment
             template_loader = jinja2.FileSystemLoader(searchpath=templates_directory)
             jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True, extensions=[AssistantTracker, jinja2.ext.loopcontrols], loader=template_loader)
             jinja_env.policies["json.dumps_kwargs"]["ensure_ascii"] = False
             jinja_env.globals["raise_exception"] = raise_exception
             jinja_env.globals["strftime_now"] = strftime_now
             jinja_env.filters["from_json"] = json.loads
-            if jinja_file.is_file():
-                template = jinja_env.get_template("chat_template.jinja")
-            elif jinja_file_legacy.is_file():
-                template = jinja_env.get_template("template.jinja")
 
-            # Try to read data from tokenizer_config.json
+            # Try to read data from tokenizer_config.json to get additional tool chat template if present
             tokenizer_config_file = Path(templates_directory + "/tokenizer_config.json")
             if tokenizer_config_file.is_file():
                 f = open(templates_directory + "/tokenizer_config.json", "r", encoding="utf-8")
                 data = json.load(f)
-                bos_token = data.get("bos_token", "")
-                bos_token = "" if bos_token is None else bos_token # Null token conversion to empty string.
-                eos_token = data.get("eos_token", "")
-                eos_token = "" if eos_token is None else eos_token # Null token conversion to empty string.
-
-                chat_template = data.get("chat_template", default_chat_template)
-                if isinstance(chat_template, list):
-                    for template_entry in chat_template:
+
+                chat_template_from_tokenizer_config = data.get("chat_template", None)
+                if isinstance(chat_template_from_tokenizer_config, list):
+                    for template_entry in chat_template_from_tokenizer_config:
                         if isinstance(template_entry, dict):
-                            if template_entry.get("name") == "default":
-                                chat_template = template_entry.get("template")
-                            elif template_entry.get("name") == "tool_use":
+                            if template_entry.get("name") == "tool_use":
                                 tool_chat_template = template_entry.get("template")
-            if template is None:
-                if is_gguf_model and (chat_template == default_chat_template):
-                    # GGUF model directory might not contain files with chat template and in that case we use template read from the tokenizer
-                    template = jinja_env.from_string(tokenizer_template)
-                else:
-                    template = jinja_env.from_string(chat_template)
+
+            # Load templates from strings
+            template = jinja_env.from_string(chat_template)
             if tool_chat_template is not None:
                 tool_template = jinja_env.from_string(tool_chat_template)
             else:
                 tool_template = template
         )",
             py::globals(), locals);
 
-        if (extraGenInfo.isGgufModel) {
-            properties->templateProcessor.bosToken = extraGenInfo.bosTokenFromTokenizer;
-            properties->templateProcessor.eosToken = extraGenInfo.eosTokenFromTokenizer;
-        } else {
-            properties->templateProcessor.bosToken = locals["bos_token"].cast<std::string>();
-            properties->templateProcessor.eosToken = locals["eos_token"].cast<std::string>();
-        }
         properties->templateProcessor.chatTemplate = std::make_unique<PyObjectWrapper<py::object>>(locals["template"]);
         properties->templateProcessor.toolTemplate = std::make_unique<PyObjectWrapper<py::object>>(locals["tool_template"]);
     } catch (const pybind11::error_already_set& e) {
@@ -298,15 +278,6 @@ void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServ
         SPDLOG_DEBUG("Chat template loading failed with an unexpected error");
     }
 }
-
-#else
-void GenAiServableInitializer::loadDefaultTemplateProcessorIfNeeded(std::shared_ptr<GenAiServableProperties> properties) {
-    const std::string modelChatTemplate = properties->tokenizer.get_chat_template();
-    if (modelChatTemplate.empty()) {
-        SPDLOG_LOGGER_DEBUG(modelmanager_logger, "Could not load model chat template. Using default template.");
-        properties->tokenizer.set_chat_template(DEFAULT_CHAT_TEMPLATE);
-    }
-}
 #endif
 
 Status parseModelsPath(std::string& outPath, std::string modelsPath, std::string graphPath) {
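After this change the embedded script no longer reads bos/eos tokens or a default chat template from disk; the main template comes from the GenAI tokenizer, and tokenizer_config.json is consulted only for an optional "tool_use" entry. A standalone Python sketch of that lookup, mirroring the embedded logic; the directory path in the usage line is an illustrative placeholder:

# Return the "tool_use" template string from tokenizer_config.json, if any.
import json
from pathlib import Path

def read_tool_chat_template(templates_directory: str):
    config_path = Path(templates_directory) / "tokenizer_config.json"
    if not config_path.is_file():
        return None
    with open(config_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    chat_template = data.get("chat_template")
    # Hugging Face allows "chat_template" to be a list of {"name": ..., "template": ...} entries.
    if isinstance(chat_template, list):
        for entry in chat_template:
            if isinstance(entry, dict) and entry.get("name") == "tool_use":
                return entry.get("template")
    return None

print(read_tool_chat_template("./models/facebook/opt-125m"))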

src/llm/servable_initializer.hpp

Lines changed: 0 additions & 4 deletions

@@ -52,10 +52,6 @@ class GenAiServableInitializer {
     // Use Python Jinja module for template processing
     static void loadPyTemplateProcessor(std::shared_ptr<GenAiServableProperties> properties, const ExtraGenerationInfo& extraGenInfo);
     static ExtraGenerationInfo readExtraGenerationInfo(std::shared_ptr<GenAiServableProperties> properties, const std::string& chatTemplateDirectory);
-#else
-    // In C++ only version we use GenAI for template processing, but to have the same behavior as in Python-enabled version
-    // we use default template if model does not have its own, so that servable can also work on chat/completion endpoint.
-    static void loadDefaultTemplateProcessorIfNeeded(std::shared_ptr<GenAiServableProperties> properties);
 #endif
     /*
     initialize method implementation MUST fill servable with all required properties i.e. pipeline, tokenizer, configs etc. based on mediapipe node options.

src/test/llm/assisted_decoding_test.cpp

Lines changed: 15 additions & 17 deletions

@@ -94,8 +94,12 @@ class AssistedDecodingPipelinesHttpTest : public ::testing::Test {
         }
     }
 
-    int generateExpectedText(std::string prompt, bool addSpecialTokens) {
+    int generateExpectedText(std::string prompt, bool addSpecialTokens, bool applyChatTemplate) {
         try {
+            if (applyChatTemplate) {
+                ov::genai::ChatHistory chatHistory({{{"role", "user"}, {"content", prompt}}});
+                prompt = cbPipe->get_tokenizer().apply_chat_template(chatHistory, true);
+            }
             ov::Tensor promptIds = cbPipe->get_tokenizer().encode(prompt, ov::genai::add_special_tokens(addSpecialTokens)).input_ids;
             std::cout << "Generated prompt ids: " << getPromptTokensString(promptIds) << std::endl;
             auto generationHandle = cbPipe->add_request(
@@ -162,7 +166,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryCompletionsJsonSpeculativeDecodin
     // Generate reference from the base model (unassisted generation)
     config.max_new_tokens = 10;
     config.temperature = 0;
-    ASSERT_EQ(generateExpectedText("What is OpenVINO?", true), 0);
+    ASSERT_EQ(generateExpectedText("What is OpenVINO?", true, false), 0);
     ASSERT_EQ(config.num_return_sequences, expectedMessages.size());
 
     // Static number of candidates
@@ -185,8 +189,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryCompletionsJsonSpeculativeDecodin
     ASSERT_EQ(parsedResponse["choices"].Capacity(), 1);
     auto& choice = parsedResponse["choices"].GetArray()[0];
     ASSERT_TRUE(choice["text"].IsString());
-    // TODO: awaiting OV/GenAI fix, uncomment when fixed
-    // EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
+    EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
 
     // Dynamic number of candidates
     requestBody = R"(
@@ -208,15 +211,14 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryCompletionsJsonSpeculativeDecodin
     ASSERT_EQ(parsedResponse["choices"].Capacity(), 1);
     choice = parsedResponse["choices"].GetArray()[0];
     ASSERT_TRUE(choice["text"].IsString());
-    // TODO: awaiting OV/GenAI fix, uncomment when fixed
-    // EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
+    EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
 }
 
 TEST_F(AssistedDecodingPipelinesHttpTest, unaryChatCompletionsJsonSpeculativeDecoding) {
     // Generate reference from the base model (unassisted generation)
     config.max_new_tokens = 10;
     config.temperature = 0;
-    ASSERT_EQ(generateExpectedText("What is OpenVINO?", false), 0);
+    ASSERT_EQ(generateExpectedText("What is OpenVINO?", false, true), 0);
     ASSERT_EQ(config.num_return_sequences, expectedMessages.size());
 
     // Static number of candidates
@@ -247,8 +249,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryChatCompletionsJsonSpeculativeDec
     ASSERT_TRUE(choice["message"]["content"].IsString());
     ASSERT_TRUE(choice["finish_reason"].IsString());
     ASSERT_FALSE(choice["logprobs"].IsObject());
-    // TODO: awaiting OV/GenAI fix, uncomment when fixed
-    // ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);
+    ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);
 
     // Dynamic number of candidates
     requestBody = R"(
@@ -278,8 +279,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryChatCompletionsJsonSpeculativeDec
     ASSERT_TRUE(choice["message"]["content"].IsString());
     ASSERT_TRUE(choice["finish_reason"].IsString());
    ASSERT_FALSE(choice["logprobs"].IsObject());
-    // TODO: awaiting OV/GenAI fix, uncomment when fixed
-    // ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);
+    ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);
 }
 
 TEST_F(AssistedDecodingPipelinesHttpTest, speculativeDecodingExclusiveParametersProvided) {
@@ -318,7 +318,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryCompletionsJsonPromptLookupDecodi
     // Generate reference from the base model (unassisted generation)
     config.max_new_tokens = 10;
    config.temperature = 0;
-    ASSERT_EQ(generateExpectedText("What is OpenVINO?", true), 0);
+    ASSERT_EQ(generateExpectedText("What is OpenVINO?", true, false), 0);
     ASSERT_EQ(config.num_return_sequences, expectedMessages.size());
 
     std::string requestBody = R"(
@@ -341,15 +341,14 @@
     ASSERT_EQ(parsedResponse["choices"].Capacity(), 1);
     auto& choice = parsedResponse["choices"].GetArray()[0];
     ASSERT_TRUE(choice["text"].IsString());
-    // TODO: awaiting OV/GenAI fix, uncomment when fixed
-    // EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
+    EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
 }
 
 TEST_F(AssistedDecodingPipelinesHttpTest, unaryChatCompletionsJsonPromptLookupDecoding) {
     // Generate reference from the base model (unassisted generation)
     config.max_new_tokens = 10;
     config.temperature = 0;
-    ASSERT_EQ(generateExpectedText("What is OpenVINO?", false), 0);
+    ASSERT_EQ(generateExpectedText("What is OpenVINO?", false, true), 0);
     ASSERT_EQ(config.num_return_sequences, expectedMessages.size());
 
     auto requestBody = R"(
@@ -380,8 +379,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryChatCompletionsJsonPromptLookupDe
     ASSERT_TRUE(choice["message"]["content"].IsString());
     ASSERT_TRUE(choice["finish_reason"].IsString());
     ASSERT_FALSE(choice["logprobs"].IsObject());
-    // TODO: awaiting OV/GenAI fix, uncomment when fixed
-    // ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);
+    ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);
 }
 
 // Consider parametrization of negative tests with request body and endpoint as parameters
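For the chat-completions tests the reference text is now produced by applying the tokenizer's chat template to the prompt before encoding, and addSpecialTokens is set to false in that case because the rendered prompt already carries any special tokens. A rough Python analogue of that flow, using Hugging Face transformers rather than the C++ ov::genai::Tokenizer used in the test; the model name is only an example and is assumed to ship a chat template, which instruct models such as this one typically do:

# Sketch of generateExpectedText(..., addSpecialTokens=false, applyChatTemplate=true).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-4-mini-instruct")
messages = [{"role": "user", "content": "What is OpenVINO?"}]
# Render the chat template with a trailing generation prompt, as the endpoint does.
prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
# Encode without adding special tokens again; the template already inserted them.
input_ids = tokenizer(prompt, add_special_tokens=False).input_ids
print(prompt)
print(input_ids)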
