1 change: 1 addition & 0 deletions ci/lib_search.py
@@ -151,6 +151,7 @@ def check_dir(start_dir):
"results.txt",
"windows_bdba.bat",
"windows_sign.bat",
"dummy_facebook_template.jinja",
]

exclude_directories = ['/dist/', 'release_files/thirdparty-licenses', 'extras/chat_template_examples']
35 changes: 12 additions & 23 deletions prepare_llm_models.sh
@@ -20,7 +20,7 @@ if [ -z "$1" ]; then
exit 1
fi

CB_MODEL="facebook/opt-125m"
TEXT_GENERATION_MODEL="facebook/opt-125m"
TOKENIZER_FILE="openvino_tokenizer.bin"
LEGACY_MODEL_FILE="1/model.bin"
EMBEDDING_MODEL="thenlper/gte-small"
@@ -35,23 +35,6 @@ PHI4_MODEL="microsoft/Phi-4-mini-instruct"
MISTRAL_MODEL="mistralai/Mistral-7B-Instruct-v0.3"
GPT_OSS="openai/gpt-oss-20b"

MODELS=("$CB_MODEL/$TOKENIZER_FILE" "$RERANK_MODEL/rerank/$LEGACY_MODEL_FILE" "$VLM_MODEL/$TOKENIZER_FILE" "$QWEN3_MODEL/$TOKENIZER_FILE" "$LLAMA3_MODEL/$TOKENIZER_FILE" "$HERMES3_MODEL/$TOKENIZER_FILE" "$PHI4_MODEL/$TOKENIZER_FILE" "$MISTRAL_MODEL/$TOKENIZER_FILE" "$GPT_OSS/$TOKENIZER_FILE" "$EMBEDDING_MODEL/ov/$TOKENIZER_FILE" "$RERANK_MODEL/ov/$TOKENIZER_FILE")

all_exist=true
for model in "${MODELS[@]}"; do
if [ ! -f "$1/$model" ]; then
echo "Model file does not exist $1/$model"
all_exist=false
break
fi
echo "Model file exist $1/$model"
done

if $all_exist; then
echo "All model directories exist in $1. Skipping downloading models."
exit 0
fi

if [ "$(python3 -c 'import sys; print(sys.version_info[1])')" -le "8" ]; then echo "Prepare models with python > 3.8."; exit 1 ; fi

echo "Downloading LLM testing models to directory $1"
@@ -69,16 +52,22 @@ else
fi
mkdir -p $1

if [ -f "$1/$CB_MODEL/$TOKENIZER_FILE" ]; then
echo "Models file $1/$CB_MODEL/$TOKENIZER_FILE exists. Skipping downloading models."
if [ -f "$1/$TEXT_GENERATION_MODEL/$TOKENIZER_FILE" ]; then
echo "Models file $1/$TEXT_GENERATION_MODEL/$TOKENIZER_FILE exists. Skipping downloading models."
else
python3 demos/common/export_models/export_model.py text_generation --source_model "$CB_MODEL" --weight-format int8 --model_repository_path $1
python3 demos/common/export_models/export_model.py text_generation --source_model "$TEXT_GENERATION_MODEL" --weight-format int8 --model_repository_path $1
fi
if [ ! -f "$1/$CB_MODEL/$TOKENIZER_FILE" ]; then
echo "[ERROR] Models file $1/$CB_MODEL/$TOKENIZER_FILE does not exist."

if [ ! -f "$1/$TEXT_GENERATION_MODEL/$TOKENIZER_FILE" ]; then
echo "[ERROR] Models file $1/$TEXT_GENERATION_MODEL/$TOKENIZER_FILE does not exist."
exit 1
fi

if [ ! -f "$1/$TEXT_GENERATION_MODEL/chat_template.jinja" ]; then
echo "Copying dummy chat template to $TEXT_GENERATION_MODEL model directory."
cp src/test/llm/dummy_facebook_template.jinja "$1/$TEXT_GENERATION_MODEL/chat_template.jinja"
fi

if [ -f "$1/$VLM_MODEL/$TOKENIZER_FILE" ]; then
echo "Model file $1/$VLM_MODEL/$TOKENIZER_FILE exists. Skipping downloading models."
else
7 changes: 6 additions & 1 deletion src/llm/servable.cpp
@@ -124,7 +124,12 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptr<GenAiServableExecution
#else
ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory();
constexpr bool add_generation_prompt = true; // confirm it should be hardcoded
inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
try {
inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
} catch (const std::exception& e) {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what());
return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one.");
}
#endif
if (inputText.size() == 0) {
return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty");
123 changes: 52 additions & 71 deletions src/llm/servable_initializer.cpp
@@ -51,15 +51,17 @@
namespace ovms {

static const std::string CHAT_TEMPLATE_WARNING_MESSAGE = "Warning: Chat template has not been loaded properly. Servable will not respond to /chat/completions endpoint.";
static const std::string DEFAULT_CHAT_TEMPLATE = R"({% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}{% elif message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %})";

void GenAiServableInitializer::loadChatTemplate(std::shared_ptr<GenAiServableProperties> properties, const std::string& chatTemplateDirectory) {
#if (PYTHON_DISABLE == 0)
ExtraGenerationInfo extraGenInfo = readExtraGenerationInfo(properties, chatTemplateDirectory);
loadPyTemplateProcessor(properties, extraGenInfo);
#else
loadDefaultTemplateProcessorIfNeeded(properties);
if (properties->tokenizer.get_chat_template().empty()) {
SPDLOG_LOGGER_DEBUG(modelmanager_logger, CHAT_TEMPLATE_WARNING_MESSAGE);
}
#endif
// In non-python build, GenAI handles chat template loading
}

#if (PYTHON_DISABLE == 0)
Expand Down Expand Up @@ -123,29 +125,37 @@ ExtraGenerationInfo GenAiServableInitializer::readExtraGenerationInfo(std::share
}

void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServableProperties> properties, const ExtraGenerationInfo& extraGenInfo) {
// GGUF models specific validation
if (extraGenInfo.isGgufModel) {
bool errorFound = false;
if (extraGenInfo.eosTokenFromTokenizer.empty()) {
SPDLOG_ERROR("Tokenizer eos token not found in tokenizer nor in vocabulary but required for GGUF models.");
errorFound = true;
}
if (extraGenInfo.bosTokenFromTokenizer.empty()) {
SPDLOG_ERROR("Tokenizer bos token not found in tokenizer nor in vocabulary but required for GGUF models.");
errorFound = true;
}
if (extraGenInfo.chatTemplateFromTokenizer.empty()) {
SPDLOG_ERROR("Tokenizer chat template not found in tokenizer but required for GGUF models.");
errorFound = true;
}
if (errorFound)
return;
// At this point the tokenizer cannot be uninitialized, as we need access to its methods to prepare the chat template processing
if (properties->tokenizer == ov::genai::Tokenizer()) {

Copilot AI Oct 29, 2025

[nitpick] Comparing tokenizer equality against a default-constructed instance may not reliably detect uninitialized state. Consider adding an explicit is_initialized() or is_valid() method to the Tokenizer class, or check for null/empty internal state instead.

Suggested change
if (properties->tokenizer == ov::genai::Tokenizer()) {
// Use a more robust check for tokenizer initialization
if (properties->tokenizer.get_vocab_size() == 0) {

SPDLOG_LOGGER_ERROR(modelmanager_logger, "Tokenizer is not initialized. Cannot load chat template processor.");
return;
}
std::string chatTemplate = properties->tokenizer.get_original_chat_template();
std::string bosToken = properties->tokenizer.get_bos_token();
std::string eosToken = properties->tokenizer.get_eos_token();
if (bosToken.empty()) {
SPDLOG_ERROR("BOS token was not found in model files.");
return;
}
if (eosToken.empty()) {
SPDLOG_ERROR("EOS token was not found in model files.");
return;
}
if (chatTemplate.empty()) {
SPDLOG_ERROR("Chat template was not found in model files.");
Comment on lines +137 to +145

Copilot AI Oct 22, 2025


Error messages lack context about the impact of these failures. Consider indicating that chat template loading will be skipped or that the servable will not support chat completions endpoint when these tokens/templates are missing.

Suggested change
SPDLOG_ERROR("BOS token was not found in model files.");
return;
}
if (eosToken.empty()) {
SPDLOG_ERROR("EOS token was not found in model files.");
return;
}
if (chatTemplate.empty()) {
SPDLOG_ERROR("Chat template was not found in model files.");
SPDLOG_ERROR("BOS token was not found in model files. {}", CHAT_TEMPLATE_WARNING_MESSAGE);
return;
}
if (eosToken.empty()) {
SPDLOG_ERROR("EOS token was not found in model files. {}", CHAT_TEMPLATE_WARNING_MESSAGE);
return;
}
if (chatTemplate.empty()) {
SPDLOG_ERROR("Chat template was not found in model files. {}", CHAT_TEMPLATE_WARNING_MESSAGE);

return;
}

properties->templateProcessor.bosToken = bosToken;
properties->templateProcessor.eosToken = eosToken;

SPDLOG_LOGGER_DEBUG(modelmanager_logger, "Loading Python Jinja template processor with chat template from tokenizer. Bos token: {}, Eos token: {}, chat template: \n{}",
bosToken, eosToken, chatTemplate);

py::gil_scoped_acquire acquire;
try {
auto locals = py::dict("tokenizer_template"_a = extraGenInfo.chatTemplateFromTokenizer,
"templates_directory"_a = extraGenInfo.chatTemplateDirectory,
"is_gguf_model"_a = extraGenInfo.isGgufModel);
auto locals = py::dict("chat_template"_a = chatTemplate,
"templates_directory"_a = extraGenInfo.chatTemplateDirectory);
py::exec(R"(
# Following the logic from:
# https://github.com/huggingface/transformers/blob/25245ec26dc29bcf6102e1b4ddd0dfd02e720cf5/src/transformers/tokenization_utils_base.py#L1837
@@ -214,71 +224,51 @@ void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServ
self._rendered_blocks = None
self._generation_indices = None


# Default chat template accepts only single message and outputs only it's 'content'
# effectively turning it into a regular prompt.
default_chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}{% elif message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}"

bos_token = ""
eos_token = ""
chat_template = default_chat_template

# Optional dedicated tool chat template (might not be present)
tool_chat_template = None

# Variables needed to be set at the end of this script execution
template = None
tool_template = None

# Try to read template from template.jinja file
jinja_file = Path(templates_directory + "/chat_template.jinja")
jinja_file_legacy = Path(templates_directory + "/template.jinja")
# Load Jinja2 environment
template_loader = jinja2.FileSystemLoader(searchpath=templates_directory)
jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True, extensions=[AssistantTracker, jinja2.ext.loopcontrols], loader=template_loader)
jinja_env.policies["json.dumps_kwargs"]["ensure_ascii"] = False
jinja_env.globals["raise_exception"] = raise_exception
jinja_env.globals["strftime_now"] = strftime_now
jinja_env.filters["from_json"] = json.loads
if jinja_file.is_file():
template = jinja_env.get_template("chat_template.jinja")
elif jinja_file_legacy.is_file():
template = jinja_env.get_template("template.jinja")

# Try to read data from tokenizer_config.json
# Try to read data from tokenizer_config.json to get additional tool chat template if present
tokenizer_config_file = Path(templates_directory + "/tokenizer_config.json")
if tokenizer_config_file.is_file():
f = open(templates_directory + "/tokenizer_config.json", "r", encoding="utf-8")
data = json.load(f)
bos_token = data.get("bos_token", "")
bos_token = "" if bos_token is None else bos_token # Null token conversion to empty string.
eos_token = data.get("eos_token", "")
eos_token = "" if eos_token is None else eos_token # Null token conversion to empty string.

chat_template = data.get("chat_template", default_chat_template)
if isinstance(chat_template, list):
for template_entry in chat_template:

chat_template_from_tokenizer_config = data.get("chat_template", None)
if isinstance(chat_template_from_tokenizer_config, list):
for template_entry in chat_template_from_tokenizer_config:
if isinstance(template_entry, dict):
if template_entry.get("name") == "default":
chat_template = template_entry.get("template")
elif template_entry.get("name") == "tool_use":
if template_entry.get("name") == "tool_use":
tool_chat_template = template_entry.get("template")
if template is None:
if is_gguf_model and (chat_template == default_chat_template):
# GGUF model directory might not contain files with chat template and in that case we use template read from the tokenizer
template = jinja_env.from_string(tokenizer_template)
else:
template = jinja_env.from_string(chat_template)

# Try read tool_use.jinja template file from additional_chat_templates directory if exists
additional_templates_dir = Path(templates_directory + "/additional_chat_templates")
tool_use_template_file = additional_templates_dir / "tool_use.jinja"
if tool_use_template_file.is_file():
with open(tool_use_template_file, "r", encoding="utf-8") as f:
tool_chat_template = f.read()

# Load templates from strings
template = jinja_env.from_string(chat_template)
if tool_chat_template is not None:
tool_template = jinja_env.from_string(tool_chat_template)
else:
tool_template = template
)",
py::globals(), locals);

if (extraGenInfo.isGgufModel) {
properties->templateProcessor.bosToken = extraGenInfo.bosTokenFromTokenizer;
properties->templateProcessor.eosToken = extraGenInfo.eosTokenFromTokenizer;
} else {
properties->templateProcessor.bosToken = locals["bos_token"].cast<std::string>();
properties->templateProcessor.eosToken = locals["eos_token"].cast<std::string>();
}
properties->templateProcessor.chatTemplate = std::make_unique<PyObjectWrapper<py::object>>(locals["template"]);
properties->templateProcessor.toolTemplate = std::make_unique<PyObjectWrapper<py::object>>(locals["tool_template"]);
} catch (const pybind11::error_already_set& e) {
@@ -298,15 +288,6 @@ void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServ
SPDLOG_DEBUG("Chat template loading failed with an unexpected error");
}
}

#else
void GenAiServableInitializer::loadDefaultTemplateProcessorIfNeeded(std::shared_ptr<GenAiServableProperties> properties) {
const std::string modelChatTemplate = properties->tokenizer.get_chat_template();
if (modelChatTemplate.empty()) {
SPDLOG_LOGGER_DEBUG(modelmanager_logger, "Could not load model chat template. Using default template.");
properties->tokenizer.set_chat_template(DEFAULT_CHAT_TEMPLATE);
}
}
#endif

Status parseModelsPath(std::string& outPath, std::string modelsPath, std::string graphPath) {
4 changes: 0 additions & 4 deletions src/llm/servable_initializer.hpp
@@ -52,10 +52,6 @@ class GenAiServableInitializer {
// Use Python Jinja module for template processing
static void loadPyTemplateProcessor(std::shared_ptr<GenAiServableProperties> properties, const ExtraGenerationInfo& extraGenInfo);
static ExtraGenerationInfo readExtraGenerationInfo(std::shared_ptr<GenAiServableProperties> properties, const std::string& chatTemplateDirectory);
#else
// In C++ only version we use GenAI for template processing, but to have the same behavior as in Python-enabled version
// we use default template if model does not have its own, so that servable can also work on chat/completion endpoint.
static void loadDefaultTemplateProcessorIfNeeded(std::shared_ptr<GenAiServableProperties> properties);
#endif
/*
initialize method implementation MUST fill servable with all required properties i.e. pipeline, tokenizer, configs etc. based on mediapipe node options.
1 change: 1 addition & 0 deletions src/test/llm/dummy_facebook_template.jinja
@@ -0,0 +1 @@
{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}{% elif message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}
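
For reference, a minimal sketch (not part of the PR; assumes a local jinja2 install and an illustrative chat history) of how this dummy template flattens a conversation into a plain prompt, using the same sandboxed environment settings as the loader above:

# Sketch only: render the dummy template the way the servable's Jinja loader would.
from jinja2.sandbox import ImmutableSandboxedEnvironment

DUMMY_TEMPLATE = (
    "{% for message in messages %}"
    "{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}"
    "{% elif message['role'] == 'system' %}{{ '<|system|>\\n' + message['content'] + eos_token }}"
    "{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}"
    "{% endif %}{% endfor %}"
)

env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True)
template = env.from_string(DUMMY_TEMPLATE)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is OpenVINO?"},
]
# The eos_token value is illustrative; the servable fills it in from the tokenizer.
print(template.render(messages=messages, eos_token="</s>"))
# Prints:
# <|system|>
# You are a helpful assistant.</s>User: What is OpenVINO?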