
Commit 2853346

init

1 parent 4236db5

8 files changed: +151 -416 lines changed

prepare_llm_models.sh

Lines changed: 16 additions & 7 deletions

@@ -20,7 +20,7 @@ if [ -z "$1" ]; then
     exit 1
 fi
 
-CB_MODEL="facebook/opt-125m"
+TEXT_GENERATION_MODEL="facebook/opt-125m"
 TOKENIZER_FILE="openvino_tokenizer.bin"
 LEGACY_MODEL_FILE="1/model.bin"
 EMBEDDING_MODEL="thenlper/gte-small"
@@ -35,7 +35,7 @@ PHI4_MODEL="microsoft/Phi-4-mini-instruct"
 MISTRAL_MODEL="mistralai/Mistral-7B-Instruct-v0.3"
 GPT_OSS="openai/gpt-oss-20b"
 
-MODELS=("$CB_MODEL/$TOKENIZER_FILE" "$EMBEDDING_MODEL/embeddings/$LEGACY_MODEL_FILE" "$RERANK_MODEL/rerank/$LEGACY_MODEL_FILE" "$VLM_MODEL/$TOKENIZER_FILE" "$QWEN3_MODEL/$TOKENIZER_FILE" "$LLAMA3_MODEL/$TOKENIZER_FILE" "$HERMES3_MODEL/$TOKENIZER_FILE" "$PHI4_MODEL/$TOKENIZER_FILE" "$MISTRAL_MODEL/$TOKENIZER_FILE" "$GPT_OSS/$TOKENIZER_FILE" "$EMBEDDING_MODEL/ov/$TOKENIZER_FILE" "$RERANK_MODEL/ov/$TOKENIZER_FILE")
+MODELS=("$TEXT_GENERATION_MODEL/$TOKENIZER_FILE" "$EMBEDDING_MODEL/embeddings/$LEGACY_MODEL_FILE" "$RERANK_MODEL/rerank/$LEGACY_MODEL_FILE" "$VLM_MODEL/$TOKENIZER_FILE" "$QWEN3_MODEL/$TOKENIZER_FILE" "$LLAMA3_MODEL/$TOKENIZER_FILE" "$HERMES3_MODEL/$TOKENIZER_FILE" "$PHI4_MODEL/$TOKENIZER_FILE" "$MISTRAL_MODEL/$TOKENIZER_FILE" "$GPT_OSS/$TOKENIZER_FILE" "$EMBEDDING_MODEL/ov/$TOKENIZER_FILE" "$RERANK_MODEL/ov/$TOKENIZER_FILE")
 
 all_exist=true
 for model in "${MODELS[@]}"; do
@@ -69,13 +69,22 @@ else
 fi
 mkdir -p $1
 
-if [ -f "$1/$CB_MODEL/$TOKENIZER_FILE" ]; then
-    echo "Models file $1/$CB_MODEL/$TOKENIZER_FILE exists. Skipping downloading models."
+if [ -f "$1/$TEXT_GENERATION_MODEL/$TOKENIZER_FILE" ]; then
+    echo "Models file $1/$TEXT_GENERATION_MODEL/$TOKENIZER_FILE exists. Skipping downloading models."
 else
-    python3 demos/common/export_models/export_model.py text_generation --source_model "$CB_MODEL" --weight-format int8 --model_repository_path $1
+    python3 demos/common/export_models/export_model.py text_generation --source_model "$TEXT_GENERATION_MODEL" --weight-format int8 --model_repository_path $1
+    if [ ! -f "$1/$TEXT_GENERATION_MODEL/chat_template.jinja" ]; then
+        dummy_chat_template="{% for message in messages %}\
+{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}\
+{% elif message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + eos_token }}\
+{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}\
+{% endif %}\
+{% endfor %}"
+        echo "$dummy_chat_template" > "$1/$TEXT_GENERATION_MODEL/chat_template.jinja"
+    fi
 fi
-if [ ! -f "$1/$CB_MODEL/$TOKENIZER_FILE" ]; then
-    echo "[ERROR] Models file $1/$CB_MODEL/$TOKENIZER_FILE does not exist."
+if [ ! -f "$1/$TEXT_GENERATION_MODEL/$TOKENIZER_FILE" ]; then
+    echo "[ERROR] Models file $1/$TEXT_GENERATION_MODEL/$TOKENIZER_FILE does not exist."
     exit 1
 fi
 
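Note: the dummy chat template written above is plain Jinja, so its effect can be sanity-checked outside the server. A minimal sketch in Python with jinja2; the example messages and the "</s>" eos_token value are illustrative and not taken from the facebook/opt-125m tokenizer:

# Render the dummy chat template that prepare_llm_models.sh writes when the
# exported model has no chat_template.jinja of its own.
from jinja2 import Environment

DUMMY_TEMPLATE = (
    "{% for message in messages %}"
    "{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}"
    "{% elif message['role'] == 'system' %}{{ '<|system|>\\n' + message['content'] + eos_token }}"
    "{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}"
    "{% endif %}"
    "{% endfor %}"
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is OpenVINO?"},
]
rendered = Environment().from_string(DUMMY_TEMPLATE).render(messages=messages, eos_token="</s>")
print(rendered)
# Prints something like:
# <|system|>
# You are a helpful assistant.</s>User: What is OpenVINO?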

src/llm/servable.cpp

Lines changed: 6 additions & 1 deletion

@@ -124,7 +124,12 @@ absl::Status GenAiServable::prepareInputs(std::shared_ptr<GenAiServableExecution
 #else
     ov::genai::ChatHistory& chatHistory = executionContext->apiHandler->getChatHistory();
     constexpr bool add_generation_prompt = true;  // confirm it should be hardcoded
-    inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
+    try {
+        inputText = getProperties()->tokenizer.apply_chat_template(chatHistory, add_generation_prompt);
+    } catch (const std::exception& e) {
+        SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Failed to apply chat template: {}", e.what());
+        return absl::Status(absl::StatusCode::kInvalidArgument, "Failed to apply chat template. The model either does not have chat template or has an invalid one.");
+    }
 #endif
     if (inputText.size() == 0) {
         return absl::Status(absl::StatusCode::kInvalidArgument, "Final prompt after applying chat template is empty");

src/llm/servable_initializer.cpp

Lines changed: 42 additions & 71 deletions

@@ -51,15 +51,17 @@
 namespace ovms {
 
 static const std::string CHAT_TEMPLATE_WARNING_MESSAGE = "Warning: Chat template has not been loaded properly. Servable will not respond to /chat/completions endpoint.";
-static const std::string DEFAULT_CHAT_TEMPLATE = R"({% if messages|length != 1 %} {{ raise_exception('This servable accepts only single message requests') }}{% endif %}{{ messages[0]['content'] }})";
 
 void GenAiServableInitializer::loadChatTemplate(std::shared_ptr<GenAiServableProperties> properties, const std::string& chatTemplateDirectory) {
 #if (PYTHON_DISABLE == 0)
     ExtraGenerationInfo extraGenInfo = readExtraGenerationInfo(properties, chatTemplateDirectory);
     loadPyTemplateProcessor(properties, extraGenInfo);
 #else
-    loadDefaultTemplateProcessorIfNeeded(properties);
+    if (properties->tokenizer.get_chat_template().empty()) {
+        SPDLOG_LOGGER_DEBUG(modelmanager_logger, CHAT_TEMPLATE_WARNING_MESSAGE);
+    }
 #endif
+    // In non-python build, GenAI handles chat template loading
 }
 
 #if (PYTHON_DISABLE == 0)
@@ -123,29 +125,34 @@ ExtraGenerationInfo GenAiServableInitializer::readExtraGenerationInfo(std::share
 }
 
 void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServableProperties> properties, const ExtraGenerationInfo& extraGenInfo) {
-    // GGUF models specific validation
-    if (extraGenInfo.isGgufModel) {
-        bool errorFound = false;
-        if (extraGenInfo.eosTokenFromTokenizer.empty()) {
-            SPDLOG_ERROR("Tokenizer eos token not found in tokenizer nor in vocabulary but required for GGUF models.");
-            errorFound = true;
-        }
-        if (extraGenInfo.bosTokenFromTokenizer.empty()) {
-            SPDLOG_ERROR("Tokenizer bos token not found in tokenizer nor in vocabulary but required for GGUF models.");
-            errorFound = true;
-        }
-        if (extraGenInfo.chatTemplateFromTokenizer.empty()) {
-            SPDLOG_ERROR("Tokenizer chat template not found in tokenizer but required for GGUF models.");
-            errorFound = true;
-        }
-        if (errorFound)
-            return;
+    // At this point tokenizer cannot be uninitialized as we need to access its methods for prepare for chat template processing
+    if (properties->tokenizer == ov::genai::Tokenizer()) {
+        SPDLOG_LOGGER_ERROR(modelmanager_logger, "Tokenizer is not initialized. Cannot load chat template processor.");
+        return;
+    }
+    std::string chatTemplate = properties->tokenizer.get_original_chat_template();
+    std::string bosToken = properties->tokenizer.get_bos_token();
+    std::string eosToken = properties->tokenizer.get_eos_token();
+    if (bosToken.empty()) {
+        SPDLOG_ERROR("BOS token was not found in model files.");
+        return;
     }
+    if (eosToken.empty()) {
+        SPDLOG_ERROR("EOS token was not found in model files.");
+        return;
+    }
+    if (chatTemplate.empty()) {
+        SPDLOG_ERROR("Chat template was not found in model files.");
+        return;
+    }
+
+    properties->templateProcessor.bosToken = bosToken;
+    properties->templateProcessor.eosToken = eosToken;
+
     py::gil_scoped_acquire acquire;
     try {
-        auto locals = py::dict("tokenizer_template"_a = extraGenInfo.chatTemplateFromTokenizer,
-            "templates_directory"_a = extraGenInfo.chatTemplateDirectory,
-            "is_gguf_model"_a = extraGenInfo.isGgufModel);
+        auto locals = py::dict("chat_template"_a = chatTemplate,
+            "templates_directory"_a = extraGenInfo.chatTemplateDirectory);
         py::exec(R"(
             # Following the logic from:
             # https://github.com/huggingface/transformers/blob/25245ec26dc29bcf6102e1b4ddd0dfd02e720cf5/src/transformers/tokenization_utils_base.py#L1837
@@ -214,71 +221,44 @@ void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServ
                 self._rendered_blocks = None
                 self._generation_indices = None
 
-
-            # Default chat template accepts only single message and outputs only it's 'content'
-            # effectively turning it into a regular prompt.
-            default_chat_template = "{% if messages|length != 1 %} {{ raise_exception('This servable accepts only single message requests') }}{% endif %}{{ messages[0]['content'] }}"
-
-            bos_token = ""
-            eos_token = ""
-            chat_template = default_chat_template
+
+            # Optional dedicated tool chat template (might not be present)
             tool_chat_template = None
 
+            # Variables needed to be set at the end of this script execution
             template = None
             tool_template = None
 
-            # Try to read template from template.jinja file
-            jinja_file = Path(templates_directory + "/chat_template.jinja")
-            jinja_file_legacy = Path(templates_directory + "/template.jinja")
+            # Load Jinja2 environment
             template_loader = jinja2.FileSystemLoader(searchpath=templates_directory)
             jinja_env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True, extensions=[AssistantTracker, jinja2.ext.loopcontrols], loader=template_loader)
             jinja_env.policies["json.dumps_kwargs"]["ensure_ascii"] = False
             jinja_env.globals["raise_exception"] = raise_exception
             jinja_env.globals["strftime_now"] = strftime_now
             jinja_env.filters["from_json"] = json.loads
-            if jinja_file.is_file():
-                template = jinja_env.get_template("chat_template.jinja")
-            elif jinja_file_legacy.is_file():
-                template = jinja_env.get_template("template.jinja")
 
-            # Try to read data from tokenizer_config.json
+            # Try to read data from tokenizer_config.json to get additional tool chat template if present
             tokenizer_config_file = Path(templates_directory + "/tokenizer_config.json")
             if tokenizer_config_file.is_file():
                 f = open(templates_directory + "/tokenizer_config.json", "r", encoding="utf-8")
                 data = json.load(f)
-                bos_token = data.get("bos_token", "")
-                bos_token = "" if bos_token is None else bos_token # Null token conversion to empty string.
-                eos_token = data.get("eos_token", "")
-                eos_token = "" if eos_token is None else eos_token # Null token conversion to empty string.
-
-                chat_template = data.get("chat_template", default_chat_template)
-                if isinstance(chat_template, list):
-                    for template_entry in chat_template:
+
+                chat_template_from_tokenizer_config = data.get("chat_template", None)
+                if isinstance(chat_template_from_tokenizer_config, list):
+                    for template_entry in chat_template_from_tokenizer_config:
                         if isinstance(template_entry, dict):
-                            if template_entry.get("name") == "default":
-                                chat_template = template_entry.get("template")
-                            elif template_entry.get("name") == "tool_use":
+                            if template_entry.get("name") == "tool_use":
                                 tool_chat_template = template_entry.get("template")
-            if template is None:
-                if is_gguf_model and (chat_template == default_chat_template):
-                    # GGUF model directory might not contain files with chat template and in that case we use template read from the tokenizer
-                    template = jinja_env.from_string(tokenizer_template)
-                else:
-                    template = jinja_env.from_string(chat_template)
+
+            # Load templates from strings
+            template = jinja_env.from_string(chat_template)
             if tool_chat_template is not None:
                 tool_template = jinja_env.from_string(tool_chat_template)
             else:
                 tool_template = template
         )",
             py::globals(), locals);
 
-        if (extraGenInfo.isGgufModel) {
-            properties->templateProcessor.bosToken = extraGenInfo.bosTokenFromTokenizer;
-            properties->templateProcessor.eosToken = extraGenInfo.eosTokenFromTokenizer;
-        } else {
-            properties->templateProcessor.bosToken = locals["bos_token"].cast<std::string>();
-            properties->templateProcessor.eosToken = locals["eos_token"].cast<std::string>();
-        }
         properties->templateProcessor.chatTemplate = std::make_unique<PyObjectWrapper<py::object>>(locals["template"]);
         properties->templateProcessor.toolTemplate = std::make_unique<PyObjectWrapper<py::object>>(locals["tool_template"]);
     } catch (const pybind11::error_already_set& e) {
@@ -298,15 +278,6 @@ void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServ
         SPDLOG_DEBUG("Chat template loading failed with an unexpected error");
     }
 }
-
-#else
-void GenAiServableInitializer::loadDefaultTemplateProcessorIfNeeded(std::shared_ptr<GenAiServableProperties> properties) {
-    const std::string modelChatTemplate = properties->tokenizer.get_chat_template();
-    if (modelChatTemplate.empty()) {
-        SPDLOG_LOGGER_DEBUG(modelmanager_logger, "Could not load model chat template. Using default template.");
-        properties->tokenizer.set_chat_template(DEFAULT_CHAT_TEMPLATE);
-    }
-}
 #endif
 
 Status parseModelsPath(std::string& outPath, std::string modelsPath, std::string graphPath) {
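After this change the embedded script no longer reads bos/eos tokens or a default chat template from disk; the main template comes from the GenAI tokenizer, and tokenizer_config.json is consulted only for an optional "tool_use" entry. A standalone Python sketch of that lookup, mirroring the embedded logic; the directory path in the usage line is an illustrative placeholder:

# Return the "tool_use" template string from tokenizer_config.json, if any.
import json
from pathlib import Path

def read_tool_chat_template(templates_directory: str):
    config_path = Path(templates_directory) / "tokenizer_config.json"
    if not config_path.is_file():
        return None
    with open(config_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    chat_template = data.get("chat_template")
    # Hugging Face allows "chat_template" to be a list of {"name": ..., "template": ...} entries.
    if isinstance(chat_template, list):
        for entry in chat_template:
            if isinstance(entry, dict) and entry.get("name") == "tool_use":
                return entry.get("template")
    return None

print(read_tool_chat_template("./models/facebook/opt-125m"))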

src/llm/servable_initializer.hpp

Lines changed: 0 additions & 4 deletions

@@ -52,10 +52,6 @@ class GenAiServableInitializer {
     // Use Python Jinja module for template processing
     static void loadPyTemplateProcessor(std::shared_ptr<GenAiServableProperties> properties, const ExtraGenerationInfo& extraGenInfo);
     static ExtraGenerationInfo readExtraGenerationInfo(std::shared_ptr<GenAiServableProperties> properties, const std::string& chatTemplateDirectory);
-#else
-    // In C++ only version we use GenAI for template processing, but to have the same behavior as in Python-enabled version
-    // we use default template if model does not have its own, so that servable can also work on chat/completion endpoint.
-    static void loadDefaultTemplateProcessorIfNeeded(std::shared_ptr<GenAiServableProperties> properties);
 #endif
     /*
     initialize method implementation MUST fill servable with all required properties i.e. pipeline, tokenizer, configs etc. based on mediapipe node options.

src/test/llm/assisted_decoding_test.cpp

Lines changed: 15 additions & 17 deletions

@@ -94,8 +94,12 @@ class AssistedDecodingPipelinesHttpTest : public ::testing::Test {
         }
     }
 
-    int generateExpectedText(std::string prompt, bool addSpecialTokens) {
+    int generateExpectedText(std::string prompt, bool addSpecialTokens, bool applyChatTemplate) {
         try {
+            if (applyChatTemplate) {
+                ov::genai::ChatHistory chatHistory({{{"role", "user"}, {"content", prompt}}});
+                prompt = cbPipe->get_tokenizer().apply_chat_template(chatHistory, true);
+            }
             ov::Tensor promptIds = cbPipe->get_tokenizer().encode(prompt, ov::genai::add_special_tokens(addSpecialTokens)).input_ids;
             std::cout << "Generated prompt ids: " << getPromptTokensString(promptIds) << std::endl;
             auto generationHandle = cbPipe->add_request(
@@ -162,7 +166,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryCompletionsJsonSpeculativeDecodin
     // Generate reference from the base model (unassisted generation)
     config.max_new_tokens = 10;
     config.temperature = 0;
-    ASSERT_EQ(generateExpectedText("What is OpenVINO?", true), 0);
+    ASSERT_EQ(generateExpectedText("What is OpenVINO?", true, false), 0);
     ASSERT_EQ(config.num_return_sequences, expectedMessages.size());
 
     // Static number of candidates
@@ -185,8 +189,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryCompletionsJsonSpeculativeDecodin
     ASSERT_EQ(parsedResponse["choices"].Capacity(), 1);
     auto& choice = parsedResponse["choices"].GetArray()[0];
     ASSERT_TRUE(choice["text"].IsString());
-    // TODO: awaiting OV/GenAI fix, uncomment when fixed
-    // EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
+    EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
 
     // Dynamic number of candidates
     requestBody = R"(
@@ -208,15 +211,14 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryCompletionsJsonSpeculativeDecodin
     ASSERT_EQ(parsedResponse["choices"].Capacity(), 1);
     choice = parsedResponse["choices"].GetArray()[0];
     ASSERT_TRUE(choice["text"].IsString());
-    // TODO: awaiting OV/GenAI fix, uncomment when fixed
-    // EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
+    EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
 }
 
 TEST_F(AssistedDecodingPipelinesHttpTest, unaryChatCompletionsJsonSpeculativeDecoding) {
     // Generate reference from the base model (unassisted generation)
     config.max_new_tokens = 10;
     config.temperature = 0;
-    ASSERT_EQ(generateExpectedText("What is OpenVINO?", false), 0);
+    ASSERT_EQ(generateExpectedText("What is OpenVINO?", false, true), 0);
     ASSERT_EQ(config.num_return_sequences, expectedMessages.size());
 
     // Static number of candidates
@@ -247,8 +249,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryChatCompletionsJsonSpeculativeDec
     ASSERT_TRUE(choice["message"]["content"].IsString());
     ASSERT_TRUE(choice["finish_reason"].IsString());
     ASSERT_FALSE(choice["logprobs"].IsObject());
-    // TODO: awaiting OV/GenAI fix, uncomment when fixed
-    // ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);
+    ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);
 
     // Dynamic number of candidates
     requestBody = R"(
@@ -278,8 +279,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryChatCompletionsJsonSpeculativeDec
     ASSERT_TRUE(choice["message"]["content"].IsString());
     ASSERT_TRUE(choice["finish_reason"].IsString());
    ASSERT_FALSE(choice["logprobs"].IsObject());
-    // TODO: awaiting OV/GenAI fix, uncomment when fixed
-    // ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);
+    ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);
 }
 
 TEST_F(AssistedDecodingPipelinesHttpTest, speculativeDecodingExclusiveParametersProvided) {
@@ -318,7 +318,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryCompletionsJsonPromptLookupDecodi
     // Generate reference from the base model (unassisted generation)
     config.max_new_tokens = 10;
    config.temperature = 0;
-    ASSERT_EQ(generateExpectedText("What is OpenVINO?", true), 0);
+    ASSERT_EQ(generateExpectedText("What is OpenVINO?", true, false), 0);
     ASSERT_EQ(config.num_return_sequences, expectedMessages.size());
 
     std::string requestBody = R"(
@@ -341,15 +341,14 @@
     ASSERT_EQ(parsedResponse["choices"].Capacity(), 1);
     auto& choice = parsedResponse["choices"].GetArray()[0];
     ASSERT_TRUE(choice["text"].IsString());
-    // TODO: awaiting OV/GenAI fix, uncomment when fixed
-    // EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
+    EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
 }
 
 TEST_F(AssistedDecodingPipelinesHttpTest, unaryChatCompletionsJsonPromptLookupDecoding) {
     // Generate reference from the base model (unassisted generation)
     config.max_new_tokens = 10;
     config.temperature = 0;
-    ASSERT_EQ(generateExpectedText("What is OpenVINO?", false), 0);
+    ASSERT_EQ(generateExpectedText("What is OpenVINO?", false, true), 0);
     ASSERT_EQ(config.num_return_sequences, expectedMessages.size());
 
     auto requestBody = R"(
@@ -380,8 +379,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryChatCompletionsJsonPromptLookupDe
     ASSERT_TRUE(choice["message"]["content"].IsString());
     ASSERT_TRUE(choice["finish_reason"].IsString());
     ASSERT_FALSE(choice["logprobs"].IsObject());
-    // TODO: awaiting OV/GenAI fix, uncomment when fixed
-    // ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);
+    ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);
 }
 
 // Consider parametrization of negative tests with request body and endpoint as parameters
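For the chat-completions tests the reference text is now produced by applying the tokenizer's chat template to the prompt before encoding, and addSpecialTokens is set to false in that case because the rendered prompt already carries any special tokens. A rough Python analogue of that flow, using Hugging Face transformers rather than the C++ ov::genai::Tokenizer used in the test; the model name is only an example and is assumed to ship a chat template, which instruct models such as this one typically do:

# Sketch of generateExpectedText(..., addSpecialTokens=false, applyChatTemplate=true).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-4-mini-instruct")
messages = [{"role": "user", "content": "What is OpenVINO?"}]
# Render the chat template with a trailing generation prompt, as the endpoint does.
prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
# Encode without adding special tokens again; the template already inserted them.
input_ids = tokenizer(prompt, add_special_tokens=False).input_ids
print(prompt)
print(input_ids)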
