Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/llm/servable_initializer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
namespace ovms {

static const std::string CHAT_TEMPLATE_WARNING_MESSAGE = "Warning: Chat template has not been loaded properly. Servable will not respond to /chat/completions endpoint.";
static const std::string DEFAULT_CHAT_TEMPLATE = R"({% if messages|length != 1 %} {{ raise_exception('This servable accepts only single message requests') }}{% endif %}{{ messages[0]['content'] }})";
static const std::string DEFAULT_CHAT_TEMPLATE = R"({% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}{% elif message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %})";

void GenAiServableInitializer::loadChatTemplate(std::shared_ptr<GenAiServableProperties> properties, const std::string& chatTemplateDirectory) {
#if (PYTHON_DISABLE == 0)
Expand Down Expand Up @@ -217,7 +217,7 @@ void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServ

# Default chat template accepts only single message and outputs only it's 'content'
# effectively turning it into a regular prompt.
default_chat_template = "{% if messages|length != 1 %} {{ raise_exception('This servable accepts only single message requests') }}{% endif %}{{ messages[0]['content'] }}"
default_chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}{% elif message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}"

bos_token = ""
eos_token = ""
Expand Down
34 changes: 17 additions & 17 deletions src/test/llm/assisted_decoding_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,12 @@ class AssistedDecodingPipelinesHttpTest : public ::testing::Test {
}
}

int generateExpectedText(std::string prompt, bool addSpecialTokens) {
int generateExpectedText(std::string prompt, bool addSpecialTokens, bool applyChatTemplate) {
try {
if (applyChatTemplate) {
ov::genai::ChatHistory chatHistory({{{"role", "user"}, {"content", prompt}}});
prompt = cbPipe->get_tokenizer().apply_chat_template(chatHistory, true);
}
ov::Tensor promptIds = cbPipe->get_tokenizer().encode(prompt, ov::genai::add_special_tokens(addSpecialTokens)).input_ids;
std::cout << "Generated prompt ids: " << getPromptTokensString(promptIds) << std::endl;
auto generationHandle = cbPipe->add_request(
Expand Down Expand Up @@ -162,7 +166,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryCompletionsJsonSpeculativeDecodin
// Generate reference from the base model (unassisted generation)
config.max_new_tokens = 10;
config.temperature = 0;
ASSERT_EQ(generateExpectedText("What is OpenVINO?", true), 0);
ASSERT_EQ(generateExpectedText("What is OpenVINO?", true, false), 0);
ASSERT_EQ(config.num_return_sequences, expectedMessages.size());

// Static number of candidates
Expand All @@ -185,8 +189,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryCompletionsJsonSpeculativeDecodin
ASSERT_EQ(parsedResponse["choices"].Capacity(), 1);
auto& choice = parsedResponse["choices"].GetArray()[0];
ASSERT_TRUE(choice["text"].IsString());
// TODO: awaiting OV/GenAI fix, uncomment when fixed
// EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());

// Dynamic number of candidates
requestBody = R"(
Expand All @@ -208,15 +211,15 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryCompletionsJsonSpeculativeDecodin
ASSERT_EQ(parsedResponse["choices"].Capacity(), 1);
choice = parsedResponse["choices"].GetArray()[0];
ASSERT_TRUE(choice["text"].IsString());
// TODO: awaiting OV/GenAI fix, uncomment when fixed
// EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
}

TEST_F(AssistedDecodingPipelinesHttpTest, unaryChatCompletionsJsonSpeculativeDecoding) {
GTEST_SKIP(); // TODO: Temporary skip to synchronize CI workers
// Generate reference from the base model (unassisted generation)
config.max_new_tokens = 10;
config.temperature = 0;
ASSERT_EQ(generateExpectedText("What is OpenVINO?", false), 0);
ASSERT_EQ(generateExpectedText("What is OpenVINO?", false, true), 0);
ASSERT_EQ(config.num_return_sequences, expectedMessages.size());

// Static number of candidates
Expand Down Expand Up @@ -247,8 +250,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryChatCompletionsJsonSpeculativeDec
ASSERT_TRUE(choice["message"]["content"].IsString());
ASSERT_TRUE(choice["finish_reason"].IsString());
ASSERT_FALSE(choice["logprobs"].IsObject());
// TODO: awaiting OV/GenAI fix, uncomment when fixed
// ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);
ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);

// Dynamic number of candidates
requestBody = R"(
Expand Down Expand Up @@ -278,8 +280,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryChatCompletionsJsonSpeculativeDec
ASSERT_TRUE(choice["message"]["content"].IsString());
ASSERT_TRUE(choice["finish_reason"].IsString());
ASSERT_FALSE(choice["logprobs"].IsObject());
// TODO: awaiting OV/GenAI fix, uncomment when fixed
// ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);
ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);
}

TEST_F(AssistedDecodingPipelinesHttpTest, speculativeDecodingExclusiveParametersProvided) {
Expand Down Expand Up @@ -318,7 +319,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryCompletionsJsonPromptLookupDecodi
// Generate reference from the base model (unassisted generation)
config.max_new_tokens = 10;
config.temperature = 0;
ASSERT_EQ(generateExpectedText("What is OpenVINO?", true), 0);
ASSERT_EQ(generateExpectedText("What is OpenVINO?", true, false), 0);
ASSERT_EQ(config.num_return_sequences, expectedMessages.size());

std::string requestBody = R"(
Expand All @@ -341,15 +342,15 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryCompletionsJsonPromptLookupDecodi
ASSERT_EQ(parsedResponse["choices"].Capacity(), 1);
auto& choice = parsedResponse["choices"].GetArray()[0];
ASSERT_TRUE(choice["text"].IsString());
// TODO: awaiting OV/GenAI fix, uncomment when fixed
// EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
}

TEST_F(AssistedDecodingPipelinesHttpTest, unaryChatCompletionsJsonPromptLookupDecoding) {
GTEST_SKIP(); // TODO: Temporary skip to synchronize CI workers
// Generate reference from the base model (unassisted generation)
config.max_new_tokens = 10;
config.temperature = 0;
ASSERT_EQ(generateExpectedText("What is OpenVINO?", false), 0);
ASSERT_EQ(generateExpectedText("What is OpenVINO?", false, true), 0);
ASSERT_EQ(config.num_return_sequences, expectedMessages.size());

auto requestBody = R"(
Expand Down Expand Up @@ -380,8 +381,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryChatCompletionsJsonPromptLookupDe
ASSERT_TRUE(choice["message"]["content"].IsString());
ASSERT_TRUE(choice["finish_reason"].IsString());
ASSERT_FALSE(choice["logprobs"].IsObject());
// TODO: awaiting OV/GenAI fix, uncomment when fixed
// ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);
ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);
}

// Consider parametrization of negative tests with request body and endpoint as parameters
Expand Down
54 changes: 13 additions & 41 deletions src/test/llm/llmnode_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,12 @@ class LLMFlowHttpTest : public ::testing::Test {
}
}

int generateExpectedText(std::string prompt, bool addSpecialTokens = true) {
int generateExpectedText(std::string prompt, bool addSpecialTokens = true, bool applyChatTemplate = false) {
try {
if (applyChatTemplate) {
ov::genai::ChatHistory chatHistory({{{"role", "user"}, {"content", prompt}}});
prompt = cbPipe->get_tokenizer().apply_chat_template(chatHistory, true);
}
ov::Tensor promptIds = cbPipe->get_tokenizer().encode(prompt, ov::genai::add_special_tokens(addSpecialTokens)).input_ids;
std::cout << "Generated prompt ids: " << getPromptTokensString(promptIds) << std::endl;
auto generationHandle = cbPipe->add_request(
Expand Down Expand Up @@ -737,14 +741,15 @@ TEST_P(LLMFlowHttpTestParameterized, unaryChatCompletionsJsonNFail) {
}

TEST_P(LLMFlowHttpTestParameterized, unaryChatCompletionsJsonN) {
GTEST_SKIP(); // TODO: Temporary skip to synchronize CI workers
auto params = GetParam();
config.max_new_tokens = 5;
config.rng_seed = 1;
config.num_beams = 16;
config.num_return_sequences = 8;
config.echo = false;
if (params.generateExpectedOutput) {
ASSERT_EQ(generateExpectedText("What is OpenVINO?", false), 0);
ASSERT_EQ(generateExpectedText("What is OpenVINO?", false, true), 0);
ASSERT_EQ(config.num_return_sequences, expectedMessages.size());
}
std::string requestBody = R"(
Expand Down Expand Up @@ -2754,34 +2759,7 @@ TEST_P(LLMHttpParametersValidationTest, missingContentInMessage) {
)";

ovms::Status status = handler->dispatchToProcessor(endpointChatCompletions, requestBody, &response, comp, responseComponents, writer, multiPartParser);
#if (PYTHON_DISABLE == 0)
bool genAiTemplateParsing = false; // With Python enabled, we use native Jinja2 template parsing
#else
bool genAiTemplateParsing = true; // With Python disabled, we use GenAI template parsing
#endif

if (params.modelName.find("vlm") != std::string::npos) {
ASSERT_EQ(status.getCode(), ovms::StatusCode::OK); // GenAI accepts such messages, so we expect a successful response
return;
}

if (genAiTemplateParsing) {
/*
This test checks if API handler validation allows messages without content.
The reason why we expect generic error here is that with GenAI template rendering missing content is unexpected.
On the API handler level this is a positive path as this test confirms that request reaches template processing phase.
*/
ASSERT_EQ(status.getCode(), ovms::StatusCode::MEDIAPIPE_EXECUTION_ERROR);
ASSERT_NE(status.string().find("Response generation failed"), std::string::npos);
} else {
/*
This test checks if API handler validation allows messages without content.
The reason why we expect error here is that for the tested LLM model, lack of content means that pipeline input is empty.
On the API handler level this is a positive path as this test confirms that request reaches template processing phase.
*/
ASSERT_EQ(status.getCode(), ovms::StatusCode::MEDIAPIPE_EXECUTION_ERROR);
ASSERT_NE(status.string().find("Final prompt after applying chat template is empty"), std::string::npos);
}
ASSERT_EQ(status.getCode(), ovms::StatusCode::OK);
}

TEST_P(LLMHttpParametersValidationTest, roleNotAString) {
Expand Down Expand Up @@ -3267,19 +3245,13 @@ TEST_P(LLMHttpParametersValidationTest, MessagesWithOnlyRole) {
{
"model": ")" + params.modelName +
R"(",
"messages": [{"role": "abc"}]
"messages": [{"role": "user"}]
}
)";

if (params.modelName.find("vlm") != std::string::npos) {
ASSERT_EQ(
handler->dispatchToProcessor(endpointChatCompletions, requestBody, &response, comp, responseComponents, writer, multiPartParser),
ovms::StatusCode::OK); // GenAI supports such messages
} else {
ASSERT_EQ(
handler->dispatchToProcessor(endpointChatCompletions, requestBody, &response, comp, responseComponents, writer, multiPartParser),
ovms::StatusCode::MEDIAPIPE_EXECUTION_ERROR);
}
ASSERT_EQ(
handler->dispatchToProcessor(endpointChatCompletions, requestBody, &response, comp, responseComponents, writer, multiPartParser),
ovms::StatusCode::OK); // GenAI supports such messages
}

TEST_P(LLMHttpParametersValidationTest, SpeculativeDecodingNoSDSpecificParametersProvided) {
Expand Down Expand Up @@ -3345,7 +3317,7 @@ TEST_P(LLMHttpParametersValidationTest, MessagesWithMoreMessageFields) {
"model": ")" + params.modelName +
R"(",
"max_tokens": 1,
"messages": [{"role": "123", "content": "def", "unexpected": "123"}]
"messages": [{"role": "user", "content": "def", "unexpected": "123"}]
}
)";

Expand Down
Loading