diff --git a/src/llm/servable_initializer.cpp b/src/llm/servable_initializer.cpp
index 6af03ba9ba..2af9f690e0 100644
--- a/src/llm/servable_initializer.cpp
+++ b/src/llm/servable_initializer.cpp
@@ -51,7 +51,7 @@ namespace ovms {
 
 static const std::string CHAT_TEMPLATE_WARNING_MESSAGE = "Warning: Chat template has not been loaded properly. Servable will not respond to /chat/completions endpoint.";
 
-static const std::string DEFAULT_CHAT_TEMPLATE = R"({% if messages|length != 1 %} {{ raise_exception('This servable accepts only single message requests') }}{% endif %}{{ messages[0]['content'] }})";
+static const std::string DEFAULT_CHAT_TEMPLATE = R"({% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}{% elif message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %})";
 
 void GenAiServableInitializer::loadChatTemplate(std::shared_ptr properties, const std::string& chatTemplateDirectory) {
 #if (PYTHON_DISABLE == 0)
@@ -217,7 +217,7 @@ void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptrget_tokenizer().apply_chat_template(chatHistory, true);
+        }
     ov::Tensor promptIds = cbPipe->get_tokenizer().encode(prompt, ov::genai::add_special_tokens(addSpecialTokens)).input_ids;
     std::cout << "Generated prompt ids: " << getPromptTokensString(promptIds) << std::endl;
     auto generationHandle = cbPipe->add_request(
@@ -162,7 +166,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryCompletionsJsonSpeculativeDecodin
     // Generate reference from the base model (unassisted generation)
     config.max_new_tokens = 10;
     config.temperature = 0;
-    ASSERT_EQ(generateExpectedText("What is OpenVINO?", true), 0);
+    ASSERT_EQ(generateExpectedText("What is OpenVINO?", true, false), 0);
     ASSERT_EQ(config.num_return_sequences, expectedMessages.size());
 
     // Static number of candidates
@@ -185,8 +189,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryCompletionsJsonSpeculativeDecodin
     ASSERT_EQ(parsedResponse["choices"].Capacity(), 1);
     auto& choice = parsedResponse["choices"].GetArray()[0];
     ASSERT_TRUE(choice["text"].IsString());
-    // TODO: awaiting OV/GenAI fix, uncomment when fixed
-    // EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
+    EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
 
     // Dynamic number of candidates
     requestBody = R"(
@@ -208,15 +211,15 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryCompletionsJsonSpeculativeDecodin
     ASSERT_EQ(parsedResponse["choices"].Capacity(), 1);
     choice = parsedResponse["choices"].GetArray()[0];
     ASSERT_TRUE(choice["text"].IsString());
-    // TODO: awaiting OV/GenAI fix, uncomment when fixed
-    // EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
+    EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
 }
 
 TEST_F(AssistedDecodingPipelinesHttpTest, unaryChatCompletionsJsonSpeculativeDecoding) {
+    GTEST_SKIP(); // TODO: Temporary skip to synchronize CI workers
     // Generate reference from the base model (unassisted generation)
     config.max_new_tokens = 10;
     config.temperature = 0;
-    ASSERT_EQ(generateExpectedText("What is OpenVINO?", false), 0);
+    ASSERT_EQ(generateExpectedText("What is OpenVINO?", false, true), 0);
     ASSERT_EQ(config.num_return_sequences, expectedMessages.size());
 
     // Static number of candidates
@@ -247,8 +250,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryChatCompletionsJsonSpeculativeDec
ASSERT_TRUE(choice["message"]["content"].IsString()); ASSERT_TRUE(choice["finish_reason"].IsString()); ASSERT_FALSE(choice["logprobs"].IsObject()); - // TODO: awaiting OV/GenAI fix, uncomment when fixed - // ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]); + ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]); // Dynamic number of candidates requestBody = R"( @@ -278,8 +280,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryChatCompletionsJsonSpeculativeDec ASSERT_TRUE(choice["message"]["content"].IsString()); ASSERT_TRUE(choice["finish_reason"].IsString()); ASSERT_FALSE(choice["logprobs"].IsObject()); - // TODO: awaiting OV/GenAI fix, uncomment when fixed - // ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]); + ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]); } TEST_F(AssistedDecodingPipelinesHttpTest, speculativeDecodingExclusiveParametersProvided) { @@ -318,7 +319,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryCompletionsJsonPromptLookupDecodi // Generate reference from the base model (unassisted generation) config.max_new_tokens = 10; config.temperature = 0; - ASSERT_EQ(generateExpectedText("What is OpenVINO?", true), 0); + ASSERT_EQ(generateExpectedText("What is OpenVINO?", true, false), 0); ASSERT_EQ(config.num_return_sequences, expectedMessages.size()); std::string requestBody = R"( @@ -341,15 +342,15 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryCompletionsJsonPromptLookupDecodi ASSERT_EQ(parsedResponse["choices"].Capacity(), 1); auto& choice = parsedResponse["choices"].GetArray()[0]; ASSERT_TRUE(choice["text"].IsString()); - // TODO: awaiting OV/GenAI fix, uncomment when fixed - // EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str()); + EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str()); } TEST_F(AssistedDecodingPipelinesHttpTest, unaryChatCompletionsJsonPromptLookupDecoding) { + GTEST_SKIP(); // TODO: Temporary skip to synchronize CI workers // Generate reference from the base model (unassisted generation) config.max_new_tokens = 10; config.temperature = 0; - ASSERT_EQ(generateExpectedText("What is OpenVINO?", false), 0); + ASSERT_EQ(generateExpectedText("What is OpenVINO?", false, true), 0); ASSERT_EQ(config.num_return_sequences, expectedMessages.size()); auto requestBody = R"( @@ -380,8 +381,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryChatCompletionsJsonPromptLookupDe ASSERT_TRUE(choice["message"]["content"].IsString()); ASSERT_TRUE(choice["finish_reason"].IsString()); ASSERT_FALSE(choice["logprobs"].IsObject()); - // TODO: awaiting OV/GenAI fix, uncomment when fixed - // ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]); + ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]); } // Consider parametrization of negative tests with request body and endpoint as parameters diff --git a/src/test/llm/llmnode_test.cpp b/src/test/llm/llmnode_test.cpp index 1a96fabb99..f0ca554a7e 100644 --- a/src/test/llm/llmnode_test.cpp +++ b/src/test/llm/llmnode_test.cpp @@ -108,8 +108,12 @@ class LLMFlowHttpTest : public ::testing::Test { } } - int generateExpectedText(std::string prompt, bool addSpecialTokens = true) { + int generateExpectedText(std::string prompt, bool addSpecialTokens = true, bool applyChatTemplate = false) { try { + if (applyChatTemplate) { + ov::genai::ChatHistory chatHistory({{{"role", "user"}, {"content", prompt}}}); + prompt = cbPipe->get_tokenizer().apply_chat_template(chatHistory, 
true); + } ov::Tensor promptIds = cbPipe->get_tokenizer().encode(prompt, ov::genai::add_special_tokens(addSpecialTokens)).input_ids; std::cout << "Generated prompt ids: " << getPromptTokensString(promptIds) << std::endl; auto generationHandle = cbPipe->add_request( @@ -737,6 +741,7 @@ TEST_P(LLMFlowHttpTestParameterized, unaryChatCompletionsJsonNFail) { } TEST_P(LLMFlowHttpTestParameterized, unaryChatCompletionsJsonN) { + GTEST_SKIP(); // TODO: Temporary skip to synchronize CI workers auto params = GetParam(); config.max_new_tokens = 5; config.rng_seed = 1; @@ -744,7 +749,7 @@ TEST_P(LLMFlowHttpTestParameterized, unaryChatCompletionsJsonN) { config.num_return_sequences = 8; config.echo = false; if (params.generateExpectedOutput) { - ASSERT_EQ(generateExpectedText("What is OpenVINO?", false), 0); + ASSERT_EQ(generateExpectedText("What is OpenVINO?", false, true), 0); ASSERT_EQ(config.num_return_sequences, expectedMessages.size()); } std::string requestBody = R"( @@ -2754,34 +2759,7 @@ TEST_P(LLMHttpParametersValidationTest, missingContentInMessage) { )"; ovms::Status status = handler->dispatchToProcessor(endpointChatCompletions, requestBody, &response, comp, responseComponents, writer, multiPartParser); -#if (PYTHON_DISABLE == 0) - bool genAiTemplateParsing = false; // With Python enabled, we use native Jinja2 template parsing -#else - bool genAiTemplateParsing = true; // With Python disabled, we use GenAI template parsing -#endif - - if (params.modelName.find("vlm") != std::string::npos) { - ASSERT_EQ(status.getCode(), ovms::StatusCode::OK); // GenAI accepts such messages, so we expect a successful response - return; - } - - if (genAiTemplateParsing) { - /* - This test checks if API handler validation allows messages without content. - The reason why we expect generic error here is that with GenAI template rendering missing content is unexpected. - On the API handler level this is a positive path as this test confirms that request reaches template processing phase. - */ - ASSERT_EQ(status.getCode(), ovms::StatusCode::MEDIAPIPE_EXECUTION_ERROR); - ASSERT_NE(status.string().find("Response generation failed"), std::string::npos); - } else { - /* - This test checks if API handler validation allows messages without content. - The reason why we expect error here is that for the tested LLM model, lack of content means that pipeline input is empty. - On the API handler level this is a positive path as this test confirms that request reaches template processing phase. 
-        */
-        ASSERT_EQ(status.getCode(), ovms::StatusCode::MEDIAPIPE_EXECUTION_ERROR);
-        ASSERT_NE(status.string().find("Final prompt after applying chat template is empty"), std::string::npos);
-    }
+    ASSERT_EQ(status.getCode(), ovms::StatusCode::OK);
 }
 
 TEST_P(LLMHttpParametersValidationTest, roleNotAString) {
@@ -3267,19 +3245,13 @@ TEST_P(LLMHttpParametersValidationTest, MessagesWithOnlyRole) {
         {
             "model": ")" + params.modelName + R"(",
-            "messages": [{"role": "abc"}]
+            "messages": [{"role": "user"}]
         }
     )";
-    if (params.modelName.find("vlm") != std::string::npos) {
-        ASSERT_EQ(
-            handler->dispatchToProcessor(endpointChatCompletions, requestBody, &response, comp, responseComponents, writer, multiPartParser),
-            ovms::StatusCode::OK); // GenAI supports such messages
-    } else {
-        ASSERT_EQ(
-            handler->dispatchToProcessor(endpointChatCompletions, requestBody, &response, comp, responseComponents, writer, multiPartParser),
-            ovms::StatusCode::MEDIAPIPE_EXECUTION_ERROR);
-    }
+    ASSERT_EQ(
+        handler->dispatchToProcessor(endpointChatCompletions, requestBody, &response, comp, responseComponents, writer, multiPartParser),
+        ovms::StatusCode::OK); // GenAI supports such messages
 }
 
 TEST_P(LLMHttpParametersValidationTest, SpeculativeDecodingNoSDSpecificParametersProvided) {
@@ -3345,7 +3317,7 @@ TEST_P(LLMHttpParametersValidationTest, MessagesWithMoreMessageFields) {
             "model": ")" + params.modelName + R"(",
             "max_tokens": 1,
-            "messages": [{"role": "123", "content": "def", "unexpected": "123"}]
+            "messages": [{"role": "user", "content": "def", "unexpected": "123"}]
         }
     )";
diff --git a/src/test/llm/llmtemplate_test.cpp b/src/test/llm/llmtemplate_test.cpp
index 848d795a05..3c49439ac7 100644
--- a/src/test/llm/llmtemplate_test.cpp
+++ b/src/test/llm/llmtemplate_test.cpp
@@ -37,6 +37,7 @@
 #include "../../llm/py_jinja_template_processor.hpp"
 #include "../../mediapipe_internal/mediapipegraphdefinition.hpp"
 #include "../../server.hpp"
+#include "../platform_utils.hpp"
 
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
@@ -75,7 +76,7 @@ class LLMChatTemplateTest : public TestWithTempDir {
     void SetUp() {
         TestWithTempDir::SetUp();
         tokenizerConfigFilePath = directoryPath + "/tokenizer_config.json";
-        jinjaConfigFilePath = directoryPath + "/template.jinja";
+        jinjaConfigFilePath = directoryPath + "/chat_template.jinja";
     }
 
     void TearDown() {
@@ -110,9 +111,8 @@ TEST_F(LLMChatTemplateTest, ChatTemplateEmptyMessage) {
            "messages": []
        }
    )";
-    std::string errorOutput = "This servable accepts only single message requests";
-    ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), false);
-    ASSERT_EQ(finalPrompt, errorOutput);
+    ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true);
+    ASSERT_TRUE(finalPrompt.empty());
 }
 
 TEST_F(LLMChatTemplateTest, ChatTemplateMessageWithEmptyObject) {
@@ -134,10 +134,10 @@ TEST_F(LLMChatTemplateTest, ChatTemplateDefault) {
     std::string finalPrompt = "";
     std::string payloadBody = R"(
        {
-           "messages": [{ "content": "How can I help you?" }]
+           "messages": [{ "role": "user", "content": "How can I help you?" }]
        }
    )";
-    std::string expectedOutput = "How can I help you?";
+    std::string expectedOutput = "User: How can I help you?";
     ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true);
     ASSERT_EQ(finalPrompt, expectedOutput);
 }
@@ -147,12 +147,12 @@ TEST_F(LLMChatTemplateTest, ChatTemplateMultiMessage) {
     std::string finalPrompt = "";
     std::string payloadBody = R"(
        {
-           "messages": [{ "content": "How can I help you?" }, { "content": "2How can I help you?" }]
+           "messages": [{ "role": "user", "content": "How can I help you?" }, { "role": "user", "content": "2How can I help you?" }]
        }
    )";
-    std::string errorOutput = "This servable accepts only single message requests";
-    ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), false);
-    ASSERT_EQ(finalPrompt, errorOutput);
+    std::string expectedOutput = "User: How can I help you?User: 2How can I help you?";
+    ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true);
+    ASSERT_EQ(finalPrompt, expectedOutput);
 }
 
 TEST_F(LLMChatTemplateTest, ChatTemplateComplexMessage) {
@@ -165,7 +165,7 @@ TEST_F(LLMChatTemplateTest, ChatTemplateComplexMessage) {
            "messages": [{"role": "user", "content": "hello"}]
        }
    )";
-    std::string expectedOutput = "hello";
+    std::string expectedOutput = "User: hello";
     ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true);
     ASSERT_EQ(finalPrompt, expectedOutput);
 }
@@ -205,196 +205,6 @@ TEST_F(LLMChatTemplateTest, ChatTemplateJinjaException) {
     ASSERT_EQ(finalPrompt, errorOutput);
 }
 
-TEST_F(LLMChatTemplateTest, ChatTemplateTokenizerDefault) {
-    std::string tokenizerJson = R"({
-        "bos_token": "",
-        "eos_token": ""
-    })";
-    ASSERT_EQ(CreateTokenizerConfig(tokenizerJson), true);
-    LoadTemplateProcessor();
-
-    std::string finalPrompt = "";
-    std::string payloadBody = R"(
-        {
-            "model": "gpt",
-            "stream": false,
-            "messages": [{"role": "user", "content": "hello"}]
-        }
-    )";
-    std::string expectedOutput = "hello";
-    ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true);
-    ASSERT_EQ(finalPrompt, expectedOutput);
-}
-
-TEST_F(LLMChatTemplateTest, ChatTemplateTokenizerBosNull) {
-    std::string tokenizerJson = R"({
-        "bos_token": null,
-        "eos_token": ""
-    })";
-    ASSERT_EQ(CreateTokenizerConfig(tokenizerJson), true);
-    LoadTemplateProcessor();
-    std::string finalPrompt = "";
-    std::string payloadBody = R"(
-        {
-            "model": "gpt",
-            "stream": false,
-            "messages": [{"role": "user", "content": "hello"}]
-        }
-    )";
-    std::string expectedOutput = "hello";
-    // Expect no issues with chat template since non string bos token is ignored
-    ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true);
-    ASSERT_EQ(finalPrompt, expectedOutput);
-}
-
-TEST_F(LLMChatTemplateTest, ChatTemplateTokenizerBosDict) {
-    std::string tokenizerJson = R"({
-        "bos_token": {"bos" : "INVALID"},
-        "eos_token": ""
-    })";
-    ASSERT_EQ(CreateTokenizerConfig(tokenizerJson), true);
-    LoadTemplateProcessor();
-    std::string finalPrompt = "";
-    std::string payloadBody = R"(
-        {
-            "model": "gpt",
-            "stream": false,
-            "messages": [{"role": "user", "content": "hello"}]
-        }
-    )";
-    std::string expectedError = "Error: Chat template not loaded correctly, so it cannot be applied";
-    // Expect no issues with chat template since non string bos token is ignored
-    ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), false);
-    ASSERT_EQ(finalPrompt, expectedError);
-}
-
-TEST_F(LLMChatTemplateTest, ChatTemplateTokenizerEosNull) {
-    std::string tokenizerJson = R"({
-        "bos_token": "",
-        "eos_token": null
-    })";
-    ASSERT_EQ(CreateTokenizerConfig(tokenizerJson), true);
-    LoadTemplateProcessor();
-    std::string finalPrompt = "";
-    std::string payloadBody = R"(
-        {
-            "model": "gpt",
-            "stream": false,
-            "messages": [{"role": "user", "content": "hello"}]
-        }
-    )";
-    std::string expectedOutput = "hello";
-    // Expect no issues with chat template since non string eos token is ignored
-    ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true);
-    ASSERT_EQ(finalPrompt, expectedOutput);
-}
-
-TEST_F(LLMChatTemplateTest, ChatTemplateTokenizerException) {
-    std::string tokenizerJson = R"({
-        "bos_token": "",
-        "eos_token": "",
-    })";
-    ASSERT_EQ(CreateTokenizerConfig(tokenizerJson), true);
-    LoadTemplateProcessor();
-    std::string finalPrompt = "";
-    std::string payloadBody = R"(
-        {
-            "model": "gpt",
-            "stream": false,
-            "messages": [{"role": "user", "content": "hello"}]
-        }
-    )";
-    std::string expectedOutput = "Error: Chat template not loaded correctly, so it cannot be applied";
-    ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), false);
-    ASSERT_EQ(finalPrompt, expectedOutput);
-}
-
-TEST_F(LLMChatTemplateTest, ChatTemplateTokenizerUpperCase) {
-    std::string tokenizerJson = R"({
-        "bos_token": "",
-        "eos_token": "",
-        "chat_template": "{{ \"Hi, \" + messages[0]['content'] | upper }}"
-    })";
-    ASSERT_EQ(CreateTokenizerConfig(tokenizerJson), true);
-    LoadTemplateProcessor();
-    std::string finalPrompt = "";
-    std::string payloadBody = R"(
-        {
-            "model": "gpt",
-            "stream": false,
-            "messages": [{"role": "user", "content": "hello"}]
-        }
-    )";
-    std::string expectedOutput = "Hi, HELLO";
-    ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true);
-    ASSERT_EQ(finalPrompt, expectedOutput);
-}
-
-TEST_F(LLMChatTemplateTest, ChatTemplateTokenizerTemplateException) {
-    std::string tokenizerJson = R"({
-        "bos_token": "",
-        "eos_token": "",
-        "chat_template": "{{ \"Hi, \" + messages[3]['content'] | upper }}"
-    })";
-    ASSERT_EQ(CreateTokenizerConfig(tokenizerJson), true);
-    LoadTemplateProcessor();
-    std::string finalPrompt = "";
-    std::string payloadBody = R"(
-        {
-            "model": "gpt",
-            "stream": false,
-            "messages": [{"role": "user", "content": "hello"}]
-        }
-    )";
-    std::string expectedOutput = "list object has no element 3";
-    ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), false);
-    ASSERT_EQ(finalPrompt, expectedOutput);
-}
-
-TEST_F(LLMChatTemplateTest, ChatTemplateTokenizerTemplateBadVariable) {
-    std::string tokenizerJson = R"({
-        "bos_token": "",
-        "eos_token": "",
-        "chat_template": {}
-    })";
-    ASSERT_EQ(CreateTokenizerConfig(tokenizerJson), true);
-    LoadTemplateProcessor();
-    std::string finalPrompt = "";
-    std::string payloadBody = R"(
-        {
-            "model": "gpt",
-            "stream": false,
-            "messages": [{"role": "user", "content": "hello"}]
-        }
-    )";
-    std::string expectedError = "Error: Chat template not loaded correctly, so it cannot be applied";
-    ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), false);
-    ASSERT_EQ(finalPrompt, expectedError);
-}
-
-TEST_F(LLMChatTemplateTest, ChatTemplateTwoConfigs) {
-    std::string tokenizerJson = R"({
-        "bos_token": "",
-        "eos_token": "",
-        "chat_template": "{{ \"Hi, \" + messages[0]['content'] | lower }}"
-    })";
-    ASSERT_EQ(CreateTokenizerConfig(tokenizerJson), true);
-    std::string jinjaTemplate = R"( {{ "Hi, " + messages[0]['content'] | upper }} )";
-    ASSERT_EQ(CreateJinjaConfig(jinjaTemplate), true);
-    LoadTemplateProcessor();
-    std::string finalPrompt = "";
-    std::string payloadBody = R"(
-        {
-            "model": "gpt",
-            "stream": false,
-            "messages": [{"role": "user", "content": "hello"}]
-        }
-    )";
-    std::string expectedOutput = " Hi, HELLO ";
-    ASSERT_EQ(PyJinjaTemplateProcessor::applyChatTemplate(servable->getProperties()->templateProcessor, servable->getProperties()->modelsPath, payloadBody, finalPrompt), true);
-    ASSERT_EQ(finalPrompt, expectedOutput);
-}
-
 TEST_F(LLMChatTemplateTest, ChatTemplateComparePythonAndGenAiProcessors) {
     GTEST_SKIP() << "Skipping test due to GenAI template processor not being able to compare values of different types (no implicit conversion). Enable when resolved.";
     // Using modified Llama2 template to work with limited tokenizer object (with no models loaded)
@@ -461,10 +271,9 @@ std::string CreatePayloadBodyWithChatTemplateKwargs(const std::string& chat_temp
 }
 
 TEST_F(LLMChatTemplateTest, ChatTemplateKwargsPositive) {
-    std::string tokenizerJson = R"({
-        "chat_template": "{% if enable_thinking %}Thinking is on{% else %}Thinking is off{% endif %}"
-    })";
-    ASSERT_EQ(CreateTokenizerConfig(tokenizerJson), true);
+    std::string templateJinja = R"({% if enable_thinking %}Thinking is on{% else %}Thinking is off{% endif %})";
+
+    ASSERT_EQ(CreateJinjaConfig(templateJinja), true);
     LoadTemplateProcessor();
     std::string finalPrompt = "";
     std::string payloadBody;
@@ -514,10 +323,8 @@ TEST_F(LLMChatTemplateTest, ChatTemplateKwargsPositive) {
 }
 
 TEST_F(LLMChatTemplateTest, ChatTemplateKwargsNegative) {
-    std::string tokenizerJson = R"({
-        "chat_template": "{% if enable_thinking %}Thinking is on{% else %}Thinking is off{% endif %}"
-    })";
-    ASSERT_EQ(CreateTokenizerConfig(tokenizerJson), true);
+    std::string templateJinja = R"({% if enable_thinking %}Thinking is on{% else %}Thinking is off{% endif %})";
+    ASSERT_EQ(CreateJinjaConfig(templateJinja), true);
     LoadTemplateProcessor();
     // chat_template_kwargs must be an object
     // This is a negative test case to ensure that the template processor correctly handles invalid chat_template
@@ -678,10 +485,6 @@ std::unique_ptr LLMChatTemplateHttpTest::t;
 
 std::string fullResponse;
 
-// static void ConcatenateResponse(const std::string& partial) {
-//     fullResponse += partial;
-// }
-
 class LLMJinjaChatTemplateHttpTest : public LLMChatTemplateHttpTest {
 public:
     static std::unique_ptr t;
@@ -752,12 +555,6 @@ TEST_F(LLMJinjaChatTemplateHttpTest, inferChatCompletionsUnary) {
     ASSERT_EQ(
         handler->dispatchToProcessor(endpointChatCompletions, requestBody, &response, comp, responseComponents, writer, multiPartParser),
         ovms::StatusCode::OK);
-    // Assertion split in two parts to avoid timestamp mismatch
-    // const size_t timestampLength = 10;
-    std::string expectedResponsePart1 = R"({"choices":[{"finish_reason":"stop","index":0,"logprobs":null,"message":{"content":"\nOpenVINO is","role":"assistant"}}],"created":)";
-    std::string expectedResponsePart2 = R"(,"model":"lm_cb_regular","object":"chat.completion"})";
-    // TODO: New output ASSERT_EQ(response.compare(0, expectedResponsePart1.length(), expectedResponsePart1), 0);
-    // TODO: New output ASSERT_EQ(response.compare(expectedResponsePart1.length() + timestampLength, expectedResponsePart2.length(), expectedResponsePart2), 0);
 }
 
 TEST_F(LLMJinjaChatTemplateHttpTest, inferCompletionsUnary) {
@@ -774,12 +571,6 @@ TEST_F(LLMJinjaChatTemplateHttpTest, inferCompletionsUnary) {
     ASSERT_EQ(
         handler->dispatchToProcessor(endpointCompletions, requestBody, &response, comp, responseComponents, writer, multiPartParser),
         ovms::StatusCode::OK);
-    // Assertion split in two parts to avoid timestamp mismatch
-    // const size_t timestampLength = 10;
-    std::string expectedResponsePart1 = R"({"choices":[{"finish_reason":"stop","index":0,"logprobs":null,"text":"\n\nThe first thing"}],"created":)";
-    std::string expectedResponsePart2 = R"(,"model":"lm_cb_regular","object":"text_completion"})";
-    // TODO: New output ASSERT_EQ(response.compare(0, expectedResponsePart1.length(), expectedResponsePart1), 0);
-    // TODO: New output ASSERT_EQ(response.compare(expectedResponsePart1.length() + timestampLength, expectedResponsePart2.length(), expectedResponsePart2), 0);
 }
 
 TEST_F(LLMJinjaChatTemplateHttpTest, inferChatCompletionsStream) {
@@ -793,33 +584,11 @@ TEST_F(LLMJinjaChatTemplateHttpTest, inferChatCompletionsStream) {
        }
    )";
 
-    // TODO: New output EXPECT_CALL(writer, PartialReplyEnd()).Times(1);
-    /* TODO: New output EXPECT_CALL(writer, PartialReply(::testing::_))
-        .WillRepeatedly([](std::string response) {
-            rapidjson::Document responseJson;
-            const int dataHeaderSize = 6;
-            std::string jsonResponse = response.substr(dataHeaderSize);
-            rapidjson::ParseResult ok = responseJson.Parse(jsonResponse.c_str());
-            if (response.find("[DONE]") == std::string::npos) {
-                ASSERT_EQ(ok.Code(), 0);
-                auto m = responseJson.FindMember("choices");
-                ASSERT_NE(m, responseJson.MemberEnd());
-                auto& choices = m->value.GetArray()[0];
-                auto modelOutput = choices.GetObject()["text"].GetString();
-                ConcatenateResponse(modelOutput);
-            }
-        });
-    */
-    // TODO: New output EXPECT_CALL(writer, WriteResponseString(::testing::_)).Times(0);
-    // TODO: New output EXPECT_CALL(writer, IsDisconnected()).Times(7);
-
     ASSERT_EQ(
         handler->dispatchToProcessor(endpointCompletions, requestBody, &response, comp, responseComponents, writer, multiPartParser),
         ovms::StatusCode::PARTIAL_END);
     ASSERT_EQ(response, "");
-
-    // TODO: New output ASSERT_EQ(fullResponse, "\n\nThe first thing ");
 }
 
 TEST_F(LLMJinjaChatTemplateHttpTest, inferCompletionsStream) {
@@ -833,33 +602,11 @@ TEST_F(LLMJinjaChatTemplateHttpTest, inferCompletionsStream) {
        }
    )";
 
-    // TODO: New output EXPECT_CALL(writer, PartialReplyEnd()).Times(1);
-    /* TODO: New output EXPECT_CALL(writer, PartialReply(::testing::_))
-        .WillRepeatedly([](std::string response) {
-            rapidjson::Document responseJson;
-            const int dataHeaderSize = 6;
-            std::string jsonResponse = response.substr(dataHeaderSize);
-            rapidjson::ParseResult ok = responseJson.Parse(jsonResponse.c_str());
-            if (response.find("[DONE]") == std::string::npos) {
-                ASSERT_EQ(ok.Code(), 0);
-                auto m = responseJson.FindMember("choices");
-                ASSERT_NE(m, responseJson.MemberEnd());
-                auto& choices = m->value.GetArray()[0];
-                auto modelOutput = choices.GetObject()["text"].GetString();
-                ConcatenateResponse(modelOutput);
-            }
-        });
-    */
-    // TODO: New output EXPECT_CALL(writer, WriteResponseString(::testing::_)).Times(0);
-    // TODO: New output EXPECT_CALL(writer, IsDisconnected()).Times(7);
-
     ASSERT_EQ(
         handler->dispatchToProcessor(endpointCompletions, requestBody, &response, comp, responseComponents, writer, multiPartParser),
         ovms::StatusCode::PARTIAL_END);
     ASSERT_EQ(response, "");
-
-    // ASSERT_EQ(fullResponse, "\n\nThe first thing ");
 }
 
 TEST_F(LLMJinjaChatTemplateHttpTest, inferDefaultChatCompletionsUnary) {
@@ -882,10 +629,4 @@ TEST_F(LLMJinjaChatTemplateHttpTest, inferDefaultChatCompletionsUnary) {
     ASSERT_EQ(
         handler->dispatchToProcessor(endpointChatCompletions, requestBody, &response, comp, responseComponents, writer, multiPartParser),
         ovms::StatusCode::OK);
-    // Assertion split in two parts to avoid timestamp mismatch
-    // const size_t timestampLength = 10;
-    std::string expectedResponsePart1 = R"({"choices":[{"finish_reason":"stop","index":0,"logprobs":null,"message":{"content":"\nOpenVINO is","role":"assistant"}}],"created":)";
-    std::string expectedResponsePart2 = R"(,"model":"lm_cb_regular","object":"chat.completion"})";
-    // TODO: New output ASSERT_EQ(response.compare(0, expectedResponsePart1.length(), expectedResponsePart1), 0);
-    // TODO: New output ASSERT_EQ(response.compare(expectedResponsePart1.length() + timestampLength, expectedResponsePart2.length(), expectedResponsePart2), 0);
 }
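
Reviewer note, not part of the patch: the replacement DEFAULT_CHAT_TEMPLATE introduced in the servable_initializer.cpp hunk can be sanity-checked outside the server. The sketch below is an approximation only; it renders the template with the standalone jinja2 package rather than the server's embedded template processor, and the eos_token value is a placeholder that would normally come from the model's tokenizer configuration.

# Illustrative sketch (assumes plain jinja2 rendering semantics; not part of the patch).
from jinja2 import Template

# Template body copied from the "+" line of the servable_initializer.cpp hunk above.
DEFAULT_CHAT_TEMPLATE = (
    "{% for message in messages %}"
    "{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}"
    "{% elif message['role'] == 'system' %}{{ '<|system|>\\n' + message['content'] + eos_token }}"
    "{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}"
    "{% endif %}{% endfor %}"
)

messages = [{"role": "user", "content": "How can I help you?"}]
# "</s>" is a placeholder eos_token chosen for illustration.
print(Template(DEFAULT_CHAT_TEMPLATE).render(messages=messages, eos_token="</s>"))
# Prints: User: How can I help you?

For a single user message this yields "User: How can I help you?", which matches the updated expectations in the ChatTemplateDefault and ChatTemplateComplexMessage tests above.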