[Bugfix] Improve GLM4 MoE Reasoning Parser's is_reasoning_end Condition (vllm-project#25355)

frankwang28 · gemini-code-assist[bot] · chaunceyjiang · lywa1998 · commit c2774c4a232e · 2025-10-20T14:19:33.000+08:00
Signed-off-by: frankwang28 &lt;frank.wbb@hotmail.com&gt;
Signed-off-by: Frank Wang &lt;41319051+frankwang28@users.noreply.github.com&gt;
Co-authored-by: gemini-code-assist[bot] &lt;176961590+gemini-code-assist[bot]@users.noreply.github.com&gt;
Co-authored-by: Chauncey &lt;chaunceyjiang@gmail.com&gt;
diff --git a/tests/reasoning/test_glm4_moe_reasoning_parser.py b/tests/reasoning/test_glm4_moe_reasoning_parser.py
@@ -0,0 +1,203 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import pytest
+from transformers import AutoTokenizer
+
+from tests.reasoning.utils import run_reasoning_extraction
+from vllm.reasoning import ReasoningParser, ReasoningParserManager
+
+parser_name = "glm45"
+start_token = "<think>"
+end_token = "</think>"
+
+REASONING_MODEL_NAME = "zai-org/GLM-4.5"
+
+
+@pytest.fixture(scope="module")
+def glm45_tokenizer():
+    return AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
+
+
+WITH_THINK = {
+    "output": "<think>This is a reasoning section</think>This is the rest",
+    "reasoning_content": "This is a reasoning section",
+    "content": "This is the rest",
+    "is_reasoning_end": True,
+}
+
+WITH_THINK_STREAM = {
+    "output": "<think>This is a reasoning section</think>This is the rest",
+    "reasoning_content": "This is a reasoning section",
+    "content": "This is the rest",
+    "is_reasoning_end": True,
+}
+
+WITHOUT_THINK = {
+    "output": "This is the rest",
+    "reasoning_content": None,
+    "content": "This is the rest",
+    "is_reasoning_end": False,
+}
+
+WITHOUT_THINK_STREAM = {
+    "output": "This is the rest",
+    "reasoning_content": None,
+    "content": "This is the rest",
+    "is_reasoning_end": False,
+}
+
+COMPLETE_REASONING = {
+    "output": "<think>This is a reasoning section</think>",
+    "reasoning_content": "This is a reasoning section",
+    "content": None,
+    "is_reasoning_end": True,
+}
+MULTILINE_REASONING = {
+    "output":
+    "<think>This is a reasoning\nsection</think>This is the rest\nThat",
+    "reasoning_content": "This is a reasoning\nsection",
+    "content": "This is the rest\nThat",
+    "is_reasoning_end": True,
+}
+ONLY_OPEN_TAG = {
+    "output": "<think>This is a reasoning section",
+    "reasoning_content": None,
+    "content": "<think>This is a reasoning section",
+    "is_reasoning_end": False,
+}
+
+ONLY_OPEN_TAG_STREAM = {
+    "output": "<think>This is a reasoning section",
+    "reasoning_content": "This is a reasoning section",
+    "content": None,
+    "is_reasoning_end": False,
+}
+
+TEST_CASES = [
+    pytest.param(
+        False,
+        WITH_THINK,
+        id="with_think",
+    ),
+    pytest.param(
+        True,
+        WITH_THINK_STREAM,
+        id="with_think_stream",
+    ),
+    pytest.param(
+        False,
+        WITHOUT_THINK,
+        id="without_think",
+    ),
+    pytest.param(
+        True,
+        WITHOUT_THINK_STREAM,
+        id="without_think_stream",
+    ),
+    pytest.param(
+        False,
+        COMPLETE_REASONING,
+        id="complete_reasoning",
+    ),
+    pytest.param(
+        True,
+        COMPLETE_REASONING,
+        id="complete_reasoning_stream",
+    ),
+    pytest.param(
+        False,
+        MULTILINE_REASONING,
+        id="multiline_reasoning",
+    ),
+    pytest.param(
+        True,
+        MULTILINE_REASONING,
+        id="multiline_reasoning_stream",
+    ),
+    pytest.param(
+        False,
+        ONLY_OPEN_TAG,
+        id="only_open_tag",
+    ),
+    pytest.param(
+        True,
+        ONLY_OPEN_TAG_STREAM,
+        id="only_open_tag_stream",
+    ),
+]
+
+STILL_REASONING_PROMPT = """[gMASK]<sop><|system|>
+You are a helpful assistant.<|user|>
+What is the capital of France?<|assistant|>
+<think>The user is asking for the capital of"""
+
+DONE_REASONING_PROMPT = """[gMASK]<sop><|system|>
+You are a helpful assistant.<|user|>
+What is the capital of France?<|assistant|>
+<think>The user is asking for the capital of France.</think>
+The capital of France is Paris."""
+
+MULTI_TURN_STILL_REASONING_PROMPT = """[gMASK]<sop><|system|>
+You are a helpful assistant.<|user|>
+What is the capital of France?<|assistant|>
+<think></think>
+The capital of France is Paris.<|user|>
+What about Chile?<|assistant|>
+<think>The user is asking for the capital of"""
+
+MULTI_TURN_DONE_REASONING_PROMPT = """[gMASK]<sop><|system|>
+You are a helpful assistant.<|user|>
+What is the capital of France?<|assistant|>
+<think></think>
+The capital of France is Paris.<|user|>
+What about Chile?<|assistant|>
+<think>The user is asking for the capital of Chile.</think>
+The capital of Chile is Santiago."""
+
+REASONING_END_TEST_CASES = [
+    pytest.param(STILL_REASONING_PROMPT, False, id="still_reasoning"),
+    pytest.param(DONE_REASONING_PROMPT, True, id="done_reasoning"),
+    pytest.param(MULTI_TURN_STILL_REASONING_PROMPT,
+                 False,
+                 id="multi_turn_still_reasoning"),
+    pytest.param(MULTI_TURN_DONE_REASONING_PROMPT,
+                 True,
+                 id="multi_turn_done_reasoning")
+]
+
+
+@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
+def test_reasoning(
+    streaming: bool,
+    param_dict: dict,
+    glm45_tokenizer,
+):
+    output = glm45_tokenizer.tokenize(param_dict["output"])
+    output_tokens: list[str] = [
+        glm45_tokenizer.convert_tokens_to_string([token]) for token in output
+    ]
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
+        parser_name)(glm45_tokenizer)
+
+    reasoning, content = run_reasoning_extraction(parser,
+                                                  output_tokens,
+                                                  streaming=streaming)
+
+    assert reasoning == param_dict["reasoning_content"]
+    assert content == param_dict["content"]
+
+    output_ids = glm45_tokenizer.convert_tokens_to_ids(output)
+    is_reasoning_end = parser.is_reasoning_end(output_ids)
+    assert is_reasoning_end == param_dict["is_reasoning_end"]
+
+
+@pytest.mark.parametrize("prompt, is_reasoning_end", REASONING_END_TEST_CASES)
+def test_is_reasoning_end_full_prompt(prompt: str, is_reasoning_end: bool,
+                                      glm45_tokenizer):
+    parser: ReasoningParser = ReasoningParserManager.get_reasoning_parser(
+        parser_name)(glm45_tokenizer)
+    tokens = glm45_tokenizer.tokenize(prompt)
+    token_ids = glm45_tokenizer.convert_tokens_to_ids(tokens)
+    check_is_reasoning_end = parser.is_reasoning_end(token_ids)
+    assert check_is_reasoning_end == is_reasoning_end
diff --git a/vllm/reasoning/glm4_moe_reasoning_parser.py b/vllm/reasoning/glm4_moe_reasoning_parser.py
@@ -30,6 +30,7 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
         super().__init__(tokenizer, *args, **kwargs)
         self.think_start_token = "<think>"
         self.think_end_token = "</think>"
+        self.assistant_token = "<|assistant|>"
 
         if not self.model_tokenizer:
             raise ValueError(
@@ -38,14 +39,26 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
 
         self.think_start_token_id = self.vocab.get(self.think_start_token)
         self.think_end_token_id = self.vocab.get(self.think_end_token)
+        self.assistant_token_id = self.vocab.get(self.assistant_token)
         if (self.think_start_token_id is None
-                or self.think_end_token_id is None):
+                or self.think_end_token_id is None
+                or self.assistant_token_id is None):
             raise RuntimeError(
                 "Glm4MoeModel reasoning parser could not locate "
-                "think start/end tokens in the tokenizer!")
+                "think start/end or assistant tokens in the tokenizer!")
 
     def is_reasoning_end(self, input_ids: list[int]) -> bool:
-        return self.think_end_token_id in input_ids
+        """
+        GLM's chat template has <think></think> tokens after every
+        <|assistant|> token. Thus, we need to check if </think> is
+        after the most recent <|assistant|> token (if present).
+        """
+        for token_id in input_ids[::-1]:
+            if token_id == self.think_end_token_id:
+                return True
+            elif token_id == self.assistant_token_id:
+                return False
+        return False
 
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
         """