
Commit 9cc9de7

Fix max_seq_len being less than max_num_tokens during profiling
Signed-off-by: John Calderon <jcalderon@nvidia.com>
1 parent: 91da825

2 files changed: 15 additions, 7 deletions

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 6 additions & 3 deletions
@@ -65,13 +65,13 @@ def __init__(self, *, model_engine: PyTorchModelEngine,
         self._max_kv_tokens_in = self._kv_cache_config.max_tokens
         self._max_num_tokens = max_num_tokens
         self._max_beam_width = max_beam_width
+        self._max_seq_len = max_seq_len
         self._dummy_reqs = self._create_dummy_context_requests(net_max_seq_len -
                                                                1)
         self._kv_connector_manager = kv_connector_manager
         self._pytorch_backend_config = pytorch_backend_config
         self._speculative_config = speculative_config
         self._tokens_per_block = tokens_per_block
-        self._max_seq_len = max_seq_len
         self._max_batch_size = max_batch_size
         self._profiling_stage_data = profiling_stage_data
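The first hunk is an ordering fix: self._max_seq_len is now assigned before _create_dummy_context_requests runs, presumably because the dummy-request path (see the second hunk) reads and may update that attribute. A minimal sketch of why that ordering matters, using hypothetical names rather than the real estimator class:

```python
class Estimator:
    """Hypothetical stand-in for the profiling estimator in _util.py."""

    def __init__(self, max_seq_len: int, net_max_seq_len: int):
        # Assign the attribute *before* calling the helper; if the assignment
        # came after the call (as in the pre-fix code), the helper would hit
        # an AttributeError or use a stale value.
        self._max_seq_len = max_seq_len
        self._dummy_reqs = self._create_dummy_requests(net_max_seq_len - 1)

    def _create_dummy_requests(self, input_seq_len: int) -> list[int]:
        # The real helper sizes dummy context requests; here we just clamp
        # the request length to self._max_seq_len for illustration.
        return [min(input_seq_len, self._max_seq_len)]


est = Estimator(max_seq_len=4096, net_max_seq_len=8192)
print(est._dummy_reqs)  # [4096] -- the dummy request is clamped to max_seq_len
```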

@@ -175,8 +175,11 @@ def _create_dummy_mm_context_request(
         max_num_tokens = len(prompt_token_ids)
         remaining_tokens = max(max_num_tokens, input_seq_len)
         if remaining_tokens > input_seq_len:
-            logger.warning(f"Profiling with multimedia prompt which contains more tokens than the allowed input_seq_len." \
-                           f"Multimedia prompt has {remaining_tokens} while the input_seq_len is: {input_seq_len}")
+            logger.warning(f"Profiling with multimedia prompt which contains more tokens than the allowed input_seq_len. " \
+                           f"Multimodal prompt has {remaining_tokens} while the input_seq_len is: {input_seq_len}")
+            ## add + 1 to avoid error: RuntimeError: The max KV cache length of input sequences (X + 1) exceeds the KV cache manager's maximum supported length X.
+            ## at "/code/tensorrt_llm/tensorrt_llm/_torch/attention_backend/trtllm.py", line 837
+            self._max_seq_len = remaining_tokens + 1
         while remaining_tokens > 0:
             req_mm_input = trtllm.MultimodalInput(
                 multimodal_hashes=multimodal_input.multimodal_hashes,
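The second hunk handles multimodal profiling prompts that are longer than input_seq_len: without widening self._max_seq_len, the KV cache manager later rejects the dummy sequence with the RuntimeError quoted in the committed comment. A hedged, self-contained sketch of the adjustment logic (the function name and parameters below are illustrative, not the real signature):

```python
def widen_max_seq_len(prompt_token_ids: list[int], input_seq_len: int,
                      max_seq_len: int) -> int:
    # Sketch of the guard added in _create_dummy_mm_context_request: if the
    # multimodal profiling prompt outgrows input_seq_len, widen max_seq_len
    # to prompt length + 1 so the KV cache manager accepts the dummy sequence.
    remaining_tokens = max(len(prompt_token_ids), input_seq_len)
    if remaining_tokens > input_seq_len:
        return remaining_tokens + 1
    return max_seq_len


# A 9000-token multimodal prompt with input_seq_len=8192 widens the limit to 9001.
print(widen_max_seq_len(list(range(9000)), 8192, 8192))
```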

tests/unittest/llmapi/test_memory_profiling.py

Lines changed: 9 additions & 4 deletions
@@ -8,6 +8,10 @@
 from tensorrt_llm.llmapi.llm_args import (CudaGraphConfig, KvCacheConfig,
                                           TorchLlmArgs)
 
+# isort: off
+from .test_llm import get_model_path
+# isort: on
+
 pytestmark = pytest.mark.threadleak(enabled=False)
 
 
@@ -16,10 +20,11 @@ def test_profile_kvcache():
                                      free_gpu_memory_fraction=0.9)
     cuda_graph_config = CudaGraphConfig(max_batch_size=512)
 
-    VLM_MODEL = "Qwen/Qwen2.5-VL-3B-Instruct"
-    VLM_MODEL_PATH = "/workspace/.cache/huggingface/hub/models--Qwen--Qwen2.5-VL-3B-Instruct/snapshots/66285546d2b821cf421d4f5eb2576359d3770cd3"
-    LLM_MODEL = "Qwen/Qwen2.5-3B-Instruct"
-    LLM_MODEL_PATH = "/workspace/.cache/huggingface/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/aa8e72537993ba99e69dfaafa59ed015b17504d1"
+    VLM_MODEL = "Qwen2.5-VL-7B-Instruct"
+    VLM_MODEL_PATH = get_model_path(VLM_MODEL)
+    LLM_MODEL = "Qwen2.5-7B-Instruct"
+    LLM_MODEL_PATH = get_model_path(LLM_MODEL)
+
     build_config = BuildConfig(max_batch_size=2048,
                                max_num_tokens=8192,
                                max_beam_width=1,
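The test now resolves model directories through get_model_path from test_llm instead of hard-coding Hugging Face snapshot paths, so it no longer depends on one machine's cache layout. The real helper lives in tests/unittest/llmapi/test_llm.py; the sketch below is only an assumption of what such a resolver typically does (the LLM_MODELS_ROOT variable and the fallback behavior are illustrative, not the actual implementation):

```python
import os


def get_model_path(model_name: str) -> str:
    # Hypothetical resolver: look for the model under a shared local root
    # (an assumed LLM_MODELS_ROOT environment variable) and fall back to the
    # bare name so Hugging Face Hub resolution still works.
    root = os.environ.get("LLM_MODELS_ROOT", "")
    candidate = os.path.join(root, model_name)
    return candidate if root and os.path.isdir(candidate) else model_name


print(get_model_path("Qwen2.5-VL-7B-Instruct"))
```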
