
Commit fa24744

change logic to get initial input_seq_len
Signed-off-by: John Calderon <jcalderon@nvidia.com>
1 parent de73b55 commit fa24744


2 files changed: +9 -12 lines changed


tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 8 additions & 9 deletions
@@ -160,13 +160,10 @@ def _create_dummy_mm_context_request(
 
         max_num_tokens = len(prompt_token_ids)
         assert max_num_tokens > 0, "the length of the prompt of the dummy mm req is less than or equal to 0"
-        remaining_tokens = max(max_num_tokens, input_seq_len)
+        remaining_tokens = min(max_num_tokens, input_seq_len)
         if remaining_tokens > input_seq_len:
             logger.warning(f"Profiling with multimedia prompt which contains more tokens than the allowed input_seq_len. " \
                 f"Multimodal prompt has {remaining_tokens} while the input_seq_len is: {input_seq_len}")
-        ## add + 1 to avoid error: RuntimeError: The max KV cache length of input sequences (X + 1) exceeds the KV cache manager's maximum supported length X.
-        ## at line "/code/tensorrt_llm/tensorrt_llm/_torch/attention_backend/trtllm.py", line 837
-        self._max_seq_len = remaining_tokens + 1
         while remaining_tokens > 0:
             req_mm_input = trtllm.MultimodalInput(
                 multimodal_hashes=multimodal_input.multimodal_hashes,
@@ -181,6 +178,9 @@ def _create_dummy_mm_context_request(
                 output_config=trtllm.OutputConfig(),
                 end_id=-1,
                 multimodal_input=req_mm_input)
+            # TODO:
+            # create_input_processor_with_hash shouldn't be required during profiling,
+            # but is temporarily needed due to the multimodal input dependency for chunked prefill
             request.py_multimodal_data = multimodal_data
             remaining_tokens -= max_num_tokens
             requests.append(request)
@@ -193,11 +193,10 @@ def _create_dummy_mm_context_request(
     def _create_dummy_context_requests(
             self, input_seq_len: int) -> List[trtllm.Request]:
         requests = []
-        if hasattr(
-                self._model_engine.model,
-                "original_arch") and MODEL_CLASS_VISION_ENCODER_MAPPING.get(
-                    self._model_engine.model.original_arch, None
-                ) and self._model_engine.attn_runtime_features.chunked_prefill:
+        if hasattr(self._model_engine.model,
+                   "original_arch") and MODEL_CLASS_VISION_ENCODER_MAPPING.get(
+                       self._model_engine.model.original_arch, None):
+            input_seq_len = min(self._max_num_tokens, input_seq_len)
             requests = self._create_dummy_mm_context_request(input_seq_len)
             # if succeed profiling with multimodal requests then return, otherwise profile
             # with default case
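
For context, a minimal self-contained sketch of the token arithmetic this diff changes. It is not the real _create_dummy_mm_context_request: the class, trtllm.Request construction, and logging are stubbed out, and plan_dummy_mm_requests is a made-up name used only for illustration.

# Sketch of the profiling loop touched by this commit; only the arithmetic
# around remaining_tokens is kept, everything else is stubbed out.

def plan_dummy_mm_requests(prompt_token_ids: list, input_seq_len: int) -> int:
    """Return how many dummy context requests the profiling loop would build."""
    max_num_tokens = len(prompt_token_ids)
    assert max_num_tokens > 0

    # Before this commit: max(max_num_tokens, input_seq_len), which could exceed
    # input_seq_len. After: clamp to the smaller of the two.
    remaining_tokens = min(max_num_tokens, input_seq_len)

    num_requests = 0
    while remaining_tokens > 0:
        num_requests += 1                  # one dummy request per iteration
        remaining_tokens -= max_num_tokens
    return num_requests


print(plan_dummy_mm_requests(list(range(512)), input_seq_len=8192))  # -> 1
print(plan_dummy_mm_requests(list(range(512)), input_seq_len=256))   # -> 1

With the min() clamp, the sketched loop finishes after a single iteration for any non-empty prompt, whereas the old max() could make remaining_tokens exceed input_seq_len, which appears to be what the removed `self._max_seq_len = remaining_tokens + 1` workaround was compensating for.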

tests/unittest/llmapi/test_memory_profiling.py

Lines changed: 1 addition & 3 deletions
@@ -22,9 +22,7 @@ def test_profile_kvcache():
     VLM_MODEL = "Qwen2.5-VL-7B-Instruct"
     VLM_MODEL_PATH = get_model_path(VLM_MODEL)
 
-    build_config = BuildConfig(max_batch_size=2048,
-                               max_beam_width=1,
-                               max_seq_len=8192)
+    build_config = BuildConfig(max_beam_width=1, max_num_tokens=16384)
     dynamic_batch_config = DynamicBatchConfig(
         enable_batch_size_tuning=True,
         enable_max_num_tokens_tuning=False,
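
As a quick cross-reference between the two files, here is a hedged sketch of how the new max_num_tokens=16384 test budget interacts with the clamp added in _create_dummy_context_requests. clamp_profiling_seq_len is an illustrative helper, not part of the TensorRT-LLM API; the 16384 value comes from the BuildConfig line above.

# Illustrative only: mirrors `input_seq_len = min(self._max_num_tokens, input_seq_len)`
# from the _util.py hunk, with the budget taken from the updated test config.

MAX_NUM_TOKENS = 16384  # BuildConfig(max_beam_width=1, max_num_tokens=16384)

def clamp_profiling_seq_len(requested_seq_len: int,
                            max_num_tokens: int = MAX_NUM_TOKENS) -> int:
    # The dummy multimodal request never asks for more tokens than the engine budget.
    return min(max_num_tokens, requested_seq_len)

assert clamp_profiling_seq_len(8192) == 8192      # already within budget
assert clamp_profiling_seq_len(65536) == 16384    # capped at max_num_tokens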
