Commit 5410711

Fix rebase to 1cdb0b6
Signed-off-by: John Calderon <jcalderon@nvidia.com>
1 parent f011861 commit 5410711

3 files changed: 14 additions & 23 deletions


tensorrt_llm/_torch/models/modeling_qwen2vl.py

Lines changed: 3 additions & 4 deletions
@@ -322,9 +322,8 @@ def get_dummy_prompt(self, input_seq_len: int):
             # reduce img resolution
             img_max_dim = img_max_dim >> 1

-            image = self.get_dummy_image(
-                max_width=img_max_dim, max_height=img_max_dim
-            ) #w, h is sqrt of min_pixels value (3136)
+            image = self.get_dummy_image(max_width=img_max_dim,
+                                         max_height=img_max_dim)

             test_mm_prompt = default_multimodal_input_loader(
                 tokenizer=self.tokenizer,
@@ -342,7 +341,7 @@ def get_dummy_prompt(self, input_seq_len: int):
         # on how many tokens we need to complete the input_seq_len, the output of
         # default_multimodal_input_loader may give more tokens then the input_seq_len and this
         # can lead to errors.
-        # That is why we try to clipped the variable text_token_left to a lower threshold
+        # That is why we try to clip the variable text_token_left to a lower threshold
         # but close enough to the actual input_seq_len
         text_generation_perc_threshold = 0.95
         text_token_left = int((input_seq_len - len_prompt_tokens_ids) *
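
As an aside on the comment rewritten in the second hunk, here is a minimal, self-contained sketch of the clipping it describes; the variable names and the 0.95 threshold come from the diff, while the concrete token counts are made-up assumptions for illustration:

# Sketch of the clipping described in the comment above; the numeric values
# are illustrative assumptions, not taken from the repository.
input_seq_len = 8192              # requested total sequence length
len_prompt_tokens_ids = 1200      # tokens already consumed by the dummy image prompt
text_generation_perc_threshold = 0.95

# Clip the remaining text budget slightly below the exact remainder so that any
# extra tokens produced by default_multimodal_input_loader do not push the
# final prompt past input_seq_len.
text_token_left = int((input_seq_len - len_prompt_tokens_ids) *
                      text_generation_perc_threshold)
assert text_token_left == 6642    # the exact remainder would be 6992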

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 11 additions & 18 deletions
@@ -61,24 +61,17 @@ def get_kv_cache_manager_cls(model_config: ModelConfig):
 class KvCacheCreator:
     """Groups together logic related to KV cache construction."""

-    def __init__(
-        self,
-        *,
-        model_engine: PyTorchModelEngine,
-        draft_model_engine: Optional[PyTorchModelEngine],
-        mapping: Mapping,
-        net_max_seq_len: int,
-        kv_connector_manager: Optional[KvCacheConnectorManager],
-        max_num_tokens: int,
-        max_beam_width: int,
-        tokens_per_block: int,
-        max_seq_len: int,
-        max_batch_size: int,
-        kv_cache_config: KvCacheConfig,
-        pytorch_backend_config: PyTorchConfig,
-        speculative_config: SpeculativeConfig,
-        sparse_attention_config: SparseAttentionConfig,
-    ):
+    def __init__(self, *, model_engine: PyTorchModelEngine,
+                 draft_model_engine: Optional[PyTorchModelEngine],
+                 mapping: Mapping, net_max_seq_len: int,
+                 kv_connector_manager: Optional[KvCacheConnectorManager],
+                 max_num_tokens: int, max_beam_width: int,
+                 tokens_per_block: int, max_seq_len: int, max_batch_size: int,
+                 kv_cache_config: KvCacheConfig,
+                 pytorch_backend_config: PyTorchConfig,
+                 speculative_config: SpeculativeConfig,
+                 sparse_attention_config: SparseAttentionConfig,
+                 profiling_stage_data: Optional[dict]):
         self._model_engine = model_engine
         self._draft_model_engine = draft_model_engine
         self._mapping = mapping
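
Because the reformatted signature keeps the bare *, every argument remains keyword-only; a hypothetical call site (only the parameter names come from the signature above, all values below are placeholders rather than anything taken from this commit) would now also pass the new profiling_stage_data argument:

# Hypothetical, illustrative call site; model_engine, mapping, and the other
# placeholder objects are assumed to already exist in the caller's scope.
creator = KvCacheCreator(
    model_engine=model_engine,
    draft_model_engine=None,
    mapping=mapping,
    net_max_seq_len=net_max_seq_len,
    kv_connector_manager=None,
    max_num_tokens=max_num_tokens,
    max_beam_width=1,
    tokens_per_block=32,
    max_seq_len=max_seq_len,
    max_batch_size=max_batch_size,
    kv_cache_config=kv_cache_config,
    pytorch_backend_config=pytorch_backend_config,
    speculative_config=speculative_config,
    sparse_attention_config=sparse_attention_config,
    profiling_stage_data=None,  # the Optional[dict] parameter added in this commit
)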

tests/unittest/llmapi/test_memory_profiling.py

Lines changed: 0 additions & 1 deletion
@@ -52,7 +52,6 @@ def test_profile_kvcache():
         "postprocess_tokenizer_dir": VLM_MODEL,
         "reasoning_parser": None,
         "fail_fast_on_attention_window_too_large": False,
-        "enable_chunked_prefill": True,
         "cuda_graph_config": cuda_graph_config,
     }
