Commit 5410711

Fix rebase to 1cdb0b6
Signed-off-by: John Calderon <jcalderon@nvidia.com>
1 parent f011861 commit 5410711

3 files changed: 14 additions & 23 deletions


tensorrt_llm/_torch/models/modeling_qwen2vl.py

Lines changed: 3 additions & 4 deletions
@@ -322,9 +322,8 @@ def get_dummy_prompt(self, input_seq_len: int):
             # reduce img resolution
             img_max_dim = img_max_dim >> 1

-            image = self.get_dummy_image(
-                max_width=img_max_dim, max_height=img_max_dim
-            ) #w, h is sqrt of min_pixels value (3136)
+            image = self.get_dummy_image(max_width=img_max_dim,
+                                         max_height=img_max_dim)

             test_mm_prompt = default_multimodal_input_loader(
                 tokenizer=self.tokenizer,
@@ -342,7 +341,7 @@ def get_dummy_prompt(self, input_seq_len: int):
         # on how many tokens we need to complete the input_seq_len, the output of
         # default_multimodal_input_loader may give more tokens then the input_seq_len and this
         # can lead to errors.
-        # That is why we try to clipped the variable text_token_left to a lower threshold
+        # That is why we try to clip the variable text_token_left to a lower threshold
         # but close enough to the actual input_seq_len
         text_generation_perc_threshold = 0.95
         text_token_left = int((input_seq_len - len_prompt_tokens_ids) *
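
As an aside on the comment rewritten in the second hunk, here is a minimal, self-contained sketch of the clipping it describes; the variable names and the 0.95 threshold come from the diff, while the concrete token counts are made-up assumptions for illustration:

# Sketch of the clipping described in the comment above; the numeric values
# are illustrative assumptions, not taken from the repository.
input_seq_len = 8192              # requested total sequence length
len_prompt_tokens_ids = 1200      # tokens already consumed by the dummy image prompt
text_generation_perc_threshold = 0.95

# Clip the remaining text budget slightly below the exact remainder so that any
# extra tokens produced by default_multimodal_input_loader do not push the
# final prompt past input_seq_len.
text_token_left = int((input_seq_len - len_prompt_tokens_ids) *
                      text_generation_perc_threshold)
assert text_token_left == 6642    # the exact remainder would be 6992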

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 11 additions & 18 deletions
@@ -61,24 +61,17 @@ def get_kv_cache_manager_cls(model_config: ModelConfig):
 class KvCacheCreator:
     """Groups together logic related to KV cache construction."""

-    def __init__(
-        self,
-        *,
-        model_engine: PyTorchModelEngine,
-        draft_model_engine: Optional[PyTorchModelEngine],
-        mapping: Mapping,
-        net_max_seq_len: int,
-        kv_connector_manager: Optional[KvCacheConnectorManager],
-        max_num_tokens: int,
-        max_beam_width: int,
-        tokens_per_block: int,
-        max_seq_len: int,
-        max_batch_size: int,
-        kv_cache_config: KvCacheConfig,
-        pytorch_backend_config: PyTorchConfig,
-        speculative_config: SpeculativeConfig,
-        sparse_attention_config: SparseAttentionConfig,
-    ):
+    def __init__(self, *, model_engine: PyTorchModelEngine,
+                 draft_model_engine: Optional[PyTorchModelEngine],
+                 mapping: Mapping, net_max_seq_len: int,
+                 kv_connector_manager: Optional[KvCacheConnectorManager],
+                 max_num_tokens: int, max_beam_width: int,
+                 tokens_per_block: int, max_seq_len: int, max_batch_size: int,
+                 kv_cache_config: KvCacheConfig,
+                 pytorch_backend_config: PyTorchConfig,
+                 speculative_config: SpeculativeConfig,
+                 sparse_attention_config: SparseAttentionConfig,
+                 profiling_stage_data: Optional[dict]):
         self._model_engine = model_engine
         self._draft_model_engine = draft_model_engine
         self._mapping = mapping
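
Because the reformatted signature keeps the bare *, every argument remains keyword-only; a hypothetical call site (only the parameter names come from the signature above, all values below are placeholders rather than anything taken from this commit) would now also pass the new profiling_stage_data argument:

# Hypothetical, illustrative call site; model_engine, mapping, and the other
# placeholder objects are assumed to already exist in the caller's scope.
creator = KvCacheCreator(
    model_engine=model_engine,
    draft_model_engine=None,
    mapping=mapping,
    net_max_seq_len=net_max_seq_len,
    kv_connector_manager=None,
    max_num_tokens=max_num_tokens,
    max_beam_width=1,
    tokens_per_block=32,
    max_seq_len=max_seq_len,
    max_batch_size=max_batch_size,
    kv_cache_config=kv_cache_config,
    pytorch_backend_config=pytorch_backend_config,
    speculative_config=speculative_config,
    sparse_attention_config=sparse_attention_config,
    profiling_stage_data=None,  # the Optional[dict] parameter added in this commit
)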

tests/unittest/llmapi/test_memory_profiling.py

Lines changed: 0 additions & 1 deletion
@@ -52,7 +52,6 @@ def test_profile_kvcache():
         "postprocess_tokenizer_dir": VLM_MODEL,
         "reasoning_parser": None,
         "fail_fast_on_attention_window_too_large": False,
-        "enable_chunked_prefill": True,
         "cuda_graph_config": cuda_graph_config,
     }
