
Commit f60cf91

address comments: change unit test, and add more asserts
Signed-off-by: John Calderon <jcalderon@nvidia.com>
1 parent a887420 commit f60cf91

3 files changed: +28 -26 lines changed

tensorrt_llm/_torch/models/modeling_qwen2vl.py

Lines changed: 7 additions & 5 deletions
@@ -1,8 +1,8 @@
 import copy
 import os
-import random
 from typing import Any, Dict, List, Optional, Tuple, Union
 
+import numpy as np
 import torch
 import torch.nn as nn
 from PIL import Image
@@ -229,10 +229,12 @@ def get_rope_index(
         return position_ids, mrope_position_deltas
 
     def get_dummy_text(self, input_seq_len: int):
-        return self.tokenizer.decode([
-            random.randint(0, self.model_config.vocab_size - 1)
-            for _ in range(input_seq_len)
-        ])
+        return self.tokenizer.decode(
+            np.random.randint(
+                low=0,
+                high=self.model_config.
+                vocab_size,  # Note: high is exclusive in NumPy
+                size=input_seq_len))
 
     def get_dummy_images(self, max_width: int, max_height: int,
                          num_images: int):
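
Note on the change above: it swaps Python's random.randint (inclusive upper bound) for np.random.randint (exclusive upper bound), which is why the `- 1` disappears and the list comprehension becomes a single vectorized call. A minimal sketch of the equivalence, using a made-up vocab_size in place of model_config.vocab_size:

import numpy as np

vocab_size = 32000  # hypothetical stand-in for model_config.vocab_size

# high is exclusive in NumPy, so passing vocab_size directly samples ids in
# [0, vocab_size - 1], matching the old random.randint(0, vocab_size - 1) loop.
dummy_token_ids = np.random.randint(low=0, high=vocab_size, size=16)
assert dummy_token_ids.min() >= 0 and dummy_token_ids.max() < vocab_size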

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 14 additions & 9 deletions
@@ -66,14 +66,14 @@ def __init__(self, *, model_engine: PyTorchModelEngine,
         self._max_num_tokens = max_num_tokens
         self._max_beam_width = max_beam_width
         self._max_seq_len = max_seq_len
+        self._profiling_stage_data = profiling_stage_data
         self._dummy_reqs = self._create_dummy_context_requests(net_max_seq_len -
                                                                1)
         self._kv_connector_manager = kv_connector_manager
         self._pytorch_backend_config = pytorch_backend_config
         self._speculative_config = speculative_config
         self._tokens_per_block = tokens_per_block
         self._max_batch_size = max_batch_size
-        self._profiling_stage_data = profiling_stage_data
 
     @staticmethod
     def _get_cache_size_per_token(model_config: ModelConfig,
@@ -152,16 +152,20 @@ def _cal_max_memory(self, peak_memory, total_gpu_memory, fraction,
     def _create_dummy_mm_context_request(
             self, input_seq_len: int) -> List[trtllm.Request]:
         requests = []
-        self._model_name_or_path = getattr(self._model_engine.model,
-                                           "name_or_path", None)
-        self._tokenizer = AutoTokenizer.from_pretrained(
-            self._model_name_or_path)
-        input_processor = create_input_processor(self._model_name_or_path,
-                                                 self._tokenizer)
+        if isinstance(
+                self._profiling_stage_data,
+                dict) and not self._profiling_stage_data.get("enable_mm_reqs"):
+            return requests
+
+        model_name_or_path = getattr(self._model_engine.model, "name_or_path",
+                                     None)
+        assert model_name_or_path is not None, "Could not determine model name or path"
+        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+        input_processor = create_input_processor(model_name_or_path, tokenizer)
         if not (hasattr(input_processor, "get_dummy_prompt")):
-            logger.warning("The input processor of the model does not have the method [get_prompt_for_profiling] implemented." \
+            logger.warning("The input processor of the model does not have the method [get_dummy_prompt] implemented." \
                 "Profiling with the default input dummy context request. This may not take into account the memory consumption of " \
-                "ViT's encoder")
+                "the image encoder")
             return requests
         text_prompt = input_processor.get_dummy_prompt(input_seq_len,
                                                        {'image': 1})
@@ -174,6 +178,7 @@ def _create_dummy_mm_context_request(
             multimodal_data = extra_processed_inputs.get('multimodal_data')
 
         max_num_tokens = len(prompt_token_ids)
+        assert max_num_tokens > 0, "the length of the prompt of the dummy mm req is less than or equal to 0"
         remaining_tokens = max(max_num_tokens, input_seq_len)
         if remaining_tokens > input_seq_len:
             logger.warning(f"Profiling with multimedia prompt which contains more tokens than the allowed input_seq_len. " \

tests/unittest/llmapi/test_memory_profiling.py

Lines changed: 7 additions & 12 deletions
@@ -22,13 +22,10 @@ def test_profile_kvcache():
 
 VLM_MODEL = "Qwen2.5-VL-7B-Instruct"
 VLM_MODEL_PATH = get_model_path(VLM_MODEL)
-LLM_MODEL = "Qwen2.5-7B-Instruct"
-LLM_MODEL_PATH = get_model_path(LLM_MODEL)
 
 build_config = BuildConfig(max_batch_size=2048,
-                           max_num_tokens=8192,
                            max_beam_width=1,
-                           max_seq_len=None)
+                           max_seq_len=8192)
 kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9, )
 
 dynamic_batch_config = DynamicBatchConfig(
@@ -66,21 +63,19 @@ def test_profile_kvcache():
 
     torchllm_args = TorchLlmArgs(**llm_args)
 
-    profiling_data = dict()
+    profiling_data = {"enable_mm_reqs": True}
    py_executor = create_py_executor(llm_args=torchllm_args,
                                     checkpoint_dir=VLM_MODEL_PATH,
                                     profiling_stage_data=profiling_data)
-    vlm_max_gpu_total_bytes = profiling_data["max_gpu_total_bytes"]
+    vlm_max_gpu_total_bytes_with_mm_reqs = profiling_data["max_gpu_total_bytes"]
    py_executor.shutdown()
    torch.cuda.empty_cache()
 
-    profiling_data = dict()
-    llm_args["model"] = LLM_MODEL
-    llm_args["postprocess_tokenizer_dir"] = LLM_MODEL
+    profiling_data = {"enable_mm_reqs": False}
    torchllm_args = TorchLlmArgs(**llm_args)
    create_py_executor(llm_args=torchllm_args,
-                       checkpoint_dir=LLM_MODEL_PATH,
+                       checkpoint_dir=VLM_MODEL_PATH,
                       profiling_stage_data=profiling_data)
-    llm_max_gpu_total_bytes = profiling_data["max_gpu_total_bytes"]
+    vlm_max_gpu_total_bytes_no_mm_reqs = profiling_data["max_gpu_total_bytes"]
 
-    assert vlm_max_gpu_total_bytes < llm_max_gpu_total_bytes, f"available KVCache for VLMs is expected to be less than LLMs, but got {vlm_max_gpu_total_bytes} for VLM and {llm_max_gpu_total_bytes} for LLM"
+    assert vlm_max_gpu_total_bytes_with_mm_reqs < vlm_max_gpu_total_bytes_no_mm_reqs, f"available KVCache for VLMs is expected to be less when profiling with mm reqs, but got {vlm_max_gpu_total_bytes_with_mm_reqs} for mm reqs and {vlm_max_gpu_total_bytes_no_mm_reqs} without mm reqs"
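
Note on the reworked test: it now profiles the same VLM checkpoint twice, once with enable_mm_reqs on and once off, and reads the measured value back out of the same profiling_data dict it passed in. A toy sketch of that in/out contract (fake_create_py_executor and the byte values are invented stand-ins, not the real API):

def fake_create_py_executor(profiling_stage_data: dict) -> None:
    # Stand-in for create_py_executor: read the flag, write the result back
    # into the same dict, just as the test reads "max_gpu_total_bytes".
    enable_mm = profiling_stage_data.get("enable_mm_reqs", False)
    # Pretend the image encoder leaves less memory free when mm reqs are on.
    profiling_stage_data["max_gpu_total_bytes"] = 60 if enable_mm else 70


with_mm = {"enable_mm_reqs": True}
without_mm = {"enable_mm_reqs": False}
fake_create_py_executor(with_mm)
fake_create_py_executor(without_mm)
assert with_mm["max_gpu_total_bytes"] < without_mm["max_gpu_total_bytes"]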
