 import tensorrt_llm
 import tensorrt_llm.bindings.executor as trtllm
 from tensorrt_llm._torch.model_config import ModelConfig
+from tensorrt_llm._torch.models.modeling_utils import \
+    MODEL_CLASS_VISION_ENCODER_MAPPING
 from tensorrt_llm._utils import str_dtype_to_binding, torch_dtype_to_str
 from tensorrt_llm.bindings.executor import DecodingMode, ExecutorConfig
 from tensorrt_llm.inputs.registry import (create_input_processor,
@@ -56,14 +58,8 @@ def __init__(self, *, executor_config: ExecutorConfig,
         self._draft_model_engine = draft_model_engine
         self._mapping = mapping
         self._max_kv_tokens_in = self._executor_config.kv_cache_config.max_tokens
-        self._is_multimodal = getattr(self._model_engine.model, "is_multimodal",
-                                      False)
-        if self._is_multimodal:
-            self._dummy_reqs = self._create_dummy_mm_context_request(
-                net_max_seq_len - 1)
-        else:
-            self._dummy_reqs = self._create_dummy_context_requests(
-                net_max_seq_len - 1)
+        self._dummy_reqs = self._create_dummy_context_requests(net_max_seq_len -
+                                                               1)
         self._kv_connector_manager = kv_connector_manager

     @staticmethod
@@ -142,6 +138,7 @@ def _cal_max_memory(self, peak_memory, total_gpu_memory, fraction,

     def _create_dummy_mm_context_request(
             self, input_seq_len: int) -> List[trtllm.Request]:
+        requests = []
         self._model_name_or_path = getattr(self._model_engine.model,
                                            "name_or_path", None)
         self._tokenizer = AutoTokenizer.from_pretrained(
@@ -152,7 +149,7 @@ def _create_dummy_mm_context_request(
             logger.warning("The input processor of the model does not have the method [get_prompt_for_profiling] implemented." \
                 "Profiling with the default input dummy context request. This may not take into account the memory consumption of " \
                 "ViT's encoder")
-            return self._create_dummy_context_requests(input_seq_len)
+            return requests
         text_prompt = input_processor.get_prompt_for_profiling()
         max_beam_width = self._executor_config.max_beam_width
         input_processor_with_hash = create_input_processor_with_hash(
@@ -162,7 +159,6 @@ def _create_dummy_mm_context_request(
         multimodal_input = extra_processed_inputs.get('multimodal_input')
         multimodal_data = extra_processed_inputs.get('multimodal_data')

-        requests = []
         max_num_tokens = len(prompt_token_ids)
         remaining_tokens = max(max_num_tokens, input_seq_len)
         # add +1 to max_num_tokens to avoid assert in line 772 of tensorrt_llm/_torch/attention_backend/trtllm.py
@@ -195,11 +191,18 @@ def _create_dummy_mm_context_request(

     def _create_dummy_context_requests(
             self, input_seq_len: int) -> List[trtllm.Request]:
+        requests = []
+        if MODEL_CLASS_VISION_ENCODER_MAPPING.get(
+                self._model_engine.model.original_arch, None):
+            requests = self._create_dummy_mm_context_request(input_seq_len)
+            # If profiling with multimodal requests succeeded, return them;
+            # otherwise fall back to the default text-only profiling below.
+            if requests:
+                return requests
         vocab_size = self._model_engine.model.model_config.pretrained_config.vocab_size
         max_num_tokens = self._executor_config.max_num_tokens
         max_beam_width = self._executor_config.max_beam_width

-        requests = []
         input_seq_len = min(max_num_tokens, input_seq_len)
         remaining_tokens = max_num_tokens
         while remaining_tokens > 0:
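
For reviewers who want the control flow at a glance, here is a minimal, self-contained sketch of the fallback this change introduces: when the model architecture has a vision encoder registered, multimodal dummy requests are tried first, and profiling falls back to plain text-only dummy requests if they cannot be built. VISION_ENCODER_REGISTRY, DummyRequest, create_dummy_mm_requests and create_dummy_context_requests below are hypothetical stand-ins, not the real TensorRT-LLM types or signatures.

from typing import Dict, List, Optional


# Hypothetical stand-in for MODEL_CLASS_VISION_ENCODER_MAPPING: maps an
# architecture name to its vision-encoder class (a bare `object` here).
VISION_ENCODER_REGISTRY: Dict[str, type] = {
    "LlavaForConditionalGeneration": object,
}


class DummyRequest:
    """Hypothetical placeholder for trtllm.Request, used only for profiling."""

    def __init__(self, token_ids: List[int], multimodal: bool = False):
        self.token_ids = token_ids
        self.multimodal = multimodal


def create_dummy_mm_requests(input_seq_len: int) -> List[DummyRequest]:
    """Build multimodal profiling requests, or return [] when the model's
    input processor offers no profiling prompt (mirrors the early
    `return requests` added in the diff)."""
    profiling_prompt: Optional[List[int]] = None  # pretend the lookup failed
    if profiling_prompt is None:
        return []
    return [DummyRequest(profiling_prompt, multimodal=True)]


def create_dummy_context_requests(arch: str,
                                  input_seq_len: int) -> List[DummyRequest]:
    """Prefer multimodal dummy requests when the architecture has a vision
    encoder registered; otherwise fall back to text-only dummy requests."""
    if VISION_ENCODER_REGISTRY.get(arch) is not None:
        requests = create_dummy_mm_requests(input_seq_len)
        if requests:  # multimodal profiling succeeded
            return requests
    # Default case: a single text-only request of the requested length.
    return [DummyRequest(list(range(input_seq_len)))]


if __name__ == "__main__":
    reqs = create_dummy_context_requests("LlavaForConditionalGeneration", 16)
    print(len(reqs), reqs[0].multimodal)  # -> 1 False (fell back to text-only)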