Commit 5cc49fe

fix latest rebase from 7291cdc
Signed-off-by: John Calderon <jcalderon@nvidia.com>
1 parent fa24744

2 files changed: 4 additions, 5 deletions


tensorrt_llm/_torch/pyexecutor/_util.py
Lines changed: 1 addition & 2 deletions

@@ -416,8 +416,7 @@ def configure_kv_cache_capacity(self, py_executor: PyExecutor) -> None:
         # set max_gpu_total_bytes
         self._kv_cache_config.max_gpu_total_bytes = kv_cache_max_memory
         if isinstance(self._profiling_stage_data, dict):
-            self._profiling_stage_data[
-                "max_gpu_total_bytes"] = kv_cache_max_memory
+            self._profiling_stage_data["activation_bytes"] = activation_bytes
         # ---------------------------handle max_gpu_total_bytes---------------------------------

     def _create_kv_cache_manager(
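For context on this hunk: after the rebase, configure_kv_cache_capacity reports the measured activation footprint ("activation_bytes") into the caller-supplied profiling_stage_data dict instead of the KV-cache budget ("max_gpu_total_bytes"). A minimal sketch of how a caller might read that dict back, assuming only what the diff shows; the helper name and the GiB formatting are illustrative, not part of the TensorRT-LLM API:

    # Illustrative only: "report_activation_footprint" is a hypothetical helper.
    # The key "activation_bytes" is the one written by
    # configure_kv_cache_capacity after this commit.
    def report_activation_footprint(profiling_stage_data: dict) -> None:
        activation_bytes = profiling_stage_data.get("activation_bytes")
        if activation_bytes is None:
            print("profiling stage did not record activation memory")
            return
        print(f"activation memory: {activation_bytes / (1 << 30):.2f} GiB")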

tests/unittest/llmapi/test_memory_profiling.py
Lines changed: 3 additions & 3 deletions

@@ -62,7 +62,7 @@ def test_profile_kvcache():
     py_executor = create_py_executor(llm_args=torchllm_args,
                                      checkpoint_dir=VLM_MODEL_PATH,
                                      profiling_stage_data=profiling_data)
-    vlm_max_gpu_total_bytes_with_mm_reqs = profiling_data["max_gpu_total_bytes"]
+    vlm_activation_bytes_with_mm_reqs = profiling_data["activation_bytes"]
     py_executor.shutdown()
     torch.cuda.empty_cache()

@@ -71,8 +71,8 @@ def test_profile_kvcache():
     py_executor_2 = create_py_executor(llm_args=torchllm_args,
                                        checkpoint_dir=VLM_MODEL_PATH,
                                        profiling_stage_data=profiling_data)
-    vlm_max_gpu_total_bytes_no_mm_reqs = profiling_data["max_gpu_total_bytes"]
+    vlm_activation_bytes_no_mm_reqs = profiling_data["activation_bytes"]
     py_executor_2.shutdown()
     torch.cuda.empty_cache()

-    assert vlm_max_gpu_total_bytes_with_mm_reqs < vlm_max_gpu_total_bytes_no_mm_reqs, f"available KVCache for VLMs is expected to be less when profiling with mm reqs, but got {vlm_max_gpu_total_bytes_with_mm_reqs} for mm reqs and {vlm_max_gpu_total_bytes_no_mm_reqs} without mm reqs"
+    assert vlm_activation_bytes_with_mm_reqs > vlm_activation_bytes_no_mm_reqs, f"Activation bytes should be higher with mm reqs, but got {vlm_activation_bytes_with_mm_reqs} for mm reqs and {vlm_activation_bytes_no_mm_reqs} without mm reqs"
