
Commit 9cc9de7

Fix max_seq_len being less than max_num_tokens during profiling
Signed-off-by: John Calderon <jcalderon@nvidia.com>
1 parent: 91da825

2 files changed: 15 additions, 7 deletions

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 6 additions & 3 deletions
@@ -65,13 +65,13 @@ def __init__(self, *, model_engine: PyTorchModelEngine,
         self._max_kv_tokens_in = self._kv_cache_config.max_tokens
         self._max_num_tokens = max_num_tokens
         self._max_beam_width = max_beam_width
+        self._max_seq_len = max_seq_len
         self._dummy_reqs = self._create_dummy_context_requests(net_max_seq_len -
                                                                1)
         self._kv_connector_manager = kv_connector_manager
         self._pytorch_backend_config = pytorch_backend_config
         self._speculative_config = speculative_config
         self._tokens_per_block = tokens_per_block
-        self._max_seq_len = max_seq_len
         self._max_batch_size = max_batch_size
         self._profiling_stage_data = profiling_stage_data
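The first hunk is an ordering fix: self._max_seq_len is now assigned before _create_dummy_context_requests runs, presumably because the dummy-request path (see the second hunk) reads and may update that attribute. A minimal sketch of why that ordering matters, using hypothetical names rather than the real estimator class:

```python
class Estimator:
    """Hypothetical stand-in for the profiling estimator in _util.py."""

    def __init__(self, max_seq_len: int, net_max_seq_len: int):
        # Assign the attribute *before* calling the helper; if the assignment
        # came after the call (as in the pre-fix code), the helper would hit
        # an AttributeError or use a stale value.
        self._max_seq_len = max_seq_len
        self._dummy_reqs = self._create_dummy_requests(net_max_seq_len - 1)

    def _create_dummy_requests(self, input_seq_len: int) -> list[int]:
        # The real helper sizes dummy context requests; here we just clamp
        # the request length to self._max_seq_len for illustration.
        return [min(input_seq_len, self._max_seq_len)]


est = Estimator(max_seq_len=4096, net_max_seq_len=8192)
print(est._dummy_reqs)  # [4096] -- the dummy request is clamped to max_seq_len
```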

@@ -175,8 +175,11 @@ def _create_dummy_mm_context_request(
         max_num_tokens = len(prompt_token_ids)
         remaining_tokens = max(max_num_tokens, input_seq_len)
         if remaining_tokens > input_seq_len:
-            logger.warning(f"Profiling with multimedia prompt which contains more tokens than the allowed input_seq_len." \
-                           f"Multimedia prompt has {remaining_tokens} while the input_seq_len is: {input_seq_len}")
+            logger.warning(f"Profiling with multimedia prompt which contains more tokens than the allowed input_seq_len. " \
+                           f"Multimodal prompt has {remaining_tokens} while the input_seq_len is: {input_seq_len}")
+            ## add + 1 to avoid error: RuntimeError: The max KV cache length of input sequences (X + 1) exceeds the KV cache manager's maximum supported length X.
+            ## at "/code/tensorrt_llm/tensorrt_llm/_torch/attention_backend/trtllm.py", line 837
+            self._max_seq_len = remaining_tokens + 1
         while remaining_tokens > 0:
             req_mm_input = trtllm.MultimodalInput(
                 multimodal_hashes=multimodal_input.multimodal_hashes,
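The second hunk handles multimodal profiling prompts that are longer than input_seq_len: without widening self._max_seq_len, the KV cache manager later rejects the dummy sequence with the RuntimeError quoted in the committed comment. A hedged, self-contained sketch of the adjustment logic (the function name and parameters below are illustrative, not the real signature):

```python
def widen_max_seq_len(prompt_token_ids: list[int], input_seq_len: int,
                      max_seq_len: int) -> int:
    # Sketch of the guard added in _create_dummy_mm_context_request: if the
    # multimodal profiling prompt outgrows input_seq_len, widen max_seq_len
    # to prompt length + 1 so the KV cache manager accepts the dummy sequence.
    remaining_tokens = max(len(prompt_token_ids), input_seq_len)
    if remaining_tokens > input_seq_len:
        return remaining_tokens + 1
    return max_seq_len


# A 9000-token multimodal prompt with input_seq_len=8192 widens the limit to 9001.
print(widen_max_seq_len(list(range(9000)), 8192, 8192))
```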

tests/unittest/llmapi/test_memory_profiling.py

Lines changed: 9 additions & 4 deletions
@@ -8,6 +8,10 @@
 from tensorrt_llm.llmapi.llm_args import (CudaGraphConfig, KvCacheConfig,
                                           TorchLlmArgs)
 
+# isort: off
+from .test_llm import get_model_path
+# isort: on
+
 pytestmark = pytest.mark.threadleak(enabled=False)
 
 
@@ -16,10 +20,11 @@ def test_profile_kvcache():
                                      free_gpu_memory_fraction=0.9)
     cuda_graph_config = CudaGraphConfig(max_batch_size=512)
 
-    VLM_MODEL = "Qwen/Qwen2.5-VL-3B-Instruct"
-    VLM_MODEL_PATH = "/workspace/.cache/huggingface/hub/models--Qwen--Qwen2.5-VL-3B-Instruct/snapshots/66285546d2b821cf421d4f5eb2576359d3770cd3"
-    LLM_MODEL = "Qwen/Qwen2.5-3B-Instruct"
-    LLM_MODEL_PATH = "/workspace/.cache/huggingface/hub/models--Qwen--Qwen2.5-3B-Instruct/snapshots/aa8e72537993ba99e69dfaafa59ed015b17504d1"
+    VLM_MODEL = "Qwen2.5-VL-7B-Instruct"
+    VLM_MODEL_PATH = get_model_path(VLM_MODEL)
+    LLM_MODEL = "Qwen2.5-7B-Instruct"
+    LLM_MODEL_PATH = get_model_path(LLM_MODEL)
+
     build_config = BuildConfig(max_batch_size=2048,
                                max_num_tokens=8192,
                                max_beam_width=1,
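The test now resolves model directories through get_model_path from test_llm instead of hard-coding Hugging Face snapshot paths, so it no longer depends on one machine's cache layout. The real helper lives in tests/unittest/llmapi/test_llm.py; the sketch below is only an assumption of what such a resolver typically does (the LLM_MODELS_ROOT variable and the fallback behavior are illustrative, not the actual implementation):

```python
import os


def get_model_path(model_name: str) -> str:
    # Hypothetical resolver: look for the model under a shared local root
    # (an assumed LLM_MODELS_ROOT environment variable) and fall back to the
    # bare name so Hugging Face Hub resolution still works.
    root = os.environ.get("LLM_MODELS_ROOT", "")
    candidate = os.path.join(root, model_name)
    return candidate if root and os.path.isdir(candidate) else model_name


print(get_model_path("Qwen2.5-VL-7B-Instruct"))
```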
