@@ -65,13 +65,14 @@ def __init__(self, *, model_engine: PyTorchModelEngine,
         self._max_kv_tokens_in = self._kv_cache_config.max_tokens
         self._max_num_tokens = max_num_tokens
         self._max_beam_width = max_beam_width
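+        # Set before _create_dummy_context_requests, whose multimodal path may grow it.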
+        self._max_seq_len = max_seq_len
         self._dummy_reqs = self._create_dummy_context_requests(net_max_seq_len -
                                                                 1)
         self._kv_connector_manager = kv_connector_manager
         self._pytorch_backend_config = pytorch_backend_config
         self._speculative_config = speculative_config
         self._tokens_per_block = tokens_per_block
-        self._max_seq_len = max_seq_len
         self._max_batch_size = max_batch_size
         self._profiling_stage_data = profiling_stage_data

@@ -175,8 +175,11 @@ def _create_dummy_mm_context_request(
         max_num_tokens = len(prompt_token_ids)
         remaining_tokens = max(max_num_tokens, input_seq_len)
         if remaining_tokens > input_seq_len:
-            logger.warning(f"Profiling with multimedia prompt which contains more tokens than the allowed input_seq_len." \
-                           f"Multimedia prompt has {remaining_tokens} while the input_seq_len is: {input_seq_len}")
+            logger.warning(f"Profiling with a multimodal prompt that contains more tokens than the allowed input_seq_len. " \
+                           f"The multimodal prompt has {remaining_tokens} tokens while input_seq_len is {input_seq_len}.")
+            # Add 1 to avoid "RuntimeError: The max KV cache length of input sequences (X + 1)
+            # exceeds the KV cache manager's maximum supported length X." raised at tensorrt_llm/_torch/attention_backend/trtllm.py, line 837.
+            self._max_seq_len = remaining_tokens + 1
         while remaining_tokens > 0:
             req_mm_input = trtllm.MultimodalInput(
                 multimodal_hashes=multimodal_input.multimodal_hashes,