18 changes: 18 additions & 0 deletions tests/integration/defs/examples/serve/test_configs/DeepSeek-R1-FP4.yml
@@ -0,0 +1,18 @@
enable_iter_perf_stats: true
print_iter_log: false
cuda_graph_config:
  max_batch_size: 16
  enable_padding: false
moe_config:
  backend: TRTLLM
  max_num_tokens: 32768
speculative_config:
  decoding_type: MTP
  num_nextn_predict_layers: 3
disable_overlap_scheduler: true
enable_autotuner: true
kv_cache_config:
  free_gpu_memory_fraction: 0.6
  enable_block_reuse: true
  enable_partial_reuse: false
enable_chunked_prefill: true
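For context, a minimal sketch (not part of this PR) of how an extra-options YAML like the one above is typically consumed: trtllm-serve parses the file and effectively forwards its top-level keys as LLM API arguments. The checkpoint path, the tensor_parallel_size value, and the direct coercion of nested dicts into the corresponding config objects are assumptions for illustration.

import yaml

from tensorrt_llm import LLM

# Load the extra options shown above (file name taken from the test below).
with open("DeepSeek-R1-FP4.yml") as f:
    extra_llm_api_options = yaml.safe_load(f)

# Hypothetical local checkpoint path; tp size mirrors the test's --tp_size 8.
llm = LLM(model="/models/DeepSeek-R1/DeepSeek-R1-FP4",
          tensor_parallel_size=8,
          **extra_llm_api_options)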
58 changes: 56 additions & 2 deletions tests/integration/defs/examples/serve/test_serve.py
@@ -1,8 +1,9 @@
import os
import time

import pytest
import requests
-from defs.conftest import llm_models_root, skip_no_hopper
+from defs.conftest import llm_models_root, skip_no_hopper, \
+    skip_pre_hopper, skip_post_blackwell_ultra
from defs.trt_test_alternative import popen, print_error, print_info
from openai import OpenAI
from requests.exceptions import RequestException
@@ -130,3 +131,56 @@ def test_extra_llm_api_options(serve_test_root):
        model_name = model_path.split('/')[-1]  # "Qwen3-30B-A3B-FP8"
        # Test the server with OpenAI chat completion
        check_openai_chat_completion(model_name=model_name)


@skip_pre_hopper
Reviewer (Collaborator): FP4 is not supported on Hopper; use skip_pre_blackwell?
@skip_post_blackwell_ultra
@pytest.mark.skip_less_device(8)
def test_extra_llm_api_options_for_deepseek_r1_fp4(serve_test_root):
    test_configs_root = f"{serve_test_root}/test_configs"

    # moe backend = CUTLASS which only supports fp8 blockscale on Hopper
Reviewer (Collaborator): this comment is kind of misleading.
    config_file = f"{test_configs_root}/DeepSeek-R1-FP4.yml"
    model_path = f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-FP4"

    # Assert that required files and directories exist
    assert os.path.exists(
        test_configs_root
    ), f"test_configs_root directory does not exist: {test_configs_root}"
    assert os.path.exists(
        config_file), f"config_file does not exist: {config_file}"
    assert os.path.exists(
        model_path), f"model_path does not exist: {model_path}"

    cmd = [
        "trtllm-serve",
        model_path,
        "--host",
        "0.0.0.0",
        "--port",
        "8000",
        "--backend",
        "pytorch",
        "--max_batch_size",
        "32",
        "--max_num_tokens",
        "32768",
        "--max_seq_len",
        "163840",
        "--tp_size",
        "8",
        "--ep_size",
        "1",
        "--extra_llm_api_options",
        config_file,
        "--log_level",
        "info",
    ]

    print_info("Launching trtllm-serve...")
    with popen(cmd):
        check_server_ready(timeout_timer=3600)
        # Extract model name from the model path for consistency
        model_name = model_path.split('/')[-1]  # "DeepSeek-R1-FP4"
        # Test the server with OpenAI chat completion
        check_openai_chat_completion(model_name=model_name)
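For reference, a minimal sketch of the kind of request check_openai_chat_completion() presumably issues against the server launched above; the base URL, prompt, and max_tokens are illustrative and not taken from this PR.

from openai import OpenAI

# The server above binds 0.0.0.0:8000; a local client reaches it via localhost.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
response = client.chat.completions.create(
    model="DeepSeek-R1-FP4",  # model name derived from the model path, as in the test
    messages=[{"role": "user", "content": "Hello, who are you?"}],
    max_tokens=32,
)
assert response.choices[0].message.content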
1 change: 1 addition & 0 deletions tests/integration/test_lists/qa/llm_function_core.txt
@@ -755,6 +755,7 @@ examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-small-128k-instruct-eagle2]
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3.5-mini-instruct-eagle2]

examples/serve/test_serve.py::test_extra_llm_api_options
examples/serve/test_serve.py::test_extra_llm_api_options_for_deepseek_r1_fp4 TIMEOUT (120)
examples/serve/test_serve_negative.py::test_invalid_max_tokens
examples/serve/test_serve_negative.py::test_invalid_temperature
examples/serve/test_serve_negative.py::test_invalid_top_p[-0.1]