18 changes: 18 additions & 0 deletions tests/integration/defs/examples/serve/test_configs/DeepSeek-R1-FP4.yml
@@ -0,0 +1,18 @@
enable_iter_perf_stats: true
print_iter_log: false
cuda_graph_config:
  max_batch_size: 16
  enable_padding: false
moe_config:
  backend: TRTLLM
  max_num_tokens: 32768
speculative_config:
  decoding_type: MTP
  num_nextn_predict_layers: 3
disable_overlap_scheduler: true
enable_autotuner: true
kv_cache_config:
  free_gpu_memory_fraction: 0.6
  enable_block_reuse: true
  enable_partial_reuse: false
enable_chunked_prefill: true
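For context, a minimal sketch (not part of this PR) of how an extra-options YAML like the one above is typically consumed: trtllm-serve parses the file and effectively forwards its top-level keys as LLM API arguments. The checkpoint path, the tensor_parallel_size value, and the direct coercion of nested dicts into the corresponding config objects are assumptions for illustration.

import yaml

from tensorrt_llm import LLM

# Load the extra options shown above (file name taken from the test below).
with open("DeepSeek-R1-FP4.yml") as f:
    extra_llm_api_options = yaml.safe_load(f)

# Hypothetical local checkpoint path; tp size mirrors the test's --tp_size 8.
llm = LLM(model="/models/DeepSeek-R1/DeepSeek-R1-FP4",
          tensor_parallel_size=8,
          **extra_llm_api_options)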
58 changes: 56 additions & 2 deletions tests/integration/defs/examples/serve/test_serve.py
@@ -1,8 +1,9 @@
import os
import time

import pytest
import requests
-from defs.conftest import llm_models_root, skip_no_hopper
+from defs.conftest import llm_models_root, skip_no_hopper, \
+    skip_pre_hopper, skip_post_blackwell_ultra
from defs.trt_test_alternative import popen, print_error, print_info
from openai import OpenAI
from requests.exceptions import RequestException
@@ -130,3 +131,56 @@ def test_extra_llm_api_options(serve_test_root):
        model_name = model_path.split('/')[-1]  # "Qwen3-30B-A3B-FP8"
        # Test the server with OpenAI chat completion
        check_openai_chat_completion(model_name=model_name)


@skip_pre_hopper
Reviewer (Collaborator): FP4 is not supported on Hopper; use skip_pre_blackwell?
@skip_post_blackwell_ultra
@pytest.mark.skip_less_device(8)
def test_extra_llm_api_options_for_deepseek_r1_fp4(serve_test_root):
    test_configs_root = f"{serve_test_root}/test_configs"

    # moe backend = CUTLASS which only supports fp8 blockscale on Hopper
Reviewer (Collaborator): this comment is kind of misleading.
    config_file = f"{test_configs_root}/DeepSeek-R1-FP4.yml"
    model_path = f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-FP4"

    # Assert that required files and directories exist
    assert os.path.exists(
        test_configs_root
    ), f"test_configs_root directory does not exist: {test_configs_root}"
    assert os.path.exists(
        config_file), f"config_file does not exist: {config_file}"
    assert os.path.exists(
        model_path), f"model_path does not exist: {model_path}"

    cmd = [
        "trtllm-serve",
        model_path,
        "--host",
        "0.0.0.0",
        "--port",
        "8000",
        "--backend",
        "pytorch",
        "--max_batch_size",
        "32",
        "--max_num_tokens",
        "32768",
        "--max_seq_len",
        "163840",
        "--tp_size",
        "8",
        "--ep_size",
        "1",
        "--extra_llm_api_options",
        config_file,
        "--log_level",
        "info",
    ]

    print_info("Launching trtllm-serve...")
    with popen(cmd):
        check_server_ready(timeout_timer=3600)
        # Extract model name from the model path for consistency
        model_name = model_path.split('/')[-1]  # "DeepSeek-R1-FP4"
        # Test the server with OpenAI chat completion
        check_openai_chat_completion(model_name=model_name)
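For reference, a minimal sketch of the kind of request check_openai_chat_completion() presumably issues against the server launched above; the base URL, prompt, and max_tokens are illustrative and not taken from this PR.

from openai import OpenAI

# The server above binds 0.0.0.0:8000; a local client reaches it via localhost.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
response = client.chat.completions.create(
    model="DeepSeek-R1-FP4",  # model name derived from the model path, as in the test
    messages=[{"role": "user", "content": "Hello, who are you?"}],
    max_tokens=32,
)
assert response.choices[0].message.content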
1 change: 1 addition & 0 deletions tests/integration/test_lists/qa/llm_function_core.txt
@@ -755,6 +755,7 @@ examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-small-128k-instruct-eagle2]
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3.5-mini-instruct-eagle2]

examples/serve/test_serve.py::test_extra_llm_api_options
examples/serve/test_serve.py::test_extra_llm_api_options_for_deepseek_r1_fp4 TIMEOUT (120)
examples/serve/test_serve_negative.py::test_invalid_max_tokens
examples/serve/test_serve_negative.py::test_invalid_temperature
examples/serve/test_serve_negative.py::test_invalid_top_p[-0.1]