Skip to content

Commit 4b93904

Browse files
Add cases for RCCA 5422621
Add two cases for RCCA 5561153

Signed-off-by: FredricZ-2007 <226039983+fredricz-20070104@users.noreply.github.com>
1 parent 0442510 commit 4b93904

File tree

6 files changed

+286
-0
lines changed

6 files changed

+286
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
metadata:
2+
model_name: Qwen3-235B-A22B-FP8
3+
precision: fp8
4+
model_dir_name: Qwen3-235B-A22B-FP8
5+
supported_gpus:
6+
- GB200
7+
- GB300
8+
script_file: disaggr_torch.slurm
9+
benchmark_type: 1k1k
10+
config_index: 21
11+
slurm:
12+
script_file: disaggr_torch.slurm
13+
partition: <partition>
14+
account: <account>
15+
job_time: 02:00:00
16+
job_name: unified-benchmark
17+
numa_bind: true
18+
benchmark:
19+
mode: e2e
20+
use_nv_sa_benchmark: true
21+
multi_round: 8
22+
benchmark_ratio: 0.8
23+
streaming: true
24+
concurrency_list: 1 2 4 8 16 36
25+
input_length: 1024
26+
output_length: 1024
27+
dataset_file: <dataset_file>
28+
hardware:
29+
gpus_per_node: 4
30+
num_ctx_servers: 1
31+
num_gen_servers: 1
32+
environment:
33+
container_mount: <container_mount>
34+
container_image: <container_image>
35+
model_path: <model_path>
36+
trtllm_repo: ''
37+
build_wheel: false
38+
work_dir: <full_path_to_work_dir>
39+
profiling:
40+
nsys_on: false
41+
accuracy:
42+
enable_accuracy_test: false
43+
model: local-completions
44+
tasks: gsm8k
45+
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
46+
worker_config:
47+
gen:
48+
tensor_parallel_size: 4
49+
moe_expert_parallel_size: 4
50+
enable_attention_dp: false
51+
pipeline_parallel_size: 1
52+
max_batch_size: 64
53+
max_num_tokens: 2048
54+
max_seq_len: 2051
55+
cuda_graph_config:
56+
enable_padding: true
57+
max_batch_size: 128
58+
print_iter_log: true
59+
kv_cache_config:
60+
enable_block_reuse: true
61+
free_gpu_memory_fraction: 0.7
62+
dtype: fp8
63+
moe_config:
64+
backend: TRTLLM
65+
cache_transceiver_config:
66+
backend: NIXL
67+
stream_interval: 20
68+
num_postprocess_workers: 4
69+
allreduce_strategy: MNNVL
70+
disable_overlap_scheduler: false
71+
ctx:
72+
max_batch_size: 32
73+
max_num_tokens: 2048
74+
max_seq_len: 2051
75+
tensor_parallel_size: 4
76+
moe_expert_parallel_size: 4
77+
enable_attention_dp: false
78+
pipeline_parallel_size: 1
79+
print_iter_log: true
80+
cuda_graph_config: null
81+
disable_overlap_scheduler: true
82+
kv_cache_config:
83+
enable_block_reuse: true
84+
free_gpu_memory_fraction: 0.7
85+
dtype: fp8
86+
cache_transceiver_config:
87+
backend: NIXL
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
metadata:
2+
model_name: Qwen3-235B-A22B-FP8
3+
precision: fp8
4+
model_dir_name: Qwen3-235B-A22B-FP8
5+
supported_gpus:
6+
- GB200
7+
- GB300
8+
script_file: disaggr_torch.slurm
9+
benchmark_type: 1k1k
10+
config_index: 21
11+
slurm:
12+
script_file: disaggr_torch.slurm
13+
partition: <partition>
14+
account: <account>
15+
job_time: 02:00:00
16+
job_name: unified-benchmark
17+
numa_bind: true
18+
benchmark:
19+
mode: e2e
20+
use_nv_sa_benchmark: true
21+
multi_round: 8
22+
benchmark_ratio: 0.8
23+
streaming: true
24+
concurrency_list: 1 2 4 8 16 36
25+
input_length: 1024
26+
output_length: 1024
27+
dataset_file: <dataset_file>
28+
hardware:
29+
gpus_per_node: 4
30+
num_ctx_servers: 1
31+
num_gen_servers: 1
32+
environment:
33+
container_mount: <container_mount>
34+
container_image: <container_image>
35+
model_path: <model_path>
36+
trtllm_repo: ''
37+
build_wheel: false
38+
work_dir: <full_path_to_work_dir>
39+
profiling:
40+
nsys_on: false
41+
accuracy:
42+
enable_accuracy_test: false
43+
model: local-completions
44+
tasks: gsm8k
45+
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
46+
worker_config:
47+
gen:
48+
tensor_parallel_size: 4
49+
moe_expert_parallel_size: 4
50+
enable_attention_dp: false
51+
pipeline_parallel_size: 1
52+
max_batch_size: 64
53+
max_num_tokens: 2048
54+
max_seq_len: 2051
55+
cuda_graph_config:
56+
enable_padding: true
57+
max_batch_size: 128
58+
print_iter_log: true
59+
kv_cache_config:
60+
enable_block_reuse: true
61+
free_gpu_memory_fraction: 0.7
62+
dtype: fp8
63+
moe_config:
64+
backend: TRTLLM
65+
cache_transceiver_config:
66+
backend: UCX
67+
stream_interval: 20
68+
num_postprocess_workers: 4
69+
allreduce_strategy: MNNVL
70+
disable_overlap_scheduler: false
71+
ctx:
72+
max_batch_size: 32
73+
max_num_tokens: 2048
74+
max_seq_len: 2051
75+
tensor_parallel_size: 4
76+
moe_expert_parallel_size: 4
77+
enable_attention_dp: false
78+
pipeline_parallel_size: 1
79+
print_iter_log: true
80+
cuda_graph_config: null
81+
disable_overlap_scheduler: true
82+
kv_cache_config:
83+
enable_block_reuse: true
84+
free_gpu_memory_fraction: 0.7
85+
dtype: fp8
86+
cache_transceiver_config:
87+
backend: UCX
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
# nvbugs: 5422621
2+
metadata:
3+
model_name: deepseek-r1-fp4
4+
precision: fp4
5+
model_dir_name: DeepSeek-R1-0528-FP4-V2
6+
supported_gpus:
7+
- GB200
8+
- GB300
9+
script_file: disaggr_torch.slurm
10+
benchmark_type: 8k1k
11+
config_index: 7
12+
dataset_file: datasets/deepseek-r1-8192-1024-200000-ratio-1_for_serve.json
13+
slurm:
14+
script_file: disaggr_torch.slurm
15+
partition: <partition>
16+
account: <account>
17+
job_time: 02:00:00
18+
job_name: unified-benchmark
19+
numa_bind: true
20+
benchmark:
21+
mode: gen_only
22+
use_nv_sa_benchmark: false
23+
multi_round: 8
24+
benchmark_ratio: 0.8
25+
streaming: true
26+
concurrency_list: '12288'
27+
input_length: 8192  # NOTE(review): was 1024, which contradicts benchmark_type 8k1k and the 8192-1024 dataset above — confirm intended ISL
28+
output_length: 1024
29+
dataset_file: <dataset_file>
30+
hardware:
31+
gpus_per_node: 4
32+
num_ctx_servers: 2
33+
num_gen_servers: 1
34+
environment:
35+
container_mount: <container_mount>
36+
container_image: <container_image>
37+
model_path: <model_path>
38+
trtllm_repo: ''
39+
build_wheel: false
40+
work_dir: <full_path_to_work_dir>
41+
profiling:
42+
nsys_on: false
43+
accuracy:
44+
enable_accuracy_test: false
45+
model: local-completions
46+
tasks: gsm8k
47+
model_args_extra: num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096
48+
worker_config:
49+
gen:
50+
enable_layerwise_nvtx_marker: true
51+
tensor_parallel_size: 48
52+
moe_expert_parallel_size: 48
53+
enable_attention_dp: true
54+
enable_lm_head_tp_in_adp: true
55+
pipeline_parallel_size: 1
56+
max_batch_size: 1024
57+
max_num_tokens: 1024
58+
max_seq_len: 2176
59+
cuda_graph_config:
60+
enable_padding: true
61+
batch_sizes:
62+
- 1
63+
- 2
64+
- 4
65+
- 8
66+
- 16
67+
- 32
68+
- 64
69+
- 128
70+
- 256
71+
- 512
72+
- 768
73+
- 1024
74+
- 2048
75+
print_iter_log: true
76+
kv_cache_config:
77+
enable_block_reuse: false
78+
free_gpu_memory_fraction: 0.7
79+
dtype: fp8
80+
moe_config:
81+
backend: WIDEEP
82+
load_balancer:
83+
num_slots: 288
84+
layer_updates_per_iter: 1
85+
cache_transceiver_config:
86+
max_tokens_in_buffer: 8320
87+
backend: DEFAULT
88+
stream_interval: 20
89+
ctx:
90+
enable_layerwise_nvtx_marker: true
91+
max_batch_size: 4
92+
max_num_tokens: 4480
93+
max_seq_len: 2176
94+
tensor_parallel_size: 4
95+
moe_expert_parallel_size: 4
96+
enable_attention_dp: true
97+
pipeline_parallel_size: 1
98+
print_iter_log: true
99+
cuda_graph_config: null
100+
disable_overlap_scheduler: true
101+
kv_cache_config:
102+
enable_block_reuse: false
103+
free_gpu_memory_fraction: 0.85
104+
dtype: fp8
105+
cache_transceiver_config:
106+
max_tokens_in_buffer: 8320
107+
backend: DEFAULT
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
test_disagg.py::TestDisaggBenchmark::test_benchmark[Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-NIXL]
2+
test_disagg.py::TestDisaggBenchmark::test_benchmark[Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-UCX]

tests/integration/defs/perf/disagg/testlist/disagg.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_
1616
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_deepseek-r1-fp4_8k1k_ctx1_gen3_tep8_bs16_eplb0_mtp3_ccb-UCX]
1717
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-NIXL]
1818
test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen4_tep8_bs32_eplb0_mtp0_ccb-UCX]
19+
test_disagg.py::TestDisaggBenchmark::test_benchmark[Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-NIXL]
20+
test_disagg.py::TestDisaggBenchmark::test_benchmark[Qwen3-235B-A22B-FP8_1k1k_ctx1_gen1_tep8_bs32_eplb0_mtp0_ccb-UCX]
1921
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb0_mtp3_ccb-NIXL]
2022
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-NIXL]
2123
# test_disagg.py::TestDisaggBenchmark::test_benchmark[disagg_perf_Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp1_ccb-UCX]

tests/integration/defs/perf/disagg/testlist/wideep.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_
77
test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_8k1k_ctx6_gen1_dep16_bs64_eplb288_mtp0_ccb-UCX]
88
test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp3_ccb-NIXL]
99
test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx1_gen1_dep32_bs32_eplb288_mtp0_ccb-NIXL]
10+
test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_deepseek-r1-fp4_1k1k_ctx2_gen1_dep48_bs16_eplb288_mtp3_ccb-DEFAULT]
1011
# test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep16_bs64_eplb288_mtp3_ccb-NIXL]
1112
# test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx1_gen1_dep32_bs16_eplb288_mtp3_ccb-NIXL]
1213
# test_disagg.py::TestDisaggBenchmark::test_benchmark[wideep_perf_Qwen3-235B-A22B-FP4_1k1k_ctx2_gen1_dep16_bs128_eplb288_mtp1_ccb-NIXL]

0 commit comments

Comments
 (0)