From 1153cd24f6534b9b7cc7e055d962d084856294c2 Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Wed, 26 Nov 2025 02:02:32 -0800 Subject: [PATCH 01/10] Clean up PDL logics Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> --- .../disaggregated/slurm/benchmark/config.yaml | 4 ++-- .../slurm/benchmark/disaggr_torch.slurm | 15 ++------------- .../slurm/benchmark/start_worker.sh | 16 +++++----------- 3 files changed, 9 insertions(+), 26 deletions(-) diff --git a/examples/disaggregated/slurm/benchmark/config.yaml b/examples/disaggregated/slurm/benchmark/config.yaml index dd8705ce629..af9396dc276 100644 --- a/examples/disaggregated/slurm/benchmark/config.yaml +++ b/examples/disaggregated/slurm/benchmark/config.yaml @@ -34,8 +34,8 @@ environment: build_wheel: false # Don't build the wheel when launching multiple jobs trtllm_wheel_path: "" # Path to pre-built TensorRT-LLM wheel. If provided, install from this wheel instead work_dir: "" - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1" - server_env_var: "" + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" # Profiling Configuration profiling: diff --git a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm index ef308a822a3..4ad8142d0a5 100644 --- a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm +++ b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm @@ -169,17 +169,6 @@ elif [ -d "${trtllm_repo}" ]; then echo "TensorRT-LLM installation completed successfully" fi -# Get enable_pdl from gen config -enable_pdl=$(python3 -c "import yaml; import sys; -try: - with open('${gen_config_path}') as f: - c = yaml.safe_load(f) - print(str(not c.get('enable_attention_dp', True)).lower()) -except Exception as e: - 
print(f'Error reading config: {e}', file=sys.stderr) - sys.exit(1) -") - # Get node lists all_nodes=($(scontrol show hostname $SLURM_NODELIST | sort)) total_nodes_num=${#all_nodes[@]} @@ -211,7 +200,7 @@ for i in $(seq 0 $((num_gen_servers - 1))); do --container-mounts=${container_mount} \ --mpi=pmix \ bash ${work_dir}/start_worker.sh \ - "GEN" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${enable_pdl}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${gen_config_path}" "${worker_env_var}" \ + "GEN" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${gen_config_path}" "${worker_env_var}" \ &> ${full_logdir}/output_gen_${i}.log & done @@ -226,7 +215,7 @@ for i in $(seq 0 $((num_ctx_servers - 1))); do --container-mounts=${container_mount} \ --mpi=pmix \ bash ${work_dir}/start_worker.sh \ - "CTX" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${enable_pdl}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${ctx_config_path}" "${worker_env_var}" \ + "CTX" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${ctx_config_path}" "${worker_env_var}" \ &> ${full_logdir}/output_ctx_${i}.log & done diff --git a/examples/disaggregated/slurm/benchmark/start_worker.sh b/examples/disaggregated/slurm/benchmark/start_worker.sh index 5543035ee6f..046e8d84bd0 100644 --- a/examples/disaggregated/slurm/benchmark/start_worker.sh +++ b/examples/disaggregated/slurm/benchmark/start_worker.sh @@ -9,15 +9,13 @@ model_path=${3} port=${4} benchmark_mode=${5} concurrency=${6} -enable_pdl=${7} -numa_bind=${8} -log_dir=${9} -enable_nsys=${10} -config_file=${11} -worker_env_var=${12} +numa_bind=${7} +log_dir=${8} +enable_nsys=${9} +config_file=${10} +worker_env_var=${11} unset UCX_TLS -echo "enable_pdl: ${enable_pdl}, log_dir: ${log_dir}" echo "SLURM_PROCID: ${SLURM_PROCID}, hostname: $(hostname), instance_id: ${instance_id}" # 
Export worker environment variables from config @@ -26,10 +24,6 @@ for env_var in ${worker_env_var}; do echo "Exported: ${env_var}" done -if [ "${enable_pdl}" = "true" ]; then - export TRTLLM_ENABLE_PDL=1 -fi - if [ "${numa_bind}" = "true" ]; then numa_bind_cmd="numactl -m 0,1" echo "numactl -m 0,1 - Only allocate memory from nodes on GB200" From 66566dd3a3d91f18f0166114e0c1c842f3cf496b Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Wed, 26 Nov 2025 04:17:07 -0800 Subject: [PATCH 02/10] Add extra args Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> --- examples/disaggregated/slurm/benchmark/config.yaml | 1 + examples/disaggregated/slurm/benchmark/submit.py | 1 + 2 files changed, 2 insertions(+) diff --git a/examples/disaggregated/slurm/benchmark/config.yaml b/examples/disaggregated/slurm/benchmark/config.yaml index af9396dc276..85dcf7d6895 100644 --- a/examples/disaggregated/slurm/benchmark/config.yaml +++ b/examples/disaggregated/slurm/benchmark/config.yaml @@ -5,6 +5,7 @@ slurm: account: "" job_time: "02:00:00" job_name: "" + extra_args: "" numa_bind: true # Only enable for GB200 NVL72 # Benchmark Mode diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py index 37efd0764b3..60fab020ba7 100644 --- a/examples/disaggregated/slurm/benchmark/submit.py +++ b/examples/disaggregated/slurm/benchmark/submit.py @@ -143,6 +143,7 @@ def submit_job(config): f'--ntasks={total_tasks}', f'--ntasks-per-node={hw_config["gpus_per_node"]}', f'--segment={total_nodes}', + slurm_config['extra_args'], slurm_config['script_file'], # Hardware configuration str(hw_config['gpus_per_node']), From 40d274d7eb72eec5e2e8d8c832385d4d21e299f6 Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Thu, 27 Nov 2025 00:23:33 -0800 Subject: [PATCH 03/10] Support iter range Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> --- 
.../disaggregated/slurm/benchmark/config.yaml | 2 ++ .../slurm/benchmark/disaggr_torch.slurm | 20 +++++++++++-------- .../slurm/benchmark/start_worker.sh | 14 +++++-------- .../disaggregated/slurm/benchmark/submit.py | 2 ++ 4 files changed, 21 insertions(+), 17 deletions(-) diff --git a/examples/disaggregated/slurm/benchmark/config.yaml b/examples/disaggregated/slurm/benchmark/config.yaml index 85dcf7d6895..7ba8ed4e3f2 100644 --- a/examples/disaggregated/slurm/benchmark/config.yaml +++ b/examples/disaggregated/slurm/benchmark/config.yaml @@ -41,6 +41,8 @@ environment: # Profiling Configuration profiling: nsys_on: false # Set to true to enable profiling + ctx_profile_range: "10-30" # Set TLLM_PROFILE_START_STOP for ctx workers + gen_profile_range: "200-250" # Set TLLM_PROFILE_START_STOP for gen workers # Accuracy Configuration accuracy: diff --git a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm index 4ad8142d0a5..83baa3ffeda 100644 --- a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm +++ b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm @@ -40,18 +40,20 @@ trtllm_wheel_path=${28} # Profiling nsys_on=${29} +ctx_profile_range=${30} +gen_profile_range=${31} # Accuracy evaluation -enable_accuracy_test=${30} -accuracy_model=${31} -accuracy_tasks=${32} -model_args_extra=${33} +enable_accuracy_test=${32} +accuracy_model=${33} +accuracy_tasks=${34} +model_args_extra=${35} # Worker environment variables -worker_env_var=${34} +worker_env_var=${36} # Server environment variables -server_env_var=${35} +server_env_var=${37} # Print all parsed arguments echo "Parsed arguments:" @@ -90,6 +92,8 @@ echo " build_wheel: ${build_wheel}" echo " trtllm_wheel_path: ${trtllm_wheel_path}" echo " work_dir: ${work_dir}" echo " nsys_on: ${nsys_on}" +echo " ctx_profile_range: ${ctx_profile_range}" +echo " gen_profile_range: ${gen_profile_range}" echo echo "Accuracy Configuration:" echo " 
enable_accuracy_test: ${enable_accuracy_test}" @@ -200,7 +204,7 @@ for i in $(seq 0 $((num_gen_servers - 1))); do --container-mounts=${container_mount} \ --mpi=pmix \ bash ${work_dir}/start_worker.sh \ - "GEN" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${gen_config_path}" "${worker_env_var}" \ + "GEN" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${gen_profile_range}" "${gen_config_path}" "${worker_env_var}" \ &> ${full_logdir}/output_gen_${i}.log & done @@ -215,7 +219,7 @@ for i in $(seq 0 $((num_ctx_servers - 1))); do --container-mounts=${container_mount} \ --mpi=pmix \ bash ${work_dir}/start_worker.sh \ - "CTX" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${ctx_config_path}" "${worker_env_var}" \ + "CTX" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${ctx_profile_range}" "${ctx_config_path}" "${worker_env_var}" \ &> ${full_logdir}/output_ctx_${i}.log & done diff --git a/examples/disaggregated/slurm/benchmark/start_worker.sh b/examples/disaggregated/slurm/benchmark/start_worker.sh index 046e8d84bd0..5d8d95cd56d 100644 --- a/examples/disaggregated/slurm/benchmark/start_worker.sh +++ b/examples/disaggregated/slurm/benchmark/start_worker.sh @@ -12,8 +12,9 @@ concurrency=${6} numa_bind=${7} log_dir=${8} enable_nsys=${9} -config_file=${10} -worker_env_var=${11} +profile_range=${10} +config_file=${11} +worker_env_var=${12} unset UCX_TLS echo "SLURM_PROCID: ${SLURM_PROCID}, hostname: $(hostname), instance_id: ${instance_id}" @@ -58,13 +59,8 @@ else export TLLM_PROFILE_RECORD_GC=1 export TLLM_NVTX_DEBUG=1 nsys_prefix="nsys profile -e \"NSYS_MPI_STORE_TEAMS_PER_RANK=1\" -o ${nsys_file} -f true -t cuda,nvtx,python-gil -c cudaProfilerApi --cuda-graph-trace node --capture-range-end=stop 
--gpu-metrics-devices=none" - if [ "${role}" = "GEN" ]; then - export TLLM_PROFILE_START_STOP=200-250 - echo "nsys is enabled on gen_gpus" - elif [ "${role}" = "CTX" ]; then - export TLLM_PROFILE_START_STOP=10-30 - echo "nsys is enabled on ctx_gpus" - fi + export TLLM_PROFILE_START_STOP=${profile_range} + echo "nsys is enabled on ${role} GPUs, TLLM_PROFILE_START_STOP=${profile_range}" ${nsys_prefix} trtllm-llmapi-launch ${numa_bind_cmd} \ trtllm-serve ${model_path} \ --host $(hostname) --port ${port} \ diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py index 60fab020ba7..8f1d297b08d 100644 --- a/examples/disaggregated/slurm/benchmark/submit.py +++ b/examples/disaggregated/slurm/benchmark/submit.py @@ -184,6 +184,8 @@ def submit_job(config): # Profiling str(config['profiling']['nsys_on']).lower(), + config['profiling']['ctx_profile_range'], + config['profiling']['gen_profile_range'], # Accuracy evaluation str(config['accuracy']['enable_accuracy_test']).lower(), From 6de0ed39a23844ceedd0a627317e68b2ce476b38 Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Thu, 27 Nov 2025 00:28:16 -0800 Subject: [PATCH 04/10] Polish Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> --- .../slurm/benchmark/start_worker.sh | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/examples/disaggregated/slurm/benchmark/start_worker.sh b/examples/disaggregated/slurm/benchmark/start_worker.sh index 5d8d95cd56d..a8576725c0e 100644 --- a/examples/disaggregated/slurm/benchmark/start_worker.sh +++ b/examples/disaggregated/slurm/benchmark/start_worker.sh @@ -40,29 +40,27 @@ fi echo "config_file: ${config_file}" -# save the hostname to a file - -# if SLURM_NODEID is 0 +# if SLURM_NODEID is 0, save the hostname to a file if [ "${SLURM_NODEID}" = "0" ]; then mkdir -p ${log_dir}/hostnames/ echo $(hostname) > 
${log_dir}/hostnames/${role}_${instance_id}.txt echo "hostname saved to ${log_dir}/hostnames/${role}_${instance_id}.txt" fi -#check if nsys is enabled +nsys_prefix="" if [ "${enable_nsys}" != "true" ]; then echo "nsys is not enabled, start normal flow" - trtllm-llmapi-launch ${numa_bind_cmd} trtllm-serve ${model_path} --host $(hostname) --port ${port} --extra_llm_api_options ${config_file} else - nsys_prefix="" nsys_file=${log_dir}/nsys_worker_proc_${role}_${instance_id}_${SLURM_PROCID} export TLLM_PROFILE_RECORD_GC=1 export TLLM_NVTX_DEBUG=1 - nsys_prefix="nsys profile -e \"NSYS_MPI_STORE_TEAMS_PER_RANK=1\" -o ${nsys_file} -f true -t cuda,nvtx,python-gil -c cudaProfilerApi --cuda-graph-trace node --capture-range-end=stop --gpu-metrics-devices=none" + export NSYS_MPI_STORE_TEAMS_PER_RANK=1 export TLLM_PROFILE_START_STOP=${profile_range} echo "nsys is enabled on ${role} GPUs, TLLM_PROFILE_START_STOP=${profile_range}" - ${nsys_prefix} trtllm-llmapi-launch ${numa_bind_cmd} \ - trtllm-serve ${model_path} \ - --host $(hostname) --port ${port} \ - --extra_llm_api_options ${config_file} + nsys_prefix="nsys profile -o ${nsys_file} -f true -t cuda,nvtx,python-gil -c cudaProfilerApi --cuda-graph-trace node --capture-range-end=stop --gpu-metrics-devices=none" fi + +${nsys_prefix} trtllm-llmapi-launch ${numa_bind_cmd} \ + trtllm-serve ${model_path} \ + --host $(hostname) --port ${port} \ + --extra_llm_api_options ${config_file} From 923f6b13aad48efb8e621d29e1e3db2471b7d1c2 Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Thu, 27 Nov 2025 01:08:31 -0800 Subject: [PATCH 05/10] Minor fixes Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> --- examples/disaggregated/slurm/benchmark/config.yaml | 2 +- examples/disaggregated/slurm/benchmark/submit.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/disaggregated/slurm/benchmark/config.yaml b/examples/disaggregated/slurm/benchmark/config.yaml 
index 7ba8ed4e3f2..c15748fe936 100644 --- a/examples/disaggregated/slurm/benchmark/config.yaml +++ b/examples/disaggregated/slurm/benchmark/config.yaml @@ -5,7 +5,7 @@ slurm: account: "" job_time: "02:00:00" job_name: "" - extra_args: "" + extra_args: "" # Cluster specific arguments, e.g. "--gres=gpu:4 --exclude=node1,node2" numa_bind: true # Only enable for GB200 NVL72 # Benchmark Mode diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py index 8f1d297b08d..7cfa377de5b 100644 --- a/examples/disaggregated/slurm/benchmark/submit.py +++ b/examples/disaggregated/slurm/benchmark/submit.py @@ -135,7 +135,6 @@ def submit_job(config): cmd = [ 'sbatch', f'--partition={slurm_config["partition"]}', - f'--gres=gpu:{hw_config["gpus_per_node"]}', f'--account={slurm_config["account"]}', f'--time={slurm_config["job_time"]}', f'--job-name={slurm_config["job_name"]}', @@ -143,7 +142,7 @@ def submit_job(config): f'--ntasks={total_tasks}', f'--ntasks-per-node={hw_config["gpus_per_node"]}', f'--segment={total_nodes}', - slurm_config['extra_args'], + *([arg for arg in slurm_config['extra_args'].split() if arg]), slurm_config['script_file'], # Hardware configuration str(hw_config['gpus_per_node']), From 14d17029ae30452b4408e02e21cb988c9aa710ea Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Thu, 27 Nov 2025 19:09:52 -0800 Subject: [PATCH 06/10] Support custom log dir Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> --- .../disaggregated/slurm/benchmark/submit.py | 44 +++++++++++-------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py index 7cfa377de5b..012bae3634e 100644 --- a/examples/disaggregated/slurm/benchmark/submit.py +++ b/examples/disaggregated/slurm/benchmark/submit.py @@ -6,6 +6,7 @@ import shutil import subprocess import sys +from datetime 
import datetime import yaml @@ -22,6 +23,10 @@ def parse_args(): '--dir', type=str, help='Directory containing YAML configuration files') + group.add_argument('--log-dir', + type=str, + default=None, + help='Log directory') return parser.parse_args() @@ -45,7 +50,7 @@ def calculate_nodes(world_size, num_servers, gpus_per_node): return (world_size + gpus_per_node - 1) // gpus_per_node * num_servers -def submit_job(config): +def submit_job(config, log_dir): # Extract configurations slurm_config = config['slurm'] hw_config = config['hardware'] @@ -101,25 +106,28 @@ def submit_job(config): gen_enable_attention_dp = config['worker_config']['gen'][ 'enable_attention_dp'] - # Create base log directory path - log_base = os.path.join(env_config['work_dir'], f"{isl}-{osl}") + if log_dir is None: + # Create base log directory path + date_prefix = datetime.now().strftime("%Y%m%d") + log_base = os.path.join(env_config['work_dir'], f"{date_prefix}/{isl}-{osl}") - # Get eplb num_slots for gen worker - load_balancer_config = config['worker_config']['gen'].get( - 'moe_config', {}).get('load_balancer', {}) - if isinstance(load_balancer_config, str): - with open(load_balancer_config, 'r') as f: - load_balancer_config = yaml.safe_load(f) - eplb_num_slots = load_balancer_config.get('num_slots', 0) + # Get eplb num_slots for gen worker + load_balancer_config = config['worker_config']['gen'].get( + 'moe_config', {}).get('load_balancer', {}) + if isinstance(load_balancer_config, str): + with open(load_balancer_config, 'r') as f: + load_balancer_config = yaml.safe_load(f) + eplb_num_slots = load_balancer_config.get('num_slots', 0) - # Determine directory suffix based on attention_dp - if gen_enable_attention_dp: - dir_suffix = f"ctx{ctx_num}_gen{gen_num}_dep{gen_tp_size}_batch{gen_batch_size}_eplb{eplb_num_slots}_mtp{mtp_size}" - else: - dir_suffix = f"ctx{ctx_num}_gen{gen_num}_tep{gen_tp_size}_batch{gen_batch_size}_eplb{eplb_num_slots}_mtp{mtp_size}" + # Determine directory suffix based 
on attention_dp + if gen_enable_attention_dp: + dir_suffix = f"ctx{ctx_num}_gen{gen_num}_dep{gen_tp_size}_batch{gen_batch_size}_eplb{eplb_num_slots}_mtp{mtp_size}" + else: + dir_suffix = f"ctx{ctx_num}_gen{gen_num}_tep{gen_tp_size}_batch{gen_batch_size}_eplb{eplb_num_slots}_mtp{mtp_size}" + + # Create full log directory path + log_dir = os.path.join(log_base, dir_suffix) - # Create full log directory path - log_dir = os.path.join(log_base, dir_suffix) # Remove existing directory if it exists if os.path.exists(log_dir): shutil.rmtree(log_dir) @@ -231,7 +239,7 @@ def main(): print(f"\nProcessing: {config_file}") try: config = load_config(config_file) - submit_job(config) + submit_job(config, args.log_dir) print(f"Successfully submitted job for: {config_file}") except Exception as e: print(f"Error processing {config_file}: {e}", file=sys.stderr) From afed00771c886c80648216a882cdd96c94264a67 Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Thu, 27 Nov 2025 19:22:29 -0800 Subject: [PATCH 07/10] Add some more logs Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> --- examples/disaggregated/slurm/benchmark/submit.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py index 012bae3634e..a4381b34be8 100644 --- a/examples/disaggregated/slurm/benchmark/submit.py +++ b/examples/disaggregated/slurm/benchmark/submit.py @@ -130,8 +130,10 @@ def submit_job(config, log_dir): # Remove existing directory if it exists if os.path.exists(log_dir): + print(f"[WARNING] Removing existing log directory: {log_dir}") shutil.rmtree(log_dir) os.makedirs(log_dir) + print(f"Log will be saved to: {log_dir}") # Setup config file paths and save worker configs ctx_config_path = os.path.join(log_dir, 'ctx_config.yaml') @@ -236,11 +238,11 @@ def main(): # Process each config file for config_file in config_files: - 
print(f"\nProcessing: {config_file}") + print(f"Processing: {config_file}") try: config = load_config(config_file) submit_job(config, args.log_dir) - print(f"Successfully submitted job for: {config_file}") + print(f"Successfully submitted job for: {config_file}\n") except Exception as e: print(f"Error processing {config_file}: {e}", file=sys.stderr) # Continue processing other files even if one fails From b2c95eeafc11d8afc03844092383d45007a0ca30 Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Thu, 27 Nov 2025 21:19:39 -0800 Subject: [PATCH 08/10] Backward compatibility Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> --- examples/disaggregated/slurm/benchmark/submit.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py index a4381b34be8..75f6ed927a1 100644 --- a/examples/disaggregated/slurm/benchmark/submit.py +++ b/examples/disaggregated/slurm/benchmark/submit.py @@ -53,6 +53,8 @@ def calculate_nodes(world_size, num_servers, gpus_per_node): def submit_job(config, log_dir): # Extract configurations slurm_config = config['slurm'] + slurm_config.setdefault('extra_args', '') + hw_config = config['hardware'] env_config = config['environment'] @@ -76,6 +78,11 @@ def submit_job(config, log_dir): env_config.setdefault('worker_env_var', '') env_config.setdefault('server_env_var', '') + profiling_config = config.get('profiling', {}) + profiling_config.setdefault('nsys_on', False) + profiling_config.setdefault('ctx_profile_range', '10-30') + profiling_config.setdefault('gen_profile_range', '200-250') + # Get number of servers from config ctx_num = hw_config['num_ctx_servers'] gen_num = hw_config['num_gen_servers'] @@ -192,9 +199,9 @@ def submit_job(config, log_dir): env_config['trtllm_wheel_path'], # Profiling - str(config['profiling']['nsys_on']).lower(), - 
config['profiling']['ctx_profile_range'], - config['profiling']['gen_profile_range'], + str(profiling_config['nsys_on']).lower(), + profiling_config['ctx_profile_range'], + profiling_config['gen_profile_range'], # Accuracy evaluation str(config['accuracy']['enable_accuracy_test']).lower(), From ddad78449aea3f5393a4c42ee1fcfe3b652e9277 Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Thu, 27 Nov 2025 21:22:16 -0800 Subject: [PATCH 09/10] Update Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> --- examples/wide_ep/slurm_scripts/config.yaml | 26 +++++++++++----------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/wide_ep/slurm_scripts/config.yaml b/examples/wide_ep/slurm_scripts/config.yaml index 9deb1eb9a86..12d83248bf1 100644 --- a/examples/wide_ep/slurm_scripts/config.yaml +++ b/examples/wide_ep/slurm_scripts/config.yaml @@ -5,19 +5,14 @@ slurm: account: "" job_time: "02:00:00" job_name: "" + extra_args: "" # Cluster specific arguments, e.g. 
"--gres=gpu:4 --exclude=node1,node2" numa_bind: true # Only enable for GB200 NVL72 -# Hardware Configuration -hardware: - gpus_per_node: 4 # Modify this with your hardware configuration - num_ctx_servers: 2 # Number of context servers - num_gen_servers: 1 # Number of generation servers - # Benchmark Mode benchmark: mode: "e2e" # Options: e2e, gen_only use_nv_sa_benchmark: false # Whether to use NVIDIA SA benchmark script - multi_round: 1 # Number of benchmark rounds + multi_round: 8 # Number of benchmark rounds benchmark_ratio: 0.8 # Benchmark ratio streaming: true # Enable streaming mode concurrency_list: "1024" @@ -25,6 +20,12 @@ benchmark: output_length: 1024 # Output sequence length dataset_file: "" +# Hardware Configuration +hardware: + gpus_per_node: 4 # Modify this with your hardware configuration + num_ctx_servers: 1 # Number of context servers + num_gen_servers: 1 # Number of generation servers + # Environment Configuration environment: container_mount: "" # Format: path1:path1,path2:path2 @@ -34,24 +35,24 @@ environment: build_wheel: false # Don't build the wheel when launching multiple jobs trtllm_wheel_path: "" # Path to pre-built TensorRT-LLM wheel. 
If provided, install from this wheel instead work_dir: "" - worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1" # Environment variables for workers - server_env_var: "" # Environment variables for server + worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes" + server_env_var: "TRTLLM_SERVER_DISABLE_GC=1" # Profiling Configuration profiling: nsys_on: false # Set to true to enable profiling + ctx_profile_range: "10-30" # Set TLLM_PROFILE_START_STOP for ctx workers + gen_profile_range: "200-250" # Set TLLM_PROFILE_START_STOP for gen workers # Accuracy Configuration accuracy: enable_accuracy_test: false # Set to true to enable accuracy evaluation model: "local-completions" # Model type for lm_eval tasks: "gsm8k" # Evaluation tasks (comma-separated) - model_args_extra: "num_concurrent=512,max_retries=3,tokenized_requests=False,timeout=1200,max_gen_toks=256,max_length=512" # Extra model arguments for lm_eval + model_args_extra: "num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096" # Extra model arguments for lm_eval -# Worker Configuration worker_config: gen: - enable_layerwise_nvtx_marker: true tensor_parallel_size: 32 moe_expert_parallel_size: 32 enable_attention_dp: true @@ -97,7 +98,6 @@ worker_config: decoding_type: MTP num_nextn_predict_layers: 3 ctx: - enable_layerwise_nvtx_marker: true max_batch_size: 1 max_num_tokens: 8448 max_seq_len: 8212 From d9fce1ada0b5292454583281a1058bc28f232e95 Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Thu, 27 Nov 2025 23:53:38 -0800 Subject: [PATCH 10/10] Fix style Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> --- examples/disaggregated/slurm/benchmark/submit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/disaggregated/slurm/benchmark/submit.py 
b/examples/disaggregated/slurm/benchmark/submit.py index 75f6ed927a1..9aa00356d2c 100644 --- a/examples/disaggregated/slurm/benchmark/submit.py +++ b/examples/disaggregated/slurm/benchmark/submit.py @@ -116,7 +116,8 @@ def submit_job(config, log_dir): if log_dir is None: # Create base log directory path date_prefix = datetime.now().strftime("%Y%m%d") - log_base = os.path.join(env_config['work_dir'], f"{date_prefix}/{isl}-{osl}") + log_base = os.path.join(env_config['work_dir'], + f"{date_prefix}/{isl}-{osl}") # Get eplb num_slots for gen worker load_balancer_config = config['worker_config']['gen'].get(