From 1153cd24f6534b9b7cc7e055d962d084856294c2 Mon Sep 17 00:00:00 2001
From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Date: Wed, 26 Nov 2025 02:02:32 -0800
Subject: [PATCH 01/10] Clean up PDL logic

Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
---
 .../disaggregated/slurm/benchmark/config.yaml    |  4 ++--
 .../slurm/benchmark/disaggr_torch.slurm          | 15 ++-------------
 .../slurm/benchmark/start_worker.sh              | 16 +++++-----------
 3 files changed, 9 insertions(+), 26 deletions(-)

diff --git a/examples/disaggregated/slurm/benchmark/config.yaml b/examples/disaggregated/slurm/benchmark/config.yaml
index dd8705ce629..af9396dc276 100644
--- a/examples/disaggregated/slurm/benchmark/config.yaml
+++ b/examples/disaggregated/slurm/benchmark/config.yaml
@@ -34,8 +34,8 @@ environment:
   build_wheel: false  # Don't build the wheel when launching multiple jobs
   trtllm_wheel_path: ""  # Path to pre-built TensorRT-LLM wheel. If provided, install from this wheel instead
   work_dir: "<full_path_to_work_dir>"
-  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1"
-  server_env_var: ""
+  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
+  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
 
 # Profiling Configuration
 profiling:
diff --git a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm
index ef308a822a3..4ad8142d0a5 100644
--- a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm
+++ b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm
@@ -169,17 +169,6 @@ elif [ -d "${trtllm_repo}" ]; then
     echo "TensorRT-LLM installation completed successfully"
 fi
 
-# Get enable_pdl from gen config
-enable_pdl=$(python3 -c "import yaml; import sys;
-try:
-    with open('${gen_config_path}') as f:
-        c = yaml.safe_load(f)
-        print(str(not c.get('enable_attention_dp', True)).lower())
-except Exception as e:
-    print(f'Error reading config: {e}', file=sys.stderr)
-    sys.exit(1)
-")
-
 # Get node lists
 all_nodes=($(scontrol show hostname $SLURM_NODELIST | sort))
 total_nodes_num=${#all_nodes[@]}
@@ -211,7 +200,7 @@ for i in $(seq 0 $((num_gen_servers - 1))); do
         --container-mounts=${container_mount} \
         --mpi=pmix \
         bash ${work_dir}/start_worker.sh \
-        "GEN" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${enable_pdl}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${gen_config_path}" "${worker_env_var}" \
+        "GEN" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${gen_config_path}" "${worker_env_var}" \
         &> ${full_logdir}/output_gen_${i}.log &
 done
 
@@ -226,7 +215,7 @@ for i in $(seq 0 $((num_ctx_servers - 1))); do
         --container-mounts=${container_mount} \
         --mpi=pmix \
         bash ${work_dir}/start_worker.sh \
-        "CTX" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${enable_pdl}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${ctx_config_path}" "${worker_env_var}" \
+        "CTX" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${ctx_config_path}" "${worker_env_var}" \
         &> ${full_logdir}/output_ctx_${i}.log &
 done
 
diff --git a/examples/disaggregated/slurm/benchmark/start_worker.sh b/examples/disaggregated/slurm/benchmark/start_worker.sh
index 5543035ee6f..046e8d84bd0 100644
--- a/examples/disaggregated/slurm/benchmark/start_worker.sh
+++ b/examples/disaggregated/slurm/benchmark/start_worker.sh
@@ -9,15 +9,13 @@ model_path=${3}
 port=${4}
 benchmark_mode=${5}
 concurrency=${6}
-enable_pdl=${7}
-numa_bind=${8}
-log_dir=${9}
-enable_nsys=${10}
-config_file=${11}
-worker_env_var=${12}
+numa_bind=${7}
+log_dir=${8}
+enable_nsys=${9}
+config_file=${10}
+worker_env_var=${11}
 
 unset UCX_TLS
-echo "enable_pdl: ${enable_pdl}, log_dir: ${log_dir}"
 echo "SLURM_PROCID: ${SLURM_PROCID}, hostname: $(hostname), instance_id: ${instance_id}"
 
 # Export worker environment variables from config
@@ -26,10 +24,6 @@ for env_var in ${worker_env_var}; do
     echo "Exported: ${env_var}"
 done
 
-if [ "${enable_pdl}" = "true" ]; then
-    export TRTLLM_ENABLE_PDL=1
-fi
-
 if [ "${numa_bind}" = "true" ]; then
     numa_bind_cmd="numactl -m 0,1"
     echo "numactl -m 0,1 - Only allocate memory from nodes on GB200"

From 66566dd3a3d91f18f0166114e0c1c842f3cf496b Mon Sep 17 00:00:00 2001
From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Date: Wed, 26 Nov 2025 04:17:07 -0800
Subject: [PATCH 02/10] Add extra args

Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
---
 examples/disaggregated/slurm/benchmark/config.yaml | 1 +
 examples/disaggregated/slurm/benchmark/submit.py   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/examples/disaggregated/slurm/benchmark/config.yaml b/examples/disaggregated/slurm/benchmark/config.yaml
index af9396dc276..85dcf7d6895 100644
--- a/examples/disaggregated/slurm/benchmark/config.yaml
+++ b/examples/disaggregated/slurm/benchmark/config.yaml
@@ -5,6 +5,7 @@ slurm:
   account: "<account>"
   job_time: "02:00:00"
   job_name: "<job_name>"
+  extra_args: ""
   numa_bind: true # Only enable for GB200 NVL72
 
 # Benchmark Mode
diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py
index 37efd0764b3..60fab020ba7 100644
--- a/examples/disaggregated/slurm/benchmark/submit.py
+++ b/examples/disaggregated/slurm/benchmark/submit.py
@@ -143,6 +143,7 @@ def submit_job(config):
         f'--ntasks={total_tasks}',
         f'--ntasks-per-node={hw_config["gpus_per_node"]}',
         f'--segment={total_nodes}',
+        slurm_config['extra_args'],
         slurm_config['script_file'],
         # Hardware configuration
         str(hw_config['gpus_per_node']),

From 40d274d7eb72eec5e2e8d8c832385d4d21e299f6 Mon Sep 17 00:00:00 2001
From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Date: Thu, 27 Nov 2025 00:23:33 -0800
Subject: [PATCH 03/10] Support iter range

Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
---
 .../disaggregated/slurm/benchmark/config.yaml |  2 ++
 .../slurm/benchmark/disaggr_torch.slurm       | 20 +++++++++++--------
 .../slurm/benchmark/start_worker.sh           | 14 +++++--------
 .../disaggregated/slurm/benchmark/submit.py   |  2 ++
 4 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/examples/disaggregated/slurm/benchmark/config.yaml b/examples/disaggregated/slurm/benchmark/config.yaml
index 85dcf7d6895..7ba8ed4e3f2 100644
--- a/examples/disaggregated/slurm/benchmark/config.yaml
+++ b/examples/disaggregated/slurm/benchmark/config.yaml
@@ -41,6 +41,8 @@ environment:
 # Profiling Configuration
 profiling:
   nsys_on: false  # Set to true to enable profiling
+  ctx_profile_range: "10-30"  # Set TLLM_PROFILE_START_STOP for ctx workers
+  gen_profile_range: "200-250"  # Set TLLM_PROFILE_START_STOP for gen workers
 
 # Accuracy Configuration
 accuracy:
diff --git a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm
index 4ad8142d0a5..83baa3ffeda 100644
--- a/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm
+++ b/examples/disaggregated/slurm/benchmark/disaggr_torch.slurm
@@ -40,18 +40,20 @@ trtllm_wheel_path=${28}
 
 # Profiling
 nsys_on=${29}
+ctx_profile_range=${30}
+gen_profile_range=${31}
 
 # Accuracy evaluation
-enable_accuracy_test=${30}
-accuracy_model=${31}
-accuracy_tasks=${32}
-model_args_extra=${33}
+enable_accuracy_test=${32}
+accuracy_model=${33}
+accuracy_tasks=${34}
+model_args_extra=${35}
 
 # Worker environment variables
-worker_env_var=${34}
+worker_env_var=${36}
 
 # Server environment variables
-server_env_var=${35}
+server_env_var=${37}
 
 # Print all parsed arguments
 echo "Parsed arguments:"
@@ -90,6 +92,8 @@ echo "  build_wheel: ${build_wheel}"
 echo "  trtllm_wheel_path: ${trtllm_wheel_path}"
 echo "  work_dir: ${work_dir}"
 echo "  nsys_on: ${nsys_on}"
+echo "  ctx_profile_range: ${ctx_profile_range}"
+echo "  gen_profile_range: ${gen_profile_range}"
 echo
 echo "Accuracy Configuration:"
 echo "  enable_accuracy_test: ${enable_accuracy_test}"
@@ -200,7 +204,7 @@ for i in $(seq 0 $((num_gen_servers - 1))); do
         --container-mounts=${container_mount} \
         --mpi=pmix \
         bash ${work_dir}/start_worker.sh \
-        "GEN" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${gen_config_path}" "${worker_env_var}" \
+        "GEN" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${gen_profile_range}" "${gen_config_path}" "${worker_env_var}" \
         &> ${full_logdir}/output_gen_${i}.log &
 done
 
@@ -215,7 +219,7 @@ for i in $(seq 0 $((num_ctx_servers - 1))); do
         --container-mounts=${container_mount} \
         --mpi=pmix \
         bash ${work_dir}/start_worker.sh \
-        "CTX" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${ctx_config_path}" "${worker_env_var}" \
+        "CTX" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${ctx_profile_range}" "${ctx_config_path}" "${worker_env_var}" \
         &> ${full_logdir}/output_ctx_${i}.log &
 done
 
diff --git a/examples/disaggregated/slurm/benchmark/start_worker.sh b/examples/disaggregated/slurm/benchmark/start_worker.sh
index 046e8d84bd0..5d8d95cd56d 100644
--- a/examples/disaggregated/slurm/benchmark/start_worker.sh
+++ b/examples/disaggregated/slurm/benchmark/start_worker.sh
@@ -12,8 +12,9 @@ concurrency=${6}
 numa_bind=${7}
 log_dir=${8}
 enable_nsys=${9}
-config_file=${10}
-worker_env_var=${11}
+profile_range=${10}
+config_file=${11}
+worker_env_var=${12}
 
 unset UCX_TLS
 echo "SLURM_PROCID: ${SLURM_PROCID}, hostname: $(hostname), instance_id: ${instance_id}"
@@ -58,13 +59,8 @@ else
     export TLLM_PROFILE_RECORD_GC=1
     export TLLM_NVTX_DEBUG=1
     nsys_prefix="nsys profile -e \"NSYS_MPI_STORE_TEAMS_PER_RANK=1\" -o ${nsys_file} -f true -t cuda,nvtx,python-gil -c cudaProfilerApi --cuda-graph-trace node --capture-range-end=stop --gpu-metrics-devices=none"
-    if [ "${role}" = "GEN" ]; then
-        export TLLM_PROFILE_START_STOP=200-250
-        echo "nsys is enabled on gen_gpus"
-    elif [ "${role}" = "CTX" ]; then
-        export TLLM_PROFILE_START_STOP=10-30
-        echo "nsys is enabled on ctx_gpus"
-    fi
+    export TLLM_PROFILE_START_STOP=${profile_range}
+    echo "nsys is enabled on ${role} GPUs, TLLM_PROFILE_START_STOP=${profile_range}"
     ${nsys_prefix} trtllm-llmapi-launch ${numa_bind_cmd} \
         trtllm-serve ${model_path} \
             --host $(hostname) --port ${port} \
diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py
index 60fab020ba7..8f1d297b08d 100644
--- a/examples/disaggregated/slurm/benchmark/submit.py
+++ b/examples/disaggregated/slurm/benchmark/submit.py
@@ -184,6 +184,8 @@ def submit_job(config):
 
         # Profiling
         str(config['profiling']['nsys_on']).lower(),
+        config['profiling']['ctx_profile_range'],
+        config['profiling']['gen_profile_range'],
 
         # Accuracy evaluation
         str(config['accuracy']['enable_accuracy_test']).lower(),

From 6de0ed39a23844ceedd0a627317e68b2ce476b38 Mon Sep 17 00:00:00 2001
From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Date: Thu, 27 Nov 2025 00:28:16 -0800
Subject: [PATCH 04/10] Polish

Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
---
 .../slurm/benchmark/start_worker.sh           | 20 +++++++++----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/examples/disaggregated/slurm/benchmark/start_worker.sh b/examples/disaggregated/slurm/benchmark/start_worker.sh
index 5d8d95cd56d..a8576725c0e 100644
--- a/examples/disaggregated/slurm/benchmark/start_worker.sh
+++ b/examples/disaggregated/slurm/benchmark/start_worker.sh
@@ -40,29 +40,27 @@ fi
 
 echo "config_file: ${config_file}"
 
-# save the hostname to a file
-
-# if SLURM_NODEID is 0
+# if SLURM_NODEID is 0, save the hostname to a file
 if [ "${SLURM_NODEID}" = "0" ]; then
     mkdir -p ${log_dir}/hostnames/
     echo $(hostname) > ${log_dir}/hostnames/${role}_${instance_id}.txt
     echo "hostname saved to ${log_dir}/hostnames/${role}_${instance_id}.txt"
 fi
 
-#check if nsys is enabled
+nsys_prefix=""
 if [ "${enable_nsys}" != "true" ]; then
     echo "nsys is not enabled, start normal flow"
-    trtllm-llmapi-launch ${numa_bind_cmd} trtllm-serve ${model_path} --host $(hostname) --port ${port} --extra_llm_api_options ${config_file}
 else
-    nsys_prefix=""
     nsys_file=${log_dir}/nsys_worker_proc_${role}_${instance_id}_${SLURM_PROCID}
     export TLLM_PROFILE_RECORD_GC=1
     export TLLM_NVTX_DEBUG=1
-    nsys_prefix="nsys profile -e \"NSYS_MPI_STORE_TEAMS_PER_RANK=1\" -o ${nsys_file} -f true -t cuda,nvtx,python-gil -c cudaProfilerApi --cuda-graph-trace node --capture-range-end=stop --gpu-metrics-devices=none"
+    export NSYS_MPI_STORE_TEAMS_PER_RANK=1
     export TLLM_PROFILE_START_STOP=${profile_range}
     echo "nsys is enabled on ${role} GPUs, TLLM_PROFILE_START_STOP=${profile_range}"
-    ${nsys_prefix} trtllm-llmapi-launch ${numa_bind_cmd} \
-        trtllm-serve ${model_path} \
-            --host $(hostname) --port ${port} \
-            --extra_llm_api_options ${config_file}
+    nsys_prefix="nsys profile -o ${nsys_file} -f true -t cuda,nvtx,python-gil -c cudaProfilerApi --cuda-graph-trace node --capture-range-end=stop --gpu-metrics-devices=none"
 fi
+
+${nsys_prefix} trtllm-llmapi-launch ${numa_bind_cmd} \
+    trtllm-serve ${model_path} \
+        --host $(hostname) --port ${port} \
+        --extra_llm_api_options ${config_file}

From 923f6b13aad48efb8e621d29e1e3db2471b7d1c2 Mon Sep 17 00:00:00 2001
From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Date: Thu, 27 Nov 2025 01:08:31 -0800
Subject: [PATCH 05/10] Minor fixes

Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
---
 examples/disaggregated/slurm/benchmark/config.yaml | 2 +-
 examples/disaggregated/slurm/benchmark/submit.py   | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/examples/disaggregated/slurm/benchmark/config.yaml b/examples/disaggregated/slurm/benchmark/config.yaml
index 7ba8ed4e3f2..c15748fe936 100644
--- a/examples/disaggregated/slurm/benchmark/config.yaml
+++ b/examples/disaggregated/slurm/benchmark/config.yaml
@@ -5,7 +5,7 @@ slurm:
   account: "<account>"
   job_time: "02:00:00"
   job_name: "<job_name>"
-  extra_args: ""
+  extra_args: "" # Cluster specific arguments, e.g. "--gres=gpu:4 --exclude=node1,node2"
   numa_bind: true # Only enable for GB200 NVL72
 
 # Benchmark Mode
diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py
index 8f1d297b08d..7cfa377de5b 100644
--- a/examples/disaggregated/slurm/benchmark/submit.py
+++ b/examples/disaggregated/slurm/benchmark/submit.py
@@ -135,7 +135,6 @@ def submit_job(config):
     cmd = [
         'sbatch',
         f'--partition={slurm_config["partition"]}',
-        f'--gres=gpu:{hw_config["gpus_per_node"]}',
         f'--account={slurm_config["account"]}',
         f'--time={slurm_config["job_time"]}',
         f'--job-name={slurm_config["job_name"]}',
@@ -143,7 +142,7 @@ def submit_job(config):
         f'--ntasks={total_tasks}',
         f'--ntasks-per-node={hw_config["gpus_per_node"]}',
         f'--segment={total_nodes}',
-        slurm_config['extra_args'],
+        *([arg for arg in slurm_config['extra_args'].split() if arg]),
         slurm_config['script_file'],
         # Hardware configuration
         str(hw_config['gpus_per_node']),

From 14d17029ae30452b4408e02e21cb988c9aa710ea Mon Sep 17 00:00:00 2001
From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Date: Thu, 27 Nov 2025 19:09:52 -0800
Subject: [PATCH 06/10] Support custom log dir

Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
---
 .../disaggregated/slurm/benchmark/submit.py   | 44 +++++++++++--------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py
index 7cfa377de5b..012bae3634e 100644
--- a/examples/disaggregated/slurm/benchmark/submit.py
+++ b/examples/disaggregated/slurm/benchmark/submit.py
@@ -6,6 +6,7 @@
 import shutil
 import subprocess
 import sys
+from datetime import datetime
 
 import yaml
 
@@ -22,6 +23,10 @@ def parse_args():
                        '--dir',
                        type=str,
                        help='Directory containing YAML configuration files')
+    group.add_argument('--log-dir',
+                       type=str,
+                       default=None,
+                       help='Log directory')
     return parser.parse_args()
 
 
@@ -45,7 +50,7 @@ def calculate_nodes(world_size, num_servers, gpus_per_node):
     return (world_size + gpus_per_node - 1) // gpus_per_node * num_servers
 
 
-def submit_job(config):
+def submit_job(config, log_dir):
     # Extract configurations
     slurm_config = config['slurm']
     hw_config = config['hardware']
@@ -101,25 +106,28 @@ def submit_job(config):
     gen_enable_attention_dp = config['worker_config']['gen'][
         'enable_attention_dp']
 
-    # Create base log directory path
-    log_base = os.path.join(env_config['work_dir'], f"{isl}-{osl}")
+    if log_dir is None:
+        # Create base log directory path
+        date_prefix = datetime.now().strftime("%Y%m%d")
+        log_base = os.path.join(env_config['work_dir'], f"{date_prefix}/{isl}-{osl}")
 
-    # Get eplb num_slots for gen worker
-    load_balancer_config = config['worker_config']['gen'].get(
-        'moe_config', {}).get('load_balancer', {})
-    if isinstance(load_balancer_config, str):
-        with open(load_balancer_config, 'r') as f:
-            load_balancer_config = yaml.safe_load(f)
-    eplb_num_slots = load_balancer_config.get('num_slots', 0)
+        # Get eplb num_slots for gen worker
+        load_balancer_config = config['worker_config']['gen'].get(
+            'moe_config', {}).get('load_balancer', {})
+        if isinstance(load_balancer_config, str):
+            with open(load_balancer_config, 'r') as f:
+                load_balancer_config = yaml.safe_load(f)
+        eplb_num_slots = load_balancer_config.get('num_slots', 0)
 
-    # Determine directory suffix based on attention_dp
-    if gen_enable_attention_dp:
-        dir_suffix = f"ctx{ctx_num}_gen{gen_num}_dep{gen_tp_size}_batch{gen_batch_size}_eplb{eplb_num_slots}_mtp{mtp_size}"
-    else:
-        dir_suffix = f"ctx{ctx_num}_gen{gen_num}_tep{gen_tp_size}_batch{gen_batch_size}_eplb{eplb_num_slots}_mtp{mtp_size}"
+        # Determine directory suffix based on attention_dp
+        if gen_enable_attention_dp:
+            dir_suffix = f"ctx{ctx_num}_gen{gen_num}_dep{gen_tp_size}_batch{gen_batch_size}_eplb{eplb_num_slots}_mtp{mtp_size}"
+        else:
+            dir_suffix = f"ctx{ctx_num}_gen{gen_num}_tep{gen_tp_size}_batch{gen_batch_size}_eplb{eplb_num_slots}_mtp{mtp_size}"
+
+        # Create full log directory path
+        log_dir = os.path.join(log_base, dir_suffix)
 
-    # Create full log directory path
-    log_dir = os.path.join(log_base, dir_suffix)
     # Remove existing directory if it exists
     if os.path.exists(log_dir):
         shutil.rmtree(log_dir)
@@ -231,7 +239,7 @@ def main():
         print(f"\nProcessing: {config_file}")
         try:
             config = load_config(config_file)
-            submit_job(config)
+            submit_job(config, args.log_dir)
             print(f"Successfully submitted job for: {config_file}")
         except Exception as e:
             print(f"Error processing {config_file}: {e}", file=sys.stderr)

From afed00771c886c80648216a882cdd96c94264a67 Mon Sep 17 00:00:00 2001
From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Date: Thu, 27 Nov 2025 19:22:29 -0800
Subject: [PATCH 07/10] Add some more logs

Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
---
 examples/disaggregated/slurm/benchmark/submit.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py
index 012bae3634e..a4381b34be8 100644
--- a/examples/disaggregated/slurm/benchmark/submit.py
+++ b/examples/disaggregated/slurm/benchmark/submit.py
@@ -130,8 +130,10 @@ def submit_job(config, log_dir):
 
     # Remove existing directory if it exists
     if os.path.exists(log_dir):
+        print(f"[WARNING] Removing existing log directory: {log_dir}")
         shutil.rmtree(log_dir)
     os.makedirs(log_dir)
+    print(f"Log will be saved to: {log_dir}")
 
     # Setup config file paths and save worker configs
     ctx_config_path = os.path.join(log_dir, 'ctx_config.yaml')
@@ -236,11 +238,11 @@ def main():
 
     # Process each config file
     for config_file in config_files:
-        print(f"\nProcessing: {config_file}")
+        print(f"Processing: {config_file}")
         try:
             config = load_config(config_file)
             submit_job(config, args.log_dir)
-            print(f"Successfully submitted job for: {config_file}")
+            print(f"Successfully submitted job for: {config_file}\n")
         except Exception as e:
             print(f"Error processing {config_file}: {e}", file=sys.stderr)
             # Continue processing other files even if one fails

From b2c95eeafc11d8afc03844092383d45007a0ca30 Mon Sep 17 00:00:00 2001
From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Date: Thu, 27 Nov 2025 21:19:39 -0800
Subject: [PATCH 08/10] Backward compatibility

Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
---
 examples/disaggregated/slurm/benchmark/submit.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py
index a4381b34be8..75f6ed927a1 100644
--- a/examples/disaggregated/slurm/benchmark/submit.py
+++ b/examples/disaggregated/slurm/benchmark/submit.py
@@ -53,6 +53,8 @@ def calculate_nodes(world_size, num_servers, gpus_per_node):
 def submit_job(config, log_dir):
     # Extract configurations
     slurm_config = config['slurm']
+    slurm_config.setdefault('extra_args', '')
+
     hw_config = config['hardware']
     env_config = config['environment']
 
@@ -76,6 +78,11 @@ def submit_job(config, log_dir):
     env_config.setdefault('worker_env_var', '')
     env_config.setdefault('server_env_var', '')
 
+    profiling_config = config.get('profiling', {})
+    profiling_config.setdefault('nsys_on', False)
+    profiling_config.setdefault('ctx_profile_range', '10-30')
+    profiling_config.setdefault('gen_profile_range', '200-250')
+
     # Get number of servers from config
     ctx_num = hw_config['num_ctx_servers']
     gen_num = hw_config['num_gen_servers']
@@ -192,9 +199,9 @@ def submit_job(config, log_dir):
         env_config['trtllm_wheel_path'],
 
         # Profiling
-        str(config['profiling']['nsys_on']).lower(),
-        config['profiling']['ctx_profile_range'],
-        config['profiling']['gen_profile_range'],
+        str(profiling_config['nsys_on']).lower(),
+        profiling_config['ctx_profile_range'],
+        profiling_config['gen_profile_range'],
 
         # Accuracy evaluation
         str(config['accuracy']['enable_accuracy_test']).lower(),

From ddad78449aea3f5393a4c42ee1fcfe3b652e9277 Mon Sep 17 00:00:00 2001
From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Date: Thu, 27 Nov 2025 21:22:16 -0800
Subject: [PATCH 09/10] Update wide_ep slurm config

Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
---
 examples/wide_ep/slurm_scripts/config.yaml | 26 +++++++++++-----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/examples/wide_ep/slurm_scripts/config.yaml b/examples/wide_ep/slurm_scripts/config.yaml
index 9deb1eb9a86..12d83248bf1 100644
--- a/examples/wide_ep/slurm_scripts/config.yaml
+++ b/examples/wide_ep/slurm_scripts/config.yaml
@@ -5,19 +5,14 @@ slurm:
   account: "<account>"
   job_time: "02:00:00"
   job_name: "<job_name>"
+  extra_args: "" # Cluster specific arguments, e.g. "--gres=gpu:4 --exclude=node1,node2"
   numa_bind: true # Only enable for GB200 NVL72
 
-# Hardware Configuration
-hardware:
-  gpus_per_node: 4  # Modify this with your hardware configuration
-  num_ctx_servers: 2  # Number of context servers
-  num_gen_servers: 1  # Number of generation servers
-
 # Benchmark Mode
 benchmark:
   mode: "e2e"  # Options: e2e, gen_only
   use_nv_sa_benchmark: false  # Whether to use NVIDIA SA benchmark script
-  multi_round: 1  # Number of benchmark rounds
+  multi_round: 8  # Number of benchmark rounds
   benchmark_ratio: 0.8  # Benchmark ratio
   streaming: true  # Enable streaming mode
   concurrency_list: "1024"
@@ -25,6 +20,12 @@ benchmark:
   output_length: 1024  # Output sequence length
   dataset_file: "<dataset_file>"
 
+# Hardware Configuration
+hardware:
+  gpus_per_node: 4  # Modify this with your hardware configuration
+  num_ctx_servers: 1  # Number of context servers
+  num_gen_servers: 1  # Number of generation servers
+
 # Environment Configuration
 environment:
   container_mount: "<container_mount>"  # Format: path1:path1,path2:path2
@@ -34,24 +35,24 @@ environment:
   build_wheel: false  # Don't build the wheel when launching multiple jobs
   trtllm_wheel_path: ""  # Path to pre-built TensorRT-LLM wheel. If provided, install from this wheel instead
   work_dir: "<full_path_to_work_dir>"
-  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1"  # Environment variables for workers
-  server_env_var: ""  # Environment variables for server
+  worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
+  server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"
 
 # Profiling Configuration
 profiling:
   nsys_on: false  # Set to true to enable profiling
+  ctx_profile_range: "10-30"  # Set TLLM_PROFILE_START_STOP for ctx workers
+  gen_profile_range: "200-250"  # Set TLLM_PROFILE_START_STOP for gen workers
 
 # Accuracy Configuration
 accuracy:
   enable_accuracy_test: false  # Set to true to enable accuracy evaluation
   model: "local-completions"  # Model type for lm_eval
   tasks: "gsm8k"  # Evaluation tasks (comma-separated)
-  model_args_extra: "num_concurrent=512,max_retries=3,tokenized_requests=False,timeout=1200,max_gen_toks=256,max_length=512"  # Extra model arguments for lm_eval
+  model_args_extra: "num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096"  # Extra model arguments for lm_eval
 
-# Worker Configuration
 worker_config:
   gen:
-    enable_layerwise_nvtx_marker: true
     tensor_parallel_size: 32
     moe_expert_parallel_size: 32
     enable_attention_dp: true
@@ -97,7 +98,6 @@ worker_config:
       decoding_type: MTP
       num_nextn_predict_layers: 3
   ctx:
-    enable_layerwise_nvtx_marker: true
     max_batch_size: 1
     max_num_tokens: 8448
     max_seq_len: 8212

From d9fce1ada0b5292454583281a1058bc28f232e95 Mon Sep 17 00:00:00 2001
From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Date: Thu, 27 Nov 2025 23:53:38 -0800
Subject: [PATCH 10/10] Fix style

Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
---
 examples/disaggregated/slurm/benchmark/submit.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/disaggregated/slurm/benchmark/submit.py b/examples/disaggregated/slurm/benchmark/submit.py
index 75f6ed927a1..9aa00356d2c 100644
--- a/examples/disaggregated/slurm/benchmark/submit.py
+++ b/examples/disaggregated/slurm/benchmark/submit.py
@@ -116,7 +116,8 @@ def submit_job(config, log_dir):
     if log_dir is None:
         # Create base log directory path
         date_prefix = datetime.now().strftime("%Y%m%d")
-        log_base = os.path.join(env_config['work_dir'], f"{date_prefix}/{isl}-{osl}")
+        log_base = os.path.join(env_config['work_dir'],
+                                f"{date_prefix}/{isl}-{osl}")
 
         # Get eplb num_slots for gen worker
         load_balancer_config = config['worker_config']['gen'].get(