Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions examples/disaggregated/slurm/benchmark/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ slurm:
account: "<account>"
job_time: "02:00:00"
job_name: "<job_name>"
extra_args: "" # Cluster specific arguments, e.g. "--gres=gpu:4 --exclude=node1,node2"
numa_bind: true # Only enable for GB200 NVL72

# Benchmark Mode
Expand Down Expand Up @@ -34,12 +35,14 @@ environment:
build_wheel: false # Don't build the wheel when launching multiple jobs
trtllm_wheel_path: "" # Path to pre-built TensorRT-LLM wheel. If provided, install from this wheel instead
work_dir: "<full_path_to_work_dir>"
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1"
server_env_var: ""
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"

# Profiling Configuration
profiling:
nsys_on: false # Set to true to enable profiling
ctx_profile_range: "10-30" # Set TLLM_PROFILE_START_STOP for ctx workers
gen_profile_range: "200-250" # Set TLLM_PROFILE_START_STOP for gen workers

# Accuracy Configuration
accuracy:
Expand Down
31 changes: 12 additions & 19 deletions examples/disaggregated/slurm/benchmark/disaggr_torch.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -40,18 +40,20 @@ trtllm_wheel_path=${28}

# Profiling
nsys_on=${29}
ctx_profile_range=${30}
gen_profile_range=${31}

# Accuracy evaluation
enable_accuracy_test=${30}
accuracy_model=${31}
accuracy_tasks=${32}
model_args_extra=${33}
enable_accuracy_test=${32}
accuracy_model=${33}
accuracy_tasks=${34}
model_args_extra=${35}

# Worker environment variables
worker_env_var=${34}
worker_env_var=${36}

# Server environment variables
server_env_var=${35}
server_env_var=${37}

# Print all parsed arguments
echo "Parsed arguments:"
Expand Down Expand Up @@ -90,6 +92,8 @@ echo " build_wheel: ${build_wheel}"
echo " trtllm_wheel_path: ${trtllm_wheel_path}"
echo " work_dir: ${work_dir}"
echo " nsys_on: ${nsys_on}"
echo " ctx_profile_range: ${ctx_profile_range}"
echo " gen_profile_range: ${gen_profile_range}"
echo
echo "Accuracy Configuration:"
echo " enable_accuracy_test: ${enable_accuracy_test}"
Expand Down Expand Up @@ -169,17 +173,6 @@ elif [ -d "${trtllm_repo}" ]; then
echo "TensorRT-LLM installation completed successfully"
fi

# Get enable_pdl from gen config
enable_pdl=$(python3 -c "import yaml; import sys;
try:
with open('${gen_config_path}') as f:
c = yaml.safe_load(f)
print(str(not c.get('enable_attention_dp', True)).lower())
except Exception as e:
print(f'Error reading config: {e}', file=sys.stderr)
sys.exit(1)
")

# Get node lists
all_nodes=($(scontrol show hostname $SLURM_NODELIST | sort))
total_nodes_num=${#all_nodes[@]}
Expand Down Expand Up @@ -211,7 +204,7 @@ for i in $(seq 0 $((num_gen_servers - 1))); do
--container-mounts=${container_mount} \
--mpi=pmix \
bash ${work_dir}/start_worker.sh \
"GEN" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${enable_pdl}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${gen_config_path}" "${worker_env_var}" \
"GEN" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${gen_profile_range}" "${gen_config_path}" "${worker_env_var}" \
&> ${full_logdir}/output_gen_${i}.log &
done

Expand All @@ -226,7 +219,7 @@ for i in $(seq 0 $((num_ctx_servers - 1))); do
--container-mounts=${container_mount} \
--mpi=pmix \
bash ${work_dir}/start_worker.sh \
"CTX" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${enable_pdl}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${ctx_config_path}" "${worker_env_var}" \
"CTX" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${ctx_profile_range}" "${ctx_config_path}" "${worker_env_var}" \
&> ${full_logdir}/output_ctx_${i}.log &
done

Expand Down
42 changes: 15 additions & 27 deletions examples/disaggregated/slurm/benchmark/start_worker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,14 @@ model_path=${3}
port=${4}
benchmark_mode=${5}
concurrency=${6}
enable_pdl=${7}
numa_bind=${8}
log_dir=${9}
enable_nsys=${10}
numa_bind=${7}
log_dir=${8}
enable_nsys=${9}
profile_range=${10}
config_file=${11}
worker_env_var=${12}

unset UCX_TLS
echo "enable_pdl: ${enable_pdl}, log_dir: ${log_dir}"
echo "SLURM_PROCID: ${SLURM_PROCID}, hostname: $(hostname), instance_id: ${instance_id}"

# Export worker environment variables from config
Expand All @@ -26,10 +25,6 @@ for env_var in ${worker_env_var}; do
echo "Exported: ${env_var}"
done

if [ "${enable_pdl}" = "true" ]; then
export TRTLLM_ENABLE_PDL=1
fi

if [ "${numa_bind}" = "true" ]; then
numa_bind_cmd="numactl -m 0,1"
echo "numactl -m 0,1 - Only allocate memory from nodes on GB200"
Expand All @@ -45,34 +40,27 @@ fi

echo "config_file: ${config_file}"

# save the hostname to a file

# if SLURM_NODEID is 0
# if SLURM_NODEID is 0, save the hostname to a file
if [ "${SLURM_NODEID}" = "0" ]; then
mkdir -p ${log_dir}/hostnames/
echo $(hostname) > ${log_dir}/hostnames/${role}_${instance_id}.txt
echo "hostname saved to ${log_dir}/hostnames/${role}_${instance_id}.txt"
fi

# Check if nsys profiling is enabled
nsys_prefix=""
if [ "${enable_nsys}" != "true" ]; then
echo "nsys is not enabled, start normal flow"
trtllm-llmapi-launch ${numa_bind_cmd} trtllm-serve ${model_path} --host $(hostname) --port ${port} --extra_llm_api_options ${config_file}
else
nsys_prefix=""
nsys_file=${log_dir}/nsys_worker_proc_${role}_${instance_id}_${SLURM_PROCID}
export TLLM_PROFILE_RECORD_GC=1
export TLLM_NVTX_DEBUG=1
nsys_prefix="nsys profile -e \"NSYS_MPI_STORE_TEAMS_PER_RANK=1\" -o ${nsys_file} -f true -t cuda,nvtx,python-gil -c cudaProfilerApi --cuda-graph-trace node --capture-range-end=stop --gpu-metrics-devices=none"
if [ "${role}" = "GEN" ]; then
export TLLM_PROFILE_START_STOP=200-250
echo "nsys is enabled on gen_gpus"
elif [ "${role}" = "CTX" ]; then
export TLLM_PROFILE_START_STOP=10-30
echo "nsys is enabled on ctx_gpus"
fi
${nsys_prefix} trtllm-llmapi-launch ${numa_bind_cmd} \
trtllm-serve ${model_path} \
--host $(hostname) --port ${port} \
--extra_llm_api_options ${config_file}
export NSYS_MPI_STORE_TEAMS_PER_RANK=1
export TLLM_PROFILE_START_STOP=${profile_range}
echo "nsys is enabled on ${role} GPUs, TLLM_PROFILE_START_STOP=${profile_range}"
nsys_prefix="nsys profile -o ${nsys_file} -f true -t cuda,nvtx,python-gil -c cudaProfilerApi --cuda-graph-trace node --capture-range-end=stop --gpu-metrics-devices=none"
fi

${nsys_prefix} trtllm-llmapi-launch ${numa_bind_cmd} \
trtllm-serve ${model_path} \
--host $(hostname) --port ${port} \
--extra_llm_api_options ${config_file}
68 changes: 44 additions & 24 deletions examples/disaggregated/slurm/benchmark/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import shutil
import subprocess
import sys
from datetime import datetime

import yaml

Expand All @@ -22,6 +23,10 @@ def parse_args():
'--dir',
type=str,
help='Directory containing YAML configuration files')
group.add_argument('--log-dir',
type=str,
default=None,
help='Log directory')
return parser.parse_args()


Expand All @@ -45,9 +50,11 @@ def calculate_nodes(world_size, num_servers, gpus_per_node):
return (world_size + gpus_per_node - 1) // gpus_per_node * num_servers


def submit_job(config):
def submit_job(config, log_dir):
# Extract configurations
slurm_config = config['slurm']
slurm_config.setdefault('extra_args', '')

hw_config = config['hardware']
env_config = config['environment']

Expand All @@ -71,6 +78,11 @@ def submit_job(config):
env_config.setdefault('worker_env_var', '')
env_config.setdefault('server_env_var', '')

profiling_config = config.get('profiling', {})
profiling_config.setdefault('nsys_on', False)
profiling_config.setdefault('ctx_profile_range', '10-30')
profiling_config.setdefault('gen_profile_range', '200-250')

# Get number of servers from config
ctx_num = hw_config['num_ctx_servers']
gen_num = hw_config['num_gen_servers']
Expand Down Expand Up @@ -101,29 +113,35 @@ def submit_job(config):
gen_enable_attention_dp = config['worker_config']['gen'][
'enable_attention_dp']

# Create base log directory path
log_base = os.path.join(env_config['work_dir'], f"{isl}-{osl}")

# Get eplb num_slots for gen worker
load_balancer_config = config['worker_config']['gen'].get(
'moe_config', {}).get('load_balancer', {})
if isinstance(load_balancer_config, str):
with open(load_balancer_config, 'r') as f:
load_balancer_config = yaml.safe_load(f)
eplb_num_slots = load_balancer_config.get('num_slots', 0)

# Determine directory suffix based on attention_dp
if gen_enable_attention_dp:
dir_suffix = f"ctx{ctx_num}_gen{gen_num}_dep{gen_tp_size}_batch{gen_batch_size}_eplb{eplb_num_slots}_mtp{mtp_size}"
else:
dir_suffix = f"ctx{ctx_num}_gen{gen_num}_tep{gen_tp_size}_batch{gen_batch_size}_eplb{eplb_num_slots}_mtp{mtp_size}"
if log_dir is None:
# Create base log directory path
date_prefix = datetime.now().strftime("%Y%m%d")
log_base = os.path.join(env_config['work_dir'],
f"{date_prefix}/{isl}-{osl}")

# Get eplb num_slots for gen worker
load_balancer_config = config['worker_config']['gen'].get(
'moe_config', {}).get('load_balancer', {})
if isinstance(load_balancer_config, str):
with open(load_balancer_config, 'r') as f:
load_balancer_config = yaml.safe_load(f)
eplb_num_slots = load_balancer_config.get('num_slots', 0)

# Determine directory suffix based on attention_dp
if gen_enable_attention_dp:
dir_suffix = f"ctx{ctx_num}_gen{gen_num}_dep{gen_tp_size}_batch{gen_batch_size}_eplb{eplb_num_slots}_mtp{mtp_size}"
else:
dir_suffix = f"ctx{ctx_num}_gen{gen_num}_tep{gen_tp_size}_batch{gen_batch_size}_eplb{eplb_num_slots}_mtp{mtp_size}"

# Create full log directory path
log_dir = os.path.join(log_base, dir_suffix)

# Create full log directory path
log_dir = os.path.join(log_base, dir_suffix)
# Remove existing directory if it exists
if os.path.exists(log_dir):
print(f"[WARNING] Removing existing log directory: {log_dir}")
shutil.rmtree(log_dir)
os.makedirs(log_dir)
print(f"Log will be saved to: {log_dir}")

# Setup config file paths and save worker configs
ctx_config_path = os.path.join(log_dir, 'ctx_config.yaml')
Expand All @@ -135,14 +153,14 @@ def submit_job(config):
cmd = [
'sbatch',
f'--partition={slurm_config["partition"]}',
f'--gres=gpu:{hw_config["gpus_per_node"]}',
f'--account={slurm_config["account"]}',
f'--time={slurm_config["job_time"]}',
f'--job-name={slurm_config["job_name"]}',
f'--nodes={total_nodes}',
f'--ntasks={total_tasks}',
f'--ntasks-per-node={hw_config["gpus_per_node"]}',
f'--segment={total_nodes}',
*([arg for arg in slurm_config['extra_args'].split() if arg]),
slurm_config['script_file'],
# Hardware configuration
str(hw_config['gpus_per_node']),
Expand Down Expand Up @@ -182,7 +200,9 @@ def submit_job(config):
env_config['trtllm_wheel_path'],

# Profiling
str(config['profiling']['nsys_on']).lower(),
str(profiling_config['nsys_on']).lower(),
profiling_config['ctx_profile_range'],
profiling_config['gen_profile_range'],

# Accuracy evaluation
str(config['accuracy']['enable_accuracy_test']).lower(),
Expand Down Expand Up @@ -226,11 +246,11 @@ def main():

# Process each config file
for config_file in config_files:
print(f"\nProcessing: {config_file}")
print(f"Processing: {config_file}")
try:
config = load_config(config_file)
submit_job(config)
print(f"Successfully submitted job for: {config_file}")
submit_job(config, args.log_dir)
print(f"Successfully submitted job for: {config_file}\n")
except Exception as e:
print(f"Error processing {config_file}: {e}", file=sys.stderr)
# Continue processing other files even if one fails
Expand Down
26 changes: 13 additions & 13 deletions examples/wide_ep/slurm_scripts/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,27 @@ slurm:
account: "<account>"
job_time: "02:00:00"
job_name: "<job_name>"
extra_args: "" # Cluster specific arguments, e.g. "--gres=gpu:4 --exclude=node1,node2"
numa_bind: true # Only enable for GB200 NVL72

# Hardware Configuration
hardware:
gpus_per_node: 4 # Modify this with your hardware configuration
num_ctx_servers: 2 # Number of context servers
num_gen_servers: 1 # Number of generation servers

# Benchmark Mode
benchmark:
mode: "e2e" # Options: e2e, gen_only
use_nv_sa_benchmark: false # Whether to use NVIDIA SA benchmark script
multi_round: 1 # Number of benchmark rounds
multi_round: 8 # Number of benchmark rounds
benchmark_ratio: 0.8 # Benchmark ratio
streaming: true # Enable streaming mode
concurrency_list: "1024"
input_length: 8196 # Input sequence length
output_length: 1024 # Output sequence length
dataset_file: "<dataset_file>"

# Hardware Configuration
hardware:
gpus_per_node: 4 # Modify this with your hardware configuration
num_ctx_servers: 1 # Number of context servers
num_gen_servers: 1 # Number of generation servers

# Environment Configuration
environment:
container_mount: "<container_mount>" # Format: path1:path1,path2:path2
Expand All @@ -34,24 +35,24 @@ environment:
build_wheel: false # Don't build the wheel when launching multiple jobs
trtllm_wheel_path: "" # Path to pre-built TensorRT-LLM wheel. If provided, install from this wheel instead
work_dir: "<full_path_to_work_dir>"
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1" # Environment variables for workers
server_env_var: "" # Environment variables for server
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"

# Profiling Configuration
profiling:
nsys_on: false # Set to true to enable profiling
ctx_profile_range: "10-30" # Set TLLM_PROFILE_START_STOP for ctx workers
gen_profile_range: "200-250" # Set TLLM_PROFILE_START_STOP for gen workers

# Accuracy Configuration
accuracy:
enable_accuracy_test: false # Set to true to enable accuracy evaluation
model: "local-completions" # Model type for lm_eval
tasks: "gsm8k" # Evaluation tasks (comma-separated)
model_args_extra: "num_concurrent=512,max_retries=3,tokenized_requests=False,timeout=1200,max_gen_toks=256,max_length=512" # Extra model arguments for lm_eval
model_args_extra: "num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096" # Extra model arguments for lm_eval

# Worker Configuration
worker_config:
gen:
enable_layerwise_nvtx_marker: true
tensor_parallel_size: 32
moe_expert_parallel_size: 32
enable_attention_dp: true
Expand Down Expand Up @@ -97,7 +98,6 @@ worker_config:
decoding_type: MTP
num_nextn_predict_layers: 3
ctx:
enable_layerwise_nvtx_marker: true
max_batch_size: 1
max_num_tokens: 8448
max_seq_len: 8212
Expand Down