Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions examples/disaggregated/slurm/benchmark/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ slurm:
account: "<account>"
job_time: "02:00:00"
job_name: "<job_name>"
extra_args: "" # Cluster specific arguments, e.g. "--gres=gpu:4 --exclude=node1,node2"
numa_bind: true # Only enable for GB200 NVL72

# Benchmark Mode
Expand Down Expand Up @@ -34,12 +35,14 @@ environment:
build_wheel: false # Don't build the wheel when launching multiple jobs
trtllm_wheel_path: "" # Path to pre-built TensorRT-LLM wheel. If provided, install from this wheel instead
work_dir: "<full_path_to_work_dir>"
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1"
server_env_var: ""
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"

# Profiling Configuration
profiling:
nsys_on: false # Set to true to enable profiling
ctx_profile_range: "10-30" # Set TLLM_PROFILE_START_STOP for ctx workers
gen_profile_range: "200-250" # Set TLLM_PROFILE_START_STOP for gen workers

# Accuracy Configuration
accuracy:
Expand Down
31 changes: 12 additions & 19 deletions examples/disaggregated/slurm/benchmark/disaggr_torch.slurm
Original file line number Diff line number Diff line change
Expand Up @@ -40,18 +40,20 @@ trtllm_wheel_path=${28}

# Profiling
nsys_on=${29}
ctx_profile_range=${30}
gen_profile_range=${31}

# Accuracy evaluation
enable_accuracy_test=${30}
accuracy_model=${31}
accuracy_tasks=${32}
model_args_extra=${33}
enable_accuracy_test=${32}
accuracy_model=${33}
accuracy_tasks=${34}
model_args_extra=${35}

# Worker environment variables
worker_env_var=${34}
worker_env_var=${36}

# Server environment variables
server_env_var=${35}
server_env_var=${37}

# Print all parsed arguments
echo "Parsed arguments:"
Expand Down Expand Up @@ -90,6 +92,8 @@ echo " build_wheel: ${build_wheel}"
echo " trtllm_wheel_path: ${trtllm_wheel_path}"
echo " work_dir: ${work_dir}"
echo " nsys_on: ${nsys_on}"
echo " ctx_profile_range: ${ctx_profile_range}"
echo " gen_profile_range: ${gen_profile_range}"
echo
echo "Accuracy Configuration:"
echo " enable_accuracy_test: ${enable_accuracy_test}"
Expand Down Expand Up @@ -169,17 +173,6 @@ elif [ -d "${trtllm_repo}" ]; then
echo "TensorRT-LLM installation completed successfully"
fi

# Get enable_pdl from gen config
enable_pdl=$(python3 -c "import yaml; import sys;
try:
with open('${gen_config_path}') as f:
c = yaml.safe_load(f)
print(str(not c.get('enable_attention_dp', True)).lower())
except Exception as e:
print(f'Error reading config: {e}', file=sys.stderr)
sys.exit(1)
")

# Get node lists
all_nodes=($(scontrol show hostname $SLURM_NODELIST | sort))
total_nodes_num=${#all_nodes[@]}
Expand Down Expand Up @@ -211,7 +204,7 @@ for i in $(seq 0 $((num_gen_servers - 1))); do
--container-mounts=${container_mount} \
--mpi=pmix \
bash ${work_dir}/start_worker.sh \
"GEN" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${enable_pdl}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${gen_config_path}" "${worker_env_var}" \
"GEN" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${gen_profile_range}" "${gen_config_path}" "${worker_env_var}" \
&> ${full_logdir}/output_gen_${i}.log &
done

Expand All @@ -226,7 +219,7 @@ for i in $(seq 0 $((num_ctx_servers - 1))); do
--container-mounts=${container_mount} \
--mpi=pmix \
bash ${work_dir}/start_worker.sh \
"CTX" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${enable_pdl}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${ctx_config_path}" "${worker_env_var}" \
"CTX" ${i} ${model_path} "8336" "${benchmark_mode}" "${concurrency_list}" "${numa_bind}" "${full_logdir}" "${nsys_on}" "${ctx_profile_range}" "${ctx_config_path}" "${worker_env_var}" \
&> ${full_logdir}/output_ctx_${i}.log &
done

Expand Down
42 changes: 15 additions & 27 deletions examples/disaggregated/slurm/benchmark/start_worker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,14 @@ model_path=${3}
port=${4}
benchmark_mode=${5}
concurrency=${6}
enable_pdl=${7}
numa_bind=${8}
log_dir=${9}
enable_nsys=${10}
numa_bind=${7}
log_dir=${8}
enable_nsys=${9}
profile_range=${10}
config_file=${11}
worker_env_var=${12}

unset UCX_TLS
echo "enable_pdl: ${enable_pdl}, log_dir: ${log_dir}"
echo "SLURM_PROCID: ${SLURM_PROCID}, hostname: $(hostname), instance_id: ${instance_id}"

# Export worker environment variables from config
Expand All @@ -26,10 +25,6 @@ for env_var in ${worker_env_var}; do
echo "Exported: ${env_var}"
done

if [ "${enable_pdl}" = "true" ]; then
export TRTLLM_ENABLE_PDL=1
fi

if [ "${numa_bind}" = "true" ]; then
numa_bind_cmd="numactl -m 0,1"
echo "numactl -m 0,1 - Only allocate memory from nodes on GB200"
Expand All @@ -45,34 +40,27 @@ fi

echo "config_file: ${config_file}"

# save the hostname to a file

# if SLURM_NODEID is 0
# if SLURM_NODEID is 0, save the hostname to a file
if [ "${SLURM_NODEID}" = "0" ]; then
mkdir -p ${log_dir}/hostnames/
echo $(hostname) > ${log_dir}/hostnames/${role}_${instance_id}.txt
echo "hostname saved to ${log_dir}/hostnames/${role}_${instance_id}.txt"
fi

# Check if nsys profiling is enabled
nsys_prefix=""
if [ "${enable_nsys}" != "true" ]; then
echo "nsys is not enabled, start normal flow"
trtllm-llmapi-launch ${numa_bind_cmd} trtllm-serve ${model_path} --host $(hostname) --port ${port} --extra_llm_api_options ${config_file}
else
nsys_prefix=""
nsys_file=${log_dir}/nsys_worker_proc_${role}_${instance_id}_${SLURM_PROCID}
export TLLM_PROFILE_RECORD_GC=1
export TLLM_NVTX_DEBUG=1
nsys_prefix="nsys profile -e \"NSYS_MPI_STORE_TEAMS_PER_RANK=1\" -o ${nsys_file} -f true -t cuda,nvtx,python-gil -c cudaProfilerApi --cuda-graph-trace node --capture-range-end=stop --gpu-metrics-devices=none"
if [ "${role}" = "GEN" ]; then
export TLLM_PROFILE_START_STOP=200-250
echo "nsys is enabled on gen_gpus"
elif [ "${role}" = "CTX" ]; then
export TLLM_PROFILE_START_STOP=10-30
echo "nsys is enabled on ctx_gpus"
fi
${nsys_prefix} trtllm-llmapi-launch ${numa_bind_cmd} \
trtllm-serve ${model_path} \
--host $(hostname) --port ${port} \
--extra_llm_api_options ${config_file}
export NSYS_MPI_STORE_TEAMS_PER_RANK=1
export TLLM_PROFILE_START_STOP=${profile_range}
echo "nsys is enabled on ${role} GPUs, TLLM_PROFILE_START_STOP=${profile_range}"
nsys_prefix="nsys profile -o ${nsys_file} -f true -t cuda,nvtx,python-gil -c cudaProfilerApi --cuda-graph-trace node --capture-range-end=stop --gpu-metrics-devices=none"
fi

${nsys_prefix} trtllm-llmapi-launch ${numa_bind_cmd} \
trtllm-serve ${model_path} \
--host $(hostname) --port ${port} \
--extra_llm_api_options ${config_file}
68 changes: 44 additions & 24 deletions examples/disaggregated/slurm/benchmark/submit.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import shutil
import subprocess
import sys
from datetime import datetime

import yaml

Expand All @@ -22,6 +23,10 @@ def parse_args():
'--dir',
type=str,
help='Directory containing YAML configuration files')
group.add_argument('--log-dir',
type=str,
default=None,
help='Log directory')
return parser.parse_args()


Expand All @@ -45,9 +50,11 @@ def calculate_nodes(world_size, num_servers, gpus_per_node):
return (world_size + gpus_per_node - 1) // gpus_per_node * num_servers


def submit_job(config):
def submit_job(config, log_dir):
# Extract configurations
slurm_config = config['slurm']
slurm_config.setdefault('extra_args', '')

hw_config = config['hardware']
env_config = config['environment']

Expand All @@ -71,6 +78,11 @@ def submit_job(config):
env_config.setdefault('worker_env_var', '')
env_config.setdefault('server_env_var', '')

profiling_config = config.get('profiling', {})
profiling_config.setdefault('nsys_on', False)
profiling_config.setdefault('ctx_profile_range', '10-30')
profiling_config.setdefault('gen_profile_range', '200-250')

# Get number of servers from config
ctx_num = hw_config['num_ctx_servers']
gen_num = hw_config['num_gen_servers']
Expand Down Expand Up @@ -101,29 +113,35 @@ def submit_job(config):
gen_enable_attention_dp = config['worker_config']['gen'][
'enable_attention_dp']

# Create base log directory path
log_base = os.path.join(env_config['work_dir'], f"{isl}-{osl}")

# Get eplb num_slots for gen worker
load_balancer_config = config['worker_config']['gen'].get(
'moe_config', {}).get('load_balancer', {})
if isinstance(load_balancer_config, str):
with open(load_balancer_config, 'r') as f:
load_balancer_config = yaml.safe_load(f)
eplb_num_slots = load_balancer_config.get('num_slots', 0)

# Determine directory suffix based on attention_dp
if gen_enable_attention_dp:
dir_suffix = f"ctx{ctx_num}_gen{gen_num}_dep{gen_tp_size}_batch{gen_batch_size}_eplb{eplb_num_slots}_mtp{mtp_size}"
else:
dir_suffix = f"ctx{ctx_num}_gen{gen_num}_tep{gen_tp_size}_batch{gen_batch_size}_eplb{eplb_num_slots}_mtp{mtp_size}"
if log_dir is None:
# Create base log directory path
date_prefix = datetime.now().strftime("%Y%m%d")
log_base = os.path.join(env_config['work_dir'],
f"{date_prefix}/{isl}-{osl}")

# Get eplb num_slots for gen worker
load_balancer_config = config['worker_config']['gen'].get(
'moe_config', {}).get('load_balancer', {})
if isinstance(load_balancer_config, str):
with open(load_balancer_config, 'r') as f:
load_balancer_config = yaml.safe_load(f)
eplb_num_slots = load_balancer_config.get('num_slots', 0)

# Determine directory suffix based on attention_dp
if gen_enable_attention_dp:
dir_suffix = f"ctx{ctx_num}_gen{gen_num}_dep{gen_tp_size}_batch{gen_batch_size}_eplb{eplb_num_slots}_mtp{mtp_size}"
else:
dir_suffix = f"ctx{ctx_num}_gen{gen_num}_tep{gen_tp_size}_batch{gen_batch_size}_eplb{eplb_num_slots}_mtp{mtp_size}"

# Create full log directory path
log_dir = os.path.join(log_base, dir_suffix)

# Create full log directory path
log_dir = os.path.join(log_base, dir_suffix)
# Remove existing directory if it exists
if os.path.exists(log_dir):
print(f"[WARNING] Removing existing log directory: {log_dir}")
shutil.rmtree(log_dir)
os.makedirs(log_dir)
print(f"Log will be saved to: {log_dir}")

# Setup config file paths and save worker configs
ctx_config_path = os.path.join(log_dir, 'ctx_config.yaml')
Expand All @@ -135,14 +153,14 @@ def submit_job(config):
cmd = [
'sbatch',
f'--partition={slurm_config["partition"]}',
f'--gres=gpu:{hw_config["gpus_per_node"]}',
f'--account={slurm_config["account"]}',
f'--time={slurm_config["job_time"]}',
f'--job-name={slurm_config["job_name"]}',
f'--nodes={total_nodes}',
f'--ntasks={total_tasks}',
f'--ntasks-per-node={hw_config["gpus_per_node"]}',
f'--segment={total_nodes}',
*([arg for arg in slurm_config['extra_args'].split() if arg]),
slurm_config['script_file'],
# Hardware configuration
str(hw_config['gpus_per_node']),
Expand Down Expand Up @@ -182,7 +200,9 @@ def submit_job(config):
env_config['trtllm_wheel_path'],

# Profiling
str(config['profiling']['nsys_on']).lower(),
str(profiling_config['nsys_on']).lower(),
profiling_config['ctx_profile_range'],
profiling_config['gen_profile_range'],

# Accuracy evaluation
str(config['accuracy']['enable_accuracy_test']).lower(),
Expand Down Expand Up @@ -226,11 +246,11 @@ def main():

# Process each config file
for config_file in config_files:
print(f"\nProcessing: {config_file}")
print(f"Processing: {config_file}")
try:
config = load_config(config_file)
submit_job(config)
print(f"Successfully submitted job for: {config_file}")
submit_job(config, args.log_dir)
print(f"Successfully submitted job for: {config_file}\n")
except Exception as e:
print(f"Error processing {config_file}: {e}", file=sys.stderr)
# Continue processing other files even if one fails
Expand Down
26 changes: 13 additions & 13 deletions examples/wide_ep/slurm_scripts/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,27 @@ slurm:
account: "<account>"
job_time: "02:00:00"
job_name: "<job_name>"
extra_args: "" # Cluster specific arguments, e.g. "--gres=gpu:4 --exclude=node1,node2"
numa_bind: true # Only enable for GB200 NVL72

# Hardware Configuration
hardware:
gpus_per_node: 4 # Modify this with your hardware configuration
num_ctx_servers: 2 # Number of context servers
num_gen_servers: 1 # Number of generation servers

# Benchmark Mode
benchmark:
mode: "e2e" # Options: e2e, gen_only
use_nv_sa_benchmark: false # Whether to use NVIDIA SA benchmark script
multi_round: 1 # Number of benchmark rounds
multi_round: 8 # Number of benchmark rounds
benchmark_ratio: 0.8 # Benchmark ratio
streaming: true # Enable streaming mode
concurrency_list: "1024"
input_length: 8196 # Input sequence length
output_length: 1024 # Output sequence length
dataset_file: "<dataset_file>"

# Hardware Configuration
hardware:
gpus_per_node: 4 # Modify this with your hardware configuration
num_ctx_servers: 1 # Number of context servers
num_gen_servers: 1 # Number of generation servers

# Environment Configuration
environment:
container_mount: "<container_mount>" # Format: path1:path1,path2:path2
Expand All @@ -34,24 +35,24 @@ environment:
build_wheel: false # Don't build the wheel when launching multiple jobs
trtllm_wheel_path: "" # Path to pre-built TensorRT-LLM wheel. If provided, install from this wheel instead
work_dir: "<full_path_to_work_dir>"
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1" # Environment variables for workers
server_env_var: "" # Environment variables for server
worker_env_var: "TLLM_LOG_LEVEL=INFO TRTLLM_SERVER_DISABLE_GC=1 TRTLLM_WORKER_DISABLE_GC=1 TRTLLM_ENABLE_PDL=1 ENROOT_ALLOW_DEV=yes"
server_env_var: "TRTLLM_SERVER_DISABLE_GC=1"

# Profiling Configuration
profiling:
nsys_on: false # Set to true to enable profiling
ctx_profile_range: "10-30" # Set TLLM_PROFILE_START_STOP for ctx workers
gen_profile_range: "200-250" # Set TLLM_PROFILE_START_STOP for gen workers

# Accuracy Configuration
accuracy:
enable_accuracy_test: false # Set to true to enable accuracy evaluation
model: "local-completions" # Model type for lm_eval
tasks: "gsm8k" # Evaluation tasks (comma-separated)
model_args_extra: "num_concurrent=512,max_retries=3,tokenized_requests=False,timeout=1200,max_gen_toks=256,max_length=512" # Extra model arguments for lm_eval
model_args_extra: "num_concurrent=512,max_retries=3,tokenized_requests=false,timeout=1200,max_gen_toks=256,max_length=4096" # Extra model arguments for lm_eval

# Worker Configuration
worker_config:
gen:
enable_layerwise_nvtx_marker: true
tensor_parallel_size: 32
moe_expert_parallel_size: 32
enable_attention_dp: true
Expand Down Expand Up @@ -97,7 +98,6 @@ worker_config:
decoding_type: MTP
num_nextn_predict_layers: 3
ctx:
enable_layerwise_nvtx_marker: true
max_batch_size: 1
max_num_tokens: 8448
max_seq_len: 8212
Expand Down