Skip to content

Commit 9832d7a

Browse files
evberrypi authored and devpatelio committed
Enhance run_cluster.sh for multi-NIC support (vllm-project#28328)
Signed-off-by: Ev Lacey <elacey@nvidia.com>
1 parent cdc0506 commit 9832d7a

File tree

1 file changed

+24
-0
lines changed

1 file changed

+24
-0
lines changed

examples/online_serving/run_cluster.sh

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -83,6 +83,29 @@ else
83 83
RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
84 84
fi
85 85

86+
# Parse VLLM_HOST_IP from additional args if present.
87+
# This is needed for multi-NIC configurations where Ray needs explicit IP bindings.
88+
VLLM_HOST_IP=""
89+
for arg in "${ADDITIONAL_ARGS[@]}"; do
90+
if [[ $arg == "-e" ]]; then
91+
continue
92+
fi
93+
if [[ $arg == VLLM_HOST_IP=* ]]; then
94+
VLLM_HOST_IP="${arg#VLLM_HOST_IP=}"
95+
break
96+
fi
97+
done
98+
99+
# Build Ray IP environment variables if VLLM_HOST_IP is set.
100+
# These variables ensure Ray binds to the correct network interface on multi-NIC systems.
101+
RAY_IP_VARS=()
102+
if [ -n "${VLLM_HOST_IP}" ]; then
103+
RAY_IP_VARS=(
104+
-e "RAY_NODE_IP_ADDRESS=${VLLM_HOST_IP}"
105+
-e "RAY_OVERRIDE_NODE_IP_ADDRESS=${VLLM_HOST_IP}"
106+
)
107+
fi
108+
86 109
# Launch the container with the assembled parameters.
87 110
# --network host: Allows Ray nodes to communicate directly via host networking
88 111
# --shm-size 10.24g: Increases shared memory
@@ -95,5 +118,6 @@ docker run \
95 118
--shm-size 10.24g \
96 119
--gpus all \
97 120
-v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \
121+
"${RAY_IP_VARS[@]}" \
98 122
"${ADDITIONAL_ARGS[@]}" \
99 123
"${DOCKER_IMAGE}" -c "${RAY_START_CMD}"

0 commit comments

Comments (0)