diff --git a/.ci/gpu/reset-b200.sh b/.ci/gpu/reset-b200.sh
new file mode 100644
index 000000000..ba7a3598d
--- /dev/null
+++ b/.ci/gpu/reset-b200.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+# Reset an NVIDIA B200 GPU to its default state.
+# Reverts the settings applied by tune-b200.sh. No "set -e": every step is
+# attempted even if an earlier one fails.
+
+# Reset GPU, memory, and application clocks
+sudo nvidia-smi -rgc
+sudo nvidia-smi -rmc
+sudo nvidia-smi -rac
+
+# Restore the default power limit (750W)
+sudo nvidia-smi -pl 750
+
+# Disable persistent mode
+sudo nvidia-smi -pm 0
diff --git a/.ci/gpu/tune-b200.sh b/.ci/gpu/tune-b200.sh
new file mode 100644
index 000000000..2348468c2
--- /dev/null
+++ b/.ci/gpu/tune-b200.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+# Script to tune an NVIDIA GPU (B200 by default) to stabilize performance.
+#
+# Enables persistence mode, caps the power limit, and locks the SM and
+# memory clocks of one GPU to the maximum values reported by nvidia-smi.
+#
+# Environment (all optional):
+#   GPU_ID     index of the GPU to tune (default: 0)
+#   POWER_CAP  power limit in watts (default: per-model value below);
+#              capped at the GPU's power.max_limit
+
+set -euxo pipefail
+
+GPU_ID=${GPU_ID:-0}
+
+# Print one --query-gpu field for the selected GPU, without units.
+query_gpu() {
+  nvidia-smi --query-gpu="$1" --format=csv,noheader,nounits -i "$GPU_ID"
+}
+
+MAX_POWER=$(query_gpu power.max_limit)
+MAX_SM_CLOCK=$(query_gpu clocks.max.graphics)
+MAX_MEM_CLOCK=$(query_gpu clocks.max.memory)
+# The model is taken as the second word of the reported name.
+GPU_MODEL=$(nvidia-smi --query-gpu=name --format=csv,noheader -i "$GPU_ID" | awk 'NR == 1 { print $2 }')
+
+# Per-model default power cap in watts.
+case "$GPU_MODEL" in
+  GB200) DESIRED_POWER=1200 ;;
+  B200) DESIRED_POWER=750 ;;
+  *) DESIRED_POWER=500 ;; # H100 and any unrecognized model
+esac
+
+# An explicit POWER_CAP wins; otherwise use the per-model default.
+POWER_CAP=${POWER_CAP:-$DESIRED_POWER}
+
+# Never request more than the GPU allows. power.max_limit may carry a
+# fractional part, so only its integer part is compared.
+MAX_POWER_INT=${MAX_POWER%.*}
+if [[ "$POWER_CAP" =~ ^[0-9]+$ && "$MAX_POWER_INT" =~ ^[0-9]+$ ]] &&
+  ((POWER_CAP > MAX_POWER_INT)); then
+  POWER_CAP=$MAX_POWER_INT
+fi
+
+echo "→ Locking power cap to $POWER_CAP W, SM clock to $MAX_SM_CLOCK MHz, and memory clock to $MAX_MEM_CLOCK MHz on GPU $GPU_ID"
+
+# Only stdout is discarded; error messages on stderr stay visible.
+(
+  sudo nvidia-smi -i "$GPU_ID" -pm 1
+  sudo nvidia-smi --power-limit="$POWER_CAP" -i "$GPU_ID"
+  sudo nvidia-smi -lgc "$MAX_SM_CLOCK" -i "$GPU_ID"
+  sudo nvidia-smi -lmc "$MAX_MEM_CLOCK" -i "$GPU_ID"
+  sudo nvidia-smi -ac "$MAX_MEM_CLOCK,$MAX_SM_CLOCK" -i "$GPU_ID"
+) >/dev/null
diff --git a/docker/infra/values.yaml b/docker/infra/values.yaml index 9dd824162..9731931be 100644 --- a/docker/infra/values.yaml +++ b/docker/infra/values.yaml @@ -248,11 +248,11 @@ template: volumes: - name: nvidia-lib hostPath: - path: 
/opt/nvidia/lib64 + path: /home/kubernetes/bin/nvidia/lib64 type: Directory - name: nvidia-bin hostPath: - path: /opt/nvidia/bin + path: /home/kubernetes/bin/nvidia/bin type: Directory - name: nvidia-card hostPath: