Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .ci/gpu/reset-b200.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/env bash
# Reset an NVIDIA B200 GPU to its default state.
#
# Undoes the settings applied by tune-b200.sh: locked GPU/memory clocks,
# pinned application clocks, the power cap, and persistence mode.
# Requires nvidia-smi on PATH and sudo rights.
#
# `set -e` is intentionally not enabled: every reset step is attempted even
# if an earlier one fails, so the GPU is restored as far as possible. The
# script's exit status is that of the final command.

# Reset GPU and Memory clocks
sudo nvidia-smi -rgc
sudo nvidia-smi -rmc

# Reset the application clocks that tune-b200.sh pins with `-ac`
sudo nvidia-smi -rac

# Restore the default power limit (750W)
sudo nvidia-smi -pl 750

# Disable persistent mode
sudo nvidia-smi -pm 0
34 changes: 34 additions & 0 deletions .ci/gpu/tune-b200.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/usr/bin/env bash
# Tune an NVIDIA GPU (primarily B200) to stabilize performance.
#
# Usage: tune-b200.sh [GPU_ID]
#   GPU_ID  index of the GPU to tune (default: 0)
#
# Enables persistence mode, applies a per-model power cap, and locks the SM
# and memory clocks to the maximum values reported by nvidia-smi.
# Requires nvidia-smi on PATH and sudo rights.

set -euxo pipefail

GPU_ID=${1:-0}

# Print an error message to stderr and abort.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Print one queried property of the selected GPU as a bare value
# (no CSV header, no unit suffix).
query_gpu() {
  nvidia-smi --query-gpu="$1" --format=csv,noheader,nounits -i "$GPU_ID"
}

MAX_SM_CLOCK=$(query_gpu clocks.max.graphics)
MAX_MEM_CLOCK=$(query_gpu clocks.max.memory)
# The model is taken as the second word of the reported name
# (e.g. "NVIDIA B200" -> "B200").
GPU_MODEL=$(nvidia-smi --query-gpu=name --format=csv,noheader -i "$GPU_ID" | awk 'NR == 1 {print $2}')

# Drop any stray whitespace, then make sure the driver returned real numbers
# rather than a placeholder such as "[N/A]" before handing them to sudo.
MAX_SM_CLOCK=${MAX_SM_CLOCK//[[:space:]]/}
MAX_MEM_CLOCK=${MAX_MEM_CLOCK//[[:space:]]/}
[[ "$MAX_SM_CLOCK" =~ ^[0-9]+$ ]] \
  || die "could not read max SM clock for GPU $GPU_ID (got '$MAX_SM_CLOCK')"
[[ "$MAX_MEM_CLOCK" =~ ^[0-9]+$ ]] \
  || die "could not read max memory clock for GPU $GPU_ID (got '$MAX_MEM_CLOCK')"

# Per-model power cap in watts. H100 and unrecognized models use 500 W.
case "$GPU_MODEL" in
  GB200) POWER_CAP=1200 ;;
  B200) POWER_CAP=750 ;;
  *) POWER_CAP=500 ;;
esac

echo "→ Locking power cap to $POWER_CAP W, SM clock to $MAX_SM_CLOCK MHz, and memory clock to $MAX_MEM_CLOCK MHz on GPU $GPU_ID"

# nvidia-smi's confirmation messages go to /dev/null; errors still reach
# stderr and, under `set -e`, abort the script.
{
  sudo nvidia-smi -i "$GPU_ID" -pm 1
  sudo nvidia-smi --power-limit="$POWER_CAP" -i "$GPU_ID"
  sudo nvidia-smi -lgc "$MAX_SM_CLOCK" -i "$GPU_ID"
  sudo nvidia-smi -lmc "$MAX_MEM_CLOCK" -i "$GPU_ID"
  sudo nvidia-smi -ac "$MAX_MEM_CLOCK,$MAX_SM_CLOCK" -i "$GPU_ID"
} >/dev/null

4 changes: 2 additions & 2 deletions docker/infra/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -248,11 +248,11 @@ template:
volumes:
- name: nvidia-lib
hostPath:
path: /opt/nvidia/lib64
path: /home/kubernetes/bin/nvidia/lib64
type: Directory
- name: nvidia-bin
hostPath:
path: /opt/nvidia/bin
path: /home/kubernetes/bin/nvidia/bin
type: Directory
- name: nvidia-card
hostPath:
Expand Down
Loading