Skip to content

Commit 2b9c215

Browse files
committed
DEBUG: add nvidia_quiesce
1 parent da4af10 commit 2b9c215

File tree

1 file changed

+55
-0
lines changed

1 file changed

+55
-0
lines changed

.github/actions/setup-nvidia-helion/action.yml

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,60 @@ runs:
7272
)
7373
}
7474
75+
#######################################
# Release every user of the NVIDIA kernel driver and unload its modules.
#
# Steps: stop the persistence / fabric-manager daemons, send SIGTERM to every
# process reported by nvidia-smi or holding /dev/nvidia* open, SIGKILL whatever
# is still alive once the grace period has elapsed, then `modprobe -r` each
# module that is currently loaded.
# The body runs in a subshell so no variables leak into the calling script.
#
# Arguments: $1 - optional grace period in whole seconds between SIGTERM and
#                 SIGKILL (default 5; a non-numeric value falls back to 5)
# Outputs:   progress messages on stdout, the "still busy" warning on stderr
# Returns:   0 when every loaded module was removed, 1 as soon as one
#            `modprobe -r` fails
#######################################
nvidia_quiesce() {
  (
    local grace="${1:-5}"
    if [[ ! "${grace}" =~ ^[0-9]+$ ]]; then
      grace=5
    fi
    # The list is walked front to back; the core `nvidia` module comes last.
    local -a MODULES=("nvidia_uvm" "nvidia_peermem" "nvidia_drm" "nvidia_modeset" "nvidia")

    echo "[nvidia-quiesce] Stopping NVIDIA daemons (if any)..."
    # Best effort: the units/processes may simply not exist on this host.
    sudo systemctl stop --quiet nvidia-persistenced.service || true
    sudo systemctl stop --quiet nvidia-fabricmanager.service || true
    sudo pkill -9 nvidia-persistenced 2>/dev/null || true
    sudo pkill -9 nvidia-fabricmanager 2>/dev/null || true

    echo "[nvidia-quiesce] Enumerating processes with an open GPU handle..."
    local -a candidates=()
    mapfile -t candidates < <(
      {
        nvidia-smi --query-compute-apps=pid --format=csv,noheader,nounits 2>/dev/null || true
        # Glob left unquoted on purpose so it expands to every device node.
        sudo lsof -t /dev/nvidia* 2>/dev/null || true
      } | tr -d '[:blank:]\r' | sort -u
    )

    # Keep only strictly positive integers, and never target PID 1 or this
    # script's own shells: `kill 0` would signal our whole process group and
    # killing ourselves would abort the installation.
    local -a pids=()
    local pid
    for pid in ${candidates[@]+"${candidates[@]}"}; do
      if [[ ! "${pid}" =~ ^[1-9][0-9]*$ ]]; then
        continue
      fi
      if [[ "${pid}" == "1" || "${pid}" == "$$" || "${pid}" == "${BASHPID}" ]]; then
        continue
      fi
      pids+=("${pid}")
    done

    if ((${#pids[@]})); then
      echo "[nvidia-quiesce] Sending SIGTERM to: ${pids[*]}"
      sudo kill -TERM "${pids[@]}" 2>/dev/null || true

      # Poll once per second instead of always sleeping the full grace period,
      # and remember which PIDs are still alive so SIGKILL only hits those.
      local waited=0
      local -a survivors=()
      while :; do
        survivors=()
        for pid in "${pids[@]}"; do
          # Signal 0 only tests for existence; sudo avoids false negatives on
          # processes owned by other users.
          if sudo kill -0 "${pid}" 2>/dev/null; then
            survivors+=("${pid}")
          fi
        done
        if ((${#survivors[@]} == 0 || waited >= grace)); then
          break
        fi
        sleep 1
        waited=$((waited + 1))
      done

      if ((${#survivors[@]})); then
        echo "[nvidia-quiesce] Forcing SIGKILL (if still alive)..."
        sudo kill -KILL "${survivors[@]}" 2>/dev/null || true
      fi
    else
      echo "[nvidia-quiesce] No GPU users found."
    fi

    echo "[nvidia-quiesce] Unloading kernel modules..."
    local module
    for module in "${MODULES[@]}"; do
      # awk consumes all of lsmod's output before exiting, so no stage can die
      # of SIGPIPE and make the check fail under `set -o pipefail`.
      if lsmod | awk -v name="${module}" '$1 == name { found = 1 } END { exit !found }'; then
        echo " -> ${module}"
        if ! sudo modprobe -r "${module}"; then
          echo " WARNING: ${module} still busy; aborting." >&2
          return 1
        fi
      fi
    done

    echo "[nvidia-quiesce] All NVIDIA modules removed. Safe to run the new driver installer."
  )
}
128+
75129
unload_nvidia_modules() {
76130
MODULES=("nvidia-uvm" "nvidia-drm" "nvidia-modeset" "nvidia")
77131
for module in "${MODULES[@]}"; do
@@ -116,6 +170,7 @@ runs:
116170
117171
# Turn off persistent mode so that the installation script can unload the kernel module
118172
sudo killall nvidia-persistenced || true
173+
nvidia_quiesce || true
119174
unload_nvidia_modules || true
120175
else
121176
HAS_NVIDIA_DRIVER=1

0 commit comments

Comments
 (0)