|
72 | 72 | ) |
73 | 73 | } |
74 | 74 |
|
# nvidia_quiesce: release every user of the NVIDIA driver, then unload its
# kernel modules so a new driver can be installed.
#
# Steps:
#   1. Stop the NVIDIA daemons (systemd units first, then a forced kill).
#   2. Collect the PIDs reported by nvidia-smi and by lsof on /dev/nvidia*.
#   3. Send SIGTERM, wait up to GRACE seconds, SIGKILL whatever is left.
#   4. Unload the modules listed in MODULES, in order.
#
# Takes no arguments. The body runs in a subshell, so none of its variables
# leak into the caller.
# Returns 0 when every loaded module was removed, 1 as soon as one
# `modprobe -r` fails.
nvidia_quiesce() {
  (
    local GRACE=5
    local -a MODULES=("nvidia_uvm" "nvidia_peermem" "nvidia_drm" "nvidia_modeset" "nvidia")
    local module pid

    echo "[nvidia-quiesce] Stopping NVIDIA daemons (if any)..."
    sudo systemctl stop --quiet nvidia-persistenced.service || true
    sudo systemctl stop --quiet nvidia-fabricmanager.service || true
    # killall instead of pkill: without -f, pkill matches the kernel "comm"
    # name, which is truncated to 15 characters, so these longer names would
    # never match. killall copes with long names.
    # NOTE(review): confirm the fabric manager's real process name; it may
    # differ from the service name.
    sudo killall --quiet --signal KILL nvidia-persistenced 2>/dev/null || true
    sudo killall --quiet --signal KILL nvidia-fabricmanager 2>/dev/null || true

    echo "[nvidia-quiesce] Enumerating processes with an open GPU handle..."
    local -a pids=()
    # Merge both sources, keep only purely numeric lines, de-duplicate.
    mapfile -t pids < <(
      {
        nvidia-smi --query-compute-apps=pid --format=csv,noheader,nounits 2>/dev/null || true
        sudo lsof -t /dev/nvidia* 2>/dev/null || true
      } | grep -E '^[0-9]+$' | sort -u
    )

    if ((${#pids[@]})); then
      echo "[nvidia-quiesce] Sending SIGTERM to: ${pids[*]}"
      sudo kill -TERM "${pids[@]}" 2>/dev/null || true

      # Poll once per second for up to GRACE seconds and stop waiting as soon
      # as every process is gone, instead of always sleeping the full period.
      local -a alive=("${pids[@]}")
      local -a remaining=()
      local waited=0
      while ((${#alive[@]} > 0 && waited < GRACE)); do
        sleep 1
        waited=$((waited + 1))
        remaining=()
        for pid in "${alive[@]}"; do
          if [[ -e "/proc/${pid}" ]]; then
            remaining+=("${pid}")
          fi
        done
        # Rebuilt with a guard so an empty array is never expanded (this
        # matters under `set -u` on older bash).
        alive=()
        if ((${#remaining[@]})); then
          alive=("${remaining[@]}")
        fi
      done

      # SIGKILL only the survivors, so a PID that already exited (and may have
      # been recycled) is not signalled.
      if ((${#alive[@]})); then
        echo "[nvidia-quiesce] Forcing SIGKILL (if still alive)..."
        sudo kill -KILL "${alive[@]}" 2>/dev/null || true
      fi
    else
      echo "[nvidia-quiesce] No GPU users found."
    fi

    echo "[nvidia-quiesce] Unloading kernel modules..."
    for module in "${MODULES[@]}"; do
      # /proc/modules is what lsmod formats. Reading it with a single grep
      # avoids a pipeline whose status could be flipped by SIGPIPE when
      # `pipefail` is set.
      if grep -q "^${module} " /proc/modules; then
        echo " -> ${module}"
        if ! sudo modprobe -r "${module}"; then
          echo " WARNING: ${module} still busy; aborting."
          return 1
        fi
      fi
    done

    echo "[nvidia-quiesce] All NVIDIA modules removed. Safe to run the new driver installer."
  )
}
| 128 | +
|
75 | 129 | unload_nvidia_modules() { |
76 | 130 | MODULES=("nvidia-uvm" "nvidia-drm" "nvidia-modeset" "nvidia") |
77 | 131 | for module in "${MODULES[@]}"; do |
@@ -116,6 +170,7 @@ runs: |
116 | 170 |
|
117 | 171 | # Turn off persistent mode so that the installation script can unload the kernel module |
118 | 172 | sudo killall nvidia-persistenced || true |
| 173 | + nvidia_quiesce || true |
119 | 174 | unload_nvidia_modules || true |
120 | 175 | else |
121 | 176 | HAS_NVIDIA_DRIVER=1 |
|
0 commit comments