@@ -129,41 +129,90 @@ spec:
129129 - /bin/sh
130130 - -c
131131 - |
132- # Get credentials from environment variables
132+ echo "=========================================="
133+ echo "PreStop Hook Started: $(date)"
134+ echo "Pod: $HOSTNAME"
135+ echo "=========================================="
136+
137+ # Read root credentials from the environment; the Basic auth token is built with printf (no echo escape handling) and base64 line wraps are stripped so the header stays on one line
133138 USER_EMAIL="$ZO_ROOT_USER_EMAIL"
134139 USER_PASSWORD="$ZO_ROOT_USER_PASSWORD"
135-
136- # Create base64 encoded credentials for Authorization header
137140 AUTH_HEADER=$(printf '%s:%s' "$USER_EMAIL" "$USER_PASSWORD" | base64 | tr -d '\n')
138-
139- # Disable the node first
140- echo "Disabling ingester node..."
141- curl -X PUT "http://localhost:{{ .Values.config.ZO_HTTP_PORT }}/node/enable?value=false" \
142- -H "Authorization: Basic ${AUTH_HEADER}"
143-
144- # returns 200 if successful and "true" if the node is disabled
145-
146- # Flush all data from memory to WAL. This does not flush data from ingester to s3.
147- echo "Flushing data from ingester..."
148- RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -X PUT "http://localhost:{{ .Values.config.ZO_HTTP_PORT }}/node/flush" \
141+ PORT="${ZO_HTTP_PORT:-5080}"
142+
143+ # Step 1: Disable the node (triggers drain mode); the HTTP status is appended to the body as a trailing HTTP_CODE: line, and any status other than 200 aborts the hook
144+ echo "[$(date)] Step 1: Calling PUT /node/enable?value=false to disable node..."
145+ DISABLE_RESPONSE=$(curl -s -w "\nHTTP_CODE:%{http_code}" \
146+ -X PUT "http://localhost:${PORT}/node/enable?value=false" \
149147 -H "Authorization: Basic ${AUTH_HEADER}")
150- if [ "$RESPONSE" -ne 200 ]; then
151- echo "Error: Failed to flush data from ingester. HTTP response code: $RESPONSE"
148+
149+ HTTP_CODE=$(echo "$DISABLE_RESPONSE" | grep "HTTP_CODE:" | cut -d: -f2)
150+ BODY=$(echo "$DISABLE_RESPONSE" | grep -v "HTTP_CODE:")
151+
152+ echo "[$(date)] Response (HTTP $HTTP_CODE): $BODY"
153+
154+ if [ "$HTTP_CODE" != "200" ]; then
155+ echo "[$(date)] ERROR: Failed to disable node"
152156 exit 1
153157 fi
154-
155- # returns 200 if successful and "true" if the node is flushed
156-
157- # We need another API to check if all the data has been moved to s3 or /flush should become async and move files to s3 as well
158- # e.g /node/wal_status
159- # Need to build this API. Until then, we will wait for 900 seconds.
160-
161- # Wait for 900 seconds after flush to ensure data is moved to s3
162- # 15 minutes for now, since file movement to s3 may take up to 10 minutes
163- echo "Waiting 900 seconds to flush data..."
164- sleep 900
165-
166- echo "Pre-stop hook completed"
158+
159+ echo "[$(date)] ✓ Node disabled - drain mode activated"
160+ echo ""
161+
162+ # Step 2: Poll GET /node/drain_status until readyForShutdown is true or MAX_WAIT seconds have elapsed
163+ echo "[$(date)] Step 2: Monitoring drain status via GET /node/drain_status..."
164+
165+ START_TIME=$(date +%s)
166+ MAX_WAIT=1000 # ~16 minutes (leave buffer for k8s)
167+ POLL_INTERVAL=5 # seconds to sleep between polls
168+
169+ while true; do
170+ CURRENT_TIME=$(date +%s)
171+ ELAPSED=$((CURRENT_TIME - START_TIME))
172+
173+ if [ "$ELAPSED" -ge "$MAX_WAIT" ]; then
174+ echo "[$(date)] WARNING: Drain timeout after ${ELAPSED}s"
175+ echo "[$(date)] Exiting to allow Kubernetes to terminate pod"
176+ break
177+ fi
178+
179+ # Call drain_status API; a non-zero curl exit status is retried after POLL_INTERVAL
180+ STATUS=$(curl -s "http://localhost:${PORT}/node/drain_status" \
181+ -H "Authorization: Basic ${AUTH_HEADER}")
182+
183+ if [ $? -ne 0 ]; then
184+ echo "[$(date)] ERROR: Failed to get drain status"
185+ sleep "$POLL_INTERVAL"
186+ continue
187+ fi
188+
189+ # Parse JSON response (without jq dependency): grep isolates "key":value, cut keeps the value, tr drops spaces
190+ READY=$(echo "$STATUS" | grep -o '"readyForShutdown":[^,}]*' | cut -d: -f2 | tr -d ' ')
191+ PENDING=$(echo "$STATUS" | grep -o '"pendingParquetFiles":[^,}]*' | cut -d: -f2 | tr -d ' ')
192+ IS_DRAINING=$(echo "$STATUS" | grep -o '"isDraining":[^,}]*' | cut -d: -f2 | tr -d ' ')
193+ MEMORY_FLUSHED=$(echo "$STATUS" | grep -o '"memoryFlushed":[^,}]*' | cut -d: -f2 | tr -d ' ')
194+
195+ echo "[$(date)] [${ELAPSED}s] Status:"
196+ echo " - isDraining : $IS_DRAINING"
197+ echo " - memoryFlushed : $MEMORY_FLUSHED"
198+ echo " - pendingParquetFiles : $PENDING"
199+ echo " - readyForShutdown : $READY"
200+
201+ # Stop polling as soon as the node reports readyForShutdown=true
202+ if [ "$READY" = "true" ]; then
203+ echo ""
204+ echo "=========================================="
205+ echo "[$(date)] ✓ DRAIN COMPLETED in ${ELAPSED}s"
206+ echo "=========================================="
207+ echo "All parquet files uploaded to S3"
208+ echo "Pod is safe to terminate"
209+ break
210+ fi
211+
212+ sleep "$POLL_INTERVAL"
213+ done
214+
215+ echo "[$(date)] PreStop hook completed. Pod will now terminate."
167216 {{- end }}
168217 resources :
169218 {{- toYaml .Values.resources.ingester | nindent 12 }}
0 commit comments