From 740c08357a74254fc2dd03d2ecae9446443e41e7 Mon Sep 17 00:00:00 2001 From: Prabhat Sharma Date: Sun, 20 Jul 2025 11:30:51 -0700 Subject: [PATCH 1/6] add preStop hook for autoscaling down --- charts/openobserve/README.md | 4 +- .../templates/ingester-statefulset.yaml | 40 +++++++++++++++++++ charts/openobserve/values.yaml | 4 +- 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/charts/openobserve/README.md b/charts/openobserve/README.md index 5ec97a4..61677be 100644 --- a/charts/openobserve/README.md +++ b/charts/openobserve/README.md @@ -1,4 +1,4 @@ -# OpenObserve helm chart +# OpenObserve Helm Chart ## Amazon EKS @@ -11,7 +11,7 @@ You must set a minimum of 2 values: 1. IAM role for the serviceAccount to gain AWS IAM credentials to access s3 - serviceAccount.annotations."eks.amazonaws.com/role-arn" -## Install +## Installation Install the Cloud Native PostgreSQL Operator. This is a prerequisite for openobserve helm chart. This helm chart sets up a postgres database cluster (1 primary + 1 replica) and uses it as metadata store of OpenObserve. 
```shell diff --git a/charts/openobserve/templates/ingester-statefulset.yaml b/charts/openobserve/templates/ingester-statefulset.yaml index 25e6d06..ca66596 100644 --- a/charts/openobserve/templates/ingester-statefulset.yaml +++ b/charts/openobserve/templates/ingester-statefulset.yaml @@ -121,6 +121,46 @@ spec: successThreshold: {{ .Values.probes.ingester.config.readinessProbe.successThreshold | default 1 }} failureThreshold: {{ .Values.probes.ingester.config.readinessProbe.failureThreshold | default 3 }} {{- end }} + {{- if .Values.autoscaling.ingester.enabled }} + lifecycle: + preStop: + exec: + command: + - /bin/sh + - -c + - | + # Get credentials from environment variables + USER_EMAIL="$ZO_ROOT_USER_EMAIL" + USER_PASSWORD="$ZO_ROOT_USER_PASSWORD" + + # Create base64 encoded credentials for Authorization header + AUTH_HEADER=$(echo -n "${USER_EMAIL}:${USER_PASSWORD}" | base64) + + # Disable the node first + echo "Disabling ingester node..." + curl -X PUT "http://localhost:{{ .Values.config.ZO_HTTP_PORT }}/node/enable?value=false" \ + -H "Authorization: Basic ${AUTH_HEADER}" + + # returns 200 if successful and "true" if the node is disabled + + # Flush all data from memory to WAL. This does not flush data from ingester to s3. + echo "Flushing data from ingester..." + curl -X PUT "http://localhost:{{ .Values.config.ZO_HTTP_PORT }}/node/flush" \ + -H "Authorization: Basic ${AUTH_HEADER}" + + # returns 200 if successful and "true" if the node is flushed + + # We need another API to check if all the data has been moved to s3 or /flush should become async and move files to s3 as well + # e.g /node/wal_status + # Need to build this API. Until then, we will wait for 900 seconds. + + # Wait for 900 seconds after flush to ensure data is moved to s3 + # 15 minutes for now, since file movement to s3 may take up to 10 minutes + echo "Waiting 900 seconds to flush data..." 
+ sleep 900 + + echo "Pre-stop hook completed" + {{- end }} resources: {{- toYaml .Values.resources.ingester | nindent 12 }} envFrom: diff --git a/charts/openobserve/values.yaml b/charts/openobserve/values.yaml index b845ac3..1a63087 100644 --- a/charts/openobserve/values.yaml +++ b/charts/openobserve/values.yaml @@ -1028,14 +1028,14 @@ probes: timeoutSeconds: 5 successThreshold: 1 failureThreshold: 3 - terminationGracePeriodSeconds: 30 + terminationGracePeriodSeconds: 1200 # 20 minutes for now, since we are using pre-stop hook to flush data andit takes up to 10 minutes to flush data to s3 livenessProbe: initialDelaySeconds: 10 periodSeconds: 10 timeoutSeconds: 5 successThreshold: 1 failureThreshold: 3 - terminationGracePeriodSeconds: 30 + terminationGracePeriodSeconds: 1200 # 20 minutes for now, since we are using pre-stop hook to flush data andit takes up to 10 minutes to flush data to s3 querier: enabled: false config: From 69254b7975111d52d31487bbf247a9a26a9e984d Mon Sep 17 00:00:00 2001 From: Prabhat Sharma Date: Sun, 20 Jul 2025 15:15:17 -0700 Subject: [PATCH 2/6] Update charts/openobserve/templates/ingester-statefulset.yaml Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- charts/openobserve/templates/ingester-statefulset.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/charts/openobserve/templates/ingester-statefulset.yaml b/charts/openobserve/templates/ingester-statefulset.yaml index ca66596..d2bf54c 100644 --- a/charts/openobserve/templates/ingester-statefulset.yaml +++ b/charts/openobserve/templates/ingester-statefulset.yaml @@ -145,8 +145,12 @@ spec: # Flush all data from memory to WAL. This does not flush data from ingester to s3. echo "Flushing data from ingester..." 
- curl -X PUT "http://localhost:{{ .Values.config.ZO_HTTP_PORT }}/node/flush" \ - -H "Authorization: Basic ${AUTH_HEADER}" + RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -X PUT "http://localhost:{{ .Values.config.ZO_HTTP_PORT }}/node/flush" \ + -H "Authorization: Basic ${AUTH_HEADER}") + if [ "$RESPONSE" -ne 200 ]; then + echo "Error: Failed to flush data from ingester. HTTP response code: $RESPONSE" + exit 1 + fi # returns 200 if successful and "true" if the node is flushed From 72e95fb9bc750a7f9c1f9d8892f52d33b122670b Mon Sep 17 00:00:00 2001 From: Prabhat Sharma Date: Sun, 20 Jul 2025 15:15:43 -0700 Subject: [PATCH 3/6] Update charts/openobserve/values.yaml Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- charts/openobserve/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/openobserve/values.yaml b/charts/openobserve/values.yaml index 1a63087..896ed9b 100644 --- a/charts/openobserve/values.yaml +++ b/charts/openobserve/values.yaml @@ -1028,7 +1028,7 @@ probes: timeoutSeconds: 5 successThreshold: 1 failureThreshold: 3 - terminationGracePeriodSeconds: 1200 # 20 minutes for now, since we are using pre-stop hook to flush data andit takes up to 10 minutes to flush data to s3 + terminationGracePeriodSeconds: 1200 # 20 minutes for now, since we are using pre-stop hook to flush data and it takes up to 10 minutes to flush data to s3 livenessProbe: initialDelaySeconds: 10 periodSeconds: 10 From 1c5ce0db33e8edf6451b3cc6d40f7c018fedb3aa Mon Sep 17 00:00:00 2001 From: Prabhat Sharma Date: Sun, 20 Jul 2025 17:04:44 -0700 Subject: [PATCH 4/6] typo fix --- charts/openobserve/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/openobserve/values.yaml b/charts/openobserve/values.yaml index 896ed9b..3110b4c 100644 --- a/charts/openobserve/values.yaml +++ b/charts/openobserve/values.yaml @@ -1035,7 +1035,7 @@ probes: timeoutSeconds: 5 successThreshold: 1 failureThreshold: 3 - 
terminationGracePeriodSeconds: 1200 # 20 minutes for now, since we are using pre-stop hook to flush data andit takes up to 10 minutes to flush data to s3 + terminationGracePeriodSeconds: 1200 # 20 minutes for now, since we are using pre-stop hook to flush data and it takes up to 10 minutes to flush data to s3 querier: enabled: false config: From eaa0a47ed53a12119ce2a8b7abb987395dcf758c Mon Sep 17 00:00:00 2001 From: mmosarafO2 Date: Wed, 19 Nov 2025 21:34:21 +0530 Subject: [PATCH 5/6] updated autoscaling for enterprise --- .../openobserve/templates/compactor-hpa.yaml | 2 +- .../openobserve/templates/ingester-hpa.yaml | 2 +- .../templates/ingester-statefulset.yaml | 107 +++++++++++++----- charts/openobserve/templates/router-hpa.yaml | 2 +- 4 files changed, 81 insertions(+), 32 deletions(-) diff --git a/charts/openobserve/templates/compactor-hpa.yaml b/charts/openobserve/templates/compactor-hpa.yaml index f9412fe..25ed6b2 100644 --- a/charts/openobserve/templates/compactor-hpa.yaml +++ b/charts/openobserve/templates/compactor-hpa.yaml @@ -1,4 +1,4 @@ -{{- if .Values.autoscaling.compactor.enabled }} +{{- if and .Values.autoscaling.compactor.enabled .Values.enterprise.enabled }} apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler metadata: diff --git a/charts/openobserve/templates/ingester-hpa.yaml b/charts/openobserve/templates/ingester-hpa.yaml index 182839d..de10275 100644 --- a/charts/openobserve/templates/ingester-hpa.yaml +++ b/charts/openobserve/templates/ingester-hpa.yaml @@ -1,4 +1,4 @@ -{{- if .Values.autoscaling.ingester.enabled }} +{{- if and .Values.autoscaling.ingester.enabled .Values.enterprise.enabled }} apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler metadata: diff --git a/charts/openobserve/templates/ingester-statefulset.yaml b/charts/openobserve/templates/ingester-statefulset.yaml index d2bf54c..8ec4594 100644 --- a/charts/openobserve/templates/ingester-statefulset.yaml +++ b/charts/openobserve/templates/ingester-statefulset.yaml @@ 
-129,41 +129,90 @@ spec:
                   - /bin/sh
                   - -c
                   - |
-                    # Get credentials from environment variables
+                    echo "=========================================="
+                    echo "PreStop Hook Started: $(date)"
+                    echo "Pod: $HOSTNAME"
+                    echo "=========================================="
+
+                    # Credentials and HTTP port come from the container environment (port falls back to 5080)
                     USER_EMAIL="$ZO_ROOT_USER_EMAIL"
                     USER_PASSWORD="$ZO_ROOT_USER_PASSWORD"
-
-                    # Create base64 encoded credentials for Authorization header
                     AUTH_HEADER=$(echo -n "${USER_EMAIL}:${USER_PASSWORD}" | base64)
-
-                    # Disable the node first
-                    echo "Disabling ingester node..."
-                    curl -X PUT "http://localhost:{{ .Values.config.ZO_HTTP_PORT }}/node/enable?value=false" \
-                      -H "Authorization: Basic ${AUTH_HEADER}"
-
-                    # returns 200 if successful and "true" if the node is disabled
-
-                    # Flush all data from memory to WAL. This does not flush data from ingester to s3.
-                    echo "Flushing data from ingester..."
-                    RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -X PUT "http://localhost:{{ .Values.config.ZO_HTTP_PORT }}/node/flush" \
+                    PORT="${ZO_HTTP_PORT:-5080}"
+
+                    # Step 1: Disable the node (triggers drain mode); the hook aborts unless this returns HTTP 200
+                    echo "[$(date)] Step 1: Calling PUT /node/enable?value=false to disable node..."
+                    DISABLE_RESPONSE=$(curl -s -w "\nHTTP_CODE:%{http_code}" \
+                      -X PUT "http://localhost:${PORT}/node/enable?value=false" \
                       -H "Authorization: Basic ${AUTH_HEADER}")
-                    if [ "$RESPONSE" -ne 200 ]; then
-                      echo "Error: Failed to flush data from ingester. HTTP response code: $RESPONSE"
+
+                    HTTP_CODE=$(echo "$DISABLE_RESPONSE" | grep "HTTP_CODE:" | cut -d: -f2)
+                    BODY=$(echo "$DISABLE_RESPONSE" | grep -v "HTTP_CODE:")
+
+                    echo "[$(date)] Response (HTTP $HTTP_CODE): $BODY"
+
+                    if [ "$HTTP_CODE" != "200" ]; then
+                      echo "[$(date)] ERROR: Failed to disable node"
                       exit 1
                     fi
-
-                    # returns 200 if successful and "true" if the node is flushed
-
-                    # We need another API to check if all the data has been moved to s3 or /flush should become async and move files to s3 as well
-                    # e.g /node/wal_status
-                    # Need to build this API. Until then, we will wait for 900 seconds.
-
-                    # Wait for 900 seconds after flush to ensure data is moved to s3
-                    # 15 minutes for now, since file movement to s3 may take up to 10 minutes
-                    echo "Waiting 900 seconds to flush data..."
-                    sleep 900
-
-                    echo "Pre-stop hook completed"
+
+                    echo "[$(date)] ✓ Node disabled - drain mode activated"
+                    echo ""
+
+                    # Step 2: Poll drain status every POLL_INTERVAL seconds until readyForShutdown is true or MAX_WAIT elapses
+                    echo "[$(date)] Step 2: Monitoring drain status via GET /node/drain_status..."
+
+                    START_TIME=$(date +%s)
+                    MAX_WAIT=1000  # ~16 minutes (leave buffer for k8s)
+                    POLL_INTERVAL=5
+
+                    while true; do
+                      CURRENT_TIME=$(date +%s)
+                      ELAPSED=$((CURRENT_TIME - START_TIME))
+
+                      if [ $ELAPSED -ge $MAX_WAIT ]; then
+                        echo "[$(date)] WARNING: Drain timeout after ${ELAPSED}s"
+                        echo "[$(date)] Exiting to allow Kubernetes to terminate pod"
+                        break
+                      fi
+
+                      # Call drain_status API (-f: HTTP errors become a non-zero exit; --max-time: a hung request cannot outlive MAX_WAIT)
+                      STATUS=$(curl -sf --max-time 10 "http://localhost:${PORT}/node/drain_status" \
+                        -H "Authorization: Basic ${AUTH_HEADER}")
+
+                      if [ $? -ne 0 ]; then
+                        echo "[$(date)] ERROR: Failed to get drain status"
+                        sleep $POLL_INTERVAL
+                        continue
+                      fi
+
+                      # Parse JSON response (without jq dependency): grab each "key":value pair and keep the text after the colon
+                      READY=$(echo "$STATUS" | grep -o '"readyForShutdown":[^,}]*' | cut -d: -f2 | tr -d ' ')
+                      PENDING=$(echo "$STATUS" | grep -o '"pendingParquetFiles":[^,}]*' | cut -d: -f2 | tr -d ' ')
+                      IS_DRAINING=$(echo "$STATUS" | grep -o '"isDraining":[^,}]*' | cut -d: -f2 | tr -d ' ')
+                      MEMORY_FLUSHED=$(echo "$STATUS" | grep -o '"memoryFlushed":[^,}]*' | cut -d: -f2 | tr -d ' ')
+
+                      echo "[$(date)] [${ELAPSED}s] Status:"
+                      echo " - isDraining: $IS_DRAINING"
+                      echo " - memoryFlushed: $MEMORY_FLUSHED"
+                      echo " - pendingParquetFiles: $PENDING"
+                      echo " - readyForShutdown: $READY"
+
+                      # Check if ready for shutdown (only the literal value true ends the wait early)
+                      if [ "$READY" = "true" ]; then
+                        echo ""
+                        echo "=========================================="
+                        echo "[$(date)] ✓ DRAIN COMPLETED in ${ELAPSED}s"
+                        echo "=========================================="
+                        echo "All parquet files uploaded to S3"
+                        echo "Pod is safe to terminate"
+                        break
+                      fi
+
+                      sleep $POLL_INTERVAL
+                    done
+
+                    echo "[$(date)] PreStop hook completed. Pod will now terminate."
{{- end }} resources: {{- toYaml .Values.resources.ingester | nindent 12 }} diff --git a/charts/openobserve/templates/router-hpa.yaml b/charts/openobserve/templates/router-hpa.yaml index 15faf5c..22676ad 100644 --- a/charts/openobserve/templates/router-hpa.yaml +++ b/charts/openobserve/templates/router-hpa.yaml @@ -1,4 +1,4 @@ -{{- if .Values.autoscaling.router.enabled }} +{{- if and .Values.autoscaling.router.enabled .Values.enterprise.enabled }} apiVersion: autoscaling/v2 kind: HorizontalPodAutoscaler metadata: From 754f633b8e97f6cb5c63b48449f8371ea8db10cb Mon Sep 17 00:00:00 2001 From: mmosarafO2 Date: Thu, 20 Nov 2025 08:55:38 +0530 Subject: [PATCH 6/6] Updated release version from v0.16.1 to v0.16.2 --- charts/openobserve-standalone/Chart.yaml | 4 ++-- charts/openobserve-standalone/values.yaml | 2 +- charts/openobserve/Chart.yaml | 4 ++-- charts/openobserve/values.yaml | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/charts/openobserve-standalone/Chart.yaml b/charts/openobserve-standalone/Chart.yaml index ab36518..c102581 100644 --- a/charts/openobserve-standalone/Chart.yaml +++ b/charts/openobserve-standalone/Chart.yaml @@ -15,13 +15,13 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.16.1 +version: 0.16.2 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. 
-appVersion: "v0.16.1" +appVersion: "v0.16.2" dependencies: - name: minio diff --git a/charts/openobserve-standalone/values.yaml b/charts/openobserve-standalone/values.yaml index 27c6115..7210a2c 100644 --- a/charts/openobserve-standalone/values.yaml +++ b/charts/openobserve-standalone/values.yaml @@ -6,7 +6,7 @@ image: repository: o2cr.ai/openobserve/openobserve pullPolicy: IfNotPresent # Overrides the image tag whose default is the chart appVersion. - tag: "v0.16.1" + tag: "v0.16.2" busybox: repository: busybox tag: 1.37.0 diff --git a/charts/openobserve/Chart.yaml b/charts/openobserve/Chart.yaml index 56bc4c2..177659f 100644 --- a/charts/openobserve/Chart.yaml +++ b/charts/openobserve/Chart.yaml @@ -15,13 +15,13 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.16.1 +version: 0.16.2 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: "v0.16.1" +appVersion: "v0.16.2" dependencies: - name: etcd diff --git a/charts/openobserve/values.yaml b/charts/openobserve/values.yaml index 3110b4c..6097f95 100644 --- a/charts/openobserve/values.yaml +++ b/charts/openobserve/values.yaml @@ -10,7 +10,7 @@ image: enterprise: repository: o2cr.ai/openobserve/openobserve-enterprise # Overrides the image tag whose default is the chart appVersion. - tag: "v0.16.1" + tag: "v0.16.2" reportserver: repository: o2cr.ai/openobserve/report-server tag: "v0.11.0-70baf7a"