From 740c08357a74254fc2dd03d2ecae9446443e41e7 Mon Sep 17 00:00:00 2001
From: Prabhat Sharma <hi.prabhat@gmail.com>
Date: Sun, 20 Jul 2025 11:30:51 -0700
Subject: [PATCH 1/6] add preStop hook for autoscaling down

---
 charts/openobserve/README.md                  |  4 +-
 .../templates/ingester-statefulset.yaml       | 40 +++++++++++++++++++
 charts/openobserve/values.yaml                |  4 +-
 3 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/charts/openobserve/README.md b/charts/openobserve/README.md
index 5ec97a4..61677be 100644
--- a/charts/openobserve/README.md
+++ b/charts/openobserve/README.md
@@ -1,4 +1,4 @@
-# OpenObserve helm chart
+# OpenObserve Helm Chart
 
 ## Amazon EKS
 
@@ -11,7 +11,7 @@ You must set a minimum of 2 values:
 1. IAM role for the serviceAccount to gain AWS IAM credentials to access s3
    - serviceAccount.annotations."eks.amazonaws.com/role-arn"
 
-## Install
+## Installation
 
 Install the Cloud Native PostgreSQL Operator. This is a prerequisite for openobserve helm chart. This helm chart sets up a postgres database cluster (1 primary + 1 replica) and uses it as metadata store of OpenObserve.
 ```shell
diff --git a/charts/openobserve/templates/ingester-statefulset.yaml b/charts/openobserve/templates/ingester-statefulset.yaml
index 25e6d06..ca66596 100644
--- a/charts/openobserve/templates/ingester-statefulset.yaml
+++ b/charts/openobserve/templates/ingester-statefulset.yaml
@@ -121,6 +121,46 @@ spec:
             successThreshold: {{ .Values.probes.ingester.config.readinessProbe.successThreshold | default 1 }}
             failureThreshold: {{ .Values.probes.ingester.config.readinessProbe.failureThreshold | default 3 }}
           {{- end }}
+          {{- if .Values.autoscaling.ingester.enabled }}
+          lifecycle:
+            preStop:
+              exec:
+                command:
+                - /bin/sh
+                - -c
+                - |
+                  # Get credentials from environment variables
+                  USER_EMAIL="$ZO_ROOT_USER_EMAIL"
+                  USER_PASSWORD="$ZO_ROOT_USER_PASSWORD"
+                  
+                  # Create base64 encoded credentials for Authorization header
+                  AUTH_HEADER=$(echo -n "${USER_EMAIL}:${USER_PASSWORD}" | base64)
+                  
+                  # Disable the node first
+                  echo "Disabling ingester node..."
+                  curl -X PUT "http://localhost:{{ .Values.config.ZO_HTTP_PORT }}/node/enable?value=false" \
+                    -H "Authorization: Basic ${AUTH_HEADER}" 
+                  
+                  # returns 200 if successful and "true" if the node is disabled
+
+                  # Flush all data from memory to WAL. This does not flush data from ingester to s3.
+                  echo "Flushing data from ingester..."
+                  curl -X PUT "http://localhost:{{ .Values.config.ZO_HTTP_PORT }}/node/flush" \
+                    -H "Authorization: Basic ${AUTH_HEADER}" 
+                  
+                  # returns 200 if successful and "true" if the node is flushed
+
+                  # We need another API to check if all the data has been moved to s3 or /flush should become async and move files to s3 as well
+                  # e.g /node/wal_status
+                  # Need to build this API. Until then, we will wait for 900 seconds.
+
+                  # Wait for 900 seconds after flush to ensure data is moved to s3
+                  # 15 minutes for now, since file movement to s3 may take up to 10 minutes
+                  echo "Waiting 900 seconds to flush data..."
+                  sleep 900
+                  
+                  echo "Pre-stop hook completed"
+          {{- end }}
           resources:
             {{- toYaml .Values.resources.ingester | nindent 12 }}
           envFrom:
diff --git a/charts/openobserve/values.yaml b/charts/openobserve/values.yaml
index b845ac3..1a63087 100644
--- a/charts/openobserve/values.yaml
+++ b/charts/openobserve/values.yaml
@@ -1028,14 +1028,14 @@ probes:
         timeoutSeconds: 5
         successThreshold: 1
         failureThreshold: 3
-        terminationGracePeriodSeconds: 30
+        terminationGracePeriodSeconds: 1200 # 20 minutes for now, since we are using pre-stop hook to flush data andit takes up to 10 minutes to flush data to s3
       livenessProbe:
         initialDelaySeconds: 10
         periodSeconds: 10
         timeoutSeconds: 5
         successThreshold: 1
         failureThreshold: 3
-        terminationGracePeriodSeconds: 30
+        terminationGracePeriodSeconds: 1200 # 20 minutes for now, since we are using pre-stop hook to flush data andit takes up to 10 minutes to flush data to s3
   querier:
     enabled: false
     config:

From 69254b7975111d52d31487bbf247a9a26a9e984d Mon Sep 17 00:00:00 2001
From: Prabhat Sharma <hi.prabhat@gmail.com>
Date: Sun, 20 Jul 2025 15:15:17 -0700
Subject: [PATCH 2/6] Update
 charts/openobserve/templates/ingester-statefulset.yaml

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 charts/openobserve/templates/ingester-statefulset.yaml | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/charts/openobserve/templates/ingester-statefulset.yaml b/charts/openobserve/templates/ingester-statefulset.yaml
index ca66596..d2bf54c 100644
--- a/charts/openobserve/templates/ingester-statefulset.yaml
+++ b/charts/openobserve/templates/ingester-statefulset.yaml
@@ -145,8 +145,12 @@ spec:
 
                   # Flush all data from memory to WAL. This does not flush data from ingester to s3.
                   echo "Flushing data from ingester..."
-                  curl -X PUT "http://localhost:{{ .Values.config.ZO_HTTP_PORT }}/node/flush" \
-                    -H "Authorization: Basic ${AUTH_HEADER}" 
+                  RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -X PUT "http://localhost:{{ .Values.config.ZO_HTTP_PORT }}/node/flush" \
+                    -H "Authorization: Basic ${AUTH_HEADER}")
+                  if [ "$RESPONSE" -ne 200 ]; then
+                    echo "Error: Failed to flush data from ingester. HTTP response code: $RESPONSE"
+                    exit 1
+                  fi
                   
                   # returns 200 if successful and "true" if the node is flushed
 

From 72e95fb9bc750a7f9c1f9d8892f52d33b122670b Mon Sep 17 00:00:00 2001
From: Prabhat Sharma <hi.prabhat@gmail.com>
Date: Sun, 20 Jul 2025 15:15:43 -0700
Subject: [PATCH 3/6] Update charts/openobserve/values.yaml

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 charts/openobserve/values.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/charts/openobserve/values.yaml b/charts/openobserve/values.yaml
index 1a63087..896ed9b 100644
--- a/charts/openobserve/values.yaml
+++ b/charts/openobserve/values.yaml
@@ -1028,7 +1028,7 @@ probes:
         timeoutSeconds: 5
         successThreshold: 1
         failureThreshold: 3
-        terminationGracePeriodSeconds: 1200 # 20 minutes for now, since we are using pre-stop hook to flush data andit takes up to 10 minutes to flush data to s3
+        terminationGracePeriodSeconds: 1200 # 20 minutes for now, since we are using pre-stop hook to flush data and it takes up to 10 minutes to flush data to s3
       livenessProbe:
         initialDelaySeconds: 10
         periodSeconds: 10

From 1c5ce0db33e8edf6451b3cc6d40f7c018fedb3aa Mon Sep 17 00:00:00 2001
From: Prabhat Sharma <hi.prabhat@gmail.com>
Date: Sun, 20 Jul 2025 17:04:44 -0700
Subject: [PATCH 4/6] typo fix

---
 charts/openobserve/values.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/charts/openobserve/values.yaml b/charts/openobserve/values.yaml
index 896ed9b..3110b4c 100644
--- a/charts/openobserve/values.yaml
+++ b/charts/openobserve/values.yaml
@@ -1035,7 +1035,7 @@ probes:
         timeoutSeconds: 5
         successThreshold: 1
         failureThreshold: 3
-        terminationGracePeriodSeconds: 1200 # 20 minutes for now, since we are using pre-stop hook to flush data andit takes up to 10 minutes to flush data to s3
+        terminationGracePeriodSeconds: 1200 # 20 minutes for now, since we are using pre-stop hook to flush data and it takes up to 10 minutes to flush data to s3
   querier:
     enabled: false
     config:

From eaa0a47ed53a12119ce2a8b7abb987395dcf758c Mon Sep 17 00:00:00 2001
From: mmosarafO2 <mosraf@openobserve.ai>
Date: Wed, 19 Nov 2025 21:34:21 +0530
Subject: [PATCH 5/6] updated autoscaling for enterprise

---
 .../openobserve/templates/compactor-hpa.yaml  |   2 +-
 .../openobserve/templates/ingester-hpa.yaml   |   2 +-
 .../templates/ingester-statefulset.yaml       | 113 +++++++++++++-----
 charts/openobserve/templates/router-hpa.yaml  |   2 +-
 4 files changed, 86 insertions(+), 33 deletions(-)

diff --git a/charts/openobserve/templates/compactor-hpa.yaml b/charts/openobserve/templates/compactor-hpa.yaml
index f9412fe..25ed6b2 100644
--- a/charts/openobserve/templates/compactor-hpa.yaml
+++ b/charts/openobserve/templates/compactor-hpa.yaml
@@ -1,4 +1,4 @@
-{{- if .Values.autoscaling.compactor.enabled }}
+{{- if and .Values.autoscaling.compactor.enabled .Values.enterprise.enabled }}
 apiVersion: autoscaling/v2
 kind: HorizontalPodAutoscaler
 metadata:
diff --git a/charts/openobserve/templates/ingester-hpa.yaml b/charts/openobserve/templates/ingester-hpa.yaml
index 182839d..de10275 100644
--- a/charts/openobserve/templates/ingester-hpa.yaml
+++ b/charts/openobserve/templates/ingester-hpa.yaml
@@ -1,4 +1,4 @@
-{{- if .Values.autoscaling.ingester.enabled }}
+{{- if and .Values.autoscaling.ingester.enabled .Values.enterprise.enabled }}
 apiVersion: autoscaling/v2
 kind: HorizontalPodAutoscaler
 metadata:
diff --git a/charts/openobserve/templates/ingester-statefulset.yaml b/charts/openobserve/templates/ingester-statefulset.yaml
index d2bf54c..8ec4594 100644
--- a/charts/openobserve/templates/ingester-statefulset.yaml
+++ b/charts/openobserve/templates/ingester-statefulset.yaml
@@ -129,41 +129,94 @@ spec:
                 - /bin/sh
                 - -c
                 - |
-                  # Get credentials from environment variables
+                  echo "=========================================="
+                  echo "PreStop Hook Started: $(date)"
+                  echo "Pod: $HOSTNAME"
+                  echo "=========================================="
+
+                  # Get credentials from environment
                   USER_EMAIL="$ZO_ROOT_USER_EMAIL"
                   USER_PASSWORD="$ZO_ROOT_USER_PASSWORD"
-                  
-                  # Create base64 encoded credentials for Authorization header
-                  AUTH_HEADER=$(echo -n "${USER_EMAIL}:${USER_PASSWORD}" | base64)
-                  
-                  # Disable the node first
-                  echo "Disabling ingester node..."
-                  curl -X PUT "http://localhost:{{ .Values.config.ZO_HTTP_PORT }}/node/enable?value=false" \
-                    -H "Authorization: Basic ${AUTH_HEADER}" 
-                  
-                  # returns 200 if successful and "true" if the node is disabled
-
-                  # Flush all data from memory to WAL. This does not flush data from ingester to s3.
-                  echo "Flushing data from ingester..."
-                  RESPONSE=$(curl -s -o /dev/null -w "%{http_code}" -X PUT "http://localhost:{{ .Values.config.ZO_HTTP_PORT }}/node/flush" \
+                  # base64 wraps its output at 76 columns, so strip newlines to keep long
+                  # credentials on a single header line; printf avoids non-portable `echo -n`.
+                  AUTH_HEADER=$(printf '%s' "${USER_EMAIL}:${USER_PASSWORD}" | base64 | tr -d '\n')
+                  PORT="${ZO_HTTP_PORT:-5080}"
+                  # Bound each HTTP call so a hung server cannot stall the hook indefinitely.
+                  CURL_TIMEOUT=10
+
+                  # Step 1: Disable the node (triggers drain mode)
+                  echo "[$(date)] Step 1: Calling PUT /node/enable?value=false to disable node..."
+                  DISABLE_RESPONSE=$(curl -s --max-time "$CURL_TIMEOUT" -w "\nHTTP_CODE:%{http_code}" \
+                    -X PUT "http://localhost:${PORT}/node/enable?value=false" \
                     -H "Authorization: Basic ${AUTH_HEADER}")
-                  if [ "$RESPONSE" -ne 200 ]; then
-                    echo "Error: Failed to flush data from ingester. HTTP response code: $RESPONSE"
+
+                  HTTP_CODE=$(echo "$DISABLE_RESPONSE" | grep "HTTP_CODE:" | cut -d: -f2)
+                  BODY=$(echo "$DISABLE_RESPONSE" | grep -v "HTTP_CODE:")
+
+                  echo "[$(date)] Response (HTTP $HTTP_CODE): $BODY"
+
+                  if [ "$HTTP_CODE" != "200" ]; then
+                    echo "[$(date)] ERROR: Failed to disable node"
                     exit 1
                   fi
-                  
-                  # returns 200 if successful and "true" if the node is flushed
-
-                  # We need another API to check if all the data has been moved to s3 or /flush should become async and move files to s3 as well
-                  # e.g /node/wal_status
-                  # Need to build this API. Until then, we will wait for 900 seconds.
-
-                  # Wait for 900 seconds after flush to ensure data is moved to s3
-                  # 15 minutes for now, since file movement to s3 may take up to 10 minutes
-                  echo "Waiting 900 seconds to flush data..."
-                  sleep 900
-                  
-                  echo "Pre-stop hook completed"
+
+                  echo "[$(date)] ✓ Node disabled - drain mode activated"
+                  echo ""
+
+                  # Step 2: Poll drain status until ready for shutdown
+                  echo "[$(date)] Step 2: Monitoring drain status via GET /node/drain_status..."
+
+                  START_TIME=$(date +%s)
+                  MAX_WAIT=1000  # ~16 minutes (leave buffer for k8s)
+                  POLL_INTERVAL=5
+
+                  while true; do
+                    CURRENT_TIME=$(date +%s)
+                    ELAPSED=$((CURRENT_TIME - START_TIME))
+
+                    if [ "$ELAPSED" -ge "$MAX_WAIT" ]; then
+                      echo "[$(date)] WARNING: Drain timeout after ${ELAPSED}s"
+                      echo "[$(date)] Exiting to allow Kubernetes to terminate pod"
+                      break
+                    fi
+
+                    # Call drain_status API; -f makes HTTP 4xx/5xx a curl failure so an
+                    # error page is reported and retried instead of parsed as status JSON.
+                    if ! STATUS=$(curl -s -f --max-time "$CURL_TIMEOUT" \
+                      "http://localhost:${PORT}/node/drain_status" \
+                      -H "Authorization: Basic ${AUTH_HEADER}"); then
+                      echo "[$(date)] ERROR: Failed to get drain status"
+                      sleep $POLL_INTERVAL
+                      continue
+                    fi
+
+                    # Parse JSON response (without jq dependency)
+                    READY=$(echo "$STATUS" | grep -o '"readyForShutdown":[^,}]*' | cut -d: -f2 | tr -d ' ')
+                    PENDING=$(echo "$STATUS" | grep -o '"pendingParquetFiles":[^,}]*' | cut -d: -f2 | tr -d ' ')
+                    IS_DRAINING=$(echo "$STATUS" | grep -o '"isDraining":[^,}]*' | cut -d: -f2 | tr -d ' ')
+                    MEMORY_FLUSHED=$(echo "$STATUS" | grep -o '"memoryFlushed":[^,}]*' | cut -d: -f2 | tr -d ' ')
+
+                    echo "[$(date)] [${ELAPSED}s] Status:"
+                    echo "  - isDraining: $IS_DRAINING"
+                    echo "  - memoryFlushed: $MEMORY_FLUSHED"
+                    echo "  - pendingParquetFiles: $PENDING"
+                    echo "  - readyForShutdown: $READY"
+
+                    # Check if ready for shutdown
+                    if [ "$READY" = "true" ]; then
+                      echo ""
+                      echo "=========================================="
+                      echo "[$(date)] ✓ DRAIN COMPLETED in ${ELAPSED}s"
+                      echo "=========================================="
+                      echo "All parquet files uploaded to S3"
+                      echo "Pod is safe to terminate"
+                      break
+                    fi
+
+                    sleep $POLL_INTERVAL
+                  done
+
+                  echo "[$(date)] PreStop hook completed. Pod will now terminate."
           {{- end }}
           resources:
             {{- toYaml .Values.resources.ingester | nindent 12 }}
diff --git a/charts/openobserve/templates/router-hpa.yaml b/charts/openobserve/templates/router-hpa.yaml
index 15faf5c..22676ad 100644
--- a/charts/openobserve/templates/router-hpa.yaml
+++ b/charts/openobserve/templates/router-hpa.yaml
@@ -1,4 +1,4 @@
-{{- if .Values.autoscaling.router.enabled }}
+{{- if and .Values.autoscaling.router.enabled .Values.enterprise.enabled }}
 apiVersion: autoscaling/v2
 kind: HorizontalPodAutoscaler
 metadata:

From 754f633b8e97f6cb5c63b48449f8371ea8db10cb Mon Sep 17 00:00:00 2001
From: mmosarafO2 <mosraf@openobserve.ai>
Date: Thu, 20 Nov 2025 08:55:38 +0530
Subject: [PATCH 6/6] Updated release version from v0.16.1 to v0.16.2

---
 charts/openobserve-standalone/Chart.yaml  | 4 ++--
 charts/openobserve-standalone/values.yaml | 2 +-
 charts/openobserve/Chart.yaml             | 4 ++--
 charts/openobserve/values.yaml            | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/charts/openobserve-standalone/Chart.yaml b/charts/openobserve-standalone/Chart.yaml
index ab36518..c102581 100644
--- a/charts/openobserve-standalone/Chart.yaml
+++ b/charts/openobserve-standalone/Chart.yaml
@@ -15,13 +15,13 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.16.1
+version: 0.16.2
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
 # follow Semantic Versioning. They should reflect the version the application is using.
 # It is recommended to use it with quotes.
-appVersion: "v0.16.1"
+appVersion: "v0.16.2"
 
 dependencies:
   - name: minio
diff --git a/charts/openobserve-standalone/values.yaml b/charts/openobserve-standalone/values.yaml
index 27c6115..7210a2c 100644
--- a/charts/openobserve-standalone/values.yaml
+++ b/charts/openobserve-standalone/values.yaml
@@ -6,7 +6,7 @@ image:
   repository: o2cr.ai/openobserve/openobserve
   pullPolicy: IfNotPresent
   # Overrides the image tag whose default is the chart appVersion.
-  tag: "v0.16.1"
+  tag: "v0.16.2"
   busybox:
     repository: busybox
     tag: 1.37.0
diff --git a/charts/openobserve/Chart.yaml b/charts/openobserve/Chart.yaml
index 56bc4c2..177659f 100644
--- a/charts/openobserve/Chart.yaml
+++ b/charts/openobserve/Chart.yaml
@@ -15,13 +15,13 @@ type: application
 # This is the chart version. This version number should be incremented each time you make changes
 # to the chart and its templates, including the app version.
 # Versions are expected to follow Semantic Versioning (https://semver.org/)
-version: 0.16.1
+version: 0.16.2
 
 # This is the version number of the application being deployed. This version number should be
 # incremented each time you make changes to the application. Versions are not expected to
 # follow Semantic Versioning. They should reflect the version the application is using.
 # It is recommended to use it with quotes.
-appVersion: "v0.16.1"
+appVersion: "v0.16.2"
 
 dependencies:
   - name: etcd
diff --git a/charts/openobserve/values.yaml b/charts/openobserve/values.yaml
index 3110b4c..6097f95 100644
--- a/charts/openobserve/values.yaml
+++ b/charts/openobserve/values.yaml
@@ -10,7 +10,7 @@ image:
   enterprise:
     repository: o2cr.ai/openobserve/openobserve-enterprise
     # Overrides the image tag whose default is the chart appVersion.
-    tag: "v0.16.1"
+    tag: "v0.16.2"
   reportserver:
     repository: o2cr.ai/openobserve/report-server
     tag: "v0.11.0-70baf7a"