From 95f30f6d92ac0ae5f6f830ae0c115617fb00e721 Mon Sep 17 00:00:00 2001 From: RishabhSaini Date: Fri, 7 Nov 2025 14:24:34 -0500 Subject: [PATCH 1/2] update GAIE to slo aware routing --- .../inferencepool/SLO-ROUTING-README.md | 296 ++++++++++++++++++ .../templates/epp-deployment.yaml | 136 ++++++++ .../inferencepool/templates/epp-service.yaml | 24 ++ .../templates/latency-predictor-config.yaml | 41 +++ .../inferencepool/values-slo-example.yaml | 124 ++++++++ config/charts/inferencepool/values.yaml | 60 ++++ 6 files changed, 681 insertions(+) create mode 100644 config/charts/inferencepool/SLO-ROUTING-README.md create mode 100644 config/charts/inferencepool/templates/latency-predictor-config.yaml create mode 100644 config/charts/inferencepool/values-slo-example.yaml diff --git a/config/charts/inferencepool/SLO-ROUTING-README.md b/config/charts/inferencepool/SLO-ROUTING-README.md new file mode 100644 index 000000000..ea8c8d801 --- /dev/null +++ b/config/charts/inferencepool/SLO-ROUTING-README.md @@ -0,0 +1,296 @@ +# SLO-Aware Routing with Latency Prediction + +This document describes the modifications made to the InferencePool Helm chart to support SLO-aware routing with latency prediction sidecars. + +## Overview + +The SLO-aware routing feature enables intelligent request routing based on predicted latency using machine learning models. The system consists of: + +1. **EPP (Endpoint Picker) Container**: Main routing logic with latency prediction enabled +2. **Training Server Sidecar**: Continuously trains XGBoost models on observed latency metrics +3. **Prediction Server Sidecars**: Multiple replicas that serve latency predictions for TTFT (Time to First Token) and TPOT (Time Per Output Token) + +## Architecture + +``` +┌─────────────────────────────────────────────────────┐ +│ EPP Pod │ +├──────────────┬──────────────┬──────────────────────┤ +│ EPP │ Training │ Prediction Servers │ +│ Container │ Server │ (3 replicas) │ +│ │ │ │ +│ Port 9002 │ Port 8000 │ Ports 8001-8003 │ +│ (ext-proc) │ (training) │ (prediction) │ +└──────────────┴──────────────┴──────────────────────┘ + │ │ │ + │ └──────┬───────────┘ + │ │ + │ Model Training + │ & Synchronization + │ + Routing Decision + (with latency prediction) +``` + +## Modified Files + +### 1. `templates/epp-deployment.yaml` +- Added support for `sidecars.trainingServer` configuration +- Added support for `sidecars.predictionServers` with configurable replicas +- Automatically creates volumes for model storage +- Injects ConfigMaps for training and prediction server configuration + +### 2. `templates/epp-service.yaml` +- Automatically exposes ports for training server (8000) +- Automatically exposes ports for prediction servers (8001-8003 by default) +- Ports are only added when sidecars are enabled + +### 3. `templates/latency-predictor-config.yaml` (NEW) +- Creates ConfigMap for training server configuration +- Creates ConfigMap for prediction server configuration +- Supports customizable model paths, retraining intervals, and other parameters + +### 4. `values.yaml` +- Added comprehensive `sidecars` section with commented examples +- Supports configuration for training and prediction server images, resources, and behavior + +### 5. 
`values-slo-example.yaml` (NEW) +- Complete working example of SLO-aware routing configuration +- Demonstrates all required settings including EPP flags, environment variables, and plugin configuration + +## Usage + +### Quick Start with Example Configuration + +```bash +# Install with SLO-aware routing enabled +helm install my-slo-pool oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool \ + --namespace inference \ + --values values-slo-example.yaml \ + --set inferencePool.modelServers.matchLabels.app=my-model-server +``` + +### Custom Configuration + +Create a custom values file: + +```yaml +inferenceExtension: + image: + hub: quay.io/your-org + name: epp + tag: slo-experimental + + flags: + - name: enable-latency-predictor + value: "true" + - name: v + value: "4" + + env: + - name: PREDICTION_SERVER_URL + value: "http://localhost:8001,http://localhost:8002,http://localhost:8003" + - name: TRAINING_SERVER_URL + value: "http://localhost:8000" + - name: LATENCY_MAX_SAMPLE_SIZE + value: "10000" + + pluginsCustomConfig: + slo-plugins.yaml: | + apiVersion: inference.networking.x-k8s.io/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: slo-request-tracker + - type: slo-scorer + - type: slo-aware-profile-handler + schedulingProfiles: + - name: slo + plugins: + - pluginRef: slo-request-tracker + - pluginRef: slo-scorer + + sidecars: + trainingServer: + enabled: true + image: + hub: quay.io/your-org + name: latency-training + tag: latest + resources: + requests: + cpu: "2000m" + memory: "4Gi" + limits: + cpu: "4000m" + memory: "8Gi" + + predictionServers: + enabled: true + replicas: 3 + image: + hub: quay.io/your-org + name: latency-prediction + tag: latest + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" +``` + +## Configuration Reference + +### Training Server Configuration + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `sidecars.trainingServer.enabled` | Enable training server sidecar | `false` | +| `sidecars.trainingServer.image.hub` | Container registry | `us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension` | +| `sidecars.trainingServer.image.name` | Image name | `latency-training` | +| `sidecars.trainingServer.image.tag` | Image tag | `latest` | +| `sidecars.trainingServer.config.retrainingIntervalSec` | Retraining interval in seconds | `1` | +| `sidecars.trainingServer.config.minSamplesForRetrain` | Minimum samples before retraining | `100` | +| `sidecars.trainingServer.config.modelType` | ML model type | `xgboost` | +| `sidecars.trainingServer.persistence.enabled` | Enable persistent storage for models | `false` | + +### Prediction Server Configuration + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `sidecars.predictionServers.enabled` | Enable prediction server sidecars | `false` | +| `sidecars.predictionServers.replicas` | Number of prediction server replicas | `3` | +| `sidecars.predictionServers.image.hub` | Container registry | `us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension` | +| `sidecars.predictionServers.image.name` | Image name | `latency-prediction` | +| `sidecars.predictionServers.image.tag` | Image tag | `latest` | +| `sidecars.predictionServers.config.modelSyncIntervalSec` | Model sync interval in seconds | `10` | +| `sidecars.predictionServers.config.modelType` | ML model type | `xgboost` | + +### EPP Environment Variables + +| Variable | Description | Default | 
+|----------|-------------|---------|
+| `PREDICTION_SERVER_URL` | Comma-separated prediction server URLs | `http://localhost:8001,http://localhost:8002,http://localhost:8003` |
+| `TRAINING_SERVER_URL` | Training server URL | `http://localhost:8000` |
+| `LATENCY_MAX_SAMPLE_SIZE` | Maximum sample size for latency prediction | `10000` |
+| `NEG_HEADROOM_TPOT_WEIGHT` | Weight for TPOT in negative headroom calculation | `0.2` |
+| `NEG_HEADROOM_TTFT_WEIGHT` | Weight for TTFT in negative headroom calculation | `0.8` |
+
+## Building Container Images
+
+### Prerequisites
+
+```bash
+cd /path/to/gateway-api-inference-extension
+git checkout slo-prediction-experimental
+```
+
+### Build EPP Image
+
+```bash
+export IMAGE_REGISTRY="quay.io/your-org"
+export EPP_TAG="slo-experimental"
+make image-build image-push
+```
+
+### Build Latency Predictor Images
+
+```bash
+cd latencypredictor-v1
+
+# Edit build-deploy.sh to set your registry
+# Then build and push:
+./build-deploy.sh build
+
+# Tag and push manually
+docker tag latencypredictor-v2-training-server:latest ${IMAGE_REGISTRY}/latency-training:slo-experimental
+docker tag latencypredictor-v2-prediction-server:latest ${IMAGE_REGISTRY}/latency-prediction:slo-experimental
+docker push ${IMAGE_REGISTRY}/latency-training:slo-experimental
+docker push ${IMAGE_REGISTRY}/latency-prediction:slo-experimental
+```
+
+## Verification
+
+After deployment, verify that all containers are running:
+
+```bash
+# Check pod status
+kubectl get pods -n your-namespace
+
+# Expected: 1 pod with 5 containers (1 EPP + 1 training + 3 prediction)
+
+# Check EPP logs
+kubectl logs <epp-pod-name> -n your-namespace -c epp
+
+# Check training server logs
+kubectl logs <epp-pod-name> -n your-namespace -c training-server
+
+# Check prediction server logs
+kubectl logs <epp-pod-name> -n your-namespace -c prediction-server-1
+```
+
+## Service Ports
+
+When sidecars are enabled, the service automatically exposes these ports:
+
+- `9002`: EPP gRPC ext-proc (always)
+- `9090`: EPP metrics (always)
+- `8000`: Training server (when `trainingServer.enabled: true`)
+- `8001` through `8000+N`: Prediction servers (when `predictionServers.enabled: true`, where `N` is the number of replicas)
+
+## Plugins
+
+SLO-aware routing requires these plugins:
+
+- `slo-request-tracker`: Tracks request SLO requirements
+- `slo-scorer`: Scores endpoints based on predicted latency vs SLO
+- `slo-aware-profile-handler`: Handles different scheduling profiles
+- `max-score-picker`: Selects the endpoint with the maximum score
+
+### Scheduling Profiles
+
+- **default**: Standard routing with queue and kv-cache scoring
+- **slo**: SLO-aware routing using latency predictions
+
+## Troubleshooting
+
+### Sidecars Not Starting
+
+Check whether the images are accessible:
+```bash
+kubectl describe pod <epp-pod-name> -n your-namespace
+```
+
+### Training Server Issues
+
+Check the ConfigMap and the logs:
+```bash
+kubectl get configmap latency-predictor-config -n your-namespace -o yaml
+kubectl logs <epp-pod-name> -c training-server -n your-namespace
+```
+
+### Prediction Server Issues
+
+Verify that the prediction servers can reach the training server (a manual port-forward check is sketched in the appendix at the end of this document):
+```bash
+kubectl exec <epp-pod-name> -c prediction-server-1 -n your-namespace -- \
+  curl http://localhost:8000/healthz
+```
+
+## Integration with llm-d
+
+To use this chart in llm-d, update your helmfile:
+
+```yaml
+releases:
+  - name: gaie-slo
+    namespace: llm-d-slo
+    chart: oci://quay.io/your-org/charts/inferencepool
+    version: v1.0.1-slo
+    values:
+      - gaie-slo/values.yaml
+      - gaie-slo/values-slo.yaml
+```
+
+See the main documentation for complete integration instructions.
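+
+## Appendix: Manual Health Checks
+
+As a quick sanity check from your workstation, you can port-forward the EPP Service and query the sidecar health endpoints directly. This is a minimal sketch: `<epp-service-name>` is a placeholder for the Service created by this chart, and the ports and `/healthz`/`/readyz` paths mirror the probes defined in `templates/epp-deployment.yaml`.
+
+```bash
+# Forward the training server (8000) and the first prediction server (8001)
+kubectl port-forward svc/<epp-service-name> 8000:8000 8001:8001 -n your-namespace &
+
+# Training server liveness and readiness
+curl http://localhost:8000/healthz
+curl http://localhost:8000/readyz
+
+# First prediction server (repeat on 8002/8003 when replicas=3)
+curl http://localhost:8001/healthz
+curl http://localhost:8001/readyz
+```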
diff --git a/config/charts/inferencepool/templates/epp-deployment.yaml b/config/charts/inferencepool/templates/epp-deployment.yaml index f01699a96..b5570b9d7 100644 --- a/config/charts/inferencepool/templates/epp-deployment.yaml +++ b/config/charts/inferencepool/templates/epp-deployment.yaml @@ -31,6 +31,10 @@ spec: - "json" - --config-file - "/config/{{ .Values.inferenceExtension.pluginsConfigFile }}" + {{- if ne .Values.inferencePool.apiVersion "inference.networking.k8s.io/v1" }} + - --pool-group + - "{{ (split "/" .Values.inferencePool.apiVersion)._0 }}" + {{- end }} {{- range .Values.inferenceExtension.flags }} - "--{{ .name }}" - "{{ .value }}" @@ -84,10 +88,142 @@ spec: volumeMounts: - name: plugins-config-volume mountPath: "/config" + {{- if .Values.inferenceExtension.sidecars }} + {{- if .Values.inferenceExtension.sidecars.trainingServer }} + {{- if .Values.inferenceExtension.sidecars.trainingServer.enabled }} + # Training Server Sidecar Container + - name: training-server + image: {{ .Values.inferenceExtension.sidecars.trainingServer.image.hub }}/{{ .Values.inferenceExtension.sidecars.trainingServer.image.name }}:{{ .Values.inferenceExtension.sidecars.trainingServer.image.tag }} + imagePullPolicy: {{ .Values.inferenceExtension.sidecars.trainingServer.image.pullPolicy | default "Always" }} + ports: + - containerPort: 8000 + name: training-port + livenessProbe: + httpGet: + path: /healthz + port: 8000 + initialDelaySeconds: 30 + periodSeconds: 20 + readinessProbe: + httpGet: + path: /readyz + port: 8000 + initialDelaySeconds: 45 + periodSeconds: 10 + {{- if .Values.inferenceExtension.sidecars.trainingServer.resources }} + resources: + {{- toYaml .Values.inferenceExtension.sidecars.trainingServer.resources | nindent 10 }} + {{- end }} + envFrom: + {{- if .Values.inferenceExtension.sidecars.trainingServer.envFrom }} + {{- toYaml .Values.inferenceExtension.sidecars.trainingServer.envFrom | nindent 10 }} + {{- else }} + - configMapRef: + name: latency-predictor-config + {{- end }} + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SERVER_TYPE + value: "training" + {{- if .Values.inferenceExtension.sidecars.trainingServer.env }} + {{- toYaml .Values.inferenceExtension.sidecars.trainingServer.env | nindent 8 }} + {{- end }} + volumeMounts: + - name: training-server-storage + mountPath: /models + {{- end }} + {{- end }} + {{- if .Values.inferenceExtension.sidecars.predictionServers }} + {{- if .Values.inferenceExtension.sidecars.predictionServers.enabled }} + {{- $replicas := int (.Values.inferenceExtension.sidecars.predictionServers.replicas | default 3) }} + {{- range $i := until $replicas }} + # Prediction Server Sidecar Container {{ add $i 1 }} + - name: prediction-server-{{ add $i 1 }} + image: {{ $.Values.inferenceExtension.sidecars.predictionServers.image.hub }}/{{ $.Values.inferenceExtension.sidecars.predictionServers.image.name }}:{{ $.Values.inferenceExtension.sidecars.predictionServers.image.tag }} + imagePullPolicy: {{ $.Values.inferenceExtension.sidecars.predictionServers.image.pullPolicy | default "Always" }} + command: ["uvicorn"] + args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "{{ add 8001 $i }}"] + ports: + - containerPort: {{ add 8001 $i }} + name: predict-port-{{ add $i 1 }} + livenessProbe: + httpGet: + path: /healthz + port: {{ add 8001 $i }} + initialDelaySeconds: 15 + periodSeconds: 15 + readinessProbe: + httpGet: + path: /readyz + port: {{ add 8001 $i }} + initialDelaySeconds: 10 + periodSeconds: 5 + 
failureThreshold: 10 + {{- if $.Values.inferenceExtension.sidecars.predictionServers.resources }} + resources: + {{- toYaml $.Values.inferenceExtension.sidecars.predictionServers.resources | nindent 10 }} + {{- end }} + envFrom: + {{- if $.Values.inferenceExtension.sidecars.predictionServers.envFrom }} + {{- toYaml $.Values.inferenceExtension.sidecars.predictionServers.envFrom | nindent 10 }} + {{- else }} + - configMapRef: + name: prediction-server-config + {{- end }} + env: + - name: PREDICT_PORT + value: "{{ add 8001 $i }}" + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SERVER_TYPE + value: "prediction-{{ add $i 1 }}" + - name: TRAINING_SERVER_URL + value: "http://localhost:8000" + {{- if $.Values.inferenceExtension.sidecars.predictionServers.env }} + {{- toYaml $.Values.inferenceExtension.sidecars.predictionServers.env | nindent 8 }} + {{- end }} + volumeMounts: + - name: prediction-server-{{ add $i 1 }}-storage + mountPath: /server_models + {{- end }} + {{- end }} + {{- end }} + {{- end }} volumes: - name: plugins-config-volume configMap: name: {{ include "gateway-api-inference-extension.name" . }} + {{- if .Values.inferenceExtension.sidecars }} + {{- if .Values.inferenceExtension.sidecars.trainingServer }} + {{- if .Values.inferenceExtension.sidecars.trainingServer.enabled }} + - name: training-server-storage + {{- if .Values.inferenceExtension.sidecars.trainingServer.persistence }} + {{- if .Values.inferenceExtension.sidecars.trainingServer.persistence.enabled }} + persistentVolumeClaim: + claimName: {{ .Values.inferenceExtension.sidecars.trainingServer.persistence.claimName | default "training-models-pvc" }} + {{- else }} + emptyDir: {} + {{- end }} + {{- else }} + emptyDir: {} + {{- end }} + {{- end }} + {{- end }} + {{- if .Values.inferenceExtension.sidecars.predictionServers }} + {{- if .Values.inferenceExtension.sidecars.predictionServers.enabled }} + {{- $replicas := int (.Values.inferenceExtension.sidecars.predictionServers.replicas | default 3) }} + {{- range $i := until $replicas }} + - name: prediction-server-{{ add $i 1 }}-storage + emptyDir: {} + {{- end }} + {{- end }} + {{- end }} + {{- end }} {{- if .Values.inferenceExtension.affinity }} affinity: {{- toYaml .Values.inferenceExtension.affinity | nindent 8 }} diff --git a/config/charts/inferencepool/templates/epp-service.yaml b/config/charts/inferencepool/templates/epp-service.yaml index b1a48df91..b39a58c82 100644 --- a/config/charts/inferencepool/templates/epp-service.yaml +++ b/config/charts/inferencepool/templates/epp-service.yaml @@ -12,9 +12,33 @@ spec: - name: grpc-ext-proc protocol: TCP port: {{ .Values.inferenceExtension.extProcPort | default 9002 }} + targetPort: {{ .Values.inferenceExtension.extProcPort | default 9002 }} + appProtocol: http2 - name: http-metrics protocol: TCP port: {{ .Values.inferenceExtension.metricsPort | default 9090 }} + targetPort: {{ .Values.inferenceExtension.metricsPort | default 9090 }} + {{- if .Values.inferenceExtension.sidecars }} + {{- if .Values.inferenceExtension.sidecars.trainingServer }} + {{- if .Values.inferenceExtension.sidecars.trainingServer.enabled }} + - name: latency-training + protocol: TCP + port: 8000 + targetPort: 8000 + {{- end }} + {{- end }} + {{- if .Values.inferenceExtension.sidecars.predictionServers }} + {{- if .Values.inferenceExtension.sidecars.predictionServers.enabled }} + {{- $replicas := int (.Values.inferenceExtension.sidecars.predictionServers.replicas | default 3) }} + {{- range $i := until $replicas }} + - name: 
latency-predict-{{ add $i 1 }} + protocol: TCP + port: {{ add 8001 $i }} + targetPort: {{ add 8001 $i }} + {{- end }} + {{- end }} + {{- end }} + {{- end }} {{- with .Values.inferenceExtension.extraServicePorts }} {{- toYaml . | nindent 4 }} {{- end }} diff --git a/config/charts/inferencepool/templates/latency-predictor-config.yaml b/config/charts/inferencepool/templates/latency-predictor-config.yaml new file mode 100644 index 000000000..d54a388bc --- /dev/null +++ b/config/charts/inferencepool/templates/latency-predictor-config.yaml @@ -0,0 +1,41 @@ +{{- if .Values.inferenceExtension.sidecars }} +{{- if or .Values.inferenceExtension.sidecars.trainingServer.enabled .Values.inferenceExtension.sidecars.predictionServers.enabled }} +--- +# ConfigMap for Training Server Configuration +apiVersion: v1 +kind: ConfigMap +metadata: + name: latency-predictor-config + namespace: {{ .Release.Namespace }} + labels: + {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} +data: + LATENCY_RETRAINING_INTERVAL_SEC: {{ .Values.inferenceExtension.sidecars.trainingServer.config.retrainingIntervalSec | default "1" | quote }} + LATENCY_MIN_SAMPLES_FOR_RETRAIN: {{ .Values.inferenceExtension.sidecars.trainingServer.config.minSamplesForRetrain | default "100" | quote }} + LATENCY_TTFT_MODEL_PATH: {{ .Values.inferenceExtension.sidecars.trainingServer.config.ttftModelPath | default "/models/ttft.joblib" | quote }} + LATENCY_TPOT_MODEL_PATH: {{ .Values.inferenceExtension.sidecars.trainingServer.config.tpotModelPath | default "/models/tpot.joblib" | quote }} + LATENCY_TTFT_SCALER_PATH: {{ .Values.inferenceExtension.sidecars.trainingServer.config.ttftScalerPath | default "/models/ttft_scaler.joblib" | quote }} + LATENCY_TPOT_SCALER_PATH: {{ .Values.inferenceExtension.sidecars.trainingServer.config.tpotScalerPath | default "/models/tpot_scaler.joblib" | quote }} + LATENCY_MODEL_TYPE: {{ .Values.inferenceExtension.sidecars.trainingServer.config.modelType | default "xgboost" | quote }} +--- +# ConfigMap for Prediction Server Configuration +apiVersion: v1 +kind: ConfigMap +metadata: + name: prediction-server-config + namespace: {{ .Release.Namespace }} + labels: + {{- include "gateway-api-inference-extension.labels" . 
| nindent 4 }} +data: + MODEL_SYNC_INTERVAL_SEC: {{ .Values.inferenceExtension.sidecars.predictionServers.config.modelSyncIntervalSec | default "10" | quote }} + LATENCY_MODEL_TYPE: {{ .Values.inferenceExtension.sidecars.predictionServers.config.modelType | default "xgboost" | quote }} + PREDICT_HOST: {{ .Values.inferenceExtension.sidecars.predictionServers.config.host | default "0.0.0.0" | quote }} + PREDICT_PORT: {{ .Values.inferenceExtension.sidecars.predictionServers.config.port | default "8001" | quote }} + TRAINING_SERVER_URL: {{ .Values.inferenceExtension.sidecars.predictionServers.config.trainingServerUrl | default "http://localhost:8000" | quote }} + LOCAL_TTFT_MODEL_PATH: {{ .Values.inferenceExtension.sidecars.predictionServers.config.localTtftModelPath | default "/local_models/ttft.joblib" | quote }} + LOCAL_TPOT_MODEL_PATH: {{ .Values.inferenceExtension.sidecars.predictionServers.config.localTpotModelPath | default "/local_models/tpot.joblib" | quote }} + LOCAL_TTFT_SCALER_PATH: {{ .Values.inferenceExtension.sidecars.predictionServers.config.localTtftScalerPath | default "/local_models/ttft_scaler.joblib" | quote }} + LOCAL_TPOT_SCALER_PATH: {{ .Values.inferenceExtension.sidecars.predictionServers.config.localTpotScalerPath | default "/local_models/tpot_scaler.joblib" | quote }} + HTTP_TIMEOUT: {{ .Values.inferenceExtension.sidecars.predictionServers.config.httpTimeout | default "30" | quote }} +{{- end }} +{{- end }} diff --git a/config/charts/inferencepool/values-slo-example.yaml b/config/charts/inferencepool/values-slo-example.yaml new file mode 100644 index 000000000..04d5016be --- /dev/null +++ b/config/charts/inferencepool/values-slo-example.yaml @@ -0,0 +1,124 @@ +# Example values file for SLO-aware routing with latency prediction +# This file demonstrates how to enable and configure the SLO prediction sidecars + +inferenceExtension: + replicas: 1 + image: + name: epp + hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension + tag: main + pullPolicy: Always + extProcPort: 9002 + pluginsConfigFile: "slo-plugins.yaml" # Use custom SLO plugins config + + # Enable latency prediction flag + flags: + - name: enable-latency-predictor + value: "true" + - name: v + value: "4" + + # EPP environment variables for SLO prediction + env: + - name: PREDICTION_SERVER_URL + value: "http://localhost:8001,http://localhost:8002,http://localhost:8003" + - name: TRAINING_SERVER_URL + value: "http://localhost:8000" + - name: LATENCY_MAX_SAMPLE_SIZE + value: "10000" + - name: NEG_HEADROOM_TPOT_WEIGHT + value: "0.2" + - name: NEG_HEADROOM_TTFT_WEIGHT + value: "0.8" + + # Custom plugins configuration for SLO routing + pluginsCustomConfig: + slo-plugins.yaml: | + apiVersion: inference.networking.x-k8s.io/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: queue-scorer + - type: kv-cache-utilization-scorer + - type: slo-request-tracker + - type: slo-scorer + - type: slo-aware-profile-handler + - type: max-score-picker + schedulingProfiles: + - name: default + plugins: + - pluginRef: slo-request-tracker + - pluginRef: queue-scorer + - pluginRef: kv-cache-utilization-scorer + - pluginRef: max-score-picker + - name: slo + plugins: + - pluginRef: slo-request-tracker + - pluginRef: slo-scorer + - pluginRef: max-score-picker + + # Enable SLO prediction sidecars + sidecars: + trainingServer: + enabled: true + image: + hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension + name: latency-training + tag: latest + pullPolicy: Always + resources: 
+ requests: + cpu: "2000m" + memory: "4Gi" + limits: + cpu: "4000m" + memory: "8Gi" + config: + retrainingIntervalSec: "1" + minSamplesForRetrain: "100" + ttftModelPath: "/models/ttft.joblib" + tpotModelPath: "/models/tpot.joblib" + ttftScalerPath: "/models/ttft_scaler.joblib" + tpotScalerPath: "/models/tpot_scaler.joblib" + modelType: "xgboost" + persistence: + enabled: false # Set to true if you want persistent model storage + # claimName: "training-models-pvc" + + predictionServers: + enabled: true + replicas: 3 # Number of prediction server replicas + image: + hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension + name: latency-prediction + tag: latest + pullPolicy: Always + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" + config: + modelSyncIntervalSec: "10" + modelType: "xgboost" + host: "0.0.0.0" + port: "8001" + trainingServerUrl: "http://localhost:8000" + localTtftModelPath: "/local_models/ttft.joblib" + localTpotModelPath: "/local_models/tpot.joblib" + localTtftScalerPath: "/local_models/ttft_scaler.joblib" + localTpotScalerPath: "/local_models/tpot_scaler.joblib" + httpTimeout: "30" + +inferencePool: + targetPorts: + - number: 8000 + modelServerType: vllm + apiVersion: inference.networking.k8s.io/v1 + # modelServers: + # matchLabels: + # app: vllm-llama3-8b-instruct + +provider: + name: none diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml index d45e6ed39..13e57cb79 100644 --- a/config/charts/inferencepool/values.yaml +++ b/config/charts/inferencepool/values.yaml @@ -40,6 +40,66 @@ inferenceExtension: tolerations: [] + # SLO-aware routing with latency prediction sidecars + # Uncomment and configure to enable SLO prediction + # sidecars: + # trainingServer: + # enabled: false + # image: + # hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension + # name: latency-training + # tag: latest + # pullPolicy: Always + # resources: + # requests: + # cpu: "2000m" + # memory: "4Gi" + # limits: + # cpu: "4000m" + # memory: "8Gi" + # config: + # retrainingIntervalSec: "1" + # minSamplesForRetrain: "100" + # ttftModelPath: "/models/ttft.joblib" + # tpotModelPath: "/models/tpot.joblib" + # ttftScalerPath: "/models/ttft_scaler.joblib" + # tpotScalerPath: "/models/tpot_scaler.joblib" + # modelType: "xgboost" + # persistence: + # enabled: false + # claimName: "training-models-pvc" + # env: [] + # envFrom: [] + # + # predictionServers: + # enabled: false + # replicas: 3 + # image: + # hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension + # name: latency-prediction + # tag: latest + # pullPolicy: Always + # resources: + # requests: + # cpu: "500m" + # memory: "1Gi" + # limits: + # cpu: "1000m" + # memory: "2Gi" + # config: + # modelSyncIntervalSec: "10" + # modelType: "xgboost" + # host: "0.0.0.0" + # port: "8001" + # trainingServerUrl: "http://localhost:8000" + # localTtftModelPath: "/local_models/ttft.joblib" + # localTpotModelPath: "/local_models/tpot.joblib" + # localTtftScalerPath: "/local_models/ttft_scaler.joblib" + # localTpotScalerPath: "/local_models/tpot_scaler.joblib" + # httpTimeout: "30" + # env: [] + # envFrom: [] + inferencePool: targetPorts: - number: 8000 From 2c0dcc5b6cf36031bc45fcb0bc910a4e6d169aba Mon Sep 17 00:00:00 2001 From: RishabhSaini Date: Mon, 10 Nov 2025 15:21:57 -0500 Subject: [PATCH 2/2] gracefully skip TTFT model training when not enough samples (no undefined vars) --- 
latencypredictor-v1/training_server.py | 32 ++++++++++++-------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/latencypredictor-v1/training_server.py b/latencypredictor-v1/training_server.py index fa8a81118..51425bfe0 100644 --- a/latencypredictor-v1/training_server.py +++ b/latencypredictor-v1/training_server.py @@ -628,22 +628,22 @@ def train(self): if len(df_ttft) >= settings.MIN_SAMPLES_FOR_RETRAIN: # Updated TTFT features to include prefix_cache_score ttft_feature_cols_tree = [ - 'kv_cache_percentage','input_token_length','num_request_waiting', - 'num_request_running','prefix_cache_score','effective_input_tokens','prefill_score_bucket' - ] - ttft_feature_cols_br = [ - 'kv_cache_percentage','input_token_length','num_request_waiting', - 'num_request_running','prefix_cache_score','effective_input_tokens' - ] - - # Build X_ttft for all model types, then trim for BR - X_ttft = df_ttft[ttft_feature_cols_tree] - if self.model_type == ModelType.BAYESIAN_RIDGE: - X_ttft = X_ttft[ttft_feature_cols_br] + 'kv_cache_percentage','input_token_length','num_request_waiting', + 'num_request_running','prefix_cache_score','effective_input_tokens','prefill_score_bucket' + ] + ttft_feature_cols_br = [ + 'kv_cache_percentage','input_token_length','num_request_waiting', + 'num_request_running','prefix_cache_score','effective_input_tokens' + ] + + # Build X_ttft for all model types, then trim for BR + X_ttft = df_ttft[ttft_feature_cols_tree] + if self.model_type == ModelType.BAYESIAN_RIDGE: + X_ttft = X_ttft[ttft_feature_cols_br] - y_ttft = raw_ttft['actual_ttft_ms'] + y_ttft = raw_ttft['actual_ttft_ms'] - try: + try: # raw_ttft still has the original columns including 'prefix_cache_score' raw_ttft['_prefix_bucket'] = raw_ttft['prefix_cache_score'].clip(0, 1).apply( lambda s: min(int(s * self.prefix_buckets), self.prefix_buckets - 1) @@ -677,8 +677,6 @@ def train(self): new_ttft_model, new_ttft_scaler, test_records, cols, 'actual_ttft_ms' ) - - if ql is not None: self.ttft_quantile_loss_scores.append(ql) self.ttft_coverage_scores.append(coverage) @@ -690,7 +688,7 @@ def train(self): else: logging.info(f"TTFT model trained on {len(df_ttft)} samples. Quantile metrics = N/A (insufficient test data)") - except Exception: + except Exception: logging.error("Error training TTFT model", exc_info=True)
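The second commit re-indents the TTFT training block so that feature construction and model fitting happen only when `len(df_ttft) >= settings.MIN_SAMPLES_FOR_RETRAIN`; when there are too few samples the branch is skipped entirely and no undefined variables are referenced afterwards. Below is a minimal, self-contained sketch of that guard pattern. The helper names (`maybe_train_ttft`, `fit_model`, `min_samples`) are illustrative and not part of the patch; the feature column names come from the diff.

```python
# Sketch of the "skip retraining when under-sampled" guard, under the
# assumptions stated above (not the actual training_server.py implementation).
import logging
from typing import Optional

import pandas as pd

TREE_FEATURES = [
    'kv_cache_percentage', 'input_token_length', 'num_request_waiting',
    'num_request_running', 'prefix_cache_score', 'effective_input_tokens',
    'prefill_score_bucket',
]
BR_FEATURES = TREE_FEATURES[:-1]  # Bayesian Ridge drops the bucketed feature


def maybe_train_ttft(df_ttft: pd.DataFrame, min_samples: int,
                     use_bayesian_ridge: bool, fit_model) -> Optional[object]:
    """Return a fitted TTFT model, or None when there is not enough data."""
    if len(df_ttft) < min_samples:
        # Graceful skip: nothing below runs, so X_ttft / y_ttft are never
        # referenced in an undefined state.
        logging.info("Skipping TTFT retrain: %d samples < %d required",
                     len(df_ttft), min_samples)
        return None

    cols = BR_FEATURES if use_bayesian_ridge else TREE_FEATURES
    X_ttft = df_ttft[cols]
    y_ttft = df_ttft['actual_ttft_ms']

    try:
        return fit_model(X_ttft, y_ttft)
    except Exception:
        logging.error("Error training TTFT model", exc_info=True)
        return None
```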