From 95f30f6d92ac0ae5f6f830ae0c115617fb00e721 Mon Sep 17 00:00:00 2001 From: RishabhSaini Date: Fri, 7 Nov 2025 14:24:34 -0500 Subject: [PATCH 1/2] update GAIE to slo aware routing --- .../inferencepool/SLO-ROUTING-README.md | 296 ++++++++++++++++++ .../templates/epp-deployment.yaml | 136 ++++++++ .../inferencepool/templates/epp-service.yaml | 24 ++ .../templates/latency-predictor-config.yaml | 41 +++ .../inferencepool/values-slo-example.yaml | 124 ++++++++ config/charts/inferencepool/values.yaml | 60 ++++ 6 files changed, 681 insertions(+) create mode 100644 config/charts/inferencepool/SLO-ROUTING-README.md create mode 100644 config/charts/inferencepool/templates/latency-predictor-config.yaml create mode 100644 config/charts/inferencepool/values-slo-example.yaml diff --git a/config/charts/inferencepool/SLO-ROUTING-README.md b/config/charts/inferencepool/SLO-ROUTING-README.md new file mode 100644 index 000000000..ea8c8d801 --- /dev/null +++ b/config/charts/inferencepool/SLO-ROUTING-README.md @@ -0,0 +1,296 @@ +# SLO-Aware Routing with Latency Prediction + +This document describes the modifications made to the InferencePool Helm chart to support SLO-aware routing with latency prediction sidecars. + +## Overview + +The SLO-aware routing feature enables intelligent request routing based on predicted latency using machine learning models. The system consists of: + +1. **EPP (Endpoint Picker) Container**: Main routing logic with latency prediction enabled +2. **Training Server Sidecar**: Continuously trains XGBoost models on observed latency metrics +3. **Prediction Server Sidecars**: Multiple replicas that serve latency predictions for TTFT (Time to First Token) and TPOT (Time Per Output Token) + +## Architecture + +``` +┌─────────────────────────────────────────────────────┐ +│ EPP Pod │ +├──────────────┬──────────────┬──────────────────────┤ +│ EPP │ Training │ Prediction Servers │ +│ Container │ Server │ (3 replicas) │ +│ │ │ │ +│ Port 9002 │ Port 8000 │ Ports 8001-8003 │ +│ (ext-proc) │ (training) │ (prediction) │ +└──────────────┴──────────────┴──────────────────────┘ + │ │ │ + │ └──────┬───────────┘ + │ │ + │ Model Training + │ & Synchronization + │ + Routing Decision + (with latency prediction) +``` + +## Modified Files + +### 1. `templates/epp-deployment.yaml` +- Added support for `sidecars.trainingServer` configuration +- Added support for `sidecars.predictionServers` with configurable replicas +- Automatically creates volumes for model storage +- Injects ConfigMaps for training and prediction server configuration + +### 2. `templates/epp-service.yaml` +- Automatically exposes ports for training server (8000) +- Automatically exposes ports for prediction servers (8001-8003 by default) +- Ports are only added when sidecars are enabled + +### 3. `templates/latency-predictor-config.yaml` (NEW) +- Creates ConfigMap for training server configuration +- Creates ConfigMap for prediction server configuration +- Supports customizable model paths, retraining intervals, and other parameters + +### 4. `values.yaml` +- Added comprehensive `sidecars` section with commented examples +- Supports configuration for training and prediction server images, resources, and behavior + +### 5. 
`values-slo-example.yaml` (NEW) +- Complete working example of SLO-aware routing configuration +- Demonstrates all required settings including EPP flags, environment variables, and plugin configuration + +## Usage + +### Quick Start with Example Configuration + +```bash +# Install with SLO-aware routing enabled +helm install my-slo-pool oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool \ + --namespace inference \ + --values values-slo-example.yaml \ + --set inferencePool.modelServers.matchLabels.app=my-model-server +``` + +### Custom Configuration + +Create a custom values file: + +```yaml +inferenceExtension: + image: + hub: quay.io/your-org + name: epp + tag: slo-experimental + + flags: + - name: enable-latency-predictor + value: "true" + - name: v + value: "4" + + env: + - name: PREDICTION_SERVER_URL + value: "http://localhost:8001,http://localhost:8002,http://localhost:8003" + - name: TRAINING_SERVER_URL + value: "http://localhost:8000" + - name: LATENCY_MAX_SAMPLE_SIZE + value: "10000" + + pluginsCustomConfig: + slo-plugins.yaml: | + apiVersion: inference.networking.x-k8s.io/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: slo-request-tracker + - type: slo-scorer + - type: slo-aware-profile-handler + schedulingProfiles: + - name: slo + plugins: + - pluginRef: slo-request-tracker + - pluginRef: slo-scorer + + sidecars: + trainingServer: + enabled: true + image: + hub: quay.io/your-org + name: latency-training + tag: latest + resources: + requests: + cpu: "2000m" + memory: "4Gi" + limits: + cpu: "4000m" + memory: "8Gi" + + predictionServers: + enabled: true + replicas: 3 + image: + hub: quay.io/your-org + name: latency-prediction + tag: latest + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" +``` + +## Configuration Reference + +### Training Server Configuration + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `sidecars.trainingServer.enabled` | Enable training server sidecar | `false` | +| `sidecars.trainingServer.image.hub` | Container registry | `us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension` | +| `sidecars.trainingServer.image.name` | Image name | `latency-training` | +| `sidecars.trainingServer.image.tag` | Image tag | `latest` | +| `sidecars.trainingServer.config.retrainingIntervalSec` | Retraining interval in seconds | `1` | +| `sidecars.trainingServer.config.minSamplesForRetrain` | Minimum samples before retraining | `100` | +| `sidecars.trainingServer.config.modelType` | ML model type | `xgboost` | +| `sidecars.trainingServer.persistence.enabled` | Enable persistent storage for models | `false` | + +### Prediction Server Configuration + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `sidecars.predictionServers.enabled` | Enable prediction server sidecars | `false` | +| `sidecars.predictionServers.replicas` | Number of prediction server replicas | `3` | +| `sidecars.predictionServers.image.hub` | Container registry | `us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension` | +| `sidecars.predictionServers.image.name` | Image name | `latency-prediction` | +| `sidecars.predictionServers.image.tag` | Image tag | `latest` | +| `sidecars.predictionServers.config.modelSyncIntervalSec` | Model sync interval in seconds | `10` | +| `sidecars.predictionServers.config.modelType` | ML model type | `xgboost` | + +### EPP Environment Variables + +| Variable | Description | Default | 
+|----------|-------------|---------|
+| `PREDICTION_SERVER_URL` | Comma-separated prediction server URLs | `http://localhost:8001,http://localhost:8002,http://localhost:8003` |
+| `TRAINING_SERVER_URL` | Training server URL | `http://localhost:8000` |
+| `LATENCY_MAX_SAMPLE_SIZE` | Maximum sample size for latency prediction | `10000` |
+| `NEG_HEADROOM_TPOT_WEIGHT` | Weight for TPOT in negative headroom calculation | `0.2` |
+| `NEG_HEADROOM_TTFT_WEIGHT` | Weight for TTFT in negative headroom calculation | `0.8` |
+
+## Building Container Images
+
+### Prerequisites
+
+```bash
+cd /path/to/gateway-api-inference-extension
+git checkout slo-prediction-experimental
+```
+
+### Build EPP Image
+
+```bash
+export IMAGE_REGISTRY="quay.io/your-org"
+export EPP_TAG="slo-experimental"
+make image-build image-push
+```
+
+### Build Latency Predictor Images
+
+```bash
+cd latencypredictor-v1
+
+# Edit build-deploy.sh to set your registry
+# Then build and push:
+./build-deploy.sh build
+
+# Tag and push manually
+docker tag latencypredictor-v2-training-server:latest ${IMAGE_REGISTRY}/latency-training:slo-experimental
+docker tag latencypredictor-v2-prediction-server:latest ${IMAGE_REGISTRY}/latency-prediction:slo-experimental
+docker push ${IMAGE_REGISTRY}/latency-training:slo-experimental
+docker push ${IMAGE_REGISTRY}/latency-prediction:slo-experimental
+```
+
+## Verification
+
+After deployment, verify that all containers are running:
+
+```bash
+# Check pod status
+kubectl get pods -n your-namespace
+
+# Expected: 1 pod with 5 containers (1 EPP + 1 training + 3 prediction)
+
+# Check EPP logs
+kubectl logs <epp-pod-name> -n your-namespace -c epp
+
+# Check training server logs
+kubectl logs <epp-pod-name> -n your-namespace -c training-server
+
+# Check prediction server logs
+kubectl logs <epp-pod-name> -n your-namespace -c prediction-server-1
+```
+
+## Service Ports
+
+When sidecars are enabled, the service automatically exposes these ports:
+
+- `9002`: EPP gRPC ext-proc (always)
+- `9090`: EPP metrics (always)
+- `8000`: Training server (when `trainingServer.enabled: true`)
+- `8001` through `8000+N`: Prediction servers (when `predictionServers.enabled: true`, where `N` is the number of replicas)
+
+## Plugins
+
+SLO-aware routing requires these plugins:
+
+- `slo-request-tracker`: Tracks request SLO requirements
+- `slo-scorer`: Scores endpoints based on predicted latency vs SLO
+- `slo-aware-profile-handler`: Handles different scheduling profiles
+- `max-score-picker`: Selects the endpoint with the maximum score
+
+### Scheduling Profiles
+
+- **default**: Standard routing with queue and kv-cache scoring
+- **slo**: SLO-aware routing using latency predictions
+
+## Troubleshooting
+
+### Sidecars Not Starting
+
+Check whether the images are accessible:
+```bash
+kubectl describe pod <epp-pod-name> -n your-namespace
+```
+
+### Training Server Issues
+
+Check the ConfigMap and the logs:
+```bash
+kubectl get configmap latency-predictor-config -n your-namespace -o yaml
+kubectl logs <epp-pod-name> -c training-server -n your-namespace
+```
+
+### Prediction Server Issues
+
+Verify that the prediction servers can reach the training server (a manual port-forward check is sketched in the appendix at the end of this document):
+```bash
+kubectl exec <epp-pod-name> -c prediction-server-1 -n your-namespace -- \
+  curl http://localhost:8000/healthz
+```
+
+## Integration with llm-d
+
+To use this chart in llm-d, update your helmfile:
+
+```yaml
+releases:
+  - name: gaie-slo
+    namespace: llm-d-slo
+    chart: oci://quay.io/your-org/charts/inferencepool
+    version: v1.0.1-slo
+    values:
+      - gaie-slo/values.yaml
+      - gaie-slo/values-slo.yaml
+```
+
+See the main documentation for complete integration instructions.
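+
+## Appendix: Manual Health Checks
+
+As a quick sanity check from your workstation, you can port-forward the EPP Service and query the sidecar health endpoints directly. This is a minimal sketch: `<epp-service-name>` is a placeholder for the Service created by this chart, and the ports and `/healthz`/`/readyz` paths mirror the probes defined in `templates/epp-deployment.yaml`.
+
+```bash
+# Forward the training server (8000) and the first prediction server (8001)
+kubectl port-forward svc/<epp-service-name> 8000:8000 8001:8001 -n your-namespace &
+
+# Training server liveness and readiness
+curl http://localhost:8000/healthz
+curl http://localhost:8000/readyz
+
+# First prediction server (repeat on 8002/8003 when replicas=3)
+curl http://localhost:8001/healthz
+curl http://localhost:8001/readyz
+```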
diff --git a/config/charts/inferencepool/templates/epp-deployment.yaml b/config/charts/inferencepool/templates/epp-deployment.yaml index f01699a96..b5570b9d7 100644 --- a/config/charts/inferencepool/templates/epp-deployment.yaml +++ b/config/charts/inferencepool/templates/epp-deployment.yaml @@ -31,6 +31,10 @@ spec: - "json" - --config-file - "/config/{{ .Values.inferenceExtension.pluginsConfigFile }}" + {{- if ne .Values.inferencePool.apiVersion "inference.networking.k8s.io/v1" }} + - --pool-group + - "{{ (split "/" .Values.inferencePool.apiVersion)._0 }}" + {{- end }} {{- range .Values.inferenceExtension.flags }} - "--{{ .name }}" - "{{ .value }}" @@ -84,10 +88,142 @@ spec: volumeMounts: - name: plugins-config-volume mountPath: "/config" + {{- if .Values.inferenceExtension.sidecars }} + {{- if .Values.inferenceExtension.sidecars.trainingServer }} + {{- if .Values.inferenceExtension.sidecars.trainingServer.enabled }} + # Training Server Sidecar Container + - name: training-server + image: {{ .Values.inferenceExtension.sidecars.trainingServer.image.hub }}/{{ .Values.inferenceExtension.sidecars.trainingServer.image.name }}:{{ .Values.inferenceExtension.sidecars.trainingServer.image.tag }} + imagePullPolicy: {{ .Values.inferenceExtension.sidecars.trainingServer.image.pullPolicy | default "Always" }} + ports: + - containerPort: 8000 + name: training-port + livenessProbe: + httpGet: + path: /healthz + port: 8000 + initialDelaySeconds: 30 + periodSeconds: 20 + readinessProbe: + httpGet: + path: /readyz + port: 8000 + initialDelaySeconds: 45 + periodSeconds: 10 + {{- if .Values.inferenceExtension.sidecars.trainingServer.resources }} + resources: + {{- toYaml .Values.inferenceExtension.sidecars.trainingServer.resources | nindent 10 }} + {{- end }} + envFrom: + {{- if .Values.inferenceExtension.sidecars.trainingServer.envFrom }} + {{- toYaml .Values.inferenceExtension.sidecars.trainingServer.envFrom | nindent 10 }} + {{- else }} + - configMapRef: + name: latency-predictor-config + {{- end }} + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SERVER_TYPE + value: "training" + {{- if .Values.inferenceExtension.sidecars.trainingServer.env }} + {{- toYaml .Values.inferenceExtension.sidecars.trainingServer.env | nindent 8 }} + {{- end }} + volumeMounts: + - name: training-server-storage + mountPath: /models + {{- end }} + {{- end }} + {{- if .Values.inferenceExtension.sidecars.predictionServers }} + {{- if .Values.inferenceExtension.sidecars.predictionServers.enabled }} + {{- $replicas := int (.Values.inferenceExtension.sidecars.predictionServers.replicas | default 3) }} + {{- range $i := until $replicas }} + # Prediction Server Sidecar Container {{ add $i 1 }} + - name: prediction-server-{{ add $i 1 }} + image: {{ $.Values.inferenceExtension.sidecars.predictionServers.image.hub }}/{{ $.Values.inferenceExtension.sidecars.predictionServers.image.name }}:{{ $.Values.inferenceExtension.sidecars.predictionServers.image.tag }} + imagePullPolicy: {{ $.Values.inferenceExtension.sidecars.predictionServers.image.pullPolicy | default "Always" }} + command: ["uvicorn"] + args: ["prediction_server:app", "--host", "0.0.0.0", "--port", "{{ add 8001 $i }}"] + ports: + - containerPort: {{ add 8001 $i }} + name: predict-port-{{ add $i 1 }} + livenessProbe: + httpGet: + path: /healthz + port: {{ add 8001 $i }} + initialDelaySeconds: 15 + periodSeconds: 15 + readinessProbe: + httpGet: + path: /readyz + port: {{ add 8001 $i }} + initialDelaySeconds: 10 + periodSeconds: 5 + 
failureThreshold: 10 + {{- if $.Values.inferenceExtension.sidecars.predictionServers.resources }} + resources: + {{- toYaml $.Values.inferenceExtension.sidecars.predictionServers.resources | nindent 10 }} + {{- end }} + envFrom: + {{- if $.Values.inferenceExtension.sidecars.predictionServers.envFrom }} + {{- toYaml $.Values.inferenceExtension.sidecars.predictionServers.envFrom | nindent 10 }} + {{- else }} + - configMapRef: + name: prediction-server-config + {{- end }} + env: + - name: PREDICT_PORT + value: "{{ add 8001 $i }}" + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: SERVER_TYPE + value: "prediction-{{ add $i 1 }}" + - name: TRAINING_SERVER_URL + value: "http://localhost:8000" + {{- if $.Values.inferenceExtension.sidecars.predictionServers.env }} + {{- toYaml $.Values.inferenceExtension.sidecars.predictionServers.env | nindent 8 }} + {{- end }} + volumeMounts: + - name: prediction-server-{{ add $i 1 }}-storage + mountPath: /server_models + {{- end }} + {{- end }} + {{- end }} + {{- end }} volumes: - name: plugins-config-volume configMap: name: {{ include "gateway-api-inference-extension.name" . }} + {{- if .Values.inferenceExtension.sidecars }} + {{- if .Values.inferenceExtension.sidecars.trainingServer }} + {{- if .Values.inferenceExtension.sidecars.trainingServer.enabled }} + - name: training-server-storage + {{- if .Values.inferenceExtension.sidecars.trainingServer.persistence }} + {{- if .Values.inferenceExtension.sidecars.trainingServer.persistence.enabled }} + persistentVolumeClaim: + claimName: {{ .Values.inferenceExtension.sidecars.trainingServer.persistence.claimName | default "training-models-pvc" }} + {{- else }} + emptyDir: {} + {{- end }} + {{- else }} + emptyDir: {} + {{- end }} + {{- end }} + {{- end }} + {{- if .Values.inferenceExtension.sidecars.predictionServers }} + {{- if .Values.inferenceExtension.sidecars.predictionServers.enabled }} + {{- $replicas := int (.Values.inferenceExtension.sidecars.predictionServers.replicas | default 3) }} + {{- range $i := until $replicas }} + - name: prediction-server-{{ add $i 1 }}-storage + emptyDir: {} + {{- end }} + {{- end }} + {{- end }} + {{- end }} {{- if .Values.inferenceExtension.affinity }} affinity: {{- toYaml .Values.inferenceExtension.affinity | nindent 8 }} diff --git a/config/charts/inferencepool/templates/epp-service.yaml b/config/charts/inferencepool/templates/epp-service.yaml index b1a48df91..b39a58c82 100644 --- a/config/charts/inferencepool/templates/epp-service.yaml +++ b/config/charts/inferencepool/templates/epp-service.yaml @@ -12,9 +12,33 @@ spec: - name: grpc-ext-proc protocol: TCP port: {{ .Values.inferenceExtension.extProcPort | default 9002 }} + targetPort: {{ .Values.inferenceExtension.extProcPort | default 9002 }} + appProtocol: http2 - name: http-metrics protocol: TCP port: {{ .Values.inferenceExtension.metricsPort | default 9090 }} + targetPort: {{ .Values.inferenceExtension.metricsPort | default 9090 }} + {{- if .Values.inferenceExtension.sidecars }} + {{- if .Values.inferenceExtension.sidecars.trainingServer }} + {{- if .Values.inferenceExtension.sidecars.trainingServer.enabled }} + - name: latency-training + protocol: TCP + port: 8000 + targetPort: 8000 + {{- end }} + {{- end }} + {{- if .Values.inferenceExtension.sidecars.predictionServers }} + {{- if .Values.inferenceExtension.sidecars.predictionServers.enabled }} + {{- $replicas := int (.Values.inferenceExtension.sidecars.predictionServers.replicas | default 3) }} + {{- range $i := until $replicas }} + - name: 
latency-predict-{{ add $i 1 }} + protocol: TCP + port: {{ add 8001 $i }} + targetPort: {{ add 8001 $i }} + {{- end }} + {{- end }} + {{- end }} + {{- end }} {{- with .Values.inferenceExtension.extraServicePorts }} {{- toYaml . | nindent 4 }} {{- end }} diff --git a/config/charts/inferencepool/templates/latency-predictor-config.yaml b/config/charts/inferencepool/templates/latency-predictor-config.yaml new file mode 100644 index 000000000..d54a388bc --- /dev/null +++ b/config/charts/inferencepool/templates/latency-predictor-config.yaml @@ -0,0 +1,41 @@ +{{- if .Values.inferenceExtension.sidecars }} +{{- if or .Values.inferenceExtension.sidecars.trainingServer.enabled .Values.inferenceExtension.sidecars.predictionServers.enabled }} +--- +# ConfigMap for Training Server Configuration +apiVersion: v1 +kind: ConfigMap +metadata: + name: latency-predictor-config + namespace: {{ .Release.Namespace }} + labels: + {{- include "gateway-api-inference-extension.labels" . | nindent 4 }} +data: + LATENCY_RETRAINING_INTERVAL_SEC: {{ .Values.inferenceExtension.sidecars.trainingServer.config.retrainingIntervalSec | default "1" | quote }} + LATENCY_MIN_SAMPLES_FOR_RETRAIN: {{ .Values.inferenceExtension.sidecars.trainingServer.config.minSamplesForRetrain | default "100" | quote }} + LATENCY_TTFT_MODEL_PATH: {{ .Values.inferenceExtension.sidecars.trainingServer.config.ttftModelPath | default "/models/ttft.joblib" | quote }} + LATENCY_TPOT_MODEL_PATH: {{ .Values.inferenceExtension.sidecars.trainingServer.config.tpotModelPath | default "/models/tpot.joblib" | quote }} + LATENCY_TTFT_SCALER_PATH: {{ .Values.inferenceExtension.sidecars.trainingServer.config.ttftScalerPath | default "/models/ttft_scaler.joblib" | quote }} + LATENCY_TPOT_SCALER_PATH: {{ .Values.inferenceExtension.sidecars.trainingServer.config.tpotScalerPath | default "/models/tpot_scaler.joblib" | quote }} + LATENCY_MODEL_TYPE: {{ .Values.inferenceExtension.sidecars.trainingServer.config.modelType | default "xgboost" | quote }} +--- +# ConfigMap for Prediction Server Configuration +apiVersion: v1 +kind: ConfigMap +metadata: + name: prediction-server-config + namespace: {{ .Release.Namespace }} + labels: + {{- include "gateway-api-inference-extension.labels" . 
| nindent 4 }} +data: + MODEL_SYNC_INTERVAL_SEC: {{ .Values.inferenceExtension.sidecars.predictionServers.config.modelSyncIntervalSec | default "10" | quote }} + LATENCY_MODEL_TYPE: {{ .Values.inferenceExtension.sidecars.predictionServers.config.modelType | default "xgboost" | quote }} + PREDICT_HOST: {{ .Values.inferenceExtension.sidecars.predictionServers.config.host | default "0.0.0.0" | quote }} + PREDICT_PORT: {{ .Values.inferenceExtension.sidecars.predictionServers.config.port | default "8001" | quote }} + TRAINING_SERVER_URL: {{ .Values.inferenceExtension.sidecars.predictionServers.config.trainingServerUrl | default "http://localhost:8000" | quote }} + LOCAL_TTFT_MODEL_PATH: {{ .Values.inferenceExtension.sidecars.predictionServers.config.localTtftModelPath | default "/local_models/ttft.joblib" | quote }} + LOCAL_TPOT_MODEL_PATH: {{ .Values.inferenceExtension.sidecars.predictionServers.config.localTpotModelPath | default "/local_models/tpot.joblib" | quote }} + LOCAL_TTFT_SCALER_PATH: {{ .Values.inferenceExtension.sidecars.predictionServers.config.localTtftScalerPath | default "/local_models/ttft_scaler.joblib" | quote }} + LOCAL_TPOT_SCALER_PATH: {{ .Values.inferenceExtension.sidecars.predictionServers.config.localTpotScalerPath | default "/local_models/tpot_scaler.joblib" | quote }} + HTTP_TIMEOUT: {{ .Values.inferenceExtension.sidecars.predictionServers.config.httpTimeout | default "30" | quote }} +{{- end }} +{{- end }} diff --git a/config/charts/inferencepool/values-slo-example.yaml b/config/charts/inferencepool/values-slo-example.yaml new file mode 100644 index 000000000..04d5016be --- /dev/null +++ b/config/charts/inferencepool/values-slo-example.yaml @@ -0,0 +1,124 @@ +# Example values file for SLO-aware routing with latency prediction +# This file demonstrates how to enable and configure the SLO prediction sidecars + +inferenceExtension: + replicas: 1 + image: + name: epp + hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension + tag: main + pullPolicy: Always + extProcPort: 9002 + pluginsConfigFile: "slo-plugins.yaml" # Use custom SLO plugins config + + # Enable latency prediction flag + flags: + - name: enable-latency-predictor + value: "true" + - name: v + value: "4" + + # EPP environment variables for SLO prediction + env: + - name: PREDICTION_SERVER_URL + value: "http://localhost:8001,http://localhost:8002,http://localhost:8003" + - name: TRAINING_SERVER_URL + value: "http://localhost:8000" + - name: LATENCY_MAX_SAMPLE_SIZE + value: "10000" + - name: NEG_HEADROOM_TPOT_WEIGHT + value: "0.2" + - name: NEG_HEADROOM_TTFT_WEIGHT + value: "0.8" + + # Custom plugins configuration for SLO routing + pluginsCustomConfig: + slo-plugins.yaml: | + apiVersion: inference.networking.x-k8s.io/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: queue-scorer + - type: kv-cache-utilization-scorer + - type: slo-request-tracker + - type: slo-scorer + - type: slo-aware-profile-handler + - type: max-score-picker + schedulingProfiles: + - name: default + plugins: + - pluginRef: slo-request-tracker + - pluginRef: queue-scorer + - pluginRef: kv-cache-utilization-scorer + - pluginRef: max-score-picker + - name: slo + plugins: + - pluginRef: slo-request-tracker + - pluginRef: slo-scorer + - pluginRef: max-score-picker + + # Enable SLO prediction sidecars + sidecars: + trainingServer: + enabled: true + image: + hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension + name: latency-training + tag: latest + pullPolicy: Always + resources: 
+ requests: + cpu: "2000m" + memory: "4Gi" + limits: + cpu: "4000m" + memory: "8Gi" + config: + retrainingIntervalSec: "1" + minSamplesForRetrain: "100" + ttftModelPath: "/models/ttft.joblib" + tpotModelPath: "/models/tpot.joblib" + ttftScalerPath: "/models/ttft_scaler.joblib" + tpotScalerPath: "/models/tpot_scaler.joblib" + modelType: "xgboost" + persistence: + enabled: false # Set to true if you want persistent model storage + # claimName: "training-models-pvc" + + predictionServers: + enabled: true + replicas: 3 # Number of prediction server replicas + image: + hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension + name: latency-prediction + tag: latest + pullPolicy: Always + resources: + requests: + cpu: "500m" + memory: "1Gi" + limits: + cpu: "1000m" + memory: "2Gi" + config: + modelSyncIntervalSec: "10" + modelType: "xgboost" + host: "0.0.0.0" + port: "8001" + trainingServerUrl: "http://localhost:8000" + localTtftModelPath: "/local_models/ttft.joblib" + localTpotModelPath: "/local_models/tpot.joblib" + localTtftScalerPath: "/local_models/ttft_scaler.joblib" + localTpotScalerPath: "/local_models/tpot_scaler.joblib" + httpTimeout: "30" + +inferencePool: + targetPorts: + - number: 8000 + modelServerType: vllm + apiVersion: inference.networking.k8s.io/v1 + # modelServers: + # matchLabels: + # app: vllm-llama3-8b-instruct + +provider: + name: none diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml index d45e6ed39..13e57cb79 100644 --- a/config/charts/inferencepool/values.yaml +++ b/config/charts/inferencepool/values.yaml @@ -40,6 +40,66 @@ inferenceExtension: tolerations: [] + # SLO-aware routing with latency prediction sidecars + # Uncomment and configure to enable SLO prediction + # sidecars: + # trainingServer: + # enabled: false + # image: + # hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension + # name: latency-training + # tag: latest + # pullPolicy: Always + # resources: + # requests: + # cpu: "2000m" + # memory: "4Gi" + # limits: + # cpu: "4000m" + # memory: "8Gi" + # config: + # retrainingIntervalSec: "1" + # minSamplesForRetrain: "100" + # ttftModelPath: "/models/ttft.joblib" + # tpotModelPath: "/models/tpot.joblib" + # ttftScalerPath: "/models/ttft_scaler.joblib" + # tpotScalerPath: "/models/tpot_scaler.joblib" + # modelType: "xgboost" + # persistence: + # enabled: false + # claimName: "training-models-pvc" + # env: [] + # envFrom: [] + # + # predictionServers: + # enabled: false + # replicas: 3 + # image: + # hub: us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension + # name: latency-prediction + # tag: latest + # pullPolicy: Always + # resources: + # requests: + # cpu: "500m" + # memory: "1Gi" + # limits: + # cpu: "1000m" + # memory: "2Gi" + # config: + # modelSyncIntervalSec: "10" + # modelType: "xgboost" + # host: "0.0.0.0" + # port: "8001" + # trainingServerUrl: "http://localhost:8000" + # localTtftModelPath: "/local_models/ttft.joblib" + # localTpotModelPath: "/local_models/tpot.joblib" + # localTtftScalerPath: "/local_models/ttft_scaler.joblib" + # localTpotScalerPath: "/local_models/tpot_scaler.joblib" + # httpTimeout: "30" + # env: [] + # envFrom: [] + inferencePool: targetPorts: - number: 8000 From 2c0dcc5b6cf36031bc45fcb0bc910a4e6d169aba Mon Sep 17 00:00:00 2001 From: RishabhSaini Date: Mon, 10 Nov 2025 15:21:57 -0500 Subject: [PATCH 2/2] gracefully skip TTFT model training when not enough samples (no undefined vars) --- 
latencypredictor-v1/training_server.py | 32 ++++++++++++-------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/latencypredictor-v1/training_server.py b/latencypredictor-v1/training_server.py index fa8a81118..51425bfe0 100644 --- a/latencypredictor-v1/training_server.py +++ b/latencypredictor-v1/training_server.py @@ -628,22 +628,22 @@ def train(self): if len(df_ttft) >= settings.MIN_SAMPLES_FOR_RETRAIN: # Updated TTFT features to include prefix_cache_score ttft_feature_cols_tree = [ - 'kv_cache_percentage','input_token_length','num_request_waiting', - 'num_request_running','prefix_cache_score','effective_input_tokens','prefill_score_bucket' - ] - ttft_feature_cols_br = [ - 'kv_cache_percentage','input_token_length','num_request_waiting', - 'num_request_running','prefix_cache_score','effective_input_tokens' - ] - - # Build X_ttft for all model types, then trim for BR - X_ttft = df_ttft[ttft_feature_cols_tree] - if self.model_type == ModelType.BAYESIAN_RIDGE: - X_ttft = X_ttft[ttft_feature_cols_br] + 'kv_cache_percentage','input_token_length','num_request_waiting', + 'num_request_running','prefix_cache_score','effective_input_tokens','prefill_score_bucket' + ] + ttft_feature_cols_br = [ + 'kv_cache_percentage','input_token_length','num_request_waiting', + 'num_request_running','prefix_cache_score','effective_input_tokens' + ] + + # Build X_ttft for all model types, then trim for BR + X_ttft = df_ttft[ttft_feature_cols_tree] + if self.model_type == ModelType.BAYESIAN_RIDGE: + X_ttft = X_ttft[ttft_feature_cols_br] - y_ttft = raw_ttft['actual_ttft_ms'] + y_ttft = raw_ttft['actual_ttft_ms'] - try: + try: # raw_ttft still has the original columns including 'prefix_cache_score' raw_ttft['_prefix_bucket'] = raw_ttft['prefix_cache_score'].clip(0, 1).apply( lambda s: min(int(s * self.prefix_buckets), self.prefix_buckets - 1) @@ -677,8 +677,6 @@ def train(self): new_ttft_model, new_ttft_scaler, test_records, cols, 'actual_ttft_ms' ) - - if ql is not None: self.ttft_quantile_loss_scores.append(ql) self.ttft_coverage_scores.append(coverage) @@ -690,7 +688,7 @@ def train(self): else: logging.info(f"TTFT model trained on {len(df_ttft)} samples. Quantile metrics = N/A (insufficient test data)") - except Exception: + except Exception: logging.error("Error training TTFT model", exc_info=True)
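The second commit re-indents the TTFT training block so that feature construction and model fitting happen only when `len(df_ttft) >= settings.MIN_SAMPLES_FOR_RETRAIN`; when there are too few samples the branch is skipped entirely and no undefined variables are referenced afterwards. Below is a minimal, self-contained sketch of that guard pattern. The helper names (`maybe_train_ttft`, `fit_model`, `min_samples`) are illustrative and not part of the patch; the feature column names come from the diff.

```python
# Sketch of the "skip retraining when under-sampled" guard, under the
# assumptions stated above (not the actual training_server.py implementation).
import logging
from typing import Optional

import pandas as pd

TREE_FEATURES = [
    'kv_cache_percentage', 'input_token_length', 'num_request_waiting',
    'num_request_running', 'prefix_cache_score', 'effective_input_tokens',
    'prefill_score_bucket',
]
BR_FEATURES = TREE_FEATURES[:-1]  # Bayesian Ridge drops the bucketed feature


def maybe_train_ttft(df_ttft: pd.DataFrame, min_samples: int,
                     use_bayesian_ridge: bool, fit_model) -> Optional[object]:
    """Return a fitted TTFT model, or None when there is not enough data."""
    if len(df_ttft) < min_samples:
        # Graceful skip: nothing below runs, so X_ttft / y_ttft are never
        # referenced in an undefined state.
        logging.info("Skipping TTFT retrain: %d samples < %d required",
                     len(df_ttft), min_samples)
        return None

    cols = BR_FEATURES if use_bayesian_ridge else TREE_FEATURES
    X_ttft = df_ttft[cols]
    y_ttft = df_ttft['actual_ttft_ms']

    try:
        return fit_model(X_ttft, y_ttft)
    except Exception:
        logging.error("Error training TTFT model", exc_info=True)
        return None
```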