diff --git a/config/charts/inferencepool/templates/_helpers.tpl b/config/charts/inferencepool/templates/_helpers.tpl
index fdc9b1a2b..676f96775 100644
--- a/config/charts/inferencepool/templates/_helpers.tpl
+++ b/config/charts/inferencepool/templates/_helpers.tpl
@@ -31,3 +31,17 @@ Selector labels
 {{- define "gateway-api-inference-extension.selectorLabels" -}}
 inferencepool: {{ include "gateway-api-inference-extension.name" . }}
 {{- end -}}
+
+{{/*
+Envoy Common labels
+*/}}
+{{- define "gateway-api-inference-extension.envoy-labels" -}}
+app.kubernetes.io/name: {{ include "gateway-api-inference-extension.name" . }}-envoy
+{{- end }}
+
+{{/*
+Envoy Selector labels
+*/}}
+{{- define "gateway-api-inference-extension.envoy-selectorLabels" -}}
+envoy: {{ include "gateway-api-inference-extension.name" . }}-envoy
+{{- end }}
diff --git a/config/charts/inferencepool/templates/envoy-service.yaml b/config/charts/inferencepool/templates/envoy-service.yaml
new file mode 100644
index 000000000..d8c2aafda
--- /dev/null
+++ b/config/charts/inferencepool/templates/envoy-service.yaml
@@ -0,0 +1,19 @@
+{{- if eq (lower .Values.provider.name) "standalone" }}
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: {{ include "gateway-api-inference-extension.name" . }}-envoy
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{- include "gateway-api-inference-extension.envoy-labels" . | nindent 4 }}
+spec:
+  selector:
+    {{- include "gateway-api-inference-extension.envoy-selectorLabels" . | nindent 4 }}
+  ports:
+  - name: http
+    port: {{ .Values.provider.standalone.envoy.servicePort | default 8081 }}
+    protocol: TCP
+    targetPort: 8081
+  type: ClusterIP
+{{- end }}
diff --git a/config/charts/inferencepool/templates/epp-deployment.yaml b/config/charts/inferencepool/templates/epp-deployment.yaml
index de892337d..d3177f552 100644
--- a/config/charts/inferencepool/templates/epp-deployment.yaml
+++ b/config/charts/inferencepool/templates/epp-deployment.yaml
@@ -1,3 +1,5 @@
+{{- if ne (lower .Values.provider.name) "standalone" }}
+
 ---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@@ -151,3 +153,4 @@ spec:
       tolerations: {{- toYaml .Values.inferenceExtension.tolerations | nindent 8 }}
 {{- end }}
+{{- end }}
diff --git a/config/charts/inferencepool/templates/epp-standalone-deployment.yaml b/config/charts/inferencepool/templates/epp-standalone-deployment.yaml
new file mode 100644
index 000000000..559fb7219
--- /dev/null
+++ b/config/charts/inferencepool/templates/epp-standalone-deployment.yaml
@@ -0,0 +1,410 @@
+{{- if eq (lower .Values.provider.name) "standalone" }}
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: {{ include "gateway-api-inference-extension.name" . }}-envoy
+  namespace: {{ .Release.Namespace }}
+  labels:
+    {{ include "gateway-api-inference-extension.envoy-labels" . 
| nindent 4 }} +data: + envoy.yaml: | + admin: + address: + socket_address: + address: 127.0.0.1 + port_value: 19000 + access_log: + - name: envoy.access_loggers.file + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog + path: /dev/null + static_resources: + listeners: + - name: envoy-proxy-ready-0.0.0.0-19001 + address: + socket_address: + address: 0.0.0.0 + port_value: 19001 + filter_chains: + - filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: envoy-ready-http + route_config: + name: local_route + virtual_hosts: + - name: prometheus_stats + domains: ["*"] + routes: + - match: + prefix: "/stats/prometheus" + route: + cluster: "prometheus_stats" + http_filters: + - name: envoy.filters.http.health_check + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.health_check.v3.HealthCheck + pass_through_mode: false + headers: + - name: ":path" + string_match: + exact: "/ready" + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + - name: vllm + address: + socket_address: + address: 0.0.0.0 + port_value: 8081 + per_connection_buffer_limit_bytes: 32768 + access_log: + - name: envoy.access_loggers.file + filter: + response_flag_filter: + flags: ["NR"] + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog + path: /dev/stdout + log_format: + text_format_source: + inline_string: "{\"start_time\":\"%START_TIME%\",\"method\":\"%REQ(:METHOD)%\",...}\n" + filter_chains: + - name: vllm + filters: + - name: envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: http-8081 + route_config: + name: vllm 
+ virtual_hosts: + - name: vllm-default + domains: ["*"] + routes: + - match: + prefix: "/" + route: + cluster: original_destination_cluster + timeout: 86400s + idle_timeout: 86400s + upgrade_configs: + - upgrade_type: websocket + typed_per_filter_config: + envoy.filters.http.ext_proc: + "@type": type.googleapis.com/envoy.config.route.v3.FilterConfig + config: {} + http_filters: + - name: envoy.filters.http.ext_proc + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.ext_proc.v3.ExternalProcessor + grpc_service: + envoy_grpc: + cluster_name: ext_proc + authority: localhost:9002 + timeout: 10s + processing_mode: + request_header_mode: SEND + response_header_mode: SEND + request_body_mode: FULL_DUPLEX_STREAMED + response_body_mode: FULL_DUPLEX_STREAMED + request_trailer_mode: SEND + response_trailer_mode: SEND + message_timeout: 1000s + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + suppress_envoy_headers: true + http2_protocol_options: + max_concurrent_streams: 100 + initial_stream_window_size: 65536 + initial_connection_window_size: 1048576 + use_remote_address: true + normalize_path: true + merge_slashes: true + server_header_transformation: PASS_THROUGH + common_http_protocol_options: + headers_with_underscores_action: REJECT_REQUEST + path_with_escaped_slashes_action: UNESCAPE_AND_REDIRECT + access_log: + - name: envoy.access_loggers.file + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.file.v3.FileAccessLog + path: /dev/stdout + log_format: + text_format_source: + inline_string: "{\"start_time\":\"%START_TIME%\",\"method\":\"%REQ(:METHOD)%\",...}\n" + clusters: + - name: prometheus_stats + type: STATIC + connect_timeout: 0.250s + load_assignment: + cluster_name: prometheus_stats + endpoints: + - lb_endpoints: + - endpoint: + address: + socket_address: + address: 127.0.0.1 + port_value: 19000 + - name: 
original_destination_cluster + type: ORIGINAL_DST + connect_timeout: 1000s + lb_policy: CLUSTER_PROVIDED + circuit_breakers: + thresholds: + - max_connections: 40000 + max_pending_requests: 40000 + max_requests: 40000 + original_dst_lb_config: + use_http_header: true + http_header_name: x-gateway-destination-endpoint + - name: ext_proc + type: STATIC + connect_timeout: 86400s + lb_policy: LEAST_REQUEST + circuit_breakers: + thresholds: + - max_connections: 40000 + max_pending_requests: 40000 + max_requests: 40000 + max_retries: 1024 + health_checks: + - timeout: 2s + interval: 10s + unhealthy_threshold: 3 + healthy_threshold: 2 + reuse_connection: true + grpc_health_check: + service_name: "envoy.service.ext_proc.v3.ExternalProcessor" + tls_options: + alpn_protocols: ["h2"] + transport_socket: + name: "envoy.transport_sockets.tls" + typed_config: + "@type": type.googleapis.com/envoy.extensions.transport_sockets.tls.v3.UpstreamTlsContext + common_tls_context: + validation_context: + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http2_protocol_options: + initial_stream_window_size: 65536 + initial_connection_window_size: 1048576 + load_assignment: + cluster_name: ext_proc + endpoints: + - locality: + region: ext_proc/e2e/0 + lb_endpoints: + - endpoint: + address: + socket_address: + address: 127.0.0.1 + port_value: 9002 + load_balancing_weight: 1 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "gateway-api-inference-extension.name" . }}-envoy + namespace: {{ .Release.Namespace }} + labels: {{ include "gateway-api-inference-extension.envoy-labels" . | nindent 4 }} +spec: + replicas: {{ .Values.provider.standalone.envoy.replicas | default 1 }} + selector: + matchLabels: + {{ include "gateway-api-inference-extension.envoy-selectorLabels" . 
| nindent 6 }} + template: + metadata: + labels: {{ include "gateway-api-inference-extension.envoy-selectorLabels" . | nindent 8 }} + annotations: + prometheus.io/path: /stats/prometheus + prometheus.io/port: "19001" # This still correctly refers to the envoy container's metrics port + prometheus.io/scrape: "true" + spec: + serviceAccountName: {{ include "gateway-api-inference-extension.name" . }} + terminationGracePeriodSeconds: 130 + containers: + - name: envoy + image: {{ .Values.provider.standalone.envoy.image.hub }}/{{ .Values.provider.standalone.envoy.image.name }}:{{ .Values.provider.standalone.envoy.image.tag }} + imagePullPolicy: {{ .Values.provider.standalone.envoy.image.pullPolicy | default "Always" }} + args: + - "--service-cluster" + - "{{.Release.Namespace}}/inference-gateway" + - "--service-node" + - "envoy" + - "--log-level" + - "trace" + - "--cpuset-threads" + - "--drain-strategy" + - "immediate" + - "--drain-time-s" + - "60" + - "-c" + - "/etc/envoy/envoy.yaml" + command: + - envoy + env: + - name: ENVOY_NS_NAME + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: ENVOY_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + ports: + - containerPort: 8081 + name: http-8081 + - containerPort: 19001 + name: metrics # <-- Envoy's metrics port + readinessProbe: + failureThreshold: 1 + httpGet: + path: /ready + port: 19001 + scheme: HTTP + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 1 + resources: + requests: + cpu: 100m + memory: 512Mi + volumeMounts: + - name: envoy-config-volume + mountPath: /etc/envoy + readOnly: true + - name: epp + image: {{ .Values.inferenceExtension.image.hub }}/{{ .Values.inferenceExtension.image.name }}:{{ .Values.inferenceExtension.image.tag }} + imagePullPolicy: {{ .Values.inferenceExtension.image.pullPolicy | default "Always" }} + args: + - --pool-name + - {{ .Release.Name }} + - --pool-namespace + - {{ .Release.Namespace }} + {{- if ne .Values.inferencePool.apiVersion 
"inference.networking.k8s.io" }} + - --pool-group + - "{{ (split "/" .Values.inferencePool.apiVersion)._0 }}" + {{- end }} + - --zap-encoder + - "json" + - --config-file + - "/config/{{ .Values.inferenceExtension.pluginsConfigFile }}" + {{- if eq (.Values.inferencePool.modelServerType | default "vllm") "triton-tensorrt-llm" }} + - --total-queued-requests-metric + - "nv_trt_llm_request_metrics{request_type=waiting}" + - --kv-cache-usage-percentage-metric + - "nv_trt_llm_kv_cache_block_metrics{kv_cache_block_type=fraction}" + - --lora-info-metric + - "" # Set an empty metric to disable LoRA metric scraping as they are not supported by Triton yet. + {{- end }} + {{- if gt (.Values.inferenceExtension.replicas | int) 1 }} + - --ha-enable-leader-election + {{- end }} + # Pass additional flags via the inferenceExtension.flags field in values.yaml. + {{- range .Values.inferenceExtension.flags }} + - "--{{ .name }}" + - "{{ .value }}" + {{- end }} + {{- if .Values.inferenceExtension.tracing.enabled }} + - --tracing=true + {{- else }} + - --tracing=false + {{- end }} + {{- if not .Values.inferenceExtension.monitoring.prometheus.enabled }} + - --metrics-endpoint-auth=false + {{- end }} + ports: + - name: grpc + containerPort: 9002 + - name: grpc-health + containerPort: 9003 + - name: metrics + containerPort: 9090 + {{- if .Values.inferenceExtension.extraContainerPorts }} + {{- toYaml .Values.inferenceExtension.extraContainerPorts | nindent 8 }} + {{- end }} + livenessProbe: + {{- if gt (.Values.inferenceExtension.replicas | int) 1 }} + grpc: + port: 9003 + service: liveness + {{- else }} + grpc: + port: 9003 + service: inference-extension + {{- end }} + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + {{- if gt (.Values.inferenceExtension.replicas | int) 1 }} + grpc: + port: 9003 + service: readiness + {{- else }} + grpc: + port: 9003 + service: inference-extension + {{- end }} + periodSeconds: 2 + + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: 
metadata.namespace + {{- if .Values.inferenceExtension.tracing.enabled }} + - name: OTEL_SERVICE_NAME + value: "gateway-api-inference-extension" + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: {{ .Values.inferenceExtension.tracing.otelExporterEndpoint | quote }} + - name: OTEL_TRACES_EXPORTER + value: "otlp" + - name: OTEL_RESOURCE_ATTRIBUTES_NODE_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: spec.nodeName + - name: OTEL_RESOURCE_ATTRIBUTES_POD_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.name + - name: OTEL_RESOURCE_ATTRIBUTES + value: 'k8s.namespace.name=$(NAMESPACE),k8s.node.name=$(OTEL_RESOURCE_ATTRIBUTES_NODE_NAME),k8s.pod.name=$(OTEL_RESOURCE_ATTRIBUTES_POD_NAME)' + - name: OTEL_TRACES_SAMPLER + value: {{ .Values.inferenceExtension.tracing.sampling.sampler | quote }} + - name: OTEL_TRACES_SAMPLER_ARG + value: {{ .Values.inferenceExtension.tracing.sampling.samplerArg | quote }} + {{- end }} + {{- if .Values.inferenceExtension.env }} + {{- toYaml .Values.inferenceExtension.env | nindent 8 }} + {{- end }} + volumeMounts: + - name: plugins-config-volume + mountPath: "/config" + volumes: + - name: envoy-config-volume + configMap: + name: {{ include "gateway-api-inference-extension.name" . }}-envoy + items: + - key: envoy.yaml + path: envoy.yaml + - name: plugins-config-volume + configMap: + name: {{ include "gateway-api-inference-extension.name" . 
}} + {{- if .Values.inferenceExtension.affinity }} + affinity: + {{- toYaml .Values.inferenceExtension.affinity | nindent 8 }} + {{- end }} + {{- if .Values.inferenceExtension.tolerations }} + tolerations: + {{- toYaml .Values.inferenceExtension.tolerations | nindent 8 }} + {{- end }} +--- +{{- end }} diff --git a/config/charts/inferencepool/values.yaml b/config/charts/inferencepool/values.yaml index 8b3385ab1..c2ad0ff96 100644 --- a/config/charts/inferencepool/values.yaml +++ b/config/charts/inferencepool/values.yaml @@ -76,16 +76,26 @@ inferencePool: # This will soon be deprecated when upstream GW providers support v1, just doing something simple for now. targetPortNumber: 8000 -# Options: ["gke", "istio", "none"] +# Options: ["gke", "istio", "standalone", "none"] provider: name: none - # GKE-specific configuration. # This block is only used if name is "gke". gke: # Set to true if the cluster is an Autopilot cluster. autopilot: false + standalone: + replicas: 1 + envoy: + image: + name: envoy + hub: docker.io/envoyproxy + tag: distroless-v1.33.2 + pullPolicy: Always + servicePort: 8081 + + istio: destinationRule: # Provide a way to override the default calculated host @@ -94,4 +104,6 @@ istio: trafficPolicy: {} # connectionPool: # http: - # maxRequestsPerConnection: 256000 \ No newline at end of file + # maxRequestsPerConnection: 256000 + + diff --git a/site-src/_includes/epp-latest.md b/site-src/_includes/epp-latest.md index ef08a61be..9ccfe0e93 100644 --- a/site-src/_includes/epp-latest.md +++ b/site-src/_includes/epp-latest.md @@ -30,3 +30,14 @@ --version $IGW_CHART_VERSION \ oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool ``` +=== "Standalone EPP" + ```bash + export GATEWAY_PROVIDER=none + helm install vllm-llama3-8b-instruct \ + --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ + --set provider.name=$GATEWAY_PROVIDER \ + --set provider.standalone=true \ + --version 
$IGW_CHART_VERSION \ + oci://us-central1-docker.pkg.dev/k8s-staging-images/gateway-api-inference-extension/charts/inferencepool + ``` + diff --git a/site-src/_includes/epp.md b/site-src/_includes/epp.md index 73e24786f..4872b910f 100644 --- a/site-src/_includes/epp.md +++ b/site-src/_includes/epp.md @@ -30,3 +30,14 @@ --version $IGW_CHART_VERSION \ oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool ``` +=== "Standalone EPP" + + ```bash + export GATEWAY_PROVIDER=none + helm install vllm-llama3-8b-instruct \ + --set inferencePool.modelServers.matchLabels.app=vllm-llama3-8b-instruct \ + --set provider.name=$GATEWAY_PROVIDER \ + --set provider.standalone=true \ + --version $IGW_CHART_VERSION \ + oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool + ``` \ No newline at end of file diff --git a/site-src/guides/getting-started-latest.md b/site-src/guides/getting-started-latest.md index bf7413b8e..e605da71a 100644 --- a/site-src/guides/getting-started-latest.md +++ b/site-src/guides/getting-started-latest.md @@ -43,7 +43,7 @@ kubectl apply -k https://github.com/kubernetes-sigs/gateway-api-inference-extension/config/crd ``` -### Install the Gateway +### Install the Gateway if you are not using Standalone EPP Choose one of the following options to install Gateway. @@ -91,7 +91,9 @@ kubectl apply -k https://github.com/kubernetes-sigs/gateway-api-inference-extens ```bash helm upgrade -i --namespace kgateway-system --version $KGTW_VERSION kgateway oci://cr.kgateway.dev/kgateway-dev/charts/kgateway --set inferenceExtension.enabled=true ``` - +=== "Standalone EPP" + Nothing to install here as you don't need a gateway + ### Deploy the InferencePool and Endpoint Picker Extension Install an InferencePool named `vllm-llama3-8b-instruct` that selects from endpoints with label `app: vllm-llama3-8b-instruct` and listening on port 8000. 
The Helm install command automatically installs the endpoint-picker, InferencePool along with provider specific resources. @@ -104,7 +106,7 @@ kubectl apply -k https://github.com/kubernetes-sigs/gateway-api-inference-extens --8<-- "site-src/_includes/epp-latest.md" -### Deploy an Inference Gateway +### Deploy an Inference Gateway if not using Standalone EPP Choose one of the following options to deploy an Inference Gateway. @@ -199,6 +201,10 @@ kubectl apply -k https://github.com/kubernetes-sigs/gateway-api-inference-extens kubectl get httproute llm-route -o yaml ``` +=== "Standalone EPP" + + Nothing is needed. + ### Deploy InferenceObjective (Optional) Deploy the sample InferenceObjective which allows you to specify priority of requests. @@ -290,3 +296,6 @@ Deploy the sample InferenceObjective which allows you to specify priority of req ```bash kubectl delete ns kgateway-system ``` +=== "Standalone EPP" + + N/A \ No newline at end of file