526 changes: 526 additions & 0 deletions deploy/kserve/README.md

Large diffs are not rendered by default.

161 changes: 161 additions & 0 deletions deploy/kserve/configmap-envoy-config.yaml
@@ -0,0 +1,161 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: semantic-router-envoy-kserve-config
labels:
app: semantic-router
component: envoy
data:
envoy.yaml: |
# Envoy configuration for KServe InferenceService integration
# This config routes traffic to KServe predictors based on semantic router decisions
static_resources:
listeners:
- name: listener_0
address:
socket_address:
address: 0.0.0.0
port_value: 8801
filter_chains:
- filters:
- name: envoy.filters.network.http_connection_manager
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
stat_prefix: ingress_http
access_log:
- name: envoy.access_loggers.stdout
typed_config:
"@type": type.googleapis.com/envoy.extensions.access_loggers.stream.v3.StdoutAccessLog
log_format:
json_format:
time: "%START_TIME%"
protocol: "%PROTOCOL%"
request_method: "%REQ(:METHOD)%"
request_path: "%REQ(X-ENVOY-ORIGINAL-PATH?:PATH)%"
response_code: "%RESPONSE_CODE%"
response_flags: "%RESPONSE_FLAGS%"
bytes_received: "%BYTES_RECEIVED%"
bytes_sent: "%BYTES_SENT%"
duration: "%DURATION%"
upstream_host: "%UPSTREAM_HOST%"
upstream_cluster: "%UPSTREAM_CLUSTER%"
upstream_local_address: "%UPSTREAM_LOCAL_ADDRESS%"
request_id: "%REQ(X-REQUEST-ID)%"
selected_model: "%REQ(X-SELECTED-MODEL)%"
selected_endpoint: "%REQ(X-GATEWAY-DESTINATION-ENDPOINT)%"
route_config:
name: local_route
virtual_hosts:
- name: local_service
domains: ["*"]
routes:
# Route /v1/models to semantic router for model aggregation
- match:
path: "/v1/models"
route:
cluster: semantic_router_cluster
timeout: 300s
# Dynamic route - destination determined by x-gateway-destination-endpoint header
- match:
prefix: "/"
route:
cluster: kserve_dynamic_cluster
timeout: 300s
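                  # Illustrative flow (a comment, not additional config): a chat
                  # completion request first passes through the ext_proc filter below
                  # (the semantic router), which may rewrite the model name and set
                  # x-gateway-destination-endpoint; this catch-all route then forwards
                  # it to kserve_dynamic_cluster, which dials whatever endpoint that
                  # header names.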
http_filters:
- name: envoy.filters.http.ext_proc
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.ext_proc.v3.ExternalProcessor
grpc_service:
envoy_grpc:
cluster_name: extproc_service
allow_mode_override: true
processing_mode:
request_header_mode: "SEND"
response_header_mode: "SEND"
request_body_mode: "BUFFERED"
response_body_mode: "BUFFERED"
request_trailer_mode: "SKIP"
response_trailer_mode: "SKIP"
failure_mode_allow: true
message_timeout: 300s
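                  # Note: failure_mode_allow: true lets requests pass through
                  # unmodified if the external processor is unreachable, and the
                  # BUFFERED body modes are what allow the router to inspect and
                  # rewrite full request and response bodies.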
- name: envoy.filters.http.router
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
suppress_envoy_headers: true
http2_protocol_options:
max_concurrent_streams: 100
initial_stream_window_size: 65536
initial_connection_window_size: 1048576
stream_idle_timeout: "300s"
request_timeout: "300s"
common_http_protocol_options:
idle_timeout: "300s"

clusters:
- name: extproc_service
connect_timeout: 300s
per_connection_buffer_limit_bytes: 52428800
type: STATIC
lb_policy: ROUND_ROBIN
typed_extension_protocol_options:
envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
"@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
explicit_http_config:
http2_protocol_options:
connection_keepalive:
interval: 300s
timeout: 300s
load_assignment:
cluster_name: extproc_service
endpoints:
- lb_endpoints:
- endpoint:
address:
socket_address:
address: 127.0.0.1
port_value: 50051

# Static cluster for semantic router API
- name: semantic_router_cluster
connect_timeout: 300s
per_connection_buffer_limit_bytes: 52428800
type: STATIC
lb_policy: ROUND_ROBIN
load_assignment:
cluster_name: semantic_router_cluster
endpoints:
- lb_endpoints:
- endpoint:
address:
socket_address:
address: 127.0.0.1
port_value: 8080
typed_extension_protocol_options:
envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
"@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
explicit_http_config:
http_protocol_options: {}

      # Dynamic cluster for KServe InferenceService predictors.
      # Uses ORIGINAL_DST with header-based destination selection: the semantic
      # router sets the x-gateway-destination-endpoint header to specify the target.
      # Format: <service-name>-predictor.<namespace>.svc.cluster.local:80
- name: kserve_dynamic_cluster
connect_timeout: 300s
per_connection_buffer_limit_bytes: 52428800
type: ORIGINAL_DST
lb_policy: CLUSTER_PROVIDED
original_dst_lb_config:
use_http_header: true
http_header_name: "x-gateway-destination-endpoint"
typed_extension_protocol_options:
envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
"@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
explicit_http_config:
http_protocol_options: {}
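      # Example (names are placeholders): if the router sets
      #   x-gateway-destination-endpoint: granite32-8b-predictor.my-namespace.svc.cluster.local:80
      # this cluster dials that address directly; no static endpoint list is needed.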

admin:
address:
socket_address:
address: "127.0.0.1"
port_value: 19000
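    # The admin interface is bound to localhost only. To inspect it (pod name is
    # a placeholder), port-forward into the Envoy pod, e.g.:
    #   kubectl port-forward pod/<semantic-router-pod> 19000:19000
    #   curl http://127.0.0.1:19000/clusters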
235 changes: 235 additions & 0 deletions deploy/kserve/configmap-router-config.yaml
@@ -0,0 +1,235 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: semantic-router-kserve-config
labels:
app: semantic-router
component: config
data:
config.yaml: |
bert_model:
model_id: models/all-MiniLM-L12-v2
threshold: 0.6
use_cpu: true

semantic_cache:
enabled: true
backend_type: "memory"
similarity_threshold: 0.8
max_entries: 1000
ttl_seconds: 3600
eviction_policy: "fifo"
use_hnsw: true
hnsw_m: 16
hnsw_ef_construction: 200
embedding_model: "bert"
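      # Individual categories below may override these defaults via the
      # per-category semantic_cache_enabled and
      # semantic_cache_similarity_threshold fields.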

tools:
enabled: true
top_k: 3
similarity_threshold: 0.2
tools_db_path: "config/tools_db.json"
fallback_to_empty: true

prompt_guard:
enabled: true
use_modernbert: true
model_id: "models/jailbreak_classifier_modernbert-base_model"
threshold: 0.7
use_cpu: true
jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"

# vLLM Endpoints Configuration - Using KServe InferenceService internal URLs
# IMPORTANT: These are the internal cluster URLs for the InferenceService predictors
# Format: <service-name>-predictor.<namespace>.svc.cluster.local
# Replace <namespace> with your actual namespace and configure for your deployed models
vllm_endpoints:
- name: "vllm-model-endpoint"
address: "your-model-predictor.<namespace>.svc.cluster.local"
port: 80 # KServe uses port 80 for internal service
weight: 1
# Example with granite32-8b:
# - name: "granite32-8b-endpoint"
# address: "granite32-8b-predictor.<namespace>.svc.cluster.local"
# port: 80
# weight: 1

model_config:
# Configure this to match your deployed InferenceService model name
"your-model-name":
reasoning_family: "qwen3" # Options: qwen3, deepseek, gpt, gpt-oss
preferred_endpoints: ["vllm-model-endpoint"]
pii_policy:
allow_by_default: true
pii_types_allowed: ["EMAIL_ADDRESS"]
# Example with granite32-8b:
# "granite32-8b":
# reasoning_family: "qwen3"
# preferred_endpoints: ["granite32-8b-endpoint"]
# pii_policy:
# allow_by_default: true
# pii_types_allowed: ["EMAIL_ADDRESS"]

# Classifier configuration
classifier:
category_model:
model_id: "models/category_classifier_modernbert-base_model"
use_modernbert: true
threshold: 0.6
use_cpu: true
category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"
pii_model:
model_id: "models/pii_classifier_modernbert-base_presidio_token_model"
use_modernbert: true
threshold: 0.7
use_cpu: true
pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json"

# Categories with model scoring
categories:
- name: business
system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development. Provide practical, actionable business advice backed by proven methodologies and industry best practices."
model_scores:
- model: your-model-name
score: 0.7
use_reasoning: false
- name: law
system_prompt: "You are a knowledgeable legal expert with comprehensive understanding of legal principles, case law, statutory interpretation, and legal procedures across multiple jurisdictions."
model_scores:
- model: your-model-name
score: 0.4
use_reasoning: false
- name: psychology
system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches."
semantic_cache_enabled: true
semantic_cache_similarity_threshold: 0.92
model_scores:
- model: your-model-name
score: 0.6
use_reasoning: false
- name: biology
system_prompt: "You are a biology expert with comprehensive knowledge spanning molecular biology, genetics, cell biology, ecology, evolution, anatomy, physiology, and biotechnology."
model_scores:
- model: your-model-name
score: 0.9
use_reasoning: false
- name: chemistry
system_prompt: "You are a chemistry expert specializing in chemical reactions, molecular structures, and laboratory techniques. Provide detailed, step-by-step explanations."
model_scores:
- model: your-model-name
score: 0.6
use_reasoning: true
- name: history
system_prompt: "You are a historian with expertise across different time periods and cultures. Provide accurate historical context and analysis."
model_scores:
- model: your-model-name
score: 0.7
use_reasoning: false
- name: other
system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics."
semantic_cache_enabled: true
semantic_cache_similarity_threshold: 0.75
model_scores:
- model: your-model-name
score: 0.7
use_reasoning: false
- name: health
system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness."
semantic_cache_enabled: true
semantic_cache_similarity_threshold: 0.95
model_scores:
- model: your-model-name
score: 0.5
use_reasoning: false
- name: economics
system_prompt: "You are an economics expert with deep understanding of microeconomics, macroeconomics, econometrics, financial markets, monetary policy, fiscal policy, international trade, and economic theory."
model_scores:
- model: your-model-name
score: 1.0
use_reasoning: false
- name: math
system_prompt: "You are a mathematics expert. Provide step-by-step solutions, show your work clearly, and explain mathematical concepts in an understandable way."
model_scores:
- model: your-model-name
score: 1.0
use_reasoning: true
- name: physics
system_prompt: "You are a physics expert with deep understanding of physical laws and phenomena. Provide clear explanations with mathematical derivations when appropriate."
model_scores:
- model: your-model-name
score: 0.7
use_reasoning: true
- name: computer science
system_prompt: "You are a computer science expert with knowledge of algorithms, data structures, programming languages, and software engineering. Provide clear, practical solutions with code examples when helpful."
model_scores:
- model: your-model-name
score: 0.6
use_reasoning: false
- name: philosophy
system_prompt: "You are a philosophy expert with comprehensive knowledge of philosophical traditions, ethical theories, logic, metaphysics, epistemology, political philosophy, and the history of philosophical thought."
model_scores:
- model: your-model-name
score: 0.5
use_reasoning: false
- name: engineering
system_prompt: "You are an engineering expert with knowledge across multiple engineering disciplines including mechanical, electrical, civil, chemical, software, and systems engineering."
model_scores:
- model: your-model-name
score: 0.7
use_reasoning: false
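    # For each classified category the router routes to the highest-scoring model
    # listed under model_scores; use_reasoning toggles that model's reasoning
    # family behavior (configured below) for prompts in that category.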

default_model: your-model-name

# Reasoning family configurations
reasoning_families:
deepseek:
type: "chat_template_kwargs"
parameter: "thinking"
qwen3:
type: "chat_template_kwargs"
parameter: "enable_thinking"
gpt-oss:
type: "reasoning_effort"
parameter: "reasoning_effort"
gpt:
type: "reasoning_effort"
parameter: "reasoning_effort"

default_reasoning_effort: high

# API Configuration
api:
batch_classification:
max_batch_size: 100
concurrency_threshold: 5
max_concurrency: 8
metrics:
enabled: true
detailed_goroutine_tracking: true
high_resolution_timing: false
sample_rate: 1.0
duration_buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]

# Embedding Models Configuration
embedding_models:
qwen3_model_path: "models/Qwen3-Embedding-0.6B"
gemma_model_path: "models/embeddinggemma-300m"
use_cpu: true

# Observability Configuration
observability:
tracing:
enabled: false
provider: "opentelemetry"
exporter:
type: "stdout"
endpoint: "localhost:4317"
insecure: true
sampling:
type: "always_on"
rate: 1.0
resource:
service_name: "vllm-semantic-router"
service_version: "v0.1.0"
deployment_environment: "production"