526 changes: 526 additions & 0 deletions deploy/kserve/README.md

Large diffs are not rendered by default.

161 changes: 161 additions & 0 deletions deploy/kserve/configmap-envoy-config.yaml
@@ -0,0 +1,161 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: semantic-router-envoy-kserve-config
labels:
app: semantic-router
component: envoy
data:
envoy.yaml: |
# Envoy configuration for KServe InferenceService integration
# This config routes traffic to KServe predictors based on semantic router decisions
static_resources:
listeners:
- name: listener_0
address:
socket_address:
address: 0.0.0.0
port_value: 8801
filter_chains:
- filters:
- name: envoy.filters.network.http_connection_manager
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
stat_prefix: ingress_http
access_log:
- name: envoy.access_loggers.stdout
typed_config:
"@type": type.googleapis.com/envoy.extensions.access_loggers.stream.v3.StdoutAccessLog
log_format:
json_format:
time: "%START_TIME%"
protocol: "%PROTOCOL%"
request_method: "%REQ(:METHOD)%"
request_path: "%REQ(X-ENVOY-ORIGINAL-PATH?:PATH)%"
response_code: "%RESPONSE_CODE%"
response_flags: "%RESPONSE_FLAGS%"
bytes_received: "%BYTES_RECEIVED%"
bytes_sent: "%BYTES_SENT%"
duration: "%DURATION%"
upstream_host: "%UPSTREAM_HOST%"
upstream_cluster: "%UPSTREAM_CLUSTER%"
upstream_local_address: "%UPSTREAM_LOCAL_ADDRESS%"
request_id: "%REQ(X-REQUEST-ID)%"
selected_model: "%REQ(X-SELECTED-MODEL)%"
selected_endpoint: "%REQ(X-GATEWAY-DESTINATION-ENDPOINT)%"
route_config:
name: local_route
virtual_hosts:
- name: local_service
domains: ["*"]
routes:
# Route /v1/models to semantic router for model aggregation
- match:
path: "/v1/models"
route:
cluster: semantic_router_cluster
timeout: 300s
# Dynamic route - destination determined by x-gateway-destination-endpoint header
- match:
prefix: "/"
route:
cluster: kserve_dynamic_cluster
timeout: 300s
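                  # Illustrative flow (a comment, not additional config): a chat
                  # completion request first passes through the ext_proc filter below
                  # (the semantic router), which may rewrite the model name and set
                  # x-gateway-destination-endpoint; this catch-all route then forwards
                  # it to kserve_dynamic_cluster, which dials whatever endpoint that
                  # header names.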
http_filters:
- name: envoy.filters.http.ext_proc
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.ext_proc.v3.ExternalProcessor
grpc_service:
envoy_grpc:
cluster_name: extproc_service
allow_mode_override: true
processing_mode:
request_header_mode: "SEND"
response_header_mode: "SEND"
request_body_mode: "BUFFERED"
response_body_mode: "BUFFERED"
request_trailer_mode: "SKIP"
response_trailer_mode: "SKIP"
failure_mode_allow: true
message_timeout: 300s
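                  # Note: failure_mode_allow: true lets requests pass through
                  # unmodified if the external processor is unreachable, and the
                  # BUFFERED body modes are what allow the router to inspect and
                  # rewrite full request and response bodies.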
- name: envoy.filters.http.router
typed_config:
"@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
suppress_envoy_headers: true
http2_protocol_options:
max_concurrent_streams: 100
initial_stream_window_size: 65536
initial_connection_window_size: 1048576
stream_idle_timeout: "300s"
request_timeout: "300s"
common_http_protocol_options:
idle_timeout: "300s"

clusters:
- name: extproc_service
connect_timeout: 300s
per_connection_buffer_limit_bytes: 52428800
type: STATIC
lb_policy: ROUND_ROBIN
typed_extension_protocol_options:
envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
"@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
explicit_http_config:
http2_protocol_options:
connection_keepalive:
interval: 300s
timeout: 300s
load_assignment:
cluster_name: extproc_service
endpoints:
- lb_endpoints:
- endpoint:
address:
socket_address:
address: 127.0.0.1
port_value: 50051

# Static cluster for semantic router API
- name: semantic_router_cluster
connect_timeout: 300s
per_connection_buffer_limit_bytes: 52428800
type: STATIC
lb_policy: ROUND_ROBIN
load_assignment:
cluster_name: semantic_router_cluster
endpoints:
- lb_endpoints:
- endpoint:
address:
socket_address:
address: 127.0.0.1
port_value: 8080
typed_extension_protocol_options:
envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
"@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
explicit_http_config:
http_protocol_options: {}

      # Dynamic cluster for KServe InferenceService predictors.
      # Uses ORIGINAL_DST with header-based destination selection: the semantic
      # router sets the x-gateway-destination-endpoint header to specify the target.
      # Format: <service-name>-predictor.<namespace>.svc.cluster.local:80
- name: kserve_dynamic_cluster
connect_timeout: 300s
per_connection_buffer_limit_bytes: 52428800
type: ORIGINAL_DST
lb_policy: CLUSTER_PROVIDED
original_dst_lb_config:
use_http_header: true
http_header_name: "x-gateway-destination-endpoint"
typed_extension_protocol_options:
envoy.extensions.upstreams.http.v3.HttpProtocolOptions:
"@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions
explicit_http_config:
http_protocol_options: {}
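      # Example (names are placeholders): if the router sets
      #   x-gateway-destination-endpoint: granite32-8b-predictor.my-namespace.svc.cluster.local:80
      # this cluster dials that address directly; no static endpoint list is needed.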

admin:
address:
socket_address:
address: "127.0.0.1"
port_value: 19000
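    # The admin interface is bound to localhost only. To inspect it (pod name is
    # a placeholder), port-forward into the Envoy pod, e.g.:
    #   kubectl port-forward pod/<semantic-router-pod> 19000:19000
    #   curl http://127.0.0.1:19000/clusters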
235 changes: 235 additions & 0 deletions deploy/kserve/configmap-router-config.yaml
@@ -0,0 +1,235 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: semantic-router-kserve-config
labels:
app: semantic-router
component: config
data:
config.yaml: |
bert_model:
model_id: models/all-MiniLM-L12-v2
threshold: 0.6
use_cpu: true

semantic_cache:
enabled: true
backend_type: "memory"
similarity_threshold: 0.8
max_entries: 1000
ttl_seconds: 3600
eviction_policy: "fifo"
use_hnsw: true
hnsw_m: 16
hnsw_ef_construction: 200
embedding_model: "bert"
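      # Individual categories below may override these defaults via the
      # per-category semantic_cache_enabled and
      # semantic_cache_similarity_threshold fields.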

tools:
enabled: true
top_k: 3
similarity_threshold: 0.2
tools_db_path: "config/tools_db.json"
fallback_to_empty: true

prompt_guard:
enabled: true
use_modernbert: true
model_id: "models/jailbreak_classifier_modernbert-base_model"
threshold: 0.7
use_cpu: true
jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json"

# vLLM Endpoints Configuration - Using KServe InferenceService internal URLs
# IMPORTANT: These are the internal cluster URLs for the InferenceService predictors
# Format: <service-name>-predictor.<namespace>.svc.cluster.local
# Replace <namespace> with your actual namespace and configure for your deployed models
vllm_endpoints:
- name: "vllm-model-endpoint"
address: "your-model-predictor.<namespace>.svc.cluster.local"
port: 80 # KServe uses port 80 for internal service
weight: 1
# Example with granite32-8b:
# - name: "granite32-8b-endpoint"
# address: "granite32-8b-predictor.<namespace>.svc.cluster.local"
# port: 80
# weight: 1

model_config:
# Configure this to match your deployed InferenceService model name
"your-model-name":
reasoning_family: "qwen3" # Options: qwen3, deepseek, gpt, gpt-oss
preferred_endpoints: ["vllm-model-endpoint"]
pii_policy:
allow_by_default: true
pii_types_allowed: ["EMAIL_ADDRESS"]
# Example with granite32-8b:
# "granite32-8b":
# reasoning_family: "qwen3"
# preferred_endpoints: ["granite32-8b-endpoint"]
# pii_policy:
# allow_by_default: true
# pii_types_allowed: ["EMAIL_ADDRESS"]

# Classifier configuration
classifier:
category_model:
model_id: "models/category_classifier_modernbert-base_model"
use_modernbert: true
threshold: 0.6
use_cpu: true
category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json"
pii_model:
model_id: "models/pii_classifier_modernbert-base_presidio_token_model"
use_modernbert: true
threshold: 0.7
use_cpu: true
pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json"

# Categories with model scoring
categories:
- name: business
system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development. Provide practical, actionable business advice backed by proven methodologies and industry best practices."
model_scores:
- model: your-model-name
score: 0.7
use_reasoning: false
- name: law
system_prompt: "You are a knowledgeable legal expert with comprehensive understanding of legal principles, case law, statutory interpretation, and legal procedures across multiple jurisdictions."
model_scores:
- model: your-model-name
score: 0.4
use_reasoning: false
- name: psychology
system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches."
semantic_cache_enabled: true
semantic_cache_similarity_threshold: 0.92
model_scores:
- model: your-model-name
score: 0.6
use_reasoning: false
- name: biology
system_prompt: "You are a biology expert with comprehensive knowledge spanning molecular biology, genetics, cell biology, ecology, evolution, anatomy, physiology, and biotechnology."
model_scores:
- model: your-model-name
score: 0.9
use_reasoning: false
- name: chemistry
system_prompt: "You are a chemistry expert specializing in chemical reactions, molecular structures, and laboratory techniques. Provide detailed, step-by-step explanations."
model_scores:
- model: your-model-name
score: 0.6
use_reasoning: true
- name: history
system_prompt: "You are a historian with expertise across different time periods and cultures. Provide accurate historical context and analysis."
model_scores:
- model: your-model-name
score: 0.7
use_reasoning: false
- name: other
system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics."
semantic_cache_enabled: true
semantic_cache_similarity_threshold: 0.75
model_scores:
- model: your-model-name
score: 0.7
use_reasoning: false
- name: health
system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness."
semantic_cache_enabled: true
semantic_cache_similarity_threshold: 0.95
model_scores:
- model: your-model-name
score: 0.5
use_reasoning: false
- name: economics
system_prompt: "You are an economics expert with deep understanding of microeconomics, macroeconomics, econometrics, financial markets, monetary policy, fiscal policy, international trade, and economic theory."
model_scores:
- model: your-model-name
score: 1.0
use_reasoning: false
- name: math
system_prompt: "You are a mathematics expert. Provide step-by-step solutions, show your work clearly, and explain mathematical concepts in an understandable way."
model_scores:
- model: your-model-name
score: 1.0
use_reasoning: true
- name: physics
system_prompt: "You are a physics expert with deep understanding of physical laws and phenomena. Provide clear explanations with mathematical derivations when appropriate."
model_scores:
- model: your-model-name
score: 0.7
use_reasoning: true
- name: computer science
system_prompt: "You are a computer science expert with knowledge of algorithms, data structures, programming languages, and software engineering. Provide clear, practical solutions with code examples when helpful."
model_scores:
- model: your-model-name
score: 0.6
use_reasoning: false
- name: philosophy
system_prompt: "You are a philosophy expert with comprehensive knowledge of philosophical traditions, ethical theories, logic, metaphysics, epistemology, political philosophy, and the history of philosophical thought."
model_scores:
- model: your-model-name
score: 0.5
use_reasoning: false
- name: engineering
system_prompt: "You are an engineering expert with knowledge across multiple engineering disciplines including mechanical, electrical, civil, chemical, software, and systems engineering."
model_scores:
- model: your-model-name
score: 0.7
use_reasoning: false
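    # For each classified category the router routes to the highest-scoring model
    # listed under model_scores; use_reasoning toggles that model's reasoning
    # family behavior (configured below) for prompts in that category.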

default_model: your-model-name

# Reasoning family configurations
reasoning_families:
deepseek:
type: "chat_template_kwargs"
parameter: "thinking"
qwen3:
type: "chat_template_kwargs"
parameter: "enable_thinking"
gpt-oss:
type: "reasoning_effort"
parameter: "reasoning_effort"
gpt:
type: "reasoning_effort"
parameter: "reasoning_effort"

default_reasoning_effort: high

# API Configuration
api:
batch_classification:
max_batch_size: 100
concurrency_threshold: 5
max_concurrency: 8
metrics:
enabled: true
detailed_goroutine_tracking: true
high_resolution_timing: false
sample_rate: 1.0
duration_buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]

# Embedding Models Configuration
embedding_models:
qwen3_model_path: "models/Qwen3-Embedding-0.6B"
gemma_model_path: "models/embeddinggemma-300m"
use_cpu: true

# Observability Configuration
observability:
tracing:
enabled: false
provider: "opentelemetry"
exporter:
type: "stdout"
endpoint: "localhost:4317"
insecure: true
sampling:
type: "always_on"
rate: 1.0
resource:
service_name: "vllm-semantic-router"
service_version: "v0.1.0"
deployment_environment: "production"