From ef9f2c3744e85a19ea791dda72188a8bc518e899 Mon Sep 17 00:00:00 2001 From: Ryan Cook Date: Fri, 31 Oct 2025 15:45:54 -0400 Subject: [PATCH 1/4] WIP: kserve functionality Signed-off-by: Ryan Cook --- deploy/kserve/README.md | 511 ++++++++++++++++++ deploy/kserve/configmap-envoy-config.yaml | 161 ++++++ deploy/kserve/configmap-router-config.yaml | 235 ++++++++ deploy/kserve/deployment.yaml | 269 +++++++++ deploy/kserve/example-multi-model-config.yaml | 294 ++++++++++ deploy/kserve/inference-examples/README.md | 23 + .../inferenceservice-granite32-8b.yaml | 36 ++ .../servingruntime-granite32-8b.yaml | 52 ++ deploy/kserve/kustomization.yaml | 22 + deploy/kserve/pvc.yaml | 33 ++ deploy/kserve/route.yaml | 21 + deploy/kserve/service.yaml | 42 ++ deploy/kserve/serviceaccount.yaml | 6 + deploy/kserve/test-semantic-routing.sh | 226 ++++++++ 14 files changed, 1931 insertions(+) create mode 100644 deploy/kserve/README.md create mode 100644 deploy/kserve/configmap-envoy-config.yaml create mode 100644 deploy/kserve/configmap-router-config.yaml create mode 100644 deploy/kserve/deployment.yaml create mode 100644 deploy/kserve/example-multi-model-config.yaml create mode 100644 deploy/kserve/inference-examples/README.md create mode 100644 deploy/kserve/inference-examples/inferenceservice-granite32-8b.yaml create mode 100644 deploy/kserve/inference-examples/servingruntime-granite32-8b.yaml create mode 100644 deploy/kserve/kustomization.yaml create mode 100644 deploy/kserve/pvc.yaml create mode 100644 deploy/kserve/route.yaml create mode 100644 deploy/kserve/service.yaml create mode 100644 deploy/kserve/serviceaccount.yaml create mode 100755 deploy/kserve/test-semantic-routing.sh diff --git a/deploy/kserve/README.md b/deploy/kserve/README.md new file mode 100644 index 000000000..218e4ccd0 --- /dev/null +++ b/deploy/kserve/README.md @@ -0,0 +1,511 @@ +# Semantic Router Integration with OpenShift AI KServe + +This directory contains Kubernetes manifests for deploying the vLLM Semantic Router to work with OpenShift AI's KServe InferenceService endpoints. + +## Overview + +The semantic router acts as an intelligent gateway that routes OpenAI-compatible API requests to appropriate vLLM models deployed via KServe InferenceServices. It provides: + +- **Intelligent Model Selection**: Automatically routes requests to the best model based on semantic understanding +- **PII Detection & Protection**: Blocks or redacts sensitive information +- **Prompt Guard**: Detects and blocks jailbreak attempts +- **Semantic Caching**: Reduces latency and costs through intelligent caching +- **Category-Specific Prompts**: Injects domain-specific system prompts +- **Tools Auto-Selection**: Automatically selects relevant tools for function calling + +## Architecture + +``` +Client Request (OpenAI API) + ↓ +[OpenShift Route - HTTPS] + ↓ +[Envoy Proxy Container] ← [Semantic Router Container] + ↓ ↓ + | [Classification & Selection] + | ↓ + | [Sets x-gateway-destination-endpoint] + ↓ +[KServe InferenceService Predictor] + ↓ +[vLLM Model Response] +``` + +The deployment runs two containers in a single pod: +1. **Semantic Router**: ExtProc service that performs classification and routing logic +2. **Envoy Proxy**: HTTP proxy that integrates with the semantic router via gRPC + +## Prerequisites + +1. **OpenShift Cluster** with OpenShift AI (RHOAI) installed +2. **KServe InferenceServices** deployed in your namespace (see `inference-examples/` for sample configurations) +3. **Storage Class** available for PersistentVolumeClaims +4. 
**Namespace** where you want to deploy + +### Verify Your InferenceServices + +Check your deployed InferenceServices: + +```bash +oc get inferenceservice +``` + +Example output: +``` +NAME URL READY PREV LATEST +granite32-8b https://granite32-8b-your-ns.apps... True 100 +``` + +Get the internal service URL for the predictor: + +```bash +oc get inferenceservice granite32-8b -o jsonpath='{.status.components.predictor.address.url}' +``` + +Example output: +``` +http://granite32-8b-predictor.your-namespace.svc.cluster.local +``` + +## Configuration + +### Step 1: Configure InferenceService Endpoints + +Edit `configmap-router-config.yaml` to add your InferenceService endpoints: + +```yaml +vllm_endpoints: + - name: "your-model-endpoint" + address: "your-model-predictor..svc.cluster.local" # Replace with your model and namespace + port: 80 # KServe uses port 80 for internal service + weight: 1 +``` + +**Important**: +- Replace `` with your actual namespace +- Replace `your-model` with your InferenceService name +- Use the **internal cluster URL** format: `-predictor..svc.cluster.local` +- Use **port 80** for KServe internal services (not the external HTTPS port) + +### Step 2: Configure Model Settings + +Update the `model_config` section to match your models: + +```yaml +model_config: + "your-model-name": # Must match the model name from your InferenceService + reasoning_family: "qwen3" # Options: qwen3, deepseek, gpt, gpt-oss - adjust based on your model family + preferred_endpoints: ["your-model-endpoint"] + pii_policy: + allow_by_default: true + pii_types_allowed: ["EMAIL_ADDRESS"] +``` + +### Step 3: Configure Category Routing + +Update the `categories` section to define which models handle which types of queries: + +```yaml +categories: + - name: math + system_prompt: "You are a mathematics expert..." 
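+    # Injected as the system message for queries classified into this category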
+ model_scores: + - model: your-model-name # Must match model_config key + score: 1.0 # Higher score = preferred for this category + use_reasoning: true # Enable extended reasoning +``` + +**Category Scoring**: +- Scores range from 0.0 to 1.0 +- Higher scores indicate better suitability for the category +- The router selects the model with the highest score for each query category +- Use `use_reasoning: true` for complex tasks (math, chemistry, physics) + +### Step 4: Adjust Storage Requirements + +Edit `pvc.yaml` to set appropriate storage sizes: + +```yaml +resources: + requests: + storage: 10Gi # Adjust based on model sizes +``` + +Model storage requirements: +- Category classifier: ~500MB +- PII classifier: ~500MB +- Jailbreak classifier: ~500MB +- PII token classifier: ~500MB +- BERT embeddings: ~500MB +- **Total**: ~2.5GB minimum, recommend 10Gi for headroom + +## Deployment + +### Option 1: Deploy with Kustomize (Recommended) + +```bash +# Switch to your namespace +oc project your-namespace + +# Deploy all resources +oc apply -k deploy/kserve/ + +# Verify deployment +oc get pods -l app=semantic-router +oc get svc semantic-router-kserve +oc get route semantic-router-kserve +``` + +### Option 2: Deploy Individual Resources + +```bash +# Switch to your namespace (or create it) +oc project your-namespace +# OR: oc new-project your-namespace + +# Deploy in order +oc apply -f deploy/kserve/serviceaccount.yaml +oc apply -f deploy/kserve/pvc.yaml +oc apply -f deploy/kserve/configmap-router-config.yaml +oc apply -f deploy/kserve/configmap-envoy-config.yaml +oc apply -f deploy/kserve/deployment.yaml +oc apply -f deploy/kserve/service.yaml +oc apply -f deploy/kserve/route.yaml +``` + +### Monitor Deployment + +Watch the pod initialization (model downloads take a few minutes): + +```bash +# Watch pod status +oc get pods -l app=semantic-router -w + +# Check init container logs (model download) +oc logs -l app=semantic-router -c model-downloader -f + +# Check semantic router logs +oc logs -l app=semantic-router -c semantic-router -f + +# Check Envoy logs +oc logs -l app=semantic-router -c envoy-proxy -f +``` + +### Verify Deployment + +```bash +# Get the external route URL +ROUTER_URL=$(oc get route semantic-router-kserve -o jsonpath='{.spec.host}') +echo "https://$ROUTER_URL" + +# Test health check +curl -k "https://$ROUTER_URL/v1/models" + +# Test classification API +curl -k "https://$ROUTER_URL/v1/classify" \ + -H "Content-Type: application/json" \ + -d '{"text": "What is the derivative of x^2?"}' + +# Test chat completion (replace 'your-model-name' with your actual model name) +curl -k "https://$ROUTER_URL/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "your-model-name", + "messages": [{"role": "user", "content": "Explain quantum entanglement"}] + }' +``` + +## Testing with Different Categories + +The router automatically classifies queries and routes to the best model. 
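+The selection follows the scoring rules above: for the predicted category, the router picks the `model_scores` entry with the highest `score` and applies that entry's `use_reasoning` flag. As a mental model only, here is a minimal Python sketch of that rule (model names and scores are illustrative, not the router's internals):
+
+```python
+# Illustrative sketch of highest-score model selection per category,
+# mirroring the model_scores entries in configmap-router-config.yaml.
+category_scores = {
+    "math": [{"model": "your-model-name", "score": 1.0, "use_reasoning": True}],
+    "business": [{"model": "your-model-name", "score": 0.7, "use_reasoning": False}],
+}
+
+def select_model(category: str, default_model: str = "your-model-name"):
+    """Return (model, use_reasoning) for a classified query category."""
+    entries = category_scores.get(category)
+    if not entries:
+        # Unknown category: fall back to the configured default_model.
+        return default_model, False
+    best = max(entries, key=lambda e: e["score"])
+    return best["model"], best["use_reasoning"]
+
+print(select_model("math"))     # ('your-model-name', True)
+print(select_model("unknown"))  # ('your-model-name', False)
+```
+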
Test different categories: + +```bash +ROUTER_URL=$(oc get route semantic-router-kserve -o jsonpath='{.spec.host}') +MODEL_NAME="your-model-name" # Replace with your model name + +# Math query (high reasoning enabled) +curl -k "https://$ROUTER_URL/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d "{ + \"model\": \"$MODEL_NAME\", + \"messages\": [{\"role\": \"user\", \"content\": \"Solve the integral of x^2 dx\"}] + }" + +# Business query +curl -k "https://$ROUTER_URL/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d "{ + \"model\": \"$MODEL_NAME\", + \"messages\": [{\"role\": \"user\", \"content\": \"What is a good marketing strategy for SaaS?\"}] + }" + +# Test PII detection +curl -k "https://$ROUTER_URL/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d "{ + \"model\": \"$MODEL_NAME\", + \"messages\": [{\"role\": \"user\", \"content\": \"My SSN is 123-45-6789\"}] + }" +``` + +## Monitoring + +### Prometheus Metrics + +Metrics are exposed on port 9190 at `/metrics`: + +```bash +POD_NAME=$(oc get pods -l app=semantic-router -o jsonpath='{.items[0].metadata.name}') +oc port-forward $POD_NAME 9190:9190 + +# View metrics +curl http://localhost:9190/metrics +``` + +Key metrics: +- `semantic_router_classification_duration_seconds`: Classification latency +- `semantic_router_cache_hit_total`: Cache hit count +- `semantic_router_pii_detections_total`: PII detection count +- `semantic_router_requests_total`: Total requests processed + +### Envoy Admin Interface + +Access Envoy admin interface: + +```bash +POD_NAME=$(oc get pods -l app=semantic-router -o jsonpath='{.items[0].metadata.name}') +oc port-forward $POD_NAME 19000:19000 + +# View stats +curl http://localhost:19000/stats +curl http://localhost:19000/clusters +``` + +### View Logs + +```bash +# Combined logs from all containers +oc logs -l app=semantic-router --all-containers=true -f + +# Semantic router only +oc logs -l app=semantic-router -c semantic-router -f + +# Envoy only +oc logs -l app=semantic-router -c envoy-proxy -f +``` + +## Troubleshooting + +### Pod Not Starting + +```bash +# Check pod events +oc describe pod -l app=semantic-router + +# Check PVC status +oc get pvc +``` + +**Common issues**: +- PVC pending: No storage class available or insufficient capacity +- ImagePullBackOff: Check image registry permissions +- Init container failing: Network issues downloading models from HuggingFace + +### Model Download Issues + +```bash +# Check init container logs +oc logs -l app=semantic-router -c model-downloader + +# If models fail to download, you can pre-populate them: +# 1. Create a Job or pod with the model-downloader init container +# 2. 
Verify models exist in the PVC before starting the main deployment +``` + +### Routing Issues + +```bash +# Check if semantic router can reach KServe predictors +POD_NAME=$(oc get pods -l app=semantic-router -o jsonpath='{.items[0].metadata.name}') +NAMESPACE=$(oc project -q) + +# Test connectivity to InferenceService (replace 'your-model' with your InferenceService name) +oc exec $POD_NAME -c semantic-router -- \ + curl -v http://your-model-predictor.$NAMESPACE.svc.cluster.local/v1/models + +# Check Envoy configuration +oc exec $POD_NAME -c envoy-proxy -- \ + curl http://localhost:19000/config_dump +``` + +### Classification Not Working + +```bash +# Test the classification API directly +ROUTER_URL=$(oc get route semantic-router-kserve -o jsonpath='{.spec.host}') + +curl -k "https://$ROUTER_URL/v1/classify" \ + -H "Content-Type: application/json" \ + -d '{"text": "What is 2+2?"}' + +# Expected output should include category and model selection +``` + +### 503 Service Unavailable + +**Possible causes**: +1. InferenceService is not ready +2. Incorrect endpoint address in config +3. Network policy blocking traffic + +**Solutions**: +```bash +# Verify InferenceService is ready +oc get inferenceservice + +# Check if predictor pods are running +oc get pods | grep predictor + +# Verify network connectivity (replace 'your-model' with your InferenceService name) +POD_NAME=$(oc get pods -l app=semantic-router -o jsonpath='{.items[0].metadata.name}') +NAMESPACE=$(oc project -q) +oc exec $POD_NAME -c envoy-proxy -- \ + wget -O- http://your-model-predictor.$NAMESPACE.svc.cluster.local/v1/models +``` + +## Adding More InferenceServices + +To add additional models: + +1. **Deploy InferenceService** (if not already deployed) +2. **Update ConfigMap** (`configmap-router-config.yaml`): + ```yaml + vllm_endpoints: + - name: "new-model-endpoint" + address: "new-model-predictor..svc.cluster.local" # Replace + port: 80 + weight: 1 + + model_config: + "new-model": + reasoning_family: "qwen3" + preferred_endpoints: ["new-model-endpoint"] + pii_policy: + allow_by_default: true + + categories: + - name: coding + system_prompt: "You are an expert programmer..." + model_scores: + - model: new-model + score: 0.9 + use_reasoning: false + ``` + +3. 
**Apply updated ConfigMap**: + ```bash + oc apply -f configmap-router-config.yaml + + # Restart deployment to pick up changes + oc rollout restart deployment/semantic-router-kserve + ``` + +## Performance Tuning + +### Resource Limits + +Adjust resource requests/limits in `deployment.yaml` based on load: + +```yaml +resources: + requests: + memory: "3Gi" # Increase for more models/cache + cpu: "1" + limits: + memory: "6Gi" + cpu: "2" +``` + +### Semantic Cache + +Tune cache settings in `configmap-router-config.yaml`: + +```yaml +semantic_cache: + enabled: true + similarity_threshold: 0.8 # Lower = more cache hits, higher = more accurate + max_entries: 1000 # Increase for more cache capacity + ttl_seconds: 3600 # Cache entry lifetime +``` + +### Scaling + +Scale the deployment for high availability: + +```bash +# Scale to multiple replicas +oc scale deployment/semantic-router-kserve --replicas=3 + +# Note: With multiple replicas, use Redis or Milvus for shared cache +``` + +## Integration with Applications + +Point your OpenAI client to the semantic router: + +**Python Example**: +```python +from openai import OpenAI + +# Get your route URL from: oc get route semantic-router-kserve +client = OpenAI( + base_url="https://semantic-router-your-namespace.apps.your-cluster.com/v1", + api_key="not-needed" # KServe doesn't require API key by default +) + +response = client.chat.completions.create( + model="your-model-name", # Replace with your model name + messages=[{"role": "user", "content": "Explain machine learning"}] +) +print(response.choices[0].message.content) +``` + +**cURL Example**: +```bash +curl -k "https://semantic-router-your-namespace.apps.your-cluster.com/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d '{ + "model": "your-model-name", + "messages": [{"role": "user", "content": "Hello!"}] + }' +``` + +## Cleanup + +Remove all resources: + +```bash +# Delete using kustomize +oc delete -k deploy/kserve/ + +# Or delete individual resources +oc delete route semantic-router-kserve +oc delete service semantic-router-kserve +oc delete deployment semantic-router-kserve +oc delete configmap semantic-router-kserve-config semantic-router-envoy-kserve-config +oc delete pvc semantic-router-models semantic-router-cache +oc delete serviceaccount semantic-router +``` + +## Additional Resources + +- [vLLM Semantic Router Documentation](https://vllm-semantic-router.com) +- [OpenShift AI Documentation](https://access.redhat.com/documentation/en-us/red_hat_openshift_ai) +- [KServe Documentation](https://kserve.github.io/website/) +- [Envoy Proxy Documentation](https://www.envoyproxy.io/docs) + +## Support + +For issues and questions: +- GitHub Issues: https://github.com/vllm-project/semantic-router/issues +- Documentation: https://vllm-semantic-router.com/docs diff --git a/deploy/kserve/configmap-envoy-config.yaml b/deploy/kserve/configmap-envoy-config.yaml new file mode 100644 index 000000000..51007c45a --- /dev/null +++ b/deploy/kserve/configmap-envoy-config.yaml @@ -0,0 +1,161 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: semantic-router-envoy-kserve-config + labels: + app: semantic-router + component: envoy +data: + envoy.yaml: | + # Envoy configuration for KServe InferenceService integration + # This config routes traffic to KServe predictors based on semantic router decisions + static_resources: + listeners: + - name: listener_0 + address: + socket_address: + address: 0.0.0.0 + port_value: 8801 + filter_chains: + - filters: + - name: 
envoy.filters.network.http_connection_manager + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager + stat_prefix: ingress_http + access_log: + - name: envoy.access_loggers.stdout + typed_config: + "@type": type.googleapis.com/envoy.extensions.access_loggers.stream.v3.StdoutAccessLog + log_format: + json_format: + time: "%START_TIME%" + protocol: "%PROTOCOL%" + request_method: "%REQ(:METHOD)%" + request_path: "%REQ(X-ENVOY-ORIGINAL-PATH?:PATH)%" + response_code: "%RESPONSE_CODE%" + response_flags: "%RESPONSE_FLAGS%" + bytes_received: "%BYTES_RECEIVED%" + bytes_sent: "%BYTES_SENT%" + duration: "%DURATION%" + upstream_host: "%UPSTREAM_HOST%" + upstream_cluster: "%UPSTREAM_CLUSTER%" + upstream_local_address: "%UPSTREAM_LOCAL_ADDRESS%" + request_id: "%REQ(X-REQUEST-ID)%" + selected_model: "%REQ(X-SELECTED-MODEL)%" + selected_endpoint: "%REQ(X-GATEWAY-DESTINATION-ENDPOINT)%" + route_config: + name: local_route + virtual_hosts: + - name: local_service + domains: ["*"] + routes: + # Route /v1/models to semantic router for model aggregation + - match: + path: "/v1/models" + route: + cluster: semantic_router_cluster + timeout: 300s + # Dynamic route - destination determined by x-gateway-destination-endpoint header + - match: + prefix: "/" + route: + cluster: kserve_dynamic_cluster + timeout: 300s + http_filters: + - name: envoy.filters.http.ext_proc + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.ext_proc.v3.ExternalProcessor + grpc_service: + envoy_grpc: + cluster_name: extproc_service + allow_mode_override: true + processing_mode: + request_header_mode: "SEND" + response_header_mode: "SEND" + request_body_mode: "BUFFERED" + response_body_mode: "BUFFERED" + request_trailer_mode: "SKIP" + response_trailer_mode: "SKIP" + failure_mode_allow: true + message_timeout: 300s + - name: envoy.filters.http.router + typed_config: + "@type": type.googleapis.com/envoy.extensions.filters.http.router.v3.Router + suppress_envoy_headers: true + http2_protocol_options: + max_concurrent_streams: 100 + initial_stream_window_size: 65536 + initial_connection_window_size: 1048576 + stream_idle_timeout: "300s" + request_timeout: "300s" + common_http_protocol_options: + idle_timeout: "300s" + + clusters: + - name: extproc_service + connect_timeout: 300s + per_connection_buffer_limit_bytes: 52428800 + type: STATIC + lb_policy: ROUND_ROBIN + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http2_protocol_options: + connection_keepalive: + interval: 300s + timeout: 300s + load_assignment: + cluster_name: extproc_service + endpoints: + - lb_endpoints: + - endpoint: + address: + socket_address: + address: 127.0.0.1 + port_value: 50051 + + # Static cluster for semantic router API + - name: semantic_router_cluster + connect_timeout: 300s + per_connection_buffer_limit_bytes: 52428800 + type: STATIC + lb_policy: ROUND_ROBIN + load_assignment: + cluster_name: semantic_router_cluster + endpoints: + - lb_endpoints: + - endpoint: + address: + socket_address: + address: 127.0.0.1 + port_value: 8080 + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http_protocol_options: {} + + # Dynamic cluster for KServe InferenceService 
predictors + # Uses ORIGINAL_DST with header-based destination selection + # The semantic router sets x-gateway-destination-endpoint header to specify the target + # Format: -predictor..svc.cluster.local:80 + - name: kserve_dynamic_cluster + connect_timeout: 300s + per_connection_buffer_limit_bytes: 52428800 + type: ORIGINAL_DST + lb_policy: CLUSTER_PROVIDED + original_dst_lb_config: + use_http_header: true + http_header_name: "x-gateway-destination-endpoint" + typed_extension_protocol_options: + envoy.extensions.upstreams.http.v3.HttpProtocolOptions: + "@type": type.googleapis.com/envoy.extensions.upstreams.http.v3.HttpProtocolOptions + explicit_http_config: + http_protocol_options: {} + + admin: + address: + socket_address: + address: "127.0.0.1" + port_value: 19000 diff --git a/deploy/kserve/configmap-router-config.yaml b/deploy/kserve/configmap-router-config.yaml new file mode 100644 index 000000000..75dfa4eba --- /dev/null +++ b/deploy/kserve/configmap-router-config.yaml @@ -0,0 +1,235 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: semantic-router-kserve-config + labels: + app: semantic-router + component: config +data: + config.yaml: | + bert_model: + model_id: models/all-MiniLM-L12-v2 + threshold: 0.6 + use_cpu: true + + semantic_cache: + enabled: true + backend_type: "memory" + similarity_threshold: 0.8 + max_entries: 1000 + ttl_seconds: 3600 + eviction_policy: "fifo" + use_hnsw: true + hnsw_m: 16 + hnsw_ef_construction: 200 + embedding_model: "bert" + + tools: + enabled: true + top_k: 3 + similarity_threshold: 0.2 + tools_db_path: "config/tools_db.json" + fallback_to_empty: true + + prompt_guard: + enabled: true + use_modernbert: true + model_id: "models/jailbreak_classifier_modernbert-base_model" + threshold: 0.7 + use_cpu: true + jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" + + # vLLM Endpoints Configuration - Using KServe InferenceService internal URLs + # IMPORTANT: These are the internal cluster URLs for the InferenceService predictors + # Format: -predictor..svc.cluster.local + # Replace with your actual namespace and configure for your deployed models + vllm_endpoints: + - name: "vllm-model-endpoint" + address: "your-model-predictor..svc.cluster.local" + port: 80 # KServe uses port 80 for internal service + weight: 1 + # Example with granite32-8b: + # - name: "granite32-8b-endpoint" + # address: "granite32-8b-predictor..svc.cluster.local" + # port: 80 + # weight: 1 + + model_config: + # Configure this to match your deployed InferenceService model name + "your-model-name": + reasoning_family: "qwen3" # Options: qwen3, deepseek, gpt, gpt-oss + preferred_endpoints: ["vllm-model-endpoint"] + pii_policy: + allow_by_default: true + pii_types_allowed: ["EMAIL_ADDRESS"] + # Example with granite32-8b: + # "granite32-8b": + # reasoning_family: "qwen3" + # preferred_endpoints: ["granite32-8b-endpoint"] + # pii_policy: + # allow_by_default: true + # pii_types_allowed: ["EMAIL_ADDRESS"] + + # Classifier configuration + classifier: + category_model: + model_id: "models/category_classifier_modernbert-base_model" + use_modernbert: true + threshold: 0.6 + use_cpu: true + category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json" + pii_model: + model_id: "models/pii_classifier_modernbert-base_presidio_token_model" + use_modernbert: true + threshold: 0.7 + use_cpu: true + pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json" + + # Categories 
with model scoring + categories: + - name: business + system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development. Provide practical, actionable business advice backed by proven methodologies and industry best practices." + model_scores: + - model: your-model-name + score: 0.7 + use_reasoning: false + - name: law + system_prompt: "You are a knowledgeable legal expert with comprehensive understanding of legal principles, case law, statutory interpretation, and legal procedures across multiple jurisdictions." + model_scores: + - model: your-model-name + score: 0.4 + use_reasoning: false + - name: psychology + system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches." + semantic_cache_enabled: true + semantic_cache_similarity_threshold: 0.92 + model_scores: + - model: your-model-name + score: 0.6 + use_reasoning: false + - name: biology + system_prompt: "You are a biology expert with comprehensive knowledge spanning molecular biology, genetics, cell biology, ecology, evolution, anatomy, physiology, and biotechnology." + model_scores: + - model: your-model-name + score: 0.9 + use_reasoning: false + - name: chemistry + system_prompt: "You are a chemistry expert specializing in chemical reactions, molecular structures, and laboratory techniques. Provide detailed, step-by-step explanations." + model_scores: + - model: your-model-name + score: 0.6 + use_reasoning: true + - name: history + system_prompt: "You are a historian with expertise across different time periods and cultures. Provide accurate historical context and analysis." + model_scores: + - model: your-model-name + score: 0.7 + use_reasoning: false + - name: other + system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics." + semantic_cache_enabled: true + semantic_cache_similarity_threshold: 0.75 + model_scores: + - model: your-model-name + score: 0.7 + use_reasoning: false + - name: health + system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness." + semantic_cache_enabled: true + semantic_cache_similarity_threshold: 0.95 + model_scores: + - model: your-model-name + score: 0.5 + use_reasoning: false + - name: economics + system_prompt: "You are an economics expert with deep understanding of microeconomics, macroeconomics, econometrics, financial markets, monetary policy, fiscal policy, international trade, and economic theory." + model_scores: + - model: your-model-name + score: 1.0 + use_reasoning: false + - name: math + system_prompt: "You are a mathematics expert. Provide step-by-step solutions, show your work clearly, and explain mathematical concepts in an understandable way." + model_scores: + - model: your-model-name + score: 1.0 + use_reasoning: true + - name: physics + system_prompt: "You are a physics expert with deep understanding of physical laws and phenomena. Provide clear explanations with mathematical derivations when appropriate." 
+ model_scores: + - model: your-model-name + score: 0.7 + use_reasoning: true + - name: computer science + system_prompt: "You are a computer science expert with knowledge of algorithms, data structures, programming languages, and software engineering. Provide clear, practical solutions with code examples when helpful." + model_scores: + - model: your-model-name + score: 0.6 + use_reasoning: false + - name: philosophy + system_prompt: "You are a philosophy expert with comprehensive knowledge of philosophical traditions, ethical theories, logic, metaphysics, epistemology, political philosophy, and the history of philosophical thought." + model_scores: + - model: your-model-name + score: 0.5 + use_reasoning: false + - name: engineering + system_prompt: "You are an engineering expert with knowledge across multiple engineering disciplines including mechanical, electrical, civil, chemical, software, and systems engineering." + model_scores: + - model: your-model-name + score: 0.7 + use_reasoning: false + + default_model: your-model-name + + # Reasoning family configurations + reasoning_families: + deepseek: + type: "chat_template_kwargs" + parameter: "thinking" + qwen3: + type: "chat_template_kwargs" + parameter: "enable_thinking" + gpt-oss: + type: "reasoning_effort" + parameter: "reasoning_effort" + gpt: + type: "reasoning_effort" + parameter: "reasoning_effort" + + default_reasoning_effort: high + + # API Configuration + api: + batch_classification: + max_batch_size: 100 + concurrency_threshold: 5 + max_concurrency: 8 + metrics: + enabled: true + detailed_goroutine_tracking: true + high_resolution_timing: false + sample_rate: 1.0 + duration_buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30] + size_buckets: [1, 2, 5, 10, 20, 50, 100, 200] + + # Embedding Models Configuration + embedding_models: + qwen3_model_path: "models/Qwen3-Embedding-0.6B" + gemma_model_path: "models/embeddinggemma-300m" + use_cpu: true + + # Observability Configuration + observability: + tracing: + enabled: false + provider: "opentelemetry" + exporter: + type: "stdout" + endpoint: "localhost:4317" + insecure: true + sampling: + type: "always_on" + rate: 1.0 + resource: + service_name: "vllm-semantic-router" + service_version: "v0.1.0" + deployment_environment: "production" diff --git a/deploy/kserve/deployment.yaml b/deploy/kserve/deployment.yaml new file mode 100644 index 000000000..f039f2a47 --- /dev/null +++ b/deploy/kserve/deployment.yaml @@ -0,0 +1,269 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: semantic-router-kserve + labels: + app: semantic-router + component: gateway + annotations: + opendatahub.io/dashboard: "true" +spec: + replicas: 1 + selector: + matchLabels: + app: semantic-router + component: gateway + template: + metadata: + labels: + app: semantic-router + component: gateway + annotations: + sidecar.istio.io/inject: "false" # Disable Istio injection to avoid conflicts with Envoy + spec: + serviceAccountName: semantic-router # Create ServiceAccount if RBAC required + # OpenShift security context - let OpenShift assign UID/GID + securityContext: + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + + initContainers: + # Init container to download models from HuggingFace + - name: model-downloader + image: python:3.11-slim + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + seccompProfile: + type: RuntimeDefault + command: ["/bin/bash", "-c"] + args: + - | + set -e + echo "Installing Hugging Face CLI..." 
+ pip install --no-cache-dir huggingface_hub[cli] + + echo "Downloading models to persistent volume..." + cd /app/models + + # Download category classifier model + if [ ! -d "category_classifier_modernbert-base_model" ] || [ -z "$(find category_classifier_modernbert-base_model -name '*.safetensors' -o -name '*.bin' -o -name 'pytorch_model.*' 2>/dev/null)" ]; then + echo "Downloading category classifier model..." + huggingface-cli download LLM-Semantic-Router/category_classifier_modernbert-base_model \ + --local-dir category_classifier_modernbert-base_model \ + --cache-dir /app/cache/hf + else + echo "Category classifier model already exists, skipping..." + fi + + # Download PII classifier model + if [ ! -d "pii_classifier_modernbert-base_model" ] || [ -z "$(find pii_classifier_modernbert-base_model -name '*.safetensors' -o -name '*.bin' -o -name 'pytorch_model.*' 2>/dev/null)" ]; then + echo "Downloading PII classifier model..." + huggingface-cli download LLM-Semantic-Router/pii_classifier_modernbert-base_model \ + --local-dir pii_classifier_modernbert-base_model \ + --cache-dir /app/cache/hf + else + echo "PII classifier model already exists, skipping..." + fi + + # Download jailbreak classifier model + if [ ! -d "jailbreak_classifier_modernbert-base_model" ] || [ -z "$(find jailbreak_classifier_modernbert-base_model -name '*.safetensors' -o -name '*.bin' -o -name 'pytorch_model.*' 2>/dev/null)" ]; then + echo "Downloading jailbreak classifier model..." + huggingface-cli download LLM-Semantic-Router/jailbreak_classifier_modernbert-base_model \ + --local-dir jailbreak_classifier_modernbert-base_model \ + --cache-dir /app/cache/hf + else + echo "Jailbreak classifier model already exists, skipping..." + fi + + # Download PII token classifier model + if [ ! -d "pii_classifier_modernbert-base_presidio_token_model" ] || [ -z "$(find pii_classifier_modernbert-base_presidio_token_model -name '*.safetensors' -o -name '*.bin' -o -name 'pytorch_model.*' 2>/dev/null)" ]; then + echo "Downloading PII token classifier model..." + huggingface-cli download LLM-Semantic-Router/pii_classifier_modernbert-base_presidio_token_model \ + --local-dir pii_classifier_modernbert-base_presidio_token_model \ + --cache-dir /app/cache/hf + else + echo "PII token classifier model already exists, skipping..." + fi + + # Download embedding model for semantic cache (BERT) + if [ ! -d "all-MiniLM-L12-v2" ]; then + echo "Downloading BERT embedding model for semantic cache..." + huggingface-cli download sentence-transformers/all-MiniLM-L12-v2 \ + --local-dir all-MiniLM-L12-v2 \ + --cache-dir /app/cache/hf + else + echo "BERT embedding model already exists, skipping..." + fi + + echo "All models downloaded successfully!" + ls -la /app/models/ + + echo "Setting proper permissions for models directory..." + find /app/models -type f -exec chmod 644 {} \; || echo "Warning: Could not change model file permissions" + find /app/models -type d -exec chmod 755 {} \; || echo "Warning: Could not change model directory permissions" + + echo "Creating cache directories..." + mkdir -p /app/cache/hf /app/cache/transformers /app/cache/sentence_transformers /app/cache/xdg /app/cache/bert + chmod -R 777 /app/cache/ || echo "Warning: Could not change cache directory permissions" + + echo "Model download complete." 
+ env: + - name: HF_HUB_CACHE + value: /app/cache/hf + - name: HF_HOME + value: /app/cache/hf + - name: TRANSFORMERS_CACHE + value: /app/cache/transformers + - name: PIP_CACHE_DIR + value: /tmp/pip_cache + - name: PYTHONUSERBASE + value: /tmp/python_user + - name: PATH + value: /tmp/python_user/bin:/usr/local/bin:/usr/bin:/bin + resources: + requests: + memory: "512Mi" + cpu: "250m" + limits: + memory: "1Gi" + cpu: "500m" + volumeMounts: + - name: models-volume + mountPath: /app/models + - name: cache-volume + mountPath: /app/cache + + containers: + # Semantic Router container + - name: semantic-router + image: ghcr.io/vllm-project/semantic-router/extproc:latest + imagePullPolicy: Always + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + seccompProfile: + type: RuntimeDefault + ports: + - containerPort: 50051 + name: grpc + protocol: TCP + - containerPort: 9190 + name: metrics + protocol: TCP + - containerPort: 8080 + name: classify-api + protocol: TCP + env: + - name: LD_LIBRARY_PATH + value: "/app/lib" + - name: HF_HOME + value: "/app/cache/hf" + - name: TRANSFORMERS_CACHE + value: "/app/cache/transformers" + - name: SENTENCE_TRANSFORMERS_HOME + value: "/app/cache/sentence_transformers" + - name: XDG_CACHE_HOME + value: "/app/cache/xdg" + - name: HOME + value: "/tmp/home" + volumeMounts: + - name: config-volume + mountPath: /app/config + readOnly: true + - name: models-volume + mountPath: /app/models + - name: cache-volume + mountPath: /app/cache + livenessProbe: + tcpSocket: + port: 50051 + initialDelaySeconds: 60 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 3 + readinessProbe: + tcpSocket: + port: 50051 + initialDelaySeconds: 90 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 3 + resources: + requests: + memory: "3Gi" + cpu: "1" + limits: + memory: "6Gi" + cpu: "2" + + # Envoy proxy container - routes to KServe endpoints + - name: envoy-proxy + image: envoyproxy/envoy:v1.35.3 + ports: + - containerPort: 8801 + name: envoy-http + protocol: TCP + - containerPort: 19000 + name: envoy-admin + protocol: TCP + command: ["/usr/local/bin/envoy"] + args: + - "-c" + - "/etc/envoy/envoy.yaml" + - "--component-log-level" + - "ext_proc:info,router:info,http:info" + env: + - name: loglevel + value: "info" + volumeMounts: + - name: envoy-config-volume + mountPath: /etc/envoy + readOnly: true + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + seccompProfile: + type: RuntimeDefault + livenessProbe: + tcpSocket: + port: 8801 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 10 + failureThreshold: 3 + readinessProbe: + tcpSocket: + port: 8801 + initialDelaySeconds: 10 + periodSeconds: 15 + timeoutSeconds: 10 + failureThreshold: 3 + resources: + requests: + memory: "256Mi" + cpu: "250m" + limits: + memory: "512Mi" + cpu: "500m" + + volumes: + - name: config-volume + configMap: + name: semantic-router-kserve-config + - name: envoy-config-volume + configMap: + name: semantic-router-envoy-kserve-config + - name: models-volume + persistentVolumeClaim: + claimName: semantic-router-models + - name: cache-volume + persistentVolumeClaim: + claimName: semantic-router-cache diff --git a/deploy/kserve/example-multi-model-config.yaml b/deploy/kserve/example-multi-model-config.yaml new file mode 100644 index 000000000..4faea00fd --- /dev/null +++ b/deploy/kserve/example-multi-model-config.yaml @@ -0,0 +1,294 @@ +# Example configuration for multiple KServe InferenceServices +# This shows how to configure 
the semantic router to route between multiple models +# based on query category and complexity + +apiVersion: v1 +kind: ConfigMap +metadata: + name: semantic-router-kserve-config + labels: + app: semantic-router + component: config +data: + config.yaml: | + bert_model: + model_id: models/all-MiniLM-L12-v2 + threshold: 0.6 + use_cpu: true + + semantic_cache: + enabled: true + backend_type: "memory" + similarity_threshold: 0.85 + max_entries: 5000 + ttl_seconds: 7200 + eviction_policy: "lru" + use_hnsw: true + hnsw_m: 16 + hnsw_ef_construction: 200 + embedding_model: "bert" + + tools: + enabled: true + top_k: 5 + similarity_threshold: 0.2 + tools_db_path: "config/tools_db.json" + fallback_to_empty: true + + prompt_guard: + enabled: true + use_modernbert: true + model_id: "models/jailbreak_classifier_modernbert-base_model" + threshold: 0.7 + use_cpu: true + jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" + + # Multiple vLLM Endpoints - KServe InferenceServices + # Example: Small model for simple queries, large model for complex ones + # Replace with your actual namespace + vllm_endpoints: + # Small, fast model (e.g., Granite 3.2 8B) + - name: "granite32-8b-endpoint" + address: "granite32-8b-predictor..svc.cluster.local" + port: 80 + weight: 1 + + # Larger, more capable model (e.g., Granite 3.2 78B or Llama 3.1 70B) + # - name: "granite32-78b-endpoint" + # address: "granite32-78b-predictor..svc.cluster.local" + # port: 80 + # weight: 1 + + # Specialized coding model (e.g., CodeLlama or Granite Code) + # - name: "granite-code-endpoint" + # address: "granite-code-predictor..svc.cluster.local" + # port: 80 + # weight: 1 + + model_config: + # Small model - good for general queries, fast + "granite32-8b": + reasoning_family: "qwen3" + preferred_endpoints: ["granite32-8b-endpoint"] + pii_policy: + allow_by_default: true + pii_types_allowed: ["EMAIL_ADDRESS"] + + # Large model - better for complex reasoning + # "granite32-78b": + # reasoning_family: "qwen3" + # preferred_endpoints: ["granite32-78b-endpoint"] + # pii_policy: + # allow_by_default: true + # pii_types_allowed: ["EMAIL_ADDRESS"] + + # Code-specialized model + # "granite-code": + # reasoning_family: "qwen3" + # preferred_endpoints: ["granite-code-endpoint"] + # pii_policy: + # allow_by_default: true + + classifier: + category_model: + model_id: "models/category_classifier_modernbert-base_model" + use_modernbert: true + threshold: 0.6 + use_cpu: true + category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json" + pii_model: + model_id: "models/pii_classifier_modernbert-base_presidio_token_model" + use_modernbert: true + threshold: 0.7 + use_cpu: true + pii_mapping_path: "models/pii_classifier_modernbert-base_presidio_token_model/pii_type_mapping.json" + + # Category-based routing strategy + # Higher scores route to that model for the category + categories: + # Simple categories → small model + - name: business + system_prompt: "You are a senior business consultant and strategic advisor." + model_scores: + - model: granite32-8b + score: 0.8 + use_reasoning: false + # - model: granite32-78b + # score: 0.6 + # use_reasoning: false + + - name: other + system_prompt: "You are a helpful assistant." 
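+          # Lower cache similarity threshold below = more hits for generic, frequently repeated queries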
+ semantic_cache_enabled: true + semantic_cache_similarity_threshold: 0.75 + model_scores: + - model: granite32-8b + score: 1.0 + use_reasoning: false + + # Complex reasoning categories → large model + - name: math + system_prompt: "You are a mathematics expert." + model_scores: + - model: granite32-8b + score: 0.7 + use_reasoning: true + # - model: granite32-78b + # score: 1.0 + # use_reasoning: true + + - name: physics + system_prompt: "You are a physics expert." + model_scores: + - model: granite32-8b + score: 0.7 + use_reasoning: true + # - model: granite32-78b + # score: 0.9 + # use_reasoning: true + + # Coding → specialized code model + - name: computer science + system_prompt: "You are a computer science expert." + model_scores: + # - model: granite-code + # score: 1.0 + # use_reasoning: false + - model: granite32-8b + score: 0.8 + use_reasoning: false + # - model: granite32-78b + # score: 0.6 + # use_reasoning: false + + # Other categories + - name: law + system_prompt: "You are a knowledgeable legal expert." + model_scores: + - model: granite32-8b + score: 0.5 + use_reasoning: false + # - model: granite32-78b + # score: 0.9 + # use_reasoning: false + + - name: psychology + system_prompt: "You are a psychology expert." + semantic_cache_enabled: true + semantic_cache_similarity_threshold: 0.92 + model_scores: + - model: granite32-8b + score: 0.7 + use_reasoning: false + + - name: biology + system_prompt: "You are a biology expert." + model_scores: + - model: granite32-8b + score: 0.9 + use_reasoning: false + + - name: chemistry + system_prompt: "You are a chemistry expert." + model_scores: + - model: granite32-8b + score: 0.7 + use_reasoning: true + # - model: granite32-78b + # score: 0.9 + # use_reasoning: true + + - name: history + system_prompt: "You are a historian." + model_scores: + - model: granite32-8b + score: 0.8 + use_reasoning: false + + - name: health + system_prompt: "You are a health and medical information expert." + semantic_cache_enabled: true + semantic_cache_similarity_threshold: 0.95 + model_scores: + - model: granite32-8b + score: 0.6 + use_reasoning: false + # - model: granite32-78b + # score: 0.8 + # use_reasoning: false + + - name: economics + system_prompt: "You are an economics expert." + model_scores: + - model: granite32-8b + score: 0.9 + use_reasoning: false + + - name: philosophy + system_prompt: "You are a philosophy expert." + model_scores: + - model: granite32-8b + score: 0.6 + use_reasoning: false + # - model: granite32-78b + # score: 0.8 + # use_reasoning: false + + - name: engineering + system_prompt: "You are an engineering expert." 
+ model_scores: + - model: granite32-8b + score: 0.8 + use_reasoning: false + + default_model: granite32-8b + + reasoning_families: + deepseek: + type: "chat_template_kwargs" + parameter: "thinking" + qwen3: + type: "chat_template_kwargs" + parameter: "enable_thinking" + gpt-oss: + type: "reasoning_effort" + parameter: "reasoning_effort" + gpt: + type: "reasoning_effort" + parameter: "reasoning_effort" + + default_reasoning_effort: high + + api: + batch_classification: + max_batch_size: 100 + concurrency_threshold: 5 + max_concurrency: 8 + metrics: + enabled: true + detailed_goroutine_tracking: true + high_resolution_timing: false + sample_rate: 1.0 + duration_buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30] + size_buckets: [1, 2, 5, 10, 20, 50, 100, 200] + + embedding_models: + qwen3_model_path: "models/Qwen3-Embedding-0.6B" + gemma_model_path: "models/embeddinggemma-300m" + use_cpu: true + + observability: + tracing: + enabled: false + provider: "opentelemetry" + exporter: + type: "stdout" + endpoint: "localhost:4317" + insecure: true + sampling: + type: "always_on" + rate: 1.0 + resource: + service_name: "vllm-semantic-router" + service_version: "v0.1.0" + deployment_environment: "production" diff --git a/deploy/kserve/inference-examples/README.md b/deploy/kserve/inference-examples/README.md new file mode 100644 index 000000000..a38e6398f --- /dev/null +++ b/deploy/kserve/inference-examples/README.md @@ -0,0 +1,23 @@ +# KServe InferenceService Examples + +This directory contains example KServe resource configurations for deploying vLLM models on OpenShift AI. + +## Files + +- `servingruntime-granite32-8b.yaml` - ServingRuntime configuration for vLLM with Granite 3.2 8B +- `inferenceservice-granite32-8b.yaml` - InferenceService to deploy the Granite 3.2 8B model + +## Usage + +```bash +# Deploy the ServingRuntime +oc apply -f servingruntime-granite32-8b.yaml + +# Deploy the InferenceService +oc apply -f inferenceservice-granite32-8b.yaml + +# Get the internal service URL for use in semantic router config +oc get inferenceservice granite32-8b -o jsonpath='{.status.components.predictor.address.url}' +``` + +These examples can be customized for your specific models and resource requirements. 
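+
+For example, if the command above prints `http://granite32-8b-predictor.my-namespace.svc.cluster.local` (the namespace is a placeholder for yours), the matching router config entries would use the bare hostname on port 80, roughly like this sketch for `configmap-router-config.yaml`:
+
+```yaml
+vllm_endpoints:
+  - name: "granite32-8b-endpoint"
+    address: "granite32-8b-predictor.my-namespace.svc.cluster.local"  # scheme stripped; internal predictor service
+    port: 80  # KServe internal service port
+    weight: 1
+
+model_config:
+  "granite32-8b":
+    preferred_endpoints: ["granite32-8b-endpoint"]
+```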
diff --git a/deploy/kserve/inference-examples/inferenceservice-granite32-8b.yaml b/deploy/kserve/inference-examples/inferenceservice-granite32-8b.yaml new file mode 100644 index 000000000..85c900991 --- /dev/null +++ b/deploy/kserve/inference-examples/inferenceservice-granite32-8b.yaml @@ -0,0 +1,36 @@ +apiVersion: serving.kserve.io/v1beta1 +kind: InferenceService +metadata: + annotations: + openshift.io/display-name: granite3.2-8b + serving.knative.openshift.io/enablePassthrough: "true" + sidecar.istio.io/inject: "true" + sidecar.istio.io/rewriteAppHTTPProbers: "true" + labels: + opendatahub.io/dashboard: "true" + name: granite32-8b +spec: + predictor: + containerConcurrency: 1 + maxReplicas: 1 + minReplicas: 1 + model: + modelFormat: + name: vLLM + name: "" + resources: + limits: + cpu: "2" + memory: 16Gi + nvidia.com/gpu: "1" + requests: + cpu: "2" + memory: 8Gi + nvidia.com/gpu: "1" + runtime: granite32-8b + storageUri: oci://quay.io/redhat-ai-services/modelcar-catalog:granite-3.2-8b-instruct + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Equal + value: "True" diff --git a/deploy/kserve/inference-examples/servingruntime-granite32-8b.yaml b/deploy/kserve/inference-examples/servingruntime-granite32-8b.yaml new file mode 100644 index 000000000..aa54e4b8d --- /dev/null +++ b/deploy/kserve/inference-examples/servingruntime-granite32-8b.yaml @@ -0,0 +1,52 @@ +apiVersion: serving.kserve.io/v1alpha1 +kind: ServingRuntime +metadata: + annotations: + opendatahub.io/apiProtocol: REST + opendatahub.io/recommended-accelerators: '["nvidia.com/gpu"]' + opendatahub.io/template-display-name: vLLM ServingRuntime for KServe + opendatahub.io/template-name: vllm-runtime + openshift.io/display-name: granite32-8b + labels: + opendatahub.io/dashboard: "true" + name: granite32-8b +spec: + annotations: + prometheus.io/path: /metrics + prometheus.io/port: "8080" + containers: + - args: + - --port=8080 + - --model=/mnt/models + - --served-model-name={{.Name}} + - --enable-auto-tool-choice + - --tool-call-parser + - granite + - --chat-template + - /app/data/template/tool_chat_template_granite.jinja + - --max-model-len + - "120000" + command: + - python + - -m + - vllm.entrypoints.openai.api_server + env: + - name: HF_HOME + value: /tmp/hf_home + image: quay.io/modh/vllm@sha256:4f550996130e7d16cacb24ca9a2865e7cf51eddaab014ceaf31a1ea6ef86d4ec + name: kserve-container + ports: + - containerPort: 8080 + protocol: TCP + volumeMounts: + - mountPath: /dev/shm + name: shm + multiModel: false + supportedModelFormats: + - autoSelect: true + name: vLLM + volumes: + - emptyDir: + medium: Memory + sizeLimit: 2Gi + name: shm diff --git a/deploy/kserve/kustomization.yaml b/deploy/kserve/kustomization.yaml new file mode 100644 index 000000000..c6cc416e8 --- /dev/null +++ b/deploy/kserve/kustomization.yaml @@ -0,0 +1,22 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# Set your namespace here or use: oc apply -k . 
-n +# namespace: your-namespace + +resources: + - serviceaccount.yaml + - pvc.yaml + - configmap-router-config.yaml + - configmap-envoy-config.yaml + - deployment.yaml + - service.yaml + - route.yaml + +commonLabels: + app.kubernetes.io/name: semantic-router + app.kubernetes.io/component: gateway + app.kubernetes.io/part-of: vllm-semantic-router + +# Optional: Add namespace creation if needed +# - namespace.yaml diff --git a/deploy/kserve/pvc.yaml b/deploy/kserve/pvc.yaml new file mode 100644 index 000000000..3e8a6ba2b --- /dev/null +++ b/deploy/kserve/pvc.yaml @@ -0,0 +1,33 @@ +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: semantic-router-models + labels: + app: semantic-router + component: storage +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi # Adjust based on model size requirements + # storageClassName: gp3-csi # Uncomment and set to your storage class if needed + volumeMode: Filesystem + +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: semantic-router-cache + labels: + app: semantic-router + component: storage +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 5Gi # Cache storage - adjust as needed + # storageClassName: gp3-csi # Uncomment and set to your storage class if needed + volumeMode: Filesystem diff --git a/deploy/kserve/route.yaml b/deploy/kserve/route.yaml new file mode 100644 index 000000000..4d3fd7300 --- /dev/null +++ b/deploy/kserve/route.yaml @@ -0,0 +1,21 @@ +apiVersion: route.openshift.io/v1 +kind: Route +metadata: + name: semantic-router-kserve + labels: + app: semantic-router + component: gateway + annotations: + haproxy.router.openshift.io/timeout: "300s" + haproxy.router.openshift.io/balance: "roundrobin" +spec: + to: + kind: Service + name: semantic-router-kserve + weight: 100 + port: + targetPort: envoy-http + tls: + termination: edge + insecureEdgeTerminationPolicy: Redirect + wildcardPolicy: None diff --git a/deploy/kserve/service.yaml b/deploy/kserve/service.yaml new file mode 100644 index 000000000..6656f099d --- /dev/null +++ b/deploy/kserve/service.yaml @@ -0,0 +1,42 @@ +apiVersion: v1 +kind: Service +metadata: + name: semantic-router-kserve + labels: + app: semantic-router + component: gateway + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9190" + prometheus.io/path: "/metrics" +spec: + type: ClusterIP + selector: + app: semantic-router + component: gateway + ports: + - name: envoy-http + port: 80 + targetPort: 8801 + protocol: TCP + - name: envoy-http-direct + port: 8801 + targetPort: 8801 + protocol: TCP + - name: grpc + port: 50051 + targetPort: 50051 + protocol: TCP + - name: metrics + port: 9190 + targetPort: 9190 + protocol: TCP + - name: classify-api + port: 8080 + targetPort: 8080 + protocol: TCP + - name: envoy-admin + port: 19000 + targetPort: 19000 + protocol: TCP + sessionAffinity: None diff --git a/deploy/kserve/serviceaccount.yaml b/deploy/kserve/serviceaccount.yaml new file mode 100644 index 000000000..10277c03e --- /dev/null +++ b/deploy/kserve/serviceaccount.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: semantic-router + labels: + app: semantic-router diff --git a/deploy/kserve/test-semantic-routing.sh b/deploy/kserve/test-semantic-routing.sh new file mode 100755 index 000000000..b6f71e7b7 --- /dev/null +++ b/deploy/kserve/test-semantic-routing.sh @@ -0,0 +1,226 @@ +#!/bin/bash +# Simple test script to verify semantic routing is working +# Tests different query categories and verifies 
routing decisions + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Configuration +NAMESPACE="${NAMESPACE:-$(oc project -q)}" +ROUTE_NAME="semantic-router-kserve" +# Model name to use for testing - get from configmap or override with MODEL_NAME env var +MODEL_NAME="${MODEL_NAME:-$(oc get configmap semantic-router-kserve-config -n "$NAMESPACE" -o jsonpath='{.data.config\.yaml}' 2>/dev/null | grep 'default_model:' | awk '{print $2}' || echo 'your-model-name')}" + +# Get the route URL +echo "Using namespace: $NAMESPACE" +echo "Using model: $MODEL_NAME" +echo "Getting semantic router URL..." +ROUTER_URL=$(oc get route "$ROUTE_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.host}' 2>/dev/null) + +if [ -z "$ROUTER_URL" ]; then + echo -e "${RED}✗${NC} Could not find route '$ROUTE_NAME' in namespace '$NAMESPACE'" + echo "Make sure the semantic router is deployed" + echo "Set NAMESPACE environment variable if using a different namespace" + exit 1 +fi + +# Determine protocol +if oc get route "$ROUTE_NAME" -n "$NAMESPACE" -o jsonpath='{.spec.tls.termination}' 2>/dev/null | grep -q .; then + ROUTER_URL="https://$ROUTER_URL" +else + ROUTER_URL="http://$ROUTER_URL" +fi + +echo -e "${GREEN}✓${NC} Semantic router URL: $ROUTER_URL" +echo "" + +# Function to test classification +test_classification() { + local query="$1" + local expected_category="$2" + + echo -e "${BLUE}Testing:${NC} \"$query\"" + echo -n "Expected category: $expected_category ... " + + # Call classification endpoint + response=$(curl -s -k -X POST "$ROUTER_URL/v1/classify" \ + -H "Content-Type: application/json" \ + -d "{\"text\": \"$query\"}" 2>/dev/null) + + if [ -z "$response" ]; then + echo -e "${RED}FAIL${NC} - No response from server" + return 1 + fi + + # Extract category from response + category=$(echo "$response" | grep -o '"category":"[^"]*"' | cut -d'"' -f4) + model=$(echo "$response" | grep -o '"selected_model":"[^"]*"' | cut -d'"' -f4) + + if [ -z "$category" ]; then + echo -e "${RED}FAIL${NC} - Could not parse category from response" + echo "Response: $response" + return 1 + fi + + if [ "$category" == "$expected_category" ]; then + echo -e "${GREEN}PASS${NC} - Category: $category, Model: $model" + return 0 + else + echo -e "${YELLOW}PARTIAL${NC} - Got: $category (expected: $expected_category), Model: $model" + return 0 + fi +} + +# Function to test chat completion +test_chat_completion() { + local query="$1" + local model="${2:-$MODEL_NAME}" + + echo -e "${BLUE}Testing chat completion:${NC} \"$query\"" + echo -n "Sending request to model: $model ... " + + response=$(curl -s -k -X POST "$ROUTER_URL/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d "{\"model\": \"$model\", \"messages\": [{\"role\": \"user\", \"content\": \"$query\"}], \"max_tokens\": 50}" 2>/dev/null) + + if [ -z "$response" ]; then + echo -e "${RED}FAIL${NC} - No response" + return 1 + fi + + # Check for error in response + if echo "$response" | grep -q '"error"'; then + echo -e "${RED}FAIL${NC}" + echo "Error: $(echo "$response" | grep -o '"message":"[^"]*"' | cut -d'"' -f4)" + return 1 + fi + + # Check for completion + if echo "$response" | grep -q '"choices"'; then + echo -e "${GREEN}PASS${NC}" + # Extract first few words of response + content=$(echo "$response" | grep -o '"content":"[^"]*"' | head -1 | cut -d'"' -f4 | cut -c1-100) + echo " Response preview: $content..." 
+ return 0 + else + echo -e "${RED}FAIL${NC} - Invalid response format" + return 1 + fi +} + +echo "==================================================" +echo "Semantic Routing Validation Tests" +echo "==================================================" +echo "" + +# Test 1: Check /v1/models endpoint +echo -e "${BLUE}Test 1:${NC} Checking /v1/models endpoint" +models_response=$(curl -s -k "$ROUTER_URL/v1/models" 2>/dev/null) +if echo "$models_response" | grep -q '"object":"list"'; then + echo -e "${GREEN}✓${NC} Models endpoint responding correctly" + echo "Available models: $(echo "$models_response" | grep -o '"id":"[^"]*"' | cut -d'"' -f4 | tr '\n' ', ' | sed 's/,$//')" +else + echo -e "${RED}✗${NC} Models endpoint not responding correctly" + echo "Response: $models_response" +fi +echo "" + +# Test 2: Classification tests for different categories +echo -e "${BLUE}Test 2:${NC} Testing category classification" +echo "" + +test_classification "What is the derivative of x squared?" "math" +test_classification "Explain quantum entanglement in physics" "physics" +test_classification "Write a function to reverse a string in Python" "computer science" +test_classification "What are the main causes of World War II?" "history" +test_classification "How do I start a small business?" "business" +test_classification "What is the molecular structure of water?" "chemistry" +test_classification "Explain photosynthesis in plants" "biology" +test_classification "Hello, how are you today?" "other" + +echo "" + +# Test 3: End-to-end chat completion +echo -e "${BLUE}Test 3:${NC} Testing end-to-end chat completion" +echo "" + +test_chat_completion "What is 2+2? Answer briefly." +test_chat_completion "Tell me a joke" + +echo "" + +# Test 4: PII detection (if enabled) +echo -e "${BLUE}Test 4:${NC} Testing PII detection" +echo "" + +echo -e "${BLUE}Testing:${NC} Query with PII (SSN)" +response=$(curl -s -k -X POST "$ROUTER_URL/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d "{\"model\": \"$MODEL_NAME\", \"messages\": [{\"role\": \"user\", \"content\": \"My SSN is 123-45-6789\"}], \"max_tokens\": 50}" 2>/dev/null) + +if echo "$response" | grep -qi "pii\|blocked\|detected"; then + echo -e "${GREEN}✓${NC} PII detection working - request blocked or flagged" +elif echo "$response" | grep -q '"error"'; then + echo -e "${GREEN}✓${NC} PII protection active - request rejected" + echo " Message: $(echo "$response" | grep -o '"message":"[^"]*"' | cut -d'"' -f4)" +else + echo -e "${YELLOW}⚠${NC} PII may have passed through (check if PII policy allows it)" +fi + +echo "" + +# Test 5: Semantic caching +echo -e "${BLUE}Test 5:${NC} Testing semantic caching" +echo "" + +CACHE_QUERY="What is the capital of France?" + +echo "First request (cache miss expected)..." +time1_start=$(date +%s%N) +response1=$(curl -s -k -X POST "$ROUTER_URL/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d "{\"model\": \"$MODEL_NAME\", \"messages\": [{\"role\": \"user\", \"content\": \"$CACHE_QUERY\"}], \"max_tokens\": 20}" 2>/dev/null) +time1_end=$(date +%s%N) +time1=$((($time1_end - $time1_start) / 1000000)) + +sleep 1 + +echo "Second request (cache hit expected)..." 
+time2_start=$(date +%s%N) +response2=$(curl -s -k -X POST "$ROUTER_URL/v1/chat/completions" \ + -H "Content-Type: application/json" \ + -d "{\"model\": \"$MODEL_NAME\", \"messages\": [{\"role\": \"user\", \"content\": \"$CACHE_QUERY\"}], \"max_tokens\": 20}" 2>/dev/null) +time2_end=$(date +%s%N) +time2=$((($time2_end - $time2_start) / 1000000)) + +echo "First request: ${time1}ms" +echo "Second request: ${time2}ms" + +if [ "$time2" -lt "$time1" ]; then + speedup=$((($time1 - $time2) * 100 / $time1)) + echo -e "${GREEN}✓${NC} Cache appears to be working (${speedup}% faster)" +else + echo -e "${YELLOW}⚠${NC} Cache behavior unclear or not significant" +fi + +echo "" +echo "==================================================" +echo "Validation Complete" +echo "==================================================" +echo "" +echo "Semantic routing is operational!" +echo "" +echo "Next steps:" +echo " • Review the test results above" +echo " • Check logs: oc logs -n $NAMESPACE -l app=semantic-router -c semantic-router" +echo " • View metrics: oc port-forward -n $NAMESPACE svc/$ROUTE_NAME 9190:9190" +echo " • Test with your own queries: curl -k \"$ROUTER_URL/v1/chat/completions\" \\" +echo " -H 'Content-Type: application/json' \\" +echo " -d '{\"model\": \"$MODEL_NAME\", \"messages\": [{\"role\": \"user\", \"content\": \"Your query here\"}]}'" +echo "" From a2c5fed5d212a2c73b8f0c5e62d23446bafa6a96 Mon Sep 17 00:00:00 2001 From: Ryan Cook Date: Fri, 31 Oct 2025 16:17:08 -0400 Subject: [PATCH 2/4] fix of lint Signed-off-by: Ryan Cook --- deploy/kserve/README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/deploy/kserve/README.md b/deploy/kserve/README.md index 218e4ccd0..aa2b5f8c6 100644 --- a/deploy/kserve/README.md +++ b/deploy/kserve/README.md @@ -32,6 +32,7 @@ Client Request (OpenAI API) ``` The deployment runs two containers in a single pod: + 1. **Semantic Router**: ExtProc service that performs classification and routing logic 2. **Envoy Proxy**: HTTP proxy that integrates with the semantic router via gRPC @@ -51,6 +52,7 @@ oc get inferenceservice ``` Example output: + ``` NAME URL READY PREV LATEST granite32-8b https://granite32-8b-your-ns.apps... True 100 @@ -63,6 +65,7 @@ oc get inferenceservice granite32-8b -o jsonpath='{.status.components.predictor. 
``` Example output: + ``` http://granite32-8b-predictor.your-namespace.svc.cluster.local ``` @@ -82,6 +85,7 @@ vllm_endpoints: ``` **Important**: + - Replace `` with your actual namespace - Replace `your-model` with your InferenceService name - Use the **internal cluster URL** format: `-predictor..svc.cluster.local` @@ -116,6 +120,7 @@ categories: ``` **Category Scoring**: + - Scores range from 0.0 to 1.0 - Higher scores indicate better suitability for the category - The router selects the model with the highest score for each query category @@ -132,6 +137,7 @@ resources: ``` Model storage requirements: + - Category classifier: ~500MB - PII classifier: ~500MB - Jailbreak classifier: ~500MB @@ -263,6 +269,7 @@ curl http://localhost:9190/metrics ``` Key metrics: + - `semantic_router_classification_duration_seconds`: Classification latency - `semantic_router_cache_hit_total`: Cache hit count - `semantic_router_pii_detections_total`: PII detection count @@ -307,6 +314,7 @@ oc get pvc ``` **Common issues**: + - PVC pending: No storage class available or insufficient capacity - ImagePullBackOff: Check image registry permissions - Init container failing: Network issues downloading models from HuggingFace @@ -354,11 +362,13 @@ curl -k "https://$ROUTER_URL/v1/classify" \ ### 503 Service Unavailable **Possible causes**: + 1. InferenceService is not ready 2. Incorrect endpoint address in config 3. Network policy blocking traffic **Solutions**: + ```bash # Verify InferenceService is ready oc get inferenceservice @@ -379,6 +389,7 @@ To add additional models: 1. **Deploy InferenceService** (if not already deployed) 2. **Update ConfigMap** (`configmap-router-config.yaml`): + ```yaml vllm_endpoints: - name: "new-model-endpoint" @@ -403,6 +414,7 @@ To add additional models: ``` 3. 
**Apply updated ConfigMap**: + ```bash oc apply -f configmap-router-config.yaml @@ -454,6 +466,7 @@ oc scale deployment/semantic-router-kserve --replicas=3 Point your OpenAI client to the semantic router: **Python Example**: + ```python from openai import OpenAI @@ -471,6 +484,7 @@ print(response.choices[0].message.content) ``` **cURL Example**: + ```bash curl -k "https://semantic-router-your-namespace.apps.your-cluster.com/v1/chat/completions" \ -H "Content-Type: application/json" \ @@ -507,5 +521,6 @@ oc delete serviceaccount semantic-router ## Support For issues and questions: + - GitHub Issues: https://github.com/vllm-project/semantic-router/issues - Documentation: https://vllm-semantic-router.com/docs From 7558477958472166de0969bd3e84380b6598a9dd Mon Sep 17 00:00:00 2001 From: Ryan Cook Date: Sun, 2 Nov 2025 15:21:35 -0500 Subject: [PATCH 3/4] removal of spellcheck errs Signed-off-by: Ryan Cook --- deploy/kserve/test-semantic-routing.sh | 6 +++--- website/package-lock.json | 29 ++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/deploy/kserve/test-semantic-routing.sh b/deploy/kserve/test-semantic-routing.sh index b6f71e7b7..e2861e8ec 100755 --- a/deploy/kserve/test-semantic-routing.sh +++ b/deploy/kserve/test-semantic-routing.sh @@ -187,7 +187,7 @@ response1=$(curl -s -k -X POST "$ROUTER_URL/v1/chat/completions" \ -H "Content-Type: application/json" \ -d "{\"model\": \"$MODEL_NAME\", \"messages\": [{\"role\": \"user\", \"content\": \"$CACHE_QUERY\"}], \"max_tokens\": 20}" 2>/dev/null) time1_end=$(date +%s%N) -time1=$((($time1_end - $time1_start) / 1000000)) +time1=$(((time1_end - time1_start) / 1000000)) sleep 1 @@ -197,13 +197,13 @@ response2=$(curl -s -k -X POST "$ROUTER_URL/v1/chat/completions" \ -H "Content-Type: application/json" \ -d "{\"model\": \"$MODEL_NAME\", \"messages\": [{\"role\": \"user\", \"content\": \"$CACHE_QUERY\"}], \"max_tokens\": 20}" 2>/dev/null) time2_end=$(date +%s%N) -time2=$((($time2_end - $time2_start) / 1000000)) +time2=$(((time2_end - time2_start) / 1000000)) echo "First request: ${time1}ms" echo "Second request: ${time2}ms" if [ "$time2" -lt "$time1" ]; then - speedup=$((($time1 - $time2) * 100 / $time1)) + speedup=$(((time1 - time2) * 100 / time1)) echo -e "${GREEN}✓${NC} Cache appears to be working (${speedup}% faster)" else echo -e "${YELLOW}⚠${NC} Cache behavior unclear or not significant" diff --git a/website/package-lock.json b/website/package-lock.json index 2e3db8bc5..2870663a9 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -179,6 +179,7 @@ "resolved": "https://registry.npmmirror.com/@algolia/client-search/-/client-search-5.37.0.tgz", "integrity": "sha512-DAFVUvEg+u7jUs6BZiVz9zdaUebYULPiQ4LM2R4n8Nujzyj7BZzGr2DCd85ip4p/cx7nAZWKM8pLcGtkTRTdsg==", "license": "MIT", + "peer": true, "dependencies": { "@algolia/client-common": "5.37.0", "@algolia/requester-browser-xhr": "5.37.0", @@ -326,6 +327,7 @@ "resolved": "https://registry.npmmirror.com/@babel/core/-/core-7.28.4.tgz", "integrity": "sha512-2BCOP7TN8M+gVDj7/ht3hsaO/B/n5oDbiAyyvnRlNOs+u1o+JWNYTQrmpuNp1/Wq2gcFrI01JAW+paEKDMx/CA==", "license": "MIT", + "peer": true, "dependencies": { "@babel/code-frame": "^7.27.1", "@babel/generator": "^7.28.3", @@ -2160,6 +2162,7 @@ } ], "license": "MIT", + "peer": true, "engines": { "node": ">=18" }, @@ -2182,6 +2185,7 @@ } ], "license": "MIT", + "peer": true, "engines": { "node": ">=18" } @@ -2291,6 +2295,7 @@ "resolved": 
"https://registry.npmmirror.com/postcss-selector-parser/-/postcss-selector-parser-7.1.0.tgz", "integrity": "sha512-8sLjZwK0R+JlxlYcTuVnyT2v+htpdrjDOKuMcOVdYjt52Lh8hWRYpxBPoKx/Zg+bcjc3wx6fmQevMmUztS/ccA==", "license": "MIT", + "peer": true, "dependencies": { "cssesc": "^3.0.0", "util-deprecate": "^1.0.2" @@ -2683,6 +2688,7 @@ "resolved": "https://registry.npmmirror.com/postcss-selector-parser/-/postcss-selector-parser-7.1.0.tgz", "integrity": "sha512-8sLjZwK0R+JlxlYcTuVnyT2v+htpdrjDOKuMcOVdYjt52Lh8hWRYpxBPoKx/Zg+bcjc3wx6fmQevMmUztS/ccA==", "license": "MIT", + "peer": true, "dependencies": { "cssesc": "^3.0.0", "util-deprecate": "^1.0.2" @@ -3614,6 +3620,7 @@ "resolved": "https://registry.npmmirror.com/@docusaurus/plugin-content-docs/-/plugin-content-docs-3.8.1.tgz", "integrity": "sha512-oByRkSZzeGNQByCMaX+kif5Nl2vmtj2IHQI2fWjCfCootsdKZDPFLonhIp5s3IGJO7PLUfe0POyw0Xh/RrGXJA==", "license": "MIT", + "peer": true, "dependencies": { "@docusaurus/core": "3.8.1", "@docusaurus/logger": "3.8.1", @@ -5079,6 +5086,7 @@ "resolved": "https://registry.npmmirror.com/@mdx-js/react/-/react-3.1.1.tgz", "integrity": "sha512-f++rKLQgUVYDAtECQ6fn/is15GkEH9+nZPM3MS0RcxVqoTfawHvDlSCH7JbMhAM6uJ32v3eXLvLmLvjGu7PTQw==", "license": "MIT", + "peer": true, "dependencies": { "@types/mdx": "^2.0.0" }, @@ -5410,6 +5418,7 @@ "resolved": "https://registry.npmmirror.com/@svgr/core/-/core-8.1.0.tgz", "integrity": "sha512-8QqtOQT5ACVlmsvKOJNEaWmRPmcojMOzCz4Hs2BGG/toAp/K38LcsMRyLp349glq5AzJbCEeimEoxaX6v/fLrA==", "license": "MIT", + "peer": true, "dependencies": { "@babel/core": "^7.21.3", "@svgr/babel-preset": "8.1.0", @@ -6059,6 +6068,7 @@ "resolved": "https://registry.npmmirror.com/@types/react/-/react-19.1.16.tgz", "integrity": "sha512-WBM/nDbEZmDUORKnh5i1bTnAz6vTohUf9b8esSMu+b24+srbaxa04UbJgWx78CVfNXA20sNu0odEIluZDFdCog==", "license": "MIT", + "peer": true, "dependencies": { "csstype": "^3.0.2" } @@ -6242,6 +6252,7 @@ "integrity": "sha512-TGf22kon8KW+DeKaUmOibKWktRY8b2NSAZNdtWh798COm1NWx8+xJ6iFBtk3IvLdv6+LGLJLRlyhrhEDZWargQ==", "dev": true, "license": "MIT", + "peer": true, "dependencies": { "@typescript-eslint/scope-manager": "8.45.0", "@typescript-eslint/types": "8.45.0", @@ -6633,6 +6644,7 @@ "resolved": "https://registry.npmmirror.com/acorn/-/acorn-8.15.0.tgz", "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", "license": "MIT", + "peer": true, "bin": { "acorn": "bin/acorn" }, @@ -6700,6 +6712,7 @@ "resolved": "https://registry.npmmirror.com/ajv/-/ajv-6.12.6.tgz", "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==", "license": "MIT", + "peer": true, "dependencies": { "fast-deep-equal": "^3.1.1", "fast-json-stable-stringify": "^2.0.0", @@ -6764,6 +6777,7 @@ "resolved": "https://registry.npmmirror.com/algoliasearch/-/algoliasearch-5.37.0.tgz", "integrity": "sha512-y7gau/ZOQDqoInTQp0IwTOjkrHc4Aq4R8JgpmCleFwiLl+PbN2DMWoDUWZnrK8AhNJwT++dn28Bt4NZYNLAmuA==", "license": "MIT", + "peer": true, "dependencies": { "@algolia/abtesting": "1.3.0", "@algolia/client-abtesting": "5.37.0", @@ -7396,6 +7410,7 @@ } ], "license": "MIT", + "peer": true, "dependencies": { "caniuse-lite": "^1.0.30001737", "electron-to-chromium": "^1.5.211", @@ -7679,6 +7694,7 @@ "resolved": "https://registry.npmmirror.com/chevrotain/-/chevrotain-11.0.3.tgz", "integrity": "sha512-ci2iJH6LeIkvP9eJW6gpueU8cnZhv85ELY8w8WiFtNjMHA5ad6pQLaJo9mEly/9qUyCpvqX8/POVUTf18/HFdw==", "license": "Apache-2.0", + "peer": true, "dependencies": { 
"@chevrotain/cst-dts-gen": "11.0.3", "@chevrotain/gast": "11.0.3", @@ -8389,6 +8405,7 @@ "resolved": "https://registry.npmmirror.com/postcss-selector-parser/-/postcss-selector-parser-7.1.0.tgz", "integrity": "sha512-8sLjZwK0R+JlxlYcTuVnyT2v+htpdrjDOKuMcOVdYjt52Lh8hWRYpxBPoKx/Zg+bcjc3wx6fmQevMmUztS/ccA==", "license": "MIT", + "peer": true, "dependencies": { "cssesc": "^3.0.0", "util-deprecate": "^1.0.2" @@ -8708,6 +8725,7 @@ "resolved": "https://registry.npmmirror.com/cytoscape/-/cytoscape-3.33.1.tgz", "integrity": "sha512-iJc4TwyANnOGR1OmWhsS9ayRS3s+XQ185FmuHObThD+5AeJCakAAbWv8KimMTt08xCCLNgneQwFp+JRJOr9qGQ==", "license": "MIT", + "peer": true, "engines": { "node": ">=0.10" } @@ -9117,6 +9135,7 @@ "resolved": "https://registry.npmmirror.com/d3-selection/-/d3-selection-3.0.0.tgz", "integrity": "sha512-fmTRWbNMmsmWq6xJV8D19U/gw/bwrHfNXxrIN+HfZgnzqTHp9jOmKMhsTUjXOJnZOdZY9Q28y4yebKzqDKlxlQ==", "license": "ISC", + "peer": true, "engines": { "node": ">=12" } @@ -9998,6 +10017,7 @@ "resolved": "https://registry.npmmirror.com/eslint/-/eslint-9.18.0.tgz", "integrity": "sha512-+waTfRWQlSbpt3KWE+CjrPPYnbq9kfZIYUqapc0uBXyjTp8aYXZDsUH16m39Ryq3NjAVP4tjuF7KaukeqoCoaA==", "license": "MIT", + "peer": true, "dependencies": { "@eslint-community/eslint-utils": "^4.2.0", "@eslint-community/regexpp": "^4.12.1", @@ -16589,6 +16609,7 @@ } ], "license": "MIT", + "peer": true, "dependencies": { "nanoid": "^3.3.11", "picocolors": "^1.1.1", @@ -17492,6 +17513,7 @@ "resolved": "https://registry.npmmirror.com/postcss-selector-parser/-/postcss-selector-parser-7.1.0.tgz", "integrity": "sha512-8sLjZwK0R+JlxlYcTuVnyT2v+htpdrjDOKuMcOVdYjt52Lh8hWRYpxBPoKx/Zg+bcjc3wx6fmQevMmUztS/ccA==", "license": "MIT", + "peer": true, "dependencies": { "cssesc": "^3.0.0", "util-deprecate": "^1.0.2" @@ -18322,6 +18344,7 @@ "resolved": "https://registry.npmmirror.com/react/-/react-18.3.1.tgz", "integrity": "sha512-wS+hAgJShR0KhEvPJArfuPVN1+Hz1t0Y6n5jLrGQbkb4urgPE/0Rve+1kMB1v/oWgHgm4WIcV+i7F2pTVj+2iQ==", "license": "MIT", + "peer": true, "dependencies": { "loose-envify": "^1.1.0" }, @@ -18334,6 +18357,7 @@ "resolved": "https://registry.npmmirror.com/react-dom/-/react-dom-18.3.1.tgz", "integrity": "sha512-5m4nQKp+rZRb09LNH59GM4BxTh9251/ylbKIbpe7TpGxfJ+9kv6BLkLBXIjjspbgbnIBNqlI23tRnTWT0snUIw==", "license": "MIT", + "peer": true, "dependencies": { "loose-envify": "^1.1.0", "scheduler": "^0.23.2" @@ -18390,6 +18414,7 @@ "resolved": "https://registry.npmmirror.com/@docusaurus/react-loadable/-/react-loadable-6.0.0.tgz", "integrity": "sha512-YMMxTUQV/QFSnbgrP3tjDzLHRg7vsbMn8e9HAa8o/1iXoiomo48b7sk/kkmWEuWNDPJVlKSJRB6Y2fHqdJk+SQ==", "license": "MIT", + "peer": true, "dependencies": { "@types/react": "*" }, @@ -18418,6 +18443,7 @@ "resolved": "https://registry.npmmirror.com/react-router/-/react-router-5.3.4.tgz", "integrity": "sha512-Ys9K+ppnJah3QuaRiLxk+jDWOR1MekYQrlytiXxC1RyfbdsZkS5pvKAzCCr031xHixZwpnsYNT5xysdFHQaYsA==", "license": "MIT", + "peer": true, "dependencies": { "@babel/runtime": "^7.12.13", "history": "^4.9.0", @@ -19293,6 +19319,7 @@ "resolved": "https://registry.npmmirror.com/ajv/-/ajv-8.17.1.tgz", "integrity": "sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g==", "license": "MIT", + "peer": true, "dependencies": { "fast-deep-equal": "^3.1.3", "fast-uri": "^3.0.1", @@ -20675,6 +20702,7 @@ "resolved": "https://registry.npmmirror.com/typescript/-/typescript-5.9.3.tgz", "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", "license": 
"Apache-2.0", + "peer": true, "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" @@ -21260,6 +21288,7 @@ "resolved": "https://registry.npmmirror.com/webpack/-/webpack-5.101.3.tgz", "integrity": "sha512-7b0dTKR3Ed//AD/6kkx/o7duS8H3f1a4w3BYpIriX4BzIhjkn4teo05cptsxvLesHFKK5KObnadmCHBwGc+51A==", "license": "MIT", + "peer": true, "dependencies": { "@types/eslint-scope": "^3.7.7", "@types/estree": "^1.0.8", From 04ae77629ec704f0241f55852c3749881ff4ca52 Mon Sep 17 00:00:00 2001 From: Ryan Cook Date: Mon, 3 Nov 2025 15:30:25 -0500 Subject: [PATCH 4/4] remove toleration Signed-off-by: Ryan Cook --- .../inferenceservice-granite32-8b.yaml | 5 ---- website/package-lock.json | 29 ------------------- 2 files changed, 34 deletions(-) diff --git a/deploy/kserve/inference-examples/inferenceservice-granite32-8b.yaml b/deploy/kserve/inference-examples/inferenceservice-granite32-8b.yaml index 85c900991..873ea0b08 100644 --- a/deploy/kserve/inference-examples/inferenceservice-granite32-8b.yaml +++ b/deploy/kserve/inference-examples/inferenceservice-granite32-8b.yaml @@ -29,8 +29,3 @@ spec: nvidia.com/gpu: "1" runtime: granite32-8b storageUri: oci://quay.io/redhat-ai-services/modelcar-catalog:granite-3.2-8b-instruct - tolerations: - - effect: NoSchedule - key: nvidia.com/gpu - operator: Equal - value: "True" diff --git a/website/package-lock.json b/website/package-lock.json index 2870663a9..2e3db8bc5 100644 --- a/website/package-lock.json +++ b/website/package-lock.json @@ -179,7 +179,6 @@ "resolved": "https://registry.npmmirror.com/@algolia/client-search/-/client-search-5.37.0.tgz", "integrity": "sha512-DAFVUvEg+u7jUs6BZiVz9zdaUebYULPiQ4LM2R4n8Nujzyj7BZzGr2DCd85ip4p/cx7nAZWKM8pLcGtkTRTdsg==", "license": "MIT", - "peer": true, "dependencies": { "@algolia/client-common": "5.37.0", "@algolia/requester-browser-xhr": "5.37.0", @@ -327,7 +326,6 @@ "resolved": "https://registry.npmmirror.com/@babel/core/-/core-7.28.4.tgz", "integrity": "sha512-2BCOP7TN8M+gVDj7/ht3hsaO/B/n5oDbiAyyvnRlNOs+u1o+JWNYTQrmpuNp1/Wq2gcFrI01JAW+paEKDMx/CA==", "license": "MIT", - "peer": true, "dependencies": { "@babel/code-frame": "^7.27.1", "@babel/generator": "^7.28.3", @@ -2162,7 +2160,6 @@ } ], "license": "MIT", - "peer": true, "engines": { "node": ">=18" }, @@ -2185,7 +2182,6 @@ } ], "license": "MIT", - "peer": true, "engines": { "node": ">=18" } @@ -2295,7 +2291,6 @@ "resolved": "https://registry.npmmirror.com/postcss-selector-parser/-/postcss-selector-parser-7.1.0.tgz", "integrity": "sha512-8sLjZwK0R+JlxlYcTuVnyT2v+htpdrjDOKuMcOVdYjt52Lh8hWRYpxBPoKx/Zg+bcjc3wx6fmQevMmUztS/ccA==", "license": "MIT", - "peer": true, "dependencies": { "cssesc": "^3.0.0", "util-deprecate": "^1.0.2" @@ -2688,7 +2683,6 @@ "resolved": "https://registry.npmmirror.com/postcss-selector-parser/-/postcss-selector-parser-7.1.0.tgz", "integrity": "sha512-8sLjZwK0R+JlxlYcTuVnyT2v+htpdrjDOKuMcOVdYjt52Lh8hWRYpxBPoKx/Zg+bcjc3wx6fmQevMmUztS/ccA==", "license": "MIT", - "peer": true, "dependencies": { "cssesc": "^3.0.0", "util-deprecate": "^1.0.2" @@ -3620,7 +3614,6 @@ "resolved": "https://registry.npmmirror.com/@docusaurus/plugin-content-docs/-/plugin-content-docs-3.8.1.tgz", "integrity": "sha512-oByRkSZzeGNQByCMaX+kif5Nl2vmtj2IHQI2fWjCfCootsdKZDPFLonhIp5s3IGJO7PLUfe0POyw0Xh/RrGXJA==", "license": "MIT", - "peer": true, "dependencies": { "@docusaurus/core": "3.8.1", "@docusaurus/logger": "3.8.1", @@ -5086,7 +5079,6 @@ "resolved": "https://registry.npmmirror.com/@mdx-js/react/-/react-3.1.1.tgz", "integrity": 
"sha512-f++rKLQgUVYDAtECQ6fn/is15GkEH9+nZPM3MS0RcxVqoTfawHvDlSCH7JbMhAM6uJ32v3eXLvLmLvjGu7PTQw==", "license": "MIT", - "peer": true, "dependencies": { "@types/mdx": "^2.0.0" }, @@ -5418,7 +5410,6 @@ "resolved": "https://registry.npmmirror.com/@svgr/core/-/core-8.1.0.tgz", "integrity": "sha512-8QqtOQT5ACVlmsvKOJNEaWmRPmcojMOzCz4Hs2BGG/toAp/K38LcsMRyLp349glq5AzJbCEeimEoxaX6v/fLrA==", "license": "MIT", - "peer": true, "dependencies": { "@babel/core": "^7.21.3", "@svgr/babel-preset": "8.1.0", @@ -6068,7 +6059,6 @@ "resolved": "https://registry.npmmirror.com/@types/react/-/react-19.1.16.tgz", "integrity": "sha512-WBM/nDbEZmDUORKnh5i1bTnAz6vTohUf9b8esSMu+b24+srbaxa04UbJgWx78CVfNXA20sNu0odEIluZDFdCog==", "license": "MIT", - "peer": true, "dependencies": { "csstype": "^3.0.2" } @@ -6252,7 +6242,6 @@ "integrity": "sha512-TGf22kon8KW+DeKaUmOibKWktRY8b2NSAZNdtWh798COm1NWx8+xJ6iFBtk3IvLdv6+LGLJLRlyhrhEDZWargQ==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@typescript-eslint/scope-manager": "8.45.0", "@typescript-eslint/types": "8.45.0", @@ -6644,7 +6633,6 @@ "resolved": "https://registry.npmmirror.com/acorn/-/acorn-8.15.0.tgz", "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", "license": "MIT", - "peer": true, "bin": { "acorn": "bin/acorn" }, @@ -6712,7 +6700,6 @@ "resolved": "https://registry.npmmirror.com/ajv/-/ajv-6.12.6.tgz", "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==", "license": "MIT", - "peer": true, "dependencies": { "fast-deep-equal": "^3.1.1", "fast-json-stable-stringify": "^2.0.0", @@ -6777,7 +6764,6 @@ "resolved": "https://registry.npmmirror.com/algoliasearch/-/algoliasearch-5.37.0.tgz", "integrity": "sha512-y7gau/ZOQDqoInTQp0IwTOjkrHc4Aq4R8JgpmCleFwiLl+PbN2DMWoDUWZnrK8AhNJwT++dn28Bt4NZYNLAmuA==", "license": "MIT", - "peer": true, "dependencies": { "@algolia/abtesting": "1.3.0", "@algolia/client-abtesting": "5.37.0", @@ -7410,7 +7396,6 @@ } ], "license": "MIT", - "peer": true, "dependencies": { "caniuse-lite": "^1.0.30001737", "electron-to-chromium": "^1.5.211", @@ -7694,7 +7679,6 @@ "resolved": "https://registry.npmmirror.com/chevrotain/-/chevrotain-11.0.3.tgz", "integrity": "sha512-ci2iJH6LeIkvP9eJW6gpueU8cnZhv85ELY8w8WiFtNjMHA5ad6pQLaJo9mEly/9qUyCpvqX8/POVUTf18/HFdw==", "license": "Apache-2.0", - "peer": true, "dependencies": { "@chevrotain/cst-dts-gen": "11.0.3", "@chevrotain/gast": "11.0.3", @@ -8405,7 +8389,6 @@ "resolved": "https://registry.npmmirror.com/postcss-selector-parser/-/postcss-selector-parser-7.1.0.tgz", "integrity": "sha512-8sLjZwK0R+JlxlYcTuVnyT2v+htpdrjDOKuMcOVdYjt52Lh8hWRYpxBPoKx/Zg+bcjc3wx6fmQevMmUztS/ccA==", "license": "MIT", - "peer": true, "dependencies": { "cssesc": "^3.0.0", "util-deprecate": "^1.0.2" @@ -8725,7 +8708,6 @@ "resolved": "https://registry.npmmirror.com/cytoscape/-/cytoscape-3.33.1.tgz", "integrity": "sha512-iJc4TwyANnOGR1OmWhsS9ayRS3s+XQ185FmuHObThD+5AeJCakAAbWv8KimMTt08xCCLNgneQwFp+JRJOr9qGQ==", "license": "MIT", - "peer": true, "engines": { "node": ">=0.10" } @@ -9135,7 +9117,6 @@ "resolved": "https://registry.npmmirror.com/d3-selection/-/d3-selection-3.0.0.tgz", "integrity": "sha512-fmTRWbNMmsmWq6xJV8D19U/gw/bwrHfNXxrIN+HfZgnzqTHp9jOmKMhsTUjXOJnZOdZY9Q28y4yebKzqDKlxlQ==", "license": "ISC", - "peer": true, "engines": { "node": ">=12" } @@ -10017,7 +9998,6 @@ "resolved": "https://registry.npmmirror.com/eslint/-/eslint-9.18.0.tgz", "integrity": 
"sha512-+waTfRWQlSbpt3KWE+CjrPPYnbq9kfZIYUqapc0uBXyjTp8aYXZDsUH16m39Ryq3NjAVP4tjuF7KaukeqoCoaA==", "license": "MIT", - "peer": true, "dependencies": { "@eslint-community/eslint-utils": "^4.2.0", "@eslint-community/regexpp": "^4.12.1", @@ -16609,7 +16589,6 @@ } ], "license": "MIT", - "peer": true, "dependencies": { "nanoid": "^3.3.11", "picocolors": "^1.1.1", @@ -17513,7 +17492,6 @@ "resolved": "https://registry.npmmirror.com/postcss-selector-parser/-/postcss-selector-parser-7.1.0.tgz", "integrity": "sha512-8sLjZwK0R+JlxlYcTuVnyT2v+htpdrjDOKuMcOVdYjt52Lh8hWRYpxBPoKx/Zg+bcjc3wx6fmQevMmUztS/ccA==", "license": "MIT", - "peer": true, "dependencies": { "cssesc": "^3.0.0", "util-deprecate": "^1.0.2" @@ -18344,7 +18322,6 @@ "resolved": "https://registry.npmmirror.com/react/-/react-18.3.1.tgz", "integrity": "sha512-wS+hAgJShR0KhEvPJArfuPVN1+Hz1t0Y6n5jLrGQbkb4urgPE/0Rve+1kMB1v/oWgHgm4WIcV+i7F2pTVj+2iQ==", "license": "MIT", - "peer": true, "dependencies": { "loose-envify": "^1.1.0" }, @@ -18357,7 +18334,6 @@ "resolved": "https://registry.npmmirror.com/react-dom/-/react-dom-18.3.1.tgz", "integrity": "sha512-5m4nQKp+rZRb09LNH59GM4BxTh9251/ylbKIbpe7TpGxfJ+9kv6BLkLBXIjjspbgbnIBNqlI23tRnTWT0snUIw==", "license": "MIT", - "peer": true, "dependencies": { "loose-envify": "^1.1.0", "scheduler": "^0.23.2" @@ -18414,7 +18390,6 @@ "resolved": "https://registry.npmmirror.com/@docusaurus/react-loadable/-/react-loadable-6.0.0.tgz", "integrity": "sha512-YMMxTUQV/QFSnbgrP3tjDzLHRg7vsbMn8e9HAa8o/1iXoiomo48b7sk/kkmWEuWNDPJVlKSJRB6Y2fHqdJk+SQ==", "license": "MIT", - "peer": true, "dependencies": { "@types/react": "*" }, @@ -18443,7 +18418,6 @@ "resolved": "https://registry.npmmirror.com/react-router/-/react-router-5.3.4.tgz", "integrity": "sha512-Ys9K+ppnJah3QuaRiLxk+jDWOR1MekYQrlytiXxC1RyfbdsZkS5pvKAzCCr031xHixZwpnsYNT5xysdFHQaYsA==", "license": "MIT", - "peer": true, "dependencies": { "@babel/runtime": "^7.12.13", "history": "^4.9.0", @@ -19319,7 +19293,6 @@ "resolved": "https://registry.npmmirror.com/ajv/-/ajv-8.17.1.tgz", "integrity": "sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g==", "license": "MIT", - "peer": true, "dependencies": { "fast-deep-equal": "^3.1.3", "fast-uri": "^3.0.1", @@ -20702,7 +20675,6 @@ "resolved": "https://registry.npmmirror.com/typescript/-/typescript-5.9.3.tgz", "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", "license": "Apache-2.0", - "peer": true, "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" @@ -21288,7 +21260,6 @@ "resolved": "https://registry.npmmirror.com/webpack/-/webpack-5.101.3.tgz", "integrity": "sha512-7b0dTKR3Ed//AD/6kkx/o7duS8H3f1a4w3BYpIriX4BzIhjkn4teo05cptsxvLesHFKK5KObnadmCHBwGc+51A==", "license": "MIT", - "peer": true, "dependencies": { "@types/eslint-scope": "^3.7.7", "@types/estree": "^1.0.8",