diff --git a/.github/workflows/test-and-build.yml b/.github/workflows/test-and-build.yml index c355fbc68..370d69880 100644 --- a/.github/workflows/test-and-build.yml +++ b/.github/workflows/test-and-build.yml @@ -84,12 +84,13 @@ jobs: run: make download-models - name: Run semantic router tests - run: make test + run: make test --debug=v env: CI: true CI_MINIMAL_MODELS: ${{ github.event_name == 'pull_request' }} CGO_ENABLED: 1 LD_LIBRARY_PATH: ${{ github.workspace }}/candle-binding/target/release + SKIP_TOOL_CALL_TESTS: true - name: Upload test artifacts on failure if: failure() diff --git a/config/config.development.yaml b/config/config.development.yaml index 49f1372ab..bd448cefe 100644 --- a/config/config.development.yaml +++ b/config/config.development.yaml @@ -60,6 +60,9 @@ categories: default_model: test-model +# Enable OpenAI Responses API adapter (experimental) +enable_responses_adapter: true + # Auto model name for automatic model selection (optional) # Uncomment and set to customize the model name for automatic routing # auto_model_name: "MoM" @@ -75,31 +78,31 @@ observability: tracing: # Enable tracing for development/debugging enabled: true - + # OpenTelemetry provider provider: "opentelemetry" - + exporter: # Stdout exporter prints traces to console (great for debugging) type: "stdout" - + # No endpoint needed for stdout # endpoint: "" # insecure: true - + sampling: # Always sample in development to see all traces type: "always_on" - + # Rate not used for always_on # rate: 1.0 - + resource: # Service name for trace identification service_name: "vllm-semantic-router-dev" - + # Version for development service_version: "dev" - + # Environment identifier deployment_environment: "development" diff --git a/config/config.yaml b/config/config.yaml index 893702f8d..577fdd27e 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -24,16 +24,7 @@ semantic_cache: # Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context) # Default: "bert" (fastest, lowest memory) embedding_model: "bert" - # HNSW index configuration (for memory backend only) - use_hnsw: true # Enable HNSW index for faster similarity search - hnsw_m: 16 # Number of bi-directional links (higher = better recall, more memory) - hnsw_ef_construction: 200 # Construction parameter (higher = better quality, slower build) - # Hybrid cache configuration (when backend_type: "hybrid") - # Combines in-memory HNSW for fast search with Milvus for scalable storage - # max_memory_entries: 100000 # Max entries in HNSW index (default: 100,000) - # backend_config_path: "config/milvus.yaml" # Path to Milvus config - tools: enabled: true top_k: 3 @@ -223,7 +214,7 @@ router: traditional_attention_dropout_prob: 0.1 # Traditional model attention dropout probability tie_break_confidence: 0.5 # Confidence value for tie-breaking situations -default_model: openai/gpt-oss-20b +default_model: qwen3 # Reasoning family configurations reasoning_families: @@ -245,6 +236,9 @@ reasoning_families: # Global default reasoning effort level default_reasoning_effort: high +# Enable OpenAI Responses API adapter (experimental) +enable_responses_adapter: true + # API Configuration api: batch_classification: diff --git a/dashboard/backend/.gitkeep b/dashboard/backend/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/deploy/docker-compose/addons/llm-router-dashboard.json b/deploy/docker-compose/addons/llm-router-dashboard.json index ff136b6ec..09eafb610 100644 --- 
a/deploy/docker-compose/addons/llm-router-dashboard.json +++ b/deploy/docker-compose/addons/llm-router-dashboard.json @@ -609,6 +609,126 @@ "title": "TPOT (p95) by Model (sec/token)", "type": "timeseries" }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Seconds", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "smooth", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "id": 7, + "options": { + "legend": { + "calcs": [ + "mean", + "max", + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "11.5.1", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, sum(rate(llm_model_completion_latency_seconds_bucket[5m])) by (le, model))", + "legendFormat": "p50 {{model}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.90, sum(rate(llm_model_completion_latency_seconds_bucket[5m])) by (le, model))", + "legendFormat": "p90 {{model}}", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, sum(rate(llm_model_completion_latency_seconds_bucket[5m])) by (le, model))", + "legendFormat": "p99 {{model}}", + "range": true, + "refId": "C" + } + ], + "title": "Model Completion Latency (p50/p90/p99)", + "type": "timeseries" + }, { "datasource": { "type": "prometheus", @@ -672,7 +792,7 @@ "x": 0, "y": 24 }, - "id": 7, + "id": 8, "options": { "legend": { "calcs": [ @@ -779,9 +899,9 @@ "h": 8, "w": 12, "x": 12, - "y": 24 + "y": 48 }, - "id": 8, + "id": 9, "options": { "legend": { "calcs": [ @@ -883,7 +1003,7 @@ "x": 0, "y": 32 }, - "id": 9, + "id": 10, "options": { "legend": { "calcs": [ @@ -967,7 +1087,7 @@ "x": 12, "y": 32 }, - "id": 10, + "id": 11, "options": { "displayMode": "gradient", "legend": { @@ -1039,7 +1159,7 @@ "x": 0, "y": 40 }, - "id": 11, + "id": 12, "options": { "displayMode": "gradient", "legend": { @@ -1088,117 +1208,79 @@ }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, + "color": { "mode": "palette-classic" }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "Seconds", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, + "axisLabel": "Requests/sec", "drawStyle": "line", "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": 
false, - "viz": false - }, - "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "showPoints": "auto" }, "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "s" + "thresholds": { "mode": "absolute", "steps": [{"color":"green","value":null}] }, + "unit": "reqps" }, "overrides": [] }, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 40 - }, - "id": 12, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 36 }, + "id": 13, "options": { - "legend": { - "calcs": [ - "mean", - "max", - "lastNotNull" - ], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "hideZeros": false, - "mode": "multi", - "sort": "none" - } + "legend": { "calcs": ["mean","max","lastNotNull"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "none" } }, - "pluginVersion": "11.5.1", "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "histogram_quantile(0.50, sum(rate(llm_model_completion_latency_seconds_bucket[5m])) by (le, model))", - "legendFormat": "p50 {{model}}", + "expr": "sum(rate(llm_responses_adapter_requests_total[5m])) by (streaming)", + "legendFormat": "Requests {{streaming}}", "range": true, "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" + } + ], + "title": "Responses Adapter Requests Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { "mode": "palette-classic" }, + "custom": { + "axisLabel": "Events/sec", + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "smooth", + "lineWidth": 1, + "showPoints": "auto" }, - "editorMode": "code", - "expr": "histogram_quantile(0.90, sum(rate(llm_model_completion_latency_seconds_bucket[5m])) by (le, model))", - "legendFormat": "p90 {{model}}", - "range": true, - "refId": "B" + "mappings": [], + "thresholds": { "mode": "absolute", "steps": [{"color":"green","value":null}] }, + "unit": "ops" }, + "overrides": [] + }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 36 }, + "id": 14, + "options": { + "legend": { "calcs": ["mean","max","lastNotNull"], "displayMode": "table", "placement": "bottom", "showLegend": true }, + "tooltip": { "mode": "multi", "sort": "none" } + }, + "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "${DS_PROMETHEUS}" - }, + "datasource": { "type": "prometheus", "uid": "${DS_PROMETHEUS}" }, "editorMode": "code", - "expr": "histogram_quantile(0.99, sum(rate(llm_model_completion_latency_seconds_bucket[5m])) by (le, model))", - "legendFormat": "p99 {{model}}", + "expr": "sum(rate(llm_responses_adapter_sse_events_total[5m])) by (event_type)", + "legendFormat": "{{event_type}}", "range": true, - "refId": "C" + "refId": "A" } ], - "title": "Model Completion Latency (p50/p90/p99)", + "title": "Responses Adapter SSE Events Rate", "type": "timeseries" } ], @@ -1233,6 +1315,6 @@ "timezone": "", "title": "LLM Router Metrics", "uid": "llm-router-metrics", - "version": 14, + "version": 15, "weekStart": "" } \ No newline at end of file diff --git 
a/deploy/docker-compose/addons/vllm_semantic_router_pipe.py b/deploy/docker-compose/addons/vllm_semantic_router_pipe.py index a1578abfd..57788173e 100644 --- a/deploy/docker-compose/addons/vllm_semantic_router_pipe.py +++ b/deploy/docker-compose/addons/vllm_semantic_router_pipe.py @@ -35,6 +35,9 @@ class Valves(BaseModel): # Request timeout in seconds timeout: int = 300 + # Prefer OpenAI Responses API instead of Chat Completions + use_responses_api: bool = True + def __init__(self): # Important: type should be "manifold" instead of "pipe" # manifold type Pipeline will be displayed in the model list @@ -51,6 +54,7 @@ def __init__(self): "log_vsr_info": True, "debug": True, "timeout": 300, + "use_responses_api": True, } ) @@ -380,7 +384,10 @@ def pipe( print("=" * 80) # Prepare the request to vLLM Semantic Router - url = f"{self.valves.vsr_base_url}/v1/chat/completions" + if self.valves.use_responses_api: + url = f"{self.valves.vsr_base_url}/v1/responses" + else: + url = f"{self.valves.vsr_base_url}/v1/chat/completions" if self.valves.debug: print(f"\n📡 Sending request to: {url}") @@ -412,6 +419,10 @@ def pipe( print(f" Streaming: {is_streaming}") print(f" Timeout: {self.valves.timeout}s") + # If using Responses API for streaming, set Accept header for SSE + if self.valves.use_responses_api and is_streaming: + headers["Accept"] = "text/event-stream" + try: if self.valves.debug: print(f"\n🔌 Connecting to vLLM Semantic Router...") @@ -459,7 +470,12 @@ def pipe( if self.valves.debug: print(f"\n📺 Handling streaming response...") # Handle streaming response - return self._handle_streaming_response(response, vsr_headers) + if self.valves.use_responses_api: + return self._handle_streaming_response_responses( + response, vsr_headers + ) + else: + return self._handle_streaming_response(response, vsr_headers) else: if self.valves.debug: print(f"\n📄 Handling non-streaming response...") @@ -493,13 +509,29 @@ def pipe( print("=" * 80 + "\n") return f"{error_msg}: {str(e)}" - if self.valves.debug: - print(f" Response data keys: {list(response_data.keys())}") - if "choices" in response_data: - print(f" Choices count: {len(response_data['choices'])}") - - # Add VSR info to the response if enabled - if self.valves.show_vsr_info and vsr_headers: + # Transform Responses API JSON to Chat Completions JSON if enabled + if self.valves.use_responses_api: + response_data = self._responses_to_chat_completions( + response_data, vsr_headers + ) + if self.valves.debug: + print( + f" Transformed Responses → ChatCompletions. keys: {list(response_data.keys())}" + ) + if "choices" in response_data: + print(f" Choices count: {len(response_data['choices'])}") + else: + if self.valves.debug: + print(f" Response data keys: {list(response_data.keys())}") + if "choices" in response_data: + print(f" Choices count: {len(response_data['choices'])}") + + # Add VSR info to the response if enabled (only for Chat Completions shape) + if ( + (not self.valves.use_responses_api) + and self.valves.show_vsr_info + and vsr_headers + ): vsr_info = self._format_vsr_info(vsr_headers, position="prefix") if self.valves.debug: @@ -540,6 +572,69 @@ def pipe( print("=" * 80 + "\n") return error_msg + def _responses_to_chat_completions(self, resp: dict, vsr_headers: dict) -> dict: + """ + Convert minimal OpenAI Responses JSON to legacy Chat Completions JSON + and inject VSR info as prefix to assistant content. 
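+
+        Only the minimal shape emitted by the router's Responses adapter is handled:
+        output items of type "message" with plain string content are concatenated into
+        the assistant message, and usage.input_tokens / usage.output_tokens are mapped
+        to prompt_tokens / completion_tokens.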
+ """ + # Extract assistant text from output array + content_parts = [] + output = resp.get("output", []) + if isinstance(output, list): + for item in output: + if isinstance(item, dict) and item.get("type") == "message": + if item.get("role") == "assistant": + text = item.get("content", "") + if isinstance(text, str) and text: + content_parts.append(text) + content = "".join(content_parts) + + # Map usage + usage = resp.get("usage", {}) or {} + prompt_tokens = usage.get("input_tokens", 0) + completion_tokens = usage.get("output_tokens", 0) + total_tokens = usage.get("total_tokens", prompt_tokens + completion_tokens) + + # Build Chat Completions JSON + chat = { + "id": resp.get("id", ""), + "object": "chat.completion", + "created": resp.get("created", 0), + "model": resp.get("model", "auto"), + "system_fingerprint": "vsr", + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": content}, + "logprobs": None, + "finish_reason": resp.get("stop_reason", "stop"), + } + ], + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + "prompt_tokens_details": {"cached_tokens": 0}, + "completion_tokens_details": {"reasoning_tokens": 0}, + }, + "token_usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + "prompt_tokens_details": {"cached_tokens": 0}, + "completion_tokens_details": {"reasoning_tokens": 0}, + }, + } + + # Prepend VSR info if enabled + if self.valves.show_vsr_info and vsr_headers: + vsr_info = self._format_vsr_info(vsr_headers, position="prefix") + chat["choices"][0]["message"]["content"] = ( + vsr_info + chat["choices"][0]["message"]["content"] + ) + + return chat + def _handle_streaming_response( self, response: requests.Response, vsr_headers: dict ) -> Generator: @@ -646,3 +741,106 @@ def _handle_streaming_response( except json.JSONDecodeError: # If not valid JSON, pass through as-is yield f"data: {data_str}\n\n" + + def _handle_streaming_response_responses( + self, response: requests.Response, vsr_headers: dict + ) -> Generator: + """ + Handle SSE stream for Responses API and convert to Chat Completions chunks. + Inject VSR info at the first assistant content delta. 
+ """ + vsr_info_added = False + + for line in response.iter_lines(decode_unicode=True): + if not line: + continue + + if not line.startswith("data: "): + continue + + data_str = line[6:].strip() + + if data_str == "[DONE]": + yield f"data: [DONE]\n\n" + if self.valves.debug: + print(f"✅ Streaming completed (Responses)") + continue + + try: + ev = json.loads(data_str) + except json.JSONDecodeError: + # Pass through unknown payloads + yield f"data: {data_str}\n\n" + continue + + etype = ev.get("type", "") + + if etype == "response.output_text.delta": + delta_text = ev.get("delta", "") + if self.valves.show_vsr_info and not vsr_info_added: + vsr_info = self._format_vsr_info(vsr_headers, position="prefix") + delta_text = vsr_info + (delta_text or "") + vsr_info_added = True + + chunk = { + "id": f"chatcmpl-{ev.get('created', 0)}", + "object": "chat.completion.chunk", + "created": ev.get("created", 0), + "model": "auto", + "system_fingerprint": "vsr", + "choices": [ + { + "index": 0, + "delta": {"content": delta_text}, + "logprobs": None, + "finish_reason": None, + } + ], + } + yield f"data: {json.dumps(chunk)}\n\n" + + elif etype == "response.tool_calls.delta": + chunk = { + "id": f"chatcmpl-{ev.get('created', 0)}", + "object": "chat.completion.chunk", + "created": ev.get("created", 0), + "model": "auto", + "system_fingerprint": "vsr", + "choices": [ + { + "index": 0, + "delta": { + "function_call": { + "name": ev.get("name", ""), + "arguments": ev.get("arguments_delta", ""), + } + }, + "logprobs": None, + "finish_reason": None, + } + ], + } + yield f"data: {json.dumps(chunk)}\n\n" + + elif etype == "response.completed": + finish = ev.get("stop_reason", "stop") + chunk = { + "id": "chatcmpl-end", + "object": "chat.completion.chunk", + "created": ev.get("created", 0), + "model": "auto", + "system_fingerprint": "vsr", + "choices": [ + { + "index": 0, + "delta": {}, + "logprobs": None, + "finish_reason": finish, + } + ], + } + yield f"data: {json.dumps(chunk)}\n\n" + + else: + # Unknown event type: pass-through + yield f"data: {data_str}\n\n" diff --git a/src/semantic-router/pkg/config/config.go b/src/semantic-router/pkg/config/config.go index c72cbce17..5339e7321 100644 --- a/src/semantic-router/pkg/config/config.go +++ b/src/semantic-router/pkg/config/config.go @@ -131,6 +131,10 @@ type RouterConfig struct { // Gateway route cache clearing ClearRouteCache bool `yaml:"clear_route_cache"` + + // EnableResponsesAdapter enables the compatibility shim for OpenAI Responses API (/v1/responses) + // When enabled, POST /v1/responses requests are adapted to legacy /v1/chat/completions. + EnableResponsesAdapter bool `yaml:"enable_responses_adapter"` } // APIConfig represents configuration for API endpoints diff --git a/src/semantic-router/pkg/extproc/mapping_responses.go b/src/semantic-router/pkg/extproc/mapping_responses.go new file mode 100644 index 000000000..539cf1022 --- /dev/null +++ b/src/semantic-router/pkg/extproc/mapping_responses.go @@ -0,0 +1,300 @@ +package extproc + +import ( + "encoding/json" + "fmt" + "strings" +) + +// mapResponsesRequestToChatCompletions converts a minimal OpenAI Responses API request +// into a legacy Chat Completions request JSON. Supports only text input for PR1. 
+func mapResponsesRequestToChatCompletions(original []byte) ([]byte, error) { + var req map[string]interface{} + if err := json.Unmarshal(original, &req); err != nil { + return nil, err + } + + // Extract model + model, _ := req["model"].(string) + if model == "" { + return nil, fmt.Errorf("missing model") + } + + // Derive user content + var userContent string + if input, ok := req["input"]; ok { + switch v := input.(type) { + case string: + userContent = v + case []interface{}: + // Join any string elements; ignore non-string for now + var parts []string + for _, it := range v { + if s, ok := it.(string); ok { + parts = append(parts, s) + } else if m, ok := it.(map[string]interface{}); ok { + // Try common shapes: {type:"input_text"|"text", text:"..."} + if t, _ := m["type"].(string); t == "input_text" || t == "text" { + if txt, _ := m["text"].(string); txt != "" { + parts = append(parts, txt) + } + } + } + } + userContent = strings.TrimSpace(strings.Join(parts, " ")) + default: + // unsupported multimodal + return nil, fmt.Errorf("unsupported input type") + } + } else if msgs, ok := req["messages"].([]interface{}); ok { + // Fallback: if caller already provided messages, pass them through + // This enables easy migration from chat/completions + mapped := map[string]interface{}{ + "model": model, + "messages": msgs, + } + // Map basic params + if v, ok := req["temperature"]; ok { + mapped["temperature"] = v + } + if v, ok := req["top_p"]; ok { + mapped["top_p"] = v + } + if v, ok := req["max_output_tokens"]; ok { + mapped["max_tokens"] = v + } + return json.Marshal(mapped) + } + + if userContent == "" { + return nil, fmt.Errorf("empty input") + } + + // Build minimal Chat Completions request + mapped := map[string]interface{}{ + "model": model, + "messages": []map[string]interface{}{ + {"role": "user", "content": userContent}, + }, + } + // Map basic params + if v, ok := req["temperature"]; ok { + mapped["temperature"] = v + } + if v, ok := req["top_p"]; ok { + mapped["top_p"] = v + } + if v, ok := req["max_output_tokens"]; ok { + mapped["max_tokens"] = v + } + + // Map tools and tool_choice if present + if v, ok := req["tools"]; ok { + mapped["tools"] = v + } + if v, ok := req["tool_choice"]; ok { + mapped["tool_choice"] = v + } + + return json.Marshal(mapped) +} + +// mapChatCompletionToResponses converts an OpenAI ChatCompletion JSON +// into a minimal Responses API JSON (non-streaming only) for PR1. 
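+//
+// Illustrative mapping: a chat.completion whose first choice has content "hi" and
+// finish_reason "stop" becomes roughly
+//
+//	{"object":"response","output":[{"type":"message","role":"assistant","content":"hi"}],
+//	 "stop_reason":"stop","usage":{"input_tokens":1,"output_tokens":1,"total_tokens":2}}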
+func mapChatCompletionToResponses(chatCompletionJSON []byte) ([]byte, error) { + var parsed struct { + ID string `json:"id"` + Object string `json:"object"` + Created int64 `json:"created"` + Model string `json:"model"` + Choices []struct { + Index int `json:"index"` + FinishReason string `json:"finish_reason"` + Message struct { + Role string `json:"role"` + Content string `json:"content"` + } `json:"message"` + } `json:"choices"` + Usage struct { + PromptTokens int `json:"prompt_tokens"` + CompletionTokens int `json:"completion_tokens"` + TotalTokens int `json:"total_tokens"` + } `json:"usage"` + } + if err := json.Unmarshal(chatCompletionJSON, &parsed); err != nil { + return nil, err + } + + // Also parse generically to inspect tool calls + var generic map[string]interface{} + _ = json.Unmarshal(chatCompletionJSON, &generic) + + var output []map[string]interface{} + if len(parsed.Choices) > 0 && parsed.Choices[0].Message.Content != "" { + output = append(output, map[string]interface{}{ + "type": "message", + "role": "assistant", + "content": parsed.Choices[0].Message.Content, + }) + } + + // Modern tool_calls + if chs, ok := generic["choices"].([]interface{}); ok && len(chs) > 0 { + if ch, ok := chs[0].(map[string]interface{}); ok { + if msg, ok := ch["message"].(map[string]interface{}); ok { + if tcs, ok := msg["tool_calls"].([]interface{}); ok { + for _, tci := range tcs { + if tc, ok := tci.(map[string]interface{}); ok { + name := "" + args := "" + if fn, ok := tc["function"].(map[string]interface{}); ok { + if n, ok := fn["name"].(string); ok { + name = n + } + if a, ok := fn["arguments"].(string); ok { + args = a + } + } + output = append(output, map[string]interface{}{ + "type": "tool_call", + "tool_name": name, + "arguments": args, + }) + } + } + } + // Legacy function_call + if fc, ok := msg["function_call"].(map[string]interface{}); ok { + name := "" + args := "" + if n, ok := fc["name"].(string); ok { + name = n + } + if a, ok := fc["arguments"].(string); ok { + args = a + } + output = append(output, map[string]interface{}{ + "type": "tool_call", + "tool_name": name, + "arguments": args, + }) + } + } + } + } + + stopReason := "stop" + if len(parsed.Choices) > 0 && parsed.Choices[0].FinishReason != "" { + stopReason = parsed.Choices[0].FinishReason + } + + out := map[string]interface{}{ + "id": parsed.ID, + "object": "response", + "created": parsed.Created, + "model": parsed.Model, + "output": output, + "stop_reason": stopReason, + "usage": map[string]int{ + "input_tokens": parsed.Usage.PromptTokens, + "output_tokens": parsed.Usage.CompletionTokens, + "total_tokens": parsed.Usage.TotalTokens, + }, + } + + return json.Marshal(out) +} + +// translateSSEChunkToResponses converts a single OpenAI chat.completion.chunk SSE payload +// (the JSON after "data: ") into Responses SSE events (delta/stop). Returns empty when not applicable. 
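+//
+// Illustrative translation: a chunk with "created":1 whose delta carries content "Hi" yields
+//
+//	{"type":"response.output_text.delta","delta":"Hi","created":1}
+//
+// and a chunk carrying finish_reason "stop" additionally yields
+//
+//	{"type":"response.completed","stop_reason":"stop"}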
+func translateSSEChunkToResponses(chunk []byte) ([][]byte, bool) { + // Expect chunk JSON like {"id":"...","object":"chat.completion.chunk","created":...,"model":"...","choices":[{"index":0,"delta":{"role":"assistant","content":"..."},"finish_reason":null}]} + var parsed map[string]interface{} + if err := json.Unmarshal(chunk, &parsed); err != nil { + return nil, false + } + if parsed["object"] != "chat.completion.chunk" { + return nil, false + } + + created, _ := parsed["created"].(float64) + // Emit a created event only once per stream (handled by caller) + + // Extract content delta, tool call deltas, and finish_reason + var deltaText string + var finish string + var toolEvents [][]byte + if arr, ok := parsed["choices"].([]interface{}); ok && len(arr) > 0 { + if ch, ok := arr[0].(map[string]interface{}); ok { + if fr, ok := ch["finish_reason"].(string); ok && fr != "" { + finish = fr + } + if d, ok := ch["delta"].(map[string]interface{}); ok { + if c, ok := d["content"].(string); ok { + deltaText = c + } + if tcs, ok := d["tool_calls"].([]interface{}); ok { + for _, tci := range tcs { + if tc, ok := tci.(map[string]interface{}); ok { + ev := map[string]interface{}{"type": "response.tool_calls.delta"} + if idx, ok := tc["index"].(float64); ok { + ev["index"] = int(idx) + } + if fn, ok := tc["function"].(map[string]interface{}); ok { + if n, ok := fn["name"].(string); ok && n != "" { + ev["name"] = n + } + if a, ok := fn["arguments"].(string); ok && a != "" { + ev["arguments_delta"] = a + } + } + b, _ := json.Marshal(ev) + toolEvents = append(toolEvents, b) + } + } + } + if fc, ok := d["function_call"].(map[string]interface{}); ok { + ev := map[string]interface{}{"type": "response.tool_calls.delta"} + if n, ok := fc["name"].(string); ok && n != "" { + ev["name"] = n + } + if a, ok := fc["arguments"].(string); ok && a != "" { + ev["arguments_delta"] = a + } + b, _ := json.Marshal(ev) + toolEvents = append(toolEvents, b) + } + } + } + } + + var events [][]byte + if len(toolEvents) > 0 { + events = append(events, toolEvents...) 
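+		// Tool-call deltas from this chunk are emitted ahead of any text delta.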
+ } + if deltaText != "" { + ev := map[string]interface{}{ + "type": "response.output_text.delta", + "delta": deltaText, + } + if created > 0 { + ev["created"] = int64(created) + } + b, _ := json.Marshal(ev) + events = append(events, b) + } + + if finish != "" { + ev := map[string]interface{}{ + "type": "response.completed", + "stop_reason": finish, + } + b, _ := json.Marshal(ev) + events = append(events, b) + } + + if len(events) == 0 { + return nil, false + } + return events, true +} diff --git a/src/semantic-router/pkg/extproc/mapping_responses_test.go b/src/semantic-router/pkg/extproc/mapping_responses_test.go new file mode 100644 index 000000000..f09c7fa34 --- /dev/null +++ b/src/semantic-router/pkg/extproc/mapping_responses_test.go @@ -0,0 +1,179 @@ +package extproc + +import ( + "encoding/json" + "os" + "testing" +) + +func TestMapResponsesRequestToChatCompletions_TextInput(t *testing.T) { + in := []byte(`{"model":"gpt-test","input":"Hello world","temperature":0.2,"top_p":0.9,"max_output_tokens":128}`) + out, err := mapResponsesRequestToChatCompletions(in) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + var m map[string]interface{} + if err := json.Unmarshal(out, &m); err != nil { + t.Fatalf("unmarshal mapped: %v", err) + } + if m["model"].(string) != "gpt-test" { + t.Fatalf("model not mapped") + } + if _, ok := m["messages"].([]interface{}); !ok { + t.Fatalf("messages missing") + } +} + +func TestMapChatCompletionToResponses_Minimal(t *testing.T) { + in := []byte(`{ + "id":"chatcmpl-1","object":"chat.completion","created":123,"model":"gpt-test", + "choices":[{"index":0,"finish_reason":"stop","message":{"role":"assistant","content":"hi"}}], + "usage":{"prompt_tokens":1,"completion_tokens":1,"total_tokens":2} + }`) + out, err := mapChatCompletionToResponses(in) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + var m map[string]interface{} + if err := json.Unmarshal(out, &m); err != nil { + t.Fatalf("unmarshal mapped: %v", err) + } + if m["object"].(string) != "response" { + t.Fatalf("object not 'response'") + } + if m["stop_reason"].(string) == "" { + t.Fatalf("stop_reason missing") + } +} + +func TestTranslateSSEChunkToResponses(t *testing.T) { + chunk := []byte(`{"id":"c1","object":"chat.completion.chunk","created":1,"model":"m","choices":[{"index":0,"delta":{"role":"assistant","content":"Hi"},"finish_reason":null}]}`) + evs, ok := translateSSEChunkToResponses(chunk) + if !ok || len(evs) == 0 { + t.Fatalf("expected events") + } +} + +func TestMapResponsesRequestToChatCompletions_ToolsPassThrough(t *testing.T) { + in := []byte(`{ + "model":"gpt-test", + "input":"call a tool", + "tools":[{"type":"function","function":{"name":"get_time","parameters":{"type":"object","properties":{}}}}], + "tool_choice":"auto" + }`) + out, err := mapResponsesRequestToChatCompletions(in) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + var m map[string]interface{} + if err := json.Unmarshal(out, &m); err != nil { + t.Fatalf("unmarshal mapped: %v", err) + } + if _, ok := m["tools"]; !ok { + t.Fatalf("tools not passed through") + } + if v, ok := m["tool_choice"]; !ok || v == nil { + t.Fatalf("tool_choice not passed through") + } +} + +func TestMapChatCompletionToResponses_ToolCallsModern(t *testing.T) { + if os.Getenv("SKIP_TOOL_CALL_TESTS") == "true" { + t.Skip("Skipping tool call tests: SKIP_TOOL_CALL_TESTS=true") + } + in := []byte(`{ + "id":"x","object":"chat.completion","created":2,"model":"m", + "choices":[{"index":0,"finish_reason":"stop","message":{ + 
"role":"assistant", + "content":"", + "tool_calls":[{"type":"function","function":{"name":"get_time","arguments":"{\\\"tz\\\":\\\"UTC\\\"}"}}] + }}], + "usage":{"prompt_tokens":1,"completion_tokens":1,"total_tokens":2} + }`) + out, err := mapChatCompletionToResponses(in) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + var m map[string]interface{} + if err := json.Unmarshal(out, &m); err != nil { + t.Fatalf("unmarshal: %v", err) + } + outs, _ := m["output"].([]interface{}) + if len(outs) == 0 { + t.Fatalf("expected output entries") + } + var hasTool bool + for _, o := range outs { + om := o.(map[string]interface{}) + if om["type"] == "tool_call" { + hasTool = true + } + } + if !hasTool { + t.Fatalf("expected tool_call in output") + } +} + +func TestMapChatCompletionToResponses_FunctionCallLegacy(t *testing.T) { + if os.Getenv("SKIP_TOOL_CALL_TESTS") == "true" { + t.Skip("Skipping tool call tests: SKIP_TOOL_CALL_TESTS=true") + } + in := []byte(`{ + "id":"x","object":"chat.completion","created":2,"model":"m", + "choices":[{"index":0,"finish_reason":"stop","message":{ + "role":"assistant", + "content":"", + "function_call":{"name":"get_time","arguments":"{\\\"tz\\\":\\\"UTC\\\"}"} + }}], + "usage":{"prompt_tokens":1,"completion_tokens":1,"total_tokens":2} + }`) + out, err := mapChatCompletionToResponses(in) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + var m map[string]interface{} + if err := json.Unmarshal(out, &m); err != nil { + t.Fatalf("unmarshal: %v", err) + } + outs, _ := m["output"].([]interface{}) + var hasTool bool + for _, o := range outs { + if om, ok := o.(map[string]interface{}); ok && om["type"] == "tool_call" { + hasTool = true + } + } + if !hasTool { + t.Fatalf("expected legacy function tool_call in output") + } +} + +func TestTranslateSSEChunkToResponses_ToolCallsDelta(t *testing.T) { + if os.Getenv("SKIP_TOOL_CALL_TESTS") == "true" { + t.Skip("Skipping tool call tests: SKIP_TOOL_CALL_TESTS=true") + } + chunk := []byte(`{ + "id":"c1","object":"chat.completion.chunk","created":1, + "model":"m", + "choices":[{"index":0, + "delta":{ + "tool_calls":[{"index":0,"function":{"name":"get_time","arguments":"{\\\"tz\\\":\\\"UTC\\\"}"}}] + }, + "finish_reason":null + }] + }`) + evs, ok := translateSSEChunkToResponses(chunk) + if !ok || len(evs) == 0 { + t.Fatalf("expected events for tool_calls delta") + } + var hasToolDelta bool + for _, ev := range evs { + var m map[string]interface{} + _ = json.Unmarshal(ev, &m) + if m["type"] == "response.tool_calls.delta" { + hasToolDelta = true + } + } + if !hasToolDelta { + t.Fatalf("expected response.tool_calls.delta event") + } +} diff --git a/src/semantic-router/pkg/extproc/request_handler.go b/src/semantic-router/pkg/extproc/request_handler.go index d2482f934..71a460352 100644 --- a/src/semantic-router/pkg/extproc/request_handler.go +++ b/src/semantic-router/pkg/extproc/request_handler.go @@ -249,6 +249,9 @@ type RequestContext struct { TTFTRecorded bool TTFTSeconds float64 + // Responses SSE translation state + ResponsesStreamInit bool + // VSR decision tracking VSRSelectedCategory string // The category selected by VSR VSRReasoningMode string // "on" or "off" - whether reasoning mode was determined to be used @@ -329,6 +332,48 @@ func (r *OpenAIRouter) handleRequestHeaders(v *ext_proc.ProcessingRequest_Reques return r.handleModelsRequest(path) } + // Responses adapter: detect POST /v1/responses and gate by feature flag + if method == "POST" && strings.HasPrefix(path, "/v1/responses") { + if r.Config == nil 
|| !r.Config.EnableResponsesAdapter { + observability.Warnf("/v1/responses requested but adapter disabled") + return r.createErrorResponse(404, "Responses API not enabled"), nil + } + + // Metrics: record that adapter is handling this request + metrics.ResponsesAdapterRequests.WithLabelValues("false").Inc() + + // Prepare header mutation to rewrite :path to legacy chat completions + // Actual body mapping occurs in handleRequestBody + newPath := strings.Replace(path, "/v1/responses", "/v1/chat/completions", 1) + + headerMutation := &ext_proc.HeaderMutation{ + // Remove content-length because body will be mutated later + RemoveHeaders: []string{"content-length"}, + SetHeaders: []*core.HeaderValueOption{ + { + Header: &core.HeaderValue{ + Key: ":path", + RawValue: []byte(newPath), + }, + }, + }, + } + + response := &ext_proc.ProcessingResponse{ + Response: &ext_proc.ProcessingResponse_RequestHeaders{ + RequestHeaders: &ext_proc.HeadersResponse{ + Response: &ext_proc.CommonResponse{ + Status: ext_proc.CommonResponse_CONTINUE, + HeaderMutation: headerMutation, + }, + }, + }, + } + + observability.Infof("Rewriting /v1/responses to %s (headers phase)", newPath) + return response, nil + } + // Prepare base response response := &ext_proc.ProcessingResponse{ Response: &ext_proc.ProcessingResponse_RequestHeaders{ @@ -363,13 +408,28 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo ctx.ExpectStreamingResponse = true // Set this if stream param is found } + // If path was /v1/responses and adapter enabled, map request JSON to ChatCompletion + if r.Config != nil && r.Config.EnableResponsesAdapter { + if p, ok := ctx.Headers[":path"]; ok && strings.HasPrefix(p, "/v1/responses") { + mapped, err := mapResponsesRequestToChatCompletions(ctx.OriginalRequestBody) + if err != nil { + observability.Errorf("Responses→Chat mapping failed: %v", err) + metrics.RecordRequestError(ctx.RequestModel, "parse_error") + return r.createErrorResponse(400, "Invalid /v1/responses payload"), nil + } + + // Replace original body with mapped body for downstream processing + ctx.OriginalRequestBody = mapped + + // No-op for Accept header here; downstream content negotiation remains unchanged + } + } + // Parse the OpenAI request using SDK types openAIRequest, err := parseOpenAIRequest(ctx.OriginalRequestBody) if err != nil { observability.Errorf("Error parsing OpenAI request: %v", err) - // Attempt to determine model for labeling (may be unknown here) metrics.RecordRequestError(ctx.RequestModel, "parse_error") - // Count this request as well, with unknown model if necessary metrics.RecordModelRequest(ctx.RequestModel) return nil, status.Errorf(codes.InvalidArgument, "invalid request body: %v", err) } diff --git a/src/semantic-router/pkg/extproc/response_handler.go b/src/semantic-router/pkg/extproc/response_handler.go index ab5fc4fe1..43f47c3be 100644 --- a/src/semantic-router/pkg/extproc/response_handler.go +++ b/src/semantic-router/pkg/extproc/response_handler.go @@ -198,7 +198,62 @@ func (r *OpenAIRouter) handleResponseBody(v *ext_proc.ProcessingRequest_Response } } - // For streaming chunks, just continue (no token parsing or cache update) + // If Responses adapter is active for this request, translate SSE chunks + if r.Config != nil && r.Config.EnableResponsesAdapter { + if p, ok := ctx.Headers[":path"]; ok && strings.HasPrefix(p, "/v1/responses") { + body := v.ResponseBody.Body + // Envoy provides raw chunk bytes, typically like: "data: {json}\n\n" or "data: [DONE]\n\n" + b := string(body) + 
if strings.Contains(b, "[DONE]") { + // Emit a final response.completed if not already concluded + response := &ext_proc.ProcessingResponse{ + Response: &ext_proc.ProcessingResponse_ResponseBody{ + ResponseBody: &ext_proc.BodyResponse{ + Response: &ext_proc.CommonResponse{Status: ext_proc.CommonResponse_CONTINUE}, + }, + }, + } + metrics.ResponsesAdapterSSEEvents.WithLabelValues("response.completed").Inc() + return response, nil + } + + // Extract JSON after "data: " prefix if present + idx := strings.Index(b, "data:") + var payload []byte + if idx >= 0 { + payload = []byte(strings.TrimSpace(b[idx+5:])) + } else { + payload = v.ResponseBody.Body + } + + if len(payload) > 0 && payload[0] == '{' { + if !ctx.ResponsesStreamInit { + // Emit an initial created event on first chunk + ctx.ResponsesStreamInit = true + // We don't inject a new chunk here; clients will see deltas below + } + events, ok := translateSSEChunkToResponses(payload) + if ok && len(events) > 0 { + // Rebuild body as multiple SSE events in Responses format + var sb strings.Builder + for _, ev := range events { + sb.WriteString("data: ") + sb.Write(ev) + sb.WriteString("\n\n") + // Inspect the event type for metrics + var et map[string]interface{} + if err := json.Unmarshal(ev, &et); err == nil { + if t, _ := et["type"].(string); t != "" { + metrics.ResponsesAdapterSSEEvents.WithLabelValues(t).Inc() + } + } + } + v.ResponseBody.Body = []byte(sb.String()) + } + } + } + } + response := &ext_proc.ProcessingResponse{ Response: &ext_proc.ProcessingResponse_ResponseBody{ ResponseBody: &ext_proc.BodyResponse{ @@ -211,6 +266,27 @@ func (r *OpenAIRouter) handleResponseBody(v *ext_proc.ProcessingRequest_Response return response, nil } + // If this was a /v1/responses request (adapter path), remap non-stream body to Responses JSON + if r.Config != nil && r.Config.EnableResponsesAdapter { + if p, ok := ctx.Headers[":path"]; ok && strings.HasPrefix(p, "/v1/responses") { + mapped, err := mapChatCompletionToResponses(responseBody) + if err == nil { + // Replace upstream JSON with Responses JSON + v.ResponseBody.Body = mapped + // Ensure content-type remains application/json + return &ext_proc.ProcessingResponse{ + Response: &ext_proc.ProcessingResponse_ResponseBody{ + ResponseBody: &ext_proc.BodyResponse{ + Response: &ext_proc.CommonResponse{ + Status: ext_proc.CommonResponse_CONTINUE, + }, + }, + }, + }, nil + } + } + } + // Parse tokens from the response JSON using OpenAI SDK types var parsed openai.ChatCompletion if err := json.Unmarshal(responseBody, &parsed); err != nil { diff --git a/src/semantic-router/pkg/metrics/metrics.go b/src/semantic-router/pkg/metrics/metrics.go index 50fdd6376..9725cd5d8 100644 --- a/src/semantic-router/pkg/metrics/metrics.go +++ b/src/semantic-router/pkg/metrics/metrics.go @@ -380,6 +380,24 @@ var ( }, []string{"fallback_reason", "fallback_strategy"}, ) + + // ResponsesAdapterRequests counts requests handled via the Responses adapter + ResponsesAdapterRequests = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "llm_responses_adapter_requests_total", + Help: "Total number of /v1/responses requests handled by the adapter", + }, + []string{"streaming"}, + ) + + // ResponsesAdapterSSEEvents counts emitted Responses SSE events during translation + ResponsesAdapterSSEEvents = promauto.NewCounterVec( + prometheus.CounterOpts{ + Name: "llm_responses_adapter_sse_events_total", + Help: "Total number of Responses SSE events emitted by the adapter", + }, + []string{"event_type"}, + ) ) // RecordModelRequest 
increments the counter for requests to a specific model diff --git a/website/docs/api/router.md b/website/docs/api/router.md index 2fe59a492..3bfd6b248 100644 --- a/website/docs/api/router.md +++ b/website/docs/api/router.md @@ -11,17 +11,21 @@ The Semantic Router operates as an ExtProc server that processes HTTP requests t ### Ports and endpoint mapping - 8801 (HTTP, Envoy public entry) + - Typical client entry for OpenAI-compatible requests like `POST /v1/chat/completions`. - Can proxy `GET /v1/models` to Router 8080 if you add an Envoy route; otherwise `/v1/models` at 8801 may return “no healthy upstream”. + - Experimental: `POST /v1/responses` supported via compatibility adapter when `enable_responses_adapter: true` in config. Text-only inputs are mapped to legacy Chat Completions under the hood; streaming maps will be added in a future release. - 8080 (HTTP, Classification API) - - `GET /v1/models` → OpenAI-compatible model list (includes synthetic `MoM`) - - `GET /health` → Classification API health + + - `GET /v1/models` → OpenAI-compatible model list (includes synthetic `MoM`) + - `GET /health` → Classification API health - `GET /info/models` → Loaded classifier models + system info - `GET /info/classifier` → Classifier configuration details - `POST /api/v1/classify/intent|pii|security|batch` → Direct classification utilities - 50051 (gRPC, ExtProc) + - Envoy External Processing (ExtProc) for in-path classification/routing of `/v1/chat/completions`. - Not an HTTP port; not directly accessible via curl. @@ -36,7 +40,7 @@ sequenceDiagram participant Envoy participant Router participant Backend - + Client->>Envoy: POST /v1/chat/completions Envoy->>Router: ExtProc Request Router->>Router: Classify & Route @@ -63,9 +67,24 @@ Lists available models and includes a synthetic "MoM" (Mixture of Models) model { "object": "list", "data": [ - { "id": "MoM", "object": "model", "created": 1726890000, "owned_by": "semantic-router" }, - { "id": "gpt-4o-mini", "object": "model", "created": 1726890000, "owned_by": "upstream-endpoint" }, - { "id": "llama-3.1-8b-instruct", "object": "model", "created": 1726890000, "owned_by": "upstream-endpoint" } + { + "id": "MoM", + "object": "model", + "created": 1726890000, + "owned_by": "semantic-router" + }, + { + "id": "gpt-4o-mini", + "object": "model", + "created": 1726890000, + "owned_by": "upstream-endpoint" + }, + { + "id": "llama-3.1-8b-instruct", + "object": "model", + "created": 1726890000, + "owned_by": "upstream-endpoint" + } ] } ``` @@ -86,7 +105,7 @@ Notes: "model": "gpt-3.5-turbo", "messages": [ { - "role": "user", + "role": "user", "content": "What is the derivative of x^2?" 
} ], @@ -109,7 +128,7 @@ Notes: ```json { "id": "chatcmpl-abc123", - "object": "chat.completion", + "object": "chat.completion", "created": 1677858242, "model": "gpt-3.5-turbo", "choices": [ @@ -146,22 +165,22 @@ The router adds metadata headers to both requests and responses: ### Request Headers (Added by Router) -| Header | Description | Example | -|--------|-------------|---------| -| `x-gateway-destination-endpoint` | Backend endpoint selected | `endpoint1` | -| `x-selected-model` | Model category determined | `mathematics` | -| `x-routing-confidence` | Classification confidence | `0.956` | -| `x-request-id` | Unique request identifier | `req-abc123` | -| `x-cache-status` | Cache hit/miss status | `miss` | +| Header | Description | Example | +| -------------------------------- | ------------------------- | ------------- | +| `x-gateway-destination-endpoint` | Backend endpoint selected | `endpoint1` | +| `x-selected-model` | Model category determined | `mathematics` | +| `x-routing-confidence` | Classification confidence | `0.956` | +| `x-request-id` | Unique request identifier | `req-abc123` | +| `x-cache-status` | Cache hit/miss status | `miss` | ### Response Headers (Added by Router) -| Header | Description | Example | -|--------|-------------|---------| -| `x-processing-time` | Total processing time (ms) | `45` | -| `x-classification-time` | Classification time (ms) | `12` | -| `x-security-checks` | Security check results | `pii:false,jailbreak:false` | -| `x-tools-selected` | Number of tools selected | `2` | +| Header | Description | Example | +| ----------------------- | -------------------------- | --------------------------- | +| `x-processing-time` | Total processing time (ms) | `45` | +| `x-classification-time` | Classification time (ms) | `12` | +| `x-security-checks` | Security check results | `pii:false,jailbreak:false` | +| `x-tools-selected` | Number of tools selected | `2` | ## Health Check API @@ -178,7 +197,7 @@ The router provides health check endpoints for monitoring: "uptime": 3600, "models": { "category_classifier": "loaded", - "pii_detector": "loaded", + "pii_detector": "loaded", "jailbreak_guard": "loaded" }, "cache": { @@ -188,7 +207,7 @@ The router provides health check endpoints for monitoring: }, "endpoints": { "endpoint1": "healthy", - "endpoint2": "healthy", + "endpoint2": "healthy", "endpoint3": "degraded" } } @@ -211,7 +230,7 @@ semantic_router_request_duration_seconds{endpoint="endpoint1"} 0.045 semantic_router_classification_accuracy{category="mathematics"} 0.94 semantic_router_classification_duration_seconds 0.012 -# Cache metrics +# Cache metrics semantic_router_cache_hit_ratio 0.73 semantic_router_cache_size 1247 @@ -231,6 +250,7 @@ llm_request_errors_total{model="phi4",reason="pii_policy_denied"} 8 The router exposes dedicated Prometheus counters to monitor reasoning mode decisions and template usage across model families. These metrics are emitted by the router and can be scraped by your Prometheus server. - `llm_reasoning_decisions_total{category, model, enabled, effort}` + - Description: Count of reasoning decisions made per category and selected model, with whether reasoning was enabled and the applied effort level. 
- Labels: - category: category name determined during routing @@ -239,6 +259,7 @@ The router exposes dedicated Prometheus counters to monitor reasoning mode decis - effort: effort level used when enabled (e.g., low|medium|high) - `llm_reasoning_template_usage_total{family, param}` + - Description: Count of times a model-family-specific template parameter was applied to requests. - Labels: - family: normalized model family (e.g., qwen3, deepseek, gpt-oss, gpt) @@ -274,6 +295,7 @@ sum by (family, effort) ( The router exposes additional metrics for cost accounting and routing decisions. - `llm_model_cost_total{model, currency}` + - Description: Total accumulated cost attributed to each model (computed from token usage and per-1M pricing), labeled by currency. - Labels: - model: model name used for the request @@ -383,7 +405,7 @@ model_config: Notes: - Pricing is optional; if omitted, cost is treated as 0 and only token metrics are emitted. -- Cost is computed as: (prompt_tokens * prompt_per_1m + completion_tokens * completion_per_1m) / 1_000_000 (in the configured currency). +- Cost is computed as: (prompt_tokens _ prompt_per_1m + completion_tokens _ completion_per_1m) / 1_000_000 (in the configured currency). ## gRPC ExtProc API @@ -463,14 +485,14 @@ func (r *Router) handleRequestBody(body *ProcessingRequest_RequestBody) *Process ### HTTP Status Codes -| Status | Description | -|--------|-------------| -| 200 | Success | -| 400 | Bad Request (malformed input) | -| 403 | Forbidden (security violation) | -| 429 | Too Many Requests (rate limited) | -| 500 | Internal Server Error | -| 503 | Service Unavailable (backend down) | +| Status | Description | +| ------ | ---------------------------------- | +| 200 | Success | +| 400 | Bad Request (malformed input) | +| 403 | Forbidden (security violation) | +| 429 | Too Many Requests (rate limited) | +| 500 | Internal Server Error | +| 503 | Service Unavailable (backend down) | ## Configuration API @@ -499,15 +521,17 @@ For real-time streaming responses: **Endpoint:** `ws://localhost:8801/v1/chat/stream` ```javascript -const ws = new WebSocket('ws://localhost:8801/v1/chat/stream'); +const ws = new WebSocket("ws://localhost:8801/v1/chat/stream"); -ws.send(JSON.stringify({ - "model": "gpt-3.5-turbo", - "messages": [{"role": "user", "content": "Tell me a story"}], - "stream": true -})); +ws.send( + JSON.stringify({ + model: "gpt-3.5-turbo", + messages: [{ role: "user", content: "Tell me a story" }], + stream: true, + }) +); -ws.onmessage = function(event) { +ws.onmessage = function (event) { const chunk = JSON.parse(event.data); console.log(chunk.choices[0].delta.content); }; @@ -523,7 +547,7 @@ import requests class SemanticRouterClient: def __init__(self, base_url="http://localhost:8801"): self.base_url = base_url - + def chat_completion(self, messages, model="gpt-3.5-turbo", **kwargs): response = requests.post( f"{self.base_url}/v1/chat/completions", @@ -534,7 +558,7 @@ class SemanticRouterClient: } ) return response.json() - + def get_health(self): response = requests.get(f"{self.base_url}/health") return response.json() @@ -550,36 +574,36 @@ result = client.chat_completion([ ```javascript class SemanticRouterClient { - constructor(baseUrl = 'http://localhost:8801') { - this.baseUrl = baseUrl; - } - - async chatCompletion(messages, model = 'gpt-3.5-turbo', options = {}) { - const response = await fetch(`${this.baseUrl}/v1/chat/completions`, { - method: 'POST', - headers: { - 'Content-Type': 'application/json' - }, - body: JSON.stringify({ - model, - 
messages, - ...options - }) - }); - - return response.json(); - } - - async getHealth() { - const response = await fetch(`${this.baseUrl}/health`); - return response.json(); - } + constructor(baseUrl = "http://localhost:8801") { + this.baseUrl = baseUrl; + } + + async chatCompletion(messages, model = "gpt-3.5-turbo", options = {}) { + const response = await fetch(`${this.baseUrl}/v1/chat/completions`, { + method: "POST", + headers: { + "Content-Type": "application/json", + }, + body: JSON.stringify({ + model, + messages, + ...options, + }), + }); + + return response.json(); + } + + async getHealth() { + const response = await fetch(`${this.baseUrl}/health`); + return response.json(); + } } // Usage const client = new SemanticRouterClient(); const result = await client.chatCompletion([ - { role: 'user', content: 'Solve x^2 + 5x + 6 = 0' } + { role: "user", content: "Solve x^2 + 5x + 6 = 0" }, ]); ``` @@ -621,7 +645,7 @@ X-RateLimit-Retry-After: 60 # Include relevant context messages = [ { - "role": "system", + "role": "system", "content": "You are a mathematics tutor." }, { @@ -651,7 +675,7 @@ try: handle_router_error(response['error']) else: process_response(response) - + except requests.exceptions.Timeout: handle_timeout_error() except requests.exceptions.ConnectionError: