vllm-project · JaredforReal · Oct 27, 2025 · Oct 27, 2025 · Oct 27, 2025 · Oct 27, 2025
@@ -84,12 +84,13 @@ jobs:
         run: make download-models
 
       - name: Run semantic router tests
-        run: make test
+        run: make test --debug=v
         env:
           CI: true
           CI_MINIMAL_MODELS: ${{ github.event_name == 'pull_request' }}
           CGO_ENABLED: 1
           LD_LIBRARY_PATH: ${{ github.workspace }}/candle-binding/target/release
+          SKIP_TOOL_CALL_TESTS: true
 
       - name: Upload test artifacts on failure
         if: failure()

@@ -60,6 +60,9 @@ categories:
 
 default_model: test-model
 
+# Enable OpenAI Responses API adapter (experimental)
+enable_responses_adapter: true
+
 # Auto model name for automatic model selection (optional)
 # Uncomment and set to customize the model name for automatic routing
 # auto_model_name: "MoM"
@@ -75,31 +78,31 @@ observability:
   tracing:
     # Enable tracing for development/debugging
     enabled: true
-    
+
     # OpenTelemetry provider
     provider: "opentelemetry"
-    
+
     exporter:
       # Stdout exporter prints traces to console (great for debugging)
       type: "stdout"
-      
+
       # No endpoint needed for stdout
       # endpoint: ""
       # insecure: true
-    
+
     sampling:
       # Always sample in development to see all traces
       type: "always_on"
-      
+
       # Rate not used for always_on
       # rate: 1.0
-    
+
     resource:
       # Service name for trace identification
       service_name: "vllm-semantic-router-dev"
-      
+
       # Version for development
       service_version: "dev"
-      
+
       # Environment identifier
       deployment_environment: "development"
@@ -24,16 +24,7 @@ semantic_cache:
   # Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
   # Default: "bert" (fastest, lowest memory)
   embedding_model: "bert"
-  # HNSW index configuration (for memory backend only)
-  use_hnsw: true # Enable HNSW index for faster similarity search
-  hnsw_m: 16 # Number of bi-directional links (higher = better recall, more memory)
-  hnsw_ef_construction: 200 # Construction parameter (higher = better quality, slower build)
 
-  # Hybrid cache configuration (when backend_type: "hybrid")
-  # Combines in-memory HNSW for fast search with Milvus for scalable storage
-  # max_memory_entries: 100000 # Max entries in HNSW index (default: 100,000)
-  # backend_config_path: "config/milvus.yaml" # Path to Milvus config
-
 tools:
   enabled: true
   top_k: 3
@@ -223,7 +214,7 @@ router:
   traditional_attention_dropout_prob: 0.1          # Traditional model attention dropout probability
   tie_break_confidence: 0.5                        # Confidence value for tie-breaking situations
 
-default_model: openai/gpt-oss-20b
+default_model: qwen3
 
 # Reasoning family configurations
 reasoning_families:
@@ -245,6 +236,9 @@ reasoning_families:
 # Global default reasoning effort level
 default_reasoning_effort: high
 
+# Enable OpenAI Responses API adapter (experimental)
+enable_responses_adapter: true
+
 # API Configuration
 api:
   batch_classification: