Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/test-and-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -84,12 +84,13 @@ jobs:
run: make download-models

- name: Run semantic router tests
run: make test
run: make test --debug=v
env:
CI: true
CI_MINIMAL_MODELS: ${{ github.event_name == 'pull_request' }}
CGO_ENABLED: 1
LD_LIBRARY_PATH: ${{ github.workspace }}/candle-binding/target/release
SKIP_TOOL_CALL_TESTS: true

- name: Upload test artifacts on failure
if: failure()
Expand Down
19 changes: 11 additions & 8 deletions config/config.development.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ categories:

default_model: test-model

# Enable OpenAI Responses API adapter (experimental)
enable_responses_adapter: true

# Auto model name for automatic model selection (optional)
# Uncomment and set to customize the model name for automatic routing
# auto_model_name: "MoM"
Expand All @@ -75,31 +78,31 @@ observability:
tracing:
# Enable tracing for development/debugging
enabled: true

# OpenTelemetry provider
provider: "opentelemetry"

exporter:
# Stdout exporter prints traces to console (great for debugging)
type: "stdout"

# No endpoint needed for stdout
# endpoint: ""
# insecure: true

sampling:
# Always sample in development to see all traces
type: "always_on"

# Rate not used for always_on
# rate: 1.0

resource:
# Service name for trace identification
service_name: "vllm-semantic-router-dev"

# Version for development
service_version: "dev"

# Environment identifier
deployment_environment: "development"
14 changes: 4 additions & 10 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,16 +24,7 @@ semantic_cache:
# Options: "bert" (fast, 384-dim), "qwen3" (high quality, 1024-dim, 32K context), "gemma" (balanced, 768-dim, 8K context)
# Default: "bert" (fastest, lowest memory)
embedding_model: "bert"
# HNSW index configuration (for memory backend only)
use_hnsw: true # Enable HNSW index for faster similarity search
hnsw_m: 16 # Number of bi-directional links (higher = better recall, more memory)
hnsw_ef_construction: 200 # Construction parameter (higher = better quality, slower build)

# Hybrid cache configuration (when backend_type: "hybrid")
# Combines in-memory HNSW for fast search with Milvus for scalable storage
# max_memory_entries: 100000 # Max entries in HNSW index (default: 100,000)
# backend_config_path: "config/milvus.yaml" # Path to Milvus config

tools:
enabled: true
top_k: 3
Expand Down Expand Up @@ -223,7 +214,7 @@ router:
traditional_attention_dropout_prob: 0.1 # Traditional model attention dropout probability
tie_break_confidence: 0.5 # Confidence value for tie-breaking situations

default_model: openai/gpt-oss-20b
default_model: qwen3

# Reasoning family configurations
reasoning_families:
Expand All @@ -245,6 +236,9 @@ reasoning_families:
# Global default reasoning effort level
default_reasoning_effort: high

# Enable OpenAI Responses API adapter (experimental)
enable_responses_adapter: true

# API Configuration
api:
batch_classification:
Expand Down
Empty file removed dashboard/backend/.gitkeep
Empty file.
Loading
Loading