Merged · Changes from 4 commits
config/config.development.yaml (3 additions, 0 deletions)
@@ -47,6 +47,9 @@ classifier:
categories:
- name: test
system_prompt: "You are a test assistant."
# Example: Category-level cache settings
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.85
model_scores:
- model: test-model
score: 1.0
config/config.e2e.yaml (9 additions, 0 deletions)
@@ -107,6 +107,9 @@ categories:
score: 0.4
use_reasoning: false
- name: psychology
# Example: Strict cache threshold for psychology - clinical nuances matter
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.92
model_scores:
- model: "Model-A"
score: 0.6
@@ -156,6 +159,9 @@ categories:
score: 0.4
use_reasoning: false
- name: other
# Example: Lower threshold for general queries - better cache hit rate
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.75
model_scores:
- model: "Model-B"
score: 0.8
@@ -168,6 +174,9 @@ categories:
score: 0.6
use_reasoning: false
- name: health
# Example: Very strict cache threshold for health - word changes matter medically
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.95
model_scores:
- model: "Model-B"
score: 0.8
config/config.production.yaml (6 additions, 0 deletions)
@@ -60,12 +60,18 @@ classifier:
categories:
- name: math
system_prompt: "You are a mathematics expert. Provide step-by-step solutions."
# Example: High threshold for math - precision matters
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.92
model_scores:
- model: openai/gpt-oss-20b
score: 1.0
use_reasoning: true
- name: other
system_prompt: "You are a helpful assistant."
# Example: Lower threshold for general queries - more cache hits
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.75
model_scores:
- model: openai/gpt-oss-20b
score: 0.7
config/config.recipe-accuracy.yaml (6 additions, 0 deletions)
@@ -87,6 +87,9 @@ categories:
use_reasoning: true # Enable reasoning for legal analysis
- name: psychology
system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice."
# Category-level cache override (if global cache is enabled)
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.92 # Strict for clinical nuances
model_scores:
- model: openai/gpt-oss-20b
score: 1.0
@@ -117,6 +120,9 @@ categories:
use_reasoning: false # Default queries don't need reasoning
- name: health
system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies."
# Category-level cache override (if global cache is enabled)
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.95 # Very strict - medical accuracy critical
model_scores:
- model: openai/gpt-oss-20b
score: 1.0
config/config.recipe-latency.yaml (3 additions, 0 deletions)
@@ -105,6 +105,9 @@ categories:
use_reasoning: false
- name: other
system_prompt: "Provide helpful responses."
# Category-level cache (optional, already enabled globally with low threshold)
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.65 # Even lower for general queries
model_scores:
- model: openai/gpt-oss-20b
score: 0.7
config/config.recipe-token-efficiency.yaml (3 additions, 0 deletions)
@@ -110,6 +110,9 @@ categories:
use_reasoning: false
- name: other
system_prompt: "You are a helpful assistant. Provide concise, accurate responses."
# Category-level cache (optional, already enabled globally)
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.7 # Match global or slightly lower
model_scores:
- model: openai/gpt-oss-20b
score: 0.7
config/config.testing.yaml (3 additions, 0 deletions)
@@ -42,6 +42,9 @@ model_config:

categories:
- name: other
# Category-level cache settings (optional - falls back to global if not set)
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.8
model_scores:
- model: openai/gpt-oss-20b
score: 0.7
config/config.yaml (6 additions, 0 deletions)
@@ -74,6 +74,8 @@ categories:
use_reasoning: false
- name: psychology
system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice."
semantic_cache_enabled: true
semantic_cache_similarity_threshold: 0.92 # High threshold for psychology - sensitive to nuances
model_scores:
- model: qwen3
score: 0.6
@@ -98,12 +100,16 @@ categories:
use_reasoning: false
- name: other
system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics."
semantic_cache_enabled: true
semantic_cache_similarity_threshold: 0.75 # Lower threshold for general chat - less sensitive
model_scores:
- model: qwen3
score: 0.7
use_reasoning: false
- name: health
system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies."
semantic_cache_enabled: true
semantic_cache_similarity_threshold: 0.95 # High threshold for health - very sensitive to word changes
model_scores:
- model: qwen3
score: 0.5
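The category-level keys above take effect only when the global semantic cache is enabled, and they fall back to the global settings when omitted. A minimal sketch of how these YAML keys could map onto category config in Go, assuming hypothetical struct and field names and using pointers so that an unset override is distinguishable from an explicit value:

```go
package config

// Category is a hypothetical sketch of the per-category config fields
// (the struct and field names here are assumptions, not the router's actual types).
type Category struct {
	Name string `yaml:"name"`
	// Pointers let "not set" fall back to the global semantic-cache settings.
	SemanticCacheEnabled             *bool    `yaml:"semantic_cache_enabled,omitempty"`
	SemanticCacheSimilarityThreshold *float32 `yaml:"semantic_cache_similarity_threshold,omitempty"`
}

// effectiveThreshold returns the category override when present, else the global default.
func effectiveThreshold(cat Category, globalThreshold float32) float32 {
	if cat.SemanticCacheSimilarityThreshold != nil {
		return *cat.SemanticCacheSimilarityThreshold
	}
	return globalThreshold
}
```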
src/semantic-router/pkg/cache/cache_interface.go (5 additions, 0 deletions)
@@ -33,6 +33,11 @@ type CacheBackend interface {
// Returns the cached response, match status, and any error
FindSimilar(model string, query string) ([]byte, bool, error)

// FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold
// This allows category-specific similarity thresholds
// Returns the cached response, match status, and any error
FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error)

// Close releases all resources held by the cache backend
Close() error

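For illustration, a rough sketch of how a caller might use the new interface method, keeping the existing FindSimilar path when no category-level threshold is configured (the helper name and the pointer parameter are assumptions, not code from this PR):

```go
package cache

// lookupWithCategoryThreshold is an illustrative helper: it uses the category's
// threshold override when one is configured and the backend default otherwise.
func lookupWithCategoryThreshold(backend CacheBackend, model, query string, categoryThreshold *float32) ([]byte, bool, error) {
	if categoryThreshold != nil {
		return backend.FindSimilarWithThreshold(model, query, *categoryThreshold)
	}
	return backend.FindSimilar(model, query)
}
```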
src/semantic-router/pkg/cache/inmemory_cache.go (127 additions, 0 deletions)
@@ -334,6 +334,133 @@ func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, e
return nil, false, nil
}

// FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold
func (c *InMemoryCache) FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error) {
[Review comment by Copilot AI, Oct 20, 2025, on lines +215 to +216]
The FindSimilarWithThreshold method duplicates significant logic from the existing FindSimilar method. Consider refactoring FindSimilar to call FindSimilarWithThreshold with the default threshold to eliminate code duplication and ensure consistent behavior between both methods.

[Reply by Collaborator]
@copilot please address this and other duplicate functions

start := time.Now()

if !c.enabled {
observability.Debugf("InMemoryCache.FindSimilarWithThreshold: cache disabled")
return nil, false, nil
}
queryPreview := query
if len(query) > 50 {
queryPreview = query[:50] + "..."
}
observability.Debugf("InMemoryCache.FindSimilarWithThreshold: searching for model='%s', query='%s' (len=%d chars), threshold=%.4f",
model, queryPreview, len(query), threshold)

// Generate semantic embedding for similarity comparison
queryEmbedding, err := candle_binding.GetEmbedding(query, 0) // Auto-detect dimension
if err != nil {
metrics.RecordCacheOperation("memory", "find_similar", "error", time.Since(start).Seconds())
return nil, false, fmt.Errorf("failed to generate embedding: %w", err)
}

c.mu.RLock()
var (
bestIndex = -1
bestEntry CacheEntry
bestSimilarity float32
entriesChecked int
expiredCount int
)
// Capture the lookup time after acquiring the read lock so TTL checks aren't skewed by embedding work or lock wait
now := time.Now()

// Compare with completed entries for the same model, tracking only the best match
for entryIndex, entry := range c.entries {
// Skip incomplete entries
if entry.ResponseBody == nil {
continue
}

// Only consider entries for the same model
if entry.Model != model {
continue
}

// Skip entries that have expired before considering them
if c.isExpired(entry, now) {
expiredCount++
continue
}

// Compute semantic similarity using dot product
var dotProduct float32
for i := 0; i < len(queryEmbedding) && i < len(entry.Embedding); i++ {
dotProduct += queryEmbedding[i] * entry.Embedding[i]
}

entriesChecked++
if bestIndex == -1 || dotProduct > bestSimilarity {
bestSimilarity = dotProduct
bestIndex = entryIndex
}
}
// Snapshot the best entry before releasing the read lock
if bestIndex >= 0 {
bestEntry = c.entries[bestIndex]
}

// Unlock the read lock since we need the write lock to update the access info
c.mu.RUnlock()

// Log if any expired entries were skipped
if expiredCount > 0 {
observability.Debugf("InMemoryCache: excluded %d expired entries during search (TTL: %ds)",
expiredCount, c.ttlSeconds)
observability.LogEvent("cache_expired_entries_found", map[string]interface{}{
"backend": "memory",
"expired_count": expiredCount,
"ttl_seconds": c.ttlSeconds,
})
}

// Handle case where no suitable entries exist
if bestIndex < 0 {
atomic.AddInt64(&c.missCount, 1)
observability.Debugf("InMemoryCache.FindSimilarWithThreshold: no entries found with responses")
metrics.RecordCacheOperation("memory", "find_similar", "miss", time.Since(start).Seconds())
metrics.RecordCacheMiss()
return nil, false, nil
}

// Check if the best match meets the similarity threshold
if bestSimilarity >= threshold {
atomic.AddInt64(&c.hitCount, 1)

c.mu.Lock()
c.updateAccessInfo(bestIndex, bestEntry)
c.mu.Unlock()

observability.Debugf("InMemoryCache.FindSimilarWithThreshold: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes",
bestSimilarity, threshold, len(bestEntry.ResponseBody))
observability.LogEvent("cache_hit", map[string]interface{}{
"backend": "memory",
"similarity": bestSimilarity,
"threshold": threshold,
"model": model,
})
metrics.RecordCacheOperation("memory", "find_similar", "hit", time.Since(start).Seconds())
metrics.RecordCacheHit()
return bestEntry.ResponseBody, true, nil
}

atomic.AddInt64(&c.missCount, 1)
observability.Debugf("InMemoryCache.FindSimilarWithThreshold: CACHE MISS - best_similarity=%.4f < threshold=%.4f (checked %d entries)",
bestSimilarity, threshold, entriesChecked)
observability.LogEvent("cache_miss", map[string]interface{}{
"backend": "memory",
"best_similarity": bestSimilarity,
"threshold": threshold,
"model": model,
"entries_checked": entriesChecked,
})
metrics.RecordCacheOperation("memory", "find_similar", "miss", time.Since(start).Seconds())
metrics.RecordCacheMiss()
return nil, false, nil
}

// Close releases all resources held by the cache
func (c *InMemoryCache) Close() error {
c.mu.Lock()
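As a possible follow-up to the review comments above, FindSimilar could delegate to FindSimilarWithThreshold so the two paths share one implementation. A minimal sketch, assuming the backend keeps its configured default in a field such as similarityThreshold; the same pattern would apply to MilvusCache:

```go
// Sketch only: delegate the default-threshold lookup to the threshold-aware path.
// Assumes InMemoryCache stores its configured default as c.similarityThreshold.
func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, error) {
	return c.FindSimilarWithThreshold(model, query, c.similarityThreshold)
}
```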
src/semantic-router/pkg/cache/milvus_cache.go (106 additions, 0 deletions)
@@ -591,6 +591,112 @@ func (c *MilvusCache) FindSimilar(model string, query string) ([]byte, bool, err
return responseBody, true, nil
}

// FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold
func (c *MilvusCache) FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error) {
[Review comment by Copilot AI, Oct 20, 2025, on lines +493 to +494]
The FindSimilarWithThreshold method duplicates significant logic from the existing FindSimilar method. Consider refactoring FindSimilar to call FindSimilarWithThreshold with the default threshold to eliminate code duplication and ensure consistent behavior between both methods.

[Reply by Collaborator]
@copilot please address this and other duplicate functions

start := time.Now()

if !c.enabled {
observability.Debugf("MilvusCache.FindSimilarWithThreshold: cache disabled")
return nil, false, nil
}
queryPreview := query
if len(query) > 50 {
queryPreview = query[:50] + "..."
}
observability.Debugf("MilvusCache.FindSimilarWithThreshold: searching for model='%s', query='%s' (len=%d chars), threshold=%.4f",
model, queryPreview, len(query), threshold)

// Generate semantic embedding for similarity comparison
queryEmbedding, err := candle_binding.GetEmbedding(query, 0) // Auto-detect dimension
if err != nil {
metrics.RecordCacheOperation("milvus", "find_similar", "error", time.Since(start).Seconds())
return nil, false, fmt.Errorf("failed to generate embedding: %w", err)
}

ctx := context.Background()

// Define search parameters
searchParam, err := entity.NewIndexHNSWSearchParam(c.config.Search.Params.Ef)
if err != nil {
return nil, false, fmt.Errorf("failed to create search parameters: %w", err)
}

// Use Milvus Search for efficient similarity search
searchResult, err := c.client.Search(
ctx,
c.collectionName,
[]string{},
fmt.Sprintf("model == \"%s\" && response_body != \"\"", model),
[]string{"response_body"},
[]entity.Vector{entity.FloatVector(queryEmbedding)},
c.config.Collection.VectorField.Name,
entity.MetricType(c.config.Collection.VectorField.MetricType),
c.config.Search.TopK,
searchParam,
)
if err != nil {
observability.Debugf("MilvusCache.FindSimilarWithThreshold: search failed: %v", err)
atomic.AddInt64(&c.missCount, 1)
metrics.RecordCacheOperation("milvus", "find_similar", "error", time.Since(start).Seconds())
metrics.RecordCacheMiss()
return nil, false, nil
}

if len(searchResult) == 0 || searchResult[0].ResultCount == 0 {
atomic.AddInt64(&c.missCount, 1)
observability.Debugf("MilvusCache.FindSimilarWithThreshold: no entries found")
metrics.RecordCacheOperation("milvus", "find_similar", "miss", time.Since(start).Seconds())
metrics.RecordCacheMiss()
return nil, false, nil
}

bestScore := searchResult[0].Scores[0]
if bestScore < threshold {
atomic.AddInt64(&c.missCount, 1)
observability.Debugf("MilvusCache.FindSimilarWithThreshold: CACHE MISS - best_similarity=%.4f < threshold=%.4f",
bestScore, threshold)
observability.LogEvent("cache_miss", map[string]interface{}{
"backend": "milvus",
"best_similarity": bestScore,
"threshold": threshold,
"model": model,
"collection": c.collectionName,
})
metrics.RecordCacheOperation("milvus", "find_similar", "miss", time.Since(start).Seconds())
metrics.RecordCacheMiss()
return nil, false, nil
}

// Cache Hit
var responseBody []byte
responseBodyColumn, ok := searchResult[0].Fields[0].(*entity.ColumnVarChar)
if ok && responseBodyColumn.Len() > 0 {
responseBody = []byte(responseBodyColumn.Data()[0])
}

if responseBody == nil {
observability.Debugf("MilvusCache.FindSimilarWithThreshold: cache hit but response_body is missing or not a string")
atomic.AddInt64(&c.missCount, 1)
metrics.RecordCacheOperation("milvus", "find_similar", "error", time.Since(start).Seconds())
metrics.RecordCacheMiss()
return nil, false, nil
}

atomic.AddInt64(&c.hitCount, 1)
observability.Debugf("MilvusCache.FindSimilarWithThreshold: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes",
bestScore, threshold, len(responseBody))
observability.LogEvent("cache_hit", map[string]interface{}{
"backend": "milvus",
"similarity": bestScore,
"threshold": threshold,
"model": model,
"collection": c.collectionName,
})
metrics.RecordCacheOperation("milvus", "find_similar", "hit", time.Since(start).Seconds())
metrics.RecordCacheHit()
return responseBody, true, nil
}

// Close releases all resources held by the cache
func (c *MilvusCache) Close() error {
if c.client != nil {