Merged · Changes from 4 commits
config/config.development.yaml (3 additions, 0 deletions)
@@ -47,6 +47,9 @@ classifier:
categories:
- name: test
system_prompt: "You are a test assistant."
# Example: Category-level cache settings
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.85
model_scores:
- model: test-model
score: 1.0
config/config.e2e.yaml (9 additions, 0 deletions)
@@ -107,6 +107,9 @@ categories:
score: 0.4
use_reasoning: false
- name: psychology
# Example: Strict cache threshold for psychology - clinical nuances matter
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.92
model_scores:
- model: "Model-A"
score: 0.6
@@ -156,6 +159,9 @@ categories:
score: 0.4
use_reasoning: false
- name: other
# Example: Lower threshold for general queries - better cache hit rate
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.75
model_scores:
- model: "Model-B"
score: 0.8
@@ -168,6 +174,9 @@ categories:
score: 0.6
use_reasoning: false
- name: health
# Example: Very strict cache threshold for health - word changes matter medically
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.95
model_scores:
- model: "Model-B"
score: 0.8
config/config.production.yaml (6 additions, 0 deletions)
@@ -60,12 +60,18 @@ classifier:
categories:
- name: math
system_prompt: "You are a mathematics expert. Provide step-by-step solutions."
# Example: High threshold for math - precision matters
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.92
model_scores:
- model: openai/gpt-oss-20b
score: 1.0
use_reasoning: true
- name: other
system_prompt: "You are a helpful assistant."
# Example: Lower threshold for general queries - more cache hits
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.75
model_scores:
- model: openai/gpt-oss-20b
score: 0.7
config/config.recipe-accuracy.yaml (6 additions, 0 deletions)
@@ -87,6 +87,9 @@ categories:
use_reasoning: true # Enable reasoning for legal analysis
- name: psychology
system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice."
# Category-level cache override (if global cache is enabled)
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.92 # Strict for clinical nuances
model_scores:
- model: openai/gpt-oss-20b
score: 1.0
@@ -117,6 +120,9 @@ categories:
use_reasoning: false # Default queries don't need reasoning
- name: health
system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies."
# Category-level cache override (if global cache is enabled)
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.95 # Very strict - medical accuracy critical
model_scores:
- model: openai/gpt-oss-20b
score: 1.0
config/config.recipe-latency.yaml (3 additions, 0 deletions)
@@ -105,6 +105,9 @@ categories:
use_reasoning: false
- name: other
system_prompt: "Provide helpful responses."
# Category-level cache (optional, already enabled globally with low threshold)
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.65 # Even lower for general queries
model_scores:
- model: openai/gpt-oss-20b
score: 0.7
config/config.recipe-token-efficiency.yaml (3 additions, 0 deletions)
@@ -110,6 +110,9 @@ categories:
use_reasoning: false
- name: other
system_prompt: "You are a helpful assistant. Provide concise, accurate responses."
# Category-level cache (optional, already enabled globally)
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.7 # Match global or slightly lower
model_scores:
- model: openai/gpt-oss-20b
score: 0.7
config/config.testing.yaml (3 additions, 0 deletions)
@@ -42,6 +42,9 @@ model_config:

categories:
- name: other
# Category-level cache settings (optional - falls back to global if not set)
# semantic_cache_enabled: true
# semantic_cache_similarity_threshold: 0.8
model_scores:
- model: openai/gpt-oss-20b
score: 0.7
config/config.yaml (6 additions, 0 deletions)
@@ -74,6 +74,8 @@ categories:
use_reasoning: false
- name: psychology
system_prompt: "You are a psychology expert with deep knowledge of cognitive processes, behavioral patterns, mental health, developmental psychology, social psychology, and therapeutic approaches. Provide evidence-based insights grounded in psychological research and theory. When discussing mental health topics, emphasize the importance of professional consultation and avoid providing diagnostic or therapeutic advice."
semantic_cache_enabled: true
semantic_cache_similarity_threshold: 0.92 # High threshold for psychology - sensitive to nuances
model_scores:
- model: qwen3
score: 0.6
@@ -98,12 +100,16 @@ categories:
use_reasoning: false
- name: other
system_prompt: "You are a helpful and knowledgeable assistant. Provide accurate, helpful responses across a wide range of topics."
semantic_cache_enabled: true
semantic_cache_similarity_threshold: 0.75 # Lower threshold for general chat - less sensitive
model_scores:
- model: qwen3
score: 0.7
use_reasoning: false
- name: health
system_prompt: "You are a health and medical information expert with knowledge of anatomy, physiology, diseases, treatments, preventive care, nutrition, and wellness. Provide accurate, evidence-based health information while emphasizing that your responses are for educational purposes only and should never replace professional medical advice, diagnosis, or treatment. Always encourage users to consult healthcare professionals for medical concerns and emergencies."
semantic_cache_enabled: true
semantic_cache_similarity_threshold: 0.95 # High threshold for health - very sensitive to word changes
model_scores:
- model: qwen3
score: 0.5
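The category-level keys above take effect only when the global semantic cache is enabled, and they fall back to the global settings when omitted. A minimal sketch of how these YAML keys could map onto category config in Go, assuming hypothetical struct and field names and using pointers so that an unset override is distinguishable from an explicit value:

```go
package config

// Category is a hypothetical sketch of the per-category config fields
// (the struct and field names here are assumptions, not the router's actual types).
type Category struct {
	Name string `yaml:"name"`
	// Pointers let "not set" fall back to the global semantic-cache settings.
	SemanticCacheEnabled             *bool    `yaml:"semantic_cache_enabled,omitempty"`
	SemanticCacheSimilarityThreshold *float32 `yaml:"semantic_cache_similarity_threshold,omitempty"`
}

// effectiveThreshold returns the category override when present, else the global default.
func effectiveThreshold(cat Category, globalThreshold float32) float32 {
	if cat.SemanticCacheSimilarityThreshold != nil {
		return *cat.SemanticCacheSimilarityThreshold
	}
	return globalThreshold
}
```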
src/semantic-router/pkg/cache/cache_interface.go (5 additions, 0 deletions)
@@ -33,6 +33,11 @@ type CacheBackend interface {
// Returns the cached response, match status, and any error
FindSimilar(model string, query string) ([]byte, bool, error)

// FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold
// This allows category-specific similarity thresholds
// Returns the cached response, match status, and any error
FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error)

// Close releases all resources held by the cache backend
Close() error

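For illustration, a rough sketch of how a caller might use the new interface method, keeping the existing FindSimilar path when no category-level threshold is configured (the helper name and the pointer parameter are assumptions, not code from this PR):

```go
package cache

// lookupWithCategoryThreshold is an illustrative helper: it uses the category's
// threshold override when one is configured and the backend default otherwise.
func lookupWithCategoryThreshold(backend CacheBackend, model, query string, categoryThreshold *float32) ([]byte, bool, error) {
	if categoryThreshold != nil {
		return backend.FindSimilarWithThreshold(model, query, *categoryThreshold)
	}
	return backend.FindSimilar(model, query)
}
```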
src/semantic-router/pkg/cache/inmemory_cache.go (127 additions, 0 deletions)
@@ -334,6 +334,133 @@ func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, e
return nil, false, nil
}

// FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold
func (c *InMemoryCache) FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error) {
[Review comment by Copilot AI, Oct 20, 2025, on lines +215 to +216]
The FindSimilarWithThreshold method duplicates significant logic from the existing FindSimilar method. Consider refactoring FindSimilar to call FindSimilarWithThreshold with the default threshold to eliminate code duplication and ensure consistent behavior between both methods.

[Reply by Collaborator]
@copilot please address this and other duplicate functions

start := time.Now()

if !c.enabled {
observability.Debugf("InMemoryCache.FindSimilarWithThreshold: cache disabled")
return nil, false, nil
}
queryPreview := query
if len(query) > 50 {
queryPreview = query[:50] + "..."
}
observability.Debugf("InMemoryCache.FindSimilarWithThreshold: searching for model='%s', query='%s' (len=%d chars), threshold=%.4f",
model, queryPreview, len(query), threshold)

// Generate semantic embedding for similarity comparison
queryEmbedding, err := candle_binding.GetEmbedding(query, 0) // Auto-detect dimension
if err != nil {
metrics.RecordCacheOperation("memory", "find_similar", "error", time.Since(start).Seconds())
return nil, false, fmt.Errorf("failed to generate embedding: %w", err)
}

c.mu.RLock()
var (
bestIndex = -1
bestEntry CacheEntry
bestSimilarity float32
entriesChecked int
expiredCount int
)
// Capture the lookup time after acquiring the read lock so TTL checks aren't skewed by embedding work or lock wait
now := time.Now()

// Compare with completed entries for the same model, tracking only the best match
for entryIndex, entry := range c.entries {
// Skip incomplete entries
if entry.ResponseBody == nil {
continue
}

// Only consider entries for the same model
if entry.Model != model {
continue
}

// Skip entries that have expired before considering them
if c.isExpired(entry, now) {
expiredCount++
continue
}

// Compute semantic similarity using dot product
var dotProduct float32
for i := 0; i < len(queryEmbedding) && i < len(entry.Embedding); i++ {
dotProduct += queryEmbedding[i] * entry.Embedding[i]
}

entriesChecked++
if bestIndex == -1 || dotProduct > bestSimilarity {
bestSimilarity = dotProduct
bestIndex = entryIndex
}
}
// Snapshot the best entry before releasing the read lock
if bestIndex >= 0 {
bestEntry = c.entries[bestIndex]
}

// Unlock the read lock since we need the write lock to update the access info
c.mu.RUnlock()

// Log if any expired entries were skipped
if expiredCount > 0 {
observability.Debugf("InMemoryCache: excluded %d expired entries during search (TTL: %ds)",
expiredCount, c.ttlSeconds)
observability.LogEvent("cache_expired_entries_found", map[string]interface{}{
"backend": "memory",
"expired_count": expiredCount,
"ttl_seconds": c.ttlSeconds,
})
}

// Handle case where no suitable entries exist
if bestIndex < 0 {
atomic.AddInt64(&c.missCount, 1)
observability.Debugf("InMemoryCache.FindSimilarWithThreshold: no entries found with responses")
metrics.RecordCacheOperation("memory", "find_similar", "miss", time.Since(start).Seconds())
metrics.RecordCacheMiss()
return nil, false, nil
}

// Check if the best match meets the similarity threshold
if bestSimilarity >= threshold {
atomic.AddInt64(&c.hitCount, 1)

c.mu.Lock()
c.updateAccessInfo(bestIndex, bestEntry)
c.mu.Unlock()

observability.Debugf("InMemoryCache.FindSimilarWithThreshold: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes",
bestSimilarity, threshold, len(bestEntry.ResponseBody))
observability.LogEvent("cache_hit", map[string]interface{}{
"backend": "memory",
"similarity": bestSimilarity,
"threshold": threshold,
"model": model,
})
metrics.RecordCacheOperation("memory", "find_similar", "hit", time.Since(start).Seconds())
metrics.RecordCacheHit()
return bestEntry.ResponseBody, true, nil
}

atomic.AddInt64(&c.missCount, 1)
observability.Debugf("InMemoryCache.FindSimilarWithThreshold: CACHE MISS - best_similarity=%.4f < threshold=%.4f (checked %d entries)",
bestSimilarity, threshold, entriesChecked)
observability.LogEvent("cache_miss", map[string]interface{}{
"backend": "memory",
"best_similarity": bestSimilarity,
"threshold": threshold,
"model": model,
"entries_checked": entriesChecked,
})
metrics.RecordCacheOperation("memory", "find_similar", "miss", time.Since(start).Seconds())
metrics.RecordCacheMiss()
return nil, false, nil
}

// Close releases all resources held by the cache
func (c *InMemoryCache) Close() error {
c.mu.Lock()
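As a possible follow-up to the review comments above, FindSimilar could delegate to FindSimilarWithThreshold so the two paths share one implementation. A minimal sketch, assuming the backend keeps its configured default in a field such as similarityThreshold; the same pattern would apply to MilvusCache:

```go
// Sketch only: delegate the default-threshold lookup to the threshold-aware path.
// Assumes InMemoryCache stores its configured default as c.similarityThreshold.
func (c *InMemoryCache) FindSimilar(model string, query string) ([]byte, bool, error) {
	return c.FindSimilarWithThreshold(model, query, c.similarityThreshold)
}
```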
src/semantic-router/pkg/cache/milvus_cache.go (106 additions, 0 deletions)
@@ -591,6 +591,112 @@ func (c *MilvusCache) FindSimilar(model string, query string) ([]byte, bool, err
return responseBody, true, nil
}

// FindSimilarWithThreshold searches for semantically similar cached requests using a specific threshold
func (c *MilvusCache) FindSimilarWithThreshold(model string, query string, threshold float32) ([]byte, bool, error) {
[Review comment by Copilot AI, Oct 20, 2025, on lines +493 to +494]
The FindSimilarWithThreshold method duplicates significant logic from the existing FindSimilar method. Consider refactoring FindSimilar to call FindSimilarWithThreshold with the default threshold to eliminate code duplication and ensure consistent behavior between both methods.

[Reply by Collaborator]
@copilot please address this and other duplicate functions

start := time.Now()

if !c.enabled {
observability.Debugf("MilvusCache.FindSimilarWithThreshold: cache disabled")
return nil, false, nil
}
queryPreview := query
if len(query) > 50 {
queryPreview = query[:50] + "..."
}
observability.Debugf("MilvusCache.FindSimilarWithThreshold: searching for model='%s', query='%s' (len=%d chars), threshold=%.4f",
model, queryPreview, len(query), threshold)

// Generate semantic embedding for similarity comparison
queryEmbedding, err := candle_binding.GetEmbedding(query, 0) // Auto-detect dimension
if err != nil {
metrics.RecordCacheOperation("milvus", "find_similar", "error", time.Since(start).Seconds())
return nil, false, fmt.Errorf("failed to generate embedding: %w", err)
}

ctx := context.Background()

// Define search parameters
searchParam, err := entity.NewIndexHNSWSearchParam(c.config.Search.Params.Ef)
if err != nil {
return nil, false, fmt.Errorf("failed to create search parameters: %w", err)
}

// Use Milvus Search for efficient similarity search
searchResult, err := c.client.Search(
ctx,
c.collectionName,
[]string{},
fmt.Sprintf("model == \"%s\" && response_body != \"\"", model),
[]string{"response_body"},
[]entity.Vector{entity.FloatVector(queryEmbedding)},
c.config.Collection.VectorField.Name,
entity.MetricType(c.config.Collection.VectorField.MetricType),
c.config.Search.TopK,
searchParam,
)
if err != nil {
observability.Debugf("MilvusCache.FindSimilarWithThreshold: search failed: %v", err)
atomic.AddInt64(&c.missCount, 1)
metrics.RecordCacheOperation("milvus", "find_similar", "error", time.Since(start).Seconds())
metrics.RecordCacheMiss()
return nil, false, nil
}

if len(searchResult) == 0 || searchResult[0].ResultCount == 0 {
atomic.AddInt64(&c.missCount, 1)
observability.Debugf("MilvusCache.FindSimilarWithThreshold: no entries found")
metrics.RecordCacheOperation("milvus", "find_similar", "miss", time.Since(start).Seconds())
metrics.RecordCacheMiss()
return nil, false, nil
}

bestScore := searchResult[0].Scores[0]
if bestScore < threshold {
atomic.AddInt64(&c.missCount, 1)
observability.Debugf("MilvusCache.FindSimilarWithThreshold: CACHE MISS - best_similarity=%.4f < threshold=%.4f",
bestScore, threshold)
observability.LogEvent("cache_miss", map[string]interface{}{
"backend": "milvus",
"best_similarity": bestScore,
"threshold": threshold,
"model": model,
"collection": c.collectionName,
})
metrics.RecordCacheOperation("milvus", "find_similar", "miss", time.Since(start).Seconds())
metrics.RecordCacheMiss()
return nil, false, nil
}

// Cache Hit
var responseBody []byte
responseBodyColumn, ok := searchResult[0].Fields[0].(*entity.ColumnVarChar)
if ok && responseBodyColumn.Len() > 0 {
responseBody = []byte(responseBodyColumn.Data()[0])
}

if responseBody == nil {
observability.Debugf("MilvusCache.FindSimilarWithThreshold: cache hit but response_body is missing or not a string")
atomic.AddInt64(&c.missCount, 1)
metrics.RecordCacheOperation("milvus", "find_similar", "error", time.Since(start).Seconds())
metrics.RecordCacheMiss()
return nil, false, nil
}

atomic.AddInt64(&c.hitCount, 1)
observability.Debugf("MilvusCache.FindSimilarWithThreshold: CACHE HIT - similarity=%.4f >= threshold=%.4f, response_size=%d bytes",
bestScore, threshold, len(responseBody))
observability.LogEvent("cache_hit", map[string]interface{}{
"backend": "milvus",
"similarity": bestScore,
"threshold": threshold,
"model": model,
"collection": c.collectionName,
})
metrics.RecordCacheOperation("milvus", "find_similar", "hit", time.Since(start).Seconds())
metrics.RecordCacheHit()
return responseBody, true, nil
}

// Close releases all resources held by the cache
func (c *MilvusCache) Close() error {
if c.client != nil {