From 4020e87aa1dc893384a261d803074b5d48ed7c04 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 22 Oct 2025 06:08:14 +0000 Subject: [PATCH 1/5] Initial plan From 1e384ef7356fe518e4a856ffa8e60fb80cbb14d7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 22 Oct 2025 06:22:52 +0000 Subject: [PATCH 2/5] Add category-level jailbreak detection configuration Co-authored-by: Xunzhuo <48784001+Xunzhuo@users.noreply.github.com> --- .../examples/jailbreak_category_example.yaml | 111 ++++++++++++++++ src/semantic-router/pkg/config/config.go | 14 +++ src/semantic-router/pkg/config/config_test.go | 119 ++++++++++++++++++ .../pkg/extproc/request_handler.go | 25 ++-- 4 files changed, 260 insertions(+), 9 deletions(-) create mode 100644 config/examples/jailbreak_category_example.yaml diff --git a/config/examples/jailbreak_category_example.yaml b/config/examples/jailbreak_category_example.yaml new file mode 100644 index 000000000..064c20e57 --- /dev/null +++ b/config/examples/jailbreak_category_example.yaml @@ -0,0 +1,111 @@ +# Category-Level Jailbreak Detection Example +# This example demonstrates how to configure jailbreak detection at the category level +# Different categories can have different jailbreak detection settings based on their risk profiles + +# Global jailbreak detection configuration (can be overridden per category) +prompt_guard: + enabled: true # Global default - can be overridden per category + use_modernbert: true + model_id: "models/jailbreak_classifier_modernbert-base_model" + threshold: 0.7 + use_cpu: true + jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" + +# Classifier configuration +classifier: + category_model: + model_id: "models/category_classifier_modernbert-base_model" + use_modernbert: true + threshold: 0.6 + use_cpu: true + category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json" + +# Categories with different jailbreak detection settings +categories: + # High-security category: Enable jailbreak detection + - name: business + description: "Business queries, strategy, and professional advice" + jailbreak_enabled: true # Explicitly enable (inherits from global by default) + system_prompt: "You are a professional business consultant. Provide practical, actionable business advice." + model_scores: + - model: qwen3 + score: 0.7 + use_reasoning: false + + # Public-facing category: Enable jailbreak detection + - name: customer_support + description: "Customer support and general inquiries" + jailbreak_enabled: true # Explicitly enable for customer-facing content + system_prompt: "You are a friendly customer support agent. Help users with their questions." + model_scores: + - model: qwen3 + score: 0.8 + use_reasoning: false + + # Internal tool category: Disable jailbreak detection (trusted environment) + - name: code_generation + description: "Internal code generation and development tools" + jailbreak_enabled: false # Disable for internal developer tools + system_prompt: "You are a code generation assistant for internal developers." + model_scores: + - model: qwen3 + score: 0.9 + use_reasoning: true + + # Testing category: Disable jailbreak detection + - name: testing + description: "Testing and quality assurance queries" + jailbreak_enabled: false # Disable for testing purposes + system_prompt: "You are a QA assistant helping with test scenarios." 
+ model_scores: + - model: qwen3 + score: 0.6 + use_reasoning: false + + # Default category: Uses global setting (inherits prompt_guard.enabled) + - name: general + description: "General queries that don't fit into specific categories" + # jailbreak_enabled not specified - will inherit from global prompt_guard.enabled + system_prompt: "You are a helpful assistant." + model_scores: + - model: qwen3 + score: 0.5 + use_reasoning: false + +# Model configuration +model_config: + "qwen3": + reasoning_family: "qwen3" + preferred_endpoints: ["endpoint1"] + pii_policy: + allow_by_default: true + +# Reasoning family configurations +reasoning_families: + qwen3: + type: "chat_template_kwargs" + parameter: "thinking" + +# Default model for fallback +default_model: qwen3 + +# vLLM endpoints configuration +vllm_endpoints: + - name: "endpoint1" + address: "127.0.0.1" + port: 8000 + weight: 1 + +# Usage Notes: +# ============= +# 1. Global Setting (prompt_guard.enabled): Sets the default for all categories +# 2. Category Override (jailbreak_enabled): Override global setting per category +# 3. Inheritance: If jailbreak_enabled is not specified, inherits from prompt_guard.enabled +# 4. Use Cases: +# - Set jailbreak_enabled: true for high-security, public-facing categories +# - Set jailbreak_enabled: false for internal tools or trusted environments +# - Omit jailbreak_enabled to use the global default +# 5. Security Best Practices: +# - Enable jailbreak detection by default (prompt_guard.enabled: true) +# - Only disable for specific categories where the risk is managed differently +# - Consider the consequences of disabling protection on a per-category basis diff --git a/src/semantic-router/pkg/config/config.go b/src/semantic-router/pkg/config/config.go index 9766d4733..8e4ac28c5 100644 --- a/src/semantic-router/pkg/config/config.go +++ b/src/semantic-router/pkg/config/config.go @@ -370,6 +370,9 @@ type Category struct { // SemanticCacheSimilarityThreshold defines the minimum similarity score for cache hits (0.0-1.0) // If nil, uses the global threshold from SemanticCache.SimilarityThreshold or BertModel.Threshold SemanticCacheSimilarityThreshold *float32 `yaml:"semantic_cache_similarity_threshold,omitempty"` + // JailbreakEnabled controls whether jailbreak detection is enabled for this category + // If nil, inherits from global PromptGuard.Enabled setting + JailbreakEnabled *bool `yaml:"jailbreak_enabled,omitempty"` } // GetModelReasoningFamily returns the reasoning family configuration for a given model name @@ -815,3 +818,14 @@ func (c *RouterConfig) GetCacheSimilarityThresholdForCategory(categoryName strin // Fall back to global cache threshold or bert threshold return c.GetCacheSimilarityThreshold() } + +// IsJailbreakEnabledForCategory returns whether jailbreak detection is enabled for a specific category +// If the category has an explicit setting, it takes precedence; otherwise, uses global setting +func (c *RouterConfig) IsJailbreakEnabledForCategory(categoryName string) bool { + category := c.GetCategoryByName(categoryName) + if category != nil && category.JailbreakEnabled != nil { + return *category.JailbreakEnabled + } + // Fall back to global setting + return c.PromptGuard.Enabled +} diff --git a/src/semantic-router/pkg/config/config_test.go b/src/semantic-router/pkg/config/config_test.go index 8a34f3998..802d47cb9 100644 --- a/src/semantic-router/pkg/config/config_test.go +++ b/src/semantic-router/pkg/config/config_test.go @@ -1910,4 +1910,123 @@ categories: }) }) }) + + 
Describe("IsJailbreakEnabledForCategory", func() { + Context("when global jailbreak is enabled", func() { + It("should return true for category without explicit setting", func() { + category := config.Category{ + Name: "test", + ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}}, + } + + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Enabled: true, + }, + Categories: []config.Category{category}, + } + + Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeTrue()) + }) + + It("should return false when category explicitly disables jailbreak", func() { + category := config.Category{ + Name: "test", + JailbreakEnabled: config.BoolPtr(false), + ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}}, + } + + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Enabled: true, + }, + Categories: []config.Category{category}, + } + + Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeFalse()) + }) + + It("should return true when category explicitly enables jailbreak", func() { + category := config.Category{ + Name: "test", + JailbreakEnabled: config.BoolPtr(true), + ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}}, + } + + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Enabled: true, + }, + Categories: []config.Category{category}, + } + + Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeTrue()) + }) + }) + + Context("when global jailbreak is disabled", func() { + It("should return false for category without explicit setting", func() { + category := config.Category{ + Name: "test", + ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}}, + } + + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Enabled: false, + }, + Categories: []config.Category{category}, + } + + Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeFalse()) + }) + + It("should return true when category explicitly enables jailbreak", func() { + category := config.Category{ + Name: "test", + JailbreakEnabled: config.BoolPtr(true), + ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}}, + } + + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Enabled: false, + }, + Categories: []config.Category{category}, + } + + Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeTrue()) + }) + + It("should return false when category explicitly disables jailbreak", func() { + category := config.Category{ + Name: "test", + JailbreakEnabled: config.BoolPtr(false), + ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}}, + } + + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Enabled: false, + }, + Categories: []config.Category{category}, + } + + Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeFalse()) + }) + }) + + Context("when category does not exist", func() { + It("should fall back to global setting", func() { + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Enabled: true, + }, + Categories: []config.Category{}, + } + + Expect(cfg.IsJailbreakEnabledForCategory("nonexistent")).To(BeTrue()) + }) + }) + }) }) diff --git a/src/semantic-router/pkg/extproc/request_handler.go b/src/semantic-router/pkg/extproc/request_handler.go index e90f47457..0ddc8af86 100644 --- a/src/semantic-router/pkg/extproc/request_handler.go +++ b/src/semantic-router/pkg/extproc/request_handler.go @@ -396,12 +396,7 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo // Get content from messages userContent, 
nonUserMessages := extractUserAndNonUserContent(openAIRequest) - // Perform security checks - if response, shouldReturn := r.performSecurityChecks(ctx, userContent, nonUserMessages); shouldReturn { - return response, nil - } - - // Classify the request early to determine category for cache settings + // Classify the request early to determine category for security checks and cache settings var categoryName string if r.Config != nil && r.Config.IsAutoModelName(originalModel) && (len(nonUserMessages) > 0 || userContent != "") { // Determine text to use for classification @@ -417,6 +412,11 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo } } + // Perform security checks with category-specific settings + if response, shouldReturn := r.performSecurityChecks(ctx, userContent, nonUserMessages, categoryName); shouldReturn { + return response, nil + } + // Handle caching with category-specific settings if response, shouldReturn := r.handleCaching(ctx, categoryName); shouldReturn { return response, nil @@ -426,13 +426,20 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo return r.handleModelRouting(openAIRequest, originalModel, userContent, nonUserMessages, ctx) } -// performSecurityChecks performs PII and jailbreak detection -func (r *OpenAIRouter) performSecurityChecks(ctx *RequestContext, userContent string, nonUserMessages []string) (*ext_proc.ProcessingResponse, bool) { +// performSecurityChecks performs PII and jailbreak detection with category-specific settings +func (r *OpenAIRouter) performSecurityChecks(ctx *RequestContext, userContent string, nonUserMessages []string, categoryName string) (*ext_proc.ProcessingResponse, bool) { // Perform PII classification on all message content allContent := pii.ExtractAllContent(userContent, nonUserMessages) + // Check if jailbreak detection is enabled for this category + jailbreakEnabled := r.Classifier.IsJailbreakEnabled() + if categoryName != "" && r.Config != nil { + // Use category-specific setting if available + jailbreakEnabled = jailbreakEnabled && r.Config.IsJailbreakEnabledForCategory(categoryName) + } + // Perform jailbreak detection on all message content - if r.Classifier.IsJailbreakEnabled() { + if jailbreakEnabled { // Start jailbreak detection span spanCtx, span := observability.StartSpan(ctx.TraceContext, observability.SpanJailbreakDetection) defer span.End() From 458d7e7c17e2b509f7d26a6dfe5e2846eb8efff6 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 22 Oct 2025 06:26:18 +0000 Subject: [PATCH 3/5] Add documentation for category-level jailbreak settings Co-authored-by: Xunzhuo <48784001+Xunzhuo@users.noreply.github.com> --- config/config.yaml | 3 ++- .../training_lora/classifier_model_fine_tuning_lora/go.mod | 2 +- src/training/training_lora/pii_model_fine_tuning_lora/go.mod | 2 +- .../training_lora/prompt_guard_fine_tuning_lora/go.mod | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 279feb67a..ec9706ba9 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -19,7 +19,7 @@ tools: fallback_to_empty: true prompt_guard: - enabled: true + enabled: true # Global default - can be overridden per category with jailbreak_enabled use_modernbert: true model_id: "models/jailbreak_classifier_modernbert-base_model" threshold: 0.7 @@ -62,6 +62,7 @@ classifier: categories: - name: business system_prompt: "You are a senior business consultant and 
strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development. Provide practical, actionable business advice backed by proven methodologies and industry best practices. Consider market dynamics, competitive landscape, and stakeholder interests in your recommendations." + # jailbreak_enabled: true # Optional: Override global jailbreak detection per category model_scores: - model: qwen3 score: 0.7 diff --git a/src/training/training_lora/classifier_model_fine_tuning_lora/go.mod b/src/training/training_lora/classifier_model_fine_tuning_lora/go.mod index 338e6383f..d71496c7d 100644 --- a/src/training/training_lora/classifier_model_fine_tuning_lora/go.mod +++ b/src/training/training_lora/classifier_model_fine_tuning_lora/go.mod @@ -4,4 +4,4 @@ go 1.24.1 replace github.com/vllm-project/semantic-router/candle-binding => ../../../../candle-binding -require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000 +require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000 diff --git a/src/training/training_lora/pii_model_fine_tuning_lora/go.mod b/src/training/training_lora/pii_model_fine_tuning_lora/go.mod index 99bdf1528..c6f84bae1 100644 --- a/src/training/training_lora/pii_model_fine_tuning_lora/go.mod +++ b/src/training/training_lora/pii_model_fine_tuning_lora/go.mod @@ -4,4 +4,4 @@ go 1.24.1 replace github.com/vllm-project/semantic-router/candle-binding => ../../../../candle-binding -require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000 +require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000 diff --git a/src/training/training_lora/prompt_guard_fine_tuning_lora/go.mod b/src/training/training_lora/prompt_guard_fine_tuning_lora/go.mod index 6195d9f1b..869a3f418 100644 --- a/src/training/training_lora/prompt_guard_fine_tuning_lora/go.mod +++ b/src/training/training_lora/prompt_guard_fine_tuning_lora/go.mod @@ -4,4 +4,4 @@ go 1.24.1 replace github.com/vllm-project/semantic-router/candle-binding => ../../../../candle-binding -require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000 +require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000 From f75a8c1722548abe297f52c748643bb5bbb7728c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 22 Oct 2025 07:28:16 +0000 Subject: [PATCH 4/5] Update documentation for category-level jailbreak detection - Add category-level jailbreak configuration to jailbreak-protection.md - Update category configuration docs with jailbreak_enabled parameter - Add security-focused configuration example - Update global configuration docs with category override notes - Update README to mention fine-grained security control Co-authored-by: Xunzhuo <48784001+Xunzhuo@users.noreply.github.com> --- README.md | 2 +- website/docs/installation/configuration.md | 4 +- .../docs/overview/categories/configuration.md | 69 +++++++++++++++- .../content-safety/jailbreak-protection.md | 78 ++++++++++++++++++- 4 files changed, 148 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index c55a936ae..f856ed1b8 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ Detect PII in the prompt, avoiding sending PII to the LLM so as to protect the p #### Prompt guard -Detect if the prompt is a jailbreak prompt, 
avoiding sending jailbreak prompts to the LLM so as to prevent the LLM from misbehaving. +Detect if the prompt is a jailbreak prompt, avoiding sending jailbreak prompts to the LLM so as to prevent the LLM from misbehaving. Can be configured globally or at the category level for fine-grained security control. ### Similarity Caching ⚡️ diff --git a/website/docs/installation/configuration.md b/website/docs/installation/configuration.md index 37d0f8e05..18742ee35 100644 --- a/website/docs/installation/configuration.md +++ b/website/docs/installation/configuration.md @@ -38,7 +38,7 @@ tools: # Jailbreak protection prompt_guard: - enabled: false + enabled: false # Global default - can be overridden per category use_modernbert: true model_id: "models/jailbreak_classifier_modernbert-base_model" threshold: 0.7 @@ -84,6 +84,8 @@ categories: # Optional: Category-level cache settings # semantic_cache_enabled: true # semantic_cache_similarity_threshold: 0.9 # Higher threshold for math + # Optional: Category-level jailbreak settings + # jailbreak_enabled: true # Override global jailbreak detection - name: computer science model_scores: - model: your-model diff --git a/website/docs/overview/categories/configuration.md b/website/docs/overview/categories/configuration.md index 7bc776d0d..09082d045 100644 --- a/website/docs/overview/categories/configuration.md +++ b/website/docs/overview/categories/configuration.md @@ -83,6 +83,34 @@ curl -X PUT http://localhost:8080/config/system-prompts \ ### Reasoning Configuration +#### `jailbreak_enabled` (Optional) + +- **Type**: Boolean +- **Description**: Whether to enable jailbreak detection for this category +- **Default**: Inherits from global `prompt_guard.enabled` setting +- **Impact**: Enables or disables jailbreak protection for this specific category + +```yaml +categories: + - name: customer_support + jailbreak_enabled: true # Explicitly enable for public-facing + model_scores: + - model: qwen3 + score: 0.8 + + - name: code_generation + jailbreak_enabled: false # Disable for internal tools + model_scores: + - model: qwen3 + score: 0.9 + + - name: general + # No jailbreak_enabled - inherits from global prompt_guard.enabled + model_scores: + - model: qwen3 + score: 0.5 +``` + #### `use_reasoning` (Required) - **Type**: Boolean @@ -196,7 +224,46 @@ categories: score: 0.2 ``` -### Example 3: Multi-Category Configuration +### Example 3: Security-Focused Configuration (Jailbreak Protection) + +```yaml +categories: + # High-security public-facing category + - name: "customer_support" + description: "Customer support and general inquiries" + jailbreak_enabled: true # Strict jailbreak protection + use_reasoning: false + model_scores: + - model: "phi4" + score: 0.9 + - model: "mistral-small3.1" + score: 0.7 + + # Trusted internal development category + - name: "code_generation" + description: "Internal code generation for developers" + jailbreak_enabled: false # Allow broader input for trusted users + use_reasoning: true + reasoning_effort: "medium" + model_scores: + - model: "gemma3:27b" + score: 0.9 + - model: "phi4" + score: 0.7 + + # General category using global default + - name: "general" + description: "General queries" + # jailbreak_enabled not specified - inherits from global prompt_guard.enabled + use_reasoning: false + model_scores: + - model: "phi4" + score: 0.6 + - model: "mistral-small3.1" + score: 0.6 +``` + +### Example 4: Multi-Category Configuration ```yaml categories: diff --git a/website/docs/tutorials/content-safety/jailbreak-protection.md 
b/website/docs/tutorials/content-safety/jailbreak-protection.md index 6f3ac801c..477d4121e 100644 --- a/website/docs/tutorials/content-safety/jailbreak-protection.md +++ b/website/docs/tutorials/content-safety/jailbreak-protection.md @@ -43,7 +43,7 @@ Enable jailbreak detection in your configuration: ```yaml # config/config.yaml prompt_guard: - enabled: true + enabled: true # Global default - can be overridden per category model_id: "models/jailbreak_classifier_modernbert-base_model" threshold: 0.7 # Detection sensitivity (0.0-1.0) use_cpu: true # Run on CPU @@ -51,6 +51,51 @@ prompt_guard: jailbreak_mapping_path: "config/jailbreak_type_mapping.json" # Path to jailbreak type mapping ``` +### Category-Level Jailbreak Protection + +You can enable or disable jailbreak detection at the category level for fine-grained security control: + +```yaml +# Global default setting +prompt_guard: + enabled: true # Default for all categories + +categories: + # High-security category - explicitly enable + - name: customer_support + jailbreak_enabled: true # Strict protection for public-facing + model_scores: + - model: qwen3 + score: 0.8 + + # Internal tool - disable for trusted environment + - name: code_generation + jailbreak_enabled: false # Allow broader input for developers + model_scores: + - model: qwen3 + score: 0.9 + + # General category - inherits global setting + - name: general + # No jailbreak_enabled specified - uses global prompt_guard.enabled + model_scores: + - model: qwen3 + score: 0.5 +``` + +**Category-Level Behavior**: + +- **When `jailbreak_enabled` is not specified**: Category inherits from global `prompt_guard.enabled` +- **When `jailbreak_enabled: true`**: Jailbreak detection is explicitly enabled for this category +- **When `jailbreak_enabled: false`**: Jailbreak detection is explicitly disabled for this category +- **Category-specific setting always overrides global setting** when explicitly configured + +**Use Cases**: + +- **Enable for public-facing categories**: Customer support, business advice +- **Disable for internal tools**: Code generation for developers, testing environments +- **Inherit for general categories**: Use global default for most categories + ## How Jailbreak Protection Works The jailbreak protection system works as follows: @@ -134,9 +179,38 @@ security_policy_violations_total 45 ### 4. 
Integration with Routing - Apply stricter protection to sensitive models -- Use different thresholds for different categories +- Use category-level jailbreak settings for different domains - Combine with PII detection for comprehensive security +**Example**: Configure different jailbreak policies per category: + +```yaml +prompt_guard: + enabled: true # Global default + +categories: + # Strict protection for customer-facing categories + - name: customer_support + jailbreak_enabled: true + model_scores: + - model: safe-model + score: 0.9 + + # Relaxed protection for internal development + - name: code_generation + jailbreak_enabled: false # Allow broader input + model_scores: + - model: code-model + score: 0.9 + + # Use global default for general queries + - name: general + # Inherits from prompt_guard.enabled + model_scores: + - model: general-model + score: 0.7 +``` + ## Troubleshooting ### High False Positives From ea50e6c9cabaa3d0fe13d84bacf51dcb10937237 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 22 Oct 2025 07:48:58 +0000 Subject: [PATCH 5/5] Add category-level jailbreak threshold configuration - Add JailbreakThreshold field to Category struct - Add GetJailbreakThresholdForCategory helper method - Create CheckForJailbreakWithThreshold and AnalyzeContentForJailbreakWithThreshold methods - Update performSecurityChecks to use category-specific threshold - Add 5 comprehensive tests for threshold configuration - Update example configs with threshold tuning examples - Update documentation with threshold configuration and tuning guidelines - Add threshold tuning guide with recommendations for different category types Co-authored-by: Xunzhuo <48784001+Xunzhuo@users.noreply.github.com> --- config/config.yaml | 1 + .../examples/jailbreak_category_example.yaml | 49 +++++++---- src/semantic-router/pkg/config/config.go | 14 ++++ src/semantic-router/pkg/config/config_test.go | 84 +++++++++++++++++++ .../pkg/extproc/request_handler.go | 8 +- .../pkg/utils/classification/classifier.go | 18 +++- .../docs/overview/categories/configuration.md | 46 +++++++++- .../content-safety/jailbreak-protection.md | 36 +++++--- 8 files changed, 219 insertions(+), 37 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index ec9706ba9..06c1b60f4 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -63,6 +63,7 @@ categories: - name: business system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development. Provide practical, actionable business advice backed by proven methodologies and industry best practices. Consider market dynamics, competitive landscape, and stakeholder interests in your recommendations." 
# jailbreak_enabled: true # Optional: Override global jailbreak detection per category + # jailbreak_threshold: 0.8 # Optional: Override global jailbreak threshold per category model_scores: - model: qwen3 score: 0.7 diff --git a/config/examples/jailbreak_category_example.yaml b/config/examples/jailbreak_category_example.yaml index 064c20e57..52b84087c 100644 --- a/config/examples/jailbreak_category_example.yaml +++ b/config/examples/jailbreak_category_example.yaml @@ -1,13 +1,13 @@ # Category-Level Jailbreak Detection Example # This example demonstrates how to configure jailbreak detection at the category level -# Different categories can have different jailbreak detection settings based on their risk profiles +# Different categories can have different jailbreak detection settings and thresholds based on their risk profiles # Global jailbreak detection configuration (can be overridden per category) prompt_guard: enabled: true # Global default - can be overridden per category use_modernbert: true model_id: "models/jailbreak_classifier_modernbert-base_model" - threshold: 0.7 + threshold: 0.7 # Global default threshold - can be overridden per category use_cpu: true jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" @@ -22,30 +22,33 @@ classifier: # Categories with different jailbreak detection settings categories: - # High-security category: Enable jailbreak detection + # High-security category: Strict jailbreak detection with high threshold - name: business description: "Business queries, strategy, and professional advice" jailbreak_enabled: true # Explicitly enable (inherits from global by default) + jailbreak_threshold: 0.9 # Higher threshold for stricter detection system_prompt: "You are a professional business consultant. Provide practical, actionable business advice." model_scores: - model: qwen3 score: 0.7 use_reasoning: false - # Public-facing category: Enable jailbreak detection + # Public-facing category: Enable with standard threshold - name: customer_support description: "Customer support and general inquiries" jailbreak_enabled: true # Explicitly enable for customer-facing content + jailbreak_threshold: 0.8 # Slightly higher than global for public-facing system_prompt: "You are a friendly customer support agent. Help users with their questions." model_scores: - model: qwen3 score: 0.8 use_reasoning: false - # Internal tool category: Disable jailbreak detection (trusted environment) + # Internal tool category: Relaxed threshold (trusted environment) - name: code_generation description: "Internal code generation and development tools" - jailbreak_enabled: false # Disable for internal developer tools + jailbreak_enabled: true # Keep enabled but with relaxed threshold + jailbreak_threshold: 0.5 # Lower threshold to reduce false positives for code system_prompt: "You are a code generation assistant for internal developers." model_scores: - model: qwen3 @@ -62,10 +65,11 @@ categories: score: 0.6 use_reasoning: false - # Default category: Uses global setting (inherits prompt_guard.enabled) + # Default category: Uses global setting (inherits prompt_guard.enabled and threshold) - name: general description: "General queries that don't fit into specific categories" # jailbreak_enabled not specified - will inherit from global prompt_guard.enabled + # jailbreak_threshold not specified - will inherit from global prompt_guard.threshold (0.7) system_prompt: "You are a helpful assistant." 
model_scores: - model: qwen3 @@ -98,14 +102,25 @@ vllm_endpoints: # Usage Notes: # ============= -# 1. Global Setting (prompt_guard.enabled): Sets the default for all categories -# 2. Category Override (jailbreak_enabled): Override global setting per category -# 3. Inheritance: If jailbreak_enabled is not specified, inherits from prompt_guard.enabled -# 4. Use Cases: -# - Set jailbreak_enabled: true for high-security, public-facing categories -# - Set jailbreak_enabled: false for internal tools or trusted environments -# - Omit jailbreak_enabled to use the global default -# 5. Security Best Practices: +# 1. Global Settings: +# - prompt_guard.enabled: Sets the default enabled/disabled for all categories +# - prompt_guard.threshold: Sets the default detection threshold (0.0-1.0) for all categories +# 2. Category Overrides: +# - jailbreak_enabled: Override global enabled/disabled setting per category +# - jailbreak_threshold: Override global threshold per category +# 3. Inheritance: +# - If jailbreak_enabled is not specified, inherits from prompt_guard.enabled +# - If jailbreak_threshold is not specified, inherits from prompt_guard.threshold +# 4. Threshold Tuning: +# - Higher threshold (0.8-0.95): Stricter detection, fewer false positives, may miss subtle attacks +# - Lower threshold (0.5-0.7): More sensitive detection, catches more attacks, higher false positive rate +# - Recommended: Start with 0.7 globally, adjust per category based on risk profile +# 5. Use Cases: +# - High-security categories (business, customer_support): Use higher thresholds (0.8-0.9) +# - Internal tools with code/technical content: Use lower thresholds (0.5-0.6) to reduce false positives +# - General categories: Use global default threshold +# 6. Security Best Practices: # - Enable jailbreak detection by default (prompt_guard.enabled: true) -# - Only disable for specific categories where the risk is managed differently -# - Consider the consequences of disabling protection on a per-category basis +# - Only disable or use very low thresholds for specific categories where the risk is managed differently +# - Consider the consequences of threshold settings on a per-category basis +# - Monitor false positive and false negative rates to tune thresholds appropriately diff --git a/src/semantic-router/pkg/config/config.go b/src/semantic-router/pkg/config/config.go index 8e4ac28c5..8e5d34aab 100644 --- a/src/semantic-router/pkg/config/config.go +++ b/src/semantic-router/pkg/config/config.go @@ -373,6 +373,9 @@ type Category struct { // JailbreakEnabled controls whether jailbreak detection is enabled for this category // If nil, inherits from global PromptGuard.Enabled setting JailbreakEnabled *bool `yaml:"jailbreak_enabled,omitempty"` + // JailbreakThreshold defines the confidence threshold for jailbreak detection (0.0-1.0) + // If nil, uses the global threshold from PromptGuard.Threshold + JailbreakThreshold *float32 `yaml:"jailbreak_threshold,omitempty"` } // GetModelReasoningFamily returns the reasoning family configuration for a given model name @@ -829,3 +832,14 @@ func (c *RouterConfig) IsJailbreakEnabledForCategory(categoryName string) bool { // Fall back to global setting return c.PromptGuard.Enabled } + +// GetJailbreakThresholdForCategory returns the effective jailbreak detection threshold for a category +// Priority: category-specific > global prompt_guard threshold +func (c *RouterConfig) GetJailbreakThresholdForCategory(categoryName string) float32 { + category := c.GetCategoryByName(categoryName) + if category 
!= nil && category.JailbreakThreshold != nil { + return *category.JailbreakThreshold + } + // Fall back to global threshold + return c.PromptGuard.Threshold +} diff --git a/src/semantic-router/pkg/config/config_test.go b/src/semantic-router/pkg/config/config_test.go index 802d47cb9..ff027be35 100644 --- a/src/semantic-router/pkg/config/config_test.go +++ b/src/semantic-router/pkg/config/config_test.go @@ -2029,4 +2029,88 @@ categories: }) }) }) + + Describe("GetJailbreakThresholdForCategory", func() { + Context("when global threshold is set", func() { + It("should return global threshold for category without explicit setting", func() { + category := config.Category{ + Name: "test", + ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}}, + } + + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Threshold: 0.7, + }, + Categories: []config.Category{category}, + } + + Expect(cfg.GetJailbreakThresholdForCategory("test")).To(Equal(float32(0.7))) + }) + + It("should return category-specific threshold when set", func() { + category := config.Category{ + Name: "test", + JailbreakThreshold: config.Float32Ptr(0.9), + ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}}, + } + + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Threshold: 0.7, + }, + Categories: []config.Category{category}, + } + + Expect(cfg.GetJailbreakThresholdForCategory("test")).To(Equal(float32(0.9))) + }) + + It("should allow lower threshold override", func() { + category := config.Category{ + Name: "test", + JailbreakThreshold: config.Float32Ptr(0.5), + ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}}, + } + + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Threshold: 0.7, + }, + Categories: []config.Category{category}, + } + + Expect(cfg.GetJailbreakThresholdForCategory("test")).To(Equal(float32(0.5))) + }) + + It("should allow higher threshold override", func() { + category := config.Category{ + Name: "test", + JailbreakThreshold: config.Float32Ptr(0.95), + ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}}, + } + + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Threshold: 0.7, + }, + Categories: []config.Category{category}, + } + + Expect(cfg.GetJailbreakThresholdForCategory("test")).To(Equal(float32(0.95))) + }) + }) + + Context("when category does not exist", func() { + It("should fall back to global threshold", func() { + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Threshold: 0.8, + }, + Categories: []config.Category{}, + } + + Expect(cfg.GetJailbreakThresholdForCategory("nonexistent")).To(Equal(float32(0.8))) + }) + }) + }) }) diff --git a/src/semantic-router/pkg/extproc/request_handler.go b/src/semantic-router/pkg/extproc/request_handler.go index 0ddc8af86..d2482f934 100644 --- a/src/semantic-router/pkg/extproc/request_handler.go +++ b/src/semantic-router/pkg/extproc/request_handler.go @@ -438,6 +438,12 @@ func (r *OpenAIRouter) performSecurityChecks(ctx *RequestContext, userContent st jailbreakEnabled = jailbreakEnabled && r.Config.IsJailbreakEnabledForCategory(categoryName) } + // Get category-specific threshold + jailbreakThreshold := r.Config.PromptGuard.Threshold + if categoryName != "" && r.Config != nil { + jailbreakThreshold = r.Config.GetJailbreakThresholdForCategory(categoryName) + } + // Perform jailbreak detection on all message content if jailbreakEnabled { // Start jailbreak detection span @@ -445,7 +451,7 @@ func (r *OpenAIRouter) 
performSecurityChecks(ctx *RequestContext, userContent st defer span.End() startTime := time.Now() - hasJailbreak, jailbreakDetections, err := r.Classifier.AnalyzeContentForJailbreak(allContent) + hasJailbreak, jailbreakDetections, err := r.Classifier.AnalyzeContentForJailbreakWithThreshold(allContent, jailbreakThreshold) detectionTime := time.Since(startTime).Milliseconds() observability.SetSpanAttributes(span, diff --git a/src/semantic-router/pkg/utils/classification/classifier.go b/src/semantic-router/pkg/utils/classification/classifier.go index ac5e5c0e4..3dc820f1c 100644 --- a/src/semantic-router/pkg/utils/classification/classifier.go +++ b/src/semantic-router/pkg/utils/classification/classifier.go @@ -425,6 +425,11 @@ func (c *Classifier) initializeJailbreakClassifier() error { // CheckForJailbreak analyzes the given text for jailbreak attempts func (c *Classifier) CheckForJailbreak(text string) (bool, string, float32, error) { + return c.CheckForJailbreakWithThreshold(text, c.Config.PromptGuard.Threshold) +} + +// CheckForJailbreakWithThreshold analyzes the given text for jailbreak attempts with a custom threshold +func (c *Classifier) CheckForJailbreakWithThreshold(text string, threshold float32) (bool, string, float32, error) { if !c.IsJailbreakEnabled() { return false, "", 0.0, fmt.Errorf("jailbreak detection is not enabled or properly configured") } @@ -453,14 +458,14 @@ func (c *Classifier) CheckForJailbreak(text string) (bool, string, float32, erro } // Check if confidence meets threshold and indicates jailbreak - isJailbreak := result.Confidence >= c.Config.PromptGuard.Threshold && jailbreakType == "jailbreak" + isJailbreak := result.Confidence >= threshold && jailbreakType == "jailbreak" if isJailbreak { observability.Warnf("JAILBREAK DETECTED: '%s' (confidence: %.3f, threshold: %.3f)", - jailbreakType, result.Confidence, c.Config.PromptGuard.Threshold) + jailbreakType, result.Confidence, threshold) } else { observability.Infof("BENIGN: '%s' (confidence: %.3f, threshold: %.3f)", - jailbreakType, result.Confidence, c.Config.PromptGuard.Threshold) + jailbreakType, result.Confidence, threshold) } return isJailbreak, jailbreakType, result.Confidence, nil @@ -468,6 +473,11 @@ func (c *Classifier) CheckForJailbreak(text string) (bool, string, float32, erro // AnalyzeContentForJailbreak analyzes multiple content pieces for jailbreak attempts func (c *Classifier) AnalyzeContentForJailbreak(contentList []string) (bool, []JailbreakDetection, error) { + return c.AnalyzeContentForJailbreakWithThreshold(contentList, c.Config.PromptGuard.Threshold) +} + +// AnalyzeContentForJailbreakWithThreshold analyzes multiple content pieces for jailbreak attempts with a custom threshold +func (c *Classifier) AnalyzeContentForJailbreakWithThreshold(contentList []string, threshold float32) (bool, []JailbreakDetection, error) { if !c.IsJailbreakEnabled() { return false, nil, fmt.Errorf("jailbreak detection is not enabled or properly configured") } @@ -480,7 +490,7 @@ func (c *Classifier) AnalyzeContentForJailbreak(contentList []string) (bool, []J continue } - isJailbreak, jailbreakType, confidence, err := c.CheckForJailbreak(content) + isJailbreak, jailbreakType, confidence, err := c.CheckForJailbreakWithThreshold(content, threshold) if err != nil { observability.Errorf("Error analyzing content %d: %v", i, err) continue diff --git a/website/docs/overview/categories/configuration.md b/website/docs/overview/categories/configuration.md index 09082d045..9a274ec90 100644 --- 
a/website/docs/overview/categories/configuration.md +++ b/website/docs/overview/categories/configuration.md @@ -111,6 +111,42 @@ categories: score: 0.5 ``` +#### `jailbreak_threshold` (Optional) + +- **Type**: Float (0.0-1.0) +- **Description**: Confidence threshold for jailbreak detection +- **Default**: Inherits from global `prompt_guard.threshold` setting +- **Impact**: Controls sensitivity of jailbreak detection for this category +- **Tuning**: Higher values = stricter (fewer false positives), Lower values = more sensitive (catches more attacks) + +```yaml +categories: + - name: customer_support + jailbreak_enabled: true + jailbreak_threshold: 0.9 # Strict detection for public-facing + model_scores: + - model: qwen3 + score: 0.8 + + - name: code_generation + jailbreak_enabled: true + jailbreak_threshold: 0.5 # Relaxed to reduce false positives on code + model_scores: + - model: qwen3 + score: 0.9 + + - name: general + # No jailbreak_threshold - inherits from global prompt_guard.threshold + model_scores: + - model: qwen3 + score: 0.5 +``` + +**Threshold Guidelines**: +- **0.8-0.95**: High-security categories (customer support, business) +- **0.6-0.8**: Standard categories (general queries) +- **0.4-0.6**: Technical categories (code generation, development tools) + #### `use_reasoning` (Required) - **Type**: Boolean @@ -228,10 +264,11 @@ categories: ```yaml categories: - # High-security public-facing category + # High-security public-facing category with strict threshold - name: "customer_support" description: "Customer support and general inquiries" jailbreak_enabled: true # Strict jailbreak protection + jailbreak_threshold: 0.9 # High threshold for public-facing use_reasoning: false model_scores: - model: "phi4" @@ -239,10 +276,11 @@ categories: - model: "mistral-small3.1" score: 0.7 - # Trusted internal development category + # Technical category with relaxed threshold - name: "code_generation" - description: "Internal code generation for developers" - jailbreak_enabled: false # Allow broader input for trusted users + description: "Code generation for developers" + jailbreak_enabled: true # Keep enabled + jailbreak_threshold: 0.5 # Lower threshold to reduce false positives on code use_reasoning: true reasoning_effort: "medium" model_scores: diff --git a/website/docs/tutorials/content-safety/jailbreak-protection.md b/website/docs/tutorials/content-safety/jailbreak-protection.md index 477d4121e..60774e60f 100644 --- a/website/docs/tutorials/content-safety/jailbreak-protection.md +++ b/website/docs/tutorials/content-safety/jailbreak-protection.md @@ -53,31 +53,35 @@ prompt_guard: ### Category-Level Jailbreak Protection -You can enable or disable jailbreak detection at the category level for fine-grained security control: +You can configure jailbreak detection at the category level for fine-grained security control, including both enabling/disabling and threshold customization: ```yaml -# Global default setting +# Global default settings prompt_guard: enabled: true # Default for all categories + threshold: 0.7 # Default threshold for all categories categories: - # High-security category - explicitly enable + # High-security category - strict protection with high threshold - name: customer_support jailbreak_enabled: true # Strict protection for public-facing + jailbreak_threshold: 0.9 # Higher threshold for stricter detection model_scores: - model: qwen3 score: 0.8 - # Internal tool - disable for trusted environment + # Internal tool - relaxed threshold for code/technical content - name: 
code_generation - jailbreak_enabled: false # Allow broader input for developers + jailbreak_enabled: true # Keep enabled but with relaxed threshold + jailbreak_threshold: 0.5 # Lower threshold to reduce false positives model_scores: - model: qwen3 score: 0.9 - # General category - inherits global setting + # General category - inherits global settings - name: general - # No jailbreak_enabled specified - uses global prompt_guard.enabled + # No jailbreak_enabled or jailbreak_threshold specified + # Uses global prompt_guard.enabled (true) and threshold (0.7) model_scores: - model: qwen3 score: 0.5 @@ -88,13 +92,23 @@ categories: - **When `jailbreak_enabled` is not specified**: Category inherits from global `prompt_guard.enabled` - **When `jailbreak_enabled: true`**: Jailbreak detection is explicitly enabled for this category - **When `jailbreak_enabled: false`**: Jailbreak detection is explicitly disabled for this category -- **Category-specific setting always overrides global setting** when explicitly configured +- **When `jailbreak_threshold` is not specified**: Category inherits from global `prompt_guard.threshold` +- **When `jailbreak_threshold: 0.X`**: Uses category-specific threshold (0.0-1.0) +- **Category-specific settings always override global settings** when explicitly configured + +**Threshold Tuning Guide**: + +- **High threshold (0.8-0.95)**: Stricter detection, fewer false positives, may miss subtle attacks +- **Medium threshold (0.6-0.8)**: Balanced detection, good for most use cases +- **Low threshold (0.4-0.6)**: More sensitive, catches more attacks, higher false positive rate +- **Recommended**: Start with 0.7 globally, adjust per category based on risk profile and false positive tolerance **Use Cases**: -- **Enable for public-facing categories**: Customer support, business advice -- **Disable for internal tools**: Code generation for developers, testing environments -- **Inherit for general categories**: Use global default for most categories +- **High-security categories (0.8-0.9 threshold)**: Customer support, business advice, public-facing APIs +- **Technical categories (0.5-0.6 threshold)**: Code generation, developer tools (reduce false positives on technical jargon) +- **Internal tools (0.5 threshold or disabled)**: Testing environments, trusted internal applications +- **General categories (inherit global)**: Use global default for most categories ## How Jailbreak Protection Works
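
The patch series above adds two config helpers, `IsJailbreakEnabledForCategory` and `GetJailbreakThresholdForCategory`, that resolve per-category overrides against the global `prompt_guard` defaults. The standalone Go sketch below (not part of the patch) mirrors that resolution logic to illustrate the inheritance behavior; the struct shapes are trimmed to the relevant fields, and the sample category values plus the local `boolPtr`/`float32Ptr` helpers are illustrative assumptions rather than the repository's actual definitions.

```go
// Minimal sketch (not part of the patch): how the helpers added in
// src/semantic-router/pkg/config/config.go resolve the effective jailbreak
// policy for a category. Struct shapes are trimmed to the relevant fields;
// the sample values and the boolPtr/float32Ptr helpers are illustrative only.
package main

import "fmt"

type PromptGuardConfig struct {
	Enabled   bool
	Threshold float32
}

type Category struct {
	Name               string
	JailbreakEnabled   *bool    // nil => inherit PromptGuard.Enabled
	JailbreakThreshold *float32 // nil => inherit PromptGuard.Threshold
}

type RouterConfig struct {
	PromptGuard PromptGuardConfig
	Categories  []Category
}

func (c *RouterConfig) getCategoryByName(name string) *Category {
	for i := range c.Categories {
		if c.Categories[i].Name == name {
			return &c.Categories[i]
		}
	}
	return nil
}

// Category override wins; otherwise fall back to the global prompt_guard.enabled.
func (c *RouterConfig) IsJailbreakEnabledForCategory(name string) bool {
	if cat := c.getCategoryByName(name); cat != nil && cat.JailbreakEnabled != nil {
		return *cat.JailbreakEnabled
	}
	return c.PromptGuard.Enabled
}

// Category override wins; otherwise fall back to the global prompt_guard.threshold.
func (c *RouterConfig) GetJailbreakThresholdForCategory(name string) float32 {
	if cat := c.getCategoryByName(name); cat != nil && cat.JailbreakThreshold != nil {
		return *cat.JailbreakThreshold
	}
	return c.PromptGuard.Threshold
}

func boolPtr(b bool) *bool          { return &b }
func float32Ptr(f float32) *float32 { return &f }

func main() {
	cfg := &RouterConfig{
		PromptGuard: PromptGuardConfig{Enabled: true, Threshold: 0.7},
		Categories: []Category{
			{Name: "customer_support", JailbreakEnabled: boolPtr(true), JailbreakThreshold: float32Ptr(0.9)},
			{Name: "code_generation", JailbreakThreshold: float32Ptr(0.5)},
			{Name: "general"}, // no overrides: inherits both globals
		},
	}
	for _, name := range []string{"customer_support", "code_generation", "general", "nonexistent"} {
		fmt.Printf("%s: enabled=%v threshold=%.2f\n",
			name, cfg.IsJailbreakEnabledForCategory(name), cfg.GetJailbreakThresholdForCategory(name))
	}
}
```

Running the sketch prints the effective settings per category (for example, `general: enabled=true threshold=0.70`, inheriting both globals, and `nonexistent: enabled=true threshold=0.70` for an unknown category), which is the same inheritance behavior exercised by the new Ginkgo tests added to config_test.go in patches 2/5 and 5/5.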