From 4020e87aa1dc893384a261d803074b5d48ed7c04 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 22 Oct 2025 06:08:14 +0000 Subject: [PATCH 1/5] Initial plan From 1e384ef7356fe518e4a856ffa8e60fb80cbb14d7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 22 Oct 2025 06:22:52 +0000 Subject: [PATCH 2/5] Add category-level jailbreak detection configuration Co-authored-by: Xunzhuo <48784001+Xunzhuo@users.noreply.github.com> --- .../examples/jailbreak_category_example.yaml | 111 ++++++++++++++++ src/semantic-router/pkg/config/config.go | 14 +++ src/semantic-router/pkg/config/config_test.go | 119 ++++++++++++++++++ .../pkg/extproc/request_handler.go | 25 ++-- 4 files changed, 260 insertions(+), 9 deletions(-) create mode 100644 config/examples/jailbreak_category_example.yaml diff --git a/config/examples/jailbreak_category_example.yaml b/config/examples/jailbreak_category_example.yaml new file mode 100644 index 000000000..064c20e57 --- /dev/null +++ b/config/examples/jailbreak_category_example.yaml @@ -0,0 +1,111 @@ +# Category-Level Jailbreak Detection Example +# This example demonstrates how to configure jailbreak detection at the category level +# Different categories can have different jailbreak detection settings based on their risk profiles + +# Global jailbreak detection configuration (can be overridden per category) +prompt_guard: + enabled: true # Global default - can be overridden per category + use_modernbert: true + model_id: "models/jailbreak_classifier_modernbert-base_model" + threshold: 0.7 + use_cpu: true + jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" + +# Classifier configuration +classifier: + category_model: + model_id: "models/category_classifier_modernbert-base_model" + use_modernbert: true + threshold: 0.6 + use_cpu: true + category_mapping_path: "models/category_classifier_modernbert-base_model/category_mapping.json" + +# Categories with different jailbreak detection settings +categories: + # High-security category: Enable jailbreak detection + - name: business + description: "Business queries, strategy, and professional advice" + jailbreak_enabled: true # Explicitly enable (inherits from global by default) + system_prompt: "You are a professional business consultant. Provide practical, actionable business advice." + model_scores: + - model: qwen3 + score: 0.7 + use_reasoning: false + + # Public-facing category: Enable jailbreak detection + - name: customer_support + description: "Customer support and general inquiries" + jailbreak_enabled: true # Explicitly enable for customer-facing content + system_prompt: "You are a friendly customer support agent. Help users with their questions." + model_scores: + - model: qwen3 + score: 0.8 + use_reasoning: false + + # Internal tool category: Disable jailbreak detection (trusted environment) + - name: code_generation + description: "Internal code generation and development tools" + jailbreak_enabled: false # Disable for internal developer tools + system_prompt: "You are a code generation assistant for internal developers." + model_scores: + - model: qwen3 + score: 0.9 + use_reasoning: true + + # Testing category: Disable jailbreak detection + - name: testing + description: "Testing and quality assurance queries" + jailbreak_enabled: false # Disable for testing purposes + system_prompt: "You are a QA assistant helping with test scenarios." 
+ model_scores: + - model: qwen3 + score: 0.6 + use_reasoning: false + + # Default category: Uses global setting (inherits prompt_guard.enabled) + - name: general + description: "General queries that don't fit into specific categories" + # jailbreak_enabled not specified - will inherit from global prompt_guard.enabled + system_prompt: "You are a helpful assistant." + model_scores: + - model: qwen3 + score: 0.5 + use_reasoning: false + +# Model configuration +model_config: + "qwen3": + reasoning_family: "qwen3" + preferred_endpoints: ["endpoint1"] + pii_policy: + allow_by_default: true + +# Reasoning family configurations +reasoning_families: + qwen3: + type: "chat_template_kwargs" + parameter: "thinking" + +# Default model for fallback +default_model: qwen3 + +# vLLM endpoints configuration +vllm_endpoints: + - name: "endpoint1" + address: "127.0.0.1" + port: 8000 + weight: 1 + +# Usage Notes: +# ============= +# 1. Global Setting (prompt_guard.enabled): Sets the default for all categories +# 2. Category Override (jailbreak_enabled): Override global setting per category +# 3. Inheritance: If jailbreak_enabled is not specified, inherits from prompt_guard.enabled +# 4. Use Cases: +# - Set jailbreak_enabled: true for high-security, public-facing categories +# - Set jailbreak_enabled: false for internal tools or trusted environments +# - Omit jailbreak_enabled to use the global default +# 5. Security Best Practices: +# - Enable jailbreak detection by default (prompt_guard.enabled: true) +# - Only disable for specific categories where the risk is managed differently +# - Consider the consequences of disabling protection on a per-category basis diff --git a/src/semantic-router/pkg/config/config.go b/src/semantic-router/pkg/config/config.go index 9766d4733..8e4ac28c5 100644 --- a/src/semantic-router/pkg/config/config.go +++ b/src/semantic-router/pkg/config/config.go @@ -370,6 +370,9 @@ type Category struct { // SemanticCacheSimilarityThreshold defines the minimum similarity score for cache hits (0.0-1.0) // If nil, uses the global threshold from SemanticCache.SimilarityThreshold or BertModel.Threshold SemanticCacheSimilarityThreshold *float32 `yaml:"semantic_cache_similarity_threshold,omitempty"` + // JailbreakEnabled controls whether jailbreak detection is enabled for this category + // If nil, inherits from global PromptGuard.Enabled setting + JailbreakEnabled *bool `yaml:"jailbreak_enabled,omitempty"` } // GetModelReasoningFamily returns the reasoning family configuration for a given model name @@ -815,3 +818,14 @@ func (c *RouterConfig) GetCacheSimilarityThresholdForCategory(categoryName strin // Fall back to global cache threshold or bert threshold return c.GetCacheSimilarityThreshold() } + +// IsJailbreakEnabledForCategory returns whether jailbreak detection is enabled for a specific category +// If the category has an explicit setting, it takes precedence; otherwise, uses global setting +func (c *RouterConfig) IsJailbreakEnabledForCategory(categoryName string) bool { + category := c.GetCategoryByName(categoryName) + if category != nil && category.JailbreakEnabled != nil { + return *category.JailbreakEnabled + } + // Fall back to global setting + return c.PromptGuard.Enabled +} diff --git a/src/semantic-router/pkg/config/config_test.go b/src/semantic-router/pkg/config/config_test.go index 8a34f3998..802d47cb9 100644 --- a/src/semantic-router/pkg/config/config_test.go +++ b/src/semantic-router/pkg/config/config_test.go @@ -1910,4 +1910,123 @@ categories: }) }) }) + + 
Describe("IsJailbreakEnabledForCategory", func() { + Context("when global jailbreak is enabled", func() { + It("should return true for category without explicit setting", func() { + category := config.Category{ + Name: "test", + ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}}, + } + + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Enabled: true, + }, + Categories: []config.Category{category}, + } + + Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeTrue()) + }) + + It("should return false when category explicitly disables jailbreak", func() { + category := config.Category{ + Name: "test", + JailbreakEnabled: config.BoolPtr(false), + ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}}, + } + + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Enabled: true, + }, + Categories: []config.Category{category}, + } + + Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeFalse()) + }) + + It("should return true when category explicitly enables jailbreak", func() { + category := config.Category{ + Name: "test", + JailbreakEnabled: config.BoolPtr(true), + ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}}, + } + + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Enabled: true, + }, + Categories: []config.Category{category}, + } + + Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeTrue()) + }) + }) + + Context("when global jailbreak is disabled", func() { + It("should return false for category without explicit setting", func() { + category := config.Category{ + Name: "test", + ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}}, + } + + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Enabled: false, + }, + Categories: []config.Category{category}, + } + + Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeFalse()) + }) + + It("should return true when category explicitly enables jailbreak", func() { + category := config.Category{ + Name: "test", + JailbreakEnabled: config.BoolPtr(true), + ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}}, + } + + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Enabled: false, + }, + Categories: []config.Category{category}, + } + + Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeTrue()) + }) + + It("should return false when category explicitly disables jailbreak", func() { + category := config.Category{ + Name: "test", + JailbreakEnabled: config.BoolPtr(false), + ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}}, + } + + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Enabled: false, + }, + Categories: []config.Category{category}, + } + + Expect(cfg.IsJailbreakEnabledForCategory("test")).To(BeFalse()) + }) + }) + + Context("when category does not exist", func() { + It("should fall back to global setting", func() { + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Enabled: true, + }, + Categories: []config.Category{}, + } + + Expect(cfg.IsJailbreakEnabledForCategory("nonexistent")).To(BeTrue()) + }) + }) + }) }) diff --git a/src/semantic-router/pkg/extproc/request_handler.go b/src/semantic-router/pkg/extproc/request_handler.go index e90f47457..0ddc8af86 100644 --- a/src/semantic-router/pkg/extproc/request_handler.go +++ b/src/semantic-router/pkg/extproc/request_handler.go @@ -396,12 +396,7 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo // Get content from messages userContent, 
nonUserMessages := extractUserAndNonUserContent(openAIRequest) - // Perform security checks - if response, shouldReturn := r.performSecurityChecks(ctx, userContent, nonUserMessages); shouldReturn { - return response, nil - } - - // Classify the request early to determine category for cache settings + // Classify the request early to determine category for security checks and cache settings var categoryName string if r.Config != nil && r.Config.IsAutoModelName(originalModel) && (len(nonUserMessages) > 0 || userContent != "") { // Determine text to use for classification @@ -417,6 +412,11 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo } } + // Perform security checks with category-specific settings + if response, shouldReturn := r.performSecurityChecks(ctx, userContent, nonUserMessages, categoryName); shouldReturn { + return response, nil + } + // Handle caching with category-specific settings if response, shouldReturn := r.handleCaching(ctx, categoryName); shouldReturn { return response, nil @@ -426,13 +426,20 @@ func (r *OpenAIRouter) handleRequestBody(v *ext_proc.ProcessingRequest_RequestBo return r.handleModelRouting(openAIRequest, originalModel, userContent, nonUserMessages, ctx) } -// performSecurityChecks performs PII and jailbreak detection -func (r *OpenAIRouter) performSecurityChecks(ctx *RequestContext, userContent string, nonUserMessages []string) (*ext_proc.ProcessingResponse, bool) { +// performSecurityChecks performs PII and jailbreak detection with category-specific settings +func (r *OpenAIRouter) performSecurityChecks(ctx *RequestContext, userContent string, nonUserMessages []string, categoryName string) (*ext_proc.ProcessingResponse, bool) { // Perform PII classification on all message content allContent := pii.ExtractAllContent(userContent, nonUserMessages) + // Check if jailbreak detection is enabled for this category + jailbreakEnabled := r.Classifier.IsJailbreakEnabled() + if categoryName != "" && r.Config != nil { + // Use category-specific setting if available + jailbreakEnabled = jailbreakEnabled && r.Config.IsJailbreakEnabledForCategory(categoryName) + } + // Perform jailbreak detection on all message content - if r.Classifier.IsJailbreakEnabled() { + if jailbreakEnabled { // Start jailbreak detection span spanCtx, span := observability.StartSpan(ctx.TraceContext, observability.SpanJailbreakDetection) defer span.End() From 458d7e7c17e2b509f7d26a6dfe5e2846eb8efff6 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 22 Oct 2025 06:26:18 +0000 Subject: [PATCH 3/5] Add documentation for category-level jailbreak settings Co-authored-by: Xunzhuo <48784001+Xunzhuo@users.noreply.github.com> --- config/config.yaml | 3 ++- .../training_lora/classifier_model_fine_tuning_lora/go.mod | 2 +- src/training/training_lora/pii_model_fine_tuning_lora/go.mod | 2 +- .../training_lora/prompt_guard_fine_tuning_lora/go.mod | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 279feb67a..ec9706ba9 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -19,7 +19,7 @@ tools: fallback_to_empty: true prompt_guard: - enabled: true + enabled: true # Global default - can be overridden per category with jailbreak_enabled use_modernbert: true model_id: "models/jailbreak_classifier_modernbert-base_model" threshold: 0.7 @@ -62,6 +62,7 @@ classifier: categories: - name: business system_prompt: "You are a senior business consultant and 
strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development. Provide practical, actionable business advice backed by proven methodologies and industry best practices. Consider market dynamics, competitive landscape, and stakeholder interests in your recommendations." + # jailbreak_enabled: true # Optional: Override global jailbreak detection per category model_scores: - model: qwen3 score: 0.7 diff --git a/src/training/training_lora/classifier_model_fine_tuning_lora/go.mod b/src/training/training_lora/classifier_model_fine_tuning_lora/go.mod index 338e6383f..d71496c7d 100644 --- a/src/training/training_lora/classifier_model_fine_tuning_lora/go.mod +++ b/src/training/training_lora/classifier_model_fine_tuning_lora/go.mod @@ -4,4 +4,4 @@ go 1.24.1 replace github.com/vllm-project/semantic-router/candle-binding => ../../../../candle-binding -require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000 +require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000 diff --git a/src/training/training_lora/pii_model_fine_tuning_lora/go.mod b/src/training/training_lora/pii_model_fine_tuning_lora/go.mod index 99bdf1528..c6f84bae1 100644 --- a/src/training/training_lora/pii_model_fine_tuning_lora/go.mod +++ b/src/training/training_lora/pii_model_fine_tuning_lora/go.mod @@ -4,4 +4,4 @@ go 1.24.1 replace github.com/vllm-project/semantic-router/candle-binding => ../../../../candle-binding -require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000 +require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000 diff --git a/src/training/training_lora/prompt_guard_fine_tuning_lora/go.mod b/src/training/training_lora/prompt_guard_fine_tuning_lora/go.mod index 6195d9f1b..869a3f418 100644 --- a/src/training/training_lora/prompt_guard_fine_tuning_lora/go.mod +++ b/src/training/training_lora/prompt_guard_fine_tuning_lora/go.mod @@ -4,4 +4,4 @@ go 1.24.1 replace github.com/vllm-project/semantic-router/candle-binding => ../../../../candle-binding -require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000 +require github.com/vllm-project/semantic-router/candle-binding v0.0.0-00010101000000-000000000000 From f75a8c1722548abe297f52c748643bb5bbb7728c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 22 Oct 2025 07:28:16 +0000 Subject: [PATCH 4/5] Update documentation for category-level jailbreak detection - Add category-level jailbreak configuration to jailbreak-protection.md - Update category configuration docs with jailbreak_enabled parameter - Add security-focused configuration example - Update global configuration docs with category override notes - Update README to mention fine-grained security control Co-authored-by: Xunzhuo <48784001+Xunzhuo@users.noreply.github.com> --- README.md | 2 +- website/docs/installation/configuration.md | 4 +- .../docs/overview/categories/configuration.md | 69 +++++++++++++++- .../content-safety/jailbreak-protection.md | 78 ++++++++++++++++++- 4 files changed, 148 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index c55a936ae..f856ed1b8 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ Detect PII in the prompt, avoiding sending PII to the LLM so as to protect the p #### Prompt guard -Detect if the prompt is a jailbreak prompt, 
avoiding sending jailbreak prompts to the LLM so as to prevent the LLM from misbehaving. +Detect if the prompt is a jailbreak prompt, avoiding sending jailbreak prompts to the LLM so as to prevent the LLM from misbehaving. Can be configured globally or at the category level for fine-grained security control. ### Similarity Caching ⚡️ diff --git a/website/docs/installation/configuration.md b/website/docs/installation/configuration.md index 37d0f8e05..18742ee35 100644 --- a/website/docs/installation/configuration.md +++ b/website/docs/installation/configuration.md @@ -38,7 +38,7 @@ tools: # Jailbreak protection prompt_guard: - enabled: false + enabled: false # Global default - can be overridden per category use_modernbert: true model_id: "models/jailbreak_classifier_modernbert-base_model" threshold: 0.7 @@ -84,6 +84,8 @@ categories: # Optional: Category-level cache settings # semantic_cache_enabled: true # semantic_cache_similarity_threshold: 0.9 # Higher threshold for math + # Optional: Category-level jailbreak settings + # jailbreak_enabled: true # Override global jailbreak detection - name: computer science model_scores: - model: your-model diff --git a/website/docs/overview/categories/configuration.md b/website/docs/overview/categories/configuration.md index 7bc776d0d..09082d045 100644 --- a/website/docs/overview/categories/configuration.md +++ b/website/docs/overview/categories/configuration.md @@ -83,6 +83,34 @@ curl -X PUT http://localhost:8080/config/system-prompts \ ### Reasoning Configuration +#### `jailbreak_enabled` (Optional) + +- **Type**: Boolean +- **Description**: Whether to enable jailbreak detection for this category +- **Default**: Inherits from global `prompt_guard.enabled` setting +- **Impact**: Enables or disables jailbreak protection for this specific category + +```yaml +categories: + - name: customer_support + jailbreak_enabled: true # Explicitly enable for public-facing + model_scores: + - model: qwen3 + score: 0.8 + + - name: code_generation + jailbreak_enabled: false # Disable for internal tools + model_scores: + - model: qwen3 + score: 0.9 + + - name: general + # No jailbreak_enabled - inherits from global prompt_guard.enabled + model_scores: + - model: qwen3 + score: 0.5 +``` + #### `use_reasoning` (Required) - **Type**: Boolean @@ -196,7 +224,46 @@ categories: score: 0.2 ``` -### Example 3: Multi-Category Configuration +### Example 3: Security-Focused Configuration (Jailbreak Protection) + +```yaml +categories: + # High-security public-facing category + - name: "customer_support" + description: "Customer support and general inquiries" + jailbreak_enabled: true # Strict jailbreak protection + use_reasoning: false + model_scores: + - model: "phi4" + score: 0.9 + - model: "mistral-small3.1" + score: 0.7 + + # Trusted internal development category + - name: "code_generation" + description: "Internal code generation for developers" + jailbreak_enabled: false # Allow broader input for trusted users + use_reasoning: true + reasoning_effort: "medium" + model_scores: + - model: "gemma3:27b" + score: 0.9 + - model: "phi4" + score: 0.7 + + # General category using global default + - name: "general" + description: "General queries" + # jailbreak_enabled not specified - inherits from global prompt_guard.enabled + use_reasoning: false + model_scores: + - model: "phi4" + score: 0.6 + - model: "mistral-small3.1" + score: 0.6 +``` + +### Example 4: Multi-Category Configuration ```yaml categories: diff --git a/website/docs/tutorials/content-safety/jailbreak-protection.md 
b/website/docs/tutorials/content-safety/jailbreak-protection.md index 6f3ac801c..477d4121e 100644 --- a/website/docs/tutorials/content-safety/jailbreak-protection.md +++ b/website/docs/tutorials/content-safety/jailbreak-protection.md @@ -43,7 +43,7 @@ Enable jailbreak detection in your configuration: ```yaml # config/config.yaml prompt_guard: - enabled: true + enabled: true # Global default - can be overridden per category model_id: "models/jailbreak_classifier_modernbert-base_model" threshold: 0.7 # Detection sensitivity (0.0-1.0) use_cpu: true # Run on CPU @@ -51,6 +51,51 @@ prompt_guard: jailbreak_mapping_path: "config/jailbreak_type_mapping.json" # Path to jailbreak type mapping ``` +### Category-Level Jailbreak Protection + +You can enable or disable jailbreak detection at the category level for fine-grained security control: + +```yaml +# Global default setting +prompt_guard: + enabled: true # Default for all categories + +categories: + # High-security category - explicitly enable + - name: customer_support + jailbreak_enabled: true # Strict protection for public-facing + model_scores: + - model: qwen3 + score: 0.8 + + # Internal tool - disable for trusted environment + - name: code_generation + jailbreak_enabled: false # Allow broader input for developers + model_scores: + - model: qwen3 + score: 0.9 + + # General category - inherits global setting + - name: general + # No jailbreak_enabled specified - uses global prompt_guard.enabled + model_scores: + - model: qwen3 + score: 0.5 +``` + +**Category-Level Behavior**: + +- **When `jailbreak_enabled` is not specified**: Category inherits from global `prompt_guard.enabled` +- **When `jailbreak_enabled: true`**: Jailbreak detection is explicitly enabled for this category +- **When `jailbreak_enabled: false`**: Jailbreak detection is explicitly disabled for this category +- **Category-specific setting always overrides global setting** when explicitly configured + +**Use Cases**: + +- **Enable for public-facing categories**: Customer support, business advice +- **Disable for internal tools**: Code generation for developers, testing environments +- **Inherit for general categories**: Use global default for most categories + ## How Jailbreak Protection Works The jailbreak protection system works as follows: @@ -134,9 +179,38 @@ security_policy_violations_total 45 ### 4. 
Integration with Routing - Apply stricter protection to sensitive models -- Use different thresholds for different categories +- Use category-level jailbreak settings for different domains - Combine with PII detection for comprehensive security +**Example**: Configure different jailbreak policies per category: + +```yaml +prompt_guard: + enabled: true # Global default + +categories: + # Strict protection for customer-facing categories + - name: customer_support + jailbreak_enabled: true + model_scores: + - model: safe-model + score: 0.9 + + # Relaxed protection for internal development + - name: code_generation + jailbreak_enabled: false # Allow broader input + model_scores: + - model: code-model + score: 0.9 + + # Use global default for general queries + - name: general + # Inherits from prompt_guard.enabled + model_scores: + - model: general-model + score: 0.7 +``` + ## Troubleshooting ### High False Positives From ea50e6c9cabaa3d0fe13d84bacf51dcb10937237 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 22 Oct 2025 07:48:58 +0000 Subject: [PATCH 5/5] Add category-level jailbreak threshold configuration - Add JailbreakThreshold field to Category struct - Add GetJailbreakThresholdForCategory helper method - Create CheckForJailbreakWithThreshold and AnalyzeContentForJailbreakWithThreshold methods - Update performSecurityChecks to use category-specific threshold - Add 5 comprehensive tests for threshold configuration - Update example configs with threshold tuning examples - Update documentation with threshold configuration and tuning guidelines - Add threshold tuning guide with recommendations for different category types Co-authored-by: Xunzhuo <48784001+Xunzhuo@users.noreply.github.com> --- config/config.yaml | 1 + .../examples/jailbreak_category_example.yaml | 49 +++++++---- src/semantic-router/pkg/config/config.go | 14 ++++ src/semantic-router/pkg/config/config_test.go | 84 +++++++++++++++++++ .../pkg/extproc/request_handler.go | 8 +- .../pkg/utils/classification/classifier.go | 18 +++- .../docs/overview/categories/configuration.md | 46 +++++++++- .../content-safety/jailbreak-protection.md | 36 +++++--- 8 files changed, 219 insertions(+), 37 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index ec9706ba9..06c1b60f4 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -63,6 +63,7 @@ categories: - name: business system_prompt: "You are a senior business consultant and strategic advisor with expertise in corporate strategy, operations management, financial analysis, marketing, and organizational development. Provide practical, actionable business advice backed by proven methodologies and industry best practices. Consider market dynamics, competitive landscape, and stakeholder interests in your recommendations." 
# jailbreak_enabled: true # Optional: Override global jailbreak detection per category + # jailbreak_threshold: 0.8 # Optional: Override global jailbreak threshold per category model_scores: - model: qwen3 score: 0.7 diff --git a/config/examples/jailbreak_category_example.yaml b/config/examples/jailbreak_category_example.yaml index 064c20e57..52b84087c 100644 --- a/config/examples/jailbreak_category_example.yaml +++ b/config/examples/jailbreak_category_example.yaml @@ -1,13 +1,13 @@ # Category-Level Jailbreak Detection Example # This example demonstrates how to configure jailbreak detection at the category level -# Different categories can have different jailbreak detection settings based on their risk profiles +# Different categories can have different jailbreak detection settings and thresholds based on their risk profiles # Global jailbreak detection configuration (can be overridden per category) prompt_guard: enabled: true # Global default - can be overridden per category use_modernbert: true model_id: "models/jailbreak_classifier_modernbert-base_model" - threshold: 0.7 + threshold: 0.7 # Global default threshold - can be overridden per category use_cpu: true jailbreak_mapping_path: "models/jailbreak_classifier_modernbert-base_model/jailbreak_type_mapping.json" @@ -22,30 +22,33 @@ classifier: # Categories with different jailbreak detection settings categories: - # High-security category: Enable jailbreak detection + # High-security category: Strict jailbreak detection with high threshold - name: business description: "Business queries, strategy, and professional advice" jailbreak_enabled: true # Explicitly enable (inherits from global by default) + jailbreak_threshold: 0.9 # Higher threshold for stricter detection system_prompt: "You are a professional business consultant. Provide practical, actionable business advice." model_scores: - model: qwen3 score: 0.7 use_reasoning: false - # Public-facing category: Enable jailbreak detection + # Public-facing category: Enable with standard threshold - name: customer_support description: "Customer support and general inquiries" jailbreak_enabled: true # Explicitly enable for customer-facing content + jailbreak_threshold: 0.8 # Slightly higher than global for public-facing system_prompt: "You are a friendly customer support agent. Help users with their questions." model_scores: - model: qwen3 score: 0.8 use_reasoning: false - # Internal tool category: Disable jailbreak detection (trusted environment) + # Internal tool category: Relaxed threshold (trusted environment) - name: code_generation description: "Internal code generation and development tools" - jailbreak_enabled: false # Disable for internal developer tools + jailbreak_enabled: true # Keep enabled but with relaxed threshold + jailbreak_threshold: 0.5 # Lower threshold to reduce false positives for code system_prompt: "You are a code generation assistant for internal developers." model_scores: - model: qwen3 @@ -62,10 +65,11 @@ categories: score: 0.6 use_reasoning: false - # Default category: Uses global setting (inherits prompt_guard.enabled) + # Default category: Uses global setting (inherits prompt_guard.enabled and threshold) - name: general description: "General queries that don't fit into specific categories" # jailbreak_enabled not specified - will inherit from global prompt_guard.enabled + # jailbreak_threshold not specified - will inherit from global prompt_guard.threshold (0.7) system_prompt: "You are a helpful assistant." 
model_scores: - model: qwen3 @@ -98,14 +102,25 @@ vllm_endpoints: # Usage Notes: # ============= -# 1. Global Setting (prompt_guard.enabled): Sets the default for all categories -# 2. Category Override (jailbreak_enabled): Override global setting per category -# 3. Inheritance: If jailbreak_enabled is not specified, inherits from prompt_guard.enabled -# 4. Use Cases: -# - Set jailbreak_enabled: true for high-security, public-facing categories -# - Set jailbreak_enabled: false for internal tools or trusted environments -# - Omit jailbreak_enabled to use the global default -# 5. Security Best Practices: +# 1. Global Settings: +# - prompt_guard.enabled: Sets the default enabled/disabled for all categories +# - prompt_guard.threshold: Sets the default detection threshold (0.0-1.0) for all categories +# 2. Category Overrides: +# - jailbreak_enabled: Override global enabled/disabled setting per category +# - jailbreak_threshold: Override global threshold per category +# 3. Inheritance: +# - If jailbreak_enabled is not specified, inherits from prompt_guard.enabled +# - If jailbreak_threshold is not specified, inherits from prompt_guard.threshold +# 4. Threshold Tuning: +# - Higher threshold (0.8-0.95): Stricter detection, fewer false positives, may miss subtle attacks +# - Lower threshold (0.5-0.7): More sensitive detection, catches more attacks, higher false positive rate +# - Recommended: Start with 0.7 globally, adjust per category based on risk profile +# 5. Use Cases: +# - High-security categories (business, customer_support): Use higher thresholds (0.8-0.9) +# - Internal tools with code/technical content: Use lower thresholds (0.5-0.6) to reduce false positives +# - General categories: Use global default threshold +# 6. Security Best Practices: # - Enable jailbreak detection by default (prompt_guard.enabled: true) -# - Only disable for specific categories where the risk is managed differently -# - Consider the consequences of disabling protection on a per-category basis +# - Only disable or use very low thresholds for specific categories where the risk is managed differently +# - Consider the consequences of threshold settings on a per-category basis +# - Monitor false positive and false negative rates to tune thresholds appropriately diff --git a/src/semantic-router/pkg/config/config.go b/src/semantic-router/pkg/config/config.go index 8e4ac28c5..8e5d34aab 100644 --- a/src/semantic-router/pkg/config/config.go +++ b/src/semantic-router/pkg/config/config.go @@ -373,6 +373,9 @@ type Category struct { // JailbreakEnabled controls whether jailbreak detection is enabled for this category // If nil, inherits from global PromptGuard.Enabled setting JailbreakEnabled *bool `yaml:"jailbreak_enabled,omitempty"` + // JailbreakThreshold defines the confidence threshold for jailbreak detection (0.0-1.0) + // If nil, uses the global threshold from PromptGuard.Threshold + JailbreakThreshold *float32 `yaml:"jailbreak_threshold,omitempty"` } // GetModelReasoningFamily returns the reasoning family configuration for a given model name @@ -829,3 +832,14 @@ func (c *RouterConfig) IsJailbreakEnabledForCategory(categoryName string) bool { // Fall back to global setting return c.PromptGuard.Enabled } + +// GetJailbreakThresholdForCategory returns the effective jailbreak detection threshold for a category +// Priority: category-specific > global prompt_guard threshold +func (c *RouterConfig) GetJailbreakThresholdForCategory(categoryName string) float32 { + category := c.GetCategoryByName(categoryName) + if category 
!= nil && category.JailbreakThreshold != nil { + return *category.JailbreakThreshold + } + // Fall back to global threshold + return c.PromptGuard.Threshold +} diff --git a/src/semantic-router/pkg/config/config_test.go b/src/semantic-router/pkg/config/config_test.go index 802d47cb9..ff027be35 100644 --- a/src/semantic-router/pkg/config/config_test.go +++ b/src/semantic-router/pkg/config/config_test.go @@ -2029,4 +2029,88 @@ categories: }) }) }) + + Describe("GetJailbreakThresholdForCategory", func() { + Context("when global threshold is set", func() { + It("should return global threshold for category without explicit setting", func() { + category := config.Category{ + Name: "test", + ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}}, + } + + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Threshold: 0.7, + }, + Categories: []config.Category{category}, + } + + Expect(cfg.GetJailbreakThresholdForCategory("test")).To(Equal(float32(0.7))) + }) + + It("should return category-specific threshold when set", func() { + category := config.Category{ + Name: "test", + JailbreakThreshold: config.Float32Ptr(0.9), + ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}}, + } + + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Threshold: 0.7, + }, + Categories: []config.Category{category}, + } + + Expect(cfg.GetJailbreakThresholdForCategory("test")).To(Equal(float32(0.9))) + }) + + It("should allow lower threshold override", func() { + category := config.Category{ + Name: "test", + JailbreakThreshold: config.Float32Ptr(0.5), + ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}}, + } + + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Threshold: 0.7, + }, + Categories: []config.Category{category}, + } + + Expect(cfg.GetJailbreakThresholdForCategory("test")).To(Equal(float32(0.5))) + }) + + It("should allow higher threshold override", func() { + category := config.Category{ + Name: "test", + JailbreakThreshold: config.Float32Ptr(0.95), + ModelScores: []config.ModelScore{{Model: "test", Score: 1.0}}, + } + + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Threshold: 0.7, + }, + Categories: []config.Category{category}, + } + + Expect(cfg.GetJailbreakThresholdForCategory("test")).To(Equal(float32(0.95))) + }) + }) + + Context("when category does not exist", func() { + It("should fall back to global threshold", func() { + cfg := &config.RouterConfig{ + PromptGuard: config.PromptGuardConfig{ + Threshold: 0.8, + }, + Categories: []config.Category{}, + } + + Expect(cfg.GetJailbreakThresholdForCategory("nonexistent")).To(Equal(float32(0.8))) + }) + }) + }) }) diff --git a/src/semantic-router/pkg/extproc/request_handler.go b/src/semantic-router/pkg/extproc/request_handler.go index 0ddc8af86..d2482f934 100644 --- a/src/semantic-router/pkg/extproc/request_handler.go +++ b/src/semantic-router/pkg/extproc/request_handler.go @@ -438,6 +438,12 @@ func (r *OpenAIRouter) performSecurityChecks(ctx *RequestContext, userContent st jailbreakEnabled = jailbreakEnabled && r.Config.IsJailbreakEnabledForCategory(categoryName) } + // Get category-specific threshold + jailbreakThreshold := r.Config.PromptGuard.Threshold + if categoryName != "" && r.Config != nil { + jailbreakThreshold = r.Config.GetJailbreakThresholdForCategory(categoryName) + } + // Perform jailbreak detection on all message content if jailbreakEnabled { // Start jailbreak detection span @@ -445,7 +451,7 @@ func (r *OpenAIRouter) 
performSecurityChecks(ctx *RequestContext, userContent st defer span.End() startTime := time.Now() - hasJailbreak, jailbreakDetections, err := r.Classifier.AnalyzeContentForJailbreak(allContent) + hasJailbreak, jailbreakDetections, err := r.Classifier.AnalyzeContentForJailbreakWithThreshold(allContent, jailbreakThreshold) detectionTime := time.Since(startTime).Milliseconds() observability.SetSpanAttributes(span, diff --git a/src/semantic-router/pkg/utils/classification/classifier.go b/src/semantic-router/pkg/utils/classification/classifier.go index ac5e5c0e4..3dc820f1c 100644 --- a/src/semantic-router/pkg/utils/classification/classifier.go +++ b/src/semantic-router/pkg/utils/classification/classifier.go @@ -425,6 +425,11 @@ func (c *Classifier) initializeJailbreakClassifier() error { // CheckForJailbreak analyzes the given text for jailbreak attempts func (c *Classifier) CheckForJailbreak(text string) (bool, string, float32, error) { + return c.CheckForJailbreakWithThreshold(text, c.Config.PromptGuard.Threshold) +} + +// CheckForJailbreakWithThreshold analyzes the given text for jailbreak attempts with a custom threshold +func (c *Classifier) CheckForJailbreakWithThreshold(text string, threshold float32) (bool, string, float32, error) { if !c.IsJailbreakEnabled() { return false, "", 0.0, fmt.Errorf("jailbreak detection is not enabled or properly configured") } @@ -453,14 +458,14 @@ func (c *Classifier) CheckForJailbreak(text string) (bool, string, float32, erro } // Check if confidence meets threshold and indicates jailbreak - isJailbreak := result.Confidence >= c.Config.PromptGuard.Threshold && jailbreakType == "jailbreak" + isJailbreak := result.Confidence >= threshold && jailbreakType == "jailbreak" if isJailbreak { observability.Warnf("JAILBREAK DETECTED: '%s' (confidence: %.3f, threshold: %.3f)", - jailbreakType, result.Confidence, c.Config.PromptGuard.Threshold) + jailbreakType, result.Confidence, threshold) } else { observability.Infof("BENIGN: '%s' (confidence: %.3f, threshold: %.3f)", - jailbreakType, result.Confidence, c.Config.PromptGuard.Threshold) + jailbreakType, result.Confidence, threshold) } return isJailbreak, jailbreakType, result.Confidence, nil @@ -468,6 +473,11 @@ func (c *Classifier) CheckForJailbreak(text string) (bool, string, float32, erro // AnalyzeContentForJailbreak analyzes multiple content pieces for jailbreak attempts func (c *Classifier) AnalyzeContentForJailbreak(contentList []string) (bool, []JailbreakDetection, error) { + return c.AnalyzeContentForJailbreakWithThreshold(contentList, c.Config.PromptGuard.Threshold) +} + +// AnalyzeContentForJailbreakWithThreshold analyzes multiple content pieces for jailbreak attempts with a custom threshold +func (c *Classifier) AnalyzeContentForJailbreakWithThreshold(contentList []string, threshold float32) (bool, []JailbreakDetection, error) { if !c.IsJailbreakEnabled() { return false, nil, fmt.Errorf("jailbreak detection is not enabled or properly configured") } @@ -480,7 +490,7 @@ func (c *Classifier) AnalyzeContentForJailbreak(contentList []string) (bool, []J continue } - isJailbreak, jailbreakType, confidence, err := c.CheckForJailbreak(content) + isJailbreak, jailbreakType, confidence, err := c.CheckForJailbreakWithThreshold(content, threshold) if err != nil { observability.Errorf("Error analyzing content %d: %v", i, err) continue diff --git a/website/docs/overview/categories/configuration.md b/website/docs/overview/categories/configuration.md index 09082d045..9a274ec90 100644 --- 
a/website/docs/overview/categories/configuration.md +++ b/website/docs/overview/categories/configuration.md @@ -111,6 +111,42 @@ categories: score: 0.5 ``` +#### `jailbreak_threshold` (Optional) + +- **Type**: Float (0.0-1.0) +- **Description**: Confidence threshold for jailbreak detection +- **Default**: Inherits from global `prompt_guard.threshold` setting +- **Impact**: Controls sensitivity of jailbreak detection for this category +- **Tuning**: Higher values = stricter (fewer false positives), Lower values = more sensitive (catches more attacks) + +```yaml +categories: + - name: customer_support + jailbreak_enabled: true + jailbreak_threshold: 0.9 # Strict detection for public-facing + model_scores: + - model: qwen3 + score: 0.8 + + - name: code_generation + jailbreak_enabled: true + jailbreak_threshold: 0.5 # Relaxed to reduce false positives on code + model_scores: + - model: qwen3 + score: 0.9 + + - name: general + # No jailbreak_threshold - inherits from global prompt_guard.threshold + model_scores: + - model: qwen3 + score: 0.5 +``` + +**Threshold Guidelines**: +- **0.8-0.95**: High-security categories (customer support, business) +- **0.6-0.8**: Standard categories (general queries) +- **0.4-0.6**: Technical categories (code generation, development tools) + #### `use_reasoning` (Required) - **Type**: Boolean @@ -228,10 +264,11 @@ categories: ```yaml categories: - # High-security public-facing category + # High-security public-facing category with strict threshold - name: "customer_support" description: "Customer support and general inquiries" jailbreak_enabled: true # Strict jailbreak protection + jailbreak_threshold: 0.9 # High threshold for public-facing use_reasoning: false model_scores: - model: "phi4" @@ -239,10 +276,11 @@ categories: - model: "mistral-small3.1" score: 0.7 - # Trusted internal development category + # Technical category with relaxed threshold - name: "code_generation" - description: "Internal code generation for developers" - jailbreak_enabled: false # Allow broader input for trusted users + description: "Code generation for developers" + jailbreak_enabled: true # Keep enabled + jailbreak_threshold: 0.5 # Lower threshold to reduce false positives on code use_reasoning: true reasoning_effort: "medium" model_scores: diff --git a/website/docs/tutorials/content-safety/jailbreak-protection.md b/website/docs/tutorials/content-safety/jailbreak-protection.md index 477d4121e..60774e60f 100644 --- a/website/docs/tutorials/content-safety/jailbreak-protection.md +++ b/website/docs/tutorials/content-safety/jailbreak-protection.md @@ -53,31 +53,35 @@ prompt_guard: ### Category-Level Jailbreak Protection -You can enable or disable jailbreak detection at the category level for fine-grained security control: +You can configure jailbreak detection at the category level for fine-grained security control, including both enabling/disabling and threshold customization: ```yaml -# Global default setting +# Global default settings prompt_guard: enabled: true # Default for all categories + threshold: 0.7 # Default threshold for all categories categories: - # High-security category - explicitly enable + # High-security category - strict protection with high threshold - name: customer_support jailbreak_enabled: true # Strict protection for public-facing + jailbreak_threshold: 0.9 # Higher threshold for stricter detection model_scores: - model: qwen3 score: 0.8 - # Internal tool - disable for trusted environment + # Internal tool - relaxed threshold for code/technical content - name: 
code_generation - jailbreak_enabled: false # Allow broader input for developers + jailbreak_enabled: true # Keep enabled but with relaxed threshold + jailbreak_threshold: 0.5 # Lower threshold to reduce false positives model_scores: - model: qwen3 score: 0.9 - # General category - inherits global setting + # General category - inherits global settings - name: general - # No jailbreak_enabled specified - uses global prompt_guard.enabled + # No jailbreak_enabled or jailbreak_threshold specified + # Uses global prompt_guard.enabled (true) and threshold (0.7) model_scores: - model: qwen3 score: 0.5 @@ -88,13 +92,23 @@ categories: - **When `jailbreak_enabled` is not specified**: Category inherits from global `prompt_guard.enabled` - **When `jailbreak_enabled: true`**: Jailbreak detection is explicitly enabled for this category - **When `jailbreak_enabled: false`**: Jailbreak detection is explicitly disabled for this category -- **Category-specific setting always overrides global setting** when explicitly configured +- **When `jailbreak_threshold` is not specified**: Category inherits from global `prompt_guard.threshold` +- **When `jailbreak_threshold: 0.X`**: Uses category-specific threshold (0.0-1.0) +- **Category-specific settings always override global settings** when explicitly configured + +**Threshold Tuning Guide**: + +- **High threshold (0.8-0.95)**: Stricter detection, fewer false positives, may miss subtle attacks +- **Medium threshold (0.6-0.8)**: Balanced detection, good for most use cases +- **Low threshold (0.4-0.6)**: More sensitive, catches more attacks, higher false positive rate +- **Recommended**: Start with 0.7 globally, adjust per category based on risk profile and false positive tolerance **Use Cases**: -- **Enable for public-facing categories**: Customer support, business advice -- **Disable for internal tools**: Code generation for developers, testing environments -- **Inherit for general categories**: Use global default for most categories +- **High-security categories (0.8-0.9 threshold)**: Customer support, business advice, public-facing APIs +- **Technical categories (0.5-0.6 threshold)**: Code generation, developer tools (reduce false positives on technical jargon) +- **Internal tools (0.5 threshold or disabled)**: Testing environments, trusted internal applications +- **General categories (inherit global)**: Use global default for most categories ## How Jailbreak Protection Works
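
The patch series above adds two config helpers, `IsJailbreakEnabledForCategory` and `GetJailbreakThresholdForCategory`, that resolve per-category overrides against the global `prompt_guard` defaults. The standalone Go sketch below (not part of the patch) mirrors that resolution logic to illustrate the inheritance behavior; the struct shapes are trimmed to the relevant fields, and the sample category values plus the local `boolPtr`/`float32Ptr` helpers are illustrative assumptions rather than the repository's actual definitions.

```go
// Minimal sketch (not part of the patch): how the helpers added in
// src/semantic-router/pkg/config/config.go resolve the effective jailbreak
// policy for a category. Struct shapes are trimmed to the relevant fields;
// the sample values and the boolPtr/float32Ptr helpers are illustrative only.
package main

import "fmt"

type PromptGuardConfig struct {
	Enabled   bool
	Threshold float32
}

type Category struct {
	Name               string
	JailbreakEnabled   *bool    // nil => inherit PromptGuard.Enabled
	JailbreakThreshold *float32 // nil => inherit PromptGuard.Threshold
}

type RouterConfig struct {
	PromptGuard PromptGuardConfig
	Categories  []Category
}

func (c *RouterConfig) getCategoryByName(name string) *Category {
	for i := range c.Categories {
		if c.Categories[i].Name == name {
			return &c.Categories[i]
		}
	}
	return nil
}

// Category override wins; otherwise fall back to the global prompt_guard.enabled.
func (c *RouterConfig) IsJailbreakEnabledForCategory(name string) bool {
	if cat := c.getCategoryByName(name); cat != nil && cat.JailbreakEnabled != nil {
		return *cat.JailbreakEnabled
	}
	return c.PromptGuard.Enabled
}

// Category override wins; otherwise fall back to the global prompt_guard.threshold.
func (c *RouterConfig) GetJailbreakThresholdForCategory(name string) float32 {
	if cat := c.getCategoryByName(name); cat != nil && cat.JailbreakThreshold != nil {
		return *cat.JailbreakThreshold
	}
	return c.PromptGuard.Threshold
}

func boolPtr(b bool) *bool          { return &b }
func float32Ptr(f float32) *float32 { return &f }

func main() {
	cfg := &RouterConfig{
		PromptGuard: PromptGuardConfig{Enabled: true, Threshold: 0.7},
		Categories: []Category{
			{Name: "customer_support", JailbreakEnabled: boolPtr(true), JailbreakThreshold: float32Ptr(0.9)},
			{Name: "code_generation", JailbreakThreshold: float32Ptr(0.5)},
			{Name: "general"}, // no overrides: inherits both globals
		},
	}
	for _, name := range []string{"customer_support", "code_generation", "general", "nonexistent"} {
		fmt.Printf("%s: enabled=%v threshold=%.2f\n",
			name, cfg.IsJailbreakEnabledForCategory(name), cfg.GetJailbreakThresholdForCategory(name))
	}
}
```

Running the sketch prints the effective settings per category (for example, `general: enabled=true threshold=0.70`, inheriting both globals, and `nonexistent: enabled=true threshold=0.70` for an unknown category), which is the same inheritance behavior exercised by the new Ginkgo tests added to config_test.go in patches 2/5 and 5/5.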