Add per-provider concurrency limits to proxy

codelion · codelion · commit c563c8c88d59 · 2025-09-09T12:25:49.000+08:00
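Introduces a 'max_concurrent' setting for each provider in the proxy configuration, allowing control over the maximum number of concurrent requests per provider. The client now attempts to acquire a slot (waiting at most 0.5 seconds) before sending a request and always releases it after the attempt completes, whether it succeeded or failed; a provider at max capacity is recorded as an error for that attempt and skipped in favor of the next provider. 'max_concurrent' defaults to None (no limit), and configuration validation ensures it is a positive integer or None: any other value is logged with a warning and reset to None, removing the limit.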
diff --git a/optillm/plugins/proxy/client.py b/optillm/plugins/proxy/client.py
@@ -215,6 +215,13 @@ def create(self, **kwargs):
                         
                     attempted_providers.add(provider)
                     
+                    # Try to acquire a slot for this provider (with short timeout to try next provider quickly)
+                    slot_timeout = 0.5  # Don't wait too long for a single provider
+                    if not provider.acquire_slot(timeout=slot_timeout):
+                        logger.debug(f"Provider {provider.name} at max capacity, trying next provider")
+                        errors.append((provider.name, "At max concurrent requests"))
+                        continue
+                    
                     try:
                         # Map model name if needed and filter out OptiLLM-specific parameters
                         request_kwargs = self._filter_kwargs(kwargs.copy())
@@ -255,6 +262,11 @@ def create(self, **kwargs):
                         if self.proxy_client.track_errors:
                             provider.is_healthy = False
                             provider.last_error = str(e)
+                    
+                    finally:
+                        # Always release the provider slot
+                        provider.release_slot()
+                        logger.debug(f"Released slot for provider {provider.name}")
             
                 # All providers failed, try fallback client
                 if self.proxy_client.fallback_client:
diff --git a/optillm/plugins/proxy/config.py b/optillm/plugins/proxy/config.py
@@ -172,6 +172,8 @@ def _apply_defaults(config: Dict) -> Dict:
             provider.setdefault('weight', 1)
             provider.setdefault('fallback_only', False)
             provider.setdefault('model_map', {})
+            # Per-provider concurrency limit (None means no limit)
+            provider.setdefault('max_concurrent', None)
         
         return config
     
@@ -200,6 +202,12 @@ def _validate_config(config: Dict) -> Dict:
             if provider['weight'] <= 0:
                 logger.warning(f"Provider {provider['name']} has invalid weight {provider['weight']}, setting to 1")
                 provider['weight'] = 1
+            
+            # Validate max_concurrent if specified
+            if provider.get('max_concurrent') is not None:
+                if not isinstance(provider['max_concurrent'], int) or provider['max_concurrent'] <= 0:
+                    logger.warning(f"Provider {provider['name']} has invalid max_concurrent {provider['max_concurrent']}, removing limit")
+                    provider['max_concurrent'] = None
         
         # Validate routing strategy
         valid_strategies = ['weighted', 'round_robin', 'failover']