@@ -14,12 +14,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */

-// Package controller contains the implementation of the `FlowController` engine.
+// Package controller contains the implementation of the FlowController engine.
 //
-// The FlowController is the central processing engine of the Flow Control system. It is a sharded, high-throughput
+// The FlowController is the central processing engine of the Flow Control layer. It is a sharded, high-throughput
 // component responsible for managing the lifecycle of all incoming requests. It achieves this by acting as a stateless
-// supervisor that orchestrates a pool of stateful workers (`internal.ShardProcessor`), distributing incoming requests
-// among them using a sophisticated load-balancing algorithm.
+// supervisor that orchestrates a pool of stateful workers (ShardProcessors), distributing incoming requests among them.
 package controller

 import (
@@ -43,22 +42,20 @@ import (
 	logutil "sigs.k8s.io/gateway-api-inference-extension/pkg/epp/util/logging"
 )

-// registryClient defines the minimal interface that the `FlowController` needs to interact with the `FlowRegistry`.
+// registryClient defines the minimal interface that the FlowController needs to interact with the FlowRegistry.
 type registryClient interface {
 	contracts.FlowRegistryObserver
 	contracts.FlowRegistryDataPlane
 }

-// shardProcessor is the minimal internal interface that the `FlowController` requires from its workers.
-// This abstraction allows for the injection of mock processors during testing.
+// shardProcessor is the minimal internal interface that the FlowController requires from its workers.
 type shardProcessor interface {
 	Run(ctx context.Context)
 	Submit(item *internal.FlowItem) error
 	SubmitOrBlock(ctx context.Context, item *internal.FlowItem) error
 }

-// shardProcessorFactory defines the signature for a function that creates a `shardProcessor`.
-// This enables dependency injection for testing.
+// shardProcessorFactory defines the signature for creating a shardProcessor.
 type shardProcessorFactory func(
 	ctx context.Context,
 	shard contracts.RegistryShard,
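
Note: the `shardProcessor` interface above is the seam this factory exists for. As a minimal sketch of how that seam can be exercised in tests (hypothetical test double; the repository's actual mocks may differ), something like the following would satisfy the interface:

```go
// mockShardProcessor is a hypothetical no-op test double satisfying shardProcessor.
type mockShardProcessor struct {
	submitted []*internal.FlowItem
}

// Run blocks until the worker context is cancelled, mimicking a long-lived worker.
func (m *mockShardProcessor) Run(ctx context.Context) { <-ctx.Done() }

// Submit records the item instead of enqueueing it.
func (m *mockShardProcessor) Submit(item *internal.FlowItem) error {
	m.submitted = append(m.submitted, item)
	return nil
}

// SubmitOrBlock behaves like Submit in this sketch; a real mock could honor ctx.
func (m *mockShardProcessor) SubmitOrBlock(ctx context.Context, item *internal.FlowItem) error {
	return m.Submit(item)
}
```
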
@@ -74,15 +71,16 @@ var _ shardProcessor = &internal.ShardProcessor{}
 // managedWorker holds the state for a single supervised worker.
 type managedWorker struct {
 	processor shardProcessor
-	cancel    context.CancelFunc
+	// cancel function for the worker-specific context. Used during shutdown and GC.
+	cancel context.CancelFunc
 }

-// FlowController is the central, high-throughput engine of the Flow Control system.
-// It is designed as a stateless distributor that orchestrates a pool of stateful workers (`internal.ShardProcessor`),
-// following a "supervisor-worker" pattern.
+// FlowController is the central, high-throughput engine of the Flow Control layer.
+// It is designed as a stateless distributor that orchestrates a pool of stateful workers (ShardProcessor), following a
+// supervisor-worker pattern.
 //
-// The controller's `Run` loop executes periodically, acting as a garbage collector that keeps the pool of running
-// workers synchronized with the dynamic shard topology of the `FlowRegistry`.
+// The controller's run loop executes periodically, acting as a garbage collector that keeps the pool of running
+// workers synchronized with the dynamic shard topology of the FlowRegistry.
 //
 // Request Lifecycle Management:
 //
@@ -103,25 +101,26 @@ type FlowController struct {

 	// --- Lifecycle state ---

-	// parentCtx is the root context for the controller's lifecycle, established when `Run` is called.
+	// parentCtx is the root context for the controller's lifecycle, established when NewFlowController is called.
 	// It is the parent for all long-lived worker goroutines.
 	parentCtx context.Context

 	// --- Concurrent state ---

-	// workers is a highly concurrent map storing the `managedWorker` for each shard.
+	// workers is a highly concurrent map storing the managedWorker for each shard.
 	// It is the controller's source of truth for the worker pool.
-	// The key is the shard ID (`string`), and the value is a `*managedWorker`.
-	workers sync.Map
+	workers sync.Map // key: shard ID (string); value: *managedWorker

+	// wg waits for all worker goroutines to terminate during shutdown.
 	wg sync.WaitGroup
 }

-// flowControllerOption is a function that applies a configuration change to a `FlowController`.
+// flowControllerOption is a function that applies a configuration change.
 // test-only
 type flowControllerOption func(*FlowController)

-// NewFlowController creates a new `FlowController` instance.
+// NewFlowController creates and starts a new FlowController instance.
+// The provided context governs the lifecycle of the controller and all its workers.
 func NewFlowController(
 	ctx context.Context,
 	config Config,
@@ -131,15 +130,14 @@ func NewFlowController(
 	opts ...flowControllerOption,
 ) (*FlowController, error) {
 	fc := &FlowController{
-		config:             *config.deepCopy(),
+		config:             config,
 		registry:           registry,
 		saturationDetector: sd,
 		clock:              clock.RealClock{},
 		logger:             logger.WithName("flow-controller"),
 		parentCtx:          ctx,
 	}

-	// Use the real shard processor implementation by default.
 	fc.shardProcessorFactory = func(
 		ctx context.Context,
 		shard contracts.RegistryShard,
@@ -167,9 +165,9 @@ func NewFlowController(
 	return fc, nil
 }

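A rough wiring sketch for the constructor above (hypothetical; the concrete registry, saturation detector, and logger values come from the surrounding EPP setup):

```go
// Construct the controller. Per the doc comment, the supervisor loop is
// started internally and is bound to ctx, so cancelling ctx shuts down the
// controller and all of its workers.
fc, err := NewFlowController(ctx, config, flowRegistry, saturationDetector, logger)
if err != nil {
	return nil, fmt.Errorf("failed to construct FlowController: %w", err)
}
// fc is now ready to serve EnqueueAndWait calls.
```
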
-// run starts the `FlowController`'s main reconciliation loop.
+// run starts the FlowController's main reconciliation loop (supervisor loop).
 // This loop is responsible for garbage collecting workers whose shards no longer exist in the registry.
-// This method blocks until the provided context is cancelled and ALL worker goroutines have fully terminated.
+// This method blocks until the provided context is cancelled and all worker goroutines have fully terminated.
 func (fc *FlowController) run(ctx context.Context) {
 	fc.logger.Info("Starting FlowController reconciliation loop.")
 	defer fc.logger.Info("FlowController reconciliation loop stopped.")
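The body of the loop is elided from this hunk. A sketch of the shape it takes, given the documented behavior (periodic reconciliation, blocking until cancellation; the interval value is illustrative, and the real implementation presumably uses the injected fc.clock and a Config-driven setting):

```go
// Reconcile on a fixed interval until the controller's context is cancelled.
ticker := time.NewTicker(30 * time.Second) // interval value is illustrative only
defer ticker.Stop()
for {
	select {
	case <-ctx.Done():
		fc.shutdown() // cancels all workers and waits for them to exit
		return
	case <-ticker.C:
		fc.reconcileProcessors()
	}
}
```
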
@@ -194,23 +192,19 @@ func (fc *FlowController) run(ctx context.Context) {
 // # Design Rationale: The Synchronous Model
 //
 // This blocking model is deliberately chosen for its simplicity and robustness, especially in the context of Envoy
-// External Processing (`ext_proc`), which operates on a stream-based protocol.
+// External Processing (ext_proc), which operates on a stream-based protocol.
 //
-//   - `ext_proc` Alignment: A single goroutine typically manages the stream for a given HTTP request.
-//     `EnqueueAndWait` fits this perfectly: the request-handling goroutine calls it, blocks, and upon return, has a
+//   - ext_proc Alignment: A single goroutine typically manages the stream for a given HTTP request.
+//     EnqueueAndWait fits this perfectly: the request-handling goroutine calls it, blocks, and upon return, has a
 //     definitive outcome to act upon.
 //   - Simplified State Management: The state of a "waiting" request is implicitly managed by the blocked goroutine's
-//     stack and its `context.Context`. The system only needs to signal this specific goroutine to unblock it.
-//   - Direct Backpressure: If queues are full, `EnqueueAndWait` returns an error immediately, providing direct
+//     stack and its Context. The system only needs to signal this specific goroutine to unblock it.
+//   - Direct Backpressure: If queues are full, EnqueueAndWait returns an error immediately, providing direct
 //     backpressure to the caller.
 func (fc *FlowController) EnqueueAndWait(
 	ctx context.Context,
 	req types.FlowControlRequest,
 ) (types.QueueOutcome, error) {
-	if req == nil {
-		return types.QueueOutcomeRejectedOther, errors.New("request cannot be nil")
-	}
-
 	flowKey := req.FlowKey()
 	fairnessID := flowKey.ID
 	priority := strconv.Itoa(flowKey.Priority)
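To illustrate the synchronous contract described in the rationale above, a hypothetical ext_proc-side caller might look like this (mapOutcomeToStatus and the variable names are illustrative, not part of this codebase):

```go
// The request-handling goroutine blocks here until the request is dispatched,
// rejected, or evicted, and upon return has a definitive outcome to act upon.
outcome, err := fc.EnqueueAndWait(streamCtx, flowControlRequest)
if err != nil {
	// Rejections and evictions surface as errors; the outcome distinguishes
	// the cases so the caller can map them to, e.g., an HTTP 429 or 503.
	return mapOutcomeToStatus(outcome, err) // mapOutcomeToStatus is hypothetical
}
// A nil error means the request was admitted and dispatched; continue the stream.
```
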
@@ -365,17 +359,22 @@ type candidate struct {
 // of that flow's queue, from least to most loaded.
 func (fc *FlowController) selectDistributionCandidates(key types.FlowKey) ([]candidate, error) {
 	var candidates []candidate
+
+	// Acquire a connection to the registry for the flow key. This ensures a consistent view of the ActiveShards for the
+	// duration of the shard selection process, preventing races with concurrent shard topology changes.
 	err := fc.registry.WithConnection(key, func(conn contracts.ActiveFlowConnection) error {
 		shards := conn.ActiveShards()
-		candidates = make([]candidate, len(shards))
-		for i, shard := range shards {
+		candidates = make([]candidate, 0, len(shards))
+		for _, shard := range shards {
 			worker := fc.getOrStartWorker(shard)
 			mq, err := shard.ManagedQueue(key)
 			if err != nil {
-				panic(fmt.Sprintf("invariant violation: ManagedQueue for leased flow %s failed on shard %s: %v",
-					key, shard.ID(), err))
+				fc.logger.Error(err,
+					"Invariant violation. Failed to get ManagedQueue for a leased flow on an Active shard. Skipping shard.",
+					"flowKey", key, "shardID", shard.ID())
+				continue
 			}
-			candidates[i] = candidate{worker.processor, shard.ID(), mq.ByteSize()}
+			candidates = append(candidates, candidate{worker.processor, shard.ID(), mq.ByteSize()})
 		}
 		return nil
 	})
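The ordering step is elided from this hunk. Per the doc comment above ("from least to most loaded"), the continuation presumably sorts by the queue byte size captured into each candidate, roughly as follows (the byteSize field name is an assumption; the candidate struct is constructed positionally above):

```go
// Order candidates from least to most loaded before returning them.
sort.Slice(candidates, func(i, j int) bool {
	return candidates[i].byteSize < candidates[j].byteSize // byteSize: assumed field name
})
```
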
@@ -435,15 +434,15 @@ func (fc *FlowController) distributeRequest(
 }

 // getOrStartWorker implements the lazy-loading and startup of shard processors.
-// It attempts to retrieve an existing worker for a shard. If one doesn't exist, it constructs a new worker and attempts
-// to register it atomically. The worker's processor goroutine is only started *after* it has successfully been
-// registered, preventing race conditions where multiple goroutines create and start the same worker.
+// It ensures that exactly one worker goroutine is started for each shard, using atomic operations
+// (sync.Map.LoadOrStore). The worker's processor goroutine is only started after it has successfully been registered,
+// preventing race conditions where multiple goroutines create and start the same worker.
 func (fc *FlowController) getOrStartWorker(shard contracts.RegistryShard) *managedWorker {
 	if w, ok := fc.workers.Load(shard.ID()); ok {
 		return w.(*managedWorker)
 	}

-	// Construct a new worker, but do not start its processor goroutine yet.
+	// Construct a new worker, but do not start its goroutine yet.
 	processorCtx, cancel := context.WithCancel(fc.parentCtx)
 	processor := fc.shardProcessorFactory(
 		processorCtx,
@@ -459,18 +458,17 @@ func (fc *FlowController) getOrStartWorker(shard contracts.RegistryShard) *manag
 		cancel:    cancel,
 	}

-	// Atomically load or store. This is the critical step for preventing race conditions.
+	// Atomically load or store. This is the critical synchronization step.
 	actual, loaded := fc.workers.LoadOrStore(shard.ID(), newWorker)
 	if loaded {
 		// Another goroutine beat us to it. The `newWorker` we created was not stored.
-		// We must cancel the context we created for it to prevent a leak, but we do not need to do anything else, as its
-		// processor was never started.
+		// We must cancel the context we created to prevent a leak.
 		cancel()
 		return actual.(*managedWorker)
 	}

-	// We won the race. The `newWorker` was successfully stored.
-	// Now, and only now, do we start the processor's long-running goroutine.
+	// We won the race. The newWorker was stored. Now, start the processor's long-running goroutine.
+	fc.logger.V(logutil.DEFAULT).Info("Starting new ShardProcessor worker.", "shardID", shard.ID())
 	fc.wg.Add(1)
 	go func() {
 		defer fc.wg.Done()
@@ -481,24 +479,22 @@ func (fc *FlowController) getOrStartWorker(shard contracts.RegistryShard) *manag
481479}
482480
483481// reconcileProcessors is the supervisor's core garbage collection loop.
484- // It fetches the current list of Active shards from the registry and removes any workers whose corresponding shards
485- // have been fully drained and garbage collected by the registry.
482+ // It identifies and stops workers whose corresponding shards have been removed from the registry.
486483func (fc * FlowController ) reconcileProcessors () {
487484 stats := fc .registry .ShardStats ()
488- shards := make (map [string ]struct {}, len (stats )) // ` map[shardID] -> isActive`
485+ shards := make (map [string ]struct {}, len (stats )) // map[shardID] -> isActive
489486 for _ , s := range stats {
490487 shards [s .ID ] = struct {}{}
491488 }
492489
493490 fc .workers .Range (func (key , value any ) bool {
494491 shardID := key .(string )
495492 worker := value .(* managedWorker )
496-
497- // GC check: Is the shard no longer in the registry at all?
498493 if _ , exists := shards [shardID ]; ! exists {
499- fc .logger .Info ("Stale worker detected for GC'd shard, shutting down." , "shardID" , shardID )
500- worker .cancel ()
501- fc .workers .Delete (shardID )
494+ fc .logger .V (logutil .DEFAULT ).Info ("Stale worker detected for GC'd shard, initiating shutdown." ,
495+ "shardID" , shardID )
496+ worker .cancel () // Cancel the worker's context, initiating the Processor's graceful shutdown sequence.
497+ fc .workers .Delete (shardID ) // Delete from the map so no new requests are routed to it.
502498 }
503499 return true
504500 })
@@ -515,7 +511,6 @@ func (fc *FlowController) shutdown() {
 		worker.cancel()
 		return true
 	})
-
 	fc.wg.Wait()
 	fc.logger.Info("All shard processors have shut down.")
 }