6363 "GenerativeRequestsStatsProgressAggregator" ,
6464 "SchedulerStatsAggregator" ,
6565 "add_aggregate_metric" ,
66-
6766]
6867
6968
@@ -187,7 +186,7 @@ def __call__(
187186 "worker_resolve_start_delay" ,
188187 agg_state ,
189188 request_info .scheduler_timings .resolve_start ,
190- request_info .scheduler_timings .dequeued ,
189+ request_info .scheduler_timings .scheduled ,
191190 )
192191 add_aggregate_metric (
193192 "worker_resolve_time" ,
@@ -226,7 +225,7 @@ def __call__(
226225 request_info .request_timings .request_start ,
227226 )
228227 add_aggregate_metric (
229- "request_targeted_delay " ,
228+ "request_targeted_start_delay " ,
230229 agg_state ,
231230 request_info .request_timings .request_start ,
232231 request_info .scheduler_timings .targeted_start ,
@@ -324,6 +323,34 @@ def __call__(
324323 if response is None :
325324 return None
326325
326+ if (
327+ request_info .status == "completed"
328+ and request_info .request_timings .request_end is not None
329+ ):
330+ agg_state ["requests_per_second" ] = scheduler_state .successful_requests / (
331+ request_info .request_timings .request_end - scheduler_state .start_time
332+ )
333+ add_aggregate_metric (
334+ "request_latency" ,
335+ agg_state ,
336+ request_info .request_timings .request_end ,
337+ request_info .request_timings .request_start ,
338+ )
339+
340+ if (
341+ request_info .status == "completed"
342+ and request_info .request_timings .first_iteration is not None
343+ and request_info .request_timings .last_iteration is not None
344+ and response .output_tokens
345+ ):
346+ add_aggregate_metric (
347+ "time_per_output_token" ,
348+ agg_state ,
349+ request_info .request_timings .last_iteration ,
350+ request_info .request_timings .request_start ,
351+ count = response .output_tokens ,
352+ )
353+
327354 if (
328355 request_info .request_timings .first_iteration is not None
329356 and request_info .request_timings .request_start is not None
@@ -339,44 +366,57 @@ def __call__(
339366 request_info .request_timings .first_iteration is not None
340367 and request_info .request_timings .last_iteration is not None
341368 and response .output_tokens is not None
369+ and response .output_tokens > 1
342370 ):
343371 add_aggregate_metric (
344- "time_per_output_token " ,
372+ "inter_token_latency " ,
345373 agg_state ,
346374 request_info .request_timings .last_iteration ,
347- request_info .request_timings .request_start ,
348- count = response .output_tokens ,
375+ request_info .request_timings .first_iteration ,
376+ count = response .output_tokens - 1 ,
349377 )
350378
351- if response .output_tokens > 1 :
352- add_aggregate_metric (
353- "inter_token_latency" ,
354- agg_state ,
355- request_info .request_timings .last_iteration ,
356- request_info .request_timings .first_iteration ,
357- count = response .output_tokens - 1 ,
358- )
359-
360379 if response .prompt_tokens is not None :
361380 add_aggregate_metric (
362381 "prompt_tokens" ,
363382 agg_state ,
364383 response .prompt_tokens ,
365384 )
385+ if request_info .request_timings .request_end is not None :
386+ agg_state ["prompt_tokens_per_second" ] = agg_state [
387+ "prompt_tokens_total"
388+ ] / (
389+ request_info .request_timings .request_end
390+ - scheduler_state .start_time
391+ )
366392
367393 if response .output_tokens is not None :
368394 add_aggregate_metric (
369395 "output_tokens" ,
370396 agg_state ,
371397 response .output_tokens ,
372398 )
399+ if request_info .request_timings .request_end is not None :
400+ agg_state ["output_tokens_per_second" ] = agg_state [
401+ "output_tokens_total"
402+ ] / (
403+ request_info .request_timings .request_end
404+ - scheduler_state .start_time
405+ )
373406
374407 if response .total_tokens is not None :
375408 add_aggregate_metric (
376409 "total_tokens" ,
377410 agg_state ,
378411 response .total_tokens ,
379412 )
413+ if request_info .request_timings .request_end is not None :
414+ agg_state ["total_tokens_per_second" ] = agg_state [
415+ "total_tokens_total"
416+ ] / (
417+ request_info .request_timings .request_end
418+ - scheduler_state .start_time
419+ )
380420
381421 return agg_state
382422
@@ -411,6 +451,8 @@ class GenerativeRequestsAggregator(
411451 default = None ,
412452 description = "Cooldown duration in seconds to ignore at benchmark end" ,
413453 )
454+ _in_cooldown : bool = False
455+ _in_warmup : bool = False
414456
415457 def __call__ (
416458 self ,
@@ -433,44 +475,45 @@ def __call__(
433475 :param scheduler_state: Current scheduler execution state.
434476 :return: None, as this aggregator only collects for final compilation.
435477 """
478+ status = {
479+ "requests_in_warmup" : False ,
480+ "requests_in_cooldown" : False ,
481+ }
482+
436483 if (
437484 response is None
438485 or request_info .status not in {"completed" , "canceled" , "errored" }
439486 or (request_info .status == "canceled" and request_info .started_at is None )
440487 ):
441488 # Ignore requests that don't have a response yet.
442489 # Ignore requests that were canceled before they started.
443- return None
490+ return status
444491
445492 if (
446493 self .warmup_requests is not None
447494 and self .warmup_requests >= scheduler_state .processed_requests
448- ):
449- return None
450-
451- if (
495+ ) or (
452496 self .warmup_duration is not None
453497 and request_info .request_timings .request_end is not None
454498 and (
455499 scheduler_state .start_time + self .warmup_duration
456500 >= request_info .request_timings .request_end
457501 )
458502 ):
459- return None
503+ status ["requests_in_warmup" ] = True
504+
505+ return status
460506
461507 if (
462508 self .cooldown_requests is not None
463509 and scheduler_state .remaining_requests is not None
464510 and self .cooldown_requests >= scheduler_state .remaining_requests
465- ):
466- return None
467-
468- if (
511+ ) or (
469512 self .cooldown_duration is not None
470513 and scheduler_state .remaining_duration is not None
471514 and self .cooldown_duration >= scheduler_state .remaining_duration
472515 ):
473- return None
516+ return status [ "requests_in_cooldown" ]
474517
475518 if "completed" not in agg_state :
476519 agg_state ["completed" ] = []
@@ -484,7 +527,7 @@ def __call__(
484527 else :
485528 agg_state ["errored" ].append ((response , request , request_info ))
486529
487- return None
530+ return status
488531
489532 def compile (
490533 self , agg_state : dict [str , Any ], scheduler_state : SchedulerState
@@ -625,6 +668,22 @@ def compile(
625668 ],
626669 )
627670 ),
671+ total_token_count = (
672+ StatusDistributionSummary .from_values (
673+ value_types = [
674+ type_
675+ for type_ , req in zip (total_types , total )
676+ if req .prompt_tokens is not None
677+ or req .output_tokens is not None
678+ ],
679+ values = (
680+ (req .prompt_tokens or 0 ) + (req .output_tokens or 0 )
681+ for req in total
682+ if req .prompt_tokens is not None
683+ or req .output_tokens is not None
684+ ),
685+ )
686+ ),
628687 time_to_first_token_ms = (
629688 StatusDistributionSummary .from_values (
630689 value_types = [
0 commit comments