@@ -465,20 +465,23 @@ export class MLCEngine implements MLCEngineInterface {
     pipeline: LLMChatPipeline,
     chatConfig: ChatConfig,
     genConfig: GenerationConfig,
+    timeReceived: number,
   ): AsyncGenerator<ChatCompletionChunk, void, void>;
   asyncGenerate(
     request: CompletionCreateParamsStreaming,
     model: string,
     pipeline: LLMChatPipeline,
     chatConfig: ChatConfig,
     genConfig: GenerationConfig,
+    timeReceived: number,
   ): AsyncGenerator<Completion, void, void>;
   async *asyncGenerate(
     request: ChatCompletionRequestStreaming | CompletionCreateParamsStreaming,
     model: string,
     pipeline: LLMChatPipeline,
     chatConfig: ChatConfig,
     genConfig: GenerationConfig,
+    timeReceived: number,
   ): AsyncGenerator<ChatCompletionChunk | Completion, void, void> {
     // Since it is an async generator, we need to do fine-grained try-catch to ensure lock is
     // released only when errors occur. Then release at the very end when no error occurs.
@@ -678,18 +681,39 @@ export class MLCEngine implements MLCEngineInterface {

     // 4. Usage chunk
     if (request.stream_options?.include_usage) {
+      const usedGrammar =
+        "response_format" in request &&
+        (request.response_format?.type === "grammar" ||
+          request.response_format?.type === "json_object");
       const completion_tokens = pipeline.getCurRoundDecodingTotalTokens();
       const prompt_tokens = pipeline.getCurRoundPrefillTotalTokens();
       const prefill_tokens_per_s = pipeline.getCurRoundPrefillTokensPerSec();
       const decode_tokens_per_s = pipeline.getCurRoundDecodingTokensPerSec();
+      const grammar_init_s = pipeline.getCurRoundGrammarInitTotalTime();
+      const prefill_time = pipeline.getCurRoundPrefillTotalTime();
+      const decode_time = pipeline.getCurRoundDecodingTotalTime();
+      const grammar_per_token_s =
+        pipeline.getCurRoundGrammarPerTokenTotalTime();
+      const defaultExtra = {
+        e2e_latency_s: (Date.now() - timeReceived) / 1000,
+        prefill_tokens_per_s: prefill_tokens_per_s,
+        decode_tokens_per_s: decode_tokens_per_s,
+        time_to_first_token_s: prefill_time,
+        time_per_output_token_s: decode_time / completion_tokens,
+      };
       const usage: CompletionUsage = {
         completion_tokens: completion_tokens,
         prompt_tokens: prompt_tokens,
         total_tokens: completion_tokens + prompt_tokens,
-        extra: {
-          prefill_tokens_per_s: prefill_tokens_per_s,
-          decode_tokens_per_s: decode_tokens_per_s,
-        },
+        extra: usedGrammar
+          ? {
+              ...defaultExtra,
+              ...{
+                grammar_init_s: grammar_init_s,
+                grammar_per_token_s: grammar_per_token_s / completion_tokens,
+              },
+            }
+          : defaultExtra,
       };
       if (isChatCompletion) {
         const usageChunk: ChatCompletionChunk = {
@@ -745,6 +769,7 @@ export class MLCEngine implements MLCEngineInterface {
   async chatCompletion(
     request: ChatCompletionRequest,
   ): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion> {
+    const timeReceived = Date.now();
     // 0. Check model loaded and preprocess inputs
     const [selectedModelId, selectedPipeline, selectedChatConfig] =
       this.getLLMStates("ChatCompletionRequest", request.model);
@@ -766,6 +791,7 @@ export class MLCEngine implements MLCEngineInterface {
       logprobs: request.logprobs,
       top_logprobs: request.top_logprobs,
       response_format: request.response_format,
+      ignore_eos: request.ignore_eos,
     };

     // 0.5 Block wait until this pipeline finishes all previous requests
@@ -780,6 +806,7 @@ export class MLCEngine implements MLCEngineInterface {
         selectedPipeline,
         selectedChatConfig,
         genConfig,
+        timeReceived,
       );
     }

@@ -796,6 +823,8 @@ export class MLCEngine implements MLCEngineInterface {
     let prompt_tokens = 0;
     let prefill_time = 0;
     let decode_time = 0;
+    let grammar_init_s = 0;
+    let grammar_per_token_s = 0;
     for (let i = 0; i < n; i++) {
       let outputMessage: string;
       if (this.interruptSignal) {
@@ -852,8 +881,21 @@ export class MLCEngine implements MLCEngineInterface {
       prompt_tokens += selectedPipeline.getCurRoundPrefillTotalTokens();
       prefill_time += selectedPipeline.getCurRoundPrefillTotalTime();
       decode_time += selectedPipeline.getCurRoundDecodingTotalTime();
+      grammar_init_s += selectedPipeline.getCurRoundGrammarInitTotalTime();
+      grammar_per_token_s +=
+        selectedPipeline.getCurRoundGrammarPerTokenTotalTime();
     }
-
+    const usedGrammar =
+      "response_format" in request &&
+      (request.response_format?.type === "grammar" ||
+        request.response_format?.type === "json_object");
+    const defaultExtra = {
+      e2e_latency_s: (Date.now() - timeReceived) / 1000,
+      prefill_tokens_per_s: prompt_tokens / prefill_time,
+      decode_tokens_per_s: completion_tokens / decode_time,
+      time_to_first_token_s: prefill_time,
+      time_per_output_token_s: decode_time / completion_tokens,
+    };
     const response: ChatCompletion = {
       id: crypto.randomUUID(),
       choices: choices,
@@ -864,10 +906,15 @@ export class MLCEngine implements MLCEngineInterface {
         completion_tokens: completion_tokens,
         prompt_tokens: prompt_tokens,
         total_tokens: completion_tokens + prompt_tokens,
-        extra: {
-          prefill_tokens_per_s: prompt_tokens / prefill_time,
-          decode_tokens_per_s: completion_tokens / decode_time,
-        },
+        extra: usedGrammar
+          ? {
+              ...defaultExtra,
+              ...{
+                grammar_init_s: grammar_init_s,
+                grammar_per_token_s: grammar_per_token_s / completion_tokens,
+              },
+            }
+          : defaultExtra,
       } as CompletionUsage,
     };

@@ -901,6 +948,8 @@ export class MLCEngine implements MLCEngineInterface {
   async completion(
     request: CompletionCreateParams,
   ): Promise<AsyncIterable<Completion> | Completion> {
+    const timeReceived = Date.now();
+
     // 0. Check model loaded and preprocess inputs
     const [selectedModelId, selectedPipeline, selectedChatConfig] =
       this.getLLMStates("CompletionCreateParams", request.model);
@@ -915,6 +964,7 @@ export class MLCEngine implements MLCEngineInterface {
       logit_bias: request.logit_bias,
       logprobs: request.logprobs,
       top_logprobs: request.top_logprobs,
+      ignore_eos: request.ignore_eos,
     };

     // 0.5 Block wait until this pipeline finishes all previous requests
@@ -929,6 +979,7 @@ export class MLCEngine implements MLCEngineInterface {
         selectedPipeline,
         selectedChatConfig,
         genConfig,
+        timeReceived,
       );
     }

@@ -989,8 +1040,11 @@ export class MLCEngine implements MLCEngineInterface {
         prompt_tokens: prompt_tokens,
         total_tokens: completion_tokens + prompt_tokens,
         extra: {
+          e2e_latency_s: (Date.now() - timeReceived) / 1000,
           prefill_tokens_per_s: prompt_tokens / prefill_time,
           decode_tokens_per_s: completion_tokens / decode_time,
+          time_to_first_token_s: prefill_time,
+          time_per_output_token_s: decode_time / completion_tokens,
         },
       } as CompletionUsage,
     };
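For reference, the new usage.extra fields surface end-to-end latency (measured from the timeReceived captured at request entry), time-to-first-token, time-per-output-token, and, for grammar or json_object response formats, grammar overhead. Below is a minimal sketch of how a caller might read them from a streaming request; the CreateMLCEngine call and the model id are assumptions for illustration, not part of this change.

import { CreateMLCEngine } from "@mlc-ai/web-llm";

async function reportTimings() {
  // Assumed setup for illustration: any model id served by web-llm works here.
  const engine = await CreateMLCEngine("Llama-3.1-8B-Instruct-q4f32_1-MLC");

  const chunks = await engine.chat.completions.create({
    messages: [{ role: "user", content: "List three colors as a JSON object." }],
    response_format: { type: "json_object" },
    stream: true,
    // Request the trailing usage chunk that carries the new timing fields.
    stream_options: { include_usage: true },
  });

  for await (const chunk of chunks) {
    if (chunk.usage) {
      // usage.extra now includes e2e_latency_s, time_to_first_token_s, and
      // time_per_output_token_s; grammar_init_s and grammar_per_token_s appear
      // only because this request uses a json_object response format.
      console.log("usage.extra timings:", chunk.usage.extra);
    }
  }
}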