@@ -49,10 +49,7 @@ interface StagehandLogContext {
4949 child : ChildProcess ;
5050 port : number ;
5151 browserbaseSessionId ?: string ;
52- stagehandTokenTotals ?: {
53- inputTokens : number ;
54- outputTokens : number ;
55- } ;
52+ stagehandTokenTotals ?: StagehandUsageTotals ;
5653}
5754
5855async function startStagehandServer (
@@ -99,15 +96,44 @@ async function startStagehandServer(
9996 context . browserbaseSessionId = urlMatch [ 1 ] ;
10097 }
10198
102- // Total token usage line
103- const tokensMatch = trimmed . match (
104- / T o t a l t o k e n u s a g e : \s + ( \d + ) \s + i n p u t t o k e n s , \s + ( \d + ) \s + o u t p u t t o k e n s / i,
105- ) ;
106- if ( tokensMatch ) {
107- context . stagehandTokenTotals = {
108- inputTokens : Number . parseInt ( tokensMatch [ 1 ] , 10 ) ,
109- outputTokens : Number . parseInt ( tokensMatch [ 2 ] , 10 ) ,
110- } ;
99+ // Usage metrics JSON line from Stagehand MCP
100+ const metricsMatch = trimmed . match ( / ^ U s a g e m e t r i c s : \s * ( \{ .* \} ) $ / ) ;
101+ if ( metricsMatch ) {
102+ try {
103+ const metrics = JSON . parse ( metricsMatch [ 1 ] ) as {
104+ totalPromptTokens ?: number ;
105+ totalCompletionTokens ?: number ;
106+ totalInferenceTimeMs ?: number ;
107+ promptTokens ?: number ;
108+ completionTokens ?: number ;
109+ inputTokens ?: number ;
110+ outputTokens ?: number ;
111+ timeMs ?: number ;
112+ } ;
113+
114+ const totalInputTokens =
115+ metrics . totalPromptTokens ??
116+ metrics . promptTokens ??
117+ metrics . inputTokens ??
118+ 0 ;
119+ const totalOutputTokens =
120+ metrics . totalCompletionTokens ??
121+ metrics . completionTokens ??
122+ metrics . outputTokens ??
123+ 0 ;
124+ const totalTimeMs = metrics . totalInferenceTimeMs ?? metrics . timeMs ?? 0 ;
125+
126+ context . stagehandTokenTotals = {
127+ totalInputTokens,
128+ totalOutputTokens,
129+ totalTimeMs,
130+ } ;
131+ } catch ( err ) {
132+ console . error (
133+ "[benchmark-mcp] Failed to parse Stagehand usage metrics JSON:" ,
134+ err ,
135+ ) ;
136+ }
111137 }
112138 } ;
113139
@@ -335,82 +361,6 @@ async function runTaskWithAgent(
335361 } ;
336362}
337363
338- async function fetchStagehandTokenUsageSummary (
339- browserbaseSessionId : string ,
340- ) : Promise < StagehandUsageTotals | null > {
341- const apiKey = process . env . BROWSERBASE_API_KEY ;
342- const projectId = process . env . BROWSERBASE_PROJECT_ID ;
343- const modelApiKey =
344- process . env . GEMINI_API_KEY || process . env . GOOGLE_API_KEY || "" ;
345-
346- if ( ! apiKey || ! projectId || ! modelApiKey ) {
347- console . error (
348- "[benchmark-mcp] Skipping Stagehand replay call due to missing API keys." ,
349- ) ;
350- return null ;
351- }
352-
353- const replayResponse = await fetch (
354- `https://api.stagehand.browserbase.com/v1/sessions/${ browserbaseSessionId } /replay` ,
355- {
356- method : "GET" ,
357- headers : {
358- "x-bb-api-key" : apiKey ,
359- "x-bb-project-id" : projectId ,
360- "x-bb-session-id" : browserbaseSessionId ,
361- "x-stream-response" : "true" ,
362- "x-model-api-key" : modelApiKey ,
363- "x-sent-at" : new Date ( ) . toISOString ( ) ,
364- "x-language" : "typescript" ,
365- "x-sdk-version" : "3.0.1" ,
366- } ,
367- } ,
368- ) ;
369-
370- try {
371- const replayJson = ( await replayResponse . json ( ) ) as {
372- data : {
373- pages : Array < {
374- actions : Array < {
375- tokenUsage ?: {
376- inputTokens : number ;
377- outputTokens : number ;
378- timeMs ?: number ;
379- } ;
380- } > ;
381- } > ;
382- } ;
383- } ;
384-
385- const totals : StagehandUsageTotals = {
386- totalInputTokens : 0 ,
387- totalOutputTokens : 0 ,
388- totalTimeMs : 0 ,
389- } ;
390-
391- for ( const page of replayJson . data . pages ) {
392- for ( const action of page . actions ) {
393- if ( action . tokenUsage ) {
394- totals . totalInputTokens += action . tokenUsage . inputTokens ;
395- totals . totalOutputTokens += action . tokenUsage . outputTokens ;
396- if ( typeof action . tokenUsage . timeMs === "number" ) {
397- totals . totalTimeMs += action . tokenUsage . timeMs ;
398- }
399- }
400- }
401- }
402-
403- return totals ;
404- } catch ( error ) {
405- const message = error instanceof Error ? error . message : String ( error ) ;
406- console . error (
407- "[benchmark-mcp] Failed to parse Stagehand replay response:" ,
408- message ,
409- ) ;
410- return null ;
411- }
412- }
413-
414364async function runBenchmark (
415365 mcpName : string ,
416366 datasetName : SupportedDatasetName ,
@@ -529,21 +479,7 @@ async function runBenchmark(
529479 ) ;
530480
531481 if ( isStagehand && stagehandContext ) {
532- let stagehandTotals : StagehandUsageTotals | null = null ;
533-
534- if ( stagehandContext . browserbaseSessionId ) {
535- stagehandTotals = await fetchStagehandTokenUsageSummary (
536- stagehandContext . browserbaseSessionId ,
537- ) ;
538- }
539-
540- if ( ! stagehandTotals && stagehandContext . stagehandTokenTotals ) {
541- stagehandTotals = {
542- totalInputTokens : stagehandContext . stagehandTokenTotals . inputTokens ,
543- totalOutputTokens : stagehandContext . stagehandTokenTotals . outputTokens ,
544- totalTimeMs : 0 ,
545- } ;
546- }
482+ const stagehandTotals = stagehandContext . stagehandTokenTotals ;
547483
548484 if ( stagehandTotals ) {
549485 console . log (
0 commit comments