Skip to content

Commit c13052c

Browse files
committed
fix token usage tracking
1 parent 0d1ecdc commit c13052c

File tree

2 files changed

+52
-226
lines changed

2 files changed

+52
-226
lines changed

evals/benchmark-mcp.ts

Lines changed: 40 additions & 104 deletions
Original file line number | Diff line number | Diff line change
@@ -49,10 +49,7 @@ interface StagehandLogContext {
4949
child: ChildProcess;
5050
port: number;
5151
browserbaseSessionId?: string;
52-
stagehandTokenTotals?: {
53-
inputTokens: number;
54-
outputTokens: number;
55-
};
52+
stagehandTokenTotals?: StagehandUsageTotals;
5653
}
5754

5855
async function startStagehandServer(
@@ -99,15 +96,44 @@ async function startStagehandServer(
9996
context.browserbaseSessionId = urlMatch[1];
10097
}
10198

102-
// Total token usage line
103-
const tokensMatch = trimmed.match(
104-
/Total token usage:\s+(\d+)\s+input tokens,\s+(\d+)\s+output tokens/i,
105-
);
106-
if (tokensMatch) {
107-
context.stagehandTokenTotals = {
108-
inputTokens: Number.parseInt(tokensMatch[1], 10),
109-
outputTokens: Number.parseInt(tokensMatch[2], 10),
110-
};
99+
// Usage metrics JSON line from Stagehand MCP
100+
const metricsMatch = trimmed.match(/^Usage metrics:\s*(\{.*\})$/);
101+
if (metricsMatch) {
102+
try {
103+
const metrics = JSON.parse(metricsMatch[1]) as {
104+
totalPromptTokens?: number;
105+
totalCompletionTokens?: number;
106+
totalInferenceTimeMs?: number;
107+
promptTokens?: number;
108+
completionTokens?: number;
109+
inputTokens?: number;
110+
outputTokens?: number;
111+
timeMs?: number;
112+
};
113+
114+
const totalInputTokens =
115+
metrics.totalPromptTokens ??
116+
metrics.promptTokens ??
117+
metrics.inputTokens ??
118+
0;
119+
const totalOutputTokens =
120+
metrics.totalCompletionTokens ??
121+
metrics.completionTokens ??
122+
metrics.outputTokens ??
123+
0;
124+
const totalTimeMs = metrics.totalInferenceTimeMs ?? metrics.timeMs ?? 0;
125+
126+
context.stagehandTokenTotals = {
127+
totalInputTokens,
128+
totalOutputTokens,
129+
totalTimeMs,
130+
};
131+
} catch (err) {
132+
console.error(
133+
"[benchmark-mcp] Failed to parse Stagehand usage metrics JSON:",
134+
err,
135+
);
136+
}
111137
}
112138
};
113139

@@ -335,82 +361,6 @@ async function runTaskWithAgent(
335361
};
336362
}
337363

338-
async function fetchStagehandTokenUsageSummary(
339-
browserbaseSessionId: string,
340-
): Promise<StagehandUsageTotals | null> {
341-
const apiKey = process.env.BROWSERBASE_API_KEY;
342-
const projectId = process.env.BROWSERBASE_PROJECT_ID;
343-
const modelApiKey =
344-
process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY || "";
345-
346-
if (!apiKey || !projectId || !modelApiKey) {
347-
console.error(
348-
"[benchmark-mcp] Skipping Stagehand replay call due to missing API keys.",
349-
);
350-
return null;
351-
}
352-
353-
const replayResponse = await fetch(
354-
`https://api.stagehand.browserbase.com/v1/sessions/${browserbaseSessionId}/replay`,
355-
{
356-
method: "GET",
357-
headers: {
358-
"x-bb-api-key": apiKey,
359-
"x-bb-project-id": projectId,
360-
"x-bb-session-id": browserbaseSessionId,
361-
"x-stream-response": "true",
362-
"x-model-api-key": modelApiKey,
363-
"x-sent-at": new Date().toISOString(),
364-
"x-language": "typescript",
365-
"x-sdk-version": "3.0.1",
366-
},
367-
},
368-
);
369-
370-
try {
371-
const replayJson = (await replayResponse.json()) as {
372-
data: {
373-
pages: Array<{
374-
actions: Array<{
375-
tokenUsage?: {
376-
inputTokens: number;
377-
outputTokens: number;
378-
timeMs?: number;
379-
};
380-
}>;
381-
}>;
382-
};
383-
};
384-
385-
const totals: StagehandUsageTotals = {
386-
totalInputTokens: 0,
387-
totalOutputTokens: 0,
388-
totalTimeMs: 0,
389-
};
390-
391-
for (const page of replayJson.data.pages) {
392-
for (const action of page.actions) {
393-
if (action.tokenUsage) {
394-
totals.totalInputTokens += action.tokenUsage.inputTokens;
395-
totals.totalOutputTokens += action.tokenUsage.outputTokens;
396-
if (typeof action.tokenUsage.timeMs === "number") {
397-
totals.totalTimeMs += action.tokenUsage.timeMs;
398-
}
399-
}
400-
}
401-
}
402-
403-
return totals;
404-
} catch (error) {
405-
const message = error instanceof Error ? error.message : String(error);
406-
console.error(
407-
"[benchmark-mcp] Failed to parse Stagehand replay response:",
408-
message,
409-
);
410-
return null;
411-
}
412-
}
413-
414364
async function runBenchmark(
415365
mcpName: string,
416366
datasetName: SupportedDatasetName,
@@ -529,21 +479,7 @@ async function runBenchmark(
529479
);
530480

531481
if (isStagehand && stagehandContext) {
532-
let stagehandTotals: StagehandUsageTotals | null = null;
533-
534-
if (stagehandContext.browserbaseSessionId) {
535-
stagehandTotals = await fetchStagehandTokenUsageSummary(
536-
stagehandContext.browserbaseSessionId,
537-
);
538-
}
539-
540-
if (!stagehandTotals && stagehandContext.stagehandTokenTotals) {
541-
stagehandTotals = {
542-
totalInputTokens: stagehandContext.stagehandTokenTotals.inputTokens,
543-
totalOutputTokens: stagehandContext.stagehandTokenTotals.outputTokens,
544-
totalTimeMs: 0,
545-
};
546-
}
482+
const stagehandTotals = stagehandContext.stagehandTokenTotals;
547483

548484
if (stagehandTotals) {
549485
console.log(

src/tools/session.ts

Lines changed: 12 additions & 122 deletions
Original file line numberDiff line numberDiff line change
@@ -7,44 +7,6 @@ import { createUIResource } from "@mcp-ui/server";
77
import type { BrowserSession } from "../types/types.js";
88
import { TextContent } from "@modelcontextprotocol/sdk/types.js";
99

10-
// Types for Stagehand replay response payload
11-
interface StagehandReplayTokenUsage {
12-
inputTokens: number;
13-
outputTokens: number;
14-
timeMs?: number;
15-
}
16-
17-
interface StagehandReplayAction {
18-
method: string;
19-
parameters?: unknown;
20-
result?: unknown;
21-
timestamp: number;
22-
endTime?: number;
23-
tokenUsage?: StagehandReplayTokenUsage;
24-
}
25-
26-
interface StagehandReplayPage {
27-
url: string;
28-
timestamp: number;
29-
duration: number;
30-
actions: StagehandReplayAction[];
31-
}
32-
33-
interface StagehandReplayData {
34-
pages: StagehandReplayPage[];
35-
}
36-
37-
interface StagehandReplayResponse {
38-
success: boolean;
39-
data: StagehandReplayData;
40-
}
41-
42-
interface StagehandReplayTokenUsageTotals {
43-
totalInputTokens: number;
44-
totalOutputTokens: number;
45-
totalTimeMs: number;
46-
}
47-
4810
// --- Tool: Create Session ---
4911
const CreateSessionInputSchema = z.object({
5012
// Keep sessionId optional
@@ -179,79 +141,6 @@ const closeSessionSchema: ToolSchema<typeof CloseSessionInputSchema> = {
179141
inputSchema: CloseSessionInputSchema,
180142
};
181143

182-
/**
183-
* Fetch token usage metrics from the Stagehand replay endpoint for a given
184-
* Browserbase session and return aggregated totals.
185-
*/
186-
async function fetchStagehandTokenUsageSummary(
187-
config: Context["config"],
188-
browserbaseSessionId: string,
189-
): Promise<StagehandReplayTokenUsageTotals | null> {
190-
const apiKey = config.browserbaseApiKey;
191-
const projectId = config.browserbaseProjectId;
192-
const modelApiKey =
193-
config.modelApiKey ||
194-
process.env.GEMINI_API_KEY ||
195-
process.env.GOOGLE_API_KEY;
196-
197-
if (!apiKey || !projectId || !modelApiKey) {
198-
process.stderr.write(
199-
"[tool.closeSession] Skipping Stagehand replay call due to missing API keys or session ID.\n",
200-
);
201-
return null;
202-
}
203-
204-
const replayResponse = await fetch(
205-
`https://api.stagehand.browserbase.com/v1/sessions/${browserbaseSessionId}/replay`,
206-
{
207-
method: "GET",
208-
headers: {
209-
"x-bb-api-key": apiKey,
210-
"x-bb-project-id": projectId,
211-
"x-bb-session-id": browserbaseSessionId,
212-
"x-stream-response": "true",
213-
"x-model-api-key": modelApiKey,
214-
"x-sent-at": new Date().toISOString(),
215-
"x-language": "typescript",
216-
"x-sdk-version": "3.0.1",
217-
},
218-
},
219-
);
220-
221-
try {
222-
// This JSON should contain tokenUsage with inputTokens/outputTokens per action
223-
const replayJson = (await replayResponse.json()) as StagehandReplayResponse;
224-
225-
// Aggregate token usage across all pages/actions into a single totals object
226-
const totals: StagehandReplayTokenUsageTotals = {
227-
totalInputTokens: 0,
228-
totalOutputTokens: 0,
229-
totalTimeMs: 0,
230-
};
231-
232-
for (const page of replayJson.data.pages) {
233-
for (const action of page.actions) {
234-
if (action.tokenUsage) {
235-
totals.totalInputTokens += action.tokenUsage.inputTokens;
236-
totals.totalOutputTokens += action.tokenUsage.outputTokens;
237-
if (typeof action.tokenUsage.timeMs === "number") {
238-
totals.totalTimeMs += action.tokenUsage.timeMs;
239-
}
240-
}
241-
}
242-
}
243-
244-
return totals;
245-
} catch (parseError) {
246-
const message =
247-
parseError instanceof Error ? parseError.message : String(parseError);
248-
process.stderr.write(
249-
`[tool.closeSession] Failed to parse Stagehand replay response: ${message}\n`,
250-
);
251-
return null;
252-
}
253-
}
254-
255144
async function handleCloseSession(context: Context): Promise<ToolResult> {
256145
const action = async (): Promise<ToolActionResult> => {
257146
// Store the current session ID before cleanup
@@ -278,18 +167,19 @@ async function handleCloseSession(context: Context): Promise<ToolResult> {
278167
await sessionManager.cleanupSession(previousSessionId);
279168
cleanupSuccessful = true;
280169

281-
// Fetch Stagehand token usage metrics via shared util and log aggregate totals
282-
if (browserbaseSessionId) {
283-
const tokenUsageTotals = await fetchStagehandTokenUsageSummary(
284-
context.config,
285-
browserbaseSessionId,
170+
// Fetch Stagehand token usage metrics directly from the Stagehand instance
171+
try {
172+
const metrics = await session?.stagehand?.metrics;
173+
174+
process.stdout.write(`Usage metrics: ${JSON.stringify(metrics)}\n`);
175+
} catch (metricsError) {
176+
const message =
177+
metricsError instanceof Error
178+
? metricsError.message
179+
: String(metricsError);
180+
process.stderr.write(
181+
`[tool.closeSession] Failed to read Stagehand metrics: ${message}\n`,
286182
);
287-
288-
if (tokenUsageTotals) {
289-
process.stdout.write(
290-
`Total token usage: ${tokenUsageTotals.totalInputTokens} input tokens, ${tokenUsageTotals.totalOutputTokens} output tokens\n`,
291-
);
292-
}
293183
}
294184
} else {
295185
process.stderr.write(

0 commit comments

Comments
 (0)