Skip to content

Commit 3fb3ec0

Browse files
committed
spec out simple evals script
1 parent 5367e62 commit 3fb3ec0

File tree

6 files changed

+3458
-4237
lines changed

6 files changed

+3458
-4237
lines changed

evals/benchmark-mcp.ts

Lines changed: 399 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,399 @@
1+
#!/usr/bin/env tsx
2+
3+
import { Command } from "commander";
4+
import * as fs from "fs/promises";
5+
import * as path from "path";
6+
import { fileURLToPath } from "url";
7+
import { config as loadEnv } from "dotenv";
8+
import { query } from "@anthropic-ai/claude-agent-sdk";
9+
10+
// Run dotenv's config() first so that later process.env reads in this script
// can pick up values from a local .env file, if one is present.
loadEnv();

// ES modules do not provide __filename/__dirname, so both are derived from
// this module's own URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
14+
15+
/** Names of the datasets this script knows how to load. */
type SupportedDatasetName = "onlineMind2Web";

/** One record of the onlineMind2Web JSONL dataset, as read by this script. */
interface OnlineMind2WebTask {
  /** Identifier printed in progress logs and copied into the task result. */
  task_id: string;
  /** Task description handed to the agent as the "Task:" line of the prompt. */
  confirmed_task: string;
  /** Start URL handed to the agent as the "Start URL:" line of the prompt. */
  website: string;
  /** Not read anywhere in this script. NOTE(review): presumably the reference step count — confirm against the dataset. */
  reference_length: number;
  /** Label printed next to the task in progress logs. NOTE(review): presumably a difficulty level — confirm. */
  level: string;
}

/** Running totals of the usage figures read from the agent message stream. */
interface ClaudeUsageTotals {
  input_tokens: number;
  output_tokens: number;
  cache_creation_input_tokens: number;
  cache_read_input_tokens: number;
  total_cost_usd: number;
}

/** Outcome of running a single task through the agent. */
interface TaskRunResult {
  taskId: string;
  website: string;
  level: string;
  /** Wall-clock time spent consuming the agent stream, in milliseconds. */
  durationMs: number;
  usage: ClaudeUsageTotals;
}

/** Token and time totals summed over the actions of a Stagehand replay response. */
interface StagehandUsageTotals {
  totalInputTokens: number;
  totalOutputTokens: number;
  totalTimeMs: number;
}
46+
47+
/** Resolves a dataset file path relative to the directory containing this script. */
const resolveDatasetPath = (relativePath: string): string =>
  path.resolve(__dirname, relativePath);

/** Absolute path of the JSONL file backing each supported dataset. */
const DATASETS: Record<SupportedDatasetName, string> = {
  onlineMind2Web: resolveDatasetPath(
    "./datasets/onlineMind2Web/onlineMind2Web.jsonl",
  ),
};
53+
54+
/** Shape of a stdio MCP server launch configuration used by the presets below. */
type McpStdioPreset = {
  type: "stdio";
  command: string;
  args: string[];
  env?: Record<string, string>;
};

/** Builds a preset that launches `packageSpec` through `npx` with an empty `env` map. */
function npxStdioPreset(packageSpec: string): McpStdioPreset {
  return {
    type: "stdio",
    command: "npx",
    args: [packageSpec],
    env: {},
  };
}

/**
 * MCP servers that can be benchmarked, keyed by the name accepted by `--mcp`.
 * Every preset pins the `@latest` tag of its package.
 */
const MCP_PRESETS: Record<string, McpStdioPreset> = {
  stagehand: npxStdioPreset("@browserbasehq/mcp-server-browserbase@latest"),
  playwright: npxStdioPreset("@playwright/mcp@latest"),
  "chrome-devtools": npxStdioPreset("chrome-devtools-mcp@latest"),
  "browser-use": npxStdioPreset("browser-use-mcp@latest"),
};
88+
89+
/**
 * Returns true when a parsed JSONL value is an object whose `task_id`,
 * `confirmed_task` and `website` are non-empty strings. Only these three
 * fields are verified; `level` and `reference_length` are passed through
 * unchecked.
 */
function hasRequiredTaskFields(value: unknown): boolean {
  if (typeof value !== "object" || value === null) {
    return false;
  }
  const record = value as Record<string, unknown>;
  return ["task_id", "confirmed_task", "website"].every((key) => {
    const field = record[key];
    return typeof field === "string" && field.length > 0;
  });
}

/**
 * Loads every usable task from the JSONL file registered for `datasetName`.
 *
 * Blank lines are ignored. Lines that are not valid JSON, or whose record
 * lacks a required field, are skipped; the first parse error is logged in
 * full and a single summary line reports how many lines were skipped.
 *
 * @param datasetName Key into `DATASETS`.
 * @returns The tasks in file order.
 * @throws Error when `datasetName` has no registered path; also rejects when
 *   the dataset file cannot be read.
 */
async function loadTasks(
  datasetName: SupportedDatasetName,
): Promise<OnlineMind2WebTask[]> {
  const datasetPath = DATASETS[datasetName];
  if (!datasetPath) {
    throw new Error(
      `[benchmark-mcp] Unsupported dataset "${datasetName}". Supported datasets: ${Object.keys(DATASETS).join(", ")}`,
    );
  }

  const raw = await fs.readFile(datasetPath, "utf-8");
  const tasks: OnlineMind2WebTask[] = [];
  let malformedCount = 0;
  let incompleteCount = 0;

  // Iterate over the unfiltered lines so the reported line number matches the file.
  raw.split(/\r?\n/).forEach((line, index) => {
    if (line.trim().length === 0) {
      return;
    }

    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch (err) {
      malformedCount += 1;
      // Surface the full error only once so a badly broken file cannot flood the log.
      if (malformedCount === 1) {
        console.error(
          `Skipping malformed JSONL line from dataset (line ${index + 1}):`,
          err,
        );
      }
      return;
    }

    if (hasRequiredTaskFields(parsed)) {
      tasks.push(parsed as OnlineMind2WebTask);
    } else {
      incompleteCount += 1;
    }
  });

  if (malformedCount > 0 || incompleteCount > 0) {
    console.error(
      `[benchmark-mcp] Skipped ${malformedCount} malformed and ${incompleteCount} incomplete line(s) in dataset "${datasetName}".`,
    );
  }

  return tasks;
}
116+
117+
/** Argument object accepted by the Agent SDK's `query()` function. */
type QueryOptions = Parameters<typeof query>[0];

/**
 * The subset of a streamed `query()` message that this script reads.
 *
 * Per the Agent SDK reference, the final result message carries
 * `total_cost_usd` at the top level, next to `usage`, so it is declared
 * there. The nested `usage.total_cost_usd` is kept only as a fallback.
 */
type QueryStreamMessage = {
  total_cost_usd?: number;
  usage?:
    | {
        input_tokens?: number;
        output_tokens?: number;
        cache_creation_input_tokens?: number;
        cache_read_input_tokens?: number;
        total_cost_usd?: number;
      }
    | undefined;
};

/**
 * Runs one dataset task through the Claude agent with a single MCP server
 * attached, and totals the usage figures reported on the message stream.
 *
 * @param task The dataset task; its `website` and `confirmed_task` are
 *   embedded in the prompt.
 * @param mcpName Key under which the server is registered in `mcpServers`.
 * @param mcpConfig Launch configuration for that server.
 * @returns Task identity, wall-clock duration in milliseconds, and usage totals.
 *   Errors raised by `query()` or while iterating the stream propagate to the caller.
 */
async function runTaskWithAgent(
  task: OnlineMind2WebTask,
  mcpName: string,
  mcpConfig: (typeof MCP_PRESETS)[string],
): Promise<TaskRunResult> {
  const prompt = [
    "You are a browsing agent.",
    "",
    `Start URL: ${task.website}`,
    `Task: ${task.confirmed_task}`,
    "",
    "Use the available MCP tools to browse and complete this task.",
    "When you are done, briefly summarize what you did.",
  ].join("\n");

  const usageTotals: ClaudeUsageTotals = {
    input_tokens: 0,
    output_tokens: 0,
    cache_creation_input_tokens: 0,
    cache_read_input_tokens: 0,
    total_cost_usd: 0,
  };

  const startTime = Date.now();

  // NOTE(review): the casts are kept from the original; they bypass the SDK's
  // own option types, so a mismatch with the installed SDK version would not
  // be caught at compile time.
  const stream = query({
    prompt,
    options: {
      mcpServers: {
        [mcpName]: mcpConfig as unknown as Record<string, unknown>,
      },
    },
  } as QueryOptions);

  for await (const message of stream as AsyncIterable<QueryStreamMessage>) {
    const usage = message.usage;

    if (usage) {
      usageTotals.input_tokens += usage.input_tokens ?? 0;
      usageTotals.output_tokens += usage.output_tokens ?? 0;
      usageTotals.cache_creation_input_tokens +=
        usage.cache_creation_input_tokens ?? 0;
      usageTotals.cache_read_input_tokens += usage.cache_read_input_tokens ?? 0;
    }

    // Cost is a top-level field of the message, not part of `usage`; reading
    // it only from `usage` always yielded 0.
    usageTotals.total_cost_usd +=
      message.total_cost_usd ?? usage?.total_cost_usd ?? 0;
  }

  const durationMs = Date.now() - startTime;

  return {
    taskId: task.task_id,
    website: task.website,
    level: task.level,
    durationMs,
    usage: usageTotals,
  };
}
188+
189+
/**
 * Fetches the Stagehand replay for the session named by
 * `BROWSERBASE_SESSION_ID` and sums the token usage of every action in it.
 *
 * This is a best-effort lookup and never rejects. It logs the reason and
 * resolves to `null` when any of these occurs:
 * - a required environment variable is missing
 *   (`BROWSERBASE_API_KEY`, `BROWSERBASE_PROJECT_ID`, `BROWSERBASE_SESSION_ID`,
 *   and one of `GEMINI_API_KEY` / `GOOGLE_API_KEY`);
 * - the request fails or returns a non-2xx status;
 * - the body is not JSON of the expected `data.pages[].actions[]` shape.
 *
 * @returns Summed input tokens, output tokens and time, or `null`.
 */
async function fetchStagehandTokenUsageSummary(): Promise<StagehandUsageTotals | null> {
  const apiKey = process.env.BROWSERBASE_API_KEY;
  const projectId = process.env.BROWSERBASE_PROJECT_ID;
  const modelApiKey =
    process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY || "";
  const browserbaseSessionId = process.env.BROWSERBASE_SESSION_ID;

  if (!apiKey || !projectId || !modelApiKey || !browserbaseSessionId) {
    console.error(
      "[benchmark-mcp] Skipping Stagehand replay call due to missing API keys or BROWSERBASE_SESSION_ID.",
    );
    return null;
  }

  // Only the fields read below are described, and all are optional because
  // the payload is untrusted.
  type ReplayPayload = {
    data?: {
      pages?: Array<{
        actions?: Array<{
          tokenUsage?: {
            inputTokens?: number;
            outputTokens?: number;
            timeMs?: number;
          };
        } | null>;
      } | null>;
    };
  };

  const asCount = (value: unknown): number =>
    typeof value === "number" && Number.isFinite(value) ? value : 0;

  try {
    // The request sits inside the try block so a network failure is reported
    // as `null` instead of rejecting this best-effort helper.
    // NOTE(review): "x-stream-response: true" is sent while the body is read
    // with `.json()`; confirm the endpoint returns a single JSON document here.
    const replayResponse = await fetch(
      `https://api.stagehand.browserbase.com/v1/sessions/${browserbaseSessionId}/replay`,
      {
        method: "GET",
        headers: {
          "x-bb-api-key": apiKey,
          "x-bb-project-id": projectId,
          "x-bb-session-id": browserbaseSessionId,
          "x-stream-response": "true",
          "x-model-api-key": modelApiKey,
          "x-sent-at": new Date().toISOString(),
          "x-language": "typescript",
          "x-sdk-version": "3.0.1",
        },
      },
    );

    if (!replayResponse.ok) {
      console.error(
        `[benchmark-mcp] Stagehand replay request failed with HTTP ${replayResponse.status} ${replayResponse.statusText}.`,
      );
      return null;
    }

    const replayJson = (await replayResponse.json()) as ReplayPayload | null;
    const pages = replayJson?.data?.pages;
    if (!Array.isArray(pages)) {
      console.error(
        "[benchmark-mcp] Stagehand replay response did not contain data.pages.",
      );
      return null;
    }

    const totals: StagehandUsageTotals = {
      totalInputTokens: 0,
      totalOutputTokens: 0,
      totalTimeMs: 0,
    };

    for (const page of pages) {
      const actions = Array.isArray(page?.actions) ? page.actions : [];
      for (const action of actions) {
        const tokenUsage = action?.tokenUsage;
        if (!tokenUsage) {
          continue;
        }
        totals.totalInputTokens += asCount(tokenUsage.inputTokens);
        totals.totalOutputTokens += asCount(tokenUsage.outputTokens);
        totals.totalTimeMs += asCount(tokenUsage.timeMs);
      }
    }

    return totals;
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    console.error(
      "[benchmark-mcp] Failed to fetch or parse Stagehand replay response:",
      message,
    );
    return null;
  }
}
263+
264+
/**
 * Runs every task of a dataset sequentially against one MCP preset and
 * prints per-task progress followed by aggregate time, token and cost totals.
 *
 * Failures are reported on stderr and through `process.exitCode = 1` rather
 * than by throwing:
 * - unknown MCP name, dataset load error, or empty dataset: nothing is run;
 * - a task that throws: it is counted as failed and the run continues, so
 *   one bad task does not discard the results of the others.
 *
 * @param mcpName Key of the preset in `MCP_PRESETS`.
 * @param datasetName Dataset to load through `loadTasks`.
 * @param limit Optional cap on the number of tasks; values that are not
 *   finite and positive are ignored with a warning.
 */
async function runBenchmark(
  mcpName: string,
  datasetName: SupportedDatasetName,
  limit?: number,
): Promise<void> {
  // Own-property lookup: a plain index would also resolve inherited keys
  // such as "constructor" and slip past the guard below.
  const mcpConfig = Object.prototype.hasOwnProperty.call(MCP_PRESETS, mcpName)
    ? MCP_PRESETS[mcpName]
    : undefined;
  if (!mcpConfig) {
    console.error(
      `[benchmark-mcp] Unsupported MCP "${mcpName}". Supported MCPs: ${Object.keys(MCP_PRESETS).join(", ")}`,
    );
    process.exitCode = 1;
    return;
  }

  let tasks: OnlineMind2WebTask[];
  try {
    tasks = await loadTasks(datasetName);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    console.error(
      `[benchmark-mcp] Failed to load dataset "${datasetName}": ${reason}`,
    );
    process.exitCode = 1;
    return;
  }

  if (tasks.length === 0) {
    console.error(
      `[benchmark-mcp] No tasks loaded for dataset "${datasetName}".`,
    );
    process.exitCode = 1;
    return;
  }

  if (limit !== undefined) {
    if (Number.isFinite(limit) && limit > 0) {
      tasks = tasks.slice(0, Math.min(limit, tasks.length));
    } else {
      console.error(
        `[benchmark-mcp] Ignoring invalid limit "${limit}"; running all ${tasks.length} tasks.`,
      );
    }
  }

  console.log(
    `[benchmark-mcp] Running benchmark for MCP "${mcpName}" on dataset "${datasetName}" with ${tasks.length} tasks.`,
  );

  const overallStart = Date.now();

  const aggregateUsage: ClaudeUsageTotals = {
    input_tokens: 0,
    output_tokens: 0,
    cache_creation_input_tokens: 0,
    cache_read_input_tokens: 0,
    total_cost_usd: 0,
  };

  let totalDurationMs = 0;
  let completedCount = 0;
  let failedCount = 0;

  for (const [index, task] of tasks.entries()) {
    const taskNumber = index + 1;

    console.log(
      `[benchmark-mcp] Task ${taskNumber}/${tasks.length} (${task.level}) ${task.task_id}`,
    );
    console.log(`[benchmark-mcp] Website: ${task.website}`);
    console.log(`[benchmark-mcp] Goal: ${task.confirmed_task}`);

    let result: TaskRunResult;
    try {
      result = await runTaskWithAgent(task, mcpName, mcpConfig);
    } catch (error) {
      failedCount += 1;
      const reason = error instanceof Error ? error.message : String(error);
      console.error(
        `[benchmark-mcp] Task ${task.task_id} failed: ${reason}`,
      );
      continue;
    }

    console.log(
      `[benchmark-mcp] Completed in ${(result.durationMs / 1000).toFixed(2)}s | Claude tokens: ${result.usage.input_tokens} in / ${result.usage.output_tokens} out`,
    );

    completedCount += 1;
    totalDurationMs += result.durationMs;
    aggregateUsage.input_tokens += result.usage.input_tokens;
    aggregateUsage.output_tokens += result.usage.output_tokens;
    aggregateUsage.cache_creation_input_tokens +=
      result.usage.cache_creation_input_tokens;
    aggregateUsage.cache_read_input_tokens +=
      result.usage.cache_read_input_tokens;
    aggregateUsage.total_cost_usd += result.usage.total_cost_usd;
  }

  const overallDurationMs = Date.now() - overallStart;
  // Average over tasks that actually produced a duration; guard the all-failed case.
  const avgTaskDurationMs =
    completedCount > 0 ? totalDurationMs / completedCount : 0;
  const failedSuffix = failedCount > 0 ? ` | Failed: ${failedCount}` : "";

  console.log("");
  console.log(
    `[benchmark-mcp] MCP: ${mcpName} | Dataset: ${datasetName} | Tasks: ${tasks.length}${failedSuffix}`,
  );
  console.log(
    `[benchmark-mcp] Total time: ${(overallDurationMs / 1000).toFixed(2)}s | Avg/task: ${(avgTaskDurationMs / 1000).toFixed(2)}s`,
  );
  console.log(
    `[benchmark-mcp] Claude tokens: ${aggregateUsage.input_tokens} in / ${aggregateUsage.output_tokens} out`,
  );
  console.log(
    `[benchmark-mcp] Claude cache tokens: ${aggregateUsage.cache_creation_input_tokens} created / ${aggregateUsage.cache_read_input_tokens} read`,
  );
  console.log(
    `[benchmark-mcp] Claude cost (approx): $${aggregateUsage.total_cost_usd.toFixed(6)}`,
  );

  if (mcpName === "stagehand") {
    const stagehandTotals = await fetchStagehandTokenUsageSummary();
    if (stagehandTotals) {
      console.log(
        `[benchmark-mcp] Stagehand tokens: ${stagehandTotals.totalInputTokens} in / ${stagehandTotals.totalOutputTokens} out | Time: ${(stagehandTotals.totalTimeMs / 1000).toFixed(2)}s`,
      );
    }
  }

  if (failedCount > 0) {
    process.exitCode = 1;
  }
}
362+
363+
// Command-line entry point: declares the options, validates them, and hands
// off to runBenchmark.
const program = new Command();

program
  .name("benchmark-mcp")
  .description(
    "Lightweight benchmarking script for MCPs using the Claude Agent SDK",
  );

program
  .requiredOption(
    "--mcp <name>",
    "MCP to benchmark (e.g. stagehand, playwright, chrome-devtools, browser-use)",
  )
  .option(
    "--dataset <name>",
    'Dataset name (currently only "onlineMind2Web" is supported)',
    "onlineMind2Web",
  )
  .option(
    "--limit <number>",
    "Limit the number of tasks to run (for quick smoke tests)",
  )
  .action(
    async (options: {
      mcp: string;
      dataset: SupportedDatasetName;
      limit?: string;
    }) => {
      let limit: number | undefined;
      if (typeof options.limit === "string") {
        // Number() rejects trailing garbage ("5abc") that parseInt would accept.
        // A bad value must stop the run: silently ignoring it would execute
        // the full dataset.
        limit = Number(options.limit);
        if (!Number.isInteger(limit) || limit <= 0) {
          console.error(
            `[benchmark-mcp] Invalid --limit "${options.limit}": expected a positive integer.`,
          );
          process.exitCode = 1;
          return;
        }
      }
      await runBenchmark(options.mcp, options.dataset, limit);
    },
  );

// The action handler is async, so parseAsync() is required for its rejection
// to be observable here instead of becoming an unhandled promise rejection.
program.parseAsync().catch((error: unknown) => {
  console.error("[benchmark-mcp] Benchmark failed:", error);
  process.exitCode = 1;
});

0 commit comments

Comments
 (0)