|
| 1 | +#!/usr/bin/env tsx |
| 2 | + |
| 3 | +import { Command } from "commander"; |
| 4 | +import * as fs from "fs/promises"; |
| 5 | +import * as path from "path"; |
| 6 | +import { fileURLToPath } from "url"; |
| 7 | +import { config as loadEnv } from "dotenv"; |
| 8 | +import { query } from "@anthropic-ai/claude-agent-sdk"; |
| 9 | + |
// Run dotenv's config() first so that everything below reading process.env
// sees the values it loaded.
loadEnv();

// ES modules have no built-in __filename/__dirname; derive both from this
// module's own URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
| 14 | + |
/** Names of the datasets this script knows how to load (the keys of `DATASETS`). */
type SupportedDatasetName = "onlineMind2Web";
| 16 | + |
/** One record of the onlineMind2Web JSONL dataset (one JSON object per line). */
interface OnlineMind2WebTask {
  /** Identifier printed in the progress log and copied into the task result. */
  task_id: string;
  /** Natural-language goal that is inserted into the agent prompt. */
  confirmed_task: string;
  /** Start URL that is inserted into the agent prompt. */
  website: string;
  /** Not read by this script. NOTE(review): presumably the reference solution's step count; confirm. */
  reference_length: number;
  /** Label printed next to each task. NOTE(review): presumably a difficulty tier; confirm. */
  level: string;
}
| 24 | + |
/**
 * Running totals of Claude usage, kept both per task and across the whole
 * benchmark run. Every field starts at 0 and is only ever added to.
 */
interface ClaudeUsageTotals {
  /** Summed `input_tokens` reported by the agent stream. */
  input_tokens: number;
  /** Summed `output_tokens` reported by the agent stream. */
  output_tokens: number;
  /** Summed `cache_creation_input_tokens` reported by the agent stream. */
  cache_creation_input_tokens: number;
  /** Summed `cache_read_input_tokens` reported by the agent stream. */
  cache_read_input_tokens: number;
  /** Summed cost in US dollars; printed with six decimals in the summary. */
  total_cost_usd: number;
}
| 32 | + |
/** Outcome of running a single dataset task through the agent. */
interface TaskRunResult {
  /** `task_id` of the task that was run. */
  taskId: string;
  /** `website` of the task that was run. */
  website: string;
  /** `level` of the task that was run. */
  level: string;
  /** Wall-clock time from just before the agent query started until its stream ended. */
  durationMs: number;
  /** Claude usage accumulated while consuming the agent stream. */
  usage: ClaudeUsageTotals;
}
| 40 | + |
/** Token and time totals summed over every action of a Stagehand session replay. */
interface StagehandUsageTotals {
  /** Sum of `tokenUsage.inputTokens` over all replayed actions. */
  totalInputTokens: number;
  /** Sum of `tokenUsage.outputTokens` over all replayed actions. */
  totalOutputTokens: number;
  /** Sum of `tokenUsage.timeMs` over the actions that report it. */
  totalTimeMs: number;
}
| 46 | + |
// Location of the onlineMind2Web JSONL file, relative to this script's directory.
const ONLINE_MIND2WEB_RELATIVE_PATH =
  "./datasets/onlineMind2Web/onlineMind2Web.jsonl";

/** Absolute path of the JSONL file backing each supported dataset. */
const DATASETS: Record<SupportedDatasetName, string> = {
  onlineMind2Web: path.resolve(__dirname, ONLINE_MIND2WEB_RELATIVE_PATH),
};
| 53 | + |
/** Launch description of an MCP server that is spawned as a stdio child process. */
type StdioMcpPreset = {
  type: "stdio";
  command: string;
  args: string[];
  env?: Record<string, string>;
};

/**
 * Builds the preset for an MCP server published on npm: it is started with
 * `npx <packageSpec>` and an empty `env` map.
 */
function npxStdioPreset(packageSpec: string): StdioMcpPreset {
  return { type: "stdio", command: "npx", args: [packageSpec], env: {} };
}

/** MCP servers that can be benchmarked, keyed by the value accepted for `--mcp`. */
const MCP_PRESETS: Record<string, StdioMcpPreset> = {
  stagehand: npxStdioPreset("@browserbasehq/mcp-server-browserbase@latest"),
  playwright: npxStdioPreset("@playwright/mcp@latest"),
  "chrome-devtools": npxStdioPreset("chrome-devtools-mcp@latest"),
  "browser-use": npxStdioPreset("browser-use-mcp@latest"),
};
| 88 | + |
/**
 * Reads a dataset's JSONL file and returns the tasks it contains.
 *
 * Blank lines are ignored. A line that is not valid JSON, or whose record
 * lacks a non-empty string `task_id`, `confirmed_task` or `website`, is
 * skipped and reported on stderr with its 1-based line number. The remaining
 * fields (`reference_length`, `level`) are not validated here.
 *
 * @param datasetName Key of `DATASETS` identifying the file to read.
 * @returns The usable tasks, in file order.
 * @throws Error when `datasetName` is not an own key of `DATASETS`; file
 *   system errors from reading the dataset propagate unchanged.
 */
async function loadTasks(
  datasetName: SupportedDatasetName,
): Promise<OnlineMind2WebTask[]> {
  // Own-property check: a plain lookup would also accept inherited keys such
  // as "constructor" and hand a non-string value to readFile.
  const isKnownDataset = Object.prototype.hasOwnProperty.call(
    DATASETS,
    datasetName,
  );
  const datasetPath = isKnownDataset ? DATASETS[datasetName] : undefined;
  if (!datasetPath) {
    throw new Error(
      `[benchmark-mcp] Unsupported dataset "${datasetName}". Supported datasets: ${Object.keys(DATASETS).join(", ")}`,
    );
  }

  const isNonEmptyString = (value: unknown): value is string =>
    typeof value === "string" && value.length > 0;

  // Narrows parsed JSON to a task by checking the fields the benchmark relies on.
  const isTask = (value: unknown): value is OnlineMind2WebTask => {
    if (typeof value !== "object" || value === null) {
      return false;
    }
    const record = value as Record<string, unknown>;
    return (
      isNonEmptyString(record.task_id) &&
      isNonEmptyString(record.confirmed_task) &&
      isNonEmptyString(record.website)
    );
  };

  const raw = await fs.readFile(datasetPath, "utf-8");
  const tasks: OnlineMind2WebTask[] = [];

  // Iterate over the unfiltered lines so reported line numbers match the file.
  raw.split(/\r?\n/).forEach((line, index) => {
    if (line.trim().length === 0) {
      return;
    }
    const lineNumber = index + 1;

    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch (err) {
      console.error(
        `[benchmark-mcp] Skipping malformed JSONL line ${lineNumber} from dataset:`,
        err,
      );
      return;
    }

    if (isTask(parsed)) {
      tasks.push(parsed);
    } else {
      console.error(
        `[benchmark-mcp] Skipping JSONL line ${lineNumber} from dataset: missing task_id, confirmed_task or website.`,
      );
    }
  });

  return tasks;
}
| 116 | + |
/**
 * Type of the single argument accepted by the SDK's `query` function. In this
 * file that argument is an object holding both `prompt` and `options`.
 */
type QueryOptions = Parameters<typeof query>[0];

/** Usage counters that a streamed message may carry; every counter is optional. */
interface QueryStreamUsage {
  input_tokens?: number;
  output_tokens?: number;
  cache_creation_input_tokens?: number;
  cache_read_input_tokens?: number;
  total_cost_usd?: number;
}

/**
 * Minimal view of a message yielded by the `query` stream: only the optional
 * top-level `usage` object is modelled, everything else is ignored.
 */
type QueryStreamMessage = {
  usage?: QueryStreamUsage | undefined;
};
| 130 | + |
/**
 * Runs one dataset task through the Claude Agent SDK with a single MCP server
 * attached, and records how long it took and how much Claude usage it reported.
 *
 * The whole message stream is consumed; message content is ignored and only
 * usage and cost figures are read.
 *
 * @param task Task whose `website` and `confirmed_task` are put into the prompt.
 * @param mcpName Key under which the MCP server is registered with the agent.
 * @param mcpConfig Launch configuration of that MCP server.
 * @returns Task identification, wall-clock duration and summed usage.
 *   Errors raised by the SDK stream propagate to the caller.
 */
async function runTaskWithAgent(
  task: OnlineMind2WebTask,
  mcpName: string,
  mcpConfig: (typeof MCP_PRESETS)[string],
): Promise<TaskRunResult> {
  // The Agent SDK's result message reports cost as a top-level
  // `total_cost_usd` field next to `usage`, not inside it, so widen the
  // message view locally to be able to read it.
  type StreamMessageWithCost = QueryStreamMessage & {
    total_cost_usd?: number;
  };

  const prompt = [
    "You are a browsing agent.",
    "",
    `Start URL: ${task.website}`,
    `Task: ${task.confirmed_task}`,
    "",
    "Use the available MCP tools to browse and complete this task.",
    "When you are done, briefly summarize what you did.",
  ].join("\n");

  const usageTotals: ClaudeUsageTotals = {
    input_tokens: 0,
    output_tokens: 0,
    cache_creation_input_tokens: 0,
    cache_read_input_tokens: 0,
    total_cost_usd: 0,
  };

  const startTime = Date.now();

  // NOTE(review): no allowedTools/permissionMode is passed here; confirm the
  // agent is actually permitted to call the MCP server's tools when headless.
  const stream = query({
    prompt,
    options: {
      mcpServers: {
        [mcpName]: mcpConfig as unknown as Record<string, unknown>,
      },
    },
  } as QueryOptions);

  for await (const message of stream as AsyncIterable<StreamMessageWithCost>) {
    const usage = message.usage;

    if (usage) {
      usageTotals.input_tokens += usage.input_tokens ?? 0;
      usageTotals.output_tokens += usage.output_tokens ?? 0;
      usageTotals.cache_creation_input_tokens +=
        usage.cache_creation_input_tokens ?? 0;
      usageTotals.cache_read_input_tokens += usage.cache_read_input_tokens ?? 0;
    }

    // Prefer the top-level cost; fall back to a cost nested in `usage` so a
    // message is never counted twice and the previous shape keeps working.
    usageTotals.total_cost_usd +=
      message.total_cost_usd ?? usage?.total_cost_usd ?? 0;
  }

  const durationMs = Date.now() - startTime;

  return {
    taskId: task.task_id,
    website: task.website,
    level: task.level,
    durationMs,
    usage: usageTotals,
  };
}
| 188 | + |
/**
 * Fetches the replay of a Browserbase/Stagehand session and sums the token
 * usage reported by its actions.
 *
 * Reads `BROWSERBASE_API_KEY`, `BROWSERBASE_PROJECT_ID`,
 * `BROWSERBASE_SESSION_ID` and a model key (`GEMINI_API_KEY`, falling back to
 * `GOOGLE_API_KEY`) from the environment. This is a best-effort extra: every
 * failure (missing configuration, network error, non-2xx status, unparsable or
 * unexpected payload) is logged to stderr and reported as `null` instead of
 * being thrown.
 *
 * @returns Summed input/output tokens and time, or `null` when unavailable.
 */
async function fetchStagehandTokenUsageSummary(): Promise<StagehandUsageTotals | null> {
  // Only the parts of the replay payload that are read below; everything is
  // optional because the payload comes from the network unvalidated.
  type ReplayPayload = {
    data?: {
      pages?: Array<{
        actions?: Array<{
          tokenUsage?: {
            inputTokens?: number;
            outputTokens?: number;
            timeMs?: number;
          };
        }>;
      }>;
    };
  };

  const apiKey = process.env.BROWSERBASE_API_KEY;
  const projectId = process.env.BROWSERBASE_PROJECT_ID;
  const modelApiKey =
    process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY || "";
  const browserbaseSessionId = process.env.BROWSERBASE_SESSION_ID;

  if (!apiKey || !projectId || !modelApiKey || !browserbaseSessionId) {
    console.error(
      "[benchmark-mcp] Skipping Stagehand replay call due to missing API keys or BROWSERBASE_SESSION_ID.",
    );
    return null;
  }

  // Treats anything that is not a number as contributing nothing to a sum.
  const asCount = (value: unknown): number =>
    typeof value === "number" ? value : 0;

  try {
    // The request lives inside the try block so that a network failure cannot
    // reject after the benchmark itself has already completed.
    const replayResponse = await fetch(
      `https://api.stagehand.browserbase.com/v1/sessions/${encodeURIComponent(browserbaseSessionId)}/replay`,
      {
        method: "GET",
        headers: {
          "x-bb-api-key": apiKey,
          "x-bb-project-id": projectId,
          "x-bb-session-id": browserbaseSessionId,
          "x-stream-response": "true",
          "x-model-api-key": modelApiKey,
          "x-sent-at": new Date().toISOString(),
          "x-language": "typescript",
          "x-sdk-version": "3.0.1",
        },
      },
    );

    if (!replayResponse.ok) {
      console.error(
        `[benchmark-mcp] Stagehand replay request failed with HTTP ${replayResponse.status} ${replayResponse.statusText}.`,
      );
      return null;
    }

    const replayJson = (await replayResponse.json()) as ReplayPayload | null;
    const pages = replayJson?.data?.pages;
    if (!Array.isArray(pages)) {
      // Returning zero totals here would look like a real measurement.
      console.error(
        "[benchmark-mcp] Stagehand replay response did not contain data.pages.",
      );
      return null;
    }

    const totals: StagehandUsageTotals = {
      totalInputTokens: 0,
      totalOutputTokens: 0,
      totalTimeMs: 0,
    };

    for (const page of pages) {
      for (const action of page?.actions ?? []) {
        const tokenUsage = action?.tokenUsage;
        if (!tokenUsage) {
          continue;
        }
        totals.totalInputTokens += asCount(tokenUsage.inputTokens);
        totals.totalOutputTokens += asCount(tokenUsage.outputTokens);
        totals.totalTimeMs += asCount(tokenUsage.timeMs);
      }
    }

    return totals;
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    console.error(
      "[benchmark-mcp] Failed to fetch or parse Stagehand replay response:",
      message,
    );
    return null;
  }
}
| 263 | + |
/**
 * Runs the benchmark: loads the dataset, runs each task sequentially through
 * the agent with the chosen MCP server, and prints per-task and aggregate
 * timing, token and cost figures.
 *
 * Problems are reported on stderr and through `process.exitCode = 1` rather
 * than by throwing: an unknown MCP name or an empty dataset ends the run
 * early, and a task that throws is logged and skipped so the remaining tasks
 * and the summary still run.
 *
 * @param mcpName Key of `MCP_PRESETS` selecting the MCP server.
 * @param datasetName Dataset to load the tasks from.
 * @param limit Optional maximum number of tasks; values that are not finite
 *   numbers greater than 0 are reported and ignored.
 */
async function runBenchmark(
  mcpName: string,
  datasetName: SupportedDatasetName,
  limit?: number,
): Promise<void> {
  // Own-property check so names inherited from Object.prototype (for example
  // "constructor") are not mistaken for presets.
  const mcpConfig = Object.prototype.hasOwnProperty.call(MCP_PRESETS, mcpName)
    ? MCP_PRESETS[mcpName]
    : undefined;
  if (!mcpConfig) {
    console.error(
      `[benchmark-mcp] Unsupported MCP "${mcpName}". Supported MCPs: ${Object.keys(MCP_PRESETS).join(", ")}`,
    );
    process.exitCode = 1;
    return;
  }

  const allTasks = await loadTasks(datasetName);
  if (allTasks.length === 0) {
    console.error(
      `[benchmark-mcp] No tasks loaded for dataset "${datasetName}".`,
    );
    process.exitCode = 1;
    return;
  }

  const hasUsableLimit =
    typeof limit === "number" && Number.isFinite(limit) && limit > 0;
  if (limit !== undefined && !hasUsableLimit) {
    console.error(
      `[benchmark-mcp] Ignoring invalid task limit "${limit}"; running all ${allTasks.length} tasks.`,
    );
  }
  // slice() already clamps to the array length.
  const tasks = hasUsableLimit ? allTasks.slice(0, limit) : allTasks;

  console.log(
    `[benchmark-mcp] Running benchmark for MCP "${mcpName}" on dataset "${datasetName}" with ${tasks.length} tasks.`,
  );

  const overallStart = Date.now();

  const aggregateUsage: ClaudeUsageTotals = {
    input_tokens: 0,
    output_tokens: 0,
    cache_creation_input_tokens: 0,
    cache_read_input_tokens: 0,
    total_cost_usd: 0,
  };

  let totalDurationMs = 0;
  let completedTaskCount = 0;
  const failedTaskIds: string[] = [];

  for (const [index, task] of tasks.entries()) {
    const taskNumber = index + 1;

    console.log(
      `[benchmark-mcp] Task ${taskNumber}/${tasks.length} (${task.level}) ${task.task_id}`,
    );
    console.log(`[benchmark-mcp] Website: ${task.website}`);
    console.log(`[benchmark-mcp] Goal: ${task.confirmed_task}`);

    let result: TaskRunResult;
    try {
      result = await runTaskWithAgent(task, mcpName, mcpConfig);
    } catch (error) {
      // One failing task must not discard the figures already collected.
      const reason = error instanceof Error ? error.message : String(error);
      console.error(`[benchmark-mcp] Task ${task.task_id} failed: ${reason}`);
      failedTaskIds.push(task.task_id);
      process.exitCode = 1;
      continue;
    }

    console.log(
      `[benchmark-mcp] Completed in ${(result.durationMs / 1000).toFixed(2)}s | Claude tokens: ${result.usage.input_tokens} in / ${result.usage.output_tokens} out`,
    );

    completedTaskCount += 1;
    totalDurationMs += result.durationMs;
    aggregateUsage.input_tokens += result.usage.input_tokens;
    aggregateUsage.output_tokens += result.usage.output_tokens;
    aggregateUsage.cache_creation_input_tokens +=
      result.usage.cache_creation_input_tokens;
    aggregateUsage.cache_read_input_tokens +=
      result.usage.cache_read_input_tokens;
    aggregateUsage.total_cost_usd += result.usage.total_cost_usd;
  }

  const overallDurationMs = Date.now() - overallStart;
  // Average over the tasks that produced a result; avoid 0/0 when none did.
  const avgTaskDurationMs =
    completedTaskCount > 0 ? totalDurationMs / completedTaskCount : 0;

  console.log("");
  console.log(
    `[benchmark-mcp] MCP: ${mcpName} | Dataset: ${datasetName} | Tasks: ${tasks.length}`,
  );
  if (failedTaskIds.length > 0) {
    console.log(
      `[benchmark-mcp] Failed tasks: ${failedTaskIds.length}/${tasks.length} (${failedTaskIds.join(", ")})`,
    );
  }
  console.log(
    `[benchmark-mcp] Total time: ${(overallDurationMs / 1000).toFixed(2)}s | Avg/task: ${(avgTaskDurationMs / 1000).toFixed(2)}s`,
  );
  console.log(
    `[benchmark-mcp] Claude tokens: ${aggregateUsage.input_tokens} in / ${aggregateUsage.output_tokens} out`,
  );
  console.log(
    `[benchmark-mcp] Claude cache tokens: ${aggregateUsage.cache_creation_input_tokens} created / ${aggregateUsage.cache_read_input_tokens} read`,
  );
  console.log(
    `[benchmark-mcp] Claude cost (approx): $${aggregateUsage.total_cost_usd.toFixed(6)}`,
  );

  if (mcpName === "stagehand") {
    const stagehandTotals = await fetchStagehandTokenUsageSummary();
    if (stagehandTotals) {
      console.log(
        `[benchmark-mcp] Stagehand tokens: ${stagehandTotals.totalInputTokens} in / ${stagehandTotals.totalOutputTokens} out | Time: ${(stagehandTotals.totalTimeMs / 1000).toFixed(2)}s`,
      );
    }
  }
}
| 362 | + |
// Command-line entry point: declares the options and hands them to runBenchmark.
const program = new Command();

program
  .name("benchmark-mcp")
  .description(
    "Lightweight benchmarking script for MCPs using the Claude Agent SDK",
  );

program
  .requiredOption(
    "--mcp <name>",
    "MCP to benchmark (e.g. stagehand, playwright, chrome-devtools, browser-use)",
  )
  .option(
    "--dataset <name>",
    'Dataset name (currently only "onlineMind2Web" is supported)',
    "onlineMind2Web",
  )
  .option(
    "--limit <number>",
    "Limit the number of tasks to run (for quick smoke tests)",
  )
  .action(
    async (options: {
      mcp: string;
      dataset: SupportedDatasetName;
      limit?: string;
    }) => {
      let limit: number | undefined;
      if (typeof options.limit === "string") {
        // Accept only plain positive integers. A typo such as "--limit abc"
        // would otherwise be ignored downstream and run the entire dataset.
        const parsedLimit = /^\d+$/.test(options.limit.trim())
          ? Number.parseInt(options.limit, 10)
          : Number.NaN;
        if (!Number.isSafeInteger(parsedLimit) || parsedLimit < 1) {
          console.error(
            `[benchmark-mcp] Invalid --limit "${options.limit}". Expected a positive integer.`,
          );
          process.exitCode = 1;
          return;
        }
        limit = parsedLimit;
      }
      await runBenchmark(options.mcp, options.dataset, limit);
    },
  );

// The action handler is async, so parseAsync() is required for its rejection
// to be observed; report it and signal failure through the exit code.
program.parseAsync().catch((error: unknown) => {
  const reason = error instanceof Error ? error.message : String(error);
  console.error(`[benchmark-mcp] Benchmark failed: ${reason}`);
  process.exitCode = 1;
});
0 commit comments