Fix token usage tracking: read metrics from the Stagehand instance instead of the replay API

shrey150 · shrey150 · commit c13052cd9686 · 2025-11-17T16:13:31.000-08:00
diff --git a/evals/benchmark-mcp.ts b/evals/benchmark-mcp.ts
@@ -49,10 +49,7 @@ interface StagehandLogContext {
   child: ChildProcess;
   port: number;
   browserbaseSessionId?: string;
-  stagehandTokenTotals?: {
-    inputTokens: number;
-    outputTokens: number;
-  };
+  stagehandTokenTotals?: StagehandUsageTotals;
 }
 
 async function startStagehandServer(
@@ -99,15 +96,44 @@ async function startStagehandServer(
       context.browserbaseSessionId = urlMatch[1];
     }
 
-    // Total token usage line
-    const tokensMatch = trimmed.match(
-      /Total token usage:\s+(\d+)\s+input tokens,\s+(\d+)\s+output tokens/i,
-    );
-    if (tokensMatch) {
-      context.stagehandTokenTotals = {
-        inputTokens: Number.parseInt(tokensMatch[1], 10),
-        outputTokens: Number.parseInt(tokensMatch[2], 10),
-      };
+    // Usage metrics JSON line from Stagehand MCP
+    const metricsMatch = trimmed.match(/^Usage metrics:\s*(\{.*\})$/);
+    if (metricsMatch) {
+      try {
+        const metrics = JSON.parse(metricsMatch[1]) as {
+          totalPromptTokens?: number;
+          totalCompletionTokens?: number;
+          totalInferenceTimeMs?: number;
+          promptTokens?: number;
+          completionTokens?: number;
+          inputTokens?: number;
+          outputTokens?: number;
+          timeMs?: number;
+        };
+
+        const totalInputTokens =
+          metrics.totalPromptTokens ??
+          metrics.promptTokens ??
+          metrics.inputTokens ??
+          0;
+        const totalOutputTokens =
+          metrics.totalCompletionTokens ??
+          metrics.completionTokens ??
+          metrics.outputTokens ??
+          0;
+        const totalTimeMs = metrics.totalInferenceTimeMs ?? metrics.timeMs ?? 0;
+
+        context.stagehandTokenTotals = {
+          totalInputTokens,
+          totalOutputTokens,
+          totalTimeMs,
+        };
+      } catch (err) {
+        console.error(
+          "[benchmark-mcp] Failed to parse Stagehand usage metrics JSON:",
+          err,
+        );
+      }
     }
   };
 
@@ -335,82 +361,6 @@ async function runTaskWithAgent(
   };
 }
 
-async function fetchStagehandTokenUsageSummary(
-  browserbaseSessionId: string,
-): Promise<StagehandUsageTotals | null> {
-  const apiKey = process.env.BROWSERBASE_API_KEY;
-  const projectId = process.env.BROWSERBASE_PROJECT_ID;
-  const modelApiKey =
-    process.env.GEMINI_API_KEY || process.env.GOOGLE_API_KEY || "";
-
-  if (!apiKey || !projectId || !modelApiKey) {
-    console.error(
-      "[benchmark-mcp] Skipping Stagehand replay call due to missing API keys.",
-    );
-    return null;
-  }
-
-  const replayResponse = await fetch(
-    `https://api.stagehand.browserbase.com/v1/sessions/${browserbaseSessionId}/replay`,
-    {
-      method: "GET",
-      headers: {
-        "x-bb-api-key": apiKey,
-        "x-bb-project-id": projectId,
-        "x-bb-session-id": browserbaseSessionId,
-        "x-stream-response": "true",
-        "x-model-api-key": modelApiKey,
-        "x-sent-at": new Date().toISOString(),
-        "x-language": "typescript",
-        "x-sdk-version": "3.0.1",
-      },
-    },
-  );
-
-  try {
-    const replayJson = (await replayResponse.json()) as {
-      data: {
-        pages: Array<{
-          actions: Array<{
-            tokenUsage?: {
-              inputTokens: number;
-              outputTokens: number;
-              timeMs?: number;
-            };
-          }>;
-        }>;
-      };
-    };
-
-    const totals: StagehandUsageTotals = {
-      totalInputTokens: 0,
-      totalOutputTokens: 0,
-      totalTimeMs: 0,
-    };
-
-    for (const page of replayJson.data.pages) {
-      for (const action of page.actions) {
-        if (action.tokenUsage) {
-          totals.totalInputTokens += action.tokenUsage.inputTokens;
-          totals.totalOutputTokens += action.tokenUsage.outputTokens;
-          if (typeof action.tokenUsage.timeMs === "number") {
-            totals.totalTimeMs += action.tokenUsage.timeMs;
-          }
-        }
-      }
-    }
-
-    return totals;
-  } catch (error) {
-    const message = error instanceof Error ? error.message : String(error);
-    console.error(
-      "[benchmark-mcp] Failed to parse Stagehand replay response:",
-      message,
-    );
-    return null;
-  }
-}
-
 async function runBenchmark(
   mcpName: string,
   datasetName: SupportedDatasetName,
@@ -529,21 +479,7 @@ async function runBenchmark(
     );
 
     if (isStagehand && stagehandContext) {
-      let stagehandTotals: StagehandUsageTotals | null = null;
-
-      if (stagehandContext.browserbaseSessionId) {
-        stagehandTotals = await fetchStagehandTokenUsageSummary(
-          stagehandContext.browserbaseSessionId,
-        );
-      }
-
-      if (!stagehandTotals && stagehandContext.stagehandTokenTotals) {
-        stagehandTotals = {
-          totalInputTokens: stagehandContext.stagehandTokenTotals.inputTokens,
-          totalOutputTokens: stagehandContext.stagehandTokenTotals.outputTokens,
-          totalTimeMs: 0,
-        };
-      }
+      const stagehandTotals = stagehandContext.stagehandTokenTotals;
 
       if (stagehandTotals) {
         console.log(
diff --git a/src/tools/session.ts b/src/tools/session.ts
@@ -7,44 +7,6 @@ import { createUIResource } from "@mcp-ui/server";
 import type { BrowserSession } from "../types/types.js";
 import { TextContent } from "@modelcontextprotocol/sdk/types.js";
 
-// Types for Stagehand replay response payload
-interface StagehandReplayTokenUsage {
-  inputTokens: number;
-  outputTokens: number;
-  timeMs?: number;
-}
-
-interface StagehandReplayAction {
-  method: string;
-  parameters?: unknown;
-  result?: unknown;
-  timestamp: number;
-  endTime?: number;
-  tokenUsage?: StagehandReplayTokenUsage;
-}
-
-interface StagehandReplayPage {
-  url: string;
-  timestamp: number;
-  duration: number;
-  actions: StagehandReplayAction[];
-}
-
-interface StagehandReplayData {
-  pages: StagehandReplayPage[];
-}
-
-interface StagehandReplayResponse {
-  success: boolean;
-  data: StagehandReplayData;
-}
-
-interface StagehandReplayTokenUsageTotals {
-  totalInputTokens: number;
-  totalOutputTokens: number;
-  totalTimeMs: number;
-}
-
 // --- Tool: Create Session ---
 const CreateSessionInputSchema = z.object({
   // Keep sessionId optional
@@ -179,79 +141,6 @@ const closeSessionSchema: ToolSchema<typeof CloseSessionInputSchema> = {
   inputSchema: CloseSessionInputSchema,
 };
 
-/**
- * Fetch token usage metrics from the Stagehand replay endpoint for a given
- * Browserbase session and return aggregated totals.
- */
-async function fetchStagehandTokenUsageSummary(
-  config: Context["config"],
-  browserbaseSessionId: string,
-): Promise<StagehandReplayTokenUsageTotals | null> {
-  const apiKey = config.browserbaseApiKey;
-  const projectId = config.browserbaseProjectId;
-  const modelApiKey =
-    config.modelApiKey ||
-    process.env.GEMINI_API_KEY ||
-    process.env.GOOGLE_API_KEY;
-
-  if (!apiKey || !projectId || !modelApiKey) {
-    process.stderr.write(
-      "[tool.closeSession] Skipping Stagehand replay call due to missing API keys or session ID.\n",
-    );
-    return null;
-  }
-
-  const replayResponse = await fetch(
-    `https://api.stagehand.browserbase.com/v1/sessions/${browserbaseSessionId}/replay`,
-    {
-      method: "GET",
-      headers: {
-        "x-bb-api-key": apiKey,
-        "x-bb-project-id": projectId,
-        "x-bb-session-id": browserbaseSessionId,
-        "x-stream-response": "true",
-        "x-model-api-key": modelApiKey,
-        "x-sent-at": new Date().toISOString(),
-        "x-language": "typescript",
-        "x-sdk-version": "3.0.1",
-      },
-    },
-  );
-
-  try {
-    // This JSON should contain tokenUsage with inputTokens/outputTokens per action
-    const replayJson = (await replayResponse.json()) as StagehandReplayResponse;
-
-    // Aggregate token usage across all pages/actions into a single totals object
-    const totals: StagehandReplayTokenUsageTotals = {
-      totalInputTokens: 0,
-      totalOutputTokens: 0,
-      totalTimeMs: 0,
-    };
-
-    for (const page of replayJson.data.pages) {
-      for (const action of page.actions) {
-        if (action.tokenUsage) {
-          totals.totalInputTokens += action.tokenUsage.inputTokens;
-          totals.totalOutputTokens += action.tokenUsage.outputTokens;
-          if (typeof action.tokenUsage.timeMs === "number") {
-            totals.totalTimeMs += action.tokenUsage.timeMs;
-          }
-        }
-      }
-    }
-
-    return totals;
-  } catch (parseError) {
-    const message =
-      parseError instanceof Error ? parseError.message : String(parseError);
-    process.stderr.write(
-      `[tool.closeSession] Failed to parse Stagehand replay response: ${message}\n`,
-    );
-    return null;
-  }
-}
-
 async function handleCloseSession(context: Context): Promise<ToolResult> {
   const action = async (): Promise<ToolActionResult> => {
     // Store the current session ID before cleanup
@@ -278,18 +167,19 @@ async function handleCloseSession(context: Context): Promise<ToolResult> {
         await sessionManager.cleanupSession(previousSessionId);
         cleanupSuccessful = true;
 
-        // Fetch Stagehand token usage metrics via shared util and log aggregate totals
-        if (browserbaseSessionId) {
-          const tokenUsageTotals = await fetchStagehandTokenUsageSummary(
-            context.config,
-            browserbaseSessionId,
+        // Read usage metrics from the Stagehand instance and print them as a "Usage metrics: <json>" stdout line
+        try {
+          const metrics = await session?.stagehand?.metrics;
+
+          process.stdout.write(`Usage metrics: ${JSON.stringify(metrics)}\n`);
+        } catch (metricsError) {
+          const message =
+            metricsError instanceof Error
+              ? metricsError.message
+              : String(metricsError);
+          process.stderr.write(
+            `[tool.closeSession] Failed to read Stagehand metrics: ${message}\n`,
           );
-
-          if (tokenUsageTotals) {
-            process.stdout.write(
-              `Total token usage: ${tokenUsageTotals.totalInputTokens} input tokens, ${tokenUsageTotals.totalOutputTokens} output tokens\n`,
-            );
-          }
         }
       } else {
         process.stderr.write(