Skip to content

Commit 8be1e0d

Browse files
committed
evals: show average excluding <= 1.0 scores
1 parent 8574e94 commit 8be1e0d

File tree

2 files changed

+20
-0
lines changed

2 files changed

+20
-0
lines changed

evals/buffbench/run-buffbench.ts

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -359,6 +359,7 @@ export async function runBuffBench(options: {
359359
agentId,
360360
runs: [],
361361
averageScore: 0,
362+
averageScoreExcludingFailures: 0,
362363
averageCost: 0,
363364
averageDuration: 0,
364365
}
@@ -422,6 +423,18 @@ export async function runBuffBench(options: {
422423
validRuns.length
423424
: 0
424425

426+
// Calculate average excluding huge failures (scores ≤1.0)
427+
const runsExcludingFailures = validRuns.filter(
428+
(r) => r.judging.overallScore > 1.0,
429+
)
430+
agentData.averageScoreExcludingFailures =
431+
runsExcludingFailures.length > 0
432+
? runsExcludingFailures.reduce(
433+
(sum, r) => sum + r.judging.overallScore,
434+
0,
435+
) / runsExcludingFailures.length
436+
: 0
437+
425438
agentData.averageCost =
426439
validRuns.length > 0
427440
? validRuns.reduce((sum, r) => sum + r.cost, 0) / validRuns.length
@@ -507,8 +520,14 @@ export async function runBuffBench(options: {
507520
const validRuns = data.runs.filter(
508521
(r) => !commitShasWithErrors.has(r.commitSha),
509522
)
523+
const runsExcludingFailures = validRuns.filter(
524+
(r) => r.judging.overallScore > 1.0,
525+
)
510526
console.log(`\n${agentId}:`)
511527
console.log(` Average Score: ${data.averageScore.toFixed(2)}/10`)
528+
console.log(
529+
` Average Score (excluding failures ≤1.0): ${data.averageScoreExcludingFailures.toFixed(2)}/10 (${runsExcludingFailures.length}/${validRuns.length} runs)`,
530+
)
512531
console.log(` Average Cost: ${data.averageCost.toFixed(4)}`)
513532
console.log(
514533
` Average Duration: ${(data.averageDuration / 1000).toFixed(1)}s`,

evals/buffbench/types.ts

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -77,6 +77,7 @@ export interface AgentEvalResults {
7777
agentId: string
7878
runs: EvalRun[]
7979
averageScore: number
80+
averageScoreExcludingFailures: number
8081
averageCost: number
8182
averageDuration: number
8283
}

0 commit comments

Comments
 (0)