@@ -359,6 +359,7 @@ export async function runBuffBench(options: {
359359 agentId,
360360 runs : [ ] ,
361361 averageScore : 0 ,
362+ averageScoreExcludingFailures : 0 ,
362363 averageCost : 0 ,
363364 averageDuration : 0 ,
364365 }
@@ -422,6 +423,18 @@ export async function runBuffBench(options: {
422423 validRuns . length
423424 : 0
424425
426+ // Calculate average excluding huge failures (scores ≤1.0)
427+ const runsExcludingFailures = validRuns . filter (
428+ ( r ) => r . judging . overallScore > 1.0 ,
429+ )
430+ agentData . averageScoreExcludingFailures =
431+ runsExcludingFailures . length > 0
432+ ? runsExcludingFailures . reduce (
433+ ( sum , r ) => sum + r . judging . overallScore ,
434+ 0 ,
435+ ) / runsExcludingFailures . length
436+ : 0
437+
425438 agentData . averageCost =
426439 validRuns . length > 0
427440 ? validRuns . reduce ( ( sum , r ) => sum + r . cost , 0 ) / validRuns . length
@@ -507,8 +520,14 @@ export async function runBuffBench(options: {
507520 const validRuns = data . runs . filter (
508521 ( r ) => ! commitShasWithErrors . has ( r . commitSha ) ,
509522 )
523+ const runsExcludingFailures = validRuns . filter (
524+ ( r ) => r . judging . overallScore > 1.0 ,
525+ )
510526 console . log ( `\n${ agentId } :` )
511527 console . log ( ` Average Score: ${ data . averageScore . toFixed ( 2 ) } /10` )
528+ console . log (
529+ ` Average Score (excluding failures ≤1.0): ${ data . averageScoreExcludingFailures . toFixed ( 2 ) } /10 (${ runsExcludingFailures . length } /${ validRuns . length } runs)` ,
530+ )
512531 console . log ( ` Average Cost: ${ data . averageCost . toFixed ( 4 ) } ` )
513532 console . log (
514533 ` Average Duration: ${ ( data . averageDuration / 1000 ) . toFixed ( 1 ) } s` ,
0 commit comments