Skip to content

Commit a2f68a3

Browse files
committed
evals: Run 3 judges, average their scores, return median analysis
1 parent 38578c8 commit a2f68a3

File tree

1 file changed

+90
-28
lines changed

1 file changed

+90
-28
lines changed

evals/buffbench/judge.ts

Lines changed: 90 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -128,11 +128,57 @@ interface JudgeCommitResultInput {
128128
finalCheckOutputs?: string
129129
}
130130

131+
/**
 * Runs a single judge agent over the prepared prompt and returns its verdict.
 *
 * The agent run is wrapped in `withTimeout` with a 20-minute limit
 * (presumably a timeout surfaces as a rejection — confirm against
 * `withTimeout`). Text and tool-call events are accumulated into a trace that
 * is printed only when the agent finishes without structured output.
 *
 * @param input - Judging input; only its `client` is read here.
 * @param judgePrompt - Fully rendered prompt handed to the judge agent.
 * @param judgeIndex - Zero-based judge number; shown one-based in log output.
 * @returns The judge's structured output value (cast to `JudgingResult`
 *   without validation), or `null` when the output is not structured or the
 *   run throws.
 */
async function runSingleJudge(
  input: JudgeCommitResultInput,
  judgePrompt: string,
  judgeIndex: number,
): Promise<JudgingResult | null> {
  const judgeClient = input.client
  // Log messages number the judges starting from 1.
  const judgeNumber = judgeIndex + 1
  const trace: string[] = []

  try {
    const timeoutMs = 20 * 60 * 1000
    const outcome = await withTimeout(
      judgeClient.run({
        agent: 'judge',
        prompt: judgePrompt,
        agentDefinitions: [judgeAgent],
        handleEvent: (event) => {
          switch (event.type) {
            case 'text':
              trace.push(event.text)
              break
            case 'tool_call':
              // Keep the whole tool-call event so the trace can be inspected.
              trace.push(JSON.stringify(event, null, 2))
              break
            case 'error':
              console.warn(`[Judge ${judgeNumber}] Error event:`, event.message)
              break
          }
        },
      }),
      timeoutMs,
      'Judge agent timed out after 20 minutes',
    )

    if (outcome.output.type === 'structuredOutput') {
      return outcome.output.value as JudgingResult
    }

    // Anything other than structured output counts as a failed judge:
    // dump the raw output and the collected trace for debugging.
    console.error(
      `Judge ${judgeNumber} - not structured output`,
      JSON.stringify(outcome.output, null, 2),
    )
    console.error('Judge agent output trace:', trace.join(''))
    return null
  } catch (error) {
    // A thrown error (including whatever withTimeout raises) is logged and
    // reported as a missing result rather than propagated to the caller.
    console.warn(`Judge ${judgeNumber} failed:`, error)
    return null
  }
}
177+
131178
export async function judgeCommitResult(
132179
input: JudgeCommitResultInput,
133180
): Promise<JudgingResult> {
134181
const {
135-
client,
136182
prompt,
137183
groundTruthFileDiffs,
138184
contextFiles,
@@ -169,41 +215,57 @@ ${agentDiff || '(No changes made)'}
169215
${error ? `\n## Error Encountered\n${error}` : ''}
170216
${finalCheckOutputs ? `\n## Final Check Command Outputs\n${finalCheckOutputs}` : ''}`
171217

172-
const agentOutput: string[] = []
173-
const judgeResult = await withTimeout(
174-
client.run({
175-
agent: 'judge',
176-
prompt: judgePrompt,
177-
agentDefinitions: [judgeAgent],
178-
handleEvent: (event) => {
179-
if (event.type === 'text') {
180-
agentOutput.push(event.text)
181-
} else if (event.type === 'tool_call') {
182-
agentOutput.push(JSON.stringify(event, null, 2))
183-
} else if (event.type === 'error') {
184-
console.warn('[Judge] Error event:', event.message)
185-
}
186-
},
187-
}),
188-
20 * 60 * 1000,
189-
'Judge agent timed out after 20 minutes',
218+
// Run 3 judges in parallel
219+
const judgePromises = Array.from({ length: 3 }, (_, index) =>
220+
runSingleJudge(input, judgePrompt, index),
190221
)
191222

192-
if (judgeResult.output.type !== 'structuredOutput') {
193-
console.error(
194-
'Error running judge agent - not structured output',
195-
JSON.stringify(judgeResult.output, null, 2),
196-
)
197-
console.error('Judge agent output trace:', agentOutput.join(''))
223+
const judgeResults = await Promise.all(judgePromises)
224+
const validResults = judgeResults.filter(
225+
(result): result is JudgingResult => result !== null,
226+
)
227+
228+
if (validResults.length === 0) {
229+
console.error('All judges failed to provide results')
198230
return {
199-
analysis: 'Error running judge agent - not structured output',
231+
analysis: 'Error running judge agent - all judges failed',
200232
strengths: [],
201-
weaknesses: ['Judge failed to provide structured output'],
233+
weaknesses: ['All judges failed to provide structured output'],
202234
completionScore: 0,
203235
codeQualityScore: 0,
204236
overallScore: 0,
205237
}
206238
}
207239

208-
return judgeResult.output.value as JudgingResult
240+
// Sort judges by overall score and select the median for analysis
241+
const sortedResults = validResults.sort(
242+
(a, b) => a.overallScore - b.overallScore,
243+
)
244+
const medianIndex = Math.floor(sortedResults.length / 2)
245+
const medianResult = sortedResults[medianIndex]
246+
247+
// Calculate average scores across all valid judges
248+
const averageCompletionScore =
249+
validResults.reduce((sum, r) => sum + r.completionScore, 0) /
250+
validResults.length
251+
const averageCodeQualityScore =
252+
validResults.reduce((sum, r) => sum + r.codeQualityScore, 0) /
253+
validResults.length
254+
const averageOverallScore =
255+
validResults.reduce((sum, r) => sum + r.overallScore, 0) /
256+
validResults.length
257+
258+
console.log(
259+
`Judging results overall score: ${averageOverallScore.toFixed(1)} (individual scores: ${validResults.map((r) => r.overallScore.toFixed(1)).join(', ')})`,
260+
)
261+
262+
// Return median judge's analysis with averaged scores
263+
return {
264+
analysis: medianResult.analysis,
265+
strengths: medianResult.strengths,
266+
weaknesses: medianResult.weaknesses,
267+
completionScore: averageCompletionScore,
268+
codeQualityScore: averageCodeQualityScore,
269+
overallScore: averageOverallScore,
270+
}
209271
}

0 commit comments

Comments
 (0)