@@ -128,11 +128,57 @@ interface JudgeCommitResultInput {
128128 finalCheckOutputs ?: string
129129}
130130
/**
 * Reports whether `value` is an object whose `completionScore`,
 * `codeQualityScore` and `overallScore` are all finite numbers.
 *
 * Only the three score fields are checked because they are the ones the
 * aggregation step does arithmetic on; the rest of the result shape is
 * declared elsewhere and is not inspected here.
 */
function hasFiniteJudgeScores(value: unknown): boolean {
  if (typeof value !== 'object' || value === null) {
    return false
  }
  const candidate = value as Record<string, unknown>
  return ['completionScore', 'codeQualityScore', 'overallScore'].every(
    (key) => Number.isFinite(candidate[key]),
  )
}

/**
 * Runs one judge agent against `judgePrompt` and returns its verdict.
 *
 * Never throws: any failure (timeout, thrown error, non-structured output,
 * or structured output whose scores are not finite numbers) is logged and
 * reported as `null`, so callers can run several judges side by side and
 * simply discard the ones that failed.
 *
 * @param input - Judge input; only `client` is read here.
 * @param judgePrompt - Fully rendered prompt handed to the judge agent.
 * @param judgeIndex - Zero-based judge index; logged as `judgeIndex + 1`.
 * @returns The judge's result, or `null` if this judge failed.
 */
async function runSingleJudge(
  input: JudgeCommitResultInput,
  judgePrompt: string,
  judgeIndex: number,
): Promise<JudgingResult | null> {
  const { client } = input

  // Upper bound on a single judge run (20 minutes).
  const judgeTimeoutMs = 20 * 60 * 1000

  // Text and tool-call events are collected so that a failed run can be
  // diagnosed from the log output.
  const agentOutput: string[] = []
  try {
    // NOTE(review): withTimeout is defined elsewhere; presumably it rejects
    // with the given message once the deadline passes — confirm.
    const judgeResult = await withTimeout(
      client.run({
        agent: 'judge',
        prompt: judgePrompt,
        agentDefinitions: [judgeAgent],
        handleEvent: (event) => {
          if (event.type === 'text') {
            agentOutput.push(event.text)
          } else if (event.type === 'tool_call') {
            agentOutput.push(JSON.stringify(event, null, 2))
          } else if (event.type === 'error') {
            console.warn(
              `[Judge ${judgeIndex + 1}] Error event:`,
              event.message,
            )
          }
        },
      }),
      judgeTimeoutMs,
      'Judge agent timed out after 20 minutes',
    )

    if (judgeResult.output.type !== 'structuredOutput') {
      console.error(
        `Judge ${judgeIndex + 1} - not structured output`,
        JSON.stringify(judgeResult.output, null, 2),
      )
      console.error('Judge agent output trace:', agentOutput.join(''))
      return null
    }

    const value: unknown = judgeResult.output.value
    // The output comes from a model, so a bare cast is not enough: a missing
    // or non-numeric score would turn the averaged scores into NaN for every
    // judge. Treat such a result as a failed judge instead.
    if (!hasFiniteJudgeScores(value)) {
      console.error(
        `Judge ${judgeIndex + 1} - structured output has missing or non-numeric scores`,
        JSON.stringify(value, null, 2),
      )
      console.error('Judge agent output trace:', agentOutput.join(''))
      return null
    }

    // The cast remains because only the score fields were verified above.
    return value as JudgingResult
  } catch (error) {
    console.warn(`Judge ${judgeIndex + 1} failed:`, error)
    return null
  }
}
177+
131178export async function judgeCommitResult (
132179 input : JudgeCommitResultInput ,
133180) : Promise < JudgingResult > {
134181 const {
135- client,
136182 prompt,
137183 groundTruthFileDiffs,
138184 contextFiles,
@@ -169,41 +215,57 @@ ${agentDiff || '(No changes made)'}
169215${ error ? `\n## Error Encountered\n${ error } ` : '' }
170216${ finalCheckOutputs ? `\n## Final Check Command Outputs\n${ finalCheckOutputs } ` : '' } `
171217
172- const agentOutput : string [ ] = [ ]
173- const judgeResult = await withTimeout (
174- client . run ( {
175- agent : 'judge' ,
176- prompt : judgePrompt ,
177- agentDefinitions : [ judgeAgent ] ,
178- handleEvent : ( event ) => {
179- if ( event . type === 'text' ) {
180- agentOutput . push ( event . text )
181- } else if ( event . type === 'tool_call' ) {
182- agentOutput . push ( JSON . stringify ( event , null , 2 ) )
183- } else if ( event . type === 'error' ) {
184- console . warn ( '[Judge] Error event:' , event . message )
185- }
186- } ,
187- } ) ,
188- 20 * 60 * 1000 ,
189- 'Judge agent timed out after 20 minutes' ,
218+ // Run 3 judges in parallel
219+ const judgePromises = Array . from ( { length : 3 } , ( _ , index ) =>
220+ runSingleJudge ( input , judgePrompt , index ) ,
190221 )
191222
192- if ( judgeResult . output . type !== 'structuredOutput' ) {
193- console . error (
194- 'Error running judge agent - not structured output' ,
195- JSON . stringify ( judgeResult . output , null , 2 ) ,
196- )
197- console . error ( 'Judge agent output trace:' , agentOutput . join ( '' ) )
223+ const judgeResults = await Promise . all ( judgePromises )
224+ const validResults = judgeResults . filter (
225+ ( result ) : result is JudgingResult => result !== null ,
226+ )
227+
228+ if ( validResults . length === 0 ) {
229+ console . error ( 'All judges failed to provide results' )
198230 return {
199- analysis : 'Error running judge agent - not structured output ' ,
231+ analysis : 'Error running judge agent - all judges failed ' ,
200232 strengths : [ ] ,
201- weaknesses : [ 'Judge failed to provide structured output' ] ,
233+ weaknesses : [ 'All judges failed to provide structured output' ] ,
202234 completionScore : 0 ,
203235 codeQualityScore : 0 ,
204236 overallScore : 0 ,
205237 }
206238 }
207239
208- return judgeResult . output . value as JudgingResult
240+ // Sort judges by overall score and select the median for analysis
241+ const sortedResults = validResults . sort (
242+ ( a , b ) => a . overallScore - b . overallScore ,
243+ )
244+ const medianIndex = Math . floor ( sortedResults . length / 2 )
245+ const medianResult = sortedResults [ medianIndex ]
246+
247+ // Calculate average scores across all valid judges
248+ const averageCompletionScore =
249+ validResults . reduce ( ( sum , r ) => sum + r . completionScore , 0 ) /
250+ validResults . length
251+ const averageCodeQualityScore =
252+ validResults . reduce ( ( sum , r ) => sum + r . codeQualityScore , 0 ) /
253+ validResults . length
254+ const averageOverallScore =
255+ validResults . reduce ( ( sum , r ) => sum + r . overallScore , 0 ) /
256+ validResults . length
257+
258+ console . log (
259+ `Judging results overall score: ${ averageOverallScore . toFixed ( 1 ) } (individual scores: ${ validResults . map ( ( r ) => r . overallScore . toFixed ( 1 ) ) . join ( ', ' ) } )` ,
260+ )
261+
262+ // Return median judge's analysis with averaged scores
263+ return {
264+ analysis : medianResult . analysis ,
265+ strengths : medianResult . strengths ,
266+ weaknesses : medianResult . weaknesses ,
267+ completionScore : averageCompletionScore ,
268+ codeQualityScore : averageCodeQualityScore ,
269+ overallScore : averageOverallScore ,
270+ }
209271}
0 commit comments