Skip to content

Commit 0378f48

Browse files
committed
feat: support retrying timed out evaluations
We should not mark results as flawed or skip them because of timeouts caused by, e.g., a stuck build or a stuck dev server.
1 parent 4c95a39 commit 0378f48

File tree

4 files changed

+76
-44
lines changed

4 files changed

+76
-44
lines changed

runner/configuration/constants.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,9 @@ export const DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS = 1;
3333
*/
3434
export const DEFAULT_MAX_TEST_REPAIR_ATTEMPTS = 0;
3535

36+
/** Default number of retries when a prompt evaluation timed out. */
37+
export const DEFAULT_PROMPT_TIMEOUT_RETRIES = 1;
38+
3639
/** Name of the folder where we store all generated reports */
3740
export const REPORTS_ROOT_DIR = join(rootDir, 'reports');
3841

runner/eval-cli.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import {
66
DEFAULT_MAX_BUILD_REPAIR_ATTEMPTS,
77
DEFAULT_MAX_TEST_REPAIR_ATTEMPTS,
88
DEFAULT_MODEL_NAME,
9+
DEFAULT_PROMPT_TIMEOUT_RETRIES,
910
} from './configuration/constants.js';
1011
import {generateCodeAndAssess} from './orchestration/generate.js';
1112
import {logReportToConsole, writeReportToDisk} from './reporting/report-logging.js';
@@ -42,6 +43,7 @@ interface Options {
4243
skipLighthouse?: boolean;
4344
maxTestRepairAttempts?: number;
4445
maxBuildRepairAttempts?: number;
46+
promptTimeoutRetries?: number;
4547
}
4648

4749
function builder(argv: Argv): Argv<Options> {
@@ -168,6 +170,12 @@ function builder(argv: Argv): Argv<Options> {
168170
description:
169171
'Number of repair attempts for discovered test failures (including a11y violations and ones from testCommand)',
170172
})
173+
.option('prompt-timeout-retries', {
174+
type: 'number',
175+
default: DEFAULT_PROMPT_TIMEOUT_RETRIES,
176+
description:
177+
'Maximum number of times to retry a prompt evaluation after it fails due to a timeout.',
178+
})
171179
.strict()
172180
.version(false)
173181
.help()
@@ -221,6 +229,7 @@ async function handler(cliArgs: Arguments<Options>): Promise<void> {
221229
skipLighthouse: cliArgs.skipLighthouse,
222230
maxBuildRepairAttempts: cliArgs.maxBuildRepairAttempts,
223231
maxTestRepairAttempts: cliArgs.maxTestRepairAttempts,
232+
promptTimeoutRetries: cliArgs.promptTimeoutRetries,
224233
abortSignal: abortCtrl.signal,
225234
});
226235

runner/orchestration/generate.ts

Lines changed: 63 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ import {
2121
} from '../shared-interfaces.js';
2222
import {UserFacingError} from '../utils/errors.js';
2323
import {executeCommand} from '../utils/exec.js';
24-
import {callWithTimeout} from '../utils/timeout.js';
24+
import {callWithTimeout, TimeoutError} from '../utils/timeout.js';
2525
import {LocalExecutor} from './executors/local-executor.js';
2626
import {startEvaluationTask} from './generate-eval-task.js';
2727
import {prepareSummary} from './generate-summary.js';
@@ -145,55 +145,74 @@ export async function generateCodeAndAssess(options: AssessmentConfig): Promise<
145145
for (const rootPromptDef of promptsToProcess) {
146146
allTasks.push(
147147
appConcurrencyQueue.add(async () => {
148-
const evalID = await env.executor.initializeEval();
149-
let results: AssessmentResult[] | undefined;
150-
151-
try {
152-
results = await callWithTimeout(
153-
`Evaluation of ${rootPromptDef.name}`,
154-
async timeoutAbortSignal =>
155-
startEvaluationTask(
156-
options,
157-
evalID,
158-
env,
159-
autoraterLlm,
160-
cujGenerationLlm,
161-
rootPromptDef,
162-
combineAbortSignals(
163-
allTasksAbortCtrl.signal,
164-
timeoutAbortSignal,
165-
options.abortSignal,
148+
const evaluate = async () => {
149+
const evalID = await env.executor.initializeEval();
150+
let results: AssessmentResult[] | undefined;
151+
152+
try {
153+
results = await callWithTimeout(
154+
`Evaluation of ${rootPromptDef.name}`,
155+
async timeoutAbortSignal =>
156+
startEvaluationTask(
157+
options,
158+
evalID,
159+
env,
160+
autoraterLlm,
161+
cujGenerationLlm,
162+
rootPromptDef,
163+
combineAbortSignals(
164+
allTasksAbortCtrl.signal,
165+
timeoutAbortSignal,
166+
options.abortSignal,
167+
),
168+
workerConcurrencyQueue,
169+
progress,
166170
),
167-
workerConcurrencyQueue,
168-
progress,
169-
),
170-
// A timeout is used to prevent from stuck evaluations.
171-
env.promptTimeoutMinutes ?? 10,
172-
);
173-
return results;
174-
} catch (e: unknown) {
175-
failedPrompts.push({
176-
promptName: rootPromptDef.name,
177-
error: `${e}`,
178-
stack: e instanceof Error ? e.stack : undefined,
179-
});
180-
181-
let details = `Error: ${e}`;
182-
if (e instanceof Error && e.stack) {
183-
details += `\nStack: ${e.stack}`;
171+
// A timeout is used to prevent stuck evaluations.
172+
env.promptTimeoutMinutes ?? 10,
173+
);
174+
return results;
175+
} finally {
176+
// Gracefully finalize the eval. Errors in finalization should not propagate.
177+
try {
178+
await env.executor.finalizeEval(evalID);
179+
} catch (e) {
180+
progress.log(rootPromptDef, 'error', 'Failed to finalize eval', `${e}`);
181+
}
182+
progress.evalFinished(rootPromptDef, results || []);
184183
}
184+
};
185185

186-
progress.log(rootPromptDef, 'error', 'Failed to evaluate code', details);
187-
return [] satisfies AssessmentResult[];
188-
} finally {
189-
// Gracefully finalize the eval. Errors in finalization should not propagate.
186+
// Retries + initial attempt.
187+
const maxAttempts = (options.promptTimeoutRetries ?? 0) + 1;
188+
for (let attemptIdx = 0; attemptIdx < maxAttempts; attemptIdx++) {
190189
try {
191-
await env.executor.finalizeEval(evalID);
192-
} catch (e) {
193-
progress.log(rootPromptDef, 'error', 'Failed to finalize eval', `${e}`);
190+
return await evaluate();
191+
} catch (e: unknown) {
192+
// Retry only while another attempt remains. A timeout on the final attempt must fall through to the failure reporting below instead of leaving the loop and hitting the "Unexpected code path" throw.
if (e instanceof TimeoutError && attemptIdx < maxAttempts - 1) {
193+
continue;
194+
}
195+
196+
failedPrompts.push({
197+
promptName: rootPromptDef.name,
198+
error: `${e}`,
199+
stack: e instanceof Error ? e.stack : undefined,
200+
});
201+
202+
let details = `Error: ${e}`;
203+
if (e instanceof Error && e.stack) {
204+
details += `\nStack: ${e.stack}`;
205+
}
206+
207+
progress.log(rootPromptDef, 'error', 'Failed to evaluate code', details);
208+
return [] satisfies AssessmentResult[];
194209
}
195-
progress.evalFinished(rootPromptDef, results || []);
196210
}
211+
212+
throw new Error(
213+
`Unexpected code path. ` +
214+
`There were ${maxAttempts} attempts for evaluating: ${rootPromptDef.name}`,
215+
);
197216
}),
198217
);
199218
}

runner/shared-interfaces.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ export interface AssessmentConfig {
3030
skipLighthouse?: boolean;
3131
maxTestRepairAttempts?: number;
3232
maxBuildRepairAttempts?: number;
33+
promptTimeoutRetries?: number;
3334
abortSignal?: AbortSignal;
3435
}
3536

0 commit comments

Comments
 (0)