Skip to content

Commit c8e59c3

Browse files
committed
roachtest: adjust error tolerance for DiskStalledWALFailoverWithProgress
Bump throughput error tolerance from 20% to 25% and drop the last throughput sample if it is 0 to address flakiness. Fixes: #153328 Fixes: #152781
1 parent cd17f4e commit c8e59c3

File tree

1 file changed

+22
-10
lines changed

1 file changed

+22
-10
lines changed

pkg/cmd/roachtest/tests/disk_stall.go

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -467,7 +467,7 @@ func runDiskStalledWALFailoverWithProgress(ctx context.Context, t test.Test, c c
467467
operationDur = 3 * time.Minute
468468
// QPS sampling parameters.
469469
sampleInterval = 10 * time.Second
470-
errorTolerance = 0.2 // 20% tolerance for throughput variation.
470+
errorTolerance = 0.25 // 25% tolerance for throughput variation.
471471
)
472472

473473
t.Status("setting up disk staller")
@@ -566,20 +566,21 @@ func runDiskStalledWALFailoverWithProgress(ctx context.Context, t test.Test, c c
566566
t.Fatalf("context done before workload started: %s", ctx.Err())
567567
case <-workloadStarted:
568568
}
569+
// Wait 30s after workload starts before beginning sampling.
570+
const workloadStartDelay = 30 * time.Second
571+
// Calculate approximately how many samples to take. We want to account
572+
// for the time waited for workload startup and we should also stop
573+
// sampling ~15s before the workload starts shutting down.
574+
samplingDuration := operationDur - workloadStartDelay - 15*time.Second
575+
sampleCount := int(samplingDuration / sampleInterval)
569576

570-
// Wait 20s after workload starts before beginning sampling.
571577
select {
572578
case <-ctx.Done():
573579
t.Fatalf("context done before workload started: %s", ctx.Err())
574-
case <-time.After(30 * time.Second):
580+
case <-time.After(workloadStartDelay):
575581
t.Status("starting QPS sampling")
576582
}
577583

578-
// We want to stop sampling 10s before workload ends to avoid sampling during shutdown.
579-
// We'll take approx. 14 samples with this configuration.
580-
samplingDuration := operationDur - 40*time.Second // 30s initial wait + 10s buffer at workload end
581-
sampleCount := int(samplingDuration / sampleInterval)
582-
583584
sampleTimer := time.NewTicker(sampleInterval)
584585
defer sampleTimer.Stop()
585586

@@ -648,12 +649,22 @@ func runDiskStalledWALFailoverWithProgress(ctx context.Context, t test.Test, c c
648649
// Wait for all goroutines to complete.
649650
g.Wait()
650651

652+
if len(samples) == 0 {
653+
t.Fatalf("no throughput samples collected for iteration %d", iteration)
654+
}
655+
651656
// Validate throughput samples are within tolerance.
657+
// Drop the last sample if it is 0; because we can't fully sync the sampling
658+
// with workload startup/shutdown, it may have been taken while the workload
659+
// was shutting down.
660+
if samples[len(samples)-1] == 0 {
661+
samples = samples[:len(samples)-1]
662+
}
652663
meanThroughput := roachtestutil.GetMeanOverLastN(len(samples), samples)
653664
t.Status("mean throughput for iteration", iteration, ": ", meanThroughput)
654665
for _, sample := range samples {
655666
require.InEpsilonf(t, meanThroughput, sample, errorTolerance,
656-
"sample %f is not within tolerance of mean %f", sample, meanThroughput)
667+
"sample %f is not within tolerance of mean %f\nsamples:%v", sample, meanThroughput, samples)
657668
}
658669
iterationMeans = append(iterationMeans, meanThroughput)
659670
iteration++
@@ -671,7 +682,8 @@ func runDiskStalledWALFailoverWithProgress(ctx context.Context, t test.Test, c c
671682
overallMean := roachtestutil.GetMeanOverLastN(len(iterationMeans), iterationMeans)
672683
for _, mean := range iterationMeans {
673684
require.InEpsilonf(t, overallMean, mean, errorTolerance,
674-
"iteration mean %f is not within tolerance of overall mean %f", mean, overallMean)
685+
"iteration mean %f is not within tolerance of overall mean %f\niteration means:%v", mean,
686+
overallMean, iterationMeans)
675687
}
676688

677689
data := mustGetMetrics(ctx, c, t, adminURL, install.SystemInterfaceName,

0 commit comments

Comments
 (0)