@@ -467,7 +467,7 @@ func runDiskStalledWALFailoverWithProgress(ctx context.Context, t test.Test, c c
467467 operationDur = 3 * time .Minute
468468 // QPS sampling parameters.
469469 sampleInterval = 10 * time .Second
470- errorTolerance = 0.2 // 20 % tolerance for throughput variation.
470+ errorTolerance = 0.25 // 25 % tolerance for throughput variation.
471471 )
472472
473473 t .Status ("setting up disk staller" )
@@ -566,20 +566,21 @@ func runDiskStalledWALFailoverWithProgress(ctx context.Context, t test.Test, c c
566566 t .Fatalf ("context done before workload started: %s" , ctx .Err ())
567567 case <- workloadStarted :
568568 }
569+ // Wait 30s after workload starts before beginning sampling.
570+ const workloadStartDelay = 30 * time .Second
571+ // Calculate approximately how many samples to take. We want to account
572+ // for the time waited for workload startup, and we should also stop
573+ // sampling ~15s before the workload starts shutting down.
574+ samplingDuration := operationDur - workloadStartDelay - 15 * time .Second
575+ sampleCount := int (samplingDuration / sampleInterval )
569576
570- // Wait 20s after workload starts before beginning sampling.
571577 select {
572578 case <- ctx .Done ():
573579 t .Fatalf ("context done before workload started: %s" , ctx .Err ())
574- case <- time .After (30 * time . Second ):
580+ case <- time .After (workloadStartDelay ):
575581 t .Status ("starting QPS sampling" )
576582 }
577583
578- // We want to stop sampling 10s before workload ends to avoid sampling during shutdown.
579- // We'll take approx. 14 samples with this configuration.
580- samplingDuration := operationDur - 40 * time .Second // 30s initial wait + 10s buffer at workload end
581- sampleCount := int (samplingDuration / sampleInterval )
582-
583584 sampleTimer := time .NewTicker (sampleInterval )
584585 defer sampleTimer .Stop ()
585586
@@ -648,12 +649,22 @@ func runDiskStalledWALFailoverWithProgress(ctx context.Context, t test.Test, c c
648649 // Wait for all goroutines to complete.
649650 g .Wait ()
650651
652+ if len (samples ) == 0 {
653+ t .Fatalf ("no throughput samples collected for iteration %d" , iteration )
654+ }
655+
651656 // Validate throughput samples are within tolerance.
657+ // Drop the last sample if it is 0: because we can't fully sync the sampling
658+ // with workload startup/shutdown, it may have been taken while the workload
659+ // was shutting down.
660+ if samples [len (samples )- 1 ] == 0 {
661+ samples = samples [:len (samples )- 1 ]
662+ }
652663 meanThroughput := roachtestutil .GetMeanOverLastN (len (samples ), samples )
653664 t .Status ("mean throughput for iteration" , iteration , ": " , meanThroughput )
654665 for _ , sample := range samples {
655666 require .InEpsilonf (t , meanThroughput , sample , errorTolerance ,
656- "sample %f is not within tolerance of mean %f" , sample , meanThroughput )
667+ "sample %f is not within tolerance of mean %f\n samples:%v " , sample , meanThroughput , samples )
657668 }
658669 iterationMeans = append (iterationMeans , meanThroughput )
659670 iteration ++
@@ -671,7 +682,8 @@ func runDiskStalledWALFailoverWithProgress(ctx context.Context, t test.Test, c c
671682 overallMean := roachtestutil .GetMeanOverLastN (len (iterationMeans ), iterationMeans )
672683 for _ , mean := range iterationMeans {
673684 require .InEpsilonf (t , overallMean , mean , errorTolerance ,
674- "iteration mean %f is not within tolerance of overall mean %f" , mean , overallMean )
685+ "iteration mean %f is not within tolerance of overall mean %f\n iteration means:%v" , mean ,
686+ overallMean , iterationMeans )
675687 }
676688
677689 data := mustGetMetrics (ctx , c , t , adminURL , install .SystemInterfaceName ,
0 commit comments