asim: make LoadBasedRebalancingObjective configurable

wenyihu6 · wenyihu6 · commit 5fa2c3b027bc · 2025-11-05T09:28:39.000-05:00
Previously, the simulator was updated to use the cluster setting
LoadBasedRebalancingObjective, which defaults to CPU-based rebalancing. However,
the goal of rebalancing_qps.txt is to rebalance based on QPS. This commit makes
LoadBasedRebalancingObjective configurable in the simulator, so that a test can
override it with the `setting rebalance_objective=<n>` directive. It also
updates rebalancing_qps.txt to explicitly set the rebalancing objective to QPS,
restoring the test's original intent after the recent simulator change.
diff --git a/pkg/kv/kvserver/asim/state/impl.go b/pkg/kv/kvserver/asim/state/impl.go
@@ -1402,6 +1402,8 @@ func (s *state) SetClusterSetting(Key string, Value interface{}) {
 	switch Key {
 	case "LBRebalancingMode":
 		kvserverbase.LoadBasedRebalancingMode.Override(context.Background(), &s.settings.ST.SV, kvserverbase.LBRebalancingMode(Value.(int64)))
+	case "LBRebalancingObjective":
+		kvserver.LoadBasedRebalancingObjective.Override(context.Background(), &s.settings.ST.SV, kvserver.LBRebalancingObjective(Value.(int64)))
 	default:
 		panic("other cluster settings not supported")
 	}
diff --git a/pkg/kv/kvserver/asim/tests/datadriven_simulation_test.go b/pkg/kv/kvserver/asim/tests/datadriven_simulation_test.go
@@ -745,6 +745,17 @@ func TestDataDriven(t *testing.T) {
 								Value:            rebalanceMode,
 							}})
 					}
+
+					var rebalanceObjective int64
+					if scanIfExists(t, d, "rebalance_objective", &rebalanceObjective) {
+						events = append(events, scheduled.ScheduledEvent{
+							At: settingsGen.Settings.StartTime.Add(delay),
+							TargetEvent: event.SetSimulationSettingsEvent{
+								IsClusterSetting: true,
+								Key:              "LBRebalancingObjective",
+								Value:            rebalanceObjective,
+							}})
+					}
 					return ""
 				default:
 					return fmt.Sprintf("unknown command: %s", d.Cmd)
diff --git a/pkg/kv/kvserver/asim/tests/testdata/non_rand/sma/rebalancing_qps.txt b/pkg/kv/kvserver/asim/tests/testdata/non_rand/sma/rebalancing_qps.txt
@@ -1,10 +1,5 @@
 # This test verifies that the allocator can rebalance qps load across a 7-node
 # cluster. mma-only fails here since it doesn't balance based on leases.
-# TODO(tbg): this test works because the default rebalancing objective in asim
-# was accidentally left as 'qps' (even though we use 'cpu' in production).
-# When the default changes, this test needs to override the setting and probably
-# rename the test to specify that it's about qps rebalancing in particular,
-# which otherwise sees little explicit testing.
 skip_under_ci
 ----
 
@@ -35,38 +30,27 @@ assertion stat=qps type=steady ticks=6 upper_bound=0.05
 ----
 asserting: |qps(t)/mean_{T}(qps) - 1| ≤ 0.05 ∀ t∈T and each store (T=last 6 ticks)
 
+
+# Set the rebalance objective to 0 (QPS) instead of the default (CPU).
+setting rebalance_objective=0
+----
+
 # The generators are then called and 2 simulation runs, named samples, are
 # created and evaluated. Each sample has a fixed duration of 3 minutes.
 # Following the evaluation, the samples are checked individually against the
 # existing assertions, added above. If any assertion fails, the reason is
 # printed. If no assertions fail, then OK is printed.
 eval duration=3m samples=2 seed=42 metrics=(qps,replica_moves) cfgs=(sma-count,mma-only) full=true
 ----
-qps#1: last:  [s1=2997, s2=1998, s3=0, s4=0, s5=1001, s6=1002, s7=0] (stddev=1067.98, mean=999.71, sum=6998)
-qps#1: thrash_pct: [s1=8%, s2=8%, s3=0%, s4=0%, s5=5%, s6=6%, s7=0%]  (sum=27%)
-replica_moves#1: last:  [s1=2, s2=1, s3=0, s4=0, s5=1, s6=1, s7=0] (stddev=0.70, mean=0.71, sum=5)
+qps#1: last:  [s1=996, s2=1001, s3=999, s4=994, s5=1001, s6=1002, s7=1007] (stddev=3.93, mean=1000.00, sum=7000)
+qps#1: thrash_pct: [s1=12%, s2=9%, s3=4%, s4=8%, s5=5%, s6=6%, s7=4%]  (sum=48%)
+replica_moves#1: last:  [s1=2, s2=1, s3=0, s4=1, s5=1, s6=1, s7=0] (stddev=0.64, mean=0.86, sum=6)
 replica_moves#1: thrash_pct: [s1=0%, s2=0%, s3=0%, s4=0%, s5=0%, s6=0%, s7=0%]  (sum=0%)
-qps#2: last:  [s1=2997, s2=2000, s3=0, s4=0, s5=0, s6=2001, s7=0] (stddev=1194.63, mean=999.71, sum=6998)
-qps#2: thrash_pct: [s1=7%, s2=7%, s3=0%, s4=0%, s5=0%, s6=11%, s7=0%]  (sum=25%)
-replica_moves#2: last:  [s1=2, s2=1, s3=0, s4=0, s5=0, s6=1, s7=0] (stddev=0.73, mean=0.57, sum=4)
+qps#2: last:  [s1=999, s2=999, s3=997, s4=998, s5=997, s6=1002, s7=998] (stddev=1.59, mean=998.57, sum=6990)
+qps#2: thrash_pct: [s1=11%, s2=8%, s3=4%, s4=103%, s5=4%, s6=105%, s7=5%]  (sum=240%)
+replica_moves#2: last:  [s1=2, s2=1, s3=0, s4=1, s5=0, s6=1, s7=0] (stddev=0.70, mean=0.71, sum=5)
 replica_moves#2: thrash_pct: [s1=0%, s2=0%, s3=0%, s4=0%, s5=0%, s6=0%, s7=0%]  (sum=0%)
-artifacts[sma-count]: a725af49d5bb0f06
-failed assertion sample 1
-  balance stat=qps threshold=(≤1.15) ticks=6
-	max/mean=3.00 tick=0
-	max/mean=3.00 tick=1
-	max/mean=3.00 tick=2
-	max/mean=3.00 tick=3
-	max/mean=3.00 tick=4
-	max/mean=3.00 tick=5
-failed assertion sample 2
-  balance stat=qps threshold=(≤1.15) ticks=6
-	max/mean=3.00 tick=0
-	max/mean=3.00 tick=1
-	max/mean=3.00 tick=2
-	max/mean=3.00 tick=3
-	max/mean=3.00 tick=4
-	max/mean=3.00 tick=5
+artifacts[sma-count]: df2fd03851620873
 ==========================
 qps#1: last:  [s1=3998, s2=1998, s3=0, s4=0, s5=0, s6=1002, s7=0] (stddev=1413.41, mean=999.71, sum=6998)
 qps#1: thrash_pct: [s1=5%, s2=6%, s3=0%, s4=0%, s5=0%, s6=4%, s7=0%]  (sum=15%)
@@ -101,6 +85,7 @@ Cluster Set Up
 Key Space
 	[0,10000): 7(rf=3), 0MiB, [s1:(7,4*),s2:(6,2*),s3:(3,0*),s4:(2,0*),s5:(1,0*),s6:(1,1*),s7:(1,0*)]
 Event
+	set LBRebalancingObjective to 0
 	set LBRebalancingMode to 2
 Workload Set Up
 	[0,10000): 95%r large-block [128-256B/op, 7000ops/s]
@@ -124,31 +109,15 @@ setting gossip_delay=20s
 # thrashing on the fourth sample here.
 eval duration=5m samples=2 seed=42 metrics=(qps,replica_moves) cfgs=(sma-count,mma-only)
 ----
-qps#1: last:  [s1=2004, s2=0, s3=0, s4=0, s5=3001, s6=0, s7=1994] (stddev=1195.23, mean=999.86, sum=6999)
-qps#1: thrash_pct: [s1=115%, s2=3%, s3=0%, s4=0%, s5=8%, s6=55%, s7=8%]  (sum=189%)
-replica_moves#1: last:  [s1=5, s2=4, s3=0, s4=0, s5=4, s6=4, s7=2] (stddev=1.91, mean=2.71, sum=19)
+qps#1: last:  [s1=994, s2=999, s3=1005, s4=1000, s5=998, s6=1002, s7=999] (stddev=3.16, mean=999.57, sum=6997)
+qps#1: thrash_pct: [s1=117%, s2=59%, s3=54%, s4=53%, s5=104%, s6=105%, s7=53%]  (sum=543%)
+replica_moves#1: last:  [s1=5, s2=4, s3=0, s4=4, s5=3, s6=5, s7=2] (stddev=1.67, mean=3.29, sum=23)
 replica_moves#1: thrash_pct: [s1=0%, s2=0%, s3=0%, s4=0%, s5=0%, s6=0%, s7=0%]  (sum=0%)
-qps#2: last:  [s1=1001, s2=0, s3=0, s4=1001, s5=0, s6=3003, s7=1995] (stddev=1069.18, mean=1000.00, sum=7000)
-qps#2: thrash_pct: [s1=269%, s2=2%, s3=0%, s4=3%, s5=258%, s6=8%, s7=60%]  (sum=600%)
-replica_moves#2: last:  [s1=11, s2=4, s3=0, s4=0, s5=9, s6=12, s7=3] (stddev=4.69, mean=5.57, sum=39)
+qps#2: last:  [s1=1001, s2=1003, s3=994, s4=997, s5=1002, s6=1001, s7=998] (stddev=2.97, mean=999.43, sum=6996)
+qps#2: thrash_pct: [s1=256%, s2=47%, s3=43%, s4=2%, s5=284%, s6=126%, s7=125%]  (sum=882%)
+replica_moves#2: last:  [s1=12, s2=7, s3=1, s4=0, s5=10, s6=8, s7=3] (stddev=4.26, mean=5.86, sum=41)
 replica_moves#2: thrash_pct: [s1=0%, s2=0%, s3=0%, s4=0%, s5=0%, s6=0%, s7=0%]  (sum=0%)
-artifacts[sma-count]: 83a77bd5552b8bfe
-failed assertion sample 1
-  balance stat=qps threshold=(≤1.15) ticks=6
-	max/mean=3.00 tick=0
-	max/mean=3.00 tick=1
-	max/mean=3.00 tick=2
-	max/mean=3.00 tick=3
-	max/mean=3.00 tick=4
-	max/mean=3.00 tick=5
-failed assertion sample 2
-  balance stat=qps threshold=(≤1.15) ticks=6
-	max/mean=3.00 tick=0
-	max/mean=3.00 tick=1
-	max/mean=3.00 tick=2
-	max/mean=3.00 tick=3
-	max/mean=3.00 tick=4
-	max/mean=3.00 tick=5
+artifacts[sma-count]: 1bd16e771629608b
 ==========================
 qps#1: last:  [s1=4002, s2=1995, s3=0, s4=0, s5=0, s6=1002, s7=0] (stddev=1414.32, mean=999.86, sum=6999)
 qps#1: thrash_pct: [s1=6%, s2=6%, s3=0%, s4=0%, s5=0%, s6=4%, s7=0%]  (sum=16%)

Original file line number	Diff line number	Diff line change
`@@ -1402,6 +1402,8 @@ func (s *state) SetClusterSetting(Key string, Value interface{}) {`
`1402`	`1402`	`switch Key {`
`1403`	`1403`	`case "LBRebalancingMode":`
`1404`	`1404`	`kvserverbase.LoadBasedRebalancingMode.Override(context.Background(), &s.settings.ST.SV, kvserverbase.LBRebalancingMode(Value.(int64)))`
	`1405`	`+ case "LBRebalancingObjective":`
	`1406`	`+ kvserver.LoadBasedRebalancingObjective.Override(context.Background(), &s.settings.ST.SV, kvserver.LBRebalancingObjective(Value.(int64)))`
`1405`	`1407`	`default:`
`1406`	`1408`	`panic("other cluster settings not supported")`
`1407`	`1409`	`}`