From e019e0d54e2d4132c865c0fa04f29fe1b544a2e7 Mon Sep 17 00:00:00 2001 From: Hanxi Zhang Date: Sat, 18 Oct 2025 00:00:20 +0000 Subject: [PATCH 01/14] Deflake cluster-slot-migration-flaky-test-2693 Signed-off-by: Hanxi Zhang --- tests/unit/cluster/cli.tcl | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/unit/cluster/cli.tcl b/tests/unit/cluster/cli.tcl index 3e138efd19..a82af17a15 100644 --- a/tests/unit/cluster/cli.tcl +++ b/tests/unit/cluster/cli.tcl @@ -284,6 +284,16 @@ test {Migrate the last slot away from a node using valkey-cli} { fail "Cluster doesn't stabilize" } + # Ensure all node IDs are propagated + wait_for_condition 1000 50 { + [CI 0 cluster_known_nodes] == 4 && + [CI 1 cluster_known_nodes] == 4 && + [CI 2 cluster_known_nodes] == 4 && + [CI 3 cluster_known_nodes] == 4 + } else { + fail "Cluster gossip hasn't propagated all node IDs" + } + set newnode_r [valkey_client -3] set newnode_id [$newnode_r CLUSTER MYID] From 1b2396376b072349049f9b492a3ca348ac56881a Mon Sep 17 00:00:00 2001 From: Hanxi Zhang Date: Mon, 20 Oct 2025 21:49:34 +0000 Subject: [PATCH 02/14] Added two files to test the flaky test Signed-off-by: Hanxi Zhang --- ...laky-cluster-slot-migration-flaky-test.yml | 51 ++++++++++++ .../unit/cluster/test-flaky-migrate-slots.tcl | 81 +++++++++++++++++++ 2 files changed, 132 insertions(+) create mode 100644 .github/workflows/test-flaky-cluster-slot-migration-flaky-test.yml create mode 100644 tests/unit/cluster/test-flaky-migrate-slots.tcl diff --git a/.github/workflows/test-flaky-cluster-slot-migration-flaky-test.yml b/.github/workflows/test-flaky-cluster-slot-migration-flaky-test.yml new file mode 100644 index 0000000000..3ee32b5c55 --- /dev/null +++ b/.github/workflows/test-flaky-cluster-slot-migration-flaky-test.yml @@ -0,0 +1,51 @@ +name: Test Flaky Cluster Slot Migration + +on: + workflow_dispatch: + push: + +jobs: + test-flaky: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + + - name: make + run: make valgrind SERVER_CFLAGS='-Werror' + + - name: testprep + run: | + sudo apt-get update + sudo apt-get install tcl8.6 tclx valgrind -y + + - name: Run test 100 times + run: | + FAILURES=0 + SUCCESSES=0 + + for i in {1..100}; do + echo "=========================================" + echo "Run $i of 100" + echo "=========================================" + + if ./runtest --valgrind --no-latency --verbose --clients 1 --timeout 2400 --single unit/cluster/test-flaky-migrate-slots; then + SUCCESSES=$((SUCCESSES + 1)) + echo "✓ Run $i: PASSED" + else + FAILURES=$((FAILURES + 1)) + echo "✗ Run $i: FAILED" + fi + done + + echo "=========================================" + echo "SUMMARY" + echo "=========================================" + echo "Total runs: 100" + echo "Successes: $SUCCESSES" + echo "Failures: $FAILURES" + echo "Failure rate: $((FAILURES * 100 / 100))%" + + if [ $FAILURES -gt 0 ]; then + echo "Test is FLAKY - failed $FAILURES out of 100 runs" + exit 1 + fi \ No newline at end of file diff --git a/tests/unit/cluster/test-flaky-migrate-slots.tcl b/tests/unit/cluster/test-flaky-migrate-slots.tcl new file mode 100644 index 0000000000..219e966864 --- /dev/null +++ b/tests/unit/cluster/test-flaky-migrate-slots.tcl @@ -0,0 +1,81 @@ +test {Migrate the last slot away from a node using valkey-cli} { + start_multiple_servers 4 [list overrides $base_conf] { + # Create a cluster of 3 nodes + exec src/valkey-cli --cluster-yes --cluster create \ + 127.0.0.1:[srv 0 port] \ + 127.0.0.1:[srv -1 port] \ + 127.0.0.1:[srv -2 port] + + wait_for_condition 1000 50 { + [CI 0 cluster_state] eq {ok} && + [CI 1 cluster_state] eq {ok} && + [CI 2 cluster_state] eq {ok} + } else { + fail "Cluster doesn't stabilize" + } + + # Add new node to the cluster + exec src/valkey-cli --cluster-yes --cluster add-node \ + 127.0.0.1:[srv -3 port] \ + 127.0.0.1:[srv 0 port] + + wait_for_cluster_size 4 + + wait_for_condition 1000 50 { + [CI 0 cluster_state] eq {ok} && + [CI 1 cluster_state] eq {ok} && + [CI 2 cluster_state] eq {ok} && + [CI 3 cluster_state] eq {ok} + } else { + fail "Cluster doesn't stabilize" + } + + set newnode_r [valkey_client -3] + set newnode_id [$newnode_r CLUSTER MYID] + + # Find a node with slots and migrate one slot to the new node + set source_r [valkey_client 0] + set source_id [$source_r CLUSTER MYID] + set slots [$source_r CLUSTER SLOTS] + set slot [lindex [lindex $slots 0] 0] + + # Migrate the slot using valkey-cli + exec src/valkey-cli --cluster reshard 127.0.0.1:[srv 0 port] \ + --cluster-from $source_id \ + --cluster-to $newnode_id \ + --cluster-slots 1 \ + --cluster-yes + + wait_for_condition 1000 50 { + [CI 0 cluster_state] eq {ok} && + [CI 1 cluster_state] eq {ok} && + [CI 2 cluster_state] eq {ok} && + [CI 3 cluster_state] eq {ok} + } else { + fail "Cluster doesn't stabilize after migration" + } + + # Now migrate the last slot away from the new node using valkey-cli + exec src/valkey-cli --cluster reshard 127.0.0.1:[srv -3 port] \ + --cluster-from $newnode_id \ + --cluster-to $source_id \ + --cluster-slots 1 \ + --cluster-yes + + wait_for_condition 1000 50 { + [CI 0 cluster_state] eq {ok} && + [CI 1 cluster_state] eq {ok} && + [CI 2 cluster_state] eq {ok} && + [CI 3 cluster_state] eq {ok} + } else { + fail "Cluster doesn't stabilize after last slot migration" + } + + # Verify the node with no slots becomes a replica + wait_for_condition 1000 50 { + [string match "*slave*" [$newnode_r role]] + } else { + fail "Empty node didn't become a replica" + } + } +} \ No newline at end of file From 5bca61bc44cb66b7c6ce0e4452ea1b7d251c13f0 Mon Sep 17 00:00:00 2001 From: Hanxi Zhang Date: Mon, 20 Oct 2025 21:54:10 +0000 Subject: [PATCH 03/14] Fixed a yml format issue Signed-off-by: Hanxi Zhang --- .../test-flaky-cluster-slot-migration-flaky-test.yml | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test-flaky-cluster-slot-migration-flaky-test.yml b/.github/workflows/test-flaky-cluster-slot-migration-flaky-test.yml index 3ee32b5c55..2eaa24a22b 100644 --- a/.github/workflows/test-flaky-cluster-slot-migration-flaky-test.yml +++ b/.github/workflows/test-flaky-cluster-slot-migration-flaky-test.yml @@ -9,25 +9,20 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - name: make run: make valgrind SERVER_CFLAGS='-Werror' - - name: testprep run: | sudo apt-get update sudo apt-get install tcl8.6 tclx valgrind -y - - name: Run test 100 times run: | FAILURES=0 SUCCESSES=0 - for i in {1..100}; do echo "=========================================" echo "Run $i of 100" echo "=========================================" - if ./runtest --valgrind --no-latency --verbose --clients 1 --timeout 2400 --single unit/cluster/test-flaky-migrate-slots; then SUCCESSES=$((SUCCESSES + 1)) echo "✓ Run $i: PASSED" @@ -36,7 +31,6 @@ jobs: echo "✗ Run $i: FAILED" fi done - echo "=========================================" echo "SUMMARY" echo "=========================================" @@ -44,8 +38,8 @@ jobs: echo "Successes: $SUCCESSES" echo "Failures: $FAILURES" echo "Failure rate: $((FAILURES * 100 / 100))%" - if [ $FAILURES -gt 0 ]; then echo "Test is FLAKY - failed $FAILURES out of 100 runs" exit 1 - fi \ No newline at end of file + fi + \ No newline at end of file From a3c8eb997d925c417d482b814185f8d35116dbcf Mon Sep 17 00:00:00 2001 From: Hanxi Zhang Date: Mon, 20 Oct 2025 22:02:27 +0000 Subject: [PATCH 04/14] Added a missing line in the test Signed-off-by: Hanxi Zhang --- tests/unit/cluster/test-flaky-migrate-slots.tcl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/cluster/test-flaky-migrate-slots.tcl b/tests/unit/cluster/test-flaky-migrate-slots.tcl index 219e966864..5430175497 100644 --- a/tests/unit/cluster/test-flaky-migrate-slots.tcl +++ b/tests/unit/cluster/test-flaky-migrate-slots.tcl @@ -1,3 +1,5 @@ +set base_conf [list cluster-enabled yes cluster-node-timeout 1000 cluster-databases 16] + test {Migrate the last slot away from a node using valkey-cli} { start_multiple_servers 4 [list overrides $base_conf] { # Create a cluster of 3 nodes From 440c04a8d30efbeda73628d015c5e36145c4d7e9 Mon Sep 17 00:00:00 2001 From: Hanxi Zhang Date: Mon, 20 Oct 2025 22:27:57 +0000 Subject: [PATCH 05/14] With Fix, check to see if any test fail Signed-off-by: Hanxi Zhang --- .../unit/cluster/test-flaky-migrate-slots.tcl | 85 +++++++++++++------ 1 file changed, 57 insertions(+), 28 deletions(-) diff --git a/tests/unit/cluster/test-flaky-migrate-slots.tcl b/tests/unit/cluster/test-flaky-migrate-slots.tcl index 5430175497..25581a715b 100644 --- a/tests/unit/cluster/test-flaky-migrate-slots.tcl +++ b/tests/unit/cluster/test-flaky-migrate-slots.tcl @@ -2,11 +2,12 @@ set base_conf [list cluster-enabled yes cluster-node-timeout 1000 cluster-databa test {Migrate the last slot away from a node using valkey-cli} { start_multiple_servers 4 [list overrides $base_conf] { + # Create a cluster of 3 nodes exec src/valkey-cli --cluster-yes --cluster create \ - 127.0.0.1:[srv 0 port] \ - 127.0.0.1:[srv -1 port] \ - 127.0.0.1:[srv -2 port] + 127.0.0.1:[srv 0 port] \ + 127.0.0.1:[srv -1 port] \ + 127.0.0.1:[srv -2 port] wait_for_condition 1000 50 { [CI 0 cluster_state] eq {ok} && @@ -16,11 +17,16 @@ test {Migrate the last slot away from a node using valkey-cli} { fail "Cluster doesn't stabilize" } + # Insert some data + assert_equal OK [exec src/valkey-cli -c -p [srv 0 port] SET foo bar] + set slot [exec src/valkey-cli -c -p [srv 0 port] CLUSTER KEYSLOT foo] + # Add new node to the cluster exec src/valkey-cli --cluster-yes --cluster add-node \ - 127.0.0.1:[srv -3 port] \ - 127.0.0.1:[srv 0 port] + 127.0.0.1:[srv -3 port] \ + 127.0.0.1:[srv 0 port] + # First we wait for new node to be recognized by entire cluster wait_for_cluster_size 4 wait_for_condition 1000 50 { @@ -32,52 +38,75 @@ test {Migrate the last slot away from a node using valkey-cli} { fail "Cluster doesn't stabilize" } + # Ensure all node IDs are propagated + wait_for_condition 1000 50 { + [CI 0 cluster_known_nodes] == 4 && + [CI 1 cluster_known_nodes] == 4 && + [CI 2 cluster_known_nodes] == 4 && + [CI 3 cluster_known_nodes] == 4 + } else { + fail "Cluster gossip hasn't propagated all node IDs" + } + set newnode_r [valkey_client -3] set newnode_id [$newnode_r CLUSTER MYID] - - # Find a node with slots and migrate one slot to the new node - set source_r [valkey_client 0] - set source_id [$source_r CLUSTER MYID] - set slots [$source_r CLUSTER SLOTS] - set slot [lindex [lindex $slots 0] 0] - - # Migrate the slot using valkey-cli - exec src/valkey-cli --cluster reshard 127.0.0.1:[srv 0 port] \ - --cluster-from $source_id \ - --cluster-to $newnode_id \ - --cluster-slots 1 \ - --cluster-yes + # Find out which node has the key "foo" by asking the new node for a + # redirect. + catch { $newnode_r get foo } e + assert_match "MOVED $slot *" $e + lassign [split [lindex $e 2] :] owner_host owner_port + set owner_r [valkey $owner_host $owner_port 0 $::tls] + set owner_id [$owner_r CLUSTER MYID] + + # Move slot to new node using plain commands + assert_equal OK [$newnode_r CLUSTER SETSLOT $slot IMPORTING $owner_id] + assert_equal OK [$owner_r CLUSTER SETSLOT $slot MIGRATING $newnode_id] + assert_equal {foo} [$owner_r CLUSTER GETKEYSINSLOT $slot 10] + assert_equal OK [$owner_r MIGRATE 127.0.0.1 [srv -3 port] "" 0 5000 KEYS foo] + assert_equal OK [$newnode_r CLUSTER SETSLOT $slot NODE $newnode_id] + assert_equal OK [$owner_r CLUSTER SETSLOT $slot NODE $newnode_id] + + # Using --cluster check make sure we won't get `Not all slots are covered by nodes`. + # Wait for the cluster to become stable make sure the cluster is up during MIGRATE. wait_for_condition 1000 50 { + [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv 0 port]}] == 0 && + [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv -1 port]}] == 0 && + [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv -2 port]}] == 0 && + [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv -3 port]}] == 0 && [CI 0 cluster_state] eq {ok} && [CI 1 cluster_state] eq {ok} && [CI 2 cluster_state] eq {ok} && [CI 3 cluster_state] eq {ok} } else { - fail "Cluster doesn't stabilize after migration" + fail "Cluster doesn't stabilize" } - # Now migrate the last slot away from the new node using valkey-cli + # Move the only slot back to original node using valkey-cli exec src/valkey-cli --cluster reshard 127.0.0.1:[srv -3 port] \ --cluster-from $newnode_id \ - --cluster-to $source_id \ + --cluster-to $owner_id \ --cluster-slots 1 \ --cluster-yes + # The empty node will become a replica of the new owner before the + # `MOVED` check, so let's wait for the cluster to become stable. wait_for_condition 1000 50 { [CI 0 cluster_state] eq {ok} && [CI 1 cluster_state] eq {ok} && [CI 2 cluster_state] eq {ok} && [CI 3 cluster_state] eq {ok} } else { - fail "Cluster doesn't stabilize after last slot migration" + fail "Cluster doesn't stabilize" } - # Verify the node with no slots becomes a replica - wait_for_condition 1000 50 { - [string match "*slave*" [$newnode_r role]] - } else { - fail "Empty node didn't become a replica" - } + # Check that the key foo has been migrated back to the original owner. + catch { $newnode_r get foo } e + assert_equal "MOVED $slot $owner_host:$owner_port" $e + + # Check that the now empty primary node doesn't turn itself into + # a replica of any other nodes + wait_for_cluster_propagation + assert_match *master* [$owner_r role] } } \ No newline at end of file From be33235e8ae9fc38decd297c8415148480457fc4 Mon Sep 17 00:00:00 2001 From: Hanxi Zhang Date: Wed, 29 Oct 2025 20:37:54 +0000 Subject: [PATCH 06/14] Removed the old fix Signed-off-by: Hanxi Zhang --- tests/unit/cluster/test-flaky-migrate-slots.tcl | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tests/unit/cluster/test-flaky-migrate-slots.tcl b/tests/unit/cluster/test-flaky-migrate-slots.tcl index 25581a715b..be8bc8bf0e 100644 --- a/tests/unit/cluster/test-flaky-migrate-slots.tcl +++ b/tests/unit/cluster/test-flaky-migrate-slots.tcl @@ -38,16 +38,6 @@ test {Migrate the last slot away from a node using valkey-cli} { fail "Cluster doesn't stabilize" } - # Ensure all node IDs are propagated - wait_for_condition 1000 50 { - [CI 0 cluster_known_nodes] == 4 && - [CI 1 cluster_known_nodes] == 4 && - [CI 2 cluster_known_nodes] == 4 && - [CI 3 cluster_known_nodes] == 4 - } else { - fail "Cluster gossip hasn't propagated all node IDs" - } - set newnode_r [valkey_client -3] set newnode_id [$newnode_r CLUSTER MYID] From 1669e08e601593b0622998540bc32b37967ec9e8 Mon Sep 17 00:00:00 2001 From: Hanxi Zhang Date: Wed, 29 Oct 2025 20:47:50 +0000 Subject: [PATCH 07/14] Removed the redundant cluster_known_nodes call Signed-off-by: Hanxi Zhang --- tests/unit/cluster/cli.tcl | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tests/unit/cluster/cli.tcl b/tests/unit/cluster/cli.tcl index a82af17a15..3e138efd19 100644 --- a/tests/unit/cluster/cli.tcl +++ b/tests/unit/cluster/cli.tcl @@ -284,16 +284,6 @@ test {Migrate the last slot away from a node using valkey-cli} { fail "Cluster doesn't stabilize" } - # Ensure all node IDs are propagated - wait_for_condition 1000 50 { - [CI 0 cluster_known_nodes] == 4 && - [CI 1 cluster_known_nodes] == 4 && - [CI 2 cluster_known_nodes] == 4 && - [CI 3 cluster_known_nodes] == 4 - } else { - fail "Cluster gossip hasn't propagated all node IDs" - } - set newnode_r [valkey_client -3] set newnode_id [$newnode_r CLUSTER MYID] From 057ca1e69e383bb96d323a9cbbbc3041ecf89231 Mon Sep 17 00:00:00 2001 From: Hanxi Zhang Date: Wed, 29 Oct 2025 23:18:47 +0000 Subject: [PATCH 08/14] Fix: added new fix to deflake the test, waiting for the owner node to know the new node before move slot Signed-off-by: Hanxi Zhang --- tests/support/cluster_util.tcl | 11 +++++++++++ tests/unit/cluster/cli.tcl | 7 +++++++ tests/unit/cluster/test-flaky-migrate-slots.tcl | 7 +++++++ 3 files changed, 25 insertions(+) diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl index ee14c58648..10c488c497 100644 --- a/tests/support/cluster_util.tcl +++ b/tests/support/cluster_util.tcl @@ -458,3 +458,14 @@ proc wait_for_slot_state {srv_idx pattern} { fail "incorrect slot state on R $srv_idx: expected $pattern; got [get_open_slots $srv_idx]" } } + +# Check if server_a knows node_b_id +proc server_knows_node {server_a node_b_id} { + set nodes [get_cluster_nodes $server_a] + foreach n $nodes { + if {[dict get $n id] eq $node_b_id} { + return 1 + } + } + return 0 +} diff --git a/tests/unit/cluster/cli.tcl b/tests/unit/cluster/cli.tcl index 3e138efd19..c0472d7e6b 100644 --- a/tests/unit/cluster/cli.tcl +++ b/tests/unit/cluster/cli.tcl @@ -295,6 +295,13 @@ test {Migrate the last slot away from a node using valkey-cli} { set owner_r [valkey $owner_host $owner_port 0 $::tls] set owner_id [$owner_r CLUSTER MYID] + # Wait until the owner node knows the new node + wait_for_condition 1000 50 { + server_knows_node $owner_id $newnode_id + } else { + fail "Owner node does not know the new node yet" + } + # Move slot to new node using plain commands assert_equal OK [$newnode_r CLUSTER SETSLOT $slot IMPORTING $owner_id] assert_equal OK [$owner_r CLUSTER SETSLOT $slot MIGRATING $newnode_id] diff --git a/tests/unit/cluster/test-flaky-migrate-slots.tcl b/tests/unit/cluster/test-flaky-migrate-slots.tcl index be8bc8bf0e..e2dc0c82d4 100644 --- a/tests/unit/cluster/test-flaky-migrate-slots.tcl +++ b/tests/unit/cluster/test-flaky-migrate-slots.tcl @@ -49,6 +49,13 @@ test {Migrate the last slot away from a node using valkey-cli} { set owner_r [valkey $owner_host $owner_port 0 $::tls] set owner_id [$owner_r CLUSTER MYID] + # Wait until the owner node knows the new node + wait_for_condition 1000 50 { + server_knows_node $owner_id $newnode_id + } else { + fail "Owner node does not know the new node yet" + } + # Move slot to new node using plain commands assert_equal OK [$newnode_r CLUSTER SETSLOT $slot IMPORTING $owner_id] assert_equal OK [$owner_r CLUSTER SETSLOT $slot MIGRATING $newnode_id] From 850296529e1503b6b6fc68a75f9443a18de11285 Mon Sep 17 00:00:00 2001 From: Hanxi Zhang Date: Wed, 29 Oct 2025 23:33:15 +0000 Subject: [PATCH 09/14] Rewrote the fix Signed-off-by: Hanxi Zhang --- tests/unit/cluster/cli.tcl | 16 ++++++++++++---- tests/unit/cluster/test-flaky-migrate-slots.tcl | 16 ++++++++++++---- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/tests/unit/cluster/cli.tcl b/tests/unit/cluster/cli.tcl index c0472d7e6b..bfe51cd191 100644 --- a/tests/unit/cluster/cli.tcl +++ b/tests/unit/cluster/cli.tcl @@ -295,10 +295,18 @@ test {Migrate the last slot away from a node using valkey-cli} { set owner_r [valkey $owner_host $owner_port 0 $::tls] set owner_id [$owner_r CLUSTER MYID] - # Wait until the owner node knows the new node - wait_for_condition 1000 50 { - server_knows_node $owner_id $newnode_id - } else { + # Wait until owner node knows the new node + wait_for_condition 1000 50 [list { + set nodes [get_cluster_nodes $owner_id] + set found 0 + foreach n $nodes { + if {[dict get $n id] eq $newnode_id} { + set found 1 + break + } + } + $found + }] else { fail "Owner node does not know the new node yet" } diff --git a/tests/unit/cluster/test-flaky-migrate-slots.tcl b/tests/unit/cluster/test-flaky-migrate-slots.tcl index e2dc0c82d4..dbf3d54a67 100644 --- a/tests/unit/cluster/test-flaky-migrate-slots.tcl +++ b/tests/unit/cluster/test-flaky-migrate-slots.tcl @@ -49,10 +49,18 @@ test {Migrate the last slot away from a node using valkey-cli} { set owner_r [valkey $owner_host $owner_port 0 $::tls] set owner_id [$owner_r CLUSTER MYID] - # Wait until the owner node knows the new node - wait_for_condition 1000 50 { - server_knows_node $owner_id $newnode_id - } else { + # Wait until owner node knows the new node + wait_for_condition 1000 50 [list { + set nodes [get_cluster_nodes $owner_id] + set found 0 + foreach n $nodes { + if {[dict get $n id] eq $newnode_id} { + set found 1 + break + } + } + $found + }] else { fail "Owner node does not know the new node yet" } From 5c9be0ecc60e23e393343ddc5e76a017544c9829 Mon Sep 17 00:00:00 2001 From: Hanxi Zhang Date: Wed, 29 Oct 2025 23:39:31 +0000 Subject: [PATCH 10/14] Wrap the logic into a single expression Signed-off-by: Hanxi Zhang --- tests/unit/cluster/cli.tcl | 12 ++++-------- tests/unit/cluster/test-flaky-migrate-slots.tcl | 14 +++++--------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/tests/unit/cluster/cli.tcl b/tests/unit/cluster/cli.tcl index bfe51cd191..11b828c20b 100644 --- a/tests/unit/cluster/cli.tcl +++ b/tests/unit/cluster/cli.tcl @@ -296,17 +296,13 @@ test {Migrate the last slot away from a node using valkey-cli} { set owner_id [$owner_r CLUSTER MYID] # Wait until owner node knows the new node - wait_for_condition 1000 50 [list { - set nodes [get_cluster_nodes $owner_id] + wait_for_condition 1000 50 { set found 0 - foreach n $nodes { - if {[dict get $n id] eq $newnode_id} { - set found 1 - break - } + foreach n [get_cluster_nodes $owner_id] { + if {[dict get $n id] eq $newnode_id} { set found 1; break } } $found - }] else { + } else { fail "Owner node does not know the new node yet" } diff --git a/tests/unit/cluster/test-flaky-migrate-slots.tcl b/tests/unit/cluster/test-flaky-migrate-slots.tcl index dbf3d54a67..de52f204a2 100644 --- a/tests/unit/cluster/test-flaky-migrate-slots.tcl +++ b/tests/unit/cluster/test-flaky-migrate-slots.tcl @@ -50,17 +50,13 @@ test {Migrate the last slot away from a node using valkey-cli} { set owner_id [$owner_r CLUSTER MYID] # Wait until owner node knows the new node - wait_for_condition 1000 50 [list { - set nodes [get_cluster_nodes $owner_id] + wait_for_condition 1000 50 { set found 0 - foreach n $nodes { - if {[dict get $n id] eq $newnode_id} { - set found 1 - break - } + foreach n [get_cluster_nodes $owner_id] { + if {[dict get $n id] eq $newnode_id} { set found 1; break } } $found - }] else { + } else { fail "Owner node does not know the new node yet" } @@ -114,4 +110,4 @@ test {Migrate the last slot away from a node using valkey-cli} { wait_for_cluster_propagation assert_match *master* [$owner_r role] } -} \ No newline at end of file +} From 1a136a33352a7fc70f812042d7c516f24a823a8c Mon Sep 17 00:00:00 2001 From: Hanxi Zhang Date: Fri, 7 Nov 2025 01:50:18 +0000 Subject: [PATCH 11/14] Fix, added check to make sure the owner node knows the new node Signed-off-by: Hanxi Zhang --- tests/support/cluster_util.tcl | 13 ++++++------- tests/unit/cluster/cli.tcl | 12 +++++------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl index 10c488c497..c27564f732 100644 --- a/tests/support/cluster_util.tcl +++ b/tests/support/cluster_util.tcl @@ -459,13 +459,12 @@ proc wait_for_slot_state {srv_idx pattern} { } } -# Check if server_a knows node_b_id -proc server_knows_node {server_a node_b_id} { - set nodes [get_cluster_nodes $server_a] - foreach n $nodes { - if {[dict get $n id] eq $node_b_id} { - return 1 +# Returns the test index of a node given its node ID +proc get_node_index_by_id {node_id} { + for {set i 0} {$i < [llength $::servers]} {incr i} { + if {[R $i CLUSTER MYID] eq $node_id} { + return $i } } - return 0 + return -1 ;# not found } diff --git a/tests/unit/cluster/cli.tcl b/tests/unit/cluster/cli.tcl index 11b828c20b..f79ee0a09b 100644 --- a/tests/unit/cluster/cli.tcl +++ b/tests/unit/cluster/cli.tcl @@ -295,15 +295,13 @@ test {Migrate the last slot away from a node using valkey-cli} { set owner_r [valkey $owner_host $owner_port 0 $::tls] set owner_id [$owner_r CLUSTER MYID] - # Wait until owner node knows the new node + # get the index of the owner node + set owner_index [get_node_index_by_id $owner_id] + wait_for_condition 1000 50 { - set found 0 - foreach n [get_cluster_nodes $owner_id] { - if {[dict get $n id] eq $newnode_id} { set found 1; break } - } - $found + [cluster_get_node_by_id $owner_index $newnode_id] ne {} } else { - fail "Owner node does not know the new node yet" + fail "Owner node never learned the new node" } # Move slot to new node using plain commands From d4b228a31ed322b9dd0344bdd9d63af08325096a Mon Sep 17 00:00:00 2001 From: Hanxi Zhang Date: Fri, 7 Nov 2025 02:03:40 +0000 Subject: [PATCH 12/14] Deleted two test files Signed-off-by: Hanxi Zhang --- ...laky-cluster-slot-migration-flaky-test.yml | 45 ------- .../unit/cluster/test-flaky-migrate-slots.tcl | 113 ------------------ 2 files changed, 158 deletions(-) delete mode 100644 .github/workflows/test-flaky-cluster-slot-migration-flaky-test.yml delete mode 100644 tests/unit/cluster/test-flaky-migrate-slots.tcl diff --git a/.github/workflows/test-flaky-cluster-slot-migration-flaky-test.yml b/.github/workflows/test-flaky-cluster-slot-migration-flaky-test.yml deleted file mode 100644 index 2eaa24a22b..0000000000 --- a/.github/workflows/test-flaky-cluster-slot-migration-flaky-test.yml +++ /dev/null @@ -1,45 +0,0 @@ -name: Test Flaky Cluster Slot Migration - -on: - workflow_dispatch: - push: - -jobs: - test-flaky: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - name: make - run: make valgrind SERVER_CFLAGS='-Werror' - - name: testprep - run: | - sudo apt-get update - sudo apt-get install tcl8.6 tclx valgrind -y - - name: Run test 100 times - run: | - FAILURES=0 - SUCCESSES=0 - for i in {1..100}; do - echo "=========================================" - echo "Run $i of 100" - echo "=========================================" - if ./runtest --valgrind --no-latency --verbose --clients 1 --timeout 2400 --single unit/cluster/test-flaky-migrate-slots; then - SUCCESSES=$((SUCCESSES + 1)) - echo "✓ Run $i: PASSED" - else - FAILURES=$((FAILURES + 1)) - echo "✗ Run $i: FAILED" - fi - done - echo "=========================================" - echo "SUMMARY" - echo "=========================================" - echo "Total runs: 100" - echo "Successes: $SUCCESSES" - echo "Failures: $FAILURES" - echo "Failure rate: $((FAILURES * 100 / 100))%" - if [ $FAILURES -gt 0 ]; then - echo "Test is FLAKY - failed $FAILURES out of 100 runs" - exit 1 - fi - \ No newline at end of file diff --git a/tests/unit/cluster/test-flaky-migrate-slots.tcl b/tests/unit/cluster/test-flaky-migrate-slots.tcl deleted file mode 100644 index de52f204a2..0000000000 --- a/tests/unit/cluster/test-flaky-migrate-slots.tcl +++ /dev/null @@ -1,113 +0,0 @@ -set base_conf [list cluster-enabled yes cluster-node-timeout 1000 cluster-databases 16] - -test {Migrate the last slot away from a node using valkey-cli} { - start_multiple_servers 4 [list overrides $base_conf] { - - # Create a cluster of 3 nodes - exec src/valkey-cli --cluster-yes --cluster create \ - 127.0.0.1:[srv 0 port] \ - 127.0.0.1:[srv -1 port] \ - 127.0.0.1:[srv -2 port] - - wait_for_condition 1000 50 { - [CI 0 cluster_state] eq {ok} && - [CI 1 cluster_state] eq {ok} && - [CI 2 cluster_state] eq {ok} - } else { - fail "Cluster doesn't stabilize" - } - - # Insert some data - assert_equal OK [exec src/valkey-cli -c -p [srv 0 port] SET foo bar] - set slot [exec src/valkey-cli -c -p [srv 0 port] CLUSTER KEYSLOT foo] - - # Add new node to the cluster - exec src/valkey-cli --cluster-yes --cluster add-node \ - 127.0.0.1:[srv -3 port] \ - 127.0.0.1:[srv 0 port] - - # First we wait for new node to be recognized by entire cluster - wait_for_cluster_size 4 - - wait_for_condition 1000 50 { - [CI 0 cluster_state] eq {ok} && - [CI 1 cluster_state] eq {ok} && - [CI 2 cluster_state] eq {ok} && - [CI 3 cluster_state] eq {ok} - } else { - fail "Cluster doesn't stabilize" - } - - set newnode_r [valkey_client -3] - set newnode_id [$newnode_r CLUSTER MYID] - - # Find out which node has the key "foo" by asking the new node for a - # redirect. - catch { $newnode_r get foo } e - assert_match "MOVED $slot *" $e - lassign [split [lindex $e 2] :] owner_host owner_port - set owner_r [valkey $owner_host $owner_port 0 $::tls] - set owner_id [$owner_r CLUSTER MYID] - - # Wait until owner node knows the new node - wait_for_condition 1000 50 { - set found 0 - foreach n [get_cluster_nodes $owner_id] { - if {[dict get $n id] eq $newnode_id} { set found 1; break } - } - $found - } else { - fail "Owner node does not know the new node yet" - } - - # Move slot to new node using plain commands - assert_equal OK [$newnode_r CLUSTER SETSLOT $slot IMPORTING $owner_id] - assert_equal OK [$owner_r CLUSTER SETSLOT $slot MIGRATING $newnode_id] - assert_equal {foo} [$owner_r CLUSTER GETKEYSINSLOT $slot 10] - assert_equal OK [$owner_r MIGRATE 127.0.0.1 [srv -3 port] "" 0 5000 KEYS foo] - assert_equal OK [$newnode_r CLUSTER SETSLOT $slot NODE $newnode_id] - assert_equal OK [$owner_r CLUSTER SETSLOT $slot NODE $newnode_id] - - # Using --cluster check make sure we won't get `Not all slots are covered by nodes`. - # Wait for the cluster to become stable make sure the cluster is up during MIGRATE. - wait_for_condition 1000 50 { - [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv 0 port]}] == 0 && - [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv -1 port]}] == 0 && - [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv -2 port]}] == 0 && - [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv -3 port]}] == 0 && - [CI 0 cluster_state] eq {ok} && - [CI 1 cluster_state] eq {ok} && - [CI 2 cluster_state] eq {ok} && - [CI 3 cluster_state] eq {ok} - } else { - fail "Cluster doesn't stabilize" - } - - # Move the only slot back to original node using valkey-cli - exec src/valkey-cli --cluster reshard 127.0.0.1:[srv -3 port] \ - --cluster-from $newnode_id \ - --cluster-to $owner_id \ - --cluster-slots 1 \ - --cluster-yes - - # The empty node will become a replica of the new owner before the - # `MOVED` check, so let's wait for the cluster to become stable. - wait_for_condition 1000 50 { - [CI 0 cluster_state] eq {ok} && - [CI 1 cluster_state] eq {ok} && - [CI 2 cluster_state] eq {ok} && - [CI 3 cluster_state] eq {ok} - } else { - fail "Cluster doesn't stabilize" - } - - # Check that the key foo has been migrated back to the original owner. - catch { $newnode_r get foo } e - assert_equal "MOVED $slot $owner_host:$owner_port" $e - - # Check that the now empty primary node doesn't turn itself into - # a replica of any other nodes - wait_for_cluster_propagation - assert_match *master* [$owner_r role] - } -} From eccd523ff562bf5c745ae9939f5da8bbdcaf128d Mon Sep 17 00:00:00 2001 From: Hanxi Zhang Date: Tue, 18 Nov 2025 20:57:53 +0000 Subject: [PATCH 13/14] Added code to ensure that all the nodes actually believe each other are healthy Signed-off-by: Hanxi Zhang --- tests/support/cluster_util.tcl | 10 ---------- tests/unit/cluster/cli.tcl | 19 +++++++++++++------ 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl index c27564f732..ee14c58648 100644 --- a/tests/support/cluster_util.tcl +++ b/tests/support/cluster_util.tcl @@ -458,13 +458,3 @@ proc wait_for_slot_state {srv_idx pattern} { fail "incorrect slot state on R $srv_idx: expected $pattern; got [get_open_slots $srv_idx]" } } - -# Returns the test index of a node given its node ID -proc get_node_index_by_id {node_id} { - for {set i 0} {$i < [llength $::servers]} {incr i} { - if {[R $i CLUSTER MYID] eq $node_id} { - return $i - } - } - return -1 ;# not found -} diff --git a/tests/unit/cluster/cli.tcl b/tests/unit/cluster/cli.tcl index f79ee0a09b..3716383d3d 100644 --- a/tests/unit/cluster/cli.tcl +++ b/tests/unit/cluster/cli.tcl @@ -295,13 +295,20 @@ test {Migrate the last slot away from a node using valkey-cli} { set owner_r [valkey $owner_host $owner_port 0 $::tls] set owner_id [$owner_r CLUSTER MYID] - # get the index of the owner node - set owner_index [get_node_index_by_id $owner_id] - + # Cluster check just verifies the config state is self-consistent, + # waiting for cluster_state to be okay is an independent check that all the + # nodes actually believe each other are healthy, prevent cluster down error. wait_for_condition 1000 50 { - [cluster_get_node_by_id $owner_index $newnode_id] ne {} + [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv 0 port]}] == 0 && + [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv -1 port]}] == 0 && + [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv -2 port]}] == 0 && + [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv -3 port]}] == 0 && + [CI 0 cluster_state] eq {ok} && + [CI 1 cluster_state] eq {ok} && + [CI 2 cluster_state] eq {ok} && + [CI 3 cluster_state] eq {ok} } else { - fail "Owner node never learned the new node" + fail "Cluster doesn't stabilize" } # Move slot to new node using plain commands @@ -539,4 +546,4 @@ start_multiple_servers 3 [list overrides $base_conf] { } } -} +} \ No newline at end of file From 2708cdb9e96c5c5fc4e41884023c75c6792790cc Mon Sep 17 00:00:00 2001 From: Hanxi Zhang Date: Wed, 19 Nov 2025 18:51:41 +0000 Subject: [PATCH 14/14] Refactor: merge the fix with a previous step Signed-off-by: Hanxi Zhang --- tests/unit/cluster/cli.tcl | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) diff --git a/tests/unit/cluster/cli.tcl b/tests/unit/cluster/cli.tcl index 3716383d3d..163c1a6cc5 100644 --- a/tests/unit/cluster/cli.tcl +++ b/tests/unit/cluster/cli.tcl @@ -275,7 +275,14 @@ test {Migrate the last slot away from a node using valkey-cli} { # First we wait for new node to be recognized by entire cluster wait_for_cluster_size 4 + # Cluster check just verifies the config state is self-consistent, + # waiting for cluster_state to be okay is an independent check that all the + # nodes actually believe each other are healthy, prevent cluster down error. wait_for_condition 1000 50 { + [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv 0 port]}] == 0 && + [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv -1 port]}] == 0 && + [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv -2 port]}] == 0 && + [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv -3 port]}] == 0 && [CI 0 cluster_state] eq {ok} && [CI 1 cluster_state] eq {ok} && [CI 2 cluster_state] eq {ok} && @@ -295,22 +302,6 @@ test {Migrate the last slot away from a node using valkey-cli} { set owner_r [valkey $owner_host $owner_port 0 $::tls] set owner_id [$owner_r CLUSTER MYID] - # Cluster check just verifies the config state is self-consistent, - # waiting for cluster_state to be okay is an independent check that all the - # nodes actually believe each other are healthy, prevent cluster down error. - wait_for_condition 1000 50 { - [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv 0 port]}] == 0 && - [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv -1 port]}] == 0 && - [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv -2 port]}] == 0 && - [catch {exec src/valkey-cli --cluster check 127.0.0.1:[srv -3 port]}] == 0 && - [CI 0 cluster_state] eq {ok} && - [CI 1 cluster_state] eq {ok} && - [CI 2 cluster_state] eq {ok} && - [CI 3 cluster_state] eq {ok} - } else { - fail "Cluster doesn't stabilize" - } - # Move slot to new node using plain commands assert_equal OK [$newnode_r CLUSTER SETSLOT $slot IMPORTING $owner_id] assert_equal OK [$owner_r CLUSTER SETSLOT $slot MIGRATING $newnode_id]