diff --git a/tests/instances.tcl b/tests/instances.tcl index 265fa054a3..29e28a9f92 100644 --- a/tests/instances.tcl +++ b/tests/instances.tcl @@ -66,12 +66,14 @@ proc exec_instance {type dirname cfgfile} { # Spawn a server or sentinel instance, depending on 'type'. proc spawn_instance {type base_port count {conf {}} {base_conf_file ""}} { + set current_instances_count [llength [set ::${type}_instances]] for {set j 0} {$j < $count} {incr j} { + set instance_id [expr $current_instances_count + $j] set port [find_available_port $base_port $::valkey_port_count] # plaintext port (only used for TLS cluster) set pport 0 # Create a directory for this instance. - set dirname "${type}_${j}" + set dirname "${type}_${instance_id}" lappend ::dirs $dirname catch {exec rm -rf $dirname} file mkdir $dirname @@ -141,7 +143,7 @@ proc spawn_instance {type base_port count {conf {}} {base_conf_file ""}} { # Check availability if {[server_is_up 127.0.0.1 $port 100] == 0} { - puts "Starting $type #$j at port $port failed, try another" + puts "Starting $type #$instance_id at port $port failed, try another" incr retry -1 set port [find_available_port $base_port $::valkey_port_count] set cfg [open $cfgfile a+] @@ -154,7 +156,7 @@ proc spawn_instance {type base_port count {conf {}} {base_conf_file ""}} { } close $cfg } else { - puts "Starting $type #$j at port $port" + puts "Starting $type #$instance_id at port $port" lappend ::pids $pid break } @@ -164,13 +166,14 @@ proc spawn_instance {type base_port count {conf {}} {base_conf_file ""}} { if {[server_is_up $::host $port 100] == 0} { set logfile [file join $dirname log.txt] puts [exec tail $logfile] - abort_sentinel_test "Problems starting $type #$j: ping timeout, maybe server start failed, check $logfile" + abort_sentinel_test "Problems starting $type #$instance_id: ping timeout, maybe server start failed, check $logfile" } # Push the instance into the right list set link [valkey $::host $port 0 $::tls] $link reconnect 1 lappend 
::${type}_instances [list \ + instance_id $instance_id \ pid $pid \ host $::host \ port $port \ @@ -180,6 +183,23 @@ proc spawn_instance {type base_port count {conf {}} {base_conf_file ""}} { } } +proc remove_valkey_instance { ids } { + set ids [lsort -unique $ids] + set new {} + foreach instance $::valkey_instances { + # Treats the flat list entry {pid 93770 host 127.0.0.1 port 30000 ...} + # as alternating key/value pairs. + array set inst $instance + # Found the target instance + if { [lsearch -exact $ids $inst(instance_id)] >= 0} { + kill_instance "valkey" $inst(instance_id) + } else { + lappend new $instance + } + } + set ::valkey_instances $new +} + proc log_crashes {} { set start_pattern {*BUG REPORT START*} set logs [glob */log.txt] @@ -258,9 +278,9 @@ proc cleanup {} { if {$::dont_clean} { return } - foreach dir $::dirs { - catch {exec rm -rf $dir} - } + # Don't clean up the log files. We often need to examine the + # logs regardless of whether the tests fail or not. + # The logs will be cleaned up in run.tcl prior to running tests } proc abort_sentinel_test msg { @@ -445,7 +465,8 @@ proc test {descr code} { # Check memory leaks when running on macOS using the "leaks" utility.
proc check_leaks instance_types { if {[string match {*Darwin*} [exec uname -a]]} { - puts -nonewline "Testing for memory leaks..."; flush stdout + set ts [clock format [clock seconds] -format %H:%M:%S] + puts -nonewline "$ts> Testing for memory leaks..."; flush stdout foreach type $instance_types { foreach_instance_id [set ::${type}_instances] id { if {[instance_is_killed $type $id]} continue @@ -488,7 +509,7 @@ while 1 { continue } if {[file isdirectory $test]} continue - puts [colorstr yellow "Testing unit: [lindex [file split $test] end]"] + puts [colorstr yellow "\nTesting unit: [lindex [file split $test] end]"] if {[catch { source $test } err]} { puts "FAILED: caught an error in the test $err" puts $::errorInfo @@ -621,29 +642,26 @@ proc set_instance_attrib {type id attrib newval} { lset ::${type}_instances $id $d } -# Create a master-slave cluster of the given number of total instances. -# The first instance "0" is the master, all others are configured as -# slaves. -proc create_valkey_master_slave_cluster n { - foreach_valkey_id id { - if {$id == 0} { - # Our master. +# Create a primary-replica cluster of the given number of total instances. +# The first instance (ID 0) is the primary, all others are configured as +# replicas. +proc create_valkey_primary_replica_cluster { n primary_id } { + for {set id $primary_id} {$id < [expr $primary_id + $n]} {incr id} { + if {$id == $primary_id} { R $id slaveof no one R $id flushall - } elseif {$id < $n} { - R $id slaveof [get_instance_attrib valkey 0 host] \ - [get_instance_attrib valkey 0 port] } else { - # Instances not part of the cluster. - R $id slaveof no one + R $id slaveof [get_instance_attrib valkey $primary_id host] [get_instance_attrib valkey $primary_id port] } } - # Wait for all the slaves to sync. + # Wait for all the replicas to sync. wait_for_condition 1000 50 { - [RI 0 connected_slaves] == ($n-1) + [RI $primary_id connected_slaves] == ($n-1) } else { - fail "Unable to create a master-slaves cluster." 
+ fail "Unable to create a primary-replica cluster. Connected [RI $primary_id connected_slaves] \ + replicas but expected [expr $n-1]" } + puts "Successfully created a primary-replica cluster of $n instances with primary ID $primary_id" } proc get_instance_id_by_port {type port} { diff --git a/tests/sentinel/run.tcl b/tests/sentinel/run.tcl index 9cbb189bed..bc4c51c86a 100644 --- a/tests/sentinel/run.tcl +++ b/tests/sentinel/run.tcl @@ -9,6 +9,13 @@ set ::instances_count 5 ; # How many instances we use at max. set ::tlsdir "../../tls" proc main {} { + set start [clock milliseconds] + # Clean up log files from previous test run + foreach name [glob -tails -directory [pwd] * .*] { + if {$name in { . .. .gitignore }} continue + file delete -force -- $name + } + parse_options if {$::leaked_fds_file != ""} { set ::env(LEAKED_FDS_FILE) $::leaked_fds_file @@ -26,6 +33,9 @@ proc main {} { } run_tests cleanup + set end [clock milliseconds] + set duration [expr {($end - $start) / 1000}] + puts "Total test duration: ${duration} seconds" end_tests } diff --git a/tests/sentinel/tests/00-base.tcl b/tests/sentinel/tests/00-base.tcl index 97cc6540a9..1476bbf735 100644 --- a/tests/sentinel/tests/00-base.tcl +++ b/tests/sentinel/tests/00-base.tcl @@ -127,6 +127,7 @@ test "The old primary eventually gets reconfigured as a slave" { } test "ODOWN is not possible without N (quorum) Sentinels reports" { + set sentinels [llength $::sentinel_instances] foreach_sentinel_id id { S $id SENTINEL SET mymaster quorum [expr $sentinels+1] } @@ -135,13 +136,14 @@ test "ODOWN is not possible without N (quorum) Sentinels reports" { assert {[lindex $addr 1] == $old_port} kill_instance valkey $master_id - # Make sure failover did not happened. + # Make sure failover did not happen. 
set addr [S 0 SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster] assert {[lindex $addr 1] == $old_port} restart_instance valkey $master_id } test "Failover is not possible without majority agreement" { + set quorum [expr {$sentinels/2 + 1}] foreach_sentinel_id id { S $id SENTINEL SET mymaster quorum $quorum } @@ -154,7 +156,7 @@ test "Failover is not possible without majority agreement" { # Kill the current master kill_instance valkey $master_id - # Make sure failover did not happened. + # Make sure failover did not happen. set addr [S $quorum SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster] assert {[lindex $addr 1] == $old_port} restart_instance valkey $master_id @@ -202,7 +204,7 @@ test "New primary [join $addr {:}] role matches" { assert {[RI $master_id role] eq {master}} } -test "SENTINEL RESET can resets the primary" { +test "SENTINEL RESET can reset the primary" { # After SENTINEL RESET, sometimes the sentinel can sense the primary again, # causing the test to fail. Here we give it a few more chances. for {set j 0} {$j < 10} {incr j} { diff --git a/tests/sentinel/tests/04-replica-selection.tcl b/tests/sentinel/tests/04-replica-selection.tcl new file mode 100644 index 0000000000..0bdf4d2a58 --- /dev/null +++ b/tests/sentinel/tests/04-replica-selection.tcl @@ -0,0 +1,53 @@ +# Test replica selection algorithm. +# +# This unit should test: +# 1) That when there are no suitable replicas, no failover is performed. +# 2) That among the available replicas, the one with better offset is picked. +source "../tests/includes/init-tests.tcl" + +foreach_sentinel_id id { + S $id SENTINEL DEBUG ping-period 500 + S $id SENTINEL DEBUG ask-period 500 + S $id SENTINEL DEBUG info-period 500 + S $id SENTINEL DEBUG default-down-after 1000 +} + +# This unit is the only one in the whole sentinel test suite that +# requires two clusters. Here we will mainly operate on the second cluster. 
+ +# Spawn 1 primary with only 1 replica +set num_instances 2 +spawn_instance valkey $::valkey_base_port $num_instances { + "enable-protected-configs yes" + "enable-debug-command yes" + "save ''" +} + +set primary_a_id 0 +set primary_a_name "mymaster" +# The first 5 IDs belong to the default primary-replica cluster +set primary_b_id $::instances_count +set primary_b_name "another_primary" +set replica_id [expr $primary_b_id + 1] + +# Create the second cluster +init_cluster $primary_b_name $primary_b_id $num_instances + +# Start tests +test "The second cluster works" { + # Put a simple string into the database + R $primary_b_id SET "mykey" "myvalue" + + wait_for_condition 200 50 { + [R $replica_id GET "mykey"] == "myvalue" + } else { + fail "The replica and primary in the second cluster cannot sync" + } +} + +# Now that we have two clusters, we need to do proper cleanup +# to avoid messing up other test suites. +foreach_sentinel_id id { + S $id SENTINEL REMOVE $primary_b_name +} +remove_valkey_instance [list $primary_b_id $replica_id] \ No newline at end of file diff --git a/tests/sentinel/tests/04-slave-selection.tcl b/tests/sentinel/tests/04-slave-selection.tcl deleted file mode 100644 index 3d2ca64845..0000000000 --- a/tests/sentinel/tests/04-slave-selection.tcl +++ /dev/null @@ -1,5 +0,0 @@ -# Test slave selection algorithm. -# -# This unit should test: -# 1) That when there are no suitable slaves no failover is performed. -# 2) That among the available slaves, the one with better offset is picked. diff --git a/tests/sentinel/tests/includes/init-tests.tcl b/tests/sentinel/tests/includes/init-tests.tcl index 456a810c5d..82c07dc0cc 100644 --- a/tests/sentinel/tests/includes/init-tests.tcl +++ b/tests/sentinel/tests/includes/init-tests.tcl @@ -1,63 +1,79 @@ # Initialization tests -- most units will start including this. 
source "../tests/includes/utils.tcl" -test "(init) Restart killed instances" { - restart_killed_instances -} - -test "(init) Remove old primary entry from sentinels" { - foreach_sentinel_id id { - catch {S $id SENTINEL REMOVE mymaster} - } -} - -set redis_slaves [expr $::instances_count - 1] -test "(init) Create a primary-replicas cluster of [expr $redis_slaves+1] instances" { - create_valkey_master_slave_cluster [expr {$redis_slaves+1}] -} -set master_id 0 - -test "(init) Sentinels can start monitoring a primary" { +proc sentinel_monitor_primary { primary_name primary_id } { set sentinels [llength $::sentinel_instances] set quorum [expr {$sentinels/2+1}] foreach_sentinel_id id { - S $id SENTINEL MONITOR mymaster \ - [get_instance_attrib valkey $master_id host] \ - [get_instance_attrib valkey $master_id port] $quorum + S $id SENTINEL MONITOR $primary_name \ + [get_instance_attrib valkey $primary_id host] [get_instance_attrib valkey $primary_id port] $quorum } foreach_sentinel_id id { - assert {[S $id sentinel primary mymaster] ne {}} - S $id SENTINEL SET mymaster down-after-milliseconds 2000 - S $id SENTINEL SET mymaster failover-timeout 10000 - S $id SENTINEL debug tilt-period 5000 - S $id SENTINEL SET mymaster parallel-syncs 10 + assert {[S $id sentinel primary $primary_name] ne {}} + S $id SENTINEL SET $primary_name down-after-milliseconds 2000 + S $id SENTINEL SET $primary_name failover-timeout 10000 + S $id SENTINEL DEBUG tilt-period 5000 + S $id SENTINEL SET $primary_name parallel-syncs 10 if {$::leaked_fds_file != "" && [exec uname] == "Linux"} { - S $id SENTINEL SET mymaster notification-script ../../tests/helpers/check_leaked_fds.tcl - S $id SENTINEL SET mymaster client-reconfig-script ../../tests/helpers/check_leaked_fds.tcl + S $id SENTINEL SET $primary_name notification-script ../../tests/helpers/check_leaked_fds.tcl + S $id SENTINEL SET $primary_name client-reconfig-script ../../tests/helpers/check_leaked_fds.tcl } } } -test "(init) Sentinels can talk 
with the primary" { +proc verify_sentinel_connect_to_primary { primary_name } { foreach_sentinel_id id { wait_for_condition 1000 50 { - [catch {S $id SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster}] == 0 + [catch {S $id SENTINEL GET-PRIMARY-ADDR-BY-NAME $primary_name}] == 0 } else { - fail "Sentinel $id can't talk with the primary." + fail "Sentinel $id can't talk with the primary $primary_name" } } } -test "(init) Sentinels are able to auto-discover other sentinels" { - verify_sentinel_auto_discovery -} - -test "(init) Sentinels are able to auto-discover replicas" { +proc verify_sentinel_discover_replicas { primary_name expected_replicas } { foreach_sentinel_id id { wait_for_condition 1000 50 { - [dict get [S $id SENTINEL PRIMARY mymaster] num-slaves] == $redis_slaves + [dict get [S $id SENTINEL primary $primary_name] num-slaves] == $expected_replicas } else { - fail "At least some sentinels can't detect some replicas" + fail "For primary $primary_name, at least some sentinel can't detect some replicas" + } + } +} + +proc init_cluster { primary_name primary_id num_instances } { + puts "Initializing test setup for cluster: primary $primary_name with $num_instances instances" + + test "(init) Restart killed instances" { + restart_killed_instances + } + + test "(init) Remove old primary entry from sentinels" { + foreach_sentinel_id id { + catch {S $id SENTINEL REMOVE $primary_name} } } + + test "(init) Create a primary-replicas cluster of $num_instances instances" { + create_valkey_primary_replica_cluster $num_instances $primary_id + } + + test "(init) Sentinels can start monitoring a primary" { + sentinel_monitor_primary $primary_name $primary_id + } + + test "(init) Sentinels can talk with the primary" { + verify_sentinel_connect_to_primary $primary_name + } + + test "(init) Sentinels are able to auto-discover other sentinels" { + verify_sentinel_auto_discovery $primary_name + } + + test "(init) Sentinels are able to auto-discover replicas" { + 
verify_sentinel_discover_replicas $primary_name [expr $num_instances - 1] + } } + +set master_id 0 +init_cluster "mymaster" $master_id $::instances_count \ No newline at end of file diff --git a/tests/sentinel/tests/includes/utils.tcl b/tests/sentinel/tests/includes/utils.tcl index b67efeaa85..64bcc694db 100644 --- a/tests/sentinel/tests/includes/utils.tcl +++ b/tests/sentinel/tests/includes/utils.tcl @@ -19,18 +19,22 @@ proc verify_sentinel_connect_sentinels {id} { return 1 } -proc verify_sentinel_auto_discovery {} { +proc verify_sentinel_auto_discovery { {primary_name {}} } { + if {$primary_name eq {}} { + set primary_name "mymaster" + } + set sentinels [llength $::sentinel_instances] foreach_sentinel_id id { wait_for_condition 1000 50 { - [dict get [S $id SENTINEL PRIMARY mymaster] num-other-sentinels] == ($sentinels-1) + [dict get [S $id SENTINEL PRIMARY $primary_name] num-other-sentinels] == ($sentinels-1) } else { - fail "At least some sentinel can't detect some other sentinel" + fail "For primary $primary_name, at least some sentinel can't detect some other sentinel" } wait_for_condition 1000 50 { [verify_sentinel_connect_sentinels $id] == 1 } else { - fail "At least some sentinel can't connect to other sentinel" + fail "For primary $primary_name, at least some sentinel can't connect to other sentinel" } } }