Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 42 additions & 24 deletions tests/instances.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,14 @@ proc exec_instance {type dirname cfgfile} {

# Spawn a server or sentinel instance, depending on 'type'.
proc spawn_instance {type base_port count {conf {}} {base_conf_file ""}} {
set current_instances_count [llength [set ::${type}_instances]]
for {set j 0} {$j < $count} {incr j} {
set instance_id [expr $current_instances_count + $j]
set port [find_available_port $base_port $::valkey_port_count]
# plaintext port (only used for TLS cluster)
set pport 0
# Create a directory for this instance.
set dirname "${type}_${j}"
set dirname "${type}_${instance_id}"
lappend ::dirs $dirname
catch {exec rm -rf $dirname}
file mkdir $dirname
Expand Down Expand Up @@ -141,7 +143,7 @@ proc spawn_instance {type base_port count {conf {}} {base_conf_file ""}} {

# Check availability
if {[server_is_up 127.0.0.1 $port 100] == 0} {
puts "Starting $type #$j at port $port failed, try another"
puts "Starting $type #$instance_id at port $port failed, try another"
incr retry -1
set port [find_available_port $base_port $::valkey_port_count]
set cfg [open $cfgfile a+]
Expand All @@ -154,7 +156,7 @@ proc spawn_instance {type base_port count {conf {}} {base_conf_file ""}} {
}
close $cfg
} else {
puts "Starting $type #$j at port $port"
puts "Starting $type #$instance_id at port $port"
lappend ::pids $pid
break
}
Expand All @@ -164,13 +166,14 @@ proc spawn_instance {type base_port count {conf {}} {base_conf_file ""}} {
if {[server_is_up $::host $port 100] == 0} {
set logfile [file join $dirname log.txt]
puts [exec tail $logfile]
abort_sentinel_test "Problems starting $type #$j: ping timeout, maybe server start failed, check $logfile"
abort_sentinel_test "Problems starting $type #$instance_id: ping timeout, maybe server start failed, check $logfile"
}

# Push the instance into the right list
set link [valkey $::host $port 0 $::tls]
$link reconnect 1
lappend ::${type}_instances [list \
instance_id $instance_id \
pid $pid \
host $::host \
port $port \
Expand All @@ -180,6 +183,23 @@ proc spawn_instance {type base_port count {conf {}} {base_conf_file ""}} {
}
}

proc remove_valkey_instance { ids } {
set ids [lsort -unique $ids]
set new {}
foreach instance $::valkey_instances {
# Treats the flat list entry {pid 93770 host 127.0.0.1 port 30000 ...}
# as alternating key/value pairs.
array set inst $instance
# Found the target instance
if { [lsearch -exact $ids $inst(instance_id)] >= 0} {
kill_instance "valkey" $inst(instance_id)
} else {
lappend new $instance
}
}
set ::valkey_instances $new
}

proc log_crashes {} {
set start_pattern {*BUG REPORT START*}
set logs [glob */log.txt]
Expand Down Expand Up @@ -258,9 +278,9 @@ proc cleanup {} {
if {$::dont_clean} {
return
}
foreach dir $::dirs {
catch {exec rm -rf $dir}
}
# Don't clean up the log files. We often need to examine the
# logs regardless of whether the test fail or not.
# The logs will be cleaned up in run.tcl prior to running tests
}

proc abort_sentinel_test msg {
Expand Down Expand Up @@ -445,7 +465,8 @@ proc test {descr code} {
# Check memory leaks when running on macOS using the "leaks" utility.
proc check_leaks instance_types {
if {[string match {*Darwin*} [exec uname -a]]} {
puts -nonewline "Testing for memory leaks..."; flush stdout
set ts [clock format [clock seconds] -format %H:%M:%S]
puts -nonewline "$ts> Testing for memory leaks..."; flush stdout
foreach type $instance_types {
foreach_instance_id [set ::${type}_instances] id {
if {[instance_is_killed $type $id]} continue
Expand Down Expand Up @@ -488,7 +509,7 @@ while 1 {
continue
}
if {[file isdirectory $test]} continue
puts [colorstr yellow "Testing unit: [lindex [file split $test] end]"]
puts [colorstr yellow "\nTesting unit: [lindex [file split $test] end]"]
if {[catch { source $test } err]} {
puts "FAILED: caught an error in the test $err"
puts $::errorInfo
Expand Down Expand Up @@ -621,29 +642,26 @@ proc set_instance_attrib {type id attrib newval} {
lset ::${type}_instances $id $d
}

# Create a master-slave cluster of the given number of total instances.
# The first instance "0" is the master, all others are configured as
# slaves.
proc create_valkey_master_slave_cluster n {
foreach_valkey_id id {
if {$id == 0} {
# Our master.
# Create a primary-replica cluster of the given number of total instances.
# The first instance (ID 0) is the primary, all others are configured as
# replicas.
proc create_valkey_primary_replica_cluster { n primary_id } {
for {set id $primary_id} {$id < [expr $primary_id + $n]} {incr id} {
if {$id == $primary_id} {
R $id slaveof no one
R $id flushall
} elseif {$id < $n} {
R $id slaveof [get_instance_attrib valkey 0 host] \
[get_instance_attrib valkey 0 port]
} else {
# Instances not part of the cluster.
R $id slaveof no one
R $id slaveof [get_instance_attrib valkey $primary_id host] [get_instance_attrib valkey $primary_id port]
}
}
# Wait for all the slaves to sync.
# Wait for all the replicas to sync.
wait_for_condition 1000 50 {
[RI 0 connected_slaves] == ($n-1)
[RI $primary_id connected_slaves] == ($n-1)
} else {
fail "Unable to create a master-slaves cluster."
fail "Unable to create a primary-replica cluster. Connected [RI $primary_id connected_slaves] \
replicas but expected [expr $n-1]"
}
puts "Successfully created a primary-replica cluster of $n instances with primary ID $primary_id"
}

proc get_instance_id_by_port {type port} {
Expand Down
10 changes: 10 additions & 0 deletions tests/sentinel/run.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@ set ::instances_count 5 ; # How many instances we use at max.
set ::tlsdir "../../tls"

proc main {} {
set start [clock milliseconds]
# Clean up log files from previous test run
foreach name [glob -tails -directory [pwd] * .*] {
if {$name in { . .. .gitignore }} continue
file delete -force -- $name
}

parse_options
if {$::leaked_fds_file != ""} {
set ::env(LEAKED_FDS_FILE) $::leaked_fds_file
Expand All @@ -26,6 +33,9 @@ proc main {} {
}
run_tests
cleanup
set end [clock milliseconds]
set duration [expr {($end - $start) / 1000}]
puts "Total test duration: ${duration} seconds"
end_tests
}

Expand Down
8 changes: 5 additions & 3 deletions tests/sentinel/tests/00-base.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ test "The old primary eventually gets reconfigured as a slave" {
}

test "ODOWN is not possible without N (quorum) Sentinels reports" {
set sentinels [llength $::sentinel_instances]
foreach_sentinel_id id {
S $id SENTINEL SET mymaster quorum [expr $sentinels+1]
}
Expand All @@ -135,13 +136,14 @@ test "ODOWN is not possible without N (quorum) Sentinels reports" {
assert {[lindex $addr 1] == $old_port}
kill_instance valkey $master_id

# Make sure failover did not happened.
# Make sure failover did not happen.
set addr [S 0 SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster]
assert {[lindex $addr 1] == $old_port}
restart_instance valkey $master_id
}

test "Failover is not possible without majority agreement" {
set quorum [expr {$sentinels/2 + 1}]
foreach_sentinel_id id {
S $id SENTINEL SET mymaster quorum $quorum
}
Expand All @@ -154,7 +156,7 @@ test "Failover is not possible without majority agreement" {
# Kill the current master
kill_instance valkey $master_id

# Make sure failover did not happened.
# Make sure failover did not happen.
set addr [S $quorum SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster]
assert {[lindex $addr 1] == $old_port}
restart_instance valkey $master_id
Expand Down Expand Up @@ -202,7 +204,7 @@ test "New primary [join $addr {:}] role matches" {
assert {[RI $master_id role] eq {master}}
}

test "SENTINEL RESET can resets the primary" {
test "SENTINEL RESET can reset the primary" {
# After SENTINEL RESET, sometimes the sentinel can sense the primary again,
# causing the test to fail. Here we give it a few more chances.
for {set j 0} {$j < 10} {incr j} {
Expand Down
53 changes: 53 additions & 0 deletions tests/sentinel/tests/04-replica-selection.tcl
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Test replica selection algorithm.
#
# This unit should test:
# 1) That when there are no suitable replicas, no failover is performed.
# 2) That among the available replicas, the one with better offset is picked.
source "../tests/includes/init-tests.tcl"

foreach_sentinel_id id {
S $id SENTINEL DEBUG ping-period 500
S $id SENTINEL DEBUG ask-period 500
S $id SENTINEL DEBUG info-period 500
S $id SENTINEL DEBUG default-down-after 1000
}

# This unit is the only one in the whole sentinel test suite that
# requires two clusters. Here we will mainly operate on the second cluster.

# Spawn 1 primary with only 1 replica
set num_instances 2
spawn_instance valkey $::valkey_base_port $num_instances {
"enable-protected-configs yes"
"enable-debug-command yes"
"save ''"
}

set primary_a_id 0
set primary_a_name "mymaster"
# The first 5 IDs belong to the default primary-replica cluster
set primary_b_id $::instances_count
set primary_b_name "another_primary"
set replica_id [expr $primary_b_id + 1]

# Create the second cluster
init_cluster $primary_b_name $primary_b_id $num_instances

# Start tests
test "The second cluster works" {
# Put a simple string into the database
R $primary_b_id SET "mykey" "myvalue"

wait_for_condition 200 50 {
[R $replica_id GET "mykey"] == "myvalue"
} else {
fail "The replica and primary in the second cluster cannot sync"
}
}

# Now that we have two clusters, we need to do proper cleanup
# to avoid messing up other test suites.
foreach_sentinel_id id {
S $id SENTINEL REMOVE $primary_b_name
}
remove_valkey_instance [list $primary_b_id $replica_id]
5 changes: 0 additions & 5 deletions tests/sentinel/tests/04-slave-selection.tcl

This file was deleted.

90 changes: 53 additions & 37 deletions tests/sentinel/tests/includes/init-tests.tcl
Original file line number Diff line number Diff line change
@@ -1,63 +1,79 @@
# Initialization tests -- most units will start including this.
source "../tests/includes/utils.tcl"

test "(init) Restart killed instances" {
restart_killed_instances
}

test "(init) Remove old primary entry from sentinels" {
foreach_sentinel_id id {
catch {S $id SENTINEL REMOVE mymaster}
}
}

set redis_slaves [expr $::instances_count - 1]
test "(init) Create a primary-replicas cluster of [expr $redis_slaves+1] instances" {
create_valkey_master_slave_cluster [expr {$redis_slaves+1}]
}
set master_id 0

test "(init) Sentinels can start monitoring a primary" {
proc sentinel_monitor_primary { primary_name primary_id } {
set sentinels [llength $::sentinel_instances]
set quorum [expr {$sentinels/2+1}]
foreach_sentinel_id id {
S $id SENTINEL MONITOR mymaster \
[get_instance_attrib valkey $master_id host] \
[get_instance_attrib valkey $master_id port] $quorum
S $id SENTINEL MONITOR $primary_name \
[get_instance_attrib valkey $primary_id host] [get_instance_attrib valkey $primary_id port] $quorum
}
foreach_sentinel_id id {
assert {[S $id sentinel primary mymaster] ne {}}
S $id SENTINEL SET mymaster down-after-milliseconds 2000
S $id SENTINEL SET mymaster failover-timeout 10000
S $id SENTINEL debug tilt-period 5000
S $id SENTINEL SET mymaster parallel-syncs 10
assert {[S $id sentinel primary $primary_name] ne {}}
S $id SENTINEL SET $primary_name down-after-milliseconds 2000
S $id SENTINEL SET $primary_name failover-timeout 10000
S $id SENTINEL DEBUG tilt-period 5000
S $id SENTINEL SET $primary_name parallel-syncs 10
if {$::leaked_fds_file != "" && [exec uname] == "Linux"} {
S $id SENTINEL SET mymaster notification-script ../../tests/helpers/check_leaked_fds.tcl
S $id SENTINEL SET mymaster client-reconfig-script ../../tests/helpers/check_leaked_fds.tcl
S $id SENTINEL SET $primary_name notification-script ../../tests/helpers/check_leaked_fds.tcl
S $id SENTINEL SET $primary_name client-reconfig-script ../../tests/helpers/check_leaked_fds.tcl
}
}
}

test "(init) Sentinels can talk with the primary" {
proc verify_sentinel_connect_to_primary { primary_name } {
foreach_sentinel_id id {
wait_for_condition 1000 50 {
[catch {S $id SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster}] == 0
[catch {S $id SENTINEL GET-PRIMARY-ADDR-BY-NAME $primary_name}] == 0
} else {
fail "Sentinel $id can't talk with the primary."
fail "Sentinel $id can't talk with the primary $primary_name"
}
}
}

test "(init) Sentinels are able to auto-discover other sentinels" {
verify_sentinel_auto_discovery
}

test "(init) Sentinels are able to auto-discover replicas" {
proc verify_sentinel_discover_replicas { primary_name expected_replicas } {
foreach_sentinel_id id {
wait_for_condition 1000 50 {
[dict get [S $id SENTINEL PRIMARY mymaster] num-slaves] == $redis_slaves
[dict get [S $id SENTINEL primary $primary_name] num-slaves] == $expected_replicas
} else {
fail "At least some sentinels can't detect some replicas"
fail "For primary $primary_name, at least some sentinel can't detect some replicas"
}
}
}

proc init_cluster { primary_name primary_id num_instances } {
puts "Initializing test setup for cluster: primary $primary_name with $num_instances instances"

test "(init) Restart killed instances" {
restart_killed_instances
}

test "(init) Remove old primary entry from sentinels" {
foreach_sentinel_id id {
catch {S $id SENTINEL REMOVE $primary_name}
}
}

test "(init) Create a primary-replicas cluster of $num_instances instances" {
create_valkey_primary_replica_cluster $num_instances $primary_id
}

test "(init) Sentinels can start monitoring a primary" {
sentinel_monitor_primary $primary_name $primary_id
}

test "(init) Sentinels can talk with the primary" {
verify_sentinel_connect_to_primary $primary_name
}

test "(init) Sentinels are able to auto-discover other sentinels" {
verify_sentinel_auto_discovery $primary_name
}

test "(init) Sentinels are able to auto-discover replicas" {
verify_sentinel_discover_replicas $primary_name [expr $num_instances - 1]
}
}

set master_id 0
init_cluster "mymaster" $master_id $::instances_count
Loading
Loading