diff --git a/tests/cluster/tests/00-base.tcl b/tests/cluster/tests/00-base.tcl deleted file mode 100644 index cfb458fee0..0000000000 --- a/tests/cluster/tests/00-base.tcl +++ /dev/null @@ -1,89 +0,0 @@ -# Check the basic monitoring and failover capabilities. - -source "../tests/includes/init-tests.tcl" - -if {$::simulate_error} { - test "This test will fail" { - fail "Simulated error" - } -} - -test "Different nodes have different IDs" { - set ids {} - set numnodes 0 - foreach_valkey_id id { - incr numnodes - # Every node should just know itself. - set nodeid [dict get [get_myself $id] id] - assert {$nodeid ne {}} - lappend ids $nodeid - } - set numids [llength [lsort -unique $ids]] - assert {$numids == $numnodes} -} - -test "It is possible to perform slot allocation" { - cluster_allocate_slots 5 -} - -test "After the join, every node gets a different config epoch" { - set trynum 60 - while {[incr trynum -1] != 0} { - # We check that this condition is true for *all* the nodes. - set ok 1 ; # Will be set to 0 every time a node is not ok. - foreach_valkey_id id { - set epochs {} - foreach n [get_cluster_nodes $id] { - lappend epochs [dict get $n config_epoch] - } - if {[lsort $epochs] != [lsort -unique $epochs]} { - set ok 0 ; # At least one collision! - } - } - if {$ok} break - after 1000 - puts -nonewline . - flush stdout - } - if {$trynum == 0} { - fail "Config epoch conflict resolution is not working." - } -} - -test "Nodes should report cluster_state is ok now" { - assert_cluster_state ok -} - -test "Sanity for CLUSTER COUNTKEYSINSLOT" { - set reply [R 0 CLUSTER COUNTKEYSINSLOT 0] - assert {$reply eq 0} -} - -test "It is possible to write and read from the cluster" { - cluster_write_test 0 -} - -test "CLUSTER RESET SOFT test" { - set last_epoch_node0 [get_info_field [R 0 cluster info] cluster_current_epoch] - R 0 FLUSHALL - R 0 CLUSTER RESET - assert {[get_info_field [R 0 cluster info] cluster_current_epoch] eq $last_epoch_node0} - - set last_epoch_node1 [get_info_field [R 1 cluster info] cluster_current_epoch] - R 1 FLUSHALL - R 1 CLUSTER RESET SOFT - assert {[get_info_field [R 1 cluster info] cluster_current_epoch] eq $last_epoch_node1} -} - -test "Coverage: CLUSTER HELP" { - assert_match "*CLUSTER *" [R 0 CLUSTER HELP] -} - -test "Coverage: ASKING" { - assert_equal {OK} [R 0 ASKING] -} - -test "CLUSTER SLAVES and CLUSTER REPLICAS with zero replicas" { - assert_equal {} [R 0 cluster slaves [R 0 CLUSTER MYID]] - assert_equal {} [R 0 cluster replicas [R 0 CLUSTER MYID]] -} diff --git a/tests/cluster/tests/01-faildet.tcl b/tests/cluster/tests/01-faildet.tcl deleted file mode 100644 index 5d40aad7da..0000000000 --- a/tests/cluster/tests/01-faildet.tcl +++ /dev/null @@ -1,38 +0,0 @@ -# Check the basic monitoring and failover capabilities. - -source "../tests/includes/init-tests.tcl" - -test "Create a 5 nodes cluster" { - create_cluster 5 5 -} - -test "Cluster should start ok" { - assert_cluster_state ok -} - -test "Killing two slave nodes" { - kill_instance valkey 5 - kill_instance valkey 6 -} - -test "Cluster should be still up" { - assert_cluster_state ok -} - -test "Killing one master node" { - kill_instance valkey 0 -} - -# Note: the only slave of instance 0 is already down so no -# failover is possible, that would change the state back to ok. 
-test "Cluster should be down now" { - assert_cluster_state fail -} - -test "Restarting master node" { - restart_instance valkey 0 -} - -test "Cluster should be up again" { - assert_cluster_state ok -} diff --git a/tests/cluster/tests/11-manual-takeover.tcl b/tests/cluster/tests/11-manual-takeover.tcl deleted file mode 100644 index ebc95960f3..0000000000 --- a/tests/cluster/tests/11-manual-takeover.tcl +++ /dev/null @@ -1,71 +0,0 @@ -# Manual takeover test - -source "../tests/includes/init-tests.tcl" - -test "Create a 5 nodes cluster" { - create_cluster 5 5 -} - -test "Cluster is up" { - assert_cluster_state ok -} - -test "Cluster is writable" { - cluster_write_test 0 -} - -# For this test, disable replica failover until -# all of the primaries are confirmed killed. Otherwise -# there might be enough time to elect a replica. -set replica_ids { 5 6 7 } -foreach id $replica_ids { - R $id config set cluster-replica-no-failover yes -} - -test "Killing majority of master nodes" { - kill_instance valkey 0 - kill_instance valkey 1 - kill_instance valkey 2 -} - -foreach id $replica_ids { - R $id config set cluster-replica-no-failover no -} - -test "Cluster should eventually be down" { - assert_cluster_state fail -} - -test "Use takeover to bring slaves back" { - foreach id $replica_ids { - R $id cluster failover takeover - } -} - -test "Cluster should eventually be up again" { - assert_cluster_state ok -} - -test "Cluster is writable" { - cluster_write_test 4 -} - -test "Instance #5, #6, #7 are now masters" { - foreach id $replica_ids { - assert {[RI $id role] eq {master}} - } -} - -test "Restarting the previously killed master nodes" { - restart_instance valkey 0 - restart_instance valkey 1 - restart_instance valkey 2 -} - -test "Instance #0, #1, #2 gets converted into a slaves" { - wait_for_condition 1000 50 { - [RI 0 role] eq {slave} && [RI 1 role] eq {slave} && [RI 2 role] eq {slave} - } else { - fail "Old masters not converted into slaves" - } -} diff --git a/tests/cluster/tests/13-no-failover-option.tcl b/tests/cluster/tests/13-no-failover-option.tcl deleted file mode 100644 index c11a502f8a..0000000000 --- a/tests/cluster/tests/13-no-failover-option.tcl +++ /dev/null @@ -1,61 +0,0 @@ -# Check that the no-failover option works - -source "../tests/includes/init-tests.tcl" - -test "Create a 5 nodes cluster" { - create_cluster 5 5 -} - -test "Cluster is up" { - assert_cluster_state ok -} - -test "Cluster is writable" { - cluster_write_test 0 -} - -test "Instance #5 is a slave" { - assert {[RI 5 role] eq {slave}} - - # Configure it to never failover the master - R 5 CONFIG SET cluster-slave-no-failover yes -} - -test "Instance #5 synced with the master" { - wait_for_condition 1000 50 { - [RI 5 master_link_status] eq {up} - } else { - fail "Instance #5 master link status is not up" - } -} - -test "The nofailover flag is propagated" { - set slave5_id [dict get [get_myself 5] id] - - foreach_valkey_id id { - wait_for_condition 1000 50 { - [has_flag [get_node_by_id $id $slave5_id] nofailover] - } else { - fail "Instance $id can't see the nofailover flag of slave" - } - } -} - -set current_epoch [CI 1 cluster_current_epoch] - -test "Killing one master node" { - kill_instance valkey 0 -} - -test "Cluster should be still down after some time" { - after 10000 - assert_cluster_state fail -} - -test "Instance #5 is still a slave" { - assert {[RI 5 role] eq {slave}} -} - -test "Restarting the previously killed master node" { - restart_instance valkey 0 -} diff --git a/tests/support/cluster_util.tcl 
b/tests/support/cluster_util.tcl index d89a5a384d..ebca69d9ca 100644 --- a/tests/support/cluster_util.tcl +++ b/tests/support/cluster_util.tcl @@ -1,5 +1,91 @@ # Cluster helper functions +source tests/support/cli.tcl +source tests/support/cluster.tcl + +proc config_set_all_nodes {keyword value} { + for {set j 0} {$j < [llength $::servers]} {incr j} { + R $j config set $keyword $value + } +} + +proc get_instance_id_by_port {type port} { + for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[srv [expr -1*$j] port] == $port} { + return $j + } + } + fail "Instance port $port not found." +} + +# Check if the cluster is writable and readable. Use node "port" +# as a starting point to talk with the cluster. +proc cluster_write_test {port} { + set prefix [randstring 20 20 alpha] + set cluster [valkey_cluster 127.0.0.1:$port] + for {set j 0} {$j < 100} {incr j} { + $cluster set key.$j $prefix.$j + } + for {set j 0} {$j < 100} {incr j} { + assert {[$cluster get key.$j] eq "$prefix.$j"} + } + $cluster close +} + +# Helper function to attempt to have each node in a cluster +# meet each other. +proc join_nodes_in_cluster {} { + # Join node 0 with 1, 1 with 2, ... and so forth. + # If auto-discovery works all nodes will know every other node + # eventually. + set ids {} + for {set id 0} {$id < [llength $::servers]} {incr id} {lappend ids $id} + for {set j 0} {$j < [expr [llength $ids]-1]} {incr j} { + set a [lindex $ids $j] + set b [lindex $ids [expr $j+1]] + set b_port [srv -$b port] + R $a cluster meet 127.0.0.1 $b_port + } + + for {set id 0} {$id < [llength $::servers]} {incr id} { + wait_for_condition 1000 50 { + [llength [get_cluster_nodes $id connected]] == [llength $ids] + } else { + return 0 + } + } + return 1 +} + +# Search the first node starting from ID $first that is not +# already configured as a replica. +proc cluster_find_available_replica {first} { + for {set id 0} {$id < [llength $::servers]} {incr id} { + if {$id < $first} continue + set me [cluster_get_myself $id] + if {[dict get $me slaveof] eq {-}} {return $id} + } + fail "No available replicas" +} + +proc fix_cluster {addr} { + set code [catch { + exec src/valkey-cli {*}[valkeycli_tls_config "./tests"] --cluster fix $addr << yes + } result] + if {$code != 0} { + puts "valkey-cli --cluster fix returns non-zero exit code, output below:\n$result" + } + # Note: valkey-cli --cluster fix may return a non-zero exit code if nodes don't agree, + # but we can ignore that and rely on the check below. + wait_for_cluster_state ok + wait_for_condition 100 100 { + [catch {exec src/valkey-cli {*}[valkeycli_tls_config "./tests"] --cluster check $addr} result] == 0 + } else { + puts "valkey-cli --cluster check returns non-zero exit code, output below:\n$result" + fail "Cluster could not settle with configuration" + } +} + # Check if cluster configuration is consistent. # All the nodes in the cluster should show same slots configuration and have health # state "online" to be considered as consistent. @@ -59,7 +145,7 @@ proc wait_for_cluster_size {cluster_size} { # Check that cluster nodes agree about "state", or raise an error. 
proc wait_for_cluster_state {state} { for {set j 0} {$j < [llength $::servers]} {incr j} { - wait_for_condition 100 50 { + wait_for_condition 1000 50 { [CI $j cluster_state] eq $state } else { fail "Cluster node $j cluster_state:[CI $j cluster_state]" @@ -69,7 +155,7 @@ proc wait_for_cluster_state {state} { # Default slot allocation for clusters, each master has a continuous block # and approximately equal number of slots. -proc continuous_slot_allocation {masters} { +proc continuous_slot_allocation {masters replicas} { set avg [expr double(16384) / $masters] set slot_start 0 for {set j 0} {$j < $masters} {incr j} { @@ -79,9 +165,47 @@ proc continuous_slot_allocation {masters} { } } +# Assuming nodes are reset, this function performs slots allocation. +# Only the first 'masters' nodes are used. +proc cluster_allocate_slots {masters replicas} { + set slot 16383 + while {$slot >= 0} { + # Allocate successive slots to random nodes. + set node [randomInt $masters] + lappend slots_$node $slot + incr slot -1 + } + for {set j 0} {$j < $masters} {incr j} { + R $j cluster addslots {*}[set slots_${j}] + } +} + +proc default_replica_allocation {masters replicas} { + # Setup master/replica relationships + set node_count [expr $masters + $replicas] + for {set i 0} {$i < $masters} {incr i} { + set nodeid [R $i CLUSTER MYID] + for {set j [expr $i + $masters]} {$j < $node_count} {incr j $masters} { + R $j CLUSTER REPLICATE $nodeid + } + } +} + +# Add 'replicas' replicas to a cluster composed of 'masters' masters. +# It assumes that masters are allocated sequentially from instance ID 0 +# to N-1. +proc cluster_allocate_replicas {masters replicas} { + for {set j 0} {$j < $replicas} {incr j} { + set master_id [expr {$j % $masters}] + set replica_id [cluster_find_available_replica $masters] + set master_myself [cluster_get_myself $master_id] + R $replica_id cluster replicate [dict get $master_myself id] + } +} + # Setup method to be executed to configure the cluster before the # tests run. -proc cluster_setup {masters node_count slot_allocator code} { +proc cluster_setup {masters replicas node_count slot_allocator replica_allocator code} { # Have all nodes meet if {$::tls} { set tls_cluster [lindex [R 0 CONFIG GET tls-cluster] 1] @@ -96,17 +220,12 @@ proc cluster_setup {masters node_count slot_allocator code} { } } - $slot_allocator $masters + $slot_allocator $masters $replicas wait_for_cluster_propagation # Setup master/replica relationships - for {set i 0} {$i < $masters} {incr i} { - set nodeid [R $i CLUSTER MYID] - for {set j [expr $i + $masters]} {$j < $node_count} {incr j $masters} { - R $j CLUSTER REPLICATE $nodeid - } - } + $replica_allocator $masters $replicas wait_for_cluster_propagation wait_for_cluster_state "ok" @@ -116,11 +235,11 @@ proc cluster_setup {masters node_count slot_allocator code} { # Start a cluster with the given number of masters and replicas. Replicas # will be allocated to masters by round robin. -proc start_cluster {masters replicas options code {slot_allocator continuous_slot_allocation}} { +proc start_cluster {masters replicas options code {slot_allocator continuous_slot_allocation} {replica_allocator default_replica_allocation}} { set node_count [expr $masters + $replicas] # Set the final code to be the tests + cluster setup - set code [list cluster_setup $masters $node_count $slot_allocator $code] + set code [list cluster_setup $masters $replicas $node_count $slot_allocator $replica_allocator $code] # Configure the starting of multiple servers. 
Set cluster node timeout # aggressively since many tests depend on ping/pong messages. @@ -149,8 +268,19 @@ proc cluster_get_myself id { return {} } -# Returns a parsed CLUSTER NODES output as a list of dictionaries. -proc get_cluster_nodes id { +# Get a specific node by ID by parsing the CLUSTER NODES output +# of the instance Number 'instance_id' +proc cluster_get_node_by_id {instance_id node_id} { + set nodes [get_cluster_nodes $instance_id] + foreach n $nodes { + if {[dict get $n id] eq $node_id} {return $n} + } + return {} +} + +# Returns a parsed CLUSTER NODES output as a list of dictionaries. Optional status field +# can be specified to only return entries that match the provided status. +proc get_cluster_nodes {id {status "*"}} { set lines [split [R $id cluster nodes] "\r\n"] set nodes {} foreach l $lines { @@ -168,7 +298,9 @@ linkstate [lindex $args 7] \ slots [lrange $args 8 end] \ ] - lappend nodes $node + if {[string match $status [lindex $args 7]]} { + lappend nodes $node + } } return $nodes } diff --git a/tests/support/util.tcl b/tests/support/util.tcl index d5af536b52..9d69e44232 100644 --- a/tests/support/util.tcl +++ b/tests/support/util.tcl @@ -653,6 +653,11 @@ proc process_is_alive pid { } } +# Return true if the specified process is paused by pause_process. +proc process_is_paused pid { + return [string match {*T*} [lindex [exec ps j $pid] 16]] +} + proc pause_process pid { exec kill -SIGSTOP $pid wait_for_condition 50 100 { diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 4df9110552..57fb2beb13 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -221,6 +221,16 @@ proc valkey_client {args} { return $client } +proc valkey_deferring_client_by_addr {host port} { + set client [valkey $host $port 1 $::tls] + return $client +} + +proc valkey_client_by_addr {host port} { + set client [valkey $host $port 0 $::tls] + return $client +} + # Provide easy access to INFO properties. Same semantic as "proc r". proc s {args} { set level 0 diff --git a/tests/unit/cluster/base.tcl b/tests/unit/cluster/base.tcl new file mode 100644 index 0000000000..688fd0f275 --- /dev/null +++ b/tests/unit/cluster/base.tcl @@ -0,0 +1,149 @@ +# Check the basic monitoring and failover capabilities. + +# make sure the test infra won't use SELECT +set old_singledb $::singledb +set ::singledb 1 + +tags {tls:skip external:skip cluster} { +
+set base_conf [list cluster-enabled yes] +start_multiple_servers 5 [list overrides $base_conf] { +
+test "Cluster nodes are reachable" { + for {set id 0} {$id < [llength $::servers]} {incr id} { + # Every node should be reachable. + wait_for_condition 1000 50 { + ([catch {R $id ping} ping_reply] == 0) && + ($ping_reply eq {PONG}) + } else { + catch {R $id ping} err + fail "Node #$id keeps replying '$err' to PING." + } + } +} + +test "Cluster nodes hard reset" { + for {set id 0} {$id < [llength $::servers]} {incr id} { + if {$::valgrind} { + set node_timeout 10000 + } else { + set node_timeout 3000 + } + catch {R $id flushall} ; # May fail for readonly slaves.
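+        # Reset the node and give it a distinct config epoch ($id+1) inside one
+        # MULTI/EXEC block, so both steps are applied atomically.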
+ R $id MULTI + R $id cluster reset hard + R $id cluster set-config-epoch [expr {$id+1}] + R $id EXEC + R $id config set cluster-node-timeout $node_timeout + R $id config set cluster-slave-validity-factor 10 + R $id config set loading-process-events-interval-bytes 2097152 + R $id config set key-load-delay 0 + R $id config set repl-diskless-load disabled + R $id config set cluster-announce-hostname "" + R $id DEBUG DROP-CLUSTER-PACKET-FILTER -1 + R $id config rewrite + } +} + +test "Cluster Join and auto-discovery test" { + # Use multiple attempts since sometimes nodes timeout + # while attempting to connect. + for {set attempts 3} {$attempts > 0} {incr attempts -1} { + if {[join_nodes_in_cluster] == 1} { + break + } + } + if {$attempts == 0} { + fail "Cluster failed to form full mesh" + } +} + +test "Before slots allocation, all nodes report cluster failure" { + wait_for_cluster_state fail +} + +test "Different nodes have different IDs" { + set ids {} + set numnodes 0 + for {set id 0} {$id < [llength $::servers]} {incr id} { + incr numnodes + # Every node should just know itself. + set nodeid [dict get [cluster_get_myself $id] id] + assert {$nodeid ne {}} + lappend ids $nodeid + } + set numids [llength [lsort -unique $ids]] + assert {$numids == $numnodes} +} + +test "It is possible to perform slot allocation" { + cluster_allocate_slots 5 0 +} + +test "After the join, every node gets a different config epoch" { + set trynum 60 + while {[incr trynum -1] != 0} { + # We check that this condition is true for *all* the nodes. + set ok 1 ; # Will be set to 0 every time a node is not ok. + for {set id 0} {$id < [llength $::servers]} {incr id} { + set epochs {} + foreach n [get_cluster_nodes $id] { + lappend epochs [dict get $n config_epoch] + } + if {[lsort $epochs] != [lsort -unique $epochs]} { + set ok 0 ; # At least one collision! + } + } + if {$ok} break + after 1000 + puts -nonewline . + flush stdout + } + if {$trynum == 0} { + fail "Config epoch conflict resolution is not working." 
+ } +} + +test "Nodes should report cluster_state is ok now" { + wait_for_cluster_state ok +} + +test "Sanity for CLUSTER COUNTKEYSINSLOT" { + set reply [R 0 CLUSTER COUNTKEYSINSLOT 0] + assert {$reply eq 0} +} + +test "It is possible to write and read from the cluster" { + cluster_write_test [srv 0 port] +} + +test "CLUSTER RESET SOFT test" { + set last_epoch_node0 [CI 0 cluster_current_epoch] + R 0 FLUSHALL + R 0 CLUSTER RESET + assert {[CI 0 cluster_current_epoch] eq $last_epoch_node0} + + set last_epoch_node1 [CI 1 cluster_current_epoch] + R 1 FLUSHALL + R 1 CLUSTER RESET SOFT + assert {[CI 1 cluster_current_epoch] eq $last_epoch_node1} +} + +test "Coverage: CLUSTER HELP" { + assert_match "*CLUSTER *" [R 0 CLUSTER HELP] +} + +test "Coverage: ASKING" { + assert_equal {OK} [R 0 ASKING] +} + +test "CLUSTER SLAVES and CLUSTER REPLICAS with zero replicas" { + assert_equal {} [R 0 cluster slaves [R 0 CLUSTER MYID]] + assert_equal {} [R 0 cluster replicas [R 0 CLUSTER MYID]] +} + +} ;# stop servers + +} ;# tags + +set ::singledb $old_singledb diff --git a/tests/cluster/tests/19-cluster-nodes-slots.tcl b/tests/unit/cluster/cluster-nodes-slots.tcl similarity index 89% rename from tests/cluster/tests/19-cluster-nodes-slots.tcl rename to tests/unit/cluster/cluster-nodes-slots.tcl index 77faec9128..e584ed1e0b 100644 --- a/tests/cluster/tests/19-cluster-nodes-slots.tcl +++ b/tests/unit/cluster/cluster-nodes-slots.tcl @@ -1,17 +1,13 @@ # Optimize CLUSTER NODES command by generating all nodes slot topology firstly -source "../tests/includes/init-tests.tcl" - -test "Create a 2 nodes cluster" { - cluster_create_with_continuous_slots 2 2 -} +start_cluster 2 2 {tags {external:skip cluster}} { test "Cluster should start ok" { - assert_cluster_state ok + wait_for_cluster_state ok } -set master1 [Rn 0] -set master2 [Rn 1] +set master1 [srv 0 "client"] +set master2 [srv -1 "client"] test "Continuous slots distribution" { assert_match "* 0-8191*" [$master1 CLUSTER NODES] @@ -23,7 +19,6 @@ test "Continuous slots distribution" { assert_match "* 0-4095 4097-8191*" [$master1 CLUSTER NODES] assert_match "*0 4095*4097 8191*" [$master1 CLUSTER SLOTS] - $master2 CLUSTER DELSLOTS 12288 assert_match "* 8192-12287 12289-16383*" [$master2 CLUSTER NODES] assert_match "*8192 12287*12289 16383*" [$master2 CLUSTER SLOTS] @@ -48,3 +43,5 @@ test "Discontinuous slots distribution" { assert_match "* 8192-12283 12285 12287 12289-16379 16381*" [$master2 CLUSTER NODES] assert_match "*8192 12283*12285 12285*12287 12287*12289 16379*16381 16381*" [$master2 CLUSTER SLOTS] } + +} ;# start_cluster diff --git a/tests/cluster/tests/15-cluster-slots.tcl b/tests/unit/cluster/cluster-slots.tcl similarity index 75% rename from tests/cluster/tests/15-cluster-slots.tcl rename to tests/unit/cluster/cluster-slots.tcl index 927c1ff0dd..2bbb594392 100644 --- a/tests/cluster/tests/15-cluster-slots.tcl +++ b/tests/unit/cluster/cluster-slots.tcl @@ -1,39 +1,27 @@ -source "../tests/includes/init-tests.tcl" - -proc cluster_allocate_mixedSlots {n} { +proc cluster_allocate_mixedSlots {masters replicas} { set slot 16383 while {$slot >= 0} { - set node [expr {$slot % $n}] + set node [expr {$slot % $masters}] lappend slots_$node $slot incr slot -1 } - for {set j 0} {$j < $n} {incr j} { + for {set j 0} {$j < $masters} {incr j} { R $j cluster addslots {*}[set slots_${j}] } } -proc create_cluster_with_mixedSlot {masters slaves} { - cluster_allocate_mixedSlots $masters - if {$slaves} { - cluster_allocate_slaves $masters $slaves - } - assert_cluster_state ok -} - 
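+# Note: start_cluster also accepts optional slot/replica allocator callbacks,
+# e.g. "start_cluster 5 10 {tags ...} {body} my_slot_alloc my_replica_alloc"
+# (allocator names here are placeholders); the closing line of this file passes
+# cluster_allocate_mixedSlots and cluster_allocate_replicas to keep the
+# mixed-slot layout.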
-test "Create a 5 nodes cluster" { - create_cluster_with_mixedSlot 5 15 -} +start_cluster 5 10 {tags {external:skip cluster}} { test "Cluster is up" { - assert_cluster_state ok + wait_for_cluster_state ok } test "Cluster is writable" { - cluster_write_test 0 + cluster_write_test [srv 0 port] } test "Instance #5 is a slave" { - assert {[RI 5 role] eq {slave}} + assert {[s -5 role] eq {slave}} } test "client do not break when cluster slot" { @@ -44,13 +32,13 @@ test "client do not break when cluster slot" { } test "client can handle keys with hash tag" { - set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]] + set cluster [valkey_cluster 127.0.0.1:[srv 0 port]] $cluster set foo{tag} bar $cluster close } test "slot migration is valid from primary to another primary" { - set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]] + set cluster [valkey_cluster 127.0.0.1:[srv 0 port]] set key order1 set slot [$cluster cluster keyslot $key] array set nodefrom [$cluster masternode_for_slot $slot] @@ -61,17 +49,15 @@ test "slot migration is valid from primary to another primary" { } test "slot migration is invalid from primary to replica" { - set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]] + set cluster [valkey_cluster 127.0.0.1:[srv 0 port]] set key order1 set slot [$cluster cluster keyslot $key] array set nodefrom [$cluster masternode_for_slot $slot] # Get replica node serving slot. set replicanodeinfo [$cluster cluster replicas $nodefrom(id)] - puts $replicanodeinfo set args [split $replicanodeinfo " "] set replicaid [lindex [split [lindex $args 0] \{] 1] - puts $replicaid catch {[$nodefrom(link) cluster setslot $slot node $replicaid]} err assert_match "*Target node is not a master" $err @@ -117,12 +103,14 @@ proc count_bound_slots {n} { if {$::tls} { test {CLUSTER SLOTS from non-TLS client in TLS cluster} { set slots_tls [R 0 cluster slots] - set host [get_instance_attrib valkey 0 host] - set plaintext_port [get_instance_attrib valkey 0 plaintext-port] - set client_plain [valkey $host $plaintext_port 0 0] + set host [srv 0 host] + set plaintext_port [srv 0 pport] + set client_plain [redis $host $plaintext_port 0 0] set slots_plain [$client_plain cluster slots] $client_plain close # Compare the ports in the first row assert_no_match [lindex $slots_tls 0 3 1] [lindex $slots_plain 0 3 1] } -} \ No newline at end of file +} + +} cluster_allocate_mixedSlots cluster_allocate_replicas ;# start_cluster diff --git a/tests/cluster/tests/14-consistency-check.tcl b/tests/unit/cluster/consistency-check.tcl similarity index 80% rename from tests/cluster/tests/14-consistency-check.tcl rename to tests/unit/cluster/consistency-check.tcl index bc7c893214..e84ab55baa 100644 --- a/tests/cluster/tests/14-consistency-check.tcl +++ b/tests/unit/cluster/consistency-check.tcl @@ -1,22 +1,18 @@ -source "../tests/includes/init-tests.tcl" -source "../../../tests/support/cli.tcl" - -test "Create a 5 nodes cluster" { - create_cluster 5 5 -} +start_cluster 5 5 {tags {external:skip cluster}} { test "Cluster should start ok" { - assert_cluster_state ok + wait_for_cluster_state ok } test "Cluster is writable" { - cluster_write_test 0 + cluster_write_test [srv 0 port] } proc find_non_empty_master {} { set master_id_no {} - foreach_valkey_id id { - if {[RI $id role] eq {master} && [R $id dbsize] > 0} { + + for {set id 0} {$id < [llength $::servers]} {incr id} { + if {[s -$id role] eq {master} && [R $id dbsize] > 0} { set master_id_no $id break } @@ -31,13 +27,13 @@ 
proc get_one_of_my_replica {id} { fail "replicas didn't connect" } set replica_port [lindex [lindex [lindex [R $id role] 2] 0] 1] - set replica_id_num [get_instance_id_by_port valkey $replica_port] + set replica_id_num [get_instance_id_by_port valkey $replica_port] return $replica_id_num } proc cluster_write_keys_with_expire {id ttl} { set prefix [randstring 20 20 alpha] - set port [get_instance_attrib valkey $id port] + set port [srv -$id port] set cluster [valkey_cluster 127.0.0.1:$port] for {set j 100} {$j < 200} {incr j} { $cluster setex key_expire.$j $ttl $prefix.$j @@ -80,11 +76,10 @@ proc test_slave_load_expired_keys {aof} { # make replica create persistence file if {$aof == "yes"} { - # we need to wait for the initial AOFRW to be done, otherwise - # kill_instance (which now uses SIGTERM will fail ("Writing initial AOF, can't exit") + # we need to wait for the initial AOFRW to be done wait_for_condition 100 10 { - [RI $replica_id aof_rewrite_scheduled] eq 0 && - [RI $replica_id aof_rewrite_in_progress] eq 0 + [s -$replica_id aof_rewrite_scheduled] eq 0 && + [s -$replica_id aof_rewrite_in_progress] eq 0 } else { fail "AOFRW didn't finish" } @@ -93,7 +88,8 @@ } # kill the replica (would stay down until re-started) - kill_instance valkey $replica_id + set paused_pid [srv -$replica_id pid] + pause_process $paused_pid # Make sure the master doesn't do active expire (sending DELs to the replica) R $master_id DEBUG SET-ACTIVE-EXPIRE 0 @@ -102,7 +98,7 @@ after [expr $data_ttl*1000] # start the replica again (loading an RDB or AOF file) - restart_instance valkey $replica_id + resume_process $paused_pid # make sure the keys are still there set replica_dbsize_3 [R $replica_id dbsize] @@ -122,3 +118,5 @@ test_slave_load_expired_keys no test_slave_load_expired_keys yes + +} ;# start_cluster diff --git a/tests/cluster/tests/17-diskless-load-swapdb.tcl b/tests/unit/cluster/diskless-load-swapdb.tcl similarity index 80% rename from tests/cluster/tests/17-diskless-load-swapdb.tcl rename to tests/unit/cluster/diskless-load-swapdb.tcl index e7b69d71b8..68c2135493 100644 --- a/tests/cluster/tests/17-diskless-load-swapdb.tcl +++ b/tests/unit/cluster/diskless-load-swapdb.tcl @@ -1,24 +1,20 @@ # Check that replica keys and keys to slots map are right after failing to diskless load using SWAPDB.
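+# With repl-diskless-load set to swapdb, the replica loads the incoming RDB
+# into a temporary database and swaps it in only when the load succeeds, so a
+# failed load is expected to leave the original keyspace and slot map intact.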
-source "../tests/includes/init-tests.tcl" - -test "Create a primary with a replica" { - create_cluster 1 1 -} +start_cluster 1 1 {tags {external:skip cluster}} { test "Cluster should start ok" { - assert_cluster_state ok + wait_for_cluster_state ok } test "Cluster is writable" { - cluster_write_test 0 + cluster_write_test [srv 0 port] } test "Main db not affected when fail to diskless load" { - set master [Rn 0] - set replica [Rn 1] + set master [srv 0 "client"] + set replica [srv -1 "client"] set master_id 0 - set replica_id 1 + set replica_id -1 $replica READONLY $replica config set repl-diskless-load swapdb @@ -42,7 +38,7 @@ test "Main db not affected when fail to diskless load" { # Save an RDB and kill the replica $replica save - kill_instance valkey $replica_id + pause_process [srv $replica_id pid] # Delete the key from master $master del $slot0_key @@ -60,7 +56,9 @@ test "Main db not affected when fail to diskless load" { } # Start the replica again - restart_instance valkey $replica_id + resume_process [srv $replica_id pid] + restart_server $replica_id true false + set replica [srv -1 "client"] $replica READONLY # Start full sync, wait till after db started loading in background @@ -71,16 +69,20 @@ test "Main db not affected when fail to diskless load" { } # Kill master, abort full sync - kill_instance valkey $master_id + pause_process [srv $master_id pid] # Start full sync, wait till the replica detects the disconnection wait_for_condition 500 10 { [s $replica_id async_loading] eq 0 } else { - fail "Fail to full sync" + fail "Fail to stop the full sync" } # Replica keys and keys to slots map still both are right assert_equal {1} [$replica get $slot0_key] assert_equal $slot0_key [$replica CLUSTER GETKEYSINSLOT 0 1] + + resume_process [srv $master_id pid] } + +} ;# start_cluster diff --git a/tests/unit/cluster/faildet.tcl b/tests/unit/cluster/faildet.tcl new file mode 100644 index 0000000000..1a0b888392 --- /dev/null +++ b/tests/unit/cluster/faildet.tcl @@ -0,0 +1,64 @@ +# Check the basic monitoring and failover capabilities. + +start_cluster 5 5 {tags {external:skip cluster}} { + +test "Cluster should start ok" { + wait_for_cluster_state ok +} + +set paused_pid5 [srv -5 pid] +set paused_pid6 [srv -6 pid] +test "Killing two slave nodes" { + pause_process $paused_pid5 + pause_process $paused_pid6 +} + +test "Cluster should be still up" { + for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[process_is_paused $paused_pid5]} continue + if {[process_is_paused $paused_pid6]} continue + wait_for_condition 1000 50 { + [CI $j cluster_state] eq "ok" + } else { + fail "Cluster node $j cluster_state:[CI $j cluster_state]" + } + } +} + +set paused_pid [srv -5 pid] +test "Killing one master node" { + pause_process $paused_pid +} + +# Note: the only slave of instance 0 is already down so no +# failover is possible, that would change the state back to ok. 
+test "Cluster should be down now" { + for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[process_is_paused $paused_pid]} continue + if {[process_is_paused $paused_pid5]} continue + if {[process_is_paused $paused_pid6]} continue + wait_for_condition 1000 50 { + [CI $j cluster_state] eq "ok" + } else { + fail "Cluster node $j cluster_state:[CI $j cluster_state]" + } + } +} + +test "Restarting master node" { + pause_process $paused_pid +} + +test "Cluster should be up again" { + for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[process_is_paused $paused_pid5]} continue + if {[process_is_paused $paused_pid6]} continue + wait_for_condition 1000 50 { + [CI $j cluster_state] eq "ok" + } else { + fail "Cluster node $j cluster_state:[CI $j cluster_state]" + } + } +} + +} ;# start_cluster diff --git a/tests/cluster/tests/02-failover.tcl b/tests/unit/cluster/failover.tcl similarity index 56% rename from tests/cluster/tests/02-failover.tcl rename to tests/unit/cluster/failover.tcl index f5b83a6665..b2c68db3d2 100644 --- a/tests/cluster/tests/02-failover.tcl +++ b/tests/unit/cluster/failover.tcl @@ -1,26 +1,22 @@ # Check the basic monitoring and failover capabilities. -source "../tests/includes/init-tests.tcl" - -test "Create a 5 nodes cluster" { - create_cluster 5 5 -} +start_cluster 5 5 {tags {external:skip cluster}} { test "Cluster is up" { - assert_cluster_state ok + wait_for_cluster_state ok } test "Cluster is writable" { - cluster_write_test 0 + cluster_write_test [srv 0 port] } test "Instance #5 is a slave" { - assert {[RI 5 role] eq {slave}} + assert {[s -5 role] eq {slave}} } test "Instance #5 synced with the master" { wait_for_condition 1000 50 { - [RI 5 master_link_status] eq {up} + [s -5 master_link_status] eq {up} } else { fail "Instance #5 master link status is not up" } @@ -28,8 +24,9 @@ test "Instance #5 synced with the master" { set current_epoch [CI 1 cluster_current_epoch] +set paused_pid [srv 0 pid] test "Killing one master node" { - kill_instance valkey 0 + pause_process $paused_pid } test "Wait for failover" { @@ -41,25 +38,34 @@ test "Wait for failover" { } test "Cluster should eventually be up again" { - assert_cluster_state ok + for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[process_is_paused $paused_pid]} continue + wait_for_condition 1000 50 { + [CI $j cluster_state] eq "ok" + } else { + fail "Cluster node $j cluster_state:[CI $j cluster_state]" + } + } } test "Cluster is writable" { - cluster_write_test 1 + cluster_write_test [srv -1 port] } test "Instance #5 is now a master" { - assert {[RI 5 role] eq {master}} + assert {[s -5 role] eq {master}} } test "Restarting the previously killed master node" { - restart_instance valkey 0 + resume_process $paused_pid } test "Instance #0 gets converted into a slave" { wait_for_condition 1000 50 { - [RI 0 role] eq {slave} + [s 0 role] eq {slave} } else { fail "Old master was not converted into slave" } } + +} ;# start_cluster diff --git a/tests/cluster/tests/20-half-migrated-slot.tcl b/tests/unit/cluster/half-migrated-slot.tcl similarity index 90% rename from tests/cluster/tests/20-half-migrated-slot.tcl rename to tests/unit/cluster/half-migrated-slot.tcl index 8049ca1be4..5629322582 100644 --- a/tests/cluster/tests/20-half-migrated-slot.tcl +++ b/tests/unit/cluster/half-migrated-slot.tcl @@ -7,21 +7,19 @@ # TODO: Test is currently disabled until it is stabilized (fixing the test # itself or real issues in the server). 
+source tests/support/cluster_util.tcl if {false} { -source "../tests/includes/init-tests.tcl" -source "../tests/includes/utils.tcl" -test "Create a 2 nodes cluster" { - create_cluster 2 0 - config_set_all_nodes cluster-allow-replica-migration no -} +start_cluster 2 0 {tags {external:skip cluster}} { + +config_set_all_nodes cluster-allow-replica-migration no test "Cluster is up" { - assert_cluster_state ok + wait_for_cluster_state ok } -set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]] +set cluster [valkey_cluster 127.0.0.1:[srv 0 port]] catch {unset nodefrom} catch {unset nodeto} @@ -95,4 +93,8 @@ test "Half-finish importing" { } config_set_all_nodes cluster-allow-replica-migration yes + +} ;# start_cluster + } + diff --git a/tests/cluster/tests/18-info.tcl b/tests/unit/cluster/info.tcl similarity index 85% rename from tests/cluster/tests/18-info.tcl rename to tests/unit/cluster/info.tcl index 68c62d3576..0d7b249899 100644 --- a/tests/cluster/tests/18-info.tcl +++ b/tests/unit/cluster/info.tcl @@ -1,17 +1,13 @@ # Check cluster info stats -source "../tests/includes/init-tests.tcl" - -test "Create a primary with a replica" { - create_cluster 2 0 -} +start_cluster 2 0 {tags {external:skip cluster}} { test "Cluster should start ok" { - assert_cluster_state ok + wait_for_cluster_state ok } -set primary1 [Rn 0] -set primary2 [Rn 1] +set primary1 [srv 0 "client"] +set primary2 [srv -1 "client"] proc cmdstat {instance cmd} { return [cmdrstat $cmd $instance] @@ -43,3 +39,5 @@ test "errorstats: rejected call due to MOVED Redirection" { assert_match {*count=1*} [errorstat $perr MOVED] assert_match {*calls=0,*,rejected_calls=1,failed_calls=0} [cmdstat $perr set] } + +} ;# start_cluster diff --git a/tests/cluster/tests/10-manual-failover.tcl b/tests/unit/cluster/manual-failover.tcl similarity index 76% rename from tests/cluster/tests/10-manual-failover.tcl rename to tests/unit/cluster/manual-failover.tcl index 4af4148cee..2d0a8921cb 100644 --- a/tests/cluster/tests/10-manual-failover.tcl +++ b/tests/unit/cluster/manual-failover.tcl @@ -1,26 +1,21 @@ # Check the manual failover - -source "../tests/includes/init-tests.tcl" - -test "Create a 5 nodes cluster" { - create_cluster 5 5 -} +start_cluster 5 5 {tags {external:skip cluster}} { test "Cluster is up" { - assert_cluster_state ok + wait_for_cluster_state ok } test "Cluster is writable" { - cluster_write_test 0 + cluster_write_test [srv 0 port] } test "Instance #5 is a slave" { - assert {[RI 5 role] eq {slave}} + assert {[s -5 role] eq {slave}} } test "Instance #5 synced with the master" { wait_for_condition 1000 50 { - [RI 5 master_link_status] eq {up} + [s -5 master_link_status] eq {up} } else { fail "Instance #5 master link status is not up" } @@ -30,7 +25,7 @@ set current_epoch [CI 1 cluster_current_epoch] set numkeys 50000 set numops 10000 -set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]] +set cluster [valkey_cluster 127.0.0.1:[srv 0 port]] catch {unset content} array set content {} @@ -47,7 +42,7 @@ test "Send CLUSTER FAILOVER to #5, during load" { if {$listid % 2} { $cluster rpush $key $ele } else { - $cluster eval {redis.call("rpush",KEYS[1],ARGV[1])} 1 $key $ele + $cluster eval {server.call("rpush",KEYS[1],ARGV[1])} 1 $key $ele } lappend content($key) $ele @@ -68,15 +63,15 @@ test "Wait for failover" { } test "Cluster should eventually be up again" { - assert_cluster_state ok + wait_for_cluster_state ok } test "Cluster is writable" { - cluster_write_test 1 + cluster_write_test [srv -1 port] } 
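+# Note on addressing: in the migrated framework instance #N is reached via
+# srv level -N, so [s -5 role] and [R 5 ...] both talk to instance #5, while
+# instance #0 is plain [srv 0 ...].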
test "Instance #5 is now a master" { - assert {[RI 5 role] eq {master}} + assert {[s -5 role] eq {master}} } test "Verify $numkeys keys for consistency with logical content" { @@ -88,35 +83,32 @@ test "Verify $numkeys keys for consistency with logical content" { test "Instance #0 gets converted into a slave" { wait_for_condition 1000 50 { - [RI 0 role] eq {slave} + [s 0 role] eq {slave} } else { fail "Old master was not converted into slave" } } -## Check that manual failover does not happen if we can't talk with the master. - -source "../tests/includes/init-tests.tcl" +} ;# start_cluster -test "Create a 5 nodes cluster" { - create_cluster 5 5 -} +## Check that manual failover does not happen if we can't talk with the master. +start_cluster 5 5 {tags {external:skip cluster}} { test "Cluster is up" { - assert_cluster_state ok + wait_for_cluster_state ok } test "Cluster is writable" { - cluster_write_test 0 + cluster_write_test [srv 0 port] } test "Instance #5 is a slave" { - assert {[RI 5 role] eq {slave}} + assert {[s -5 role] eq {slave}} } test "Instance #5 synced with the master" { wait_for_condition 1000 50 { - [RI 5 master_link_status] eq {up} + [s -5 master_link_status] eq {up} } else { fail "Instance #5 master link status is not up" } @@ -133,7 +125,7 @@ test "Send CLUSTER FAILOVER to instance #5" { test "Instance #5 is still a slave after some time (no failover)" { after 5000 - assert {[RI 5 role] eq {master}} + assert {[s -5 role] eq {master}} } test "Wait for instance #0 to return back alive" { @@ -141,29 +133,26 @@ test "Wait for instance #0 to return back alive" { assert {[R 0 read] eq {OK}} } -## Check with "force" failover happens anyway. - -source "../tests/includes/init-tests.tcl" +} ;# start_cluster -test "Create a 5 nodes cluster" { - create_cluster 5 5 -} +## Check with "force" failover happens anyway. +start_cluster 5 10 {tags {external:skip cluster}} { test "Cluster is up" { - assert_cluster_state ok + wait_for_cluster_state ok } test "Cluster is writable" { - cluster_write_test 0 + cluster_write_test [srv 0 port] } test "Instance #5 is a slave" { - assert {[RI 5 role] eq {slave}} + assert {[s -5 role] eq {slave}} } test "Instance #5 synced with the master" { wait_for_condition 1000 50 { - [RI 5 master_link_status] eq {up} + [s -5 master_link_status] eq {up} } else { fail "Instance #5 master link status is not up" } @@ -180,7 +169,7 @@ test "Send CLUSTER FAILOVER to instance #5" { test "Instance #5 is a master after some time" { wait_for_condition 1000 50 { - [RI 5 role] eq {master} + [s -5 role] eq {master} } else { fail "Instance #5 is not a master after some time regardless of FORCE" } @@ -190,3 +179,5 @@ test "Wait for instance #0 to return back alive" { R 0 deferred 0 assert {[R 0 read] eq {OK}} } + +} ;# start_cluster diff --git a/tests/unit/cluster/manual-takeover.tcl b/tests/unit/cluster/manual-takeover.tcl new file mode 100644 index 0000000000..8a4509b397 --- /dev/null +++ b/tests/unit/cluster/manual-takeover.tcl @@ -0,0 +1,90 @@ +# Manual takeover test + +start_cluster 5 5 {tags {external:skip cluster}} { + +test "Cluster is up" { + wait_for_cluster_state ok +} + +test "Cluster is writable" { + cluster_write_test [srv -1 port] +} + +# For this test, disable replica failover until +# all of the primaries are confirmed killed. Otherwise +# there might be enough time to elect a replica. 
+set replica_ids { 5 6 7 } +foreach id $replica_ids { + R $id config set cluster-replica-no-failover yes +} + +set paused_pid [srv 0 pid] +set paused_pid1 [srv -1 pid] +set paused_pid2 [srv -2 pid] +test "Killing majority of master nodes" { + pause_process $paused_pid + pause_process $paused_pid1 + pause_process $paused_pid2 +} + +foreach id $replica_ids { + R $id config set cluster-replica-no-failover no +} + +test "Cluster should eventually be down" { + for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[process_is_paused [srv [expr -1*$j] pid]]} continue + wait_for_condition 1000 50 { + [CI $j cluster_state] eq "fail" + } else { + fail "Cluster node $j cluster_state:[CI $j cluster_state]" + } + } +} + +test "Use takeover to bring slaves back" { + foreach id $replica_ids { + R $id cluster failover takeover + } +} + +test "Cluster should eventually be up again" { + for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[process_is_paused [srv [expr -1*$j] pid]]} continue + wait_for_condition 1000 50 { + [CI $j cluster_state] eq "ok" + } else { + fail "Cluster node $j cluster_state:[CI $j cluster_state]" + } + } +} + +test "Cluster is writable" { + cluster_write_test [srv -4 port] +} + +test "Instance #5, #6, #7 are now masters" { + assert {[s -5 role] eq {master}} + assert {[s -6 role] eq {master}} + assert {[s -7 role] eq {master}} +} + +test "Restarting the previously killed master nodes" { + resume_process $paused_pid + resume_process $paused_pid1 + resume_process $paused_pid2 +} + +test "Instance #0, #1, #2 get converted into slaves" { + wait_for_condition 1000 50 { + [s 0 role] eq {slave} && [s -1 role] eq {slave} && [s -2 role] eq {slave} + } else { + fail "Old masters not converted into slaves" + } +} + +} ;# start_cluster diff --git a/tests/cluster/tests/21-many-slot-migration.tcl b/tests/unit/cluster/many-slot-migration.tcl similarity index 85% rename from tests/cluster/tests/21-many-slot-migration.tcl rename to tests/unit/cluster/many-slot-migration.tcl index 703cf58614..ebdfda9bf2 100644 --- a/tests/cluster/tests/21-many-slot-migration.tcl +++ b/tests/unit/cluster/many-slot-migration.tcl @@ -5,23 +5,18 @@ if {false} { -source "../tests/includes/init-tests.tcl" -source "../tests/includes/utils.tcl" - # TODO: This test currently runs without replicas, as failovers (which may # happen on lower-end CI platforms) are still not handled properly by the # cluster during slot migration (related to #6339).
-test "Create a 10 nodes cluster" { - create_cluster 10 0 +start_cluster 10 0 {tags {external:skip cluster}} { config_set_all_nodes cluster-allow-replica-migration no -} test "Cluster is up" { - assert_cluster_state ok + wait_for_cluster_state ok } -set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]] +set cluster [valkey_cluster 127.0.0.1:[srv 0 port]] catch {unset nodefrom} catch {unset nodeto} @@ -61,4 +56,7 @@ test "Keys are accessible" { } config_set_all_nodes cluster-allow-replica-migration yes + +} ;# start_cluster + } diff --git a/tests/unit/cluster/multi-slot-operations.tcl b/tests/unit/cluster/multi-slot-operations.tcl index cc7bb7ae0f..fe3246a3fa 100644 --- a/tests/unit/cluster/multi-slot-operations.tcl +++ b/tests/unit/cluster/multi-slot-operations.tcl @@ -1,5 +1,5 @@ # This test uses a custom slot allocation for testing -proc cluster_allocate_with_continuous_slots_local {n} { +proc cluster_allocate_with_continuous_slots_local {masters replicas} { R 0 cluster ADDSLOTSRANGE 0 3276 R 1 cluster ADDSLOTSRANGE 3277 6552 R 2 cluster ADDSLOTSRANGE 6553 9828 diff --git a/tests/unit/cluster/no-failover-option.tcl b/tests/unit/cluster/no-failover-option.tcl new file mode 100644 index 0000000000..deaf61dadc --- /dev/null +++ b/tests/unit/cluster/no-failover-option.tcl @@ -0,0 +1,66 @@ +# Check that the no-failover option works + +source tests/support/cluster.tcl + +start_cluster 5 5 {tags {external:skip cluster}} { + +test "Cluster is up" { + wait_for_cluster_state ok +} + +test "Instance #5 is a slave" { + assert {[s -5 role] eq {slave}} + + # Configure it to never failover the master + R 5 CONFIG SET cluster-slave-no-failover yes +} + +test "Instance #5 synced with the master" { + wait_for_condition 1000 50 { + [s -5 master_link_status] eq {up} + } else { + fail "Instance #5 master link status is not up" + } +} + +test "The nofailover flag is propagated" { + set slave5_id [dict get [cluster_get_myself 5] id] + + for {set j 0} {$j < [llength $::servers]} {incr j} { + wait_for_condition 1000 50 { + [cluster_has_flag [cluster_get_node_by_id $j $slave5_id] nofailover] + } else { + fail "Instance $id can't see the nofailover flag of slave" + } + } +} + +test "Killing one master node" { + pause_process [srv 0 pid] +} + +test "Cluster should be still down after some time" { + wait_for_condition 1000 50 { + [CI 1 cluster_state] eq {fail} && + [CI 2 cluster_state] eq {fail} && + [CI 3 cluster_state] eq {fail} && + [CI 4 cluster_state] eq {fail} && + [CI 5 cluster_state] eq {fail} && + [CI 6 cluster_state] eq {fail} && + [CI 7 cluster_state] eq {fail} && + [CI 8 cluster_state] eq {fail} && + [CI 9 cluster_state] eq {fail} + } else { + fail "Cluster doesn't fail" + } +} + +test "Instance #5 is still a slave" { + assert {[s -5 role] eq {slave}} +} + +test "Restarting the previously killed master node" { + resume_process [srv 0 pid] +} + +} ;# start_cluster diff --git a/tests/cluster/tests/09-pubsub.tcl b/tests/unit/cluster/pubsub.tcl similarity index 90% rename from tests/cluster/tests/09-pubsub.tcl rename to tests/unit/cluster/pubsub.tcl index e62b91c4b5..12cb409fd9 100644 --- a/tests/cluster/tests/09-pubsub.tcl +++ b/tests/unit/cluster/pubsub.tcl @@ -1,10 +1,6 @@ # Test PUBLISH propagation across the cluster. -source "../tests/includes/init-tests.tcl" - -test "Create a 5 nodes cluster" { - create_cluster 5 5 -} +start_cluster 5 5 {tags {external:skip cluster}} { proc test_cluster_publish {instance instances} { # Subscribe all the instances but the one we use to send. 
@@ -38,3 +34,5 @@ test "Test publishing to master" { test "Test publishing to slave" { test_cluster_publish 5 10 } + +} ;# start_cluster diff --git a/tests/cluster/tests/25-pubsubshard-slot-migration.tcl b/tests/unit/cluster/pubsubshard-slot-migration.tcl similarity index 96% rename from tests/cluster/tests/25-pubsubshard-slot-migration.tcl rename to tests/unit/cluster/pubsubshard-slot-migration.tcl index 45ec500eb7..c5a324f094 100644 --- a/tests/cluster/tests/25-pubsubshard-slot-migration.tcl +++ b/tests/unit/cluster/pubsubshard-slot-migration.tcl @@ -1,17 +1,15 @@ -source "../tests/includes/init-tests.tcl" +source tests/support/cluster.tcl -test "Create a 3 nodes cluster" { - cluster_create_with_continuous_slots 3 3 -} +start_cluster 3 3 {tags {external:skip cluster}} { test "Cluster is up" { - assert_cluster_state ok + wait_for_cluster_state ok } -set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]] +set cluster [valkey_cluster 127.0.0.1:[srv 0 port]] proc get_addr_replica_serving_slot slot { - set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]] + set cluster [valkey_cluster 127.0.0.1:[srv 0 port]] array set node [$cluster masternode_for_slot $slot] set replicanodeinfo [$cluster cluster replicas $node(id)] @@ -209,4 +207,6 @@ test "Reset cluster, verify sunsubscribe message" { $cluster close $subscribeclient close -} \ No newline at end of file +} + +} ;# start_cluster diff --git a/tests/cluster/tests/26-pubsubshard.tcl b/tests/unit/cluster/pubsubshard.tcl similarity index 95% rename from tests/cluster/tests/26-pubsubshard.tcl rename to tests/unit/cluster/pubsubshard.tcl index fe0e7d39ab..e32b6a3a0e 100644 --- a/tests/cluster/tests/26-pubsubshard.tcl +++ b/tests/unit/cluster/pubsubshard.tcl @@ -1,14 +1,13 @@ # Test PUBSUB shard propagation in a cluster slot. -source "../tests/includes/init-tests.tcl" +source tests/support/cluster.tcl -test "Create a 3 nodes cluster" { - cluster_create_with_continuous_slots 3 3 -} +# Start a cluster with 3 masters and 3 replicas. 
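+# Shard channels hash to a slot the same way keys do, so SPUBLISH messages
+# are only propagated within the shard that serves the channel's slot.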
+start_cluster 3 3 {tags {external:skip cluster}} { -set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]] -test "Pub/Sub shard basics" { +set cluster [valkey_cluster 127.0.0.1:[srv 0 port]] +test "Pub/Sub shard basics" { set slot [$cluster cluster keyslot "channel.0"] array set publishnode [$cluster masternode_for_slot $slot] array set notshardnode [$cluster masternode_notfor_slot $slot] @@ -123,8 +122,11 @@ test "PUBSUB channels/shardchannels" { assert_equal {3} [llength [$publishclient pubsub shardchannels]] sunsubscribe $subscribeclient + $subscribeclient read set channel_list [$publishclient pubsub shardchannels] assert_equal {2} [llength $channel_list] assert {[lsearch -exact $channel_list "\{channel.0\}2"] >= 0} assert {[lsearch -exact $channel_list "\{channel.0\}3"] >= 0} } + +} ;# start_cluster diff --git a/tests/cluster/tests/22-replica-in-sync.tcl b/tests/unit/cluster/replica-in-sync.tcl similarity index 91% rename from tests/cluster/tests/22-replica-in-sync.tcl rename to tests/unit/cluster/replica-in-sync.tcl index b5645aa75f..776c3ca73c 100644 --- a/tests/cluster/tests/22-replica-in-sync.tcl +++ b/tests/unit/cluster/replica-in-sync.tcl @@ -1,15 +1,13 @@ -source "../tests/includes/init-tests.tcl" +source tests/support/cluster.tcl -test "Create a 1 node cluster" { - create_cluster 1 0 -} +start_cluster 1 1 {tags {external:skip cluster}} { test "Cluster is up" { - assert_cluster_state ok + wait_for_cluster_state ok } test "Cluster is writable" { - cluster_write_test 0 + cluster_write_test [srv 0 port] } proc is_in_slots {master_id replica} { @@ -90,17 +88,17 @@ test "Replica in loading state is hidden" { # The master will be the last to know the replica # is loading, so we will wait on that and assert - # the replica is loading afterwards. + # the replica is loading afterwards. wait_for_condition 100 50 { ![is_in_slots $master_id $replica] } else { fail "Replica was always present in cluster slots" } - assert_equal 1 [s $replica_id loading] + assert_equal 1 [s [expr {-1*$replica_id}] loading] # Wait for the replica to finish full-sync and become online wait_for_condition 200 50 { - [s $replica_id master_link_status] eq "up" + [s [expr {-1*$replica_id}] master_link_status] eq "up" } else { fail "Replica didn't finish loading" } @@ -115,7 +113,7 @@ test "Replica in loading state is hidden" { } else { fail "Replica is not back to slots" } - assert_equal 1 [is_in_slots $replica_id $replica] + assert_equal 1 [is_in_slots $replica_id $replica] } test "Check disconnected replica not hidden from slots" { @@ -144,3 +142,5 @@ test "Check disconnected replica not hidden from slots" { # undo config R $master_id config set requirepass "" } + +} ;# start_cluster diff --git a/tests/cluster/tests/05-slave-selection.tcl b/tests/unit/cluster/slave-selection.tcl similarity index 73% rename from tests/cluster/tests/05-slave-selection.tcl rename to tests/unit/cluster/slave-selection.tcl index bb3a06134e..9c047a0d2d 100644 --- a/tests/cluster/tests/05-slave-selection.tcl +++ b/tests/unit/cluster/slave-selection.tcl @@ -1,16 +1,12 @@ # Slave selection test # Check the algorithm trying to pick the slave with the most complete history. -source "../tests/includes/init-tests.tcl" - # Create a cluster with 5 master and 10 slaves, so that we have 2 # slaves for each master. 
-test "Create a 5 nodes cluster" { - create_cluster 5 10 -} +start_cluster 5 10 {tags {external:skip cluster}} { test "Cluster is up" { - assert_cluster_state ok + wait_for_cluster_state ok } test "The first master has actually two slaves" { @@ -34,21 +30,21 @@ test "CLUSTER SLAVES and CLUSTER REPLICAS output is consistent" { } test {Slaves of #0 are instance #5 and #10 as expected} { - set port0 [get_instance_attrib valkey 0 port] + set port0 [srv 0 port] assert {[lindex [R 5 role] 2] == $port0} assert {[lindex [R 10 role] 2] == $port0} } test "Instance #5 and #10 synced with the master" { wait_for_condition 1000 50 { - [RI 5 master_link_status] eq {up} && - [RI 10 master_link_status] eq {up} + [s -5 master_link_status] eq {up} && + [s -10 master_link_status] eq {up} } else { fail "Instance #5 or #10 master link status is not up" } } -set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]] +set cluster [valkey_cluster 127.0.0.1:[srv 0 port]] test "Slaves are both able to receive and acknowledge writes" { for {set j 0} {$j < 100} {incr j} { @@ -57,6 +53,7 @@ test "Slaves are both able to receive and acknowledge writes" { assert {[R 0 wait 2 60000] == 2} } +set paused_pid [srv 0 pid] test "Write data while slave #10 is paused and can't receive it" { # Stop the slave with a multi/exec transaction so that the master will # be killed as soon as it can accept writes again. @@ -80,12 +77,12 @@ test "Write data while slave #10 is paused and can't receive it" { assert {[R 10 read] eq {OK OK}} # Kill the master so that a reconnection will not be possible. - kill_instance valkey 0 + pause_process $paused_pid } test "Wait for instance #5 (and not #10) to turn into a master" { wait_for_condition 1000 50 { - [RI 5 role] eq {master} + [s -5 role] eq {master} } else { fail "No failover detected" } @@ -96,11 +93,18 @@ test "Wait for the node #10 to return alive before ending the test" { } test "Cluster should eventually be up again" { - assert_cluster_state ok + for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[process_is_paused $paused_pid]} continue + wait_for_condition 1000 50 { + [CI $j cluster_state] eq "ok" + } else { + fail "Cluster node $j cluster_state:[CI $j cluster_state]" + } + } } test "Node #10 should eventually replicate node #5" { - set port5 [get_instance_attrib valkey 5 port] + set port5 [srv -5 port] wait_for_condition 1000 50 { ([lindex [R 10 role] 2] == $port5) && ([lindex [R 10 role] 3] eq {connected}) @@ -109,16 +113,14 @@ test "Node #10 should eventually replicate node #5" { } } -source "../tests/includes/init-tests.tcl" +} ;# start_cluster # Create a cluster with 3 master and 15 slaves, so that we have 5 # slaves for eatch master. 
-test "Create a 3 nodes cluster" { - create_cluster 3 15 -} +start_cluster 3 15 {tags {external:skip cluster}} { test "Cluster is up" { - assert_cluster_state ok + wait_for_cluster_state ok } test "The first master has actually 5 slaves" { @@ -130,7 +132,7 @@ test "The first master has actually 5 slaves" { } test {Slaves of #0 are instance #3, #6, #9, #12 and #15 as expected} { - set port0 [get_instance_attrib valkey 0 port] + set port0 [srv 0 port] assert {[lindex [R 3 role] 2] == $port0} assert {[lindex [R 6 role] 2] == $port0} assert {[lindex [R 9 role] 2] == $port0} @@ -140,11 +142,11 @@ test {Slaves of #0 are instance #3, #6, #9, #12 and #15 as expected} { test {Instance #3, #6, #9, #12 and #15 synced with the master} { wait_for_condition 1000 50 { - [RI 3 master_link_status] eq {up} && - [RI 6 master_link_status] eq {up} && - [RI 9 master_link_status] eq {up} && - [RI 12 master_link_status] eq {up} && - [RI 15 master_link_status] eq {up} + [s -3 master_link_status] eq {up} && + [s -6 master_link_status] eq {up} && + [s -9 master_link_status] eq {up} && + [s -12 master_link_status] eq {up} && + [s -15 master_link_status] eq {up} } else { fail "Instance #3 or #6 or #9 or #12 or #15 master link status is not up" } @@ -152,7 +154,7 @@ test {Instance #3, #6, #9, #12 and #15 synced with the master} { proc master_detected {instances} { foreach instance [dict keys $instances] { - if {[RI $instance role] eq {master}} { + if {[s -$instance role] eq {master}} { return true } } @@ -167,7 +169,7 @@ test "New Master down consecutively" { for {set i 0} {$i < $loops} {incr i} { set master_id -1 foreach instance [dict keys $instances] { - if {[RI $instance role] eq {master}} { + if {[s -$instance role] eq {master}} { set master_id $instance break; } @@ -179,13 +181,23 @@ test "New Master down consecutively" { set instances [dict remove $instances $master_id] - kill_instance valkey $master_id + set paused_pid [srv [expr $master_id * -1] pid] + pause_process $paused_pid wait_for_condition 1000 50 { [master_detected $instances] } else { fail "No failover detected when master $master_id fails" } - assert_cluster_state ok + for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[process_is_paused $paused_pid]} continue + wait_for_condition 1000 50 { + [CI $j cluster_state] eq "ok" + } else { + fail "Cluster node $j cluster_state:[CI $j cluster_state]" + } + } } } + +} ;# start_cluster diff --git a/tests/cluster/tests/06-slave-stop-cond.tcl b/tests/unit/cluster/slave-stop-cond.tcl similarity index 76% rename from tests/cluster/tests/06-slave-stop-cond.tcl rename to tests/unit/cluster/slave-stop-cond.tcl index 3813f37365..b97c7b6907 100644 --- a/tests/cluster/tests/06-slave-stop-cond.tcl +++ b/tests/unit/cluster/slave-stop-cond.tcl @@ -2,15 +2,11 @@ # Check that if there is a disconnection time limit, the slave will not try # to failover its master. -source "../tests/includes/init-tests.tcl" - # Create a cluster with 5 master and 5 slaves. 
-test "Create a 5 nodes cluster" { - create_cluster 5 5 -} +start_cluster 5 5 {tags {external:skip cluster}} { test "Cluster is up" { - assert_cluster_state ok + wait_for_cluster_state ok } test "The first master has actually one slave" { @@ -22,13 +18,13 @@ test "The first master has actually one slave" { } test {Slaves of #0 is instance #5 as expected} { - set port0 [get_instance_attrib valkey 0 port] + set port0 [srv 0 port] assert {[lindex [R 5 role] 2] == $port0} } test "Instance #5 synced with the master" { wait_for_condition 1000 50 { - [RI 5 master_link_status] eq {up} + [s -5 master_link_status] eq {up} } else { fail "Instance #5 master link status is not up" } @@ -38,6 +34,7 @@ test "Lower the slave validity factor of #5 to the value of 2" { assert {[R 5 config set cluster-slave-validity-factor 2] eq {OK}} } +set paused_pid [srv 0 pid] test "Break master-slave link and prevent further reconnections" { # Stop the slave with a multi/exec transaction so that the master will # be killed as soon as it can accept writes again. @@ -60,7 +57,7 @@ test "Break master-slave link and prevent further reconnections" { assert {[R 5 read] eq {OK OK}} # Kill the master so that a reconnection will not be possible. - kill_instance valkey 0 + pause_process $paused_pid } test "Slave #5 is reachable and alive" { @@ -69,9 +66,18 @@ test "Slave #5 is reachable and alive" { test "Slave #5 should not be able to failover" { after 10000 - assert {[RI 5 role] eq {slave}} + assert {[s -5 role] eq {slave}} } test "Cluster should be down" { - assert_cluster_state fail + for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[process_is_paused $paused_pid]} continue + wait_for_condition 100 50 { + [CI $j cluster_state] eq "fail" + } else { + fail "Cluster node $j cluster_state:[CI $j cluster_state]" + } + } } + +} ;# start_cluster diff --git a/tests/cluster/tests/29-slot-migration-response.tcl b/tests/unit/cluster/slot-migration-response.tcl similarity index 77% rename from tests/cluster/tests/29-slot-migration-response.tcl rename to tests/unit/cluster/slot-migration-response.tcl index bc76735591..e1db7041c2 100644 --- a/tests/cluster/tests/29-slot-migration-response.tcl +++ b/tests/unit/cluster/slot-migration-response.tcl @@ -1,18 +1,15 @@ # Tests for the response of slot migrations. +source tests/support/cluster.tcl -source "../tests/includes/init-tests.tcl" -source "../tests/includes/utils.tcl" +start_cluster 2 0 {tags {external:skip cluster}} { -test "Create a 2 nodes cluster" { - create_cluster 2 0 - config_set_all_nodes cluster-allow-replica-migration no -} +config_set_all_nodes cluster-allow-replica-migration no test "Cluster is up" { - assert_cluster_state ok + wait_for_cluster_state ok } -set cluster [valkey_cluster 127.0.0.1:[get_instance_attrib valkey 0 port]] +set cluster [valkey_cluster 127.0.0.1:[srv 0 port]] catch {unset nodefrom} catch {unset nodeto} @@ -48,3 +45,5 @@ test "Test cluster responses during migration of slot x" { } config_set_all_nodes cluster-allow-replica-migration yes + +} ;# start_cluster diff --git a/tests/cluster/tests/16-transactions-on-replica.tcl b/tests/unit/cluster/transactions-on-replica.tcl similarity index 86% rename from tests/cluster/tests/16-transactions-on-replica.tcl rename to tests/unit/cluster/transactions-on-replica.tcl index b509892f54..b53af58cac 100644 --- a/tests/cluster/tests/16-transactions-on-replica.tcl +++ b/tests/unit/cluster/transactions-on-replica.tcl @@ -1,17 +1,13 @@ # Check basic transactions on a replica. 
-source "../tests/includes/init-tests.tcl" - -test "Create a primary with a replica" { - create_cluster 1 1 -} +start_cluster 1 1 {tags {external:skip cluster}} { test "Cluster should start ok" { - assert_cluster_state ok + wait_for_cluster_state ok } -set primary [Rn 0] -set replica [Rn 1] +set primary [srv 0 "client"] +set replica [srv -1 "client"] test "Can't read from replica without READONLY" { $primary SET a 1 @@ -58,13 +54,13 @@ test "MULTI-EXEC with write operations is MOVED" { } test "read-only blocking operations from replica" { - set rd [valkey_deferring_client valkey 1] + set rd [valkey_deferring_client -1] $rd readonly $rd read $rd XREAD BLOCK 0 STREAMS k 0 wait_for_condition 1000 50 { - [RI 1 blocked_clients] eq {1} + [s -1 blocked_clients] eq {1} } else { fail "client wasn't blocked" } @@ -78,8 +74,10 @@ test "read-only blocking operations from replica" { test "reply MOVED when eval from replica for update" { catch {[$replica eval {#!lua - return redis.call('del','a') + return server.call('del','a') } 1 a ]} err assert {[string range $err 0 4] eq {MOVED}} -} \ No newline at end of file +} + +} ;# start_cluster diff --git a/tests/cluster/tests/08-update-msg.tcl b/tests/unit/cluster/update-msg.tcl similarity index 55% rename from tests/cluster/tests/08-update-msg.tcl rename to tests/unit/cluster/update-msg.tcl index bff3d0a862..2bec2de27c 100644 --- a/tests/cluster/tests/08-update-msg.tcl +++ b/tests/unit/cluster/update-msg.tcl @@ -9,27 +9,23 @@ # of the UPDATE messages it will receive from the other nodes when its # configuration will be found to be outdated. -source "../tests/includes/init-tests.tcl" - -test "Create a 5 nodes cluster" { - create_cluster 5 5 -} +start_cluster 5 5 {tags {external:skip cluster}} { test "Cluster is up" { - assert_cluster_state ok + wait_for_cluster_state ok } test "Cluster is writable" { - cluster_write_test 0 + cluster_write_test [srv 0 port] } test "Instance #5 is a slave" { - assert {[RI 5 role] eq {slave}} + assert {[s -5 role] eq {slave}} } test "Instance #5 synced with the master" { wait_for_condition 1000 50 { - [RI 5 master_link_status] eq {up} + [s -5 master_link_status] eq {up} } else { fail "Instance #5 master link status is not up" } @@ -37,8 +33,9 @@ test "Instance #5 synced with the master" { set current_epoch [CI 1 cluster_current_epoch] +set paused_pid [srv 0 pid] test "Killing one master node" { - kill_instance valkey 0 + pause_process $paused_pid } test "Wait for failover" { @@ -50,41 +47,59 @@ test "Wait for failover" { } test "Cluster should eventually be up again" { - assert_cluster_state ok + for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[process_is_paused $paused_pid]} continue + wait_for_condition 1000 50 { + [CI $j cluster_state] eq "ok" + } else { + fail "Cluster node $j cluster_state:[CI $j cluster_state]" + } + } } test "Cluster is writable" { - cluster_write_test 1 + cluster_write_test [srv -1 port] } test "Instance #5 is now a master" { - assert {[RI 5 role] eq {master}} + assert {[s -5 role] eq {master}} } +set paused_pid5 [srv -5 pid] test "Killing the new master #5" { - kill_instance valkey 5 + pause_process $paused_pid5 } test "Cluster should be down now" { - assert_cluster_state fail + for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[process_is_paused $paused_pid]} continue + if {[process_is_paused $paused_pid5]} continue + wait_for_condition 1000 50 { + [CI $j cluster_state] eq "fail" + } else { + fail "Cluster node $j cluster_state:[CI $j cluster_state]" + } + } } test "Restarting the 
old master node" { - restart_instance valkey 0 + resume_process $paused_pid } test "Instance #0 gets converted into a slave" { wait_for_condition 1000 50 { - [RI 0 role] eq {slave} + [s 0 role] eq {slave} } else { fail "Old master was not converted into slave" } } test "Restarting the new master node" { - restart_instance valkey 5 + resume_process $paused_pid5 } test "Cluster is up again" { - assert_cluster_state ok + wait_for_cluster_state ok } + +} ;# start_cluster