diff --git a/src/sentinel.c b/src/sentinel.c
index 711c4aea3e..4943910c52 100644
--- a/src/sentinel.c
+++ b/src/sentinel.c
@@ -110,10 +110,11 @@ static mstime_t sentinel_default_failover_timeout = 60 * 3 * 1000;
 #define SENTINEL_FAILOVER_STATE_NONE 0                 /* No failover in progress. */
 #define SENTINEL_FAILOVER_STATE_WAIT_START 1           /* Wait for failover_start_time*/
 #define SENTINEL_FAILOVER_STATE_SELECT_REPLICA 2       /* Select replica to promote */
-#define SENTINEL_FAILOVER_STATE_SEND_REPLICAOF_NOONE 3 /* Replica -> Primary */
-#define SENTINEL_FAILOVER_STATE_WAIT_PROMOTION 4       /* Wait replica to change role */
-#define SENTINEL_FAILOVER_STATE_RECONF_REPLICAS 5      /* REPLICAOF newprimary */
-#define SENTINEL_FAILOVER_STATE_UPDATE_CONFIG 6        /* Monitor promoted replica. */
+#define SENTINEL_FAILOVER_STATE_SEND_FAILOVER 3        /* Send FAILOVER command to primary. */
+#define SENTINEL_FAILOVER_STATE_SEND_REPLICAOF_NOONE 4 /* Replica -> Primary */
+#define SENTINEL_FAILOVER_STATE_WAIT_PROMOTION 5       /* Wait replica to change role */
+#define SENTINEL_FAILOVER_STATE_RECONF_REPLICAS 6      /* REPLICAOF newprimary */
+#define SENTINEL_FAILOVER_STATE_UPDATE_CONFIG 7        /* Monitor promoted replica. */
 
 #define SENTINEL_PRIMARY_LINK_STATUS_UP 0
 #define SENTINEL_PRIMARY_LINK_STATUS_DOWN 1
@@ -3221,6 +3222,7 @@ const char *sentinelFailoverStateStr(int state) {
     case SENTINEL_FAILOVER_STATE_NONE: return "none";
     case SENTINEL_FAILOVER_STATE_WAIT_START: return "wait_start";
     case SENTINEL_FAILOVER_STATE_SELECT_REPLICA: return "select_slave";
+    case SENTINEL_FAILOVER_STATE_SEND_FAILOVER: return "send_failover";
     case SENTINEL_FAILOVER_STATE_SEND_REPLICAOF_NOONE: return "send_slaveof_noone";
     case SENTINEL_FAILOVER_STATE_WAIT_PROMOTION: return "wait_promotion";
     case SENTINEL_FAILOVER_STATE_RECONF_REPLICAS: return "reconf_slaves";
@@ -4635,6 +4637,55 @@ char *sentinelGetLeader(sentinelValkeyInstance *primary, uint64_t epoch) {
     return winner;
 }
 
+/* Reply callback for the FAILOVER command queued by sentinelSendFailover().
+ * 'privdata' is the primary instance the command was sent to.
+ *
+ * Only an error reply is acted upon: it makes the failover fall back to the
+ * REPLICAOF NO ONE path. Every other reply is ignored here. */
+void sentinelFailoverReplyCallback(redisAsyncContext *c, void *reply, void *privdata) {
+    sentinelValkeyInstance *ri = privdata;
+    instanceLink *link = c->data;
+    redisReply *r;
+
+    if (!reply || !link) return;
+    link->pending_commands--;
+    r = reply;
+
+    if (r->type != REDIS_REPLY_ERROR) return;
+
+    /* This callback runs asynchronously: only fall back while the failover is
+     * still waiting for the promotion, and never hand a NULL promoted replica
+     * to sentinelEvent(). */
+    if (ri->failover_state != SENTINEL_FAILOVER_STATE_WAIT_PROMOTION || !ri->promoted_replica) return;
+
+    /* Primary does not support FAILOVER, fallback to REPLICAOF NO ONE. */
+    sentinelEvent(LL_NOTICE, "+failover-state-send-slaveof-noone", ri->promoted_replica, "%@");
+    ri->failover_state = SENTINEL_FAILOVER_STATE_SEND_REPLICAOF_NOONE;
+    ri->failover_state_change_time = mstime();
+}
+
+/* Send FAILOVER to the specified primary instance, the replica addr passed in
+ * at the same time will be used as the TO parameter. The primary's
+ * failover_timeout is passed as the TIMEOUT parameter.
+ *
+ * Returns C_OK if the command was queued on the primary's link, or C_ERR if
+ * redisAsyncCommand() reported C_ERR. */
+int sentinelSendFailover(sentinelValkeyInstance *ri, const sentinelAddr *addr) {
+    char portstr[32];
+    const char *host;
+    int retval;
+
+    host = announceSentinelAddr(addr);
+    ll2string(portstr, sizeof(portstr), addr->port);
+
+    retval = redisAsyncCommand(ri->link->cc, sentinelFailoverReplyCallback, ri, "%s TO %s %s TIMEOUT %lld",
+                               sentinelInstanceMapCommand(ri, "FAILOVER"), host, portstr, ri->failover_timeout);
+    if (retval == C_ERR) return retval;
+    ri->link->pending_commands++;
+
+    return C_OK;
+}
+
 /* Send REPLICAOF to the specified instance, always followed by a
  * CONFIG REWRITE command in order to store the new configuration on disk
  * when possible (that is, if the instance is recent enough to support
@@ -4901,12 +4952,47 @@ void sentinelFailoverSelectReplica(sentinelValkeyInstance *ri) {
         sentinelEvent(LL_WARNING, "+selected-slave", replica, "%@");
         replica->flags |= SRI_PROMOTED;
         ri->promoted_replica = replica;
-        ri->failover_state = SENTINEL_FAILOVER_STATE_SEND_REPLICAOF_NOONE;
+        ri->failover_state = SENTINEL_FAILOVER_STATE_SEND_FAILOVER;
         ri->failover_state_change_time = mstime();
-        sentinelEvent(LL_NOTICE, "+failover-state-send-slaveof-noone", replica, "%@");
+        sentinelEvent(LL_NOTICE, "+failover-state-send-failover", replica, "%@");
     }
 }
 
+/* Failover state handler for SENTINEL_FAILOVER_STATE_SEND_FAILOVER.
+ *
+ * Tries to have the primary itself hand over to the promoted replica via the
+ * FAILOVER command, moving to WAIT_PROMOTION once the command is queued. When
+ * the primary is flagged as down, its link is disconnected, or the command
+ * cannot be queued, the state machine moves to SEND_REPLICAOF_NOONE instead. */
+void sentinelFailoverSendFailover(sentinelValkeyInstance *ri) {
+    /* Do nothing while the promoted replica is disconnected. Retry again and
+     * again with this state until the timeout is reached, then abort the
+     * failover. */
+    if (ri->promoted_replica->link->disconnected) {
+        if (mstime() - ri->failover_state_change_time > ri->failover_timeout) {
+            sentinelEvent(LL_WARNING, "-failover-abort-slave-timeout", ri, "%@");
+            sentinelAbortFailover(ri);
+        }
+        return;
+    }
+
+    /* We will first try to use FAILOVER to coordinate a failover between the primary
+     * and the promoted replica to avoid data loss. */
+    if ((ri->flags & (SRI_S_DOWN | SRI_O_DOWN)) == 0 && !ri->link->disconnected) {
+        if (sentinelSendFailover(ri, ri->promoted_replica->addr) == C_OK) {
+            sentinelEvent(LL_NOTICE, "+failover-state-wait-promotion", ri->promoted_replica, "%@");
+            ri->failover_state = SENTINEL_FAILOVER_STATE_WAIT_PROMOTION;
+            ri->failover_state_change_time = mstime();
+            return;
+        }
+    }
+
+    /* Fallback to REPLICAOF NO ONE. */
+    sentinelEvent(LL_NOTICE, "+failover-state-send-slaveof-noone", ri->promoted_replica, "%@");
+    ri->failover_state = SENTINEL_FAILOVER_STATE_SEND_REPLICAOF_NOONE;
+    ri->failover_state_change_time = mstime();
+}
+
 void sentinelFailoverSendReplicaOfNoOne(sentinelValkeyInstance *ri) {
     int retval;
 
@@ -5078,6 +5164,7 @@ void sentinelFailoverStateMachine(sentinelValkeyInstance *ri) {
     switch (ri->failover_state) {
     case SENTINEL_FAILOVER_STATE_WAIT_START: sentinelFailoverWaitStart(ri); break;
     case SENTINEL_FAILOVER_STATE_SELECT_REPLICA: sentinelFailoverSelectReplica(ri); break;
+    case SENTINEL_FAILOVER_STATE_SEND_FAILOVER: sentinelFailoverSendFailover(ri); break;
     case SENTINEL_FAILOVER_STATE_SEND_REPLICAOF_NOONE: sentinelFailoverSendReplicaOfNoOne(ri); break;
     case SENTINEL_FAILOVER_STATE_WAIT_PROMOTION: sentinelFailoverWaitPromotion(ri); break;
     case SENTINEL_FAILOVER_STATE_RECONF_REPLICAS: sentinelFailoverReconfNextReplica(ri); break;
diff --git a/tests/sentinel/tests/05-manual.tcl b/tests/sentinel/tests/05-manual.tcl
index 7f5485c42c..02e3e28485 100644
--- a/tests/sentinel/tests/05-manual.tcl
+++ b/tests/sentinel/tests/05-manual.tcl
@@ -1,6 +1,6 @@
 # Test manual failover
 
-source "../tests/includes/init-tests.tcl"
+proc test_sentinel_failover {type master_id} {
 
 foreach_sentinel_id id {
     S $id sentinel debug info-period 2000
@@ -8,11 +8,20 @@ foreach_sentinel_id id {
     S $id sentinel debug publish-period 1000
 }
 
-test "Manual failover works" {
+set val 0
+
+test "Manual failover works - $type" {
+    R $master_id del foo
     set old_port [RPort $master_id]
     set addr [S 0 SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster]
     assert {[lindex $addr 1] == $old_port}
 
+    # Rename the FAILOVER command so that we can fallback to REPLICAOF NO ONE.
+    # We simulate a server that doesn't have the FAILOVER (NON-EXISTENT) command.
+    if {$type == "replicaof"} {
+        S 0 SENTINEL SET mymaster rename-command FAILOVER NON-EXISTENT
+    }
+
     # Since we reduced the info-period (default 10000) above immediately,
     # sentinel - replica may not have enough time to exchange INFO and update
     # the replica's info-period, so the test may get a NOGOODSLAVE.
@@ -27,6 +36,16 @@ test "Manual failover works" {
     catch {S 0 SENTINEL FAILOVER mymaster} reply
     assert_match {*INPROG*} $reply ;# Failover already in progress
 
+    # After sending sentinel failover, continue writing to the primary
+    # to observe the final data consistency.
+    for {set j 0} {$j < 1000000} {incr j} {
+        catch {R $master_id incr foo} err
+        if {[string match "READONLY*" $err]} {
+            break
+        }
+        set val $err
+    }
+
     foreach_sentinel_id id {
         wait_for_condition 1000 50 {
             [lindex [S $id SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster] 1] != $old_port
@@ -38,11 +57,11 @@ test "Manual failover works" {
     set master_id [get_instance_id_by_port valkey [lindex $addr 1]]
 }
 
-test "New primary [join $addr {:}] role matches" {
+test "New primary [join $addr {:}] role matches - $type" {
     assert {[RI $master_id role] eq {master}}
 }
 
-test "All the other slaves now point to the new primary" {
+test "All the other slaves now point to the new primary - $type" {
     foreach_valkey_id id {
         if {$id != $master_id && $id != 0} {
             wait_for_condition 1000 50 {
@@ -54,7 +73,7 @@ test "All the other slaves now point to the new primary" {
     }
 }
 
-test "The old primary eventually gets reconfigured as a slave" {
+test "The old primary eventually gets reconfigured as a replica - $type" {
     wait_for_condition 1000 50 {
         [RI 0 master_port] == [lindex $addr 1]
     } else {
@@ -62,6 +81,35 @@ test "The old primary eventually gets reconfigured as a slave" {
     }
 }
 
+test "Check data consistency - $type" {
+    if {$type == "replicaof"} {
+        # In replicaof type, there is a good chance that data will be lost eventually.
+        foreach_valkey_id id {
+            wait_for_condition 1000 50 {
+                [R $id get foo] != $val
+            } else {
+                fail "Data is unexpectedly consistent in replicaof type"
+            }
+        }
+    } elseif {$type == "failover"} {
+        foreach_valkey_id id {
+            wait_for_condition 1000 50 {
+                [R $id get foo] == $val
+            } else {
+                fail "Data is not consistent in failover type"
+            }
+        }
+    }
+}
+
+} ;# end proc test_sentinel_failover
+
+source "../tests/includes/init-tests.tcl"
+test_sentinel_failover "replicaof" $master_id
+
+source "../tests/includes/init-tests.tcl"
+test_sentinel_failover "failover" $master_id
+
 foreach flag {crash-after-election crash-after-promotion} {
     # Before each SIMULATE-FAILURE test, re-source init-tests to get a clean environment
     source "../tests/includes/init-tests.tcl"