Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance SENTINEL FAILOVER to use the FAILOVER command to avoid data loss #1238

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 73 additions & 6 deletions src/sentinel.c
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,11 @@ static mstime_t sentinel_default_failover_timeout = 60 * 3 * 1000;
#define SENTINEL_FAILOVER_STATE_NONE 0 /* No failover in progress. */
#define SENTINEL_FAILOVER_STATE_WAIT_START 1 /* Wait for failover_start_time*/
#define SENTINEL_FAILOVER_STATE_SELECT_REPLICA 2 /* Select replica to promote */
#define SENTINEL_FAILOVER_STATE_SEND_REPLICAOF_NOONE 3 /* Replica -> Primary */
#define SENTINEL_FAILOVER_STATE_WAIT_PROMOTION 4 /* Wait replica to change role */
#define SENTINEL_FAILOVER_STATE_RECONF_REPLICAS 5 /* REPLICAOF newprimary */
#define SENTINEL_FAILOVER_STATE_UPDATE_CONFIG 6 /* Monitor promoted replica. */
#define SENTINEL_FAILOVER_STATE_SEND_FAILOVER 3 /* Send FAILOVER Command to primary. */
#define SENTINEL_FAILOVER_STATE_SEND_REPLICAOF_NOONE 4 /* Replica -> Primary */
#define SENTINEL_FAILOVER_STATE_WAIT_PROMOTION 5 /* Wait replica to change role */
#define SENTINEL_FAILOVER_STATE_RECONF_REPLICAS 6 /* REPLICAOF newprimary */
#define SENTINEL_FAILOVER_STATE_UPDATE_CONFIG 7 /* Monitor promoted replica. */

#define SENTINEL_PRIMARY_LINK_STATUS_UP 0
#define SENTINEL_PRIMARY_LINK_STATUS_DOWN 1
Expand Down Expand Up @@ -3221,6 +3222,7 @@ const char *sentinelFailoverStateStr(int state) {
case SENTINEL_FAILOVER_STATE_NONE: return "none";
case SENTINEL_FAILOVER_STATE_WAIT_START: return "wait_start";
case SENTINEL_FAILOVER_STATE_SELECT_REPLICA: return "select_slave";
case SENTINEL_FAILOVER_STATE_SEND_FAILOVER: return "send_failover";
case SENTINEL_FAILOVER_STATE_SEND_REPLICAOF_NOONE: return "send_slaveof_noone";
case SENTINEL_FAILOVER_STATE_WAIT_PROMOTION: return "wait_promotion";
case SENTINEL_FAILOVER_STATE_RECONF_REPLICAS: return "reconf_slaves";
Expand Down Expand Up @@ -4635,6 +4637,41 @@ char *sentinelGetLeader(sentinelValkeyInstance *primary, uint64_t epoch) {
return winner;
}

/* Reply callback for the FAILOVER command queued by sentinelSendFailover().
 *
 * 'c' is the hiredis async context the command was sent on; its 'data'
 * field carries the instanceLink. 'reply' is the hiredis reply object and
 * 'privdata' is the primary instance that was passed when the command was
 * queued.
 *
 * When there is no reply or no link the callback returns without touching
 * anything. Otherwise the link's pending command counter is decremented,
 * and if the primary answered with an error (for instance because it does
 * not support FAILOVER) the failover falls back to the REPLICAOF NO ONE
 * path. */
void sentinelFailoverReplyCallback(redisAsyncContext *c, void *reply, void *privdata) {
    sentinelValkeyInstance *ri = privdata;
    instanceLink *link = c->data;
    redisReply *r = reply;

    if (!r || !link) return;
    link->pending_commands--;

    /* Any non-error reply means the primary accepted the command: nothing
     * to do here, the state machine keeps waiting for the promotion. */
    if (r->type != REDIS_REPLY_ERROR) return;

    /* The reply arrives asynchronously. sentinelFailoverSendFailover() moves
     * the failover to WAIT_PROMOTION right after queueing FAILOVER, so that
     * is the only state in which the fallback below is meaningful. If the
     * failover has moved elsewhere in the meantime, or there is no promoted
     * replica to use as the event subject, do not overwrite the state. */
    if (!ri || ri->failover_state != SENTINEL_FAILOVER_STATE_WAIT_PROMOTION || ri->promoted_replica == NULL) {
        return;
    }

    /* The primary rejected FAILOVER, fallback to REPLICAOF NO ONE. */
    sentinelEvent(LL_NOTICE, "+failover-state-send-slaveof-noone", ri->promoted_replica, "%@");
    ri->failover_state = SENTINEL_FAILOVER_STATE_SEND_REPLICAOF_NOONE;
    ri->failover_state_change_time = mstime();
}

/* Queue a "FAILOVER TO <host> <port> TIMEOUT <timeout>" command on the
 * command link of the primary 'ri'.
 *
 * 'addr' is the address of the replica that should take over: its announced
 * host and its port become the TO arguments. The primary's failover_timeout
 * is passed as the TIMEOUT value. The command name goes through
 * sentinelInstanceMapCommand(), and the reply is delivered to
 * sentinelFailoverReplyCallback() with 'ri' as private data.
 *
 * Returns C_OK when the command was queued (the link's pending command
 * counter is incremented in that case), C_ERR otherwise. */
int sentinelSendFailover(sentinelValkeyInstance *ri, const sentinelAddr *addr) {
    char target_port[32];
    const char *target_host = announceSentinelAddr(addr);

    ll2string(target_port, sizeof(target_port), addr->port);

    if (redisAsyncCommand(ri->link->cc, sentinelFailoverReplyCallback, ri, "%s TO %s %s TIMEOUT %lld",
                          sentinelInstanceMapCommand(ri, "FAILOVER"), target_host, target_port,
                          ri->failover_timeout) == C_ERR) {
        return C_ERR;
    }

    /* Only count the command as pending once it has really been queued. */
    ri->link->pending_commands++;
    return C_OK;
}

/* Send REPLICAOF to the specified instance, always followed by a
* CONFIG REWRITE command in order to store the new configuration on disk
* when possible (that is, if the instance is recent enough to support
Expand Down Expand Up @@ -4901,12 +4938,41 @@ void sentinelFailoverSelectReplica(sentinelValkeyInstance *ri) {
sentinelEvent(LL_WARNING, "+selected-slave", replica, "%@");
replica->flags |= SRI_PROMOTED;
ri->promoted_replica = replica;
ri->failover_state = SENTINEL_FAILOVER_STATE_SEND_REPLICAOF_NOONE;
ri->failover_state = SENTINEL_FAILOVER_STATE_SEND_FAILOVER;
ri->failover_state_change_time = mstime();
sentinelEvent(LL_NOTICE, "+failover-state-send-slaveof-noone", replica, "%@");
sentinelEvent(LL_NOTICE, "+failover-state-send-failover", replica, "%@");
}
}

/* Handler for the SENTINEL_FAILOVER_STATE_SEND_FAILOVER state.
 *
 * Tries to have the primary 'ri' hand over to the promoted replica via the
 * FAILOVER command, which is meant to avoid data loss. If that is not
 * possible the state machine moves on to the REPLICAOF NO ONE state. */
void sentinelFailoverSendFailover(sentinelValkeyInstance *ri) {
    sentinelValkeyInstance *promoted = ri->promoted_replica;

    /* Nothing can be done while the link to the promoted replica is
     * disconnected. Stay in this state and retry on later calls until the
     * failover timeout has elapsed, then abort the failover. */
    if (promoted->link->disconnected) {
        if (mstime() - ri->failover_state_change_time > ri->failover_timeout) {
            sentinelEvent(LL_WARNING, "-failover-abort-slave-timeout", ri, "%@");
            sentinelAbortFailover(ri);
        }
        return;
    }

    /* FAILOVER is sent to the primary, so only attempt it when the primary
     * is not flagged as subjectively/objectively down and its link is
     * connected. */
    int primary_usable = !(ri->flags & (SRI_S_DOWN | SRI_O_DOWN)) && !ri->link->disconnected;

    if (primary_usable && sentinelSendFailover(ri, promoted->addr) == C_OK) {
        /* Command queued: wait for the replica to change role. */
        sentinelEvent(LL_NOTICE, "+failover-state-wait-promotion", ri->promoted_replica, "%@");
        ri->failover_state = SENTINEL_FAILOVER_STATE_WAIT_PROMOTION;
        ri->failover_state_change_time = mstime();
        return;
    }

    /* Primary not usable or the command could not be queued: fallback to
     * REPLICAOF NO ONE. */
    sentinelEvent(LL_NOTICE, "+failover-state-send-slaveof-noone", ri->promoted_replica, "%@");
    ri->failover_state = SENTINEL_FAILOVER_STATE_SEND_REPLICAOF_NOONE;
    ri->failover_state_change_time = mstime();
}

void sentinelFailoverSendReplicaOfNoOne(sentinelValkeyInstance *ri) {
int retval;

Expand Down Expand Up @@ -5078,6 +5144,7 @@ void sentinelFailoverStateMachine(sentinelValkeyInstance *ri) {
switch (ri->failover_state) {
case SENTINEL_FAILOVER_STATE_WAIT_START: sentinelFailoverWaitStart(ri); break;
case SENTINEL_FAILOVER_STATE_SELECT_REPLICA: sentinelFailoverSelectReplica(ri); break;
case SENTINEL_FAILOVER_STATE_SEND_FAILOVER: sentinelFailoverSendFailover(ri); break;
case SENTINEL_FAILOVER_STATE_SEND_REPLICAOF_NOONE: sentinelFailoverSendReplicaOfNoOne(ri); break;
case SENTINEL_FAILOVER_STATE_WAIT_PROMOTION: sentinelFailoverWaitPromotion(ri); break;
case SENTINEL_FAILOVER_STATE_RECONF_REPLICAS: sentinelFailoverReconfNextReplica(ri); break;
Expand Down
60 changes: 54 additions & 6 deletions tests/sentinel/tests/05-manual.tcl
Original file line number Diff line number Diff line change
@@ -1,18 +1,27 @@
# Test manual failover

source "../tests/includes/init-tests.tcl"
proc test_sentinel_failover {type master_id} {

foreach_sentinel_id id {
S $id sentinel debug info-period 2000
S $id sentinel debug default-down-after 6000
S $id sentinel debug publish-period 1000
}

test "Manual failover works" {
set val 0

test "Manual failover works - $type" {
R $master_id del foo
set old_port [RPort $master_id]
set addr [S 0 SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster]
assert {[lindex $addr 1] == $old_port}

# Rename the FAILOVER command so that we can fallback to REPLICAOF NO ONE.
# We simulate a server that doesn't have the FAILOVER (NON-EXISTENT) command.
if {$type == "replicaof"} {
S 0 SENTINEL SET mymaster rename-command FAILOVER NON-EXISTENT
enjoy-binbin marked this conversation as resolved.
Show resolved Hide resolved
}

# Since we reduced the info-period (default 10000) above immediately,
# sentinel - replica may not have enough time to exchange INFO and update
# the replica's info-period, so the test may get a NOGOODSLAVE.
Expand All @@ -27,6 +36,16 @@ test "Manual failover works" {
catch {S 0 SENTINEL FAILOVER mymaster} reply
assert_match {*INPROG*} $reply ;# Failover already in progress

# After sending sentinel failover, continue writing to the primary
# to observe the final data consistency.
for {set j 0} {$j < 1000000} {incr j} {
catch {R $master_id incr foo} err
if {[string match "READONLY*" $err]} {
break
}
set val $err
}

foreach_sentinel_id id {
wait_for_condition 1000 50 {
[lindex [S $id SENTINEL GET-PRIMARY-ADDR-BY-NAME mymaster] 1] != $old_port
Expand All @@ -38,11 +57,11 @@ test "Manual failover works" {
set master_id [get_instance_id_by_port valkey [lindex $addr 1]]
}

test "New primary [join $addr {:}] role matches" {
test "New primary [join $addr {:}] role matches - $type" {
assert {[RI $master_id role] eq {master}}
}

test "All the other slaves now point to the new primary" {
test "All the other slaves now point to the new primary - $type" {
foreach_valkey_id id {
if {$id != $master_id && $id != 0} {
wait_for_condition 1000 50 {
Expand All @@ -54,14 +73,43 @@ test "All the other slaves now point to the new primary" {
}
}

test "The old primary eventually gets reconfigured as a slave" {
test "The old primary eventually gets reconfigured as a replica - $type" {
wait_for_condition 1000 50 {
[RI 0 master_port] == [lindex $addr 1]
} else {
fail "Old master not reconfigured as slave of new master"
}
}

test "Check data consistency - $type" {
if {$type == "replicaof"} {
# In replicaof type, there is a good chance that data will be lost eventually.
foreach_valkey_id id {
wait_for_condition 1000 50 {
[R $id get foo] != $val
} else {
fail "Data is consistency in replicaof type"
}
}
} elseif {$type == "failover"} {
foreach_valkey_id id {
wait_for_condition 1000 50 {
[R $id get foo] == $val
} else {
fail "Data is not consistency in failover type"
}
}
}
}

} ;# end proc test_sentinel_failover

source "../tests/includes/init-tests.tcl"
test_sentinel_failover "replicaof" $master_id

source "../tests/includes/init-tests.tcl"
test_sentinel_failover "failover" $master_id

foreach flag {crash-after-election crash-after-promotion} {
# Before each SIMULATE-FAILURE test, re-source init-tests to get a clean environment
source "../tests/includes/init-tests.tcl"
Expand Down Expand Up @@ -91,4 +139,4 @@ foreach flag {crash-after-election crash-after-promotion} {

restart_instance sentinel 0
}
}
}
Loading