Skip to content

Commit

Permalink
Slot migration improvement
Browse files Browse the repository at this point in the history
Signed-off-by: Ping Xie <pingxie@google.com>
  • Loading branch information
PingXie committed Apr 6, 2024
1 parent ba0c93c commit 620fd99
Show file tree
Hide file tree
Showing 17 changed files with 907 additions and 148 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,4 @@ Makefile.dep
compile_commands.json
redis.code-workspace
.cache
.cscope.*
1 change: 1 addition & 0 deletions src/cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ char *clusterNodeHostname(clusterNode *node);
const char *clusterNodePreferredEndpoint(clusterNode *n);
long long clusterNodeReplOffset(clusterNode *node);
clusterNode *clusterLookupNode(const char *name, int length);
void clusterReplicateOpenSlots(void);

/* functions with shared implementations */
clusterNode *getNodeByQuery(client *c, struct serverCommand *cmd, robj **argv, int argc, int *hashslot, int *ask);
Expand Down
613 changes: 496 additions & 117 deletions src/cluster_legacy.c

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/cluster_legacy.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ typedef struct clusterLink {
#define CLUSTER_NODE_NOFAILOVER 512 /* Slave will not try to failover. */
#define CLUSTER_NODE_NULL_NAME "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000"

#define nodeIsMaster(n) ((n)->flags & CLUSTER_NODE_MASTER)
#define nodeIsSlave(n) ((n)->flags & CLUSTER_NODE_SLAVE)
#define nodeInHandshake(n) ((n)->flags & CLUSTER_NODE_HANDSHAKE)
#define nodeHasAddr(n) (!((n)->flags & CLUSTER_NODE_NOADDR))
Expand Down
3 changes: 2 additions & 1 deletion src/commands/cluster-setslot.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
"command_flags": [
"NO_ASYNC_LOADING",
"ADMIN",
"STALE"
"STALE",
"MAY_REPLICATE"
],
"arguments": [
{
Expand Down
3 changes: 1 addition & 2 deletions src/debug.c
Original file line number Diff line number Diff line change
Expand Up @@ -873,8 +873,7 @@ NULL
server.aof_flush_sleep = atoi(c->argv[2]->ptr);
addReply(c,shared.ok);
} else if (!strcasecmp(c->argv[1]->ptr,"replicate") && c->argc >= 3) {
replicationFeedSlaves(server.slaves, -1,
c->argv + 2, c->argc - 2);
replicationFeedSlaves(-1, c->argv + 2, c->argc - 2);
addReply(c,shared.ok);
} else if (!strcasecmp(c->argv[1]->ptr,"error") && c->argc == 3) {
sds errstr = sdsnewlen("-",1);
Expand Down
2 changes: 1 addition & 1 deletion src/rdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -3310,7 +3310,7 @@ int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadin
robj *argv[2];
argv[0] = server.lazyfree_lazy_expire ? shared.unlink : shared.del;
argv[1] = &keyobj;
replicationFeedSlaves(server.slaves,dbid,argv,2);
replicationFeedSlaves(dbid,argv,2);
}
sdsfree(key);
decrRefCount(val);
Expand Down
10 changes: 6 additions & 4 deletions src/replication.c
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,7 @@ void feedReplicationBuffer(char *s, size_t len) {
* received by our clients in order to create the replication stream.
* Instead if the instance is a replica and has sub-replicas attached, we use
* replicationFeedStreamFromMasterStream() */
void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {
void replicationFeedSlaves(int dictid, robj **argv, int argc) {
int j, len;
char llstr[LONG_STR_SIZE];

Expand All @@ -451,7 +451,7 @@ void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc) {

/* If there aren't slaves, and there is no backlog buffer to populate,
* we can return ASAP. */
if (server.repl_backlog == NULL && listLength(slaves) == 0) {
if (server.repl_backlog == NULL && listLength(server.slaves) == 0) {
/* We increment the repl_offset anyway, since we use that for tracking AOF fsyncs
* even when there's no replication active. This code will not be reached if AOF
* is also disabled. */
Expand Down Expand Up @@ -1313,6 +1313,9 @@ int replicaPutOnline(client *slave) {
NULL);
serverLog(LL_NOTICE,"Synchronization with replica %s succeeded",
replicationGetSlaveName(slave));

/* Replicate slot being migrated/imported to the new replica */
clusterReplicateOpenSlots();
return 1;
}

Expand Down Expand Up @@ -3788,8 +3791,7 @@ void replicationCron(void) {

if (!manual_failover_in_progress) {
ping_argv[0] = shared.ping;
replicationFeedSlaves(server.slaves, -1,
ping_argv, 1);
replicationFeedSlaves(-1, ping_argv, 1);
}
}

Expand Down
4 changes: 2 additions & 2 deletions src/server.c
Original file line number Diff line number Diff line change
Expand Up @@ -1612,7 +1612,7 @@ static void sendGetackToReplicas(void) {
argv[0] = shared.replconf;
argv[1] = shared.getack;
argv[2] = shared.special_asterick; /* Not used argument. */
replicationFeedSlaves(server.slaves, -1, argv, 3);
replicationFeedSlaves(-1, argv, 3);
}

extern int ProcessingEventsWhileBlocked;
Expand Down Expand Up @@ -3296,7 +3296,7 @@ static void propagateNow(int dbid, robj **argv, int argc, int target) {
if (server.aof_state != AOF_OFF && target & PROPAGATE_AOF)
feedAppendOnlyFile(dbid,argv,argc);
if (target & PROPAGATE_REPL)
replicationFeedSlaves(server.slaves,dbid,argv,argc);
replicationFeedSlaves(dbid,argv,argc);
}

/* Used inside commands to schedule the propagation of additional commands
Expand Down
2 changes: 1 addition & 1 deletion src/server.h
Original file line number Diff line number Diff line change
Expand Up @@ -2816,7 +2816,7 @@ ssize_t syncRead(int fd, char *ptr, ssize_t size, long long timeout);
ssize_t syncReadLine(int fd, char *ptr, ssize_t size, long long timeout);

/* Replication */
void replicationFeedSlaves(list *slaves, int dictid, robj **argv, int argc);
void replicationFeedSlaves(int dictid, robj **argv, int argc);
void replicationFeedStreamFromMasterStream(char *buf, size_t buflen);
void resetReplicationBuffer(void);
void feedReplicationBuffer(char *buf, size_t len);
Expand Down
34 changes: 34 additions & 0 deletions src/valkey-cli.c
Original file line number Diff line number Diff line change
Expand Up @@ -4762,6 +4762,39 @@ static clusterManagerNode *clusterManagerGetSlotOwner(clusterManagerNode *n,
return owner;
}

static void clusterManagerSetSlotNodeReplicaOnly(clusterManagerNode *node1,
clusterManagerNode *node2,
int slot) {
/* A new CLUSTER SETSLOT variant that finalizes slot ownership on replicas
* only (CLUSTER SETSLOT s NODE n REPLICAONLY) is introduced in Redis 8.0+
* to help mitigate the single-point-of-failure issue related to the slot
* ownership finalization on HA clusters. We make a best-effort attempt below
* to utilize this enhanced reliability. Regardless of the result, we continue
* with finalizing slot ownership on the primary nodes. Note that this command
* is not essential. Redis 8.0+ will attempt to recover from failed slot
* ownership finalizations if they occur, although there may be a brief period
* where slots caught in this transition stage are unavailable. Including this
* additional step ensures no downtime for these slots if any failures arise. */
redisReply *reply = CLUSTER_MANAGER_COMMAND(node1, "CLUSTER SETSLOT %d NODE %s REPLICAONLY",
slot, (char *) node2->name);
if (reply->type == REDIS_REPLY_ERROR) {
/* Either target Redis server doesn't support this command or the slot is
* not in the right state*/
clusterManagerLogWarn("*** Failed to finalize slot on replicas: %s\n", reply->str);
freeReplyObject(reply);
return;
}
freeReplyObject(reply);
clusterManagerLogInfo(">>> Waiting for %d replicas to complete slot finalization\n", node1->replicas_count);
reply = CLUSTER_MANAGER_COMMAND(node1, "WAIT %d 1000", node1->replicas_count);
if (reply->type == REDIS_REPLY_ERROR) {
clusterManagerLogWarn("*** Failed to wait for slot finalization on replicas: %s\n", reply->str);
} else {
clusterManagerLogInfo(">>> %d replicas completed slot finalization in time\n", reply->integer);
}
freeReplyObject(reply);
}

/* Set slot status to "importing" or "migrating" */
static int clusterManagerSetSlot(clusterManagerNode *node1,
clusterManagerNode *node2,
Expand Down Expand Up @@ -5233,6 +5266,7 @@ static int clusterManagerMoveSlot(clusterManagerNode *source,
* If we inform any other node first, it can happen that the target node
* crashes before it is set as the new owner and then the slot is left
* without an owner which results in redirect loops. See issue #7116. */
clusterManagerSetSlotNodeReplicaOnly(target, target, slot);
success = clusterManagerSetSlot(target, target, slot, "node", err);
if (!success) return 0;

Expand Down
5 changes: 0 additions & 5 deletions tests/cluster/tests/20-half-migrated-slot.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,6 @@
# 4. migration is half finished on "migrating" node
# 5. migration is half finished on "importing" node

# TODO: Test is currently disabled until it is stabilized (fixing the test
# itself or real issues in Redis).

if {false} {
source "../tests/includes/init-tests.tcl"
source "../tests/includes/utils.tcl"

Expand Down Expand Up @@ -95,4 +91,3 @@ test "Half-finish importing" {
}

config_set_all_nodes cluster-allow-replica-migration yes
}
6 changes: 0 additions & 6 deletions tests/cluster/tests/21-many-slot-migration.tcl
Original file line number Diff line number Diff line change
@@ -1,10 +1,5 @@
# Tests for many simultaneous migrations.

# TODO: Test is currently disabled until it is stabilized (fixing the test
# itself or real issues in Redis).

if {false} {

source "../tests/includes/init-tests.tcl"
source "../tests/includes/utils.tcl"

Expand Down Expand Up @@ -61,4 +56,3 @@ test "Keys are accessible" {
}

config_set_all_nodes cluster-allow-replica-migration yes
}
1 change: 1 addition & 0 deletions tests/test_helper.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ set ::all_tests {
unit/cluster/links
unit/cluster/cluster-response-tls
unit/cluster/failure-marking
unit/cluster/slot-migration
}
# Index to the next test to run in the ::all_tests list.
set ::next_test 0
Expand Down
11 changes: 4 additions & 7 deletions tests/unit/cluster/cli.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -317,13 +317,10 @@ test {Migrate the last slot away from a node using valkey-cli} {
catch { $newnode_r get foo } e
assert_equal "MOVED $slot $owner_host:$owner_port" $e

# Check that the empty node has turned itself into a replica of the new
# owner and that the new owner knows that.
wait_for_condition 1000 50 {
[string match "*slave*" [$owner_r CLUSTER REPLICAS $owner_id]]
} else {
fail "Empty node didn't turn itself into a replica."
}
# Check that the now empty primary node doesn't turn itself into
# a replica of any other nodes
wait_for_cluster_propagation
assert_match *master* [$owner_r role]
}
}

Expand Down
Loading

0 comments on commit 620fd99

Please sign in to comment.