From 85408b73912596a7b4f4d99b9f8c2ce8172402b9 Mon Sep 17 00:00:00 2001 From: Binbin Date: Mon, 8 Jan 2024 12:54:41 +0800 Subject: [PATCH] Fix CLUSTER SHARDS crash in 7.0/7.2 mixed clusters where shard ids are not sync (#12832) Crash reported in #12695. While a cluster is being upgraded from 7.0 to 7.2, the 7.0 nodes do not gossip their shard id, yet 7.2 nodes rely on the shard id to build the server.cluster->shards dict. In some cases, for example a 7.0 master node with a 7.2 replica node, the cluster->shards dictionary does not contain the master node from the view of the 7.2 replica node. In this case calling CLUSTER SHARDS on the 7.2 replica node may crash. We should fix the underlying assumption of updateShardId, which is that the shard dict should always be in sync with the node's shard_id. The fix was suggested by PingXie, see more details in #12695. (cherry picked from commit 5b0c6a8255af2d0e4921fa60d631bb3857724cb6) --- src/cluster.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/cluster.c b/src/cluster.c index e4b2d0f53d..c985d0b18b 100644 --- a/src/cluster.c +++ b/src/cluster.c @@ -1687,6 +1687,7 @@ void clusterRenameNode(clusterNode *node, char *newname) { serverAssert(retval == DICT_OK); memcpy(node->name, newname, CLUSTER_NAMELEN); clusterAddNode(node); + clusterAddNodeToShard(node->shard_id, node); } void clusterAddNodeToShard(const char *shard_id, clusterNode *node) { @@ -2234,6 +2235,7 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) { node->tls_port = msg_tls_port; node->cport = ntohs(g->cport); clusterAddNode(node); + clusterAddNodeToShard(node->shard_id, node); } } @@ -3036,6 +3038,10 @@ int clusterProcessPacket(clusterLink *link) { clusterNodeAddSlave(master,sender); sender->slaveof = master; + /* Update the shard_id when a replica is connected to its + * primary in the very first time. */ + updateShardId(sender, master->shard_id); + /* Update config. */ clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); }