diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 9104a76d87..21aa620dd9 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -116,6 +116,12 @@ int verifyClusterNodeId(const char *name, int length);
 sds clusterEncodeOpenSlotsAuxField(int rdbflags);
 int clusterDecodeOpenSlotsAuxField(int rdbflags, sds s);
 
+/* Only primaries that own slots have voting rights.
+ * Returns 1 if the node has voting rights, otherwise returns 0. */
+static inline int clusterNodeIsVotingPrimary(clusterNode *n) {
+    return (n->flags & CLUSTER_NODE_PRIMARY) && n->numslots;
+}
+
 int getNodeDefaultClientPort(clusterNode *n) {
     return server.tls_cluster ? n->tls_port : n->tcp_port;
 }
@@ -1867,8 +1873,8 @@ void markNodeAsFailingIfNeeded(clusterNode *node) {
     if (nodeFailed(node)) return; /* Already FAILing. */
 
     failures = clusterNodeFailureReportsCount(node);
-    /* Also count myself as a voter if I'm a primary. */
-    if (clusterNodeIsPrimary(myself)) failures++;
+    /* Also count myself as a voter if I'm a voting primary. */
+    if (clusterNodeIsVotingPrimary(myself)) failures++;
     if (failures < needed_quorum) return; /* No weak agreement from primaries. */
 
     serverLog(LL_NOTICE, "Marking node %.40s (%s) as failing (quorum reached).", node->name, node->human_nodename);
@@ -1908,7 +1914,7 @@ void clearNodeFailureIfNeeded(clusterNode *node) {
      * 1) The FAIL state is old enough.
      * 2) It is yet serving slots from our point of view (not failed over).
      * Apparently no one is going to fix these slots, clear the FAIL flag. */
-    if (clusterNodeIsPrimary(node) && node->numslots > 0 &&
+    if (clusterNodeIsVotingPrimary(node) &&
         (now - node->fail_time) > (server.cluster_node_timeout * CLUSTER_FAIL_UNDO_TIME_MULT)) {
         serverLog(
             LL_NOTICE,
@@ -2090,8 +2096,8 @@ void clusterProcessGossipSection(clusterMsg *hdr, clusterLink *link) {
         /* Ignore gossips about self. */
         if (node && node != myself) {
             /* We already know this node.
-               Handle failure reports, only when the sender is a primary. */
-            if (sender && clusterNodeIsPrimary(sender)) {
+               Handle failure reports, only when the sender is a voting primary. */
+            if (sender && clusterNodeIsVotingPrimary(sender)) {
                 if (flags & (CLUSTER_NODE_FAIL | CLUSTER_NODE_PFAIL)) {
                     if (clusterNodeAddFailureReport(node, sender)) {
                         serverLog(LL_VERBOSE, "Node %.40s (%s) reported node %.40s (%s) as not reachable.",
@@ -3250,8 +3256,7 @@ int clusterProcessPacket(clusterLink *link) {
         /* We consider this vote only if the sender is a primary serving
         * a non zero number of slots, and its currentEpoch is greater or
         * equal to epoch where this node started the election. */
-        if (clusterNodeIsPrimary(sender) && sender->numslots > 0 &&
-            senderCurrentEpoch >= server.cluster->failover_auth_epoch) {
+        if (clusterNodeIsVotingPrimary(sender) && senderCurrentEpoch >= server.cluster->failover_auth_epoch) {
            server.cluster->failover_auth_count++;
            /* Maybe we reached a quorum here, set a flag to make sure
            * we check ASAP. */
@@ -4768,7 +4773,7 @@ void clusterCron(void) {
             if (!(node->flags & (CLUSTER_NODE_PFAIL | CLUSTER_NODE_FAIL))) {
                 node->flags |= CLUSTER_NODE_PFAIL;
                 update_state = 1;
-                if (clusterNodeIsPrimary(myself) && server.cluster->size == 1) {
+                if (server.cluster->size == 1 && clusterNodeIsVotingPrimary(myself)) {
                     markNodeAsFailingIfNeeded(node);
                 } else {
                     serverLog(LL_DEBUG, "*** NODE %.40s possibly failing", node->name);
@@ -5038,7 +5043,7 @@ void clusterUpdateState(void) {
     while ((de = dictNext(di)) != NULL) {
         clusterNode *node = dictGetVal(de);
 
-        if (clusterNodeIsPrimary(node) && node->numslots) {
+        if (clusterNodeIsVotingPrimary(node)) {
             server.cluster->size++;
             if ((node->flags & (CLUSTER_NODE_FAIL | CLUSTER_NODE_PFAIL)) == 0) reachable_primaries++;
         }
diff --git a/tests/unit/cluster/failure-marking.tcl b/tests/unit/cluster/failure-marking.tcl
index c4746c8264..cfed7fff0f 100644
--- a/tests/unit/cluster/failure-marking.tcl
+++ b/tests/unit/cluster/failure-marking.tcl
@@ -16,6 +16,8 @@ start_cluster 1 1 {tags {external:skip cluster}} {
         pause_process $replica1_pid
 
         wait_node_marked_fail 0 $replica1_instance_id
+
+        resume_process $replica1_pid
     }
 }
 
@@ -49,5 +51,71 @@ start_cluster 2 1 {tags {external:skip cluster}} {
         resume_process $primary2_pid
 
         wait_node_marked_fail 0 $replica1_instance_id
+
+        resume_process $replica1_pid
+    }
+}
+
+set old_singledb $::singledb
+set ::singledb 1
+
+tags {external:skip tls:skip cluster} {
+    set base_conf [list cluster-enabled yes cluster-ping-interval 100 cluster-node-timeout 3000 save ""]
+    start_multiple_servers 5 [list overrides $base_conf] {
+        test "Only primary with slots has the right to mark a node as failed" {
+            set primary_host [srv 0 host]
+            set primary_port [srv 0 port]
+            set primary_pid [srv 0 pid]
+            set primary_id [R 0 CLUSTER MYID]
+            set replica_id [R 1 CLUSTER MYID]
+            set replica_pid [srv -1 pid]
+
+            # Meet other nodes.
+            R 1 CLUSTER MEET $primary_host $primary_port
+            R 2 CLUSTER MEET $primary_host $primary_port
+            R 3 CLUSTER MEET $primary_host $primary_port
+            R 4 CLUSTER MEET $primary_host $primary_port
+
+            # Build a single primary cluster.
+            cluster_allocate_slots 1 1
+            wait_for_cluster_propagation
+            R 1 CLUSTER REPLICATE $primary_id
+            wait_for_cluster_propagation
+            wait_for_cluster_state "ok"
+
+            # Pause the primary, marking the primary as pfail.
+            pause_process $primary_pid
+            wait_node_marked_pfail 1 $primary_id
+            wait_node_marked_pfail 2 $primary_id
+            wait_node_marked_pfail 3 $primary_id
+            wait_node_marked_pfail 4 $primary_id
+
+            # Pause the replica, marking the replica as pfail.
+            pause_process $replica_pid
+            wait_node_marked_pfail 2 $replica_id
+            wait_node_marked_pfail 3 $replica_id
+            wait_node_marked_pfail 4 $replica_id
+
+            # Resume the primary, marking the replica as fail.
+            resume_process $primary_pid
+            wait_node_marked_fail 0 $replica_id
+            wait_node_marked_fail 2 $replica_id
+            wait_node_marked_fail 3 $replica_id
+            wait_node_marked_fail 4 $replica_id
+
+            # Check if we got the right failure reports.
+            wait_for_condition 1000 50 {
+                [R 0 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 0 &&
+                [R 2 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 1 &&
+                [R 3 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 1 &&
+                [R 4 CLUSTER COUNT-FAILURE-REPORTS $replica_id] == 1
+            } else {
+                fail "Cluster COUNT-FAILURE-REPORTS is not right."
+            }
+
+            resume_process $replica_pid
+        }
     }
 }
+
+set ::singledb $old_singledb
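
The behavioral core of the patch is the new predicate: a node only counts toward failure quorums, votes, and cluster size if it is a primary that currently owns at least one slot. Below is a minimal standalone sketch, not part of the patch, that illustrates that predicate in isolation; the reduced clusterNode struct and the CLUSTER_NODE_PRIMARY bit value here are illustrative stand-ins, not the real cluster_legacy.c definitions.

#include <stdio.h>

#define CLUSTER_NODE_PRIMARY (1 << 3) /* placeholder bit value, illustrative only */

/* Reduced stand-in for the real clusterNode: only the two fields the
 * predicate reads. */
typedef struct {
    int flags;
    int numslots;
} clusterNode;

/* Same shape as the helper added by the patch: a node gets a vote (and
 * counts toward the failure quorum) only if it is a primary AND it
 * currently serves at least one slot. The && yields 0 or 1. */
static inline int clusterNodeIsVotingPrimary(clusterNode *n) {
    return (n->flags & CLUSTER_NODE_PRIMARY) && n->numslots;
}

int main(void) {
    clusterNode slotless_primary = {CLUSTER_NODE_PRIMARY, 0}; /* e.g. a primary that was just failed over */
    clusterNode serving_primary = {CLUSTER_NODE_PRIMARY, 5462};
    clusterNode replica = {0, 0};

    printf("slotless primary votes: %d\n", clusterNodeIsVotingPrimary(&slotless_primary)); /* 0 */
    printf("serving primary votes:  %d\n", clusterNodeIsVotingPrimary(&serving_primary));  /* 1 */
    printf("replica votes:          %d\n", clusterNodeIsVotingPrimary(&replica));          /* 0 */
    return 0;
}

This is also what the new Tcl test exercises: a slot-less primary (node 0 after pausing, and the empty primaries 2-4 never matter) must not contribute failure reports, so only the slot-owning primary's report plus the gossip from the other voting primaries can push the replica from PFAIL to FAIL.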