Skip to content

Commit

Permalink
HDDS-11380. Make node decommission error message more comprehensive (#…
Browse files (browse the repository at this point in the history)
  • Loading branch information
VarshaRaviCV authored Oct 29, 2024
1 parent 61c094f commit 980b960
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -398,10 +398,12 @@ private synchronized boolean checkIfDecommissionPossible(List<DatanodeDetails> d
if (opState != NodeOperationalState.IN_SERVICE) {
numDecom--;
validDns.remove(dn);
LOG.warn("Cannot decommission {} because it is not IN-SERVICE", dn.getHostName());
}
} catch (NodeNotFoundException ex) {
numDecom--;
validDns.remove(dn);
LOG.warn("Cannot decommission {} because it is not found in SCM", dn.getHostName());
}
}

Expand Down Expand Up @@ -430,9 +432,11 @@ private synchronized boolean checkIfDecommissionPossible(List<DatanodeDetails> d
}
int reqNodes = cif.getReplicationConfig().getRequiredNodes();
if ((inServiceTotal - numDecom) < reqNodes) {
int unHealthyTotal = nodeManager.getAllNodes().size() - inServiceTotal;
String errorMsg = "Insufficient nodes. Tried to decommission " + dns.size() +
" nodes of which " + numDecom + " nodes were valid. Cluster has " + inServiceTotal +
" IN-SERVICE nodes, " + reqNodes + " of which are required for minimum replication. ";
" nodes out of " + inServiceTotal + " IN-SERVICE HEALTHY and " + unHealthyTotal +
" not IN-SERVICE or not HEALTHY nodes. Cannot decommission as a minimum of " + reqNodes +
" IN-SERVICE HEALTHY nodes are required to maintain replication after decommission. ";
LOG.info(errorMsg + "Failing due to datanode : {}, container : {}", dn, cid);
errors.add(new DatanodeAdminError("AllHosts", errorMsg));
return false;
Expand Down Expand Up @@ -552,10 +556,12 @@ private synchronized boolean checkIfMaintenancePossible(List<DatanodeDetails> dn
if (opState != NodeOperationalState.IN_SERVICE) {
numMaintenance--;
validDns.remove(dn);
LOG.warn("{} cannot enter maintenance because it is not IN-SERVICE", dn.getHostName());
}
} catch (NodeNotFoundException ex) {
numMaintenance--;
validDns.remove(dn);
LOG.warn("{} cannot enter maintenance because it is not found in SCM", dn.getHostName());
}
}

Expand Down Expand Up @@ -594,9 +600,11 @@ private synchronized boolean checkIfMaintenancePossible(List<DatanodeDetails> dn
minInService = maintenanceReplicaMinimum;
}
if ((inServiceTotal - numMaintenance) < minInService) {
int unHealthyTotal = nodeManager.getAllNodes().size() - inServiceTotal;
String errorMsg = "Insufficient nodes. Tried to start maintenance for " + dns.size() +
" nodes of which " + numMaintenance + " nodes were valid. Cluster has " + inServiceTotal +
" IN-SERVICE nodes, " + minInService + " of which are required for minimum replication. ";
" nodes out of " + inServiceTotal + " IN-SERVICE HEALTHY and " + unHealthyTotal +
" not IN-SERVICE or not HEALTHY nodes. Cannot enter maintenance mode as a minimum of " + minInService +
" IN-SERVICE HEALTHY nodes are required to maintain replication after maintenance. ";
LOG.info(errorMsg + "Failing due to datanode : {}, container : {}", dn, cid);
errors.add(new DatanodeAdminError("AllHosts", errorMsg));
return false;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,10 @@ public void testInsufficientNodeDecommissionThrowsExceptionForRatis() throws
error = decom.decommissionNodes(Arrays.asList(dns.get(1).getIpAddress(),
dns.get(2).getIpAddress(), dns.get(3).getIpAddress(), dns.get(4).getIpAddress()), false);
assertTrue(error.get(0).getHostname().contains("AllHosts"));
String errorMsg = String.format("%d IN-SERVICE HEALTHY and %d not IN-SERVICE or not HEALTHY nodes.", 5, 0);
assertTrue(error.get(0).getError().contains(errorMsg));
errorMsg = String.format("Cannot decommission as a minimum of %d IN-SERVICE HEALTHY nodes are required", 3);
assertTrue(error.get(0).getError().contains(errorMsg));
assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE,
nodeManager.getNodeStatus(dns.get(1)).getOperationalState());
assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE,
Expand Down Expand Up @@ -489,6 +493,10 @@ public void testInsufficientNodeDecommissionThrowsExceptionForEc() throws

error = decom.decommissionNodes(Arrays.asList(dns.get(1).getIpAddress()), false);
assertTrue(error.get(0).getHostname().contains("AllHosts"));
String errorMsg = String.format("%d IN-SERVICE HEALTHY and %d not IN-SERVICE or not HEALTHY nodes.", 5, 0);
assertTrue(error.get(0).getError().contains(errorMsg));
errorMsg = String.format("Cannot decommission as a minimum of %d IN-SERVICE HEALTHY nodes are required", 5);
assertTrue(error.get(0).getError().contains(errorMsg));
assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE,
nodeManager.getNodeStatus(dns.get(1)).getOperationalState());
error = decom.decommissionNodes(Arrays.asList(dns.get(1).getIpAddress()), true);
Expand Down Expand Up @@ -537,6 +545,10 @@ public void testInsufficientNodeDecommissionThrowsExceptionRatisAndEc() throws

error = decom.decommissionNodes(Arrays.asList(dns.get(1).getIpAddress()), false);
assertTrue(error.get(0).getHostname().contains("AllHosts"));
String errorMsg = String.format("%d IN-SERVICE HEALTHY and %d not IN-SERVICE or not HEALTHY nodes.", 5, 0);
assertTrue(error.get(0).getError().contains(errorMsg));
errorMsg = String.format("Cannot decommission as a minimum of %d IN-SERVICE HEALTHY nodes are required", 5);
assertTrue(error.get(0).getError().contains(errorMsg));
assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE,
nodeManager.getNodeStatus(dns.get(1)).getOperationalState());
error = decom.decommissionNodes(Arrays.asList(dns.get(1).getIpAddress()), true);
Expand Down Expand Up @@ -637,6 +649,7 @@ public void testInsufficientNodeDecommissionChecksForNNF() throws
error = decom.decommissionNodes(Arrays.asList(dns.get(0).getIpAddress(),
dns.get(1).getIpAddress(), dns.get(2).getIpAddress()), false);
assertFalse(error.get(0).getHostname().contains("AllHosts"));
assertTrue(error.get(0).getError().contains("The host was not found in SCM"));
assertEquals(HddsProtos.NodeOperationalState.DECOMMISSIONING,
nodeManager.getNodeStatus(dns.get(1)).getOperationalState());
assertEquals(HddsProtos.NodeOperationalState.DECOMMISSIONING,
Expand Down Expand Up @@ -673,6 +686,11 @@ public void testInsufficientNodeMaintenanceThrowsExceptionForRatis() throws
error = decom.startMaintenanceNodes(Arrays.asList(dns.get(1).getIpAddress(),
dns.get(2).getIpAddress(), dns.get(3).getIpAddress(), dns.get(4).getIpAddress()), 100, false);
assertTrue(error.get(0).getHostname().contains("AllHosts"));
String errorMsg = String.format("%d IN-SERVICE HEALTHY and %d not IN-SERVICE or not HEALTHY nodes.", 5, 0);
assertTrue(error.get(0).getError().contains(errorMsg));
errorMsg = String.format("Cannot enter maintenance mode as a minimum of %d IN-SERVICE HEALTHY nodes are required",
2);
assertTrue(error.get(0).getError().contains(errorMsg));
assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE,
nodeManager.getNodeStatus(dns.get(1)).getOperationalState());
assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE,
Expand Down Expand Up @@ -768,6 +786,11 @@ public void testInsufficientNodeMaintenanceThrowsExceptionForEc() throws
error = decom.startMaintenanceNodes(Arrays.asList(dns.get(1).getIpAddress(), dns.get(2).getIpAddress()),
100, false);
assertTrue(error.get(0).getHostname().contains("AllHosts"));
String errorMsg = String.format("%d IN-SERVICE HEALTHY and %d not IN-SERVICE or not HEALTHY nodes.", 5, 0);
assertTrue(error.get(0).getError().contains(errorMsg));
errorMsg = String.format("Cannot enter maintenance mode as a minimum of %d IN-SERVICE HEALTHY nodes are required",
4);
assertTrue(error.get(0).getError().contains(errorMsg));
assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE,
nodeManager.getNodeStatus(dns.get(1)).getOperationalState());
assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE,
Expand Down Expand Up @@ -869,6 +892,11 @@ public void testInsufficientNodeMaintenanceThrowsExceptionForRatisAndEc() throws
// This should not be allowed: for EC, maintenance.remaining.redundancy is 2, so 3+2=5 DNs are required
error = decom.startMaintenanceNodes(Arrays.asList(dns.get(1).getIpAddress()), 100, false);
assertTrue(error.get(0).getHostname().contains("AllHosts"));
String errorMsg = String.format("%d IN-SERVICE HEALTHY and %d not IN-SERVICE or not HEALTHY nodes.", 5, 0);
assertTrue(error.get(0).getError().contains(errorMsg));
errorMsg = String.format("Cannot enter maintenance mode as a minimum of %d IN-SERVICE HEALTHY nodes are required",
5);
assertTrue(error.get(0).getError().contains(errorMsg));
assertEquals(HddsProtos.NodeOperationalState.IN_SERVICE,
nodeManager.getNodeStatus(dns.get(1)).getOperationalState());

Expand Down

0 comments on commit 980b960

Please sign in to comment.