diff --git a/helpers/node-mark-offline b/helpers/node-mark-offline index f98adc9..209c328 100644 --- a/helpers/node-mark-offline +++ b/helpers/node-mark-offline @@ -63,7 +63,8 @@ elif [[ "$NHC_RM" == "slurm" ]]; then OLD_NOTE_LEADER="${LINE[1]}" OLD_NOTE="${LINE[*]:2}" case "$STATUS" in - alloc*|comp*|drain*|drng*|fail*|idle*|maint*|mix*|resume*|resv*|undrain*) + # Node states: src/common/slurm_protocol_defs.c --> node_state_string() + alloc*|comp*|drain*|drng*|fail*|idle*|maint*|mix*|resume*|resv*|undrain*|boot*) case "$STATUS" in drain*|drng*|fail*|maint*) # If the node is already offline, and there is no old note, and @@ -73,9 +74,38 @@ elif [[ "$NHC_RM" == "slurm" ]]; then exit 0 fi ;; + boot*) + # Offline node after reboot if vanilla `scontrol reboot` was + # called, so jobs can't run until NHC onlines the node. + # Note: This won't happen while node is waiting to boot, + # because $STATUS would show MIX@ or ALLOC@, not BOOT. + # See src/common/slurm_protocol_defs.c-->node_state_string() + SHOW_NODE_OUTPUT="$($SLURM_SCONTROL show node $HOSTNAME)" + if [[ $SHOW_NODE_OUTPUT == *"State=REBOOT "* ]]; then + MSG="Temporarily offlining $HOSTNAME after reboot until NHC can online it" + echo "$0: $MSG" + $SLURM_SCONTROL update State=DRAIN NodeName=$HOSTNAME Reason="$LEADER $MSG" + exit 0 + fi + + # If `Reboot ASAP` has been cleared, then the node is + # already set to stay in DRAIN until NHC onlines it, so exit + if [[ "$OLD_NOTE_LEADER" != "Reboot" && "$OLD_NOTE" != "ASAP" ]]; then + echo "$0: $HOSTNAME already set to remain offline after reboot until NHC onlines it" + exit 0 + fi + ;; esac + # `scontrol reboot asap` will set the node state to REBOOT+DRAIN and + # reason to `Reboot ASAP`. Then, after boot, and after NHC runs + # once, Slurm will set the node base state to IDLE. If reason == + # `Reboot ASAP`, Slurm will also clear the DRAIN flag. We want + # NHC to clear the DRAIN flag, not Slurm, so delete the + # `Reboot ASAP` reason by not preserving it below. + # See https://slurm.schedmd.com/scontrol.html --> reboot + # If there's an old note that wasn't set by NHC, preserve it. - if [[ "$OLD_NOTE_LEADER" != "none" && "$OLD_NOTE_LEADER" != "$LEADER" ]]; then + if [[ "$OLD_NOTE_LEADER" != "none" && "$OLD_NOTE_LEADER" != "$LEADER" && "$OLD_NOTE_LEADER" != "Reboot" && "$OLD_NOTE" != "ASAP" ]]; then LEADER="$OLD_NOTE_LEADER" NOTE="$OLD_NOTE" fi diff --git a/helpers/node-mark-online b/helpers/node-mark-online index a202787..897d6cf 100644 --- a/helpers/node-mark-online +++ b/helpers/node-mark-online @@ -60,7 +60,12 @@ elif [[ "$NHC_RM" == "slurm" ]]; then # Slurm does not run the HealthCheckProgram on nodes in the DOWN state, # but if someone runs NHC by hand, we want to be able to do the right thing. case "$STATUS" in - down*|drain*|drng*|fail*|maint*) + *@) + # Onlining a node will cancel a pending reboot, so prevent this + echo "$0: Not onlining $HOSTNAME: Reboot is pending." + exit 0 + ;; + down*|drain*|drng*|fail*|maint*|boot*) # If there is no old note, and we've not been told to ignore that, do not online the node. if [[ "$OLD_NOTE_LEADER" == "none" && "$IGNORE_EMPTY_NOTE" != "1" ]]; then echo "$0: Not onlining $HOSTNAME: No note set."