Skip to content

Commit

Permalink
Numerous improvements to Slurm reboot handling and planned node state
Browse files Browse the repository at this point in the history
  • Loading branch information
treydock committed Apr 13, 2022
1 parent d534d41 commit 869c4ad
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 6 deletions.
48 changes: 45 additions & 3 deletions helpers/node-mark-offline
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,13 @@ elif [[ "$NHC_RM" == "slurm" ]]; then
OLD_NOTE_LEADER="${LINE[1]}"
OLD_NOTE="${LINE[*]:2}"
case "$STATUS" in
*'@'*|*'#'*|boot*|*-*|plnd*)
*'@'*|*'#'*|*-*)
# These states aren't handled yet.
echo "$0: State \"$STATUS\" not yet handled; ignoring."
exit 0
;;
alloc*|comp*|drain*|drng*|fail*|idle*|maint*|mix*|resume*|resv*|undrain*)
# Node states: src/common/slurm_protocol_defs.c --> node_state_string()
alloc*|boot*|comp*|drain*|drng*|fail*|idle*|maint*|mix*|plnd*|resume*|resv*|undrain*)
case "$STATUS" in
drain*|drng*|fail*|maint*)
# If the node is already offline, and there is no old note, and
Expand All @@ -77,10 +78,51 @@ elif [[ "$NHC_RM" == "slurm" ]]; then
echo "$0: Not offlining $HOSTNAME: Already offline with no note set."
exit 0
fi
if [[ "$OLD_NOTE_LEADER" == "Reboot" && "$OLD_NOTE" == "ASAP" ]]; then
echo "$0: Not offlining $HOSTNAME: Pending reboot."
exit 0
fi
;;
boot*)
    # Node state string starts with "boot" (see
    # src/common/slurm_protocol_defs.c --> node_state_string()).
    #
    # Offline the node after reboot if vanilla `scontrol reboot` was
    # called, so jobs can't run until NHC onlines the node.
    # Note: This won't happen while the node is waiting to boot,
    # because $STATUS would show MIX@ or ALLOC@, not BOOT.
    #
    # $SLURM_SCONTROL is intentionally left unquoted so it may carry
    # extra arguments; $HOSTNAME is quoted so it is always passed as a
    # single word.
    SHOW_NODE_OUTPUT="$($SLURM_SCONTROL show node "$HOSTNAME")"
    if [[ "$SHOW_NODE_OUTPUT" == *"State=REBOOT"* ]]; then
        MSG="Temporarily offlining $HOSTNAME after reboot until NHC can online it"
        echo "$0: $MSG"
        $SLURM_SCONTROL update State=DRAIN NodeName="$HOSTNAME" Reason="$LEADER $MSG"
        exit 0
    fi

    # If `Reboot ASAP` has been cleared, then the node is already set
    # to stay in DRAIN until NHC onlines it, so exit.
    # "Cleared" means the reason is anything other than exactly
    # `Reboot ASAP`, i.e. NOT (leader == "Reboot" AND note == "ASAP"),
    # which is (leader != "Reboot" OR note != "ASAP").  Using && here
    # would wrongly treat reasons such as "Reboot pending" or
    # "fix ASAP" as if they were still `Reboot ASAP`.
    if [[ "$OLD_NOTE_LEADER" != "Reboot" || "$OLD_NOTE" != "ASAP" ]]; then
        echo "$0: $HOSTNAME already set to remain offline after reboot until NHC onlines it"
        exit 0
    fi
    ;;
*"*")
# $STATUS ends with a literal "*" (the quoted "*" in the pattern is not a
# glob).  NOTE(review): presumably this is Slurm's "not responding" suffix,
# as the message below suggests -- confirm against node_state_string().
#
# Read /proc/uptime and split it into words; the first word is the time
# since boot in seconds with a fractional part.
UPTIME=( $(cat /proc/uptime) )
# Strip the fractional part so the value can be used in integer arithmetic.
UPTIME_SEC="${UPTIME[0]%.*}"
# If the node booted less than 600 seconds (10 minutes) ago, leave its
# state alone and exit successfully instead of offlining it.
if (( UPTIME_SEC < 600 )); then
echo "$0: Node not responding, unexpectedly rebooted, ignoring"
exit 0
fi
;;
esac
# `scontrol reboot asap` will set the node state to REBOOT+DRAIN and
# reason to `Reboot ASAP`. Then, after boot, and after NHC runs
# once, Slurm will set the node base state to IDLE. If reason ==
# `Reboot ASAP`, Slurm will also clear the DRAIN flag. We want
# NHC to clear the DRAIN flag, not Slurm, so delete the
# `Reboot ASAP` reason by not preserving it below.
# See https://slurm.schedmd.com/scontrol.html --> reboot

# If there's an old note that wasn't set by NHC, preserve it.
if [[ "$OLD_NOTE_LEADER" != "none" && "$OLD_NOTE_LEADER" != "$LEADER" ]]; then
# Preserve an existing note unless there is none, NHC itself set it
# (leader equals $LEADER), or it is exactly Slurm's `Reboot ASAP` reason.
# `Reboot ASAP` must be dropped so that NHC, not Slurm, clears the DRAIN
# flag after the reboot.  The exclusion is written as
# NOT (leader == "Reboot" AND note == "ASAP"); testing the two words
# independently with && would also discard unrelated notes such as
# "Reboot needed for kernel update" or "replace ASAP".
if [[ "$OLD_NOTE_LEADER" != "none" && "$OLD_NOTE_LEADER" != "$LEADER" ]] \
    && ! [[ "$OLD_NOTE_LEADER" == "Reboot" && "$OLD_NOTE" == "ASAP" ]]; then
    LEADER="$OLD_NOTE_LEADER"
    NOTE="$OLD_NOTE"
fi
Expand Down
11 changes: 8 additions & 3 deletions helpers/node-mark-online
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,17 @@ elif [[ "$NHC_RM" == "slurm" ]]; then
# Slurm does not run the HealthCheckProgram on nodes in the DOWN state,
# but if someone runs NHC by hand, we want to be able to do the right thing.
case "$STATUS" in
*'@'*|*'#'*|boot*|*-*|plnd*)
*'#'*|*-*)
# These states aren't handled yet.
echo "$0: State \"$STATUS\" not yet handled; ignoring."
exit 0
;;
down*|drain*|drng*|fail*|maint*)
*@)
# Onlining a node will cancel a pending reboot, so prevent this
echo "$0: Not onlining $HOSTNAME: Reboot is pending."
exit 0
;;
boot*|down*|drain*|drng*|fail*|maint*)
# If there is no old note, and we've not been told to ignore that, do not online the node.
if [[ "$OLD_NOTE_LEADER" == "none" && "$IGNORE_EMPTY_NOTE" != "1" ]]; then
echo "$0: Not onlining $HOSTNAME: No note set."
Expand All @@ -79,7 +84,7 @@ elif [[ "$NHC_RM" == "slurm" ]]; then
echo "$0: Marking $HOSTNAME online and clearing note ($OLD_NOTE_LEADER $OLD_NOTE)"
exec $SLURM_SCONTROL $SLURM_SC_ONLINE_ARGS NodeName=$HOSTNAME
;;
alloc*|comp*|idle*|mix*|resume*|resv*|undrain*)
alloc*|comp*|idle*|mix*|plnd*|resume*|resv*|undrain*)
# Node is already online.
echo "$0: Node $HOSTNAME is already online."
;;
Expand Down

0 comments on commit 869c4ad

Please sign in to comment.