diff --git a/helpers/node-mark-online b/helpers/node-mark-online index 4af85ad..ab612cc 100644 --- a/helpers/node-mark-online +++ b/helpers/node-mark-online @@ -60,12 +60,12 @@ elif [[ "$NHC_RM" == "slurm" ]]; then # Slurm does not run the HealthCheckProgram on nodes in the DOWN state, # but if someone runs NHC by hand, we want to be able to do the right thing. case "$STATUS" in - *'@'*|*'#'*|boot*|*-*|plnd*) + *'@'*|*'#'*|*-*|plnd*) # These states aren't handled yet. echo "$0: State \"$STATUS\" not yet handled; ignoring." exit 0 ;; - down*|drain*|drng*|fail*|maint*) + down*|drain*|drng*|fail*|maint*|boot*) # If there is no old note, and we've not been told to ignore that, do not online the node. if [[ "$OLD_NOTE_LEADER" == "none" && "$IGNORE_EMPTY_NOTE" != "1" ]]; then echo "$0: Not onlining $HOSTNAME: No note set." diff --git a/helpers/node-mark-reboot b/helpers/node-mark-reboot new file mode 100644 index 0000000..c55650c --- /dev/null +++ b/helpers/node-mark-reboot @@ -0,0 +1,66 @@ +#!/bin/bash +# +# SURFsara Node Health Check -- Node Rebooting Helper +# +# Martijn Kruiten +# 26 apr 2018 +# + +# This script is a simple scontrol wrapper that the node health check +# can run in the background to mark nodes for reboot. It will first +# obtain the current node state information to avoid rebooting a node +# that is already offline or in maintenance. If these checks pass, +# the node is marked for reboot. 
# Usage: node-mark-reboot <hostname> [note ...]
#
# Environment:
#   NHC_RM                Resource manager in use; only "slurm" is supported.
#   FORCE_REBOOT          Set to 1 to reboot a node that is offline with a
#                         note that was not set by NHC (default: 0).
#   SLURM_SINFO           Command used to query the node (default: sinfo).
#   SLURM_SCONTROL        Command used to request the reboot (default: scontrol).
#   SLURM_SC_REBOOT_ARGS  Arguments placed before Reason=
#                         (default: "reboot ASAP NextState=DOWN").
#
# Exit status: 0 when the node is left alone, 1 when no hostname was given,
# 255 for an unsupported resource manager.  When a reboot is requested the
# shell is replaced by the scontrol command (exec), so its status is returned.

FORCE_REBOOT="${FORCE_REBOOT:-0}"
LEADER="NHC:"

# Log the invocation with a timestamp.
echo "$(date '+%Y%m%d %H:%M:%S') $0 $*"

# Refuse to continue without a node name: every command below is targeted
# at "$HOSTNAME", and an empty target must never reach sinfo or scontrol.
if [[ $# -lt 1 || -z "$1" ]]; then
    echo "$0: Usage: $0 <hostname> [note...]"
    exit 1
fi

HOSTNAME="$1"
shift
NOTE="$*"

### SLURM
if [[ "$NHC_RM" == "slurm" ]]; then
    SLURM_SINFO="${SLURM_SINFO:-sinfo}"
    SLURM_SCONTROL="${SLURM_SCONTROL:-scontrol}"
    SLURM_SC_REBOOT_ARGS="${SLURM_SC_REBOOT_ARGS:-reboot ASAP NextState=DOWN}"

    # The three settings above may each hold a command plus arguments, so
    # they are split on whitespace into arrays.  "read -a" splits without
    # performing pathname expansion; -d '' makes it consume the whole value
    # (its non-zero status at end of input is expected and ignored).
    read -r -d '' -a SINFO_CMD <<< "$SLURM_SINFO"
    read -r -d '' -a SCONTROL_CMD <<< "$SLURM_SCONTROL"
    read -r -d '' -a REBOOT_ARGS <<< "$SLURM_SC_REBOOT_ARGS"

    # Expected output: "<state> <reason words...>".  Only the first line is
    # parsed, so that if sinfo prints several lines for the node (presumably
    # one per partition -- verify against the sinfo documentation) the later
    # lines do not leak into the note.  The reason text is split into words
    # without glob expansion.
    LINE=()
    read -r -a LINE < <("${SINFO_CMD[@]}" -o '%t %E' -hn "$HOSTNAME")
    STATUS="${LINE[0]}"
    OLD_NOTE_LEADER="${LINE[1]}"
    OLD_NOTE="${LINE[*]:2}"

    case "$STATUS" in
        alloc*|comp*|drain*|drng*|fail*|idle*|maint*|mix*|resume*|resv*|undrain*)
            case "$STATUS" in
                drain*|drng*|fail*|maint*)
                    # If the node is already offline, and we've not been told
                    # to ignore that, do not touch the node.
                    if [[ "$OLD_NOTE_LEADER" != "$LEADER" && "$FORCE_REBOOT" != "1" ]]; then
                        echo "$0: Not rebooting $HOSTNAME: Already offline."
                        exit 0
                    fi
                    ;;
            esac
            # If there's an old note that wasn't set by NHC, preserve it.
            if [[ "$OLD_NOTE_LEADER" != "none" && "$OLD_NOTE_LEADER" != "$LEADER" ]]; then
                LEADER="$OLD_NOTE_LEADER"
                NOTE="$OLD_NOTE"
            fi
            echo "$0: Marking $STATUS $HOSTNAME for reboot: $LEADER $NOTE"
            exec "${SCONTROL_CMD[@]}" "${REBOOT_ARGS[@]}" Reason="$LEADER $NOTE" "$HOSTNAME"
            ;;
        down*)
            echo "$0: Not changing state of down node $HOSTNAME."
            ;;
        *)
            echo "$0: Not sure how to handle node state \"$STATUS\" on $HOSTNAME"
            ;;
    esac

### Everything else is unsupported.
else
    echo "$0: Unsupported RM detected in $0: \"$NHC_RM\""
    # The original "exit -1" is outside the valid 0-255 range; bash reports
    # it as 255, so that value is spelled out to keep the observed status.
    exit 255
fi
exit 0