Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Node mark reboot helper #65

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions helpers/node-mark-online
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,12 @@ elif [[ "$NHC_RM" == "slurm" ]]; then
# Slurm does not run the HealthCheckProgram on nodes in the DOWN state,
# but if someone runs NHC by hand, we want to be able to do the right thing.
case "$STATUS" in
*'@'*|*'#'*|boot*|*-*|plnd*)
*'@'*|*'#'*|*-*|plnd*)
# These states aren't handled yet.
echo "$0: State \"$STATUS\" not yet handled; ignoring."
exit 0
;;
down*|drain*|drng*|fail*|maint*)
down*|drain*|drng*|fail*|maint*|boot*)
# If there is no old note, and we've not been told to ignore that, do not online the node.
if [[ "$OLD_NOTE_LEADER" == "none" && "$IGNORE_EMPTY_NOTE" != "1" ]]; then
echo "$0: Not onlining $HOSTNAME: No note set."
Expand Down
66 changes: 66 additions & 0 deletions helpers/node-mark-reboot
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#!/bin/bash
#
# SURFsara Node Health Check -- Node Rebooting Helper
#
# Martijn Kruiten <martijn.kruiten@surfsara.nl>
# 26 apr 2018
#

# This script is a simple scontrol wrapper that the node health check
# can run in the background to mark nodes for reboot.  It first obtains
# the current node state and reason from sinfo so that a node which is
# already offline with a note that was not set by NHC is left alone
# (unless FORCE_REBOOT=1).  If these checks pass, the node is marked
# for reboot.
#
# Usage:  <this script> <hostname> [note ...]
#
# Environment:
#   NHC_RM                Resource manager in use; only "slurm" is handled.
#   FORCE_REBOOT          Set to 1 to reboot a node that is offline with a
#                         note not set by NHC (default: 0).
#   SLURM_SINFO           sinfo command to run (default: sinfo).
#   SLURM_SCONTROL        scontrol command to run (default: scontrol).
#   SLURM_SC_REBOOT_ARGS  Arguments passed to scontrol before the reason
#                         and hostname (default: reboot ASAP NextState=DOWN).
#
# Exit status:
#   0    Nothing to do, or the node state is not handled.
#   255  No hostname given, or unsupported resource manager.
#   When a reboot is requested the script exec's scontrol, so the exit
#   status is whatever scontrol returns.

FORCE_REBOOT="${FORCE_REBOOT:-0}"
LEADER="NHC:"

echo "$(date '+%Y%m%d %H:%M:%S') $0 $*"

# Without a hostname there is nothing sensible to query or reboot.
if [[ -z "$1" ]]; then
    echo "$0: No hostname given.  Usage: $0 <hostname> [note ...]"
    exit 255
fi

HOSTNAME="$1"
shift
NOTE="$*"

### SLURM
if [[ "$NHC_RM" == "slurm" ]]; then
    SLURM_SINFO="${SLURM_SINFO:-sinfo}"
    SLURM_SCONTROL="${SLURM_SCONTROL:-scontrol}"
    SLURM_SC_REBOOT_ARGS="${SLURM_SC_REBOOT_ARGS:-reboot ASAP NextState=DOWN}"

    # Split the first line of sinfo output ("<state> <reason words...>") into
    # an array.  read -a splits on whitespace without glob-expanding the
    # reason text, and it consumes only one line, so any additional lines
    # sinfo may print (NOTE(review): presumably one per partition -- confirm)
    # cannot leak into the old note.
    # $SLURM_SINFO is deliberately unquoted so it may carry extra arguments.
    LINE=()
    # shellcheck disable=SC2086
    read -r -a LINE < <($SLURM_SINFO -o '%t %E' -hn "$HOSTNAME")
    STATUS="${LINE[0]}"
    OLD_NOTE_LEADER="${LINE[1]}"
    OLD_NOTE="${LINE[*]:2}"

    case "$STATUS" in
        alloc*|comp*|drain*|drng*|fail*|idle*|maint*|mix*|resume*|resv*|undrain*)
            case "$STATUS" in
                drain*|drng*|fail*|maint*)
                    # If the node is already offline with a note that NHC did
                    # not set, and we've not been told to ignore that, do not
                    # touch the node.
                    if [[ "$OLD_NOTE_LEADER" != "$LEADER" && "$FORCE_REBOOT" != "1" ]]; then
                        echo "$0: Not rebooting $HOSTNAME: Already offline."
                        exit 0
                    fi
                    ;;
            esac

            # If there's an old note that wasn't set by NHC, preserve it.
            # An empty leader means sinfo reported no reason at all; keep
            # the NHC note in that case rather than replacing it with nothing.
            if [[ -n "$OLD_NOTE_LEADER" && "$OLD_NOTE_LEADER" != "none" && "$OLD_NOTE_LEADER" != "$LEADER" ]]; then
                LEADER="$OLD_NOTE_LEADER"
                NOTE="$OLD_NOTE"
            fi

            # Join leader and note without a trailing space when the note is empty.
            REASON="$LEADER${NOTE:+ $NOTE}"

            echo "$0: Marking $STATUS $HOSTNAME for reboot: $REASON"
            # $SLURM_SCONTROL and $SLURM_SC_REBOOT_ARGS are deliberately
            # unquoted: they hold a command plus several arguments.
            # shellcheck disable=SC2086
            exec $SLURM_SCONTROL $SLURM_SC_REBOOT_ARGS Reason="$REASON" "$HOSTNAME"
            ;;
        down*)
            echo "$0: Not changing state of down node $HOSTNAME."
            ;;
        *)
            echo "$0: Not sure how to handle node state \"$STATUS\" on $HOSTNAME"
            ;;
    esac

### Everything else is unsupported.
else
    echo "$0: Unsupported RM detected in $0: \"$NHC_RM\""
    # Same status the shell produced for the former "exit -1".
    exit 255
fi
exit 0