From 751a6b3ac1121d93a7c9aa2dc4e3edb9cb079cc2 Mon Sep 17 00:00:00 2001
From: Martijn Kruiten
Date: Mon, 1 Oct 2018 15:56:30 +0200
Subject: [PATCH 1/3] Node mark reboot helper

Add helpers/node-mark-reboot, a helper that the node health check can
run in the background to mark a node for reboot.

Only Slurm (NHC_RM=slurm) is supported.  The node state is read with
sinfo; nodes in an alloc, comp, idle, mix, resume, resv or undrain state
are handed to "scontrol reboot ASAP".  Nodes that are down, drained,
draining, failed or in maintenance are left untouched.  Any other
resource manager is rejected with a non-zero exit status.

---
 helpers/node-mark-reboot | 48 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100755 helpers/node-mark-reboot

diff --git a/helpers/node-mark-reboot b/helpers/node-mark-reboot
new file mode 100755
index 0000000..de692b4
--- /dev/null
+++ b/helpers/node-mark-reboot
@@ -0,0 +1,48 @@
+#!/bin/bash
+#
+# SURFsara Node Health Check -- Node Rebooting Helper
+#
+# Martijn Kruiten
+# 26 apr 2018
+#
+
+# This script is a simple wrapper that the node health check can run
+# in the background to mark nodes for reboot. It will first obtain
+# the current node state information to avoid rebooting a node that
+# is already offline or in maintenance. If these checks pass, the
+# node is marked for reboot.
+
+IGNORE_EMPTY_NOTE="${IGNORE_EMPTY_NOTE:-0}"
+LEADER="NHC:"
+
+echo "`date '+%Y%m%d %H:%M:%S'` $0 $*"
+
+HOSTNAME="$1"
+shift
+NOTE="$*"
+
+### SLURM
+if [[ "$NHC_RM" == "slurm" ]]; then
+    SLURM_SINFO="${SLURM_SINFO:-sinfo}"
+    SLURM_SCONTROL="${SLURM_SCONTROL:-scontrol}"
+    SLURM_SC_OFFLINE_ARGS="${SLURM_SC_OFFLINE_ARGS:-reboot ASAP}"
+
+    LINE=( $($SLURM_SINFO -o '%t %E' -hn $HOSTNAME) )
+    STATUS="${LINE[0]}"
+    case "$STATUS" in
+        alloc*|comp*|idle*|mix*|resume*|resv*|undrain*)
+            echo "$0: Marking $STATUS $HOSTNAME for reboot: $LEADER $NOTE"
+            exec $SLURM_SCONTROL $SLURM_SC_OFFLINE_ARGS $HOSTNAME
+            ;;
+        down*|drain*|drng*|fail*|maint*)
+            echo "$0: Not changing state of down node $HOSTNAME."
+            ;;
+        *) echo "$0: Not sure how to handle node state \"$STATUS\" on $HOSTNAME" ;;
+    esac
+
+### Everything else is unsupported.
+else
+    echo "$0: Unsupported RM detected in $0: \"$NHC_RM\""
+    exit -1
+fi
+exit 0
From 8de1ea721170bcbdced7708dbcf85c18e75991e4 Mon Sep 17 00:00:00 2001
From: Martijn Kruiten
Date: Fri, 22 May 2020 15:59:12 +0200
Subject: [PATCH 2/3] Update to newest version that we use internally

Replace the IGNORE_EMPTY_NOTE setting, which the script never read, with
FORCE_REBOOT, and rename SLURM_SC_OFFLINE_ARGS to SLURM_SC_REBOOT_ARGS
(default now "reboot ASAP NextState=RESUME").

Drained, draining, failed and maintenance nodes are no longer skipped
unconditionally: they are rebooted when the leader of their current
reason is "NHC:" or when FORCE_REBOOT=1, and otherwise reported as
already offline.  Nodes in a down state are still left alone.

If the node carries a reason that was not set by NHC, that reason is
passed on as the Reason= of the reboot request and the request uses
NextState=DOWN.

---
 helpers/node-mark-reboot | 39 +++++++++++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 10 deletions(-)

diff --git a/helpers/node-mark-reboot b/helpers/node-mark-reboot
index de692b4..828bc9c 100755
--- a/helpers/node-mark-reboot
+++ b/helpers/node-mark-reboot
@@ -6,13 +6,13 @@
 # 26 apr 2018
 #
 
-# This script is a simple wrapper that the node health check can run
-# in the background to mark nodes for reboot. It will first obtain
-# the current node state information to avoid rebooting a node that
-# is already offline or in maintenance. If these checks pass, the
-# node is marked for reboot.
+# This script is a simple scontrol wrapper that the node health check
+# can run in the background to mark nodes for reboot. It will first
+# obtain the current node state information to avoid rebooting a node
+# that is already offline or in maintenance. If these checks pass,
+# the node is marked for reboot.
 
-IGNORE_EMPTY_NOTE="${IGNORE_EMPTY_NOTE:-0}"
+FORCE_REBOOT="${FORCE_REBOOT:-0}"
 LEADER="NHC:"
 
 echo "`date '+%Y%m%d %H:%M:%S'` $0 $*"
@@ -25,16 +25,34 @@ NOTE="$*"
 if [[ "$NHC_RM" == "slurm" ]]; then
     SLURM_SINFO="${SLURM_SINFO:-sinfo}"
     SLURM_SCONTROL="${SLURM_SCONTROL:-scontrol}"
-    SLURM_SC_OFFLINE_ARGS="${SLURM_SC_OFFLINE_ARGS:-reboot ASAP}"
+    SLURM_SC_REBOOT_ARGS="${SLURM_SC_REBOOT_ARGS:-reboot ASAP NextState=RESUME}"
 
     LINE=( $($SLURM_SINFO -o '%t %E' -hn $HOSTNAME) )
     STATUS="${LINE[0]}"
+    OLD_NOTE_LEADER="${LINE[1]}"
+    OLD_NOTE="${LINE[*]:2}"
     case "$STATUS" in
-        alloc*|comp*|idle*|mix*|resume*|resv*|undrain*)
+        alloc*|comp*|drain*|drng*|fail*|idle*|maint*|mix*|resume*|resv*|undrain*)
+            case "$STATUS" in
+                drain*|drng*|fail*|maint*)
+                    # If the node is already offline, and we've not been told to ignore that,
+                    # do not touch the node.
+                    if [[ "$OLD_NOTE_LEADER" != "$LEADER" && "$FORCE_REBOOT" != "1" ]]; then
+                        echo "$0: Not rebooting $HOSTNAME: Already offline."
+                        exit 0
+                    fi
+                    ;;
+            esac
+            # If there's an old note that wasn't set by NHC, preserve it.
+            if [[ "$OLD_NOTE_LEADER" != "none" && "$OLD_NOTE_LEADER" != "$LEADER" ]]; then
+                LEADER="$OLD_NOTE_LEADER"
+                NOTE="$OLD_NOTE"
+                SLURM_SC_REBOOT_ARGS="reboot ASAP NextState=DOWN"
+            fi
             echo "$0: Marking $STATUS $HOSTNAME for reboot: $LEADER $NOTE"
-            exec $SLURM_SCONTROL $SLURM_SC_OFFLINE_ARGS $HOSTNAME
+            exec $SLURM_SCONTROL $SLURM_SC_REBOOT_ARGS Reason="$LEADER $NOTE" $HOSTNAME
             ;;
-        down*|drain*|drng*|fail*|maint*)
+        down*)
             echo "$0: Not changing state of down node $HOSTNAME."
             ;;
         *) echo "$0: Not sure how to handle node state \"$STATUS\" on $HOSTNAME" ;;
@@ -46,3 +64,4 @@ else
     exit -1
 fi
 exit 0
+
From eebeed7628cc44dcfb950a46c8227536fe379eea Mon Sep 17 00:00:00 2001
From: Martijn Kruiten
Date: Tue, 22 Mar 2022 13:58:14 +0100
Subject: [PATCH 3/3] Rebased on upstream master branch

node-mark-online: move the boot* state out of the "not yet handled"
branch and into the branch that handles down, drain, drng, fail and
maint nodes.

node-mark-reboot: change the default SLURM_SC_REBOOT_ARGS from
NextState=RESUME to NextState=DOWN, and stop overriding
SLURM_SC_REBOOT_ARGS when a note that was not set by NHC is preserved,
so a value supplied through the environment is always used.  The file
mode changes from 100755 to 100644.

---
 helpers/node-mark-online | 4 ++--
 helpers/node-mark-reboot | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)
 mode change 100755 => 100644 helpers/node-mark-reboot

diff --git a/helpers/node-mark-online b/helpers/node-mark-online
index 4af85ad..ab612cc 100644
--- a/helpers/node-mark-online
+++ b/helpers/node-mark-online
@@ -60,12 +60,12 @@ elif [[ "$NHC_RM" == "slurm" ]]; then
     # Slurm does not run the HealthCheckProgram on nodes in the DOWN state,
     # but if someone runs NHC by hand, we want to be able to do the right thing.
     case "$STATUS" in
-        *'@'*|*'#'*|boot*|*-*|plnd*)
+        *'@'*|*'#'*|*-*|plnd*)
             # These states aren't handled yet.
             echo "$0: State \"$STATUS\" not yet handled; ignoring."
             exit 0
             ;;
-        down*|drain*|drng*|fail*|maint*)
+        down*|drain*|drng*|fail*|maint*|boot*)
             # If there is no old note, and we've not been told to ignore that, do not online the node.
             if [[ "$OLD_NOTE_LEADER" == "none" && "$IGNORE_EMPTY_NOTE" != "1" ]]; then
                 echo "$0: Not onlining $HOSTNAME: No note set."
diff --git a/helpers/node-mark-reboot b/helpers/node-mark-reboot
old mode 100755
new mode 100644
index 828bc9c..c55650c
--- a/helpers/node-mark-reboot
+++ b/helpers/node-mark-reboot
@@ -25,7 +25,7 @@ NOTE="$*"
 if [[ "$NHC_RM" == "slurm" ]]; then
     SLURM_SINFO="${SLURM_SINFO:-sinfo}"
     SLURM_SCONTROL="${SLURM_SCONTROL:-scontrol}"
-    SLURM_SC_REBOOT_ARGS="${SLURM_SC_REBOOT_ARGS:-reboot ASAP NextState=RESUME}"
+    SLURM_SC_REBOOT_ARGS="${SLURM_SC_REBOOT_ARGS:-reboot ASAP NextState=DOWN}"
 
     LINE=( $($SLURM_SINFO -o '%t %E' -hn $HOSTNAME) )
     STATUS="${LINE[0]}"
@@ -47,7 +47,6 @@ if [[ "$NHC_RM" == "slurm" ]]; then
             if [[ "$OLD_NOTE_LEADER" != "none" && "$OLD_NOTE_LEADER" != "$LEADER" ]]; then
                 LEADER="$OLD_NOTE_LEADER"
                 NOTE="$OLD_NOTE"
-                SLURM_SC_REBOOT_ARGS="reboot ASAP NextState=DOWN"
             fi
             echo "$0: Marking $STATUS $HOSTNAME for reboot: $LEADER $NOTE"
             exec $SLURM_SCONTROL $SLURM_SC_REBOOT_ARGS Reason="$LEADER $NOTE" $HOSTNAME