From f23e98396529d7c4328d4ecdcee89eeed098c4a3 Mon Sep 17 00:00:00 2001 From: Trey Dockendorf Date: Thu, 1 Nov 2018 14:41:51 -0400 Subject: [PATCH 1/6] Add GPFS health check --- scripts/osc_gpfs.nhc | 121 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) create mode 100644 scripts/osc_gpfs.nhc diff --git a/scripts/osc_gpfs.nhc b/scripts/osc_gpfs.nhc new file mode 100644 index 0000000..3b4098e --- /dev/null +++ b/scripts/osc_gpfs.nhc @@ -0,0 +1,121 @@ +# OSC NHC - GPFS checks +# +# Trey Dockendorf +# 1 November 2018 +# + +GPFS_COMPONENT=() +GPFS_ENTITY=() +GPFS_STATUS=() +export GPFS_COMPONENT GPFS_ENTITY GPFS_STATUS + +function nhc_gpfs_health_gather_data() { + local LINE_CNT + local -a FIELD + + GPFS_COMPONENT=() GPFS_ENTITY=() GPFS_STATUS=() + + ((LINE_CNT=0)) + while IFS=: read -a FIELD; do + if [[ "${FIELD[2]}" == "HEADER" ]]; then + continue + fi + if [[ "${FIELD[1]}" == "Event" ]]; then + continue + fi + if [[ "${FIELD[9]}" != "NODE" ]]; then + continue + fi + GPFS_COMPONENT[$LINE_CNT]="${FIELD[7]}" + GPFS_ENTITY[$LINE_CNT]="${FIELD[8]}" + GPFS_STATUS[$LINE_CNT]="${FIELD[10]}" + dbg "Got GPFS health ${GPFS_COMPONENT[$LINE_CNT]} ${GPFS_ENTITY[$LINE_CNT]} ${GPFS_STATUS[$LINE_CNT]}" + ((LINE_CNT++)) + done < <(/usr/lpp/mmfs/bin/mmhealth node show -Y) + + export GPFS_COMPONENT GPFS_ENTITY GPFS_STATUS +} + +# Checks GPFS health for a given component: fails on the first matching component whose status is not HEALTHY (with -a, reports every unhealthy match before failing) +# check_gpfs_health [-0] [-a] [-l] [-s] [-e <action>] <component> +function check_gpfs_health() { + local NONFATAL=0 ALL=0 LOG=0 SYSLOG=0 ACTION="" + local THIS_COMPONENT THIS_ENTITY THIS_STATUS MSG i COMPONENT + + if [[ ${#GPFS_COMPONENT[*]} -eq 0 ]]; then + nhc_gpfs_health_gather_data + fi + + OPTIND=1 + while getopts ":0alse:" OPTION ; do + case "$OPTION" in + 0) NONFATAL=1 ;; + a) ALL=1 ;; + l) LOG=1 ;; + s) SYSLOG=1 ;; + e) ACTION="$OPTARG" ;; + :) die 1 "$CHECK: Option -$OPTARG requires an argument." ; return 1 ;; + \?) 
die 1 "$CHECK: Invalid option: -$OPTARG" ; return 1 ;; + esac + done + shift $((OPTIND-1)) + COMPONENT="$1" + if [[ -z "$COMPONENT" ]]; then + die 1 "$CHECK: Syntax error: Must provide component to check." ; return 1 + fi + dbg "Looking for GPFS health component \"$COMPONENT\"" + for ((i=0; i < ${#GPFS_COMPONENT[*]}; i++)); do + THIS_COMPONENT="${GPFS_COMPONENT[$i]}" + THIS_ENTITY="${GPFS_ENTITY[$i]}" + THIS_STATUS="${GPFS_STATUS[$i]}" + dbg "CHECKING \"$THIS_COMPONENT\" vs. \"$COMPONENT\"" + if ! mcheck "$THIS_COMPONENT" "$COMPONENT"; then + continue + fi + dbg "Matching GPFS health found: $THIS_COMPONENT: entity=$THIS_ENTITY status=$THIS_STATUS" + if [[ "$THIS_STATUS" == "HEALTHY" ]]; then + continue + else + MSG="$CHECK: GPFS health for \"$THIS_COMPONENT\" is $THIS_STATUS" + fi + # We have a winner. Or loser, as the case may be. + if [[ "$LOG" == "1" ]]; then + log "$MSG" + fi + if [[ "$SYSLOG" == "1" ]]; then + syslog "$MSG" + fi + if [[ "$ACTION" != "" ]]; then + ${SHELL:-/bin/bash} -c "$ACTION" & + fi + if [[ $ALL -ge 1 ]]; then + if [[ -n "$MSG" ]]; then + log "$MSG ($ALL)" + fi + ((ALL++)) + continue + elif [[ $NONFATAL == 1 ]]; then + if [[ -n "$MSG" ]]; then + log "$MSG (non-fatal)" + fi + return 0 + fi + die 1 "$MSG" + return 1 + done + # -a (all) does not necessarily imply -0 (non-fatal). A value of 1 for $ALL + # means -a was passed in but no errors were found. 2 or above is an error. + if [[ $ALL -gt 1 ]]; then + # We had at least 1 flagged component. Fail unless we're also non-fatal. 
+ if [[ $NONFATAL == 1 ]]; then + if [[ -n "$MSG" ]]; then + log "$MSG (non-fatal)" + fi + return 0 + fi + ((ALL--)) + die $ALL "$MSG (last of $ALL)" + return $ALL + fi + return 0 +} From 1c9ef42d766f4b8aa0949ddf6eac25cbe0f722bd Mon Sep 17 00:00:00 2001 From: Trey Dockendorf Date: Mon, 5 Nov 2018 13:11:55 -0500 Subject: [PATCH 2/6] Allow path to mmhealth to be configured --- scripts/osc_gpfs.nhc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/osc_gpfs.nhc b/scripts/osc_gpfs.nhc index 3b4098e..02b1869 100644 --- a/scripts/osc_gpfs.nhc +++ b/scripts/osc_gpfs.nhc @@ -7,7 +7,8 @@ GPFS_COMPONENT=() GPFS_ENTITY=() GPFS_STATUS=() -export GPFS_COMPONENT GPFS_ENTITY GPFS_STATUS +MMHEALTH="${MMHEALTH:-/usr/lpp/mmfs/bin/mmhealth}" +export GPFS_COMPONENT GPFS_ENTITY GPFS_STATUS MMHEALTH function nhc_gpfs_health_gather_data() { local LINE_CNT @@ -31,7 +32,7 @@ function nhc_gpfs_health_gather_data() { GPFS_STATUS[$LINE_CNT]="${FIELD[10]}" dbg "Got GPFS health ${GPFS_COMPONENT[$LINE_CNT]} ${GPFS_ENTITY[$LINE_CNT]} ${GPFS_STATUS[$LINE_CNT]}" ((LINE_CNT++)) - done < <(/usr/lpp/mmfs/bin/mmhealth node show -Y) + done < <($MMHEALTH node show -Y) export GPFS_COMPONENT GPFS_ENTITY GPFS_STATUS } From 986e63f46d5703756a05ac3b3d669afe75e4b800 Mon Sep 17 00:00:00 2001 From: Trey Dockendorf Date: Mon, 5 Nov 2018 13:22:28 -0500 Subject: [PATCH 3/6] Add check_gpfs_health to README --- README.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/README.md b/README.md index 0b375e0..4173907 100644 --- a/README.md +++ b/README.md @@ -452,6 +452,7 @@ The table below provides a list of the configuration variables which may be used | MCELOG_MAX_CORRECTED_RATE | `9` | Maximum number of **corrected** MCEs allowed before `check_hw_mcelog()` returns failure | | MCELOG_MAX_UNCORRECTED_RATE | `0` | Maximum number of **uncorrected** MCEs allowed before `check_hw_mcelog()` returns failure | | MDIAG_CMD | `mdiag` | Command to use to invoke Moab's `mdiag` 
command (may include path) | +| MMHEALTH | `/usr/lpp/mmfs/bin/mmhealth` | Command to use to invoke the GPFS `mmhealth` command | | *NAME | `nhc` | Used to populate default paths/filenames for configuration | | NHC_AUTH_USERS | `root nobody` | Users authorized to have arbitrary processes running on compute nodes | | NHC_CHECK_ALL | `0` | Forces all checks to be non-fatal. Displays each failure message, reports total number of failed checks, and returns that number. | @@ -768,6 +769,23 @@ _**Example**_: `check_fs_size /tmp 512m 4g` _**Example**_: `check_fs_used / 98%` +
+ +##### check_gpfs_health +`check_gpfs_health [-0] [-a] [-l] [-s] [-e <action>] <component>` + +Checks the health of a GPFS component. The value for _component_ must match a component reported by `mmhealth node show`. + +| **Check Option** | **Purpose** | +| ---------------- | ----------- | +| `-0` | Non-fatal. Failure of this check will be ignored. | +| `-a` | Find, report, and act on all matching unhealthy components. Default behavior is to fail the check after the first matching unhealthy component. | +| `-l` | Log unhealthy component (or components, if used with `-a`) to NHC log (`$LOGFILE`). | +| `-s` | Log unhealthy component (or components, if used with `-a`) to the syslog. | +| `-e`_`action`_ | Execute `/bin/bash -c`_`action`_ if component is NOT healthy. | + +_**Example**_: `check_gpfs_health NETWORK` +
From b6775cf33db517da637a05e02cbb93f119b1693b Mon Sep 17 00:00:00 2001 From: Trey Dockendorf Date: Wed, 12 Dec 2018 12:08:19 -0500 Subject: [PATCH 4/6] Add check_gpfs_verbs_status check --- README.md | 17 ++++++++++++++ scripts/osc_gpfs.nhc | 53 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 4173907..27980e3 100644 --- a/README.md +++ b/README.md @@ -771,6 +771,7 @@ _**Example**_: `check_fs_used / 98%`
+ ##### check_gpfs_health `check_gpfs_health [-0] [-a] [-l] [-s] [-e <action>] <component>` @@ -789,6 +790,22 @@ _**Example**_: `check_gpfs_health NETWORK`
+##### check_gpfs_verbs_status +`check_gpfs_verbs_status [-0] [-l] [-s]` + +Checks that GPFS has started verbs (RDMA) support, as reported by `mmfsadm test verbs status`. + +| **Check Option** | **Purpose** | +| ---------------- | ----------- | +| `-0` | Non-fatal. Failure of this check will be ignored. | +| `-l` | Log to the NHC log (`$LOGFILE`) if verbs is not started. | +| `-s` | Log to the syslog if verbs is not started. | + +_**Example**_: `check_gpfs_verbs_status` + +
+ + ##### check_hw_cpuinfo `check_hw_cpuinfo [sockets] [cores] [threads]` diff --git a/scripts/osc_gpfs.nhc b/scripts/osc_gpfs.nhc index 02b1869..42f914f 100644 --- a/scripts/osc_gpfs.nhc +++ b/scripts/osc_gpfs.nhc @@ -8,7 +8,8 @@ GPFS_COMPONENT=() GPFS_ENTITY=() GPFS_STATUS=() MMHEALTH="${MMHEALTH:-/usr/lpp/mmfs/bin/mmhealth}" -export GPFS_COMPONENT GPFS_ENTITY GPFS_STATUS MMHEALTH +MMFSADM="${MMFSADM:-/usr/lpp/mmfs/bin/mmfsadm}" +export GPFS_COMPONENT GPFS_ENTITY GPFS_STATUS MMHEALTH MMFSADM function nhc_gpfs_health_gather_data() { local LINE_CNT @@ -120,3 +121,53 @@ function check_gpfs_health() { fi return 0 } + +# Checks GPFS verbs status: intended to fail unless the output of "$MMFSADM test verbs status" ends with ": started" +# check_gpfs_verbs_status [-0] [-l] [-s] +function check_gpfs_verbs_status() { + local NONFATAL=0 LOG=0 SYSLOG=0 MSG='' + local RET OUTPUT OLD_DEBUG + + OPTIND=1 + while getopts ":0ls" OPTION ; do + case "$OPTION" in + 0) NONFATAL=1 ;; + l) LOG=1 ;; + s) SYSLOG=1 ;; + :) die 1 "$CHECK: Option -$OPTARG requires an argument." ; return 1 ;; + \?) die 1 "$CHECK: Invalid option: -$OPTARG" ; return 1 ;; + esac + done + shift $((OPTIND-1)) + + OLD_DEBUG=$DEBUG + unset DEBUG + check_cmd_output -t ${CMD_TIMEOUT:-5} -C "$FUNCNAME" -O OUTPUT -m '/status/' $MMFSADM test verbs status + RET=$? 
+ export DEBUG=$OLD_DEBUG + if [[ $RET -ne 0 ]]; then + return $RET + fi + + dbg "$MMFSADM test verbs status: \"$OUTPUT\"" + + if [[ "$OUTPUT" == *": started" ]]; then + continue + else + MSG="$FUNCNAME: GPFS verbs is not started" + fi + if [[ "$LOG" == "1" ]]; then + log "$MSG" + fi + if [[ "$SYSLOG" == "1" ]]; then + syslog "$MSG" + fi + if [[ $NONFATAL == 1 ]]; then + if [[ -n "$MSG" ]]; then + log "$MSG (non-fatal)" + fi + return 0 + fi + die 1 "$MSG" + return 1 +} From d03e67546f95f479c95e7d9550d366c0ae45ad9a Mon Sep 17 00:00:00 2001 From: Trey Dockendorf Date: Thu, 7 Apr 2022 17:32:14 -0400 Subject: [PATCH 5/6] Avoid errors with newer bash on RHEL8 --- scripts/osc_gpfs.nhc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/osc_gpfs.nhc b/scripts/osc_gpfs.nhc index 42f914f..24c0818 100644 --- a/scripts/osc_gpfs.nhc +++ b/scripts/osc_gpfs.nhc @@ -152,7 +152,7 @@ function check_gpfs_verbs_status() { dbg "$MMFSADM test verbs status: \"$OUTPUT\"" if [[ "$OUTPUT" == *": started" ]]; then - continue + return 0 else MSG="$FUNCNAME: GPFS verbs is not started" fi From e0653b40b3bff4b2c1b4b944fed4625ce6f5f8ee Mon Sep 17 00:00:00 2001 From: Trey Dockendorf Date: Wed, 13 Apr 2022 10:04:17 -0400 Subject: [PATCH 6/6] Ensure in Makefile --- Makefile.am | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile.am b/Makefile.am index 2b1023b..e6c8489 100644 --- a/Makefile.am +++ b/Makefile.am @@ -14,7 +14,8 @@ nobase_dist_conf_DATA = scripts/lbnl_cmd.nhc scripts/common.nhc \ scripts/lbnl_fs.nhc scripts/lbnl_hw.nhc \ scripts/lbnl_job.nhc scripts/lbnl_moab.nhc \ scripts/lbnl_net.nhc scripts/lbnl_nv.nhc \ - scripts/lbnl_ps.nhc + scripts/lbnl_ps.nhc \ + scripts/osc_gpfs.nhc MAINTAINERCLEANFILES = Makefile.in aclocal.m4 configure install-sh missing DISTCLEANFILES =