diff --git a/Makefile.am b/Makefile.am index 2b1023b..e6c8489 100644 --- a/Makefile.am +++ b/Makefile.am @@ -14,7 +14,8 @@ nobase_dist_conf_DATA = scripts/lbnl_cmd.nhc scripts/common.nhc \ scripts/lbnl_fs.nhc scripts/lbnl_hw.nhc \ scripts/lbnl_job.nhc scripts/lbnl_moab.nhc \ scripts/lbnl_net.nhc scripts/lbnl_nv.nhc \ - scripts/lbnl_ps.nhc + scripts/lbnl_ps.nhc \ + scripts/osc_gpfs.nhc MAINTAINERCLEANFILES = Makefile.in aclocal.m4 configure install-sh missing DISTCLEANFILES = diff --git a/README.md b/README.md index 0b375e0..27980e3 100644 --- a/README.md +++ b/README.md @@ -452,6 +452,7 @@ The table below provides a list of the configuration variables which may be used | MCELOG_MAX_CORRECTED_RATE | `9` | Maximum number of **corrected** MCEs allowed before `check_hw_mcelog()` returns failure | | MCELOG_MAX_UNCORRECTED_RATE | `0` | Maximum number of **uncorrected** MCEs allowed before `check_hw_mcelog()` returns failure | | MDIAG_CMD | `mdiag` | Command to use to invoke Moab's `mdiag` command (may include path) | +| MMHEALTH | `/usr/lpp/mmfs/bin/mmhealth` | Command to use to invoke the GPFS `mmhealth` command | | *NAME | `nhc` | Used to populate default paths/filenames for configuration | | NHC_AUTH_USERS | `root nobody` | Users authorized to have arbitrary processes running on compute nodes | | NHC_CHECK_ALL | `0` | Forces all checks to be non-fatal. Displays each failure message, reports total number of failed checks, and returns that number. | @@ -771,6 +772,40 @@ _**Example**_: `check_fs_used / 98%`
+##### check_gpfs_health +`check_gpfs_health [-0] [-a] [-l] [-s] [-e <action>] <component>` + +Checks the health of a GPFS component. The value for _component_ must match a component reported by mmhealth. + +| **Check Option** | **Purpose** | +| ---------------- | ----------- | +| `-0` | Non-fatal. Failure of this check will be ignored. | +| `-a` | Find, report, and act on all matching components. Default behavior is to fail check after first matching component. | +| `-l` | Log unhealthy component (or components, if used with `-a`) to NHC log (`$LOGFILE`). | +| `-s` | Log unhealthy component (or components, if used with `-a`) to the syslog. | +| `-e`_`action`_ | Execute `/bin/bash -c`_`action`_ if component is NOT healthy. | + +_**Example**_: `check_gpfs_health NETWORK` + +
+ + +##### check_gpfs_verbs_status +`check_gpfs_verbs_status [-0] [-l] [-s]` + +Checks that GPFS has started Verbs and is using RDMA. + +| **Check Option** | **Purpose** | +| ---------------- | ----------- | +| `-0` | Non-fatal. Failure of this check will be ignored. | +| `-l` | Log if verbs is not started to NHC log (`$LOGFILE`). | +| `-s` | Log if verbs is not started to the syslog. | + +_**Example**_: `check_gpfs_verbs_status` + +
+
+
 ##### check_hw_cpuinfo
 `check_hw_cpuinfo [sockets] [cores] [threads]`
diff --git a/scripts/osc_gpfs.nhc b/scripts/osc_gpfs.nhc
new file mode 100644
index 0000000..24c0818
--- /dev/null
+++ b/scripts/osc_gpfs.nhc
@@ -0,0 +1,197 @@
+# OSC NHC - GPFS checks
+#
+# Trey Dockendorf
+# 1 November 2018
+#
+
+GPFS_COMPONENT=()
+GPFS_ENTITY=()
+GPFS_STATUS=()
+MMHEALTH="${MMHEALTH:-/usr/lpp/mmfs/bin/mmhealth}"
+MMFSADM="${MMFSADM:-/usr/lpp/mmfs/bin/mmfsadm}"
+export GPFS_COMPONENT GPFS_ENTITY GPFS_STATUS MMHEALTH MMFSADM
+
+# Fills the parallel arrays GPFS_COMPONENT, GPFS_ENTITY and GPFS_STATUS from
+# the colon-delimited output of "$MMHEALTH node show -Y". Rows whose third
+# field is "HEADER", whose second field is "Event", or whose tenth field is
+# not "NODE" are skipped.
+function nhc_gpfs_health_gather_data() {
+    local LINE_CNT=0
+    local -a FIELD
+
+    GPFS_COMPONENT=() GPFS_ENTITY=() GPFS_STATUS=()
+
+    # -r keeps backslashes in the command output literal. $MMHEALTH stays
+    # unquoted so a value that carries arguments still word-splits.
+    while IFS=: read -r -a FIELD; do
+        if [[ "${FIELD[2]}" == "HEADER" ]]; then
+            continue
+        fi
+        if [[ "${FIELD[1]}" == "Event" ]]; then
+            continue
+        fi
+        if [[ "${FIELD[9]}" != "NODE" ]]; then
+            continue
+        fi
+        GPFS_COMPONENT[$LINE_CNT]="${FIELD[7]}"
+        GPFS_ENTITY[$LINE_CNT]="${FIELD[8]}"
+        GPFS_STATUS[$LINE_CNT]="${FIELD[10]}"
+        dbg "Got GPFS health ${GPFS_COMPONENT[$LINE_CNT]} ${GPFS_ENTITY[$LINE_CNT]} ${GPFS_STATUS[$LINE_CNT]}"
+        ((LINE_CNT++))
+    done < <($MMHEALTH node show -Y)
+
+    export GPFS_COMPONENT GPFS_ENTITY GPFS_STATUS
+}
+
+# Checks GPFS health for a given component
+# check_gpfs_health [-0] [-a] [-l] [-s] [-e <action>] <component>
+#   -0  Non-fatal; an unhealthy component is logged but the check returns 0.
+#   -a  Act on every matching unhealthy component instead of stopping at the first.
+#   -l  Log each unhealthy component via log.
+#   -s  Log each unhealthy component via syslog.
+#   -e  Run <action> in the background with ${SHELL:-/bin/bash} -c for each
+#       unhealthy matching component that is processed.
+# Returns 0 when every matching component is HEALTHY (or none match); otherwise
+# calls die and returns non-zero unless -0 was given.
+function check_gpfs_health() {
+    local NONFATAL=0 ALL=0 LOG=0 SYSLOG=0 ACTION=""
+    local THIS_COMPONENT THIS_ENTITY THIS_STATUS MSG i
+    # Keep the getopts variable and the requested component out of the global scope.
+    local OPTION COMPONENT
+
+    if [[ ${#GPFS_COMPONENT[*]} -eq 0 ]]; then
+        nhc_gpfs_health_gather_data
+    fi
+
+    OPTIND=1
+    while getopts ":0alse:" OPTION ; do
+        case "$OPTION" in
+            0) NONFATAL=1 ;;
+            a) ALL=1 ;;
+            l) LOG=1 ;;
+            s) SYSLOG=1 ;;
+            e) ACTION="$OPTARG" ;;
+            :) die 1 "$CHECK: Option -$OPTARG requires an argument." ; return 1 ;;
+            \?) die 1 "$CHECK: Invalid option: -$OPTARG" ; return 1 ;;
+        esac
+    done
+    shift $((OPTIND-1))
+    COMPONENT="$1"
+    if [[ -z "$COMPONENT" ]]; then
+        die 1 "$CHECK: Syntax error: Must provide component to check."
+        # Every other die in this file is followed by a return; do the same here so an empty pattern is never scanned.
+        return 1
+    fi
+    dbg "Looking for GPFS health component \"$COMPONENT\""
+    for ((i=0; i < ${#GPFS_COMPONENT[*]}; i++)); do
+        THIS_COMPONENT="${GPFS_COMPONENT[$i]}"
+        THIS_ENTITY="${GPFS_ENTITY[$i]}"
+        THIS_STATUS="${GPFS_STATUS[$i]}"
+        # Report the requested component, not element 0 of the GPFS_COMPONENT array.
+        dbg "CHECKING \"$THIS_COMPONENT\" vs. \"$COMPONENT\""
+        if ! mcheck "$THIS_COMPONENT" "$COMPONENT"; then
+            continue
+        fi
+        dbg "Matching GPFS health found: $THIS_COMPONENT: entity=$THIS_ENTITY status=$THIS_STATUS"
+        if [[ "$THIS_STATUS" == "HEALTHY" ]]; then
+            continue
+        else
+            MSG="$CHECK: GPFS health for \"$THIS_COMPONENT\" is $THIS_STATUS"
+        fi
+        # We have a winner. Or loser, as the case may be.
+        if [[ "$LOG" == "1" ]]; then
+            log "$MSG"
+        fi
+        if [[ "$SYSLOG" == "1" ]]; then
+            syslog "$MSG"
+        fi
+        if [[ "$ACTION" != "" ]]; then
+            ${SHELL:-/bin/bash} -c "$ACTION" &
+        fi
+        if [[ $ALL -ge 1 ]]; then
+            if [[ -n "$MSG" ]]; then
+                log "$MSG ($ALL)"
+            fi
+            ((ALL++))
+            continue
+        elif [[ $NONFATAL == 1 ]]; then
+            if [[ -n "$MSG" ]]; then
+                log "$MSG (non-fatal)"
+            fi
+            return 0
+        fi
+        die 1 "$MSG"
+        return 1
+    done
+    # -a (all) does not necessarily imply -0 (non-fatal). A value of 1 for $ALL
+    # means -a was passed in but no errors were found. 2 or above is an error.
+    if [[ $ALL -gt 1 ]]; then
+        # We had at least 1 flagged process. Fail unless we're also non-fatal.
+        if [[ $NONFATAL == 1 ]]; then
+            if [[ -n "$MSG" ]]; then
+                log "$MSG (non-fatal)"
+            fi
+            return 0
+        fi
+        ((ALL--))
+        die $ALL "$MSG (last of $ALL)"
+        return $ALL
+    fi
+    return 0
+}
+
+# Checks GPFS verbs status
+# check_gpfs_verbs_status [-0] [-l] [-s]
+#   -0  Non-fatal; a "not started" result is logged but the check returns 0.
+#   -l  Log a "not started" result via log.
+#   -s  Log a "not started" result via syslog.
+# Passes only when the output captured from "$MMFSADM test verbs status" ends
+# with ": started".
+function check_gpfs_verbs_status() {
+    local NONFATAL=0 LOG=0 SYSLOG=0 MSG=''
+    local RET OUTPUT OLD_DEBUG OPTION
+
+    OPTIND=1
+    while getopts ":0ls" OPTION ; do
+        case "$OPTION" in
+            0) NONFATAL=1 ;;
+            l) LOG=1 ;;
+            s) SYSLOG=1 ;;
+            :) die 1 "$CHECK: Option -$OPTARG requires an argument." ; return 1 ;;
+            \?) die 1 "$CHECK: Invalid option: -$OPTARG" ; return 1 ;;
+        esac
+    done
+    shift $((OPTIND-1))
+
+    # DEBUG is unset while check_cmd_output runs and restored (exported) afterwards.
+    OLD_DEBUG=$DEBUG
+    unset DEBUG
+    check_cmd_output -t "${CMD_TIMEOUT:-5}" -C "$FUNCNAME" -O OUTPUT -m '/status/' $MMFSADM test verbs status
+    RET=$?
+    export DEBUG="$OLD_DEBUG"
+    if [[ $RET -ne 0 ]]; then
+        return $RET
+    fi
+
+    dbg "$MMFSADM test verbs status: \"$OUTPUT\""
+
+    if [[ "$OUTPUT" == *": started" ]]; then
+        return 0
+    else
+        MSG="$FUNCNAME: GPFS verbs is not started"
+    fi
+    if [[ "$LOG" == "1" ]]; then
+        log "$MSG"
+    fi
+    if [[ "$SYSLOG" == "1" ]]; then
+        syslog "$MSG"
+    fi
+    if [[ $NONFATAL == 1 ]]; then
+        if [[ -n "$MSG" ]]; then
+            log "$MSG (non-fatal)"
+        fi
+        return 0
+    fi
+    die 1 "$MSG"
+    return 1
+}