From 0740a992dfecc383aa77989be277e9191e58323e Mon Sep 17 00:00:00 2001 From: Kilian Cavalotti Date: Mon, 5 Apr 2021 17:37:59 -0700 Subject: [PATCH] add new check_hw_numa check to verify NUMA configuration --- README.md | 13 ++++++++++++- scripts/lbnl_hw.nhc | 36 ++++++++++++++++++++++++++++++++++-- test/test_lbnl_hw.nhc | 12 +++++++++++- 3 files changed, 57 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index ad9c570..95da5f5 100644 --- a/README.md +++ b/README.md @@ -789,6 +789,17 @@ _**Example** (dual-socket 4-core Intel Nehalem with HT turned off)_: `check_hw_
+##### check_hw_numa +`check_hw_numa [numa-nodes] [NPS]` + +`check_hw_numa` compares the number of [NUMA](https://en.wikipedia.org/wiki/Non-uniform_memory_access) nodes configured on the system, and optionally the number of NUMA nodes per socket (NPS), to the specified values to ensure that the correct system topology is enabled. Both parameters are optional; a parameter that is omitted or empty is not checked. For CPUs with configurable NPS, the 2nd parameter can be used to verify proper BIOS configuration. + +_**Example** (dual-socket AMD EPYC CPU with NPS=2)_: `check_hw_numa 4 2` + + 
+ + ##### check_hw_eth `check_hw_eth device` @@ -1247,7 +1258,7 @@ function check_stuff_works() { die 1 "Stuff is not working" return 1 fi - + # check passed return 0 } diff --git a/scripts/lbnl_hw.nhc b/scripts/lbnl_hw.nhc index e73d197..a19d64a 100644 --- a/scripts/lbnl_hw.nhc +++ b/scripts/lbnl_hw.nhc @@ -7,6 +7,8 @@ HW_SOCKETS=0 HW_CORES=0 HW_THREADS=0 +HW_NUMA_NODES=0 +HW_NUMA_NPS=0 HW_RAM_TOTAL=0 HW_RAM_FREE=0 HW_SWAP_TOTAL=0 @@ -24,7 +26,7 @@ MCELOG_MAX_UNCORRECTED_RATE="${MCELOG_MAX_UNCORRECTED_RATE:-0}" # Read hardware information from /proc and /sys files. function nhc_hw_gather_data() { - local LINE CORES SIBLINGS MHZ PROCESSOR PHYS_ID PORT INDEX DEV + local LINE CORES SIBLINGS MHZ PROCESSOR PHYS_ID PORT INDEX DEV NODES local -a FIELD PHYS_IDS # Gather CPU info @@ -66,6 +68,16 @@ function nhc_hw_gather_data() { fi dbg "Got $HW_SOCKETS $MHZ MHz processors ($HW_CORES cores, $HW_THREADS threads)" + # Gather NUMA info: count the node* entries in sysfs; NPS is only computed when a socket count is known (avoids a division by zero) + if [[ -d /sys/devices/system/node ]]; then + set +f + NODES=(/sys/devices/system/node/node*) + set -f + HW_NUMA_NODES=${#NODES[@]} + [[ $HW_SOCKETS -gt 0 ]] && HW_NUMA_NPS=$((HW_NUMA_NODES/HW_SOCKETS)) + fi + dbg "Got $HW_NUMA_NODES NUMA node(s) (NPS: $HW_NUMA_NPS)" + # Gather memory info if [[ -e /proc/meminfo ]]; then while read -a FIELD ; do @@ -169,6 +181,26 @@ function check_hw_cpuinfo() { return 0 } +# Check that the NUMA node count ($1) and nodes-per-socket (NPS) value ($2) match the detected values; an empty or omitted argument is not checked. +function check_hw_numa() { + local NUMA_NODES=$1 + local NUMA_NPS=$2 + + if [[ $HW_NUMA_NODES -eq 0 ]]; then + nhc_hw_gather_data + fi + + if [[ -n "$NUMA_NODES" && $NUMA_NODES -ne $HW_NUMA_NODES ]]; then + die 1 "$FUNCNAME: Actual NUMA nodes count ($HW_NUMA_NODES) does not match expected ($NUMA_NODES)." + return 1 + fi + if [[ -n "$NUMA_NPS" && $NUMA_NPS -ne $HW_NUMA_NPS ]]; then + die 1 "$FUNCNAME: Actual Nodes Per Socket (NPS) value ($HW_NUMA_NPS) does not match expected ($NUMA_NPS)." + return 1 + fi + return 0 +} + # Check RAM size against minimum ($1) and maximum ($2) allowable size in kB. 
To # require an exact amount of RAM, pass the same value for both parameters. # The optional fudge factor ($3) allows a certain amount of variance to be tolerated. @@ -462,7 +494,7 @@ function check_hw_mcelog() { die 1 "$MSG" return 1 fi - + # If none of the above thresholds was met, return success. return 0 else diff --git a/test/test_lbnl_hw.nhc b/test/test_lbnl_hw.nhc index ef607f2..0944250 100644 --- a/test/test_lbnl_hw.nhc +++ b/test/test_lbnl_hw.nhc @@ -1,8 +1,9 @@ # Tests for lbnl_hw.nhc -plan $((11+7+13+13+13+4+4+4+10+3+6+6)) "lbnl_hw.nhc" && { +plan $((12+7+1+13+13+13+4+4+4+10+3+6+6)) "lbnl_hw.nhc" && { is "`type -t nhc_hw_gather_data 2>&1`" 'function' 'nhc_hw_gather_data() loaded properly' is "`type -t check_hw_cpuinfo 2>&1`" 'function' 'check_hw_cpuinfo() loaded properly' + is "`type -t check_hw_numa 2>&1`" 'function' 'check_hw_numa() loaded properly' is "`type -t check_hw_physmem 2>&1`" 'function' 'check_hw_physmem() loaded properly' is "`type -t check_hw_swap 2>&1`" 'function' 'check_hw_swap() loaded properly' is "`type -t check_hw_mem 2>&1`" 'function' 'check_hw_mem() loaded properly' @@ -42,6 +43,15 @@ plan $((11+7+13+13+13+4+4+4+10+3+6+6)) "lbnl_hw.nhc" && { is $? 1 "Invalid test hardware: Non-existent CPU" + # NUMA data: 2 NUMA nodes, 1 NUMA node per socket + HW_NUMA_NODES=2 + HW_NUMA_NPS=1 + + # valid test + check_hw_numa 2 1 + is $? 0 "Valid test hardware: 2 NUMA nodes, 1 NPS" + + # Memory data: 32GB RAM, 18GB swap HW_RAM_FREE=27828840 HW_RAM_TOTAL=32857508