Skip to content

Commit

Permalink
DAOS-11393 control: Log numa stats after hugepage cleanup (#10122) (#…
Browse files Browse the repository at this point in the history
…10248)

Print the output of numastat -m to the daos_admin log after hugepages have
been cleaned up, which happens just before engine start. This provides
per-NUMA details of memory allocations and usage which can be used to
troubleshoot insufficient hugepage memory failures on engine start-up.

Signed-off-by: Tom Nabarro <tom.nabarro@intel.com>
  • Loading branch information
tanabarr authored Sep 12, 2022
1 parent 0731df6 commit 1dd4753
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 13 deletions.
4 changes: 2 additions & 2 deletions src/control/server/config/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -710,8 +710,8 @@ func setEngineAffinity(log logging.Logger, engineCfg *engine.Config, node uint)
if *engineCfg.PinnedNumaNode != node {
// TODO: This should probably be a fatal error, but we may need to allow the config
// override in case our affinity detection is incorrect.
log.Errorf("engine config pinned_numa_node is set to %d but detected affinity is with NUMA node %d",
*engineCfg.PinnedNumaNode, node)
log.Errorf("engine %d config pinned_numa_node is set to %d but detected affinity is with NUMA node %d",
engineCfg.Index, *engineCfg.PinnedNumaNode, node)
}
} else {
// If not set via config, use the detected NUMA node affinity.
Expand Down
20 changes: 9 additions & 11 deletions src/control/server/server_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -351,12 +351,12 @@ func updateMemValues(srv *server, engine *EngineInstance, getHugePageInfo common
memSizeFreeMb := hpi.Free * pageSizeMb

// Fail if free hugepage mem is not enough to sustain average I/O workload (~1GB).
srv.log.Debugf("Per-engine MemSize:%dMB, HugepageSize:%dMB (info: %+v)", memSizeReqMb,
pageSizeMb, *hpi)
if memSizeFreeMb < memSizeReqMb {
srv.log.Errorf("huge page info: %+v", *hpi)
return FaultInsufficientFreeHugePageMem(int(ei), memSizeReqMb, memSizeFreeMb,
nrPagesRequired, hpi.Free)
}
srv.log.Debugf("Per-engine MemSize:%dMB, HugepageSize:%dMB", memSizeReqMb, pageSizeMb)

// Set engine mem_size and hugepage_size (MiB) values based on hugepage info.
engine.setMemSize(memSizeReqMb)
Expand All @@ -365,19 +365,19 @@ func updateMemValues(srv *server, engine *EngineInstance, getHugePageInfo common
return nil
}

func cleanEngineHugePages(srv *server, engineIdx uint32) error {
msg := fmt.Sprintf("engine %d: cleaning hugepages before starting", engineIdx)

func cleanEngineHugePages(srv *server) error {
req := storage.BdevPrepareRequest{
CleanHugePagesOnly: true,
}

msg := "cleanup hugepages via bdev backend"

resp, err := srv.ctlSvc.NvmePrepare(req)
if err != nil {
return errors.Wrap(err, msg)
}

srv.log.Debugf("%s, %d removed", msg, resp.NrHugePagesRemoved)
srv.log.Debugf("%s: %d removed", msg, resp.NrHugePagesRemoved)

return nil
}
Expand All @@ -399,14 +399,12 @@ func registerEngineEventCallbacks(srv *server, engine *EngineInstance, allStarte
return nil
})

engine.RLock()
engineIdx := engine.runner.GetConfig().Index
engine.RUnlock()

// Register callback to update engine cfg mem_size after format.
engine.OnStorageReady(func(_ context.Context) error {
srv.log.Debugf("engine %d: storage ready", engine.Index())

// Attempt to remove unused hugepages, log error only.
if err := cleanEngineHugePages(srv, engineIdx); err != nil {
if err := cleanEngineHugePages(srv); err != nil {
srv.log.Errorf(err.Error())
}

Expand Down
19 changes: 19 additions & 0 deletions src/control/server/storage/bdev/backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ package bdev
import (
"fmt"
"os"
"os/exec"
"path/filepath"
"regexp"
"sort"
Expand Down Expand Up @@ -168,6 +169,22 @@ func cleanHugePages(hugePageDir string) (count uint, _ error) {
createHugePageWalkFunc(hugePageDir, os.Stat, os.Remove, &count))
}

// logNUMAStats runs "numastat -m" and writes the result to the supplied
// logger at debug level. When the command succeeds, its standard output is
// logged verbatim. When it fails, the logged text is instead the Error()
// string of a runCmdError built from the returned error and whatever standard
// output was captured. The failure is only logged, never returned.
func logNUMAStats(log logging.Logger) {
	out, err := exec.Command("numastat", "-m").Output()

	// Default to the raw command output; replaced below on failure.
	toLog := string(out)
	if err != nil {
		cmdErr := &runCmdError{
			wrapped: err,
			stdout:  string(out),
		}
		toLog = cmdErr.Error()
	}

	log.Debugf("run cmd numastat -m: %s", toLog)
}

// prepare receives function pointers for external interfaces.
func (sb *spdkBackend) prepare(req storage.BdevPrepareRequest, vmdDetect vmdDetectFn, hpClean hpCleanFn) (*storage.BdevPrepareResponse, error) {
resp := &storage.BdevPrepareResponse{}
Expand All @@ -180,6 +197,8 @@ func (sb *spdkBackend) prepare(req storage.BdevPrepareRequest, vmdDetect vmdDete
}
resp.NrHugePagesRemoved = nrRemoved

logNUMAStats(sb.log)

return resp, nil
}

Expand Down

0 comments on commit 1dd4753

Please sign in to comment.