From 1eb3482e0a06ba547086d41e8e631590dde2fa4e Mon Sep 17 00:00:00 2001
From: Michael MacDonald
Date: Thu, 25 Jul 2024 13:27:05 -0400
Subject: [PATCH 01/11] DAOS-16127 tools: Add daos health check command (#14730)

Perform basic system health checks from the client perspective.
Checks the following:

* Client/Server versions
* Key library versions and paths
* Connected system information
* Pool status for all pools to which the user has access
* Container status for all containers in the checked pools

Change-Id: I9154ee7f3632996e0e67ad6f320874e1df2e0d23
Signed-off-by: Michael MacDonald
---
 src/client/api/SConscript                   |   3 +-
 src/client/api/version.c                    |  31 ++
 src/control/build/interop.go                |   6 +-
 src/control/build/lib_versions.go           | 227 ++++++++++
 src/control/build/lib_versions_test.go      |  87 ++++
 src/control/cmd/daos/container.go           | 151 +++----
 src/control/cmd/daos/health.go              | 168 +++++++
 src/control/cmd/daos/main.go                |   6 +
 src/control/cmd/daos/pool.go                |   9 +-
 src/control/cmd/daos/pretty/health.go       | 187 ++++++++
 src/control/cmd/daos/pretty/health_test.go  | 413 ++++++++++++++++++
 src/control/cmd/daos/system.go              |  47 +-
 src/control/cmd/daos/util.go                |  26 +-
 src/control/lib/control/pool.go             |  79 +---
 src/control/lib/control/pool_test.go        |   8 +-
 src/control/lib/daos/api/api.go             |  69 +++
 src/control/lib/daos/api/provider.go        |  48 ++
 src/control/lib/daos/api/system.go          |  55 +++
 src/control/lib/daos/container.go           |  61 +++
 src/control/lib/daos/health.go              |  28 ++
 src/control/lib/daos/pool.go                |  38 +-
 src/control/lib/daos/pool_test.go           |  11 +
 src/control/lib/daos/status.go              |  15 +-
 src/control/lib/daos/status_test.go         |  17 +-
 src/control/lib/daos/system.go              |  53 +++
 src/control/lib/dlopen/dlopen.go            |   4 +-
 src/control/lib/dlopen/dlopen_example.go    |   4 +-
 src/control/lib/ranklist/ranklist.go        |  38 +-
 src/control/lib/ranklist/ranklist_test.go   |  94 +++-
 src/control/lib/txtfmt/table.go             |  19 +-
 src/control/lib/txtfmt/table_test.go        |  21 +-
 .../github.com/jessevdk/go-flags/group.go   |   2 +-
 src/include/daos/api.h                      |  28 ++
 src/include/daos_types.h                    |   3 +
 src/mgmt/cli_mgmt.c                         |   1 +
 35 files changed, 1819 insertions(+), 238 deletions(-)
 create mode 100644 src/client/api/version.c
 create mode 100644 src/control/build/lib_versions.go
 create mode 100644 src/control/build/lib_versions_test.go
 create mode 100644 src/control/cmd/daos/health.go
 create mode 100644 src/control/cmd/daos/pretty/health.go
 create mode 100644 src/control/cmd/daos/pretty/health_test.go
 create mode 100644 src/control/lib/daos/api/api.go
 create mode 100644 src/control/lib/daos/api/provider.go
 create mode 100644 src/control/lib/daos/api/system.go
 create mode 100644 src/control/lib/daos/container.go
 create mode 100644 src/control/lib/daos/health.go
 create mode 100644 src/control/lib/daos/system.go
 create mode 100644 src/include/daos/api.h

diff --git a/src/client/api/SConscript b/src/client/api/SConscript
index 43af58a6c86..8c71ceb4273 100644
--- a/src/client/api/SConscript
+++ b/src/client/api/SConscript
@@ -1,7 +1,8 @@
 """Build DAOS client"""
 
 LIBDAOS_SRC = ['agent.c', 'array.c', 'container.c', 'event.c', 'init.c', 'job.c', 'kv.c', 'mgmt.c',
-               'object.c', 'pool.c', 'rpc.c', 'task.c', 'tx.c', 'pipeline.c', 'metrics.c']
+               'object.c', 'pool.c', 'rpc.c', 'task.c', 'tx.c', 'pipeline.c', 'metrics.c',
+               'version.c']
 
 
 def scons():
diff --git a/src/client/api/version.c b/src/client/api/version.c
new file mode 100644
index 00000000000..9e38c6b5af5
--- /dev/null
+++ b/src/client/api/version.c
@@ -0,0 +1,31 @@
+/**
+ * (C) Copyright 2024 Intel Corporation.
+ * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ +/** + * This file is part of daos + * + * src/client/api/version.c + */ +#define D_LOGFAC DD_FAC(client) + +#include +#include + +/** + * Retrieve DAOS client API version. + */ +int +daos_version_get(int *major, int *minor, int *fix) +{ + if (major == NULL || minor == NULL || fix == NULL) { + D_ERROR("major, minor, fix must not be NULL\n"); + return -DER_INVAL; + } + *major = DAOS_API_VERSION_MAJOR; + *minor = DAOS_API_VERSION_MINOR; + *fix = DAOS_API_VERSION_FIX; + + return 0; +} \ No newline at end of file diff --git a/src/control/build/interop.go b/src/control/build/interop.go index f12c6bca326..e4f5e30b515 100644 --- a/src/control/build/interop.go +++ b/src/control/build/interop.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2022 Intel Corporation. +// (C) Copyright 2022-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -40,6 +40,8 @@ var ( ComponentAdmin = Component("admin") // ComponentAgent represents the compute node agent. ComponentAgent = Component("agent") + // ComponentClient represents the libdaos client. + ComponentClient = Component("client") ) // NewVersionedComponent creates a new VersionedComponent. @@ -50,7 +52,7 @@ func NewVersionedComponent(comp Component, version string) (*VersionedComponent, } switch comp { - case ComponentServer, ComponentAdmin, ComponentAgent, ComponentAny: + case ComponentServer, ComponentAdmin, ComponentAgent, ComponentClient, ComponentAny: return &VersionedComponent{ Component: comp, Version: v, diff --git a/src/control/build/lib_versions.go b/src/control/build/lib_versions.go new file mode 100644 index 00000000000..dfa007cf8e9 --- /dev/null +++ b/src/control/build/lib_versions.go @@ -0,0 +1,227 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package build + +import ( + "bufio" + "fmt" + "io" + "os" + "regexp" + "strings" + + "github.com/pkg/errors" + + "github.com/daos-stack/daos/src/control/lib/dlopen" +) + +/* +#include + +int +get_fi_version(void *fn, int *major, int *minor, int *patch) +{ + uint32_t version; + + if (fn == NULL || major == NULL || minor == NULL || patch == NULL) { + return -1; + } + // uint32_t fi_version(void); + version = ((uint32_t (*)(void))fn)(); + + if (version == 0) { + return -1; + } + + // FI_MAJOR(version); + *major = (version >> 16); + // FI_MINOR(version); + *minor = (version & 0xFFFF); + // No way to get at revision number via ABI. :( + + return 0; +} + +int +get_hg_version(void *fn, int *major, int *minor, int *patch) +{ + if (fn == NULL || major == NULL || minor == NULL || patch == NULL) { + return -1; + } + // int HG_Version_get(int *, int *, int *); + return ((int (*)(int *, int *, int *))fn)(major, minor, patch); +} + +int +get_daos_version(void *fn, int *major, int *minor, int *fix) +{ + if (fn == NULL || major == NULL || minor == NULL || fix == NULL) { + return -1; + } + // int daos_version_get(int *, int *, int *); + return ((int (*)(int *, int *, int *))fn)(major, minor, fix); +} +*/ +import "C" + +// readMappedLibPath attempts to resolve the given library name to an on-disk path. +// NB: The library must be loaded in order for it to be found! 
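+//
+// For example, given a /proc/self/maps entry like this one (borrowed from
+// the test data in lib_versions_test.go in this patch), a libName of "libc"
+// resolves to "/usr/lib64/libc-2.28.so":
+//
+//	7f2126a00000-7f2126bcd000 r-xp 00000000 fd:01 44043909 /usr/lib64/libc-2.28.so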
+func readMappedLibPath(input io.Reader, libName string) (string, error) { + if libName == "" { + return "", nil + } + + libs := make(map[string]struct{}) + libRe := regexp.MustCompile(fmt.Sprintf(`\s([^\s]+/%s[\-\.\d]*\.so.*$)`, libName)) + scanner := bufio.NewScanner(input) + for scanner.Scan() { + if matches := libRe.FindStringSubmatch(scanner.Text()); len(matches) > 1 { + libs[matches[1]] = struct{}{} + } + } + + if len(libs) == 0 { + return "", errors.Errorf("unable to find path for %q", libName) + } else if len(libs) > 1 { + return "", errors.Errorf("multiple paths found for %q: %v", libName, libs) + } + + var libPath string + for lib := range libs { + libPath = lib + break + } + return libPath, nil +} + +func getLibPath(libName string) (string, error) { + f, err := os.Open("/proc/self/maps") + if err != nil { + return "", errors.Wrapf(err, "failed to open %s", "/proc/self/maps") + } + defer f.Close() + + return readMappedLibPath(f, libName) +} + +func getLibHandle(libName string) (string, *dlopen.LibHandle, error) { + searchLib := libName + ".so" + + // Check to see if it's already loaded, and if so, use that path + // instead of searching in order to make sure we're getting a + // handle to the correct library. + libPath, _ := getLibPath(libName) + if libPath != "" { + searchLib = libPath + } + + hdl, err := dlopen.GetHandle(searchLib) + if err != nil { + return "", nil, err + } + + if libPath == "" { + libPath, err = getLibPath(libName) + if err != nil { + fmt.Fprintf(os.Stderr, "failed to get path for %q: %+v\n", libName, err) + return "", nil, err + } + } + + return libPath, hdl, err +} + +func getLibFabricVersion() (*Version, string, error) { + libPath, hdl, err := getLibHandle("libfabric") + if err != nil { + return nil, "", err + } + defer hdl.Close() + + ptr, err := hdl.GetSymbolPointer("fi_version") + if err != nil { + return nil, "", err + } + + var major, minor, patch C.int + rc := C.get_fi_version(ptr, &major, &minor, &patch) + if rc != 0 { + return nil, "", errors.Errorf("get_fi_version() failed: %d", rc) + } + + return &Version{ + Major: int(major), + Minor: int(minor), + Patch: int(patch), + }, libPath, nil +} + +func getMercuryVersion() (*Version, string, error) { + libPath, hdl, err := getLibHandle("libmercury") + if err != nil { + return nil, "", err + } + defer hdl.Close() + + ptr, err := hdl.GetSymbolPointer("HG_Version_get") + if err != nil { + return nil, "", err + } + + var major, minor, patch C.int + rc := C.get_hg_version(ptr, &major, &minor, &patch) + if rc != 0 { + return nil, "", errors.Errorf("get_hg_version() failed: %d", rc) + } + + return &Version{ + Major: int(major), + Minor: int(minor), + Patch: int(patch), + }, libPath, nil +} + +func getDAOSVersion() (*Version, string, error) { + libPath, hdl, err := getLibHandle("libdaos") + if err != nil { + return nil, "", err + } + defer hdl.Close() + + ptr, err := hdl.GetSymbolPointer("daos_version_get") + if err != nil { + return nil, "", err + } + + var major, minor, fix C.int + rc := C.get_daos_version(ptr, &major, &minor, &fix) + if rc != 0 { + return nil, "", errors.Errorf("get_daos_version() failed: %d", rc) + } + + return &Version{ + Major: int(major), + Minor: int(minor), + Patch: int(fix), + }, libPath, nil +} + +// GetLibraryInfo attempts to resolve the given library name into a version and path. +// NB: The library must provide an ABI method to obtain its version, and that +// method needs to be added to this package in order to support it. 
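+//
+// A minimal usage sketch (mirroring how cmd/daos/health.go in this patch
+// calls it; the fmt import and error handling here are illustrative only):
+//
+//	ver, path, err := build.GetLibraryInfo("libdaos")
+//	if err != nil {
+//		return err
+//	}
+//	fmt.Printf("libdaos %s at %s\n", ver.String(), path)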
+func GetLibraryInfo(libName string) (*Version, string, error) { + switch strings.TrimPrefix(strings.ToLower(libName), "lib") { + case "fabric": + return getLibFabricVersion() + case "mercury": + return getMercuryVersion() + case "daos": + return getDAOSVersion() + default: + return nil, "", errors.Errorf("unsupported library: %q", libName) + } +} diff --git a/src/control/build/lib_versions_test.go b/src/control/build/lib_versions_test.go new file mode 100644 index 00000000000..33b4a9eec2e --- /dev/null +++ b/src/control/build/lib_versions_test.go @@ -0,0 +1,87 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package build + +import ( + "io" + "strings" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/pkg/errors" + + "github.com/daos-stack/daos/src/control/common/test" +) + +func TestBuild_readMappedLibPath(t *testing.T) { + testMap := ` +55a05b000000-55a05b008000 r-xp 00000000 fd:01 44060915 /usr/bin/cat +55a05b207000-55a05b208000 r--p 00007000 fd:01 44060915 /usr/bin/cat +55a05b208000-55a05b209000 rw-p 00008000 fd:01 44060915 /usr/bin/cat +55a05d1d5000-55a05d1f6000 rw-p 00000000 00:00 0 [heap] +7f2126a00000-7f2126bcd000 r-xp 00000000 fd:01 44043909 /usr/lib64/libc-2.28.so +7f2126bcd000-7f2126dcc000 ---p 001cd000 fd:01 44043909 /usr/lib64/libc-2.28.so +7f2126dcc000-7f2126dd0000 r--p 001cc000 fd:01 44043909 /usr/lib64/libc-2.28.so +7f2126dd0000-7f2126dd2000 rw-p 001d0000 fd:01 44043909 /usr/lib64/libc-2.28.so +7f2126dd2000-7f2126dd6000 rw-p 00000000 00:00 0 +7f2126e00000-7f2126e2f000 r-xp 00000000 fd:01 44043895 /usr/lib64/ld-2.28.so +7f212702f000-7f2127030000 r--p 0002f000 fd:01 44043895 /usr/lib64/ld-2.28.so +7f2127030000-7f2127032000 rw-p 00030000 fd:01 44043895 /usr/lib64/ld-2.28.so +7f2127135000-7f212715a000 rw-p 00000000 00:00 0 +7f2127162000-7f2127164000 rw-p 00000000 00:00 0 +7f2127164000-7f2127168000 r--p 00000000 00:00 0 [vvar] +7f2127168000-7f212716a000 r-xp 00000000 00:00 0 [vdso] +7fffab3d9000-7fffab3fb000 rw-p 00000000 00:00 0 [stack] +` + + for name, tc := range map[string]struct { + reader io.Reader + libName string + expPath string + expErr error + }{ + "empty": {}, + "nonexistent": { + reader: strings.NewReader(testMap), + libName: "libmystery", + expErr: errors.New("unable to find"), + }, + "dupes": { + reader: strings.NewReader(testMap + ` +7f2127030000-7f2127032000 rw-p 00030000 fd:01 44043895 /usr/lib64/libwhoops-2.29.so +7f2127030000-7f2127032000 rw-p 00030000 fd:01 44043895 /usr/lib64/libwhoops-2.28.so +`), + libName: "libwhoops", + expErr: errors.New("multiple paths"), + }, + "libc": { + reader: strings.NewReader(testMap), + libName: "libc", + expPath: "/usr/lib64/libc-2.28.so", + }, + "libunder_score": { + reader: strings.NewReader(testMap + ` +7f2127030000-7f2127032000 rw-p 00030000 fd:01 44043895 /usr/lib64/libkool_wow.so.123 +7f2127030000-7f2127032000 rw-p 00030000 fd:01 44043895 /usr/lib64/libkool.so.456 +`), + libName: "libkool", + expPath: "/usr/lib64/libkool.so.456", + }, + } { + t.Run(name, func(t *testing.T) { + gotPath, gotErr := readMappedLibPath(tc.reader, tc.libName) + test.CmpErr(t, tc.expErr, gotErr) + if tc.expErr != nil { + return + } + + if diff := cmp.Diff(tc.expPath, gotPath); diff != "" { + t.Fatalf("unexpected path for %q (-want,+got): %s", tc.libName, diff) + } + }) + } +} diff --git a/src/control/cmd/daos/container.go b/src/control/cmd/daos/container.go index d83c08600e3..46ac4da7575 100644 --- a/src/control/cmd/daos/container.go +++ b/src/control/cmd/daos/container.go 
@@ -1,5 +1,5 @@ // -// (C) Copyright 2021-2023 Intel Corporation. +// (C) Copyright 2021-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -7,7 +7,6 @@ package main import ( - "encoding/json" "fmt" "io" "os" @@ -80,6 +79,35 @@ func (cmd *containerBaseCmd) contUUIDPtr() *C.uchar { return (*C.uchar)(unsafe.Pointer(&cmd.contUUID[0])) } +func containerOpen(poolHdl C.daos_handle_t, contID string, flags uint, query bool) (C.daos_handle_t, *C.daos_cont_info_t, error) { + cContID := C.CString(contID) + defer freeString(cContID) + + var contHdl C.daos_handle_t + var infoPtr *C.daos_cont_info_t + if query { + infoPtr = new(C.daos_cont_info_t) + } + + return contHdl, infoPtr, containerOpenAPI(poolHdl, cContID, C.uint(flags), &contHdl, infoPtr) +} + +func containerOpenAPI(poolHdl C.daos_handle_t, contID *C.char, flags C.uint, contHdl *C.daos_handle_t, contInfo *C.daos_cont_info_t) error { + return daosError(C.daos_cont_open(poolHdl, contID, flags, contHdl, contInfo, nil)) +} + +func containerCloseAPI(contHdl C.daos_handle_t) error { + // Hack for NLT fault injection testing: If the rc + // is -DER_NOMEM, retry once in order to actually + // shut down and release resources. + rc := C.daos_cont_close(contHdl, nil) + if rc == -C.DER_NOMEM { + rc = C.daos_cont_close(contHdl, nil) + } + + return daosError(rc) +} + func (cmd *containerBaseCmd) openContainer(openFlags C.uint) error { openFlags |= C.DAOS_COO_FORCE if (openFlags & C.DAOS_COO_RO) != 0 { @@ -94,22 +122,23 @@ func (cmd *containerBaseCmd) openContainer(openFlags C.uint) error { defer freeString(cLabel) cmd.Debugf("opening container: %s", cmd.contLabel) - rc = C.daos_cont_open(cmd.cPoolHandle, cLabel, openFlags, - &cmd.cContHandle, &contInfo, nil) - if rc == 0 { - var err error - cmd.contUUID, err = uuidFromC(contInfo.ci_uuid) - if err != nil { - cmd.closeContainer() - return err - } + if err := containerOpenAPI(cmd.cPoolHandle, cLabel, openFlags, &cmd.cContHandle, &contInfo); err != nil { + return err + } + + var err error + cmd.contUUID, err = uuidFromC(contInfo.ci_uuid) + if err != nil { + cmd.closeContainer() + return err } case cmd.contUUID != uuid.Nil: cmd.Debugf("opening container: %s", cmd.contUUID) cUUIDstr := C.CString(cmd.contUUID.String()) defer freeString(cUUIDstr) - rc = C.daos_cont_open(cmd.cPoolHandle, cUUIDstr, - openFlags, &cmd.cContHandle, nil, nil) + if err := containerOpenAPI(cmd.cPoolHandle, cUUIDstr, openFlags, &cmd.cContHandle, nil); err != nil { + return err + } default: return errors.New("no container UUID or label supplied") } @@ -119,21 +148,13 @@ func (cmd *containerBaseCmd) openContainer(openFlags C.uint) error { func (cmd *containerBaseCmd) closeContainer() { cmd.Debugf("closing container: %s", cmd.contUUID) - // Hack for NLT fault injection testing: If the rc - // is -DER_NOMEM, retry once in order to actually - // shut down and release resources. 
- rc := C.daos_cont_close(cmd.cContHandle, nil) - if rc == -C.DER_NOMEM { - rc = C.daos_cont_close(cmd.cContHandle, nil) - } - - if err := daosError(rc); err != nil { + if err := containerCloseAPI(cmd.cContHandle); err != nil { cmd.Errorf("container close failed: %s", err) } } -func (cmd *containerBaseCmd) queryContainer() (*containerInfo, error) { - ci := newContainerInfo(&cmd.poolUUID, &cmd.contUUID) +func queryContainer(poolUUID, contUUID uuid.UUID, poolHandle, contHandle C.daos_handle_t) (*daos.ContainerInfo, error) { + var cInfo C.daos_cont_info_t var cType [10]C.char props, entries, err := allocProps(3) @@ -148,11 +169,12 @@ func (cmd *containerBaseCmd) queryContainer() (*containerInfo, error) { props.dpp_nr++ defer func() { C.daos_prop_free(props) }() - rc := C.daos_cont_query(cmd.cContHandle, &ci.dci, props, nil) + rc := C.daos_cont_query(contHandle, &cInfo, props, nil) if err := daosError(rc); err != nil { return nil, err } + ci := convertContainerInfo(poolUUID, contUUID, &cInfo) lType := C.get_dpe_val(&entries[0]) C.daos_unparse_ctype(C.ushort(lType), &cType[0]) ci.Type = C.GoString(&cType[0]) @@ -173,7 +195,7 @@ func (cmd *containerBaseCmd) queryContainer() (*containerInfo, error) { var dir_oclass [C.MAX_OBJ_CLASS_NAME_LEN]C.char var file_oclass [C.MAX_OBJ_CLASS_NAME_LEN]C.char - rc := C.dfs_mount(cmd.cPoolHandle, cmd.cContHandle, C.O_RDONLY, &dfs) + rc := C.dfs_mount(poolHandle, contHandle, C.O_RDONLY, &dfs) if err := dfsError(rc); err != nil { return nil, errors.Wrap(err, "failed to mount container") } @@ -297,8 +319,8 @@ func (cmd *containerCreateCmd) Execute(_ []string) (err error) { } defer cmd.closeContainer() - var ci *containerInfo - ci, err = cmd.queryContainer() + var ci *daos.ContainerInfo + ci, err = queryContainer(cmd.poolUUID, cmd.contUUID, cmd.cPoolHandle, cmd.cContHandle) if err != nil { if errors.Cause(err) != daos.NoPermission { return errors.Wrapf(err, "failed to query new container %s", contID) @@ -307,10 +329,10 @@ func (cmd *containerCreateCmd) Execute(_ []string) (err error) { // Special case for creating a container without permission to query it. 
cmd.Errorf("container %s was created, but query failed", contID) - ci = new(containerInfo) - ci.PoolUUID = &cmd.poolUUID + ci = new(daos.ContainerInfo) + ci.PoolUUID = cmd.poolUUID ci.Type = cmd.Type.String() - ci.ContainerUUID = &cmd.contUUID + ci.ContainerUUID = cmd.contUUID ci.ContainerLabel = cmd.Args.Label } @@ -926,7 +948,7 @@ func (cmd *containerStatCmd) Execute(_ []string) error { return nil } -func printContainerInfo(out io.Writer, ci *containerInfo, verbose bool) error { +func printContainerInfo(out io.Writer, ci *daos.ContainerInfo, verbose bool) error { rows := []txtfmt.TableRow{ {"Container UUID": ci.ContainerUUID.String()}, } @@ -939,14 +961,14 @@ func printContainerInfo(out io.Writer, ci *containerInfo, verbose bool) error { rows = append(rows, []txtfmt.TableRow{ {"Pool UUID": ci.PoolUUID.String()}, {"Container redundancy factor": fmt.Sprintf("%d", ci.RedundancyFactor)}, - {"Number of open handles": fmt.Sprintf("%d", *ci.NumHandles)}, - {"Latest open time": fmt.Sprintf("%#x (%s)", *ci.OpenTime, daos.HLC(*ci.OpenTime))}, - {"Latest close/modify time": fmt.Sprintf("%#x (%s)", *ci.CloseModifyTime, daos.HLC(*ci.CloseModifyTime))}, - {"Number of snapshots": fmt.Sprintf("%d", *ci.NumSnapshots)}, + {"Number of open handles": fmt.Sprintf("%d", ci.NumHandles)}, + {"Latest open time": fmt.Sprintf("%#x (%s)", ci.OpenTime, daos.HLC(ci.OpenTime))}, + {"Latest close/modify time": fmt.Sprintf("%#x (%s)", ci.CloseModifyTime, daos.HLC(ci.CloseModifyTime))}, + {"Number of snapshots": fmt.Sprintf("%d", ci.NumSnapshots)}, }...) - if *ci.LatestSnapshot != 0 { - rows = append(rows, txtfmt.TableRow{"Latest Persistent Snapshot": fmt.Sprintf("%#x (%s)", *ci.LatestSnapshot, daos.HLC(*ci.LatestSnapshot))}) + if ci.LatestSnapshot != 0 { + rows = append(rows, txtfmt.TableRow{"Latest Persistent Snapshot": fmt.Sprintf("%#x (%s)", ci.LatestSnapshot, daos.HLC(ci.LatestSnapshot))}) } if ci.ObjectClass != "" { rows = append(rows, txtfmt.TableRow{"Object Class": ci.ObjectClass}) @@ -968,47 +990,16 @@ func printContainerInfo(out io.Writer, ci *containerInfo, verbose bool) error { return err } -type containerInfo struct { - dci C.daos_cont_info_t - PoolUUID *uuid.UUID `json:"pool_uuid"` - ContainerUUID *uuid.UUID `json:"container_uuid"` - ContainerLabel string `json:"container_label,omitempty"` - LatestSnapshot *uint64 `json:"latest_snapshot"` - RedundancyFactor uint32 `json:"redundancy_factor"` - NumHandles *uint32 `json:"num_handles"` - NumSnapshots *uint32 `json:"num_snapshots"` - OpenTime *uint64 `json:"open_time"` - CloseModifyTime *uint64 `json:"close_modify_time"` - Type string `json:"container_type"` - ObjectClass string `json:"object_class,omitempty"` - DirObjectClass string `json:"dir_object_class,omitempty"` - FileObjectClass string `json:"file_object_class,omitempty"` - CHints string `json:"hints,omitempty"` - ChunkSize uint64 `json:"chunk_size,omitempty"` -} - -func (ci *containerInfo) MarshalJSON() ([]byte, error) { - type toJSON containerInfo - return json.Marshal(&struct { - *toJSON - }{ - toJSON: (*toJSON)(ci), - }) -} - -// as an experiment, try creating a Go struct whose members are -// pointers into the C struct. 
-func newContainerInfo(poolUUID, contUUID *uuid.UUID) *containerInfo { - ci := new(containerInfo) - - ci.PoolUUID = poolUUID - ci.ContainerUUID = contUUID - ci.LatestSnapshot = (*uint64)(&ci.dci.ci_lsnapshot) - ci.NumHandles = (*uint32)(&ci.dci.ci_nhandles) - ci.NumSnapshots = (*uint32)(&ci.dci.ci_nsnapshots) - ci.OpenTime = (*uint64)(&ci.dci.ci_md_otime) - ci.CloseModifyTime = (*uint64)(&ci.dci.ci_md_mtime) - return ci +func convertContainerInfo(poolUUID, contUUID uuid.UUID, cInfo *C.daos_cont_info_t) *daos.ContainerInfo { + return &daos.ContainerInfo{ + PoolUUID: poolUUID, + ContainerUUID: contUUID, + LatestSnapshot: uint64(cInfo.ci_lsnapshot), + NumHandles: uint32(cInfo.ci_nhandles), + NumSnapshots: uint32(cInfo.ci_nsnapshots), + OpenTime: uint64(cInfo.ci_md_otime), + CloseModifyTime: uint64(cInfo.ci_md_mtime), + } } type containerQueryCmd struct { @@ -1028,7 +1019,7 @@ func (cmd *containerQueryCmd) Execute(_ []string) error { } defer cleanup() - ci, err := cmd.queryContainer() + ci, err := queryContainer(cmd.poolUUID, cmd.contUUID, cmd.cPoolHandle, cmd.cContHandle) if err != nil { return errors.Wrapf(err, "failed to query container %s", diff --git a/src/control/cmd/daos/health.go b/src/control/cmd/daos/health.go new file mode 100644 index 00000000000..cbc29c1e3ba --- /dev/null +++ b/src/control/cmd/daos/health.go @@ -0,0 +1,168 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package main + +import ( + "fmt" + "strings" + + "github.com/google/uuid" + + "github.com/daos-stack/daos/src/control/build" + "github.com/daos-stack/daos/src/control/cmd/daos/pretty" + "github.com/daos-stack/daos/src/control/lib/daos" + "github.com/daos-stack/daos/src/control/logging" +) + +type healthCmds struct { + Check healthCheckCmd `command:"check" description:"Perform DAOS system health checks"` +} + +type healthCheckCmd struct { + daosCmd + Verbose bool `short:"v" long:"verbose" description:"Display more detailed system health information"` +} + +func collectBuildInfo(log logging.Logger, shi *daos.SystemHealthInfo) error { + shi.ComponentBuildInfo[build.ComponentClient.String()] = daos.ComponentBuild{ + Version: build.DaosVersion, + BuildInfo: build.BuildInfo, + } + srvBuild, err := srvBuildInfo() + if err != nil { + log.Errorf("failed to get server build info: %v", err) + } else { + shi.ComponentBuildInfo[build.ComponentServer.String()] = daos.ComponentBuild{ + Version: srvBuild.Version, + BuildInfo: srvBuild.BuildInfo, + } + } + + for _, libName := range []string{"libfabric", "mercury", "libdaos"} { + ver, path, err := build.GetLibraryInfo(libName) + if err != nil { + log.Debugf("failed to get %q info: %+v", libName, err) + continue + } + shi.ComponentBuildInfo[libName] = daos.ComponentBuild{ + Version: ver.String(), + LibPath: path, + } + } + + return nil +} + +func (cmd *healthCheckCmd) Execute([]string) error { + // TODO (DAOS-10028): Move this logic into the daos package once the API is available. 
+ systemHealth := &daos.SystemHealthInfo{ + ComponentBuildInfo: make(map[string]daos.ComponentBuild), + Pools: make(map[uuid.UUID]*daos.PoolInfo), + Containers: make(map[uuid.UUID][]*daos.ContainerInfo), + } + if err := collectBuildInfo(cmd.Logger, systemHealth); err != nil { + return err + } + + sysInfo, err := cmd.apiProvider.GetSystemInfo() + if err != nil { + cmd.Errorf("failed to query system information: %v", err) + } + systemHealth.SystemInfo = sysInfo + + cmd.Infof("Checking DAOS system: %s", systemHealth.SystemInfo.Name) + + pools, err := getPoolList(cmd.Logger, cmd.SysName, true) + if err != nil { + cmd.Errorf("failed to get pool list: %v", err) + } + + for _, pool := range pools { + systemHealth.Pools[pool.UUID] = pool + + poolHdl, _, err := poolConnect(pool.UUID.String(), cmd.SysName, daos.PoolConnectFlagReadOnly, false) + if err != nil { + cmd.Errorf("failed to connect to pool %s: %v", pool.Label, err) + continue + } + defer func() { + if err := poolDisconnectAPI(poolHdl); err != nil { + cmd.Errorf("failed to disconnect from pool %s: %v", pool.Label, err) + } + }() + + queryMask := daos.MustNewPoolQueryMask(daos.PoolQueryOptionEnabledEngines) + tpi, err := queryPool(poolHdl, queryMask) + if err != nil { + cmd.Errorf("failed to query pool %s: %v", pool.Label, err) + continue + } + pool.EnabledRanks = tpi.EnabledRanks + + if pool.DisabledTargets > 0 { + queryMask.ClearAll() + queryMask.SetOptions(daos.PoolQueryOptionDisabledEngines) + tpi, err = queryPool(poolHdl, queryMask) + if err != nil { + cmd.Errorf("failed to query pool %s: %v", pool.Label, err) + continue + } + pool.DisabledRanks = tpi.DisabledRanks + } + + poolConts, err := listContainers(poolHdl) + if err != nil { + cmd.Errorf("failed to list containers on pool %s: %v", pool.Label, err) + continue + } + + for _, cont := range poolConts { + openFlags := uint(daos.ContainerOpenFlagReadOnly | daos.ContainerOpenFlagForce | daos.ContainerOpenFlagReadOnlyMetadata) + contHdl, contInfo, err := containerOpen(poolHdl, cont.UUID.String(), openFlags, true) + if err != nil { + cmd.Errorf("failed to connect to container %s: %v", cont.Label, err) + ci := &daos.ContainerInfo{ + PoolUUID: pool.UUID, + ContainerUUID: cont.UUID, + ContainerLabel: cont.Label, + Health: fmt.Sprintf("Unknown (%s)", err), + } + systemHealth.Containers[pool.UUID] = append(systemHealth.Containers[pool.UUID], ci) + continue + } + ci := convertContainerInfo(pool.UUID, cont.UUID, contInfo) + ci.ContainerLabel = cont.Label + + props, freeProps, err := getContainerProperties(contHdl, "status") + if err != nil || len(props) == 0 { + cmd.Errorf("failed to get container properties for %s: %v", cont.Label, err) + ci.Health = fmt.Sprintf("Unknown (%s)", err) + } else { + ci.Health = props[0].String() + } + freeProps() + + if err := containerCloseAPI(contHdl); err != nil { + cmd.Errorf("failed to close container %s: %v", cont.Label, err) + } + + systemHealth.Containers[pool.UUID] = append(systemHealth.Containers[pool.UUID], ci) + } + } + + if cmd.JSONOutputEnabled() { + return cmd.OutputJSON(systemHealth, nil) + } + + var buf strings.Builder + if err := pretty.PrintSystemHealthInfo(&buf, systemHealth, cmd.Verbose); err != nil { + return err + } + cmd.Info(buf.String()) + + return nil +} diff --git a/src/control/cmd/daos/main.go b/src/control/cmd/daos/main.go index 4cf634be6a2..5d6886cb2e5 100644 --- a/src/control/cmd/daos/main.go +++ b/src/control/cmd/daos/main.go @@ -27,12 +27,14 @@ type cliOptions struct { Debug bool `long:"debug" description:"Enable debug output"` 
Verbose bool `long:"verbose" description:"Enable verbose output (when applicable)"` JSON bool `long:"json" short:"j" description:"Enable JSON output"` + SysName string `long:"sys-name" short:"G" description:"DAOS system name (optional)"` Container containerCmd `command:"container" alias:"cont" description:"Perform tasks related to DAOS containers"` Pool poolCmd `command:"pool" description:"Perform tasks related to DAOS pools"` Filesystem fsCmd `command:"filesystem" alias:"fs" description:"POSIX filesystem operations"` Object objectCmd `command:"object" alias:"obj" description:"DAOS object operations"` System systemCmd `command:"system" alias:"sys" description:"DAOS system operations"` Version versionCmd `command:"version" description:"Print daos version"` + Health healthCmds `command:"health" description:"DAOS health operations"` ServerVersion serverVersionCmd `command:"server-version" description:"Print server version"` ManPage cmdutil.ManCmd `command:"manpage" hidden:"true"` faultsCmdRoot @@ -108,6 +110,10 @@ or query/manage an object inside a container.` return cmd.Execute(args) } + if sysCmd, ok := cmd.(interface{ setSysName(string) }); ok { + sysCmd.setSysName(opts.SysName) + } + if opts.Debug { log.SetLevel(logging.LogLevelTrace) if os.Getenv("D_LOG_MASK") == "" { diff --git a/src/control/cmd/daos/pool.go b/src/control/cmd/daos/pool.go index cf1d1512580..275e0859005 100644 --- a/src/control/cmd/daos/pool.go +++ b/src/control/cmd/daos/pool.go @@ -71,8 +71,7 @@ type poolBaseCmd struct { cPoolHandle C.daos_handle_t - SysName string `long:"sys-name" short:"G" description:"DAOS system name"` - Args struct { + Args struct { Pool PoolID `positional-arg-name:"pool label or UUID" description:"required if --path is not used"` } `positional-args:"yes"` } @@ -254,6 +253,7 @@ func convertPoolRebuildStatus(in *C.struct_daos_rebuild_status) *daos.PoolRebuil Status: int32(in.rs_errno), } if out.Status == 0 { + out.TotalObjects = uint64(in.rs_toberb_obj_nr) out.Objects = uint64(in.rs_obj_nr) out.Records = uint64(in.rs_rec_nr) switch { @@ -727,9 +727,8 @@ func getPoolList(log logging.Logger, sysName string, queryEnabled bool) ([]*daos type poolListCmd struct { daosCmd - SysName string `long:"sys-name" short:"G" description:"DAOS system name"` - Verbose bool `short:"v" long:"verbose" description:"Add pool UUIDs and service replica lists to display"` - NoQuery bool `short:"n" long:"no-query" description:"Disable query of listed pools"` + Verbose bool `short:"v" long:"verbose" description:"Add pool UUIDs and service replica lists to display"` + NoQuery bool `short:"n" long:"no-query" description:"Disable query of listed pools"` } func (cmd *poolListCmd) Execute(_ []string) error { diff --git a/src/control/cmd/daos/pretty/health.go b/src/control/cmd/daos/pretty/health.go new file mode 100644 index 00000000000..25c94e10f99 --- /dev/null +++ b/src/control/cmd/daos/pretty/health.go @@ -0,0 +1,187 @@ +// +// (C) Copyright 2024 Intel Corporation. 
+// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package pretty + +import ( + "encoding/json" + "fmt" + "io" + "strings" + + "github.com/dustin/go-humanize" + "github.com/pkg/errors" + + "github.com/daos-stack/daos/src/control/build" + "github.com/daos-stack/daos/src/control/lib/daos" + "github.com/daos-stack/daos/src/control/lib/txtfmt" +) + +const ( + minDisplayImbalance = 10 +) + +func printBuildInfo(out io.Writer, comp string, cb *daos.ComponentBuild) { + if cb == nil { + return + } + + buildInfo := cb.BuildInfo + if buildInfo == "" { + buildInfo = cb.Version + } + fmt.Fprintf(out, "%s: %s\n", txtfmt.Title(comp), buildInfo) +} + +func printSystemInfo(out io.Writer, si *daos.SystemInfo, verbose bool) { + if si == nil { + return + } + + fmt.Fprintf(out, "System Name: %s\n", si.Name) + fmt.Fprintf(out, "Fabric Provider: %s\n", si.Provider) + if verbose { + fmt.Fprintf(out, "Agent Path: %s\n", si.AgentPath) + } + apStr := "[]" + if len(si.AccessPointRankURIs) > 0 { + if apBytes, err := json.Marshal(si.AccessPoints()); err == nil { + apStr = string(apBytes) + } + } + fmt.Fprintf(out, "Access Points: %s\n", apStr) +} + +func printPoolHealth(out io.Writer, pi *daos.PoolInfo, verbose bool) { + if pi == nil { + return + } + + var healthStrings []string + if pi.DisabledTargets > 0 { + degStr := "Degraded" + if verbose { + degStr += fmt.Sprintf(" (%d/%d targets disabled)", pi.DisabledTargets, pi.TotalTargets) + } + healthStrings = append(healthStrings, degStr) + } + if pi.Rebuild != nil { + rbi := pi.Rebuild + if rbi.State == daos.PoolRebuildStateBusy { + var pctCmp float64 + if rbi.TotalObjects > 0 { + pctCmp = (float64(rbi.Objects) / float64(rbi.TotalObjects)) * 100 + } + + rbStr := fmt.Sprintf("Rebuilding (%.01f%% complete)", pctCmp) + if verbose { + rbStr += fmt.Sprintf(" (%d/%d objects; %d records)", rbi.Objects, rbi.TotalObjects, rbi.Records) + } + healthStrings = append(healthStrings, rbStr) + } + } + + // If there's no other pertinent info, just display some useful storage stats. + if len(healthStrings) == 0 { + healthStr := "Healthy" + var metaFree uint64 + var metaTotal uint64 + var metaImbal uint32 + var dataFree uint64 + var dataTotal uint64 + var dataImbal uint32 + var totalFree uint64 + for _, tier := range pi.Usage() { + switch tier.TierName { + case strings.ToUpper(daos.StorageMediaTypeScm.String()): + metaFree = tier.Free + metaTotal = tier.Size + metaImbal = tier.Imbalance + totalFree += metaFree + case strings.ToUpper(daos.StorageMediaTypeNvme.String()): + dataFree = tier.Free + dataTotal = tier.Size + dataImbal = tier.Imbalance + totalFree += dataFree + } + } + if !verbose { + healthStr += fmt.Sprintf(" (%s Free)", humanize.Bytes(totalFree)) + } else { + healthStr += " (meta: " + healthStr += fmt.Sprintf("%s/%s", humanize.Bytes(metaFree), humanize.Bytes(metaTotal)) + healthStr += " Free, data: " + healthStr += fmt.Sprintf("%s/%s", humanize.Bytes(dataFree), humanize.Bytes(dataTotal)) + healthStr += " Free)" + if metaImbal > minDisplayImbalance || dataImbal > minDisplayImbalance { + healthStr += fmt.Sprintf(" (imbalances: %d%% meta, %d%% data)", metaImbal, dataImbal) + } + } + healthStrings = append(healthStrings, healthStr) + } + fmt.Fprintf(out, "%s: %s\n", pi.Name(), strings.Join(healthStrings, ",")) +} + +func printContainerHealth(out io.Writer, ci *daos.ContainerInfo, verbose bool) { + if ci == nil { + return + } + + fmt.Fprintf(out, "%s: %s\n", ci.Name(), txtfmt.Title(ci.Health)) +} + +// PrintSystemHealthInfo pretty-prints the supplied system health struct. 
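+//
+// The output has roughly this shape (the exact strings are pinned down by
+// the cases in health_test.go):
+//
+//	Component Build Information
+//	  Server: <version or build info>
+//	  Client: <version or build info>
+//	System Information
+//	  System Name: <name>
+//	  Fabric Provider: <provider>
+//	  Access Points: <rank URIs>
+//	Pool Status
+//	  <pool label>: <health summary>
+//	    Container Status
+//	      <container label>: <health>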
+func PrintSystemHealthInfo(out io.Writer, shi *daos.SystemHealthInfo, verbose bool) error { + if shi == nil { + return errors.Errorf("nil %T", shi) + } + + iw := txtfmt.NewIndentWriter(out) + + fmt.Fprintln(out, "Component Build Information") + srvStr := build.ComponentServer.String() + cliStr := build.ComponentClient.String() + if srvBuild, found := shi.ComponentBuildInfo[srvStr]; found { + printBuildInfo(iw, srvStr, &srvBuild) + } + if cliBuild, found := shi.ComponentBuildInfo[cliStr]; found { + printBuildInfo(iw, cliStr, &cliBuild) + } + if verbose { + fmt.Fprintln(out, "Client Library Information") + for comp, bi := range shi.ComponentBuildInfo { + if comp != srvStr && comp != cliStr { + printBuildInfo(iw, comp, &bi) + } + } + } + + if shi.SystemInfo != nil { + fmt.Fprintln(out, "System Information") + printSystemInfo(iw, shi.SystemInfo, verbose) + } + + fmt.Fprintln(out, "Pool Status") + if len(shi.Pools) > 0 { + for _, pool := range shi.Pools { + iiw := txtfmt.NewIndentWriter(iw) + printPoolHealth(iw, pool, verbose) + fmt.Fprintln(iiw, "Container Status") + iiiw := txtfmt.NewIndentWriter(iiw) + if len(shi.Containers[pool.UUID]) > 0 { + for _, cont := range shi.Containers[pool.UUID] { + printContainerHealth(iiiw, cont, verbose) + } + } else { + fmt.Fprintln(iiiw, "No containers in pool.") + } + } + } else { + fmt.Fprintln(iw, "No pools in system.") + } + + return nil +} diff --git a/src/control/cmd/daos/pretty/health_test.go b/src/control/cmd/daos/pretty/health_test.go new file mode 100644 index 00000000000..909becb2b0f --- /dev/null +++ b/src/control/cmd/daos/pretty/health_test.go @@ -0,0 +1,413 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package pretty + +import ( + "errors" + "fmt" + "strings" + "testing" + + "github.com/dustin/go-humanize" + "github.com/google/go-cmp/cmp" + "github.com/google/uuid" + + "github.com/daos-stack/daos/src/control/build" + "github.com/daos-stack/daos/src/control/common/proto/convert" + "github.com/daos-stack/daos/src/control/common/test" + "github.com/daos-stack/daos/src/control/lib/daos" + "github.com/daos-stack/daos/src/control/lib/ranklist" +) + +var ( + srvComp = &daos.ComponentBuild{ + Version: "1.2.3", + } + cliComp = &daos.ComponentBuild{ + Version: "1.2.3", + BuildInfo: "1.2.4-pre12345", + } +) + +func TestPretty_printBuildInfo(t *testing.T) { + for name, tc := range map[string]struct { + testComp build.Component + testCompBuild *daos.ComponentBuild + expPrintStr string + }{ + "nil build": { + testComp: build.ComponentServer, + }, + "version only": { + testComp: build.ComponentServer, + testCompBuild: srvComp, + expPrintStr: fmt.Sprintf(` +Server: %s +`, srvComp.Version), + }, + "buildinfo overrides version": { + testComp: build.ComponentAgent, + testCompBuild: cliComp, + expPrintStr: fmt.Sprintf(` +Agent: %s +`, cliComp.BuildInfo), + }, + } { + t.Run(name, func(t *testing.T) { + var bld strings.Builder + printBuildInfo(&bld, tc.testComp.String(), tc.testCompBuild) + + if diff := cmp.Diff(strings.TrimLeft(tc.expPrintStr, "\n"), bld.String()); diff != "" { + t.Fatalf("unexpected pretty-printed string (-want, +got):\n%s\n", diff) + } + }) + } +} + +var ( + sysName = "test-system" + provStr = "test-provider" + hostBase = "test-host" + + sysInfo = &daos.SystemInfo{ + Name: sysName, + Provider: provStr, + AccessPointRankURIs: []*daos.RankURI{ + { + Rank: 1, + URI: provStr + "://" + hostBase + "1", + }, + { + Rank: 0, + URI: provStr + "://" + hostBase + "0", + }, + }, + } +) + +func 
TestPretty_printSystemInfo(t *testing.T) { + for name, tc := range map[string]struct { + testSysInfo *daos.SystemInfo + verbose bool + expPrintStr string + }{ + "nil sysinfo": {}, + "standard sysinfo": { + testSysInfo: sysInfo, + expPrintStr: fmt.Sprintf(` +System Name: %s +Fabric Provider: %s +Access Points: ["%s0","%s1"] +`, sysName, provStr, hostBase, hostBase), + }, + "empty APs list": { + testSysInfo: func() *daos.SystemInfo { + newInfo := new(daos.SystemInfo) + if err := convert.Types(sysInfo, newInfo); err != nil { + t.Fatal(err) + } + newInfo.AccessPointRankURIs = nil + return newInfo + }(), + expPrintStr: fmt.Sprintf(` +System Name: %s +Fabric Provider: %s +Access Points: [] +`, sysName, provStr), + }, + } { + t.Run(name, func(t *testing.T) { + var bld strings.Builder + printSystemInfo(&bld, tc.testSysInfo, tc.verbose) + + if diff := cmp.Diff(strings.TrimLeft(tc.expPrintStr, "\n"), bld.String()); diff != "" { + t.Fatalf("unexpected pretty-printed string (-want, +got):\n%s\n", diff) + } + }) + } +} + +var healthyPool = &daos.PoolInfo{ + QueryMask: daos.DefaultPoolQueryMask, + State: daos.PoolServiceStateReady, + UUID: uuid.New(), + Label: "test-pool", + TotalTargets: 24, + ActiveTargets: 24, + TotalEngines: 3, + DisabledTargets: 0, + Version: 1, + ServiceLeader: 2, + ServiceReplicas: []ranklist.Rank{0, 1, 2}, + Rebuild: &daos.PoolRebuildStatus{ + State: daos.PoolRebuildStateIdle, + }, + TierStats: []*daos.StorageUsageStats{ + { + MediaType: daos.StorageMediaTypeScm, + Total: 128 * humanize.GByte, + Free: 64 * humanize.GByte, + Min: 2420 * humanize.MByte, + Max: 5333 * humanize.MByte, + Mean: 2666 * humanize.GiByte, + }, + { + MediaType: daos.StorageMediaTypeNvme, + Total: 6 * humanize.TByte, + Free: 4 * humanize.TByte, + Min: 130 * humanize.GByte, + Max: 240 * humanize.GByte, + Mean: 166 * humanize.TByte, + }, + }, + EnabledRanks: ranklist.MustCreateRankSet("[0,1,2]"), + PoolLayoutVer: 1, + UpgradeLayoutVer: 1, +} + +func TestPretty_printPoolHealth(t *testing.T) { + busyRebuild := &daos.PoolRebuildStatus{ + State: daos.PoolRebuildStateBusy, + Objects: 42, + Records: 7, + TotalObjects: 100, + } + + getTestPool := func(mut func(*daos.PoolInfo) *daos.PoolInfo) *daos.PoolInfo { + testPool := new(daos.PoolInfo) + if err := convert.Types(healthyPool, testPool); err != nil { + t.Fatal(err) + } + return mut(testPool) + } + + for name, tc := range map[string]struct { + pi *daos.PoolInfo + verbose bool + expPrintStr string + }{ + "nil PoolInfo": {}, + "disabled targets": { + pi: getTestPool(func(pi *daos.PoolInfo) *daos.PoolInfo { + pi.DisabledTargets = 8 + return pi + }), + expPrintStr: fmt.Sprintf(` +%s: Degraded +`, healthyPool.Label), + }, + "disabled targets; verbose": { + pi: getTestPool(func(pi *daos.PoolInfo) *daos.PoolInfo { + pi.DisabledTargets = 8 + return pi + }), + verbose: true, + expPrintStr: fmt.Sprintf(` +%s: Degraded (8/24 targets disabled) +`, healthyPool.Label), + }, + "rebuilding": { + pi: getTestPool(func(pi *daos.PoolInfo) *daos.PoolInfo { + pi.Rebuild = busyRebuild + return pi + }), + expPrintStr: fmt.Sprintf(` +%s: Rebuilding (42.0%% complete) +`, healthyPool.Label), + }, + "rebuilding; verbose": { + pi: getTestPool(func(pi *daos.PoolInfo) *daos.PoolInfo { + pi.Rebuild = busyRebuild + return pi + }), + verbose: true, + expPrintStr: fmt.Sprintf(` +%s: Rebuilding (42.0%% complete) (42/100 objects; 7 records) +`, healthyPool.Label), + }, + "degraded, rebuilding; verbose": { + pi: getTestPool(func(pi *daos.PoolInfo) *daos.PoolInfo { + pi.DisabledTargets = 8 + pi.Rebuild 
= busyRebuild + return pi + }), + verbose: true, + expPrintStr: fmt.Sprintf(` +%s: Degraded (8/24 targets disabled),Rebuilding (42.0%% complete) (42/100 objects; 7 records) +`, healthyPool.Label), + }, + "healthy": { + pi: healthyPool, + expPrintStr: fmt.Sprintf(` +%s: Healthy (4.1 TB Free) +`, healthyPool.Label), + }, + "healthy; verbose": { + pi: healthyPool, + verbose: true, + expPrintStr: fmt.Sprintf(` +%s: Healthy (meta: 64 GB/128 GB Free, data: 4.0 TB/6.0 TB Free) (imbalances: 54%% meta, 44%% data) +`, healthyPool.Label), + }, + "healthy no imbalances; verbose": { + pi: getTestPool(func(pi *daos.PoolInfo) *daos.PoolInfo { + pi.TierStats[0].Min = pi.TierStats[0].Max + pi.TierStats[1].Min = pi.TierStats[1].Max + return pi + }), + verbose: true, + expPrintStr: fmt.Sprintf(` +%s: Healthy (meta: 64 GB/128 GB Free, data: 4.0 TB/6.0 TB Free) +`, healthyPool.Label), + }, + } { + t.Run(name, func(t *testing.T) { + var bld strings.Builder + printPoolHealth(&bld, tc.pi, tc.verbose) + + if diff := cmp.Diff(strings.TrimLeft(tc.expPrintStr, "\n"), bld.String()); diff != "" { + t.Fatalf("unexpected pretty-printed string (-want, +got):\n%s\n", diff) + } + }) + } +} + +var healthyContainer = &daos.ContainerInfo{ + PoolUUID: healthyPool.UUID, + ContainerUUID: uuid.New(), + ContainerLabel: "test-container", + LatestSnapshot: 0, + RedundancyFactor: 1, + NumHandles: 1, + NumSnapshots: 0, + OpenTime: 1782965550217953280, + CloseModifyTime: 1782965553047535616, + Health: "HEALTHY", +} + +func TestPretty_printContainerHealth(t *testing.T) { + for name, tc := range map[string]struct { + ci *daos.ContainerInfo + verbose bool + expPrintStr string + }{ + "nil ContainerInfo": {}, + "healthy": { + ci: healthyContainer, + expPrintStr: fmt.Sprintf(` +%s: Healthy +`, healthyContainer.ContainerLabel), + }, + } { + t.Run(name, func(t *testing.T) { + var bld strings.Builder + printContainerHealth(&bld, tc.ci, tc.verbose) + + if diff := cmp.Diff(strings.TrimLeft(tc.expPrintStr, "\n"), bld.String()); diff != "" { + t.Fatalf("unexpected pretty-printed string (-want, +got):\n%s\n", diff) + } + }) + } + +} + +func TestPretty_PrintSystemHealthInfo(t *testing.T) { + buildInfo := map[string]daos.ComponentBuild{ + build.ComponentServer.String(): *srvComp, + build.ComponentClient.String(): *cliComp, + } + + for name, tc := range map[string]struct { + shi *daos.SystemHealthInfo + verbose bool + expErr error + expPrintStr string + }{ + "nil SystemHealthInfo": { + expErr: errors.New("nil"), + }, + "healthy": { + shi: &daos.SystemHealthInfo{ + SystemInfo: sysInfo, + ComponentBuildInfo: buildInfo, + Pools: map[uuid.UUID]*daos.PoolInfo{ + healthyPool.UUID: healthyPool, + }, + Containers: map[uuid.UUID][]*daos.ContainerInfo{ + healthyContainer.PoolUUID: {healthyContainer}, + }, + }, + expPrintStr: fmt.Sprintf(` +Component Build Information + Server: %s + Client: %s +System Information + System Name: %s + Fabric Provider: %s + Access Points: ["%s0","%s1"] +Pool Status + %s: Healthy (4.1 TB Free) + Container Status + %s: Healthy +`, srvComp.Version, cliComp.BuildInfo, sysInfo.Name, sysInfo.Provider, hostBase, hostBase, healthyPool.Label, healthyContainer.ContainerLabel), + }, + "healthy; no pools": { + shi: &daos.SystemHealthInfo{ + SystemInfo: sysInfo, + ComponentBuildInfo: buildInfo, + Pools: map[uuid.UUID]*daos.PoolInfo{}, + }, + expPrintStr: fmt.Sprintf(` +Component Build Information + Server: %s + Client: %s +System Information + System Name: %s + Fabric Provider: %s + Access Points: ["%s0","%s1"] +Pool Status + No pools in system. 
+`, srvComp.Version, cliComp.BuildInfo, sysInfo.Name, sysInfo.Provider, hostBase, hostBase), + }, + "healthy; no containers": { + shi: &daos.SystemHealthInfo{ + SystemInfo: sysInfo, + ComponentBuildInfo: buildInfo, + Pools: map[uuid.UUID]*daos.PoolInfo{ + healthyPool.UUID: healthyPool, + }, + Containers: map[uuid.UUID][]*daos.ContainerInfo{}, + }, + expPrintStr: fmt.Sprintf(` +Component Build Information + Server: %s + Client: %s +System Information + System Name: %s + Fabric Provider: %s + Access Points: ["%s0","%s1"] +Pool Status + %s: Healthy (4.1 TB Free) + Container Status + No containers in pool. +`, srvComp.Version, cliComp.BuildInfo, sysInfo.Name, sysInfo.Provider, hostBase, hostBase, healthyPool.Label), + }, + } { + t.Run(name, func(t *testing.T) { + var bld strings.Builder + gotErr := PrintSystemHealthInfo(&bld, tc.shi, tc.verbose) + test.CmpErr(t, tc.expErr, gotErr) + if tc.expErr != nil { + return + } + + if diff := cmp.Diff(strings.TrimLeft(tc.expPrintStr, "\n"), bld.String()); diff != "" { + t.Fatalf("unexpected pretty-printed string (-want, +got):\n%s\n", diff) + } + }) + } +} diff --git a/src/control/cmd/daos/system.go b/src/control/cmd/daos/system.go index 70ec95ed06a..bdb189494f8 100644 --- a/src/control/cmd/daos/system.go +++ b/src/control/cmd/daos/system.go @@ -6,29 +6,10 @@ package main -/* -#include "util.h" -*/ -import "C" - import ( - "unsafe" - "github.com/pkg/errors" ) -type rankURI struct { - Rank uint32 `json:"rank"` - URI string `json:"uri"` -} - -type systemInfo struct { - Name string `json:"system_name"` - Provider string `json:"fabric_provider"` - RankURIs []*rankURI `json:"rank_uris"` - AccessPointRankURIs []*rankURI `json:"access_point_rank_uris"` -} - type systemCmd struct { Query systemQueryCmd `command:"query" description:"query DAOS system via the daos_agent"` } @@ -38,31 +19,9 @@ type systemQueryCmd struct { } func (cmd *systemQueryCmd) Execute(_ []string) error { - var cSysInfo *C.struct_daos_sys_info - rc := C.daos_mgmt_get_sys_info(nil, &cSysInfo) - if err := daosError(rc); err != nil { - return errors.Wrap(err, "querying DAOS system information") - } - defer C.daos_mgmt_put_sys_info(cSysInfo) - - sysInfo := &systemInfo{ - Name: C.GoString(&cSysInfo.dsi_system_name[0]), - Provider: C.GoString(&cSysInfo.dsi_fabric_provider[0]), - } - - rankURIs := make(map[uint32]*rankURI) - - for _, cRank := range unsafe.Slice(cSysInfo.dsi_ranks, int(cSysInfo.dsi_nr_ranks)) { - rankURI := &rankURI{ - Rank: uint32(cRank.dru_rank), - URI: C.GoString(cRank.dru_uri), - } - sysInfo.RankURIs = append(sysInfo.RankURIs, rankURI) - rankURIs[rankURI.Rank] = rankURI - } - - for _, cMSRank := range unsafe.Slice(cSysInfo.dsi_ms_ranks, int(cSysInfo.dsi_nr_ms_ranks)) { - sysInfo.AccessPointRankURIs = append(sysInfo.AccessPointRankURIs, rankURIs[uint32(cMSRank)]) + sysInfo, err := cmd.apiProvider.GetSystemInfo() + if err != nil { + return errors.Wrap(err, "failed to query DAOS system") } if cmd.JSONOutputEnabled() { diff --git a/src/control/cmd/daos/util.go b/src/control/cmd/daos/util.go index 383b6cd21b0..1800a130e2a 100644 --- a/src/control/cmd/daos/util.go +++ b/src/control/cmd/daos/util.go @@ -20,6 +20,7 @@ import ( "github.com/daos-stack/daos/src/control/build" "github.com/daos-stack/daos/src/control/common/cmdutil" "github.com/daos-stack/daos/src/control/lib/daos" + "github.com/daos-stack/daos/src/control/lib/daos/api" "github.com/daos-stack/daos/src/control/lib/ranklist" "github.com/daos-stack/daos/src/control/logging" ) @@ -84,10 +85,7 @@ func srvBuildInfo() (*build.Info, 
error) { } func daosError(rc C.int) error { - if rc == 0 { - return nil - } - return daos.Status(rc) + return daos.ErrorFromRC(int(rc)) } func goBool2int(in bool) (out C.int) { @@ -281,20 +279,22 @@ type daosCmd struct { cmdutil.NoArgsCmd cmdutil.JSONOutputCmd cmdutil.LogCmd + apiProvider *api.Provider + SysName string +} + +func (dc *daosCmd) setSysName(sysName string) { + dc.SysName = sysName } func (dc *daosCmd) initDAOS() (func(), error) { - if rc := C.daos_init(); rc != 0 { - // Do some inspection of the RC to display an informative error to the user - // e.g. "No DAOS Agent detected"... - return nil, errors.Wrap(daosError(rc), "daos_init() failed") + provider, err := api.NewProvider(dc.Logger) + if err != nil { + return func() {}, err } + dc.apiProvider = provider - return func() { - if rc := C.daos_fini(); rc != 0 { - dc.Errorf("daos_fini() failed: %s", daosError(rc)) - } - }, nil + return provider.Cleanup, nil } func initDaosDebug() (func(), error) { diff --git a/src/control/lib/control/pool.go b/src/control/lib/control/pool.go index 2b5bfc89efd..9a9772f0256 100644 --- a/src/control/lib/control/pool.go +++ b/src/control/lib/control/pool.go @@ -465,78 +465,17 @@ func (pqr *PoolQueryResp) MarshalJSON() ([]byte, error) { return []byte("null"), nil } - piJSON, err := json.Marshal(&pqr.PoolInfo) - if err != nil { - return nil, err - } - - aux := &struct { - EnabledRanks *[]ranklist.Rank `json:"enabled_ranks"` - DisabledRanks *[]ranklist.Rank `json:"disabled_ranks"` - Status int32 `json:"status"` - }{ - Status: pqr.Status, - } - - if pqr.EnabledRanks != nil { - ranks := pqr.EnabledRanks.Ranks() - aux.EnabledRanks = &ranks - } - - if pqr.DisabledRanks != nil { - ranks := pqr.DisabledRanks.Ranks() - aux.DisabledRanks = &ranks - } - - auxJSON, err := json.Marshal(&aux) - if err != nil { - return nil, err - } - - // Kinda gross, but needed to merge the embedded struct's MarshalJSON - // output with this one's. - piJSON[0] = ',' - return append(auxJSON[:len(auxJSON)-1], piJSON...), nil -} - -func unmarshallRankSet(ranks string) (*ranklist.RankSet, error) { - switch ranks { - case "": - return nil, nil - case "[]": - return &ranklist.RankSet{}, nil - default: - return ranklist.CreateRankSet(ranks) - } -} - -func (pqr *PoolQueryResp) UnmarshalJSON(data []byte) error { - type Alias PoolQueryResp - aux := &struct { - EnabledRanks string `json:"enabled_ranks"` - DisabledRanks string `json:"disabled_ranks"` + // Bypass the MarshalJSON() implementation in daos.PoolInfo, + // which would otherwise be promoted, resulting in the Status + // field not being included. 
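+	// (The local Alias type below has the same fields as daos.PoolInfo but
+	// an empty method set, so json.Marshal uses the default struct encoding
+	// for it rather than the promoted method.)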
+ type Alias daos.PoolInfo + return json.Marshal(&struct { *Alias + Status int32 `json:"status"` }{ - Alias: (*Alias)(pqr), - } - - if err := json.Unmarshal(data, &aux); err != nil { - return err - } - - if rankSet, err := unmarshallRankSet(aux.EnabledRanks); err != nil { - return err - } else { - pqr.EnabledRanks = rankSet - } - - if rankSet, err := unmarshallRankSet(aux.DisabledRanks); err != nil { - return err - } else { - pqr.DisabledRanks = rankSet - } - - return nil + Alias: (*Alias)(&pqr.PoolInfo), + Status: pqr.Status, + }) } // PoolQuery performs a pool query operation for the specified pool ID on a diff --git a/src/control/lib/control/pool_test.go b/src/control/lib/control/pool_test.go index ca3a7e68a9d..6849b88b053 100644 --- a/src/control/lib/control/pool_test.go +++ b/src/control/lib/control/pool_test.go @@ -791,7 +791,7 @@ func TestControl_PoolQueryResp_MarshalJSON(t *testing.T) { }, "null rankset": { pqr: &PoolQueryResp{ - Status: 0, + Status: 42, PoolInfo: daos.PoolInfo{ QueryMask: daos.DefaultPoolQueryMask, State: daos.PoolServiceStateReady, @@ -807,11 +807,11 @@ func TestControl_PoolQueryResp_MarshalJSON(t *testing.T) { UpgradeLayoutVer: 8, }, }, - exp: `{"enabled_ranks":null,"disabled_ranks":null,"status":0,"query_mask":"rebuild,space","state":"Ready","uuid":"` + poolUUID.String() + `","total_targets":1,"active_targets":2,"total_engines":3,"disabled_targets":4,"version":5,"svc_ldr":6,"svc_reps":[0,1,2],"rebuild":null,"tier_stats":null,"pool_layout_ver":7,"upgrade_layout_ver":8}`, + exp: `{"query_mask":"rebuild,space","state":"Ready","uuid":"` + poolUUID.String() + `","total_targets":1,"active_targets":2,"total_engines":3,"disabled_targets":4,"version":5,"svc_ldr":6,"svc_reps":[0,1,2],"rebuild":null,"tier_stats":null,"pool_layout_ver":7,"upgrade_layout_ver":8,"status":42}`, }, "valid rankset": { pqr: &PoolQueryResp{ - Status: 0, + Status: 42, PoolInfo: daos.PoolInfo{ QueryMask: daos.DefaultPoolQueryMask, State: daos.PoolServiceStateReady, @@ -829,7 +829,7 @@ func TestControl_PoolQueryResp_MarshalJSON(t *testing.T) { UpgradeLayoutVer: 8, }, }, - exp: `{"enabled_ranks":[0,1,2,3,5],"disabled_ranks":[],"status":0,"query_mask":"rebuild,space","state":"Ready","uuid":"` + poolUUID.String() + `","total_targets":1,"active_targets":2,"total_engines":3,"disabled_targets":4,"version":5,"svc_ldr":6,"svc_reps":[0,1,2],"rebuild":null,"tier_stats":null,"pool_layout_ver":7,"upgrade_layout_ver":8}`, + exp: `{"query_mask":"rebuild,space","state":"Ready","uuid":"` + poolUUID.String() + `","total_targets":1,"active_targets":2,"total_engines":3,"disabled_targets":4,"version":5,"svc_ldr":6,"svc_reps":[0,1,2],"rebuild":null,"tier_stats":null,"enabled_ranks":[0,1,2,3,5],"disabled_ranks":[],"pool_layout_ver":7,"upgrade_layout_ver":8,"status":42}`, }, } { t.Run(name, func(t *testing.T) { diff --git a/src/control/lib/daos/api/api.go b/src/control/lib/daos/api/api.go new file mode 100644 index 00000000000..b75bf707b72 --- /dev/null +++ b/src/control/lib/daos/api/api.go @@ -0,0 +1,69 @@ +// +// (C) Copyright 2024 Intel Corporation. 
+// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package api + +import ( + "sync" + + "github.com/daos-stack/daos/src/control/lib/daos" +) + +/* +#include + +#cgo LDFLAGS: -ldaos +*/ +import "C" + +type ( + api struct { + sync.RWMutex + initialized bool + } +) + +func daosError(rc C.int) error { + return daos.ErrorFromRC(int(rc)) +} + +func (api *api) isInitialized() bool { + api.RLock() + defer api.RUnlock() + return api.initialized +} + +// Init performs DAOS API initialization steps and returns a closure +// to be called before application exit. +func (api *api) Init() (func(), error) { + api.Lock() + defer api.Unlock() + + stubFini := func() {} + if api.initialized { + return stubFini, daos.Already + } + + if err := daosError(C.daos_init()); err != nil { + return stubFini, err + } + api.initialized = true + + return api.Fini, nil +} + +// Fini releases resources obtained during DAOS API initialization. +func (api *api) Fini() { + api.Lock() + defer api.Unlock() + + if !api.initialized { + return + } + + C.daos_fini() + api.initialized = false +} diff --git a/src/control/lib/daos/api/provider.go b/src/control/lib/daos/api/provider.go new file mode 100644 index 00000000000..5b7c74be35c --- /dev/null +++ b/src/control/lib/daos/api/provider.go @@ -0,0 +1,48 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package api + +import ( + "github.com/pkg/errors" + + "github.com/daos-stack/daos/src/control/logging" +) + +type ( + debugTraceLogger interface { + logging.TraceLogger + logging.DebugLogger + } + + // Provider wraps the C DAOS API with methods that accept and return + // standard Go data types. + Provider struct { + log debugTraceLogger + api *api + cleanup func() + } +) + +// NewProvider returns an initialized DAOS API provider. +func NewProvider(log debugTraceLogger) (*Provider, error) { + api := &api{} + cleanup, err := api.Init() + if err != nil { + return nil, errors.Wrap(err, "failed to initialize DAOS API") + } + + return &Provider{ + log: log, + api: api, + cleanup: cleanup, + }, nil +} + +// Cleanup releases resources obtained during API initialization. +func (p *Provider) Cleanup() { + p.cleanup() +} diff --git a/src/control/lib/daos/api/system.go b/src/control/lib/daos/api/system.go new file mode 100644 index 00000000000..73aa2a86c3d --- /dev/null +++ b/src/control/lib/daos/api/system.go @@ -0,0 +1,55 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package api + +import ( + "unsafe" + + "github.com/pkg/errors" + + "github.com/daos-stack/daos/src/control/lib/daos" +) + +/* +#include + +#cgo LDFLAGS: -ldaos +*/ +import "C" + +// GetSystemInfo queries for the connected system information. 
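+// It wraps the C daos_mgmt_get_sys_info()/daos_mgmt_put_sys_info() calls
+// and converts the returned C structures into a Go-native daos.SystemInfo.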
+func (p *Provider) GetSystemInfo() (*daos.SystemInfo, error) { + var cSysInfo *C.struct_daos_sys_info + rc := C.daos_mgmt_get_sys_info(nil, &cSysInfo) + if err := daos.ErrorFromRC(int(rc)); err != nil { + return nil, errors.Wrap(err, "querying DAOS system information") + } + defer C.daos_mgmt_put_sys_info(cSysInfo) + + sysInfo := &daos.SystemInfo{ + Name: C.GoString(&cSysInfo.dsi_system_name[0]), + Provider: C.GoString(&cSysInfo.dsi_fabric_provider[0]), + AgentPath: C.GoString(&cSysInfo.dsi_agent_path[0]), + } + + rankURIs := make(map[uint32]*daos.RankURI) + + for _, cRank := range unsafe.Slice(cSysInfo.dsi_ranks, int(cSysInfo.dsi_nr_ranks)) { + rankURI := &daos.RankURI{ + Rank: uint32(cRank.dru_rank), + URI: C.GoString(cRank.dru_uri), + } + sysInfo.RankURIs = append(sysInfo.RankURIs, rankURI) + rankURIs[rankURI.Rank] = rankURI + } + + for _, cMSRank := range unsafe.Slice(cSysInfo.dsi_ms_ranks, int(cSysInfo.dsi_nr_ms_ranks)) { + sysInfo.AccessPointRankURIs = append(sysInfo.AccessPointRankURIs, rankURIs[uint32(cMSRank)]) + } + + return sysInfo, nil +} diff --git a/src/control/lib/daos/container.go b/src/control/lib/daos/container.go new file mode 100644 index 00000000000..31c2992f81b --- /dev/null +++ b/src/control/lib/daos/container.go @@ -0,0 +1,61 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package daos + +import "github.com/google/uuid" + +/* +#include + +#include +*/ +import "C" + +const ( + // ContainerOpenFlagReadOnly opens the container in read-only mode. + ContainerOpenFlagReadOnly = C.DAOS_COO_RO + // ContainerOpenFlagReadWrite opens the container in read-write mode. + ContainerOpenFlagReadWrite = C.DAOS_COO_RW + // ContainerOpenFlagExclusive opens the container in exclusive read-write mode. + ContainerOpenFlagExclusive = C.DAOS_COO_EX + // ContainerOpenFlagForce skips container health checks. + ContainerOpenFlagForce = C.DAOS_COO_FORCE + // ContainerOpenFlagReadOnlyMetadata skips container metadata updates. + ContainerOpenFlagReadOnlyMetadata = C.DAOS_COO_RO_MDSTATS + // ContainerOpenFlagEvict evicts the current user's open handles. + ContainerOpenFlagEvict = C.DAOS_COO_EVICT + // ContainerOpenFlagEvictAll evicts all open handles. + ContainerOpenFlagEvictAll = C.DAOS_COO_EVICT_ALL +) + +// ContainerInfo contains information about the Container. +type ContainerInfo struct { + PoolUUID uuid.UUID `json:"pool_uuid"` + ContainerUUID uuid.UUID `json:"container_uuid"` + ContainerLabel string `json:"container_label,omitempty"` + LatestSnapshot uint64 `json:"latest_snapshot"` + RedundancyFactor uint32 `json:"redundancy_factor"` + NumHandles uint32 `json:"num_handles"` + NumSnapshots uint32 `json:"num_snapshots"` + OpenTime uint64 `json:"open_time"` + CloseModifyTime uint64 `json:"close_modify_time"` + Type string `json:"container_type"` + ObjectClass string `json:"object_class,omitempty"` + DirObjectClass string `json:"dir_object_class,omitempty"` + FileObjectClass string `json:"file_object_class,omitempty"` + CHints string `json:"hints,omitempty"` + ChunkSize uint64 `json:"chunk_size,omitempty"` + Health string `json:"health,omitempty"` // FIXME (DAOS-10028): Should be derived from props +} + +// Name returns an identifier for the container (Label, if set, falling back to UUID). 
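+// For example, a ContainerInfo whose ContainerLabel is "mycont" (a
+// hypothetical label) returns "mycont", while one with an empty label
+// returns the container UUID string.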
+func (ci *ContainerInfo) Name() string { + if ci.ContainerLabel == "" { + return ci.ContainerUUID.String() + } + return ci.ContainerLabel +} diff --git a/src/control/lib/daos/health.go b/src/control/lib/daos/health.go new file mode 100644 index 00000000000..057f4d1e19a --- /dev/null +++ b/src/control/lib/daos/health.go @@ -0,0 +1,28 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package daos + +import ( + "github.com/google/uuid" +) + +type ( + // ComponentBuild contains string representations of a component's build information. + ComponentBuild struct { + Version string `json:"version"` + BuildInfo string `json:"build_info,omitempty"` + LibPath string `json:"lib_path,omitempty"` + } + + // SystemHealthInfo provides a top-level structure to hold system health information. + SystemHealthInfo struct { + SystemInfo *SystemInfo `json:"system_info"` + ComponentBuildInfo map[string]ComponentBuild `json:"component_build_info"` + Pools map[uuid.UUID]*PoolInfo `json:"pools"` + Containers map[uuid.UUID][]*ContainerInfo `json:"pool_containers,omitempty"` + } +) diff --git a/src/control/lib/daos/pool.go b/src/control/lib/daos/pool.go index 6a2a80635c6..fe44a00e210 100644 --- a/src/control/lib/daos/pool.go +++ b/src/control/lib/daos/pool.go @@ -53,10 +53,11 @@ type ( // PoolRebuildStatus contains detailed information about the pool rebuild process. PoolRebuildStatus struct { - Status int32 `json:"status"` - State PoolRebuildState `json:"state"` - Objects uint64 `json:"objects"` - Records uint64 `json:"records"` + Status int32 `json:"status"` + State PoolRebuildState `json:"state"` + Objects uint64 `json:"objects"` + Records uint64 `json:"records"` + TotalObjects uint64 `json:"total_objects"` } // PoolInfo contains information about the pool. @@ -74,8 +75,8 @@ type ( ServiceReplicas []ranklist.Rank `json:"svc_reps,omitempty"` Rebuild *PoolRebuildStatus `json:"rebuild"` TierStats []*StorageUsageStats `json:"tier_stats"` - EnabledRanks *ranklist.RankSet `json:"-"` - DisabledRanks *ranklist.RankSet `json:"-"` + EnabledRanks *ranklist.RankSet `json:"enabled_ranks,omitempty"` + DisabledRanks *ranklist.RankSet `json:"disabled_ranks,omitempty"` PoolLayoutVer uint32 `json:"pool_layout_ver"` UpgradeLayoutVer uint32 `json:"upgrade_layout_ver"` } @@ -140,6 +141,15 @@ func resolvePoolQueryOpt(name string) (C.int, error) { return 0, errors.Errorf("invalid pool query option: %q", name) } +// MustNewPoolQueryMask returns a PoolQueryMask initialized with the specified options. +// NB: If an error occurs due to an invalid option, it panics. +func MustNewPoolQueryMask(options ...string) (mask PoolQueryMask) { + if err := mask.SetOptions(options...); err != nil { + panic(err) + } + return +} + // SetOptions sets the pool query mask to include the specified options. 
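+// For example, SetOptions("rebuild", "space") enables the rebuild and space
+// query options; an unrecognized option name returns an error.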
func (pqm *PoolQueryMask) SetOptions(optNames ...string) error { for _, optName := range optNames { @@ -210,7 +220,7 @@ func (pqm *PoolQueryMask) UnmarshalJSON(data []byte) error { } var newVal PoolQueryMask - for _, opt := range strings.Split(string(data), ",") { + for _, opt := range strings.Split(strings.Trim(string(data), "\""), ",") { for k, v := range poolQueryOptMap { if v == opt { newVal |= PoolQueryMask(k) @@ -355,6 +365,18 @@ func (smt StorageMediaType) MarshalJSON() ([]byte, error) { return []byte(`"` + smt.String() + `"`), nil } +func (smt *StorageMediaType) UnmarshalJSON(data []byte) error { + mediaTypeStr := strings.ToUpper(strings.Trim(string(data), "\"")) + + sm, err := unmarshalStrVal(mediaTypeStr, mgmtpb.StorageMediaType_value, mgmtpb.StorageMediaType_name) + if err != nil { + return errors.Wrap(err, "failed to unmarshal StorageMediaType") + } + *smt = StorageMediaType(sm) + + return nil +} + // PoolRebuildState indicates the current state of the pool rebuild process. type PoolRebuildState int32 @@ -380,7 +402,7 @@ func (prs PoolRebuildState) MarshalJSON() ([]byte, error) { } func (prs *PoolRebuildState) UnmarshalJSON(data []byte) error { - stateStr := strings.ToUpper(string(data)) + stateStr := strings.ToUpper(strings.Trim(string(data), "\"")) state, err := unmarshalStrVal(stateStr, mgmtpb.PoolRebuildStatus_State_value, mgmtpb.PoolRebuildStatus_State_name) if err != nil { diff --git a/src/control/lib/daos/pool_test.go b/src/control/lib/daos/pool_test.go index facbc7682c8..e76f33f4c25 100644 --- a/src/control/lib/daos/pool_test.go +++ b/src/control/lib/daos/pool_test.go @@ -7,6 +7,7 @@ package daos import ( + "encoding/json" "strings" "testing" @@ -267,6 +268,16 @@ func TestDaos_PoolQueryMaskUnmarshalJSON(t *testing.T) { testData: []byte("rebuild,disabled_engines"), expString: "disabled_engines,rebuild", }, + "JSON-encoded string values": { + testData: func() []byte { + if b, err := json.Marshal("rebuild,enabled_engines"); err != nil { + panic(err) + } else { + return b + } + }(), + expString: "enabled_engines,rebuild", + }, } { t.Run(name, func(t *testing.T) { var testMask PoolQueryMask diff --git a/src/control/lib/daos/status.go b/src/control/lib/daos/status.go index e40f71555c9..577c0c01d2b 100644 --- a/src/control/lib/daos/status.go +++ b/src/control/lib/daos/status.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2022 Intel Corporation. +// (C) Copyright 2019-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -30,6 +30,17 @@ func (ds Status) Int32() int32 { return int32(ds) } +// ErrorFromRC converts a simple DAOS return code into an error. +func ErrorFromRC(rc int) error { + if rc == 0 { + return nil + } + if rc > 0 { + rc = -rc + } + return Status(rc) +} + const ( // Success indicates no error Success Status = 0 @@ -160,4 +171,6 @@ const ( NoCert Status = -C.DER_NO_CERT // BadCert indicates that an invalid certificate was detected. BadCert Status = -C.DER_BAD_CERT + // RedundancyFactorExceeded indicates that the maximum number of failed components was exceeded. + RedundancyFactorExceeded = -C.DER_RF ) diff --git a/src/control/lib/daos/status_test.go b/src/control/lib/daos/status_test.go index 2945d99f8fd..bc1f4b57b13 100644 --- a/src/control/lib/daos/status_test.go +++ b/src/control/lib/daos/status_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2022 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -7,6 +7,7 @@ package daos_test import ( + "fmt" "testing" "github.com/pkg/errors" @@ -52,3 +53,17 @@ func TestDaos_Error(t *testing.T) { }) } } + +func TestDaos_ErrorFromRC(t *testing.T) { + for rc, expErr := range map[int]error{ + 0: nil, + -1014: daos.ProtocolError, + 1013: daos.TryAgain, + 424242: daos.Status(-424242), + } { + t.Run(fmt.Sprintf("%v", expErr), func(t *testing.T) { + gotErr := daos.ErrorFromRC(rc) + test.AssertEqual(t, expErr, gotErr, "not equal") + }) + } +} diff --git a/src/control/lib/daos/system.go b/src/control/lib/daos/system.go new file mode 100644 index 00000000000..5b16b83cb02 --- /dev/null +++ b/src/control/lib/daos/system.go @@ -0,0 +1,53 @@ +// +// (C) Copyright 2024 Intel Corporation. +// +// SPDX-License-Identifier: BSD-2-Clause-Patent +// + +package daos + +import ( + "net/url" + "sort" +) + +type ( + // RankURI expresses a rank-to-fabric-URI mapping. + RankURI struct { + Rank uint32 `json:"rank"` + URI string `json:"uri"` + } + + // SystemInfo represents information about the connected system. + SystemInfo struct { + Name string `json:"system_name"` + Provider string `json:"fabric_provider"` + AgentPath string `json:"agent_path"` + RankURIs []*RankURI `json:"rank_uris"` + AccessPointRankURIs []*RankURI `json:"access_point_rank_uris"` + } +) + +// AccessPoints returns a string slice representation of the system access points. +func (si *SystemInfo) AccessPoints() []string { + apSet := make(map[string]struct{}) + for _, ri := range si.AccessPointRankURIs { + // NB: This is intended for IP-based address schemes. If we can't + // pretty-print the URI as an address, then the fallback is to use + // the raw URI. Not as nice, but better than nothing. + url, err := url.Parse(ri.URI) + if err == nil { + apSet[url.Hostname()] = struct{}{} + } else { + apSet[ri.URI] = struct{}{} + } + } + + apList := make([]string, 0, len(apSet)) + for ap := range apSet { + apList = append(apList, ap) + } + sort.Strings(apList) + + return apList +} diff --git a/src/control/lib/dlopen/dlopen.go b/src/control/lib/dlopen/dlopen.go index 8f748471fbb..96993a97ecf 100644 --- a/src/control/lib/dlopen/dlopen.go +++ b/src/control/lib/dlopen/dlopen.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2022 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -30,7 +30,7 @@ type LibHandle struct { // by the names specified in libs and returning the first that is successfully // opened. Callers are responsible for closing the handler. If no library can // be successfully opened, an error is returned. -func GetHandle(libs []string) (*LibHandle, error) { +func GetHandle(libs ...string) (*LibHandle, error) { for _, name := range libs { libname := C.CString(name) defer C.free(unsafe.Pointer(libname)) diff --git a/src/control/lib/dlopen/dlopen_example.go b/src/control/lib/dlopen/dlopen_example.go index 178718a5311..6488916d92e 100644 --- a/src/control/lib/dlopen/dlopen_example.go +++ b/src/control/lib/dlopen/dlopen_example.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2022 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -27,7 +27,7 @@ import ( ) func strlen(libs []string, s string) (int, error) { - h, err := GetHandle(libs) + h, err := GetHandle(libs...) 
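+	// (GetHandle is now variadic; the slice above is expanded with "...".)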
if err != nil { return -1, fmt.Errorf(`couldn't get a handle to the library: %v`, err) } diff --git a/src/control/lib/ranklist/ranklist.go b/src/control/lib/ranklist/ranklist.go index c9639a9ca64..c65d3e7259f 100644 --- a/src/control/lib/ranklist/ranklist.go +++ b/src/control/lib/ranklist/ranklist.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2022 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -7,6 +7,7 @@ package ranklist import ( + "encoding/json" "math/bits" "strings" @@ -140,6 +141,41 @@ func (rs *RankSet) Ranks() (out []Rank) { return } +func (rs *RankSet) MarshalJSON() ([]byte, error) { + if rs == nil { + return json.Marshal(nil) + } + return json.Marshal(rs.Ranks()) +} + +func (rs *RankSet) UnmarshalJSON(data []byte) error { + if rs == nil { + return errors.New("nil RankSet") + } + + var ranks []Rank + if err := json.Unmarshal(data, &ranks); err == nil { + rs.Replace(RankSetFromRanks(ranks)) + return nil + } + + // If the input doesn't parse as a JSON array, try parsing + // it as a ranged string. + trimmed := strings.Trim(string(data), "\"") + if trimmed == "[]" { + rs.Replace(&RankSet{}) + return nil + } + + newRs, err := CreateRankSet(trimmed) + if err != nil { + return err + } + rs.Replace(newRs) + + return nil +} + // MustCreateRankSet is like CreateRankSet but will panic on error. func MustCreateRankSet(stringRanks string) *RankSet { rs, err := CreateRankSet(stringRanks) diff --git a/src/control/lib/ranklist/ranklist_test.go b/src/control/lib/ranklist/ranklist_test.go index c3005805c25..34f9193687b 100644 --- a/src/control/lib/ranklist/ranklist_test.go +++ b/src/control/lib/ranklist/ranklist_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2020-2022 Intel Corporation. +// (C) Copyright 2020-2024 Intel Corporation. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -96,3 +96,95 @@ func TestRankList_RankSet(t *testing.T) { }) } } + +func TestRanklist_RankSet_MarshalJSON(t *testing.T) { + for name, tc := range map[string]struct { + rankSet *RankSet + expOut string + expErr error + }{ + "nil rankset": { + expOut: "null", + }, + "empty rankset": { + rankSet: &RankSet{}, + expOut: "[]", + }, + "simple rankset": { + rankSet: MustCreateRankSet("[0-3]"), + expOut: "[0,1,2,3]", + }, + } { + t.Run(name, func(t *testing.T) { + gotBytes, gotErr := tc.rankSet.MarshalJSON() + test.CmpErr(t, tc.expErr, gotErr) + if tc.expErr != nil { + return + } + + if diff := cmp.Diff(tc.expOut, string(gotBytes)); diff != "" { + t.Fatalf("unexpected value (-want, +got):\n%s\n", diff) + } + }) + } +} + +func TestRanklist_RankSet_UnmarshalJSON(t *testing.T) { + for name, tc := range map[string]struct { + dataStr string + rankSet *RankSet + expSet *RankSet + expErr error + }{ + "nil rankset": { + dataStr: "[0,1,2,3]", + expErr: errors.New("nil RankSet"), + }, + "invalid range string": { + dataStr: "3-0", + rankSet: &RankSet{}, + expErr: errors.New("invalid range"), + }, + "empty data": { + rankSet: &RankSet{}, + expSet: &RankSet{}, + }, + "empty slice": { + dataStr: "[]", + rankSet: &RankSet{}, + expSet: &RankSet{}, + }, + "empty slice, quoted": { + dataStr: "\"[]\"", + rankSet: &RankSet{}, + expSet: &RankSet{}, + }, + "rankset from ranks": { + rankSet: &RankSet{}, + dataStr: "[0,1,2,3,5]", + expSet: MustCreateRankSet("[0-3,5]"), + }, + "rankset from range": { + rankSet: &RankSet{}, + dataStr: "[0-3,5]", + expSet: MustCreateRankSet("[0-3,5]"), + }, + } { + t.Run(name, func(t *testing.T) { + gotErr := tc.rankSet.UnmarshalJSON([]byte(tc.dataStr)) + test.CmpErr(t, tc.expErr, gotErr) + if tc.expErr != nil { + return + } + + cmpOpts := cmp.Options{ + cmp.Comparer(func(x, y *RankSet) bool { + return x.String() == y.String() + }), + } + if diff := cmp.Diff(tc.expSet, tc.rankSet, cmpOpts...); diff != "" { + t.Fatalf("unexpected RankSet (-want, +got):\n%s\n", diff) + } + }) + } +} diff --git a/src/control/lib/txtfmt/table.go b/src/control/lib/txtfmt/table.go index 2f80c4a1921..95556c488ac 100644 --- a/src/control/lib/txtfmt/table.go +++ b/src/control/lib/txtfmt/table.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2021 Intel Corporation. +// (C) Copyright 2019-2024 Intel Corporation. // // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -10,9 +10,26 @@ import ( "bytes" "fmt" "io" + "strings" "text/tabwriter" + "unicode" ) +// Title returns the string in Title Format. +// +// NB: This is basically a copy of strings.Title(), which is deprecated. +func Title(s string) string { + prev := ' ' + return strings.Map(func(r rune) rune { + if unicode.IsSpace(prev) { + prev = r + return unicode.ToTitle(r) + } + prev = r + return r + }, strings.ToLower(s)) +} + // TableRow is a map of string values to be printed, keyed by column title. type TableRow map[string]string diff --git a/src/control/lib/txtfmt/table_test.go b/src/control/lib/txtfmt/table_test.go index e592adc3ab7..5e8eed316a1 100644 --- a/src/control/lib/txtfmt/table_test.go +++ b/src/control/lib/txtfmt/table_test.go @@ -1,5 +1,5 @@ // -// (C) Copyright 2019-2021 Intel Corporation. +// (C) Copyright 2019-2024 Intel Corporation. 
// // SPDX-License-Identifier: BSD-2-Clause-Patent // @@ -13,6 +13,25 @@ import ( "github.com/google/go-cmp/cmp" ) +func TestTitle(t *testing.T) { + for testStr, expStr := range map[string]string{ + "": "", + " ": " ", + "lowercase": "Lowercase", + "lowercase three words": "Lowercase Three Words", + "mIxed CASE": "Mixed Case", + "12345": "12345", + } { + t.Run(testStr, func(t *testing.T) { + gotStr := Title(testStr) + + if diff := cmp.Diff(expStr, gotStr); diff != "" { + t.Fatalf("unexpected result (-want, +got): %s", diff) + } + }) + } +} + func TestNewTableFormatter_NoTitles(t *testing.T) { f := NewTableFormatter() if f.writer == nil { diff --git a/src/control/vendor/github.com/jessevdk/go-flags/group.go b/src/control/vendor/github.com/jessevdk/go-flags/group.go index 181caabb2df..e901ddcd008 100644 --- a/src/control/vendor/github.com/jessevdk/go-flags/group.go +++ b/src/control/vendor/github.com/jessevdk/go-flags/group.go @@ -331,7 +331,7 @@ func (g *Group) checkForDuplicateFlags() *Error { longName := option.LongNameWithNamespace() if otherOption, ok := longNames[longName]; ok { - duplicateError = newErrorf(ErrDuplicatedFlag, "option `%s' uses the same long name as option `%s'", option, otherOption) + duplicateError = newErrorf(ErrDuplicatedFlag, "%s/%s: option `%s' uses the same long name as option `%s'", longName, otherOption.LongNameWithNamespace(), option, otherOption) return } longNames[longName] = option diff --git a/src/include/daos/api.h b/src/include/daos/api.h new file mode 100644 index 00000000000..8c2a20fc668 --- /dev/null +++ b/src/include/daos/api.h @@ -0,0 +1,28 @@ +/** + * (C) Copyright 2024 Intel Corporation. + * + * SPDX-License-Identifier: BSD-2-Clause-Patent + */ +/** + * Miscellaneous Client API Functions + */ + +#ifndef __DAOS_API_H__ +#define __DAOS_API_H__ + +#include + +/** + * Retrieve the daos API version. + * + * \param[out] major Pointer to an int that will be set to DAOS_API_VERSION_MAJOR. + * \param[out] minor Pointer to an int that will be set to DAOS_API_VERSION_MINOR. + * \param[out] fix Pointer to an int that will be set to DAOS_API_VERSION_FIX. + * + * \return 0 if Success, negative if failed. + * \retval -DER_INVAL Invalid parameter. 
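+ *
+ * Illustrative call (variable names are arbitrary):
+ *
+ *	int major, minor, fix;
+ *	int rc = daos_version_get(&major, &minor, &fix);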
+ */ +int +daos_version_get(int *major, int *minor, int *fix); + +#endif \ No newline at end of file diff --git a/src/include/daos_types.h b/src/include/daos_types.h index b7071d11245..6db21423cee 100644 --- a/src/include/daos_types.h +++ b/src/include/daos_types.h @@ -19,6 +19,7 @@ extern "C" { #include #include #include +#include /** uuid_t */ #include @@ -274,6 +275,8 @@ struct daos_sys_info { char dsi_system_name[DAOS_SYS_INFO_STRING_MAX + 1]; /** fabric provider in use by this system */ char dsi_fabric_provider[DAOS_SYS_INFO_STRING_MAX + 1]; + /** path to agent socket */ + char dsi_agent_path[PATH_MAX]; /** length of ranks array */ uint32_t dsi_nr_ranks; /** ranks and their client-accessible URIs */ diff --git a/src/mgmt/cli_mgmt.c b/src/mgmt/cli_mgmt.c index ea475c50466..c24f4802171 100644 --- a/src/mgmt/cli_mgmt.c +++ b/src/mgmt/cli_mgmt.c @@ -492,6 +492,7 @@ dc_mgmt_get_sys_info(const char *sys, struct daos_sys_info **out) copy_str(info->dsi_system_name, internal.system_name); copy_str(info->dsi_fabric_provider, internal.provider); + copy_str(info->dsi_agent_path, dc_agent_sockpath); *out = info; From 87a2a8625a8e42dfb302547733e407afc3506a75 Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Mon, 29 Jul 2024 16:11:12 +0100 Subject: [PATCH 02/11] DAOS-16223 control: Config generate to not output auto values (#14778) Config generate should not output auto calculated values update ftest to not expect scm_size in config generate yaml output Signed-off-by: Tom Nabarro --- src/control/cmd/daos_server/auto_test.go | 74 +++---------------- src/control/cmd/dmg/auto_test.go | 46 ++---------- src/control/lib/control/auto.go | 12 +-- src/control/lib/control/auto_test.go | 73 +----------------- src/control/server/config/server_test.go | 22 +++++- .../ftest/control/config_generate_output.py | 10 +-- 6 files changed, 50 insertions(+), 187 deletions(-) diff --git a/src/control/cmd/daos_server/auto_test.go b/src/control/cmd/daos_server/auto_test.go index 1cff1b0c00c..8fb8d85522f 100644 --- a/src/control/cmd/daos_server/auto_test.go +++ b/src/control/cmd/daos_server/auto_test.go @@ -13,7 +13,6 @@ import ( "strings" "testing" - "github.com/dustin/go-humanize" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" "github.com/pkg/errors" @@ -216,20 +215,10 @@ func TestDaosServer_Auto_confGen(t *testing.T) { e1 := control.MockEngineCfg(1, 1, 3).WithTargetCount(18).WithHelperStreamCount(4) exmplEngineCfgs := []*engine.Config{e0, e1} - engRsvdGiB := 2 /* calculated based on 18 targets */ - sysRsvdGiB := storage.DefaultSysMemRsvd / humanize.GiByte - ramdiskGiB := storage.MinRamdiskMem / humanize.GiByte - // Nr hugepages expected with 18 targets * 2 engines * 512 pages-per-target. - defNrHugepages := 18 * 2 * 512 - tmpfsHugeMemGiB := (humanize.MiByte * 2 * defNrHugepages) / humanize.GiByte - // Total mem to meet requirements 38GiB hugeMem, 2GiB per engine rsvd, 6GiB sys rsvd, 4GiB - // per engine RAM-disk. - tmpfsMemTotalGiB := humanize.GiByte * (tmpfsHugeMemGiB + (2 * engRsvdGiB) + sysRsvdGiB + - (2 * ramdiskGiB) + 1 /* add 1GiB buffer */) tmpfsEngineCfgs := []*engine.Config{ - control.MockEngineCfgTmpfs(0, ramdiskGiB, control.MockBdevTier(0, 2, 4)). + control.MockEngineCfgTmpfs(0, 0, control.MockBdevTier(0, 2, 4)). WithTargetCount(18).WithHelperStreamCount(4), - control.MockEngineCfgTmpfs(1, ramdiskGiB, control.MockBdevTier(1, 1, 3)). + control.MockEngineCfgTmpfs(1, 0, control.MockBdevTier(1, 1, 3)). 
WithTargetCount(18).WithHelperStreamCount(4), } @@ -237,16 +226,8 @@ func TestDaosServer_Auto_confGen(t *testing.T) { controlMetadata := storage.ControlMetadata{ Path: metadataMountPath, } - // Nr hugepages expected with 18+1 (extra MD-on-SSD sys-xstream) targets * 2 engines * 512 - // pages-per-target. - mdOnSSDNrHugepages := 19 * 2 * 512 - mdOnSSDHugeMemGiB := (humanize.MiByte * 2 * mdOnSSDNrHugepages) / humanize.GiByte - // Total mem to meet requirements 39GiB hugeMem, 2GiB per engine rsvd, 6GiB sys rsvd, 4GiB - // per engine RAM-disk. - mdOnSSDMemTotalGiB := humanize.GiByte * (mdOnSSDHugeMemGiB + (2 * engRsvdGiB) + sysRsvdGiB + - (2 * ramdiskGiB) + 1 /* add 1GiB buffer */) mdOnSSDEngineCfgs := []*engine.Config{ - control.MockEngineCfgTmpfs(0, ramdiskGiB, + control.MockEngineCfgTmpfs(0, 0, control.MockBdevTierWithRole(0, storage.BdevRoleWAL, 2), control.MockBdevTierWithRole(0, storage.BdevRoleMeta|storage.BdevRoleData, 4)). WithStorageControlMetadataPath(metadataMountPath). @@ -254,7 +235,7 @@ func TestDaosServer_Auto_confGen(t *testing.T) { filepath.Join(controlMetadata.EngineDirectory(0), storage.BdevOutConfName), ). WithTargetCount(18).WithHelperStreamCount(4), - control.MockEngineCfgTmpfs(1, ramdiskGiB, + control.MockEngineCfgTmpfs(1, 0, control.MockBdevTierWithRole(1, storage.BdevRoleWAL, 1), control.MockBdevTierWithRole(1, storage.BdevRoleMeta|storage.BdevRoleData, 3)). WithStorageControlMetadataPath(metadataMountPath). @@ -266,7 +247,10 @@ func TestDaosServer_Auto_confGen(t *testing.T) { var defCoresPerNuma uint32 = 26 var defNumaCount uint32 = 2 - defMemInfo := common.MemInfo{HugepageSizeKiB: 2048} + defMemInfo := common.MemInfo{ + HugepageSizeKiB: 2048, + MemTotalKiB: 1, // Avoid failing non-zero check. + } defHostFabric := &control.HostFabric{ Interfaces: []*control.HostFabricInterface{ eth0, eth1, ib0, ib1, @@ -354,7 +338,6 @@ func TestDaosServer_Auto_confGen(t *testing.T) { hf: defHostFabric, hs: defHostStorage, expCfg: control.MockServerCfg("ofi+psm2", exmplEngineCfgs). - WithNrHugepages(defNrHugepages). WithAccessPoints("localhost:10001"). WithControlLogFile("/tmp/daos_server.log"), }, @@ -363,7 +346,6 @@ func TestDaosServer_Auto_confGen(t *testing.T) { hf: defHostFabric, hs: defHostStorage, expCfg: control.MockServerCfg("ofi+psm2", exmplEngineCfgs). - WithNrHugepages(defNrHugepages). WithAccessPoints("localhost:10001"). WithAccessPoints("moon-111:10001", "mars-115:10001", "jupiter-119:10001"). WithControlLogFile("/tmp/daos_server.log"), @@ -400,10 +382,7 @@ func TestDaosServer_Auto_confGen(t *testing.T) { storage.MockScmNamespace(0), storage.MockScmNamespace(1), }, - MemInfo: &common.MemInfo{ - HugepageSizeKiB: 2048, - MemTotalKiB: tmpfsMemTotalGiB / humanize.KiByte, - }, + MemInfo: &defMemInfo, NvmeDevices: storage.NvmeControllers{ storage.MockNvmeController(1), storage.MockNvmeController(2), @@ -412,7 +391,6 @@ func TestDaosServer_Auto_confGen(t *testing.T) { }, }, expCfg: control.MockServerCfg("ofi+psm2", tmpfsEngineCfgs). - WithNrHugepages(defNrHugepages). WithAccessPoints("localhost:10001"). 
WithControlLogFile("/tmp/daos_server.log"), }, @@ -422,28 +400,6 @@ func TestDaosServer_Auto_confGen(t *testing.T) { hs: defHostStorage, expErr: errors.New("only supported with scm class ram"), }, - "tmpfs scm; md-on-ssd; low mem": { - tmpfsSCM: true, - extMetadataPath: metadataMountPath, - hf: defHostFabric, - hs: &control.HostStorage{ - ScmNamespaces: storage.ScmNamespaces{ - storage.MockScmNamespace(0), - storage.MockScmNamespace(1), - }, - MemInfo: &common.MemInfo{ - HugepageSizeKiB: 2048, - MemTotalKiB: (humanize.GiByte * 12) / humanize.KiByte, - }, - NvmeDevices: storage.NvmeControllers{ - storage.MockNvmeController(1), - storage.MockNvmeController(2), - storage.MockNvmeController(3), - storage.MockNvmeController(4), - }, - }, - expErr: errors.New("insufficient ram"), - }, "tmpfs scm; md-on-ssd": { tmpfsSCM: true, extMetadataPath: metadataMountPath, @@ -453,10 +409,7 @@ func TestDaosServer_Auto_confGen(t *testing.T) { storage.MockScmNamespace(0), storage.MockScmNamespace(1), }, - MemInfo: &common.MemInfo{ - HugepageSizeKiB: 2048, - MemTotalKiB: mdOnSSDMemTotalGiB / humanize.KiByte, - }, + MemInfo: &defMemInfo, NvmeDevices: storage.NvmeControllers{ storage.MockNvmeController(1), storage.MockNvmeController(2), @@ -465,7 +418,6 @@ func TestDaosServer_Auto_confGen(t *testing.T) { }, }, expCfg: control.MockServerCfg("ofi+psm2", mdOnSSDEngineCfgs). - WithNrHugepages(mdOnSSDNrHugepages). WithAccessPoints("localhost:10001"). WithControlLogFile("/tmp/daos_server.log"). WithControlMetadata(controlMetadata), @@ -479,10 +431,7 @@ func TestDaosServer_Auto_confGen(t *testing.T) { storage.MockScmNamespace(0), storage.MockScmNamespace(1), }, - MemInfo: &common.MemInfo{ - HugepageSizeKiB: 2048, - MemTotalKiB: mdOnSSDMemTotalGiB / humanize.KiByte, - }, + MemInfo: &defMemInfo, NvmeDevices: storage.NvmeControllers{ storage.MockNvmeController(1), storage.MockNvmeController(2), @@ -561,7 +510,6 @@ func TestDaosServer_Auto_confGen(t *testing.T) { WithBdevDeviceList("0000:97:00.5", "0000:e2:00.5"), ), }). - WithNrHugepages(16 /* tgts */ * 2 /* engines */ * 512 /* pages-per-tgt */). WithAccessPoints("localhost:10001"). WithControlLogFile("/tmp/daos_server.log"), }, diff --git a/src/control/cmd/dmg/auto_test.go b/src/control/cmd/dmg/auto_test.go index 104b389a3e3..48f06427792 100644 --- a/src/control/cmd/dmg/auto_test.go +++ b/src/control/cmd/dmg/auto_test.go @@ -13,7 +13,6 @@ import ( "strings" "testing" - "github.com/dustin/go-humanize" "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" "github.com/pkg/errors" @@ -305,22 +304,13 @@ func TestAuto_confGen(t *testing.T) { Addr: "host1", Message: control.MockServerScanResp(t, "withSpaceUsage"), } - storRespHighMem := control.MockServerScanResp(t, "withSpaceUsage") - // Total mem to meet requirements 34GiB hugeMem, 2GiB per engine rsvd, 16GiB sys rsvd, - // 5GiB per engine for tmpfs. - storRespHighMem.MemInfo.MemTotalKb = (humanize.GiByte * (34 + 4 + 16 + 10)) / humanize.KiByte - mockRamdiskSize := 5 - storHostRespHighMem := &control.HostResponse{ - Addr: "host1", - Message: storRespHighMem, - } e0 := control.MockEngineCfg(0, 2, 4, 6, 8).WithHelperStreamCount(4) e1 := control.MockEngineCfg(1, 1, 3, 5, 7).WithHelperStreamCount(4) exmplEngineCfgs := []*engine.Config{e0, e1} tmpfsEngineCfgs := []*engine.Config{ - control.MockEngineCfgTmpfs(0, mockRamdiskSize+1, control.MockBdevTier(0, 2, 4, 6, 8)). + control.MockEngineCfgTmpfs(0, 0, control.MockBdevTier(0, 2, 4, 6, 8)). 
WithHelperStreamCount(4), - control.MockEngineCfgTmpfs(1, mockRamdiskSize+1, control.MockBdevTier(1, 1, 3, 5, 7)). + control.MockEngineCfgTmpfs(1, 0, control.MockBdevTier(1, 1, 3, 5, 7)). WithHelperStreamCount(4), } metadataMountPath := "/mnt/daos_md" @@ -328,7 +318,7 @@ func TestAuto_confGen(t *testing.T) { Path: metadataMountPath, } mdonssdEngineCfgs := []*engine.Config{ - control.MockEngineCfgTmpfs(0, mockRamdiskSize, + control.MockEngineCfgTmpfs(0, 0, control.MockBdevTierWithRole(0, storage.BdevRoleWAL, 2), control.MockBdevTierWithRole(0, storage.BdevRoleMeta|storage.BdevRoleData, 4, 6, 8)). WithStorageControlMetadataPath(metadataMountPath). @@ -336,7 +326,7 @@ func TestAuto_confGen(t *testing.T) { filepath.Join(controlMetadata.EngineDirectory(0), storage.BdevOutConfName), ). WithHelperStreamCount(4), - control.MockEngineCfgTmpfs(1, mockRamdiskSize, + control.MockEngineCfgTmpfs(1, 0, control.MockBdevTierWithRole(1, storage.BdevRoleWAL, 1), control.MockBdevTierWithRole(1, storage.BdevRoleMeta|storage.BdevRoleData, 3, 5, 7)). WithStorageControlMetadataPath(metadataMountPath). @@ -386,8 +376,6 @@ func TestAuto_confGen(t *testing.T) { {storHostResp}, }, expCfg: control.MockServerCfg("ofi+psm2", exmplEngineCfgs). - // 16 targets * 2 engines * 512 pages - WithNrHugepages(16 * 2 * 512). WithAccessPoints("localhost:10001"). WithControlLogFile("/tmp/daos_server.log"), }, @@ -398,8 +386,6 @@ func TestAuto_confGen(t *testing.T) { {storHostResp}, }, expCfg: control.MockServerCfg("ofi+psm2", exmplEngineCfgs). - // 16 targets * 2 engines * 512 pages - WithNrHugepages(16*2*512). WithAccessPoints("moon-111:10001", "mars-115:10001", "jupiter-119:10001"). WithControlLogFile("/tmp/daos_server.log"), }, @@ -435,11 +421,9 @@ func TestAuto_confGen(t *testing.T) { tmpfsSCM: true, hostResponsesSet: [][]*control.HostResponse{ {netHostResp}, - {storHostRespHighMem}, + {storHostResp}, }, expCfg: control.MockServerCfg("ofi+psm2", tmpfsEngineCfgs). - // 16 targets * 2 engines * 512 pages - WithNrHugepages(16 * 2 * 512). WithControlLogFile("/tmp/daos_server.log"), }, "dcpm scm; control_metadata path set": { @@ -450,25 +434,14 @@ func TestAuto_confGen(t *testing.T) { }, expErr: errors.New("only supported with scm class ram"), }, - "tmpfs scm; md-on-ssd; low mem": { - tmpfsSCM: true, - extMetadataPath: metadataMountPath, - hostResponsesSet: [][]*control.HostResponse{ - {netHostResp}, - {storHostResp}, - }, - expErr: errors.New("insufficient ram"), - }, "tmpfs scm; md-on-ssd": { tmpfsSCM: true, extMetadataPath: metadataMountPath, hostResponsesSet: [][]*control.HostResponse{ {netHostResp}, - {storHostRespHighMem}, + {storHostResp}, }, expCfg: control.MockServerCfg("ofi+psm2", mdonssdEngineCfgs). - // 16+1 (MD-on-SSD extra sys-XS) targets * 2 engines * 512 pages - WithNrHugepages(17 * 2 * 512). WithControlLogFile("/tmp/daos_server.log"). WithControlMetadata(controlMetadata), }, @@ -477,11 +450,9 @@ func TestAuto_confGen(t *testing.T) { extMetadataPath: metadataMountPath, hostResponsesSet: [][]*control.HostResponse{ {netHostResp}, - {storHostRespHighMem}, + {storHostResp}, }, expCfg: control.MockServerCfg("ofi+psm2", mdonssdEngineCfgs). - // 16+1 (MD-on-SSD extra sys-XS) targets * 2 engines * 512 pages - WithNrHugepages(17 * 2 * 512). WithControlLogFile("/tmp/daos_server.log"). 
WithControlMetadata(controlMetadata), expOutPrefix: "port: 10001", @@ -616,7 +587,7 @@ engines: disable_vfio: false disable_vmd: false enable_hotplug: false -nr_hugepages: 6144 +nr_hugepages: 0 system_ram_reserved: 16 disable_hugepages: false control_log_mask: INFO @@ -636,7 +607,6 @@ hyperthreads: false WithControlLogFile(defaultControlLogFile). WithFabricProvider("ofi+verbs"). WithAccessPoints("hostX:10002"). - WithNrHugepages(6144). WithDisableVMD(false). WithEngines( engine.MockConfig(). diff --git a/src/control/lib/control/auto.go b/src/control/lib/control/auto.go index 6f90bb9671b..0f2a94c404c 100644 --- a/src/control/lib/control/auto.go +++ b/src/control/lib/control/auto.go @@ -194,7 +194,7 @@ func ConfGenerate(req ConfGenerateReq, newEngineCfg newEngineCfgFn, hf *HostFabr } // populate server config using engine configs - sc, err := genServerConfig(req, ecs, sd.MemInfo, tc) + sc, err := genServerConfig(req, ecs, tc) if err != nil { return nil, err } @@ -1217,7 +1217,7 @@ func checkAccessPointPorts(log logging.Logger, aps []string) (int, error) { // Generate a server config file from the constituent hardware components. Enforce consistent // target and helper count across engine configs necessary for optimum performance and populate // config parameters. Set NUMA affinity on the generated config and then run through validation. -func genServerConfig(req ConfGenerateReq, ecs []*engine.Config, mi *common.MemInfo, tc *threadCounts) (*config.Server, error) { +func genServerConfig(req ConfGenerateReq, ecs []*engine.Config, tc *threadCounts) (*config.Server, error) { if len(ecs) == 0 { return nil, errors.New("expected non-zero number of engine configs") } @@ -1260,13 +1260,5 @@ func genServerConfig(req ConfGenerateReq, ecs []*engine.Config, mi *common.MemIn return nil, errors.Wrap(err, "validating engine config") } - if err := cfg.SetNrHugepages(req.Log, mi); err != nil { - return nil, err - } - - if err := cfg.SetRamdiskSize(req.Log, mi); err != nil { - return nil, err - } - return cfg, nil } diff --git a/src/control/lib/control/auto_test.go b/src/control/lib/control/auto_test.go index 510a9adb0fb..2e45d641daf 100644 --- a/src/control/lib/control/auto_test.go +++ b/src/control/lib/control/auto_test.go @@ -1613,7 +1613,6 @@ func TestControl_AutoConfig_getThreadCounts(t *testing.T) { } func TestControl_AutoConfig_genServerConfig(t *testing.T) { - defHpSizeKb := 2048 exmplEngineCfg0 := MockEngineCfg(0, 0, 1, 2) exmplEngineCfg1 := MockEngineCfg(1, 3, 4, 5) metadataMountPath := "/mnt/daos_md" @@ -1625,8 +1624,6 @@ func TestControl_AutoConfig_genServerConfig(t *testing.T) { accessPoints []string // list of access point host/ip addresses extMetadataPath string ecs []*engine.Config - hpSize int - memTotal int threadCounts *threadCounts // numa to cpu mappings expCfg *config.Server // expected config generated expErr error @@ -1634,65 +1631,49 @@ func TestControl_AutoConfig_genServerConfig(t *testing.T) { "no engines": { expErr: errors.New("expected non-zero"), }, - "no hugepage size": { - threadCounts: &threadCounts{16, 0}, - ecs: []*engine.Config{exmplEngineCfg0}, - memTotal: (humanize.GiByte * 16) / humanize.KiByte, // convert to kib - expErr: errors.New("invalid system hugepage size"), - }, "no provider in engine config": { threadCounts: &threadCounts{16, 0}, ecs: []*engine.Config{ MockEngineCfg(0, 0, 1, 2).WithFabricProvider(""), }, - hpSize: defHpSizeKb, expErr: errors.New("provider not specified"), }, "no access points": { accessPoints: []string{}, threadCounts: &threadCounts{16, 0}, ecs: 
[]*engine.Config{exmplEngineCfg0}, - hpSize: defHpSizeKb, expErr: errors.New("no access points"), }, "access points without the same port": { accessPoints: []string{"bob:1", "joe:2"}, threadCounts: &threadCounts{16, 0}, ecs: []*engine.Config{exmplEngineCfg0}, - hpSize: defHpSizeKb, expErr: errors.New("numbers do not match"), }, "access points some with port specified": { accessPoints: []string{"bob:1", "joe"}, threadCounts: &threadCounts{16, 0}, ecs: []*engine.Config{exmplEngineCfg0}, - hpSize: defHpSizeKb, expErr: errors.New("numbers do not match"), }, "single engine config; default port number": { accessPoints: []string{"hostX"}, threadCounts: &threadCounts{16, 0}, ecs: []*engine.Config{exmplEngineCfg0}, - hpSize: defHpSizeKb, expCfg: MockServerCfg(exmplEngineCfg0.Fabric.Provider, []*engine.Config{ exmplEngineCfg0.WithHelperStreamCount(0), }). - // 16 targets * 1 engine * 512 pages - WithNrHugepages(16 * 512). WithAccessPoints("hostX:10001"), // Default applied. }, "single engine config; default port number specified": { accessPoints: []string{"hostX:10001"}, threadCounts: &threadCounts{16, 0}, ecs: []*engine.Config{exmplEngineCfg0}, - hpSize: defHpSizeKb, expCfg: MockServerCfg(exmplEngineCfg0.Fabric.Provider, []*engine.Config{ exmplEngineCfg0.WithHelperStreamCount(0), }). - // 16 targets * 1 engine * 512 pages - WithNrHugepages(16 * 512). WithAccessPoints("hostX:10001"), // ControlPort remains at 10001. }, "dual engine config; custom access point port number": { @@ -1702,14 +1683,11 @@ func TestControl_AutoConfig_genServerConfig(t *testing.T) { exmplEngineCfg0, exmplEngineCfg1, }, - hpSize: defHpSizeKb, expCfg: MockServerCfg(exmplEngineCfg0.Fabric.Provider, []*engine.Config{ exmplEngineCfg0.WithHelperStreamCount(0), exmplEngineCfg1.WithHelperStreamCount(0), }). - // 16 targets * 2 engines * 512 pages - WithNrHugepages(16 * 2 * 512). WithAccessPoints("hostX:10002"). WithControlPort(10002), // ControlPort updated to AP port. 
}, @@ -1720,7 +1698,6 @@ func TestControl_AutoConfig_genServerConfig(t *testing.T) { exmplEngineCfg0, exmplEngineCfg1, }, - hpSize: defHpSizeKb, expErr: config.FaultConfigBadControlPort, }, "dual engine tmpfs; multiple bdev tiers; no control metadata path": { @@ -1729,40 +1706,7 @@ func TestControl_AutoConfig_genServerConfig(t *testing.T) { MockEngineCfgTmpfs(0, 0, MockBdevTier(0, 0), MockBdevTier(0, 1, 2)), MockEngineCfgTmpfs(1, 0, MockBdevTier(1, 3), MockBdevTier(1, 4, 5)), }, - hpSize: defHpSizeKb, - memTotal: (52 * humanize.GiByte) / humanize.KiByte, - expErr: errors.New("multiple bdev tiers"), - }, - "dual engine tmpfs; no hugepage size": { - extMetadataPath: metadataMountPath, - threadCounts: &threadCounts{16, 0}, - ecs: []*engine.Config{ - MockEngineCfgTmpfs(0, 0, MockBdevTier(0, 0), MockBdevTier(0, 1, 2)), - MockEngineCfgTmpfs(1, 0, MockBdevTier(1, 3), MockBdevTier(1, 4, 5)), - }, - memTotal: humanize.GiByte, - expErr: errors.New("invalid system hugepage size"), - }, - "dual engine tmpfs; no mem": { - extMetadataPath: metadataMountPath, - threadCounts: &threadCounts{16, 0}, - ecs: []*engine.Config{ - MockEngineCfgTmpfs(0, 0, MockBdevTier(0, 0), MockBdevTier(0, 1, 2)), - MockEngineCfgTmpfs(1, 0, MockBdevTier(1, 3), MockBdevTier(1, 4, 5)), - }, - hpSize: defHpSizeKb, - expErr: errors.New("requires nonzero total mem"), - }, - "dual engine tmpfs; low mem": { - extMetadataPath: metadataMountPath, - threadCounts: &threadCounts{16, 0}, - ecs: []*engine.Config{ - MockEngineCfgTmpfs(0, 0, MockBdevTier(0, 0), MockBdevTier(0, 1, 2)), - MockEngineCfgTmpfs(1, 0, MockBdevTier(1, 3), MockBdevTier(1, 4, 5)), - }, - hpSize: defHpSizeKb, - memTotal: humanize.GiByte / humanize.KiByte, - expErr: errors.New("insufficient ram"), + expErr: errors.New("multiple bdev tiers"), }, "dual engine tmpfs; high mem": { accessPoints: []string{"hostX:10002", "hostY:10002", "hostZ:10002"}, @@ -1772,11 +1716,9 @@ func TestControl_AutoConfig_genServerConfig(t *testing.T) { MockEngineCfgTmpfs(0, 0, MockBdevTier(0, 0), MockBdevTier(0, 1, 2)), MockEngineCfgTmpfs(1, 0, MockBdevTier(1, 3), MockBdevTier(1, 4, 5)), }, - hpSize: defHpSizeKb, - memTotal: (64 * humanize.GiByte) / humanize.KiByte, expCfg: MockServerCfg(exmplEngineCfg0.Fabric.Provider, []*engine.Config{ - MockEngineCfgTmpfs(0, 5, /* tmpfs size in gib */ + MockEngineCfgTmpfs(0, 0, MockBdevTier(0, 0).WithBdevDeviceRoles(4), MockBdevTier(0, 1, 2).WithBdevDeviceRoles(3)). WithHelperStreamCount(0). @@ -1785,7 +1727,7 @@ func TestControl_AutoConfig_genServerConfig(t *testing.T) { filepath.Join(controlMetadata.EngineDirectory(0), storage.BdevOutConfName), ), - MockEngineCfgTmpfs(1, 5, /* tmpfs size in gib */ + MockEngineCfgTmpfs(1, 0, MockBdevTier(1, 3).WithBdevDeviceRoles(4), MockBdevTier(1, 4, 5).WithBdevDeviceRoles(3)). WithHelperStreamCount(0). @@ -1795,8 +1737,6 @@ func TestControl_AutoConfig_genServerConfig(t *testing.T) { storage.BdevOutConfName), ), }). - // 16+1 (MD-on-SSD) targets * 2 engines * 512 pages - WithNrHugepages(17*2*512). WithAccessPoints("hostX:10002", "hostY:10002", "hostZ:10002"). WithControlPort(10002). // ControlPort updated to AP port. 
WithControlMetadata(controlMetadata), @@ -1813,18 +1753,13 @@ func TestControl_AutoConfig_genServerConfig(t *testing.T) { tc.threadCounts = &threadCounts{} } - mi := &common.MemInfo{ - HugepageSizeKiB: tc.hpSize, - MemTotalKiB: tc.memTotal, - } - req := ConfGenerateReq{ Log: log, AccessPoints: tc.accessPoints, ExtMetadataPath: tc.extMetadataPath, } - getCfg, gotErr := genServerConfig(req, tc.ecs, mi, tc.threadCounts) + getCfg, gotErr := genServerConfig(req, tc.ecs, tc.threadCounts) test.CmpErr(t, tc.expErr, gotErr) if tc.expErr != nil { return diff --git a/src/control/server/config/server_test.go b/src/control/server/config/server_test.go index 4c5a3c6a3ab..0c11b358573 100644 --- a/src/control/server/config/server_test.go +++ b/src/control/server/config/server_test.go @@ -980,9 +980,11 @@ func TestServerConfig_SetNrHugepages(t *testing.T) { testFile := filepath.Join(testDir, sConfigUncomment) uncommentServerConfig(t, testFile) + defHpSizeKb := 2048 + for name, tc := range map[string]struct { extraConfig func(c *Server) *Server - memTotBytes uint64 + zeroHpSize bool expNrHugepages int expErr error }{ @@ -1034,6 +1036,13 @@ func TestServerConfig_SetNrHugepages(t *testing.T) { ) }, }, + "zero hugepage size": { + extraConfig: func(c *Server) *Server { + return c + }, + zeroHpSize: true, + expErr: errors.New("invalid system hugepage size"), + }, "zero hugepages set in config; bdevs configured; implicit role assignment": { extraConfig: func(c *Server) *Server { return c.WithEngines(defaultEngineCfg(). @@ -1105,7 +1114,10 @@ func TestServerConfig_SetNrHugepages(t *testing.T) { cfg := tc.extraConfig(baseCfg(t, testFile)) mi := &common.MemInfo{ - HugepageSizeKiB: 2048, + HugepageSizeKiB: defHpSizeKb, + } + if tc.zeroHpSize { + mi.HugepageSizeKiB = 0 } test.CmpErr(t, tc.expErr, cfg.SetNrHugepages(log, mi)) @@ -1133,6 +1145,12 @@ func TestServerConfig_SetRamdiskSize(t *testing.T) { expRamdiskSize int expErr error }{ + "zero mem reported": { + extraConfig: func(c *Server) *Server { + return c + }, + expErr: errors.New("requires nonzero total mem"), + }, "out of range scm_size; high": { // 16896 hugepages / 512 pages-per-gib = 33 gib huge mem // 33 huge mem + 5 sys rsv + 2 engine rsv = 40 gib reserved mem diff --git a/src/tests/ftest/control/config_generate_output.py b/src/tests/ftest/control/config_generate_output.py index c36e098e350..b574088cf98 100644 --- a/src/tests/ftest/control/config_generate_output.py +++ b/src/tests/ftest/control/config_generate_output.py @@ -1,5 +1,5 @@ ''' - (C) Copyright 2018-2023 Intel Corporation. + (C) Copyright 2018-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent ''' @@ -257,7 +257,7 @@ def test_tmpfs_scm_config(self): 1. Call dmg config generate (using --use-tmpfs-scm flag) 2. Verify class value is set to ram. 3. Verify the scm_list value is not set. - 4. Verify the scm_size value is set. + 4. Verify the scm_size value is not set (as auto calculated on start). 5. Iterate bdev_list address list and verify that it's in the NVMe PCI address set of the correct numa node. 6. Get fabric_iface and verify that it's in the interface set. 
@@ -294,9 +294,9 @@ def test_tmpfs_scm_config(self): # Verify scm_list value is not set: if "scm_list" in storage: errors.append("unexpected scm_list field exists in ram tier") - # Verify scm_size value is set: - if "scm_size" not in storage: - errors.append("Expected scm_size field does not exist in ram tier") + # Verify scm_size value is not set: + if "scm_size" in storage: + errors.append("unexpected scm_size field exists in ram tier") elif storage["class"] == "dcpm": scm_found = True errors.append("Unexpected storage tier class dcpm, want ram") From 96a95b811f88848387e615e4a8667bac59e5daae Mon Sep 17 00:00:00 2001 From: Tom Nabarro Date: Mon, 29 Jul 2024 16:13:15 +0100 Subject: [PATCH 03/11] DAOS-15946 control: Checker enablement misc fixes (#14700) Ignore NotReplica errors when checking if enabled. Signed-off-by: Tom Nabarro --- src/control/server/mgmt_check.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/control/server/mgmt_check.go b/src/control/server/mgmt_check.go index 5312763cef8..78a346f0daf 100644 --- a/src/control/server/mgmt_check.go +++ b/src/control/server/mgmt_check.go @@ -50,7 +50,8 @@ func (svc *mgmtSvc) disableChecker() error { func (svc *mgmtSvc) checkerIsEnabled() bool { value, err := system.GetMgmtProperty(svc.sysdb, checkerEnabledKey) if err != nil { - if !system.IsNotLeader(err) && !system.IsErrSystemAttrNotFound(err) { + if !system.IsNotLeader(err) && !system.IsErrSystemAttrNotFound(err) && + !system.IsNotReplica(err) { svc.log.Errorf("failed to get checker enabled value: %s", err) } return false From 53d96f40f04e233c865a9500deec5cf60dbd5d23 Mon Sep 17 00:00:00 2001 From: Jeff Olivier Date: Mon, 29 Jul 2024 09:53:14 -0600 Subject: [PATCH 04/11] DAOS-623 vos: Add visible iterator option to vos command line (#14653) For VOS command test, add a visible iterator option so one can manually verify that VOS iterates correctly for that case. 
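
In terms of iterator flags, the existing -i/--iterate option sets
VOS_IT_RECX_VISIBLE | VOS_IT_RECX_COVERED, whereas the new -I/--visible_iterate
option sets only VOS_IT_RECX_VISIBLE.
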
Signed-off-by: Jeff Olivier --- src/vos/tests/vos_cmd.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/src/vos/tests/vos_cmd.c b/src/vos/tests/vos_cmd.c index 86ecd58d2df..fa8b9e00d0b 100644 --- a/src/vos/tests/vos_cmd.c +++ b/src/vos/tests/vos_cmd.c @@ -198,6 +198,7 @@ free_pool(struct known_pool *pool) ACTION(AGGREGATE, aggregate, true, 1) \ ACTION(DISCARD, discard, true, 0) \ ACTION(ITERATE, iterate, true, 0) \ + ACTION(VISIBLE_ITERATE, visible_iterate, true, 0) \ ACTION(SIZE_QUERY, print_size, true, 0) \ ACTION(RANDOMIZE, run_many_tests, true, 0) @@ -478,7 +479,7 @@ iter_cb(daos_handle_t ih, vos_iter_entry_t *entry, vos_iter_type_t type, vos_ite } static int -iterate(struct cmd_info *cinfo) +iterate_common(struct cmd_info *cinfo, bool visible) { struct vos_iter_anchors anchors = {0}; vos_iter_param_t param = {0}; @@ -489,7 +490,9 @@ iterate(struct cmd_info *cinfo) set_oid(¶m.ip_oid); param.ip_epr.epr_lo = 0; param.ip_epr.epr_hi = d_hlc_get(); - param.ip_flags = VOS_IT_RECX_VISIBLE | VOS_IT_RECX_COVERED; + param.ip_flags = VOS_IT_RECX_VISIBLE; + if (!visible) + param.ip_flags |= VOS_IT_RECX_COVERED; rc = vos_iterate(¶m, VOS_ITER_DKEY, true, &anchors, iter_cb, NULL, &count, NULL); if (rc != 0) { @@ -501,6 +504,18 @@ iterate(struct cmd_info *cinfo) return rc; } +static int +iterate(struct cmd_info *cinfo) +{ + return iterate_common(cinfo, false); +} + +static int +visible_iterate(struct cmd_info *cinfo) +{ + return iterate_common(cinfo, true); +} + int write_key(struct cmd_info *cinfo) { @@ -945,6 +960,7 @@ run_vos_command(const char *arg0, const char *cmd) {"write", required_argument, 0, 'w'}, {"punch_range", required_argument, 0, 'P'}, {"iterate", required_argument, 0, 'i'}, + {"visible_iterate", required_argument, 0, 'I'}, {"remove", required_argument, 0, 'R'}, {"punch", required_argument, 0, 'p'}, {"aggregate", no_argument, 0, 'a'}, @@ -977,7 +993,7 @@ run_vos_command(const char *arg0, const char *cmd) optind = 1; - while ((c = getopt_long(args.a_nr, args.a_argv, "c:o:dw:p:ahrsP:R:ix:A:D", long_options, + while ((c = getopt_long(args.a_nr, args.a_argv, "c:o:dw:p:ahrsP:R:iIx:A:D", long_options, &option_index)) != -1) { cinfo = &args.a_cmds[args.a_cmd_nr]; switch (c) { @@ -991,6 +1007,10 @@ run_vos_command(const char *arg0, const char *cmd) cinfo->type = ITERATE; args.a_cmd_nr++; break; + case 'I': + cinfo->type = VISIBLE_ITERATE; + args.a_cmd_nr++; + break; case 'c': cinfo->type = CREATE_POOL; strncpy(cinfo->key, optarg, MAX_KEY_LEN); From f50030129a0d990d81bd57f561c31238055d3fc7 Mon Sep 17 00:00:00 2001 From: Kris Jacque Date: Mon, 29 Jul 2024 11:46:14 -0600 Subject: [PATCH 05/11] DAOS-16238 utils: Add valgrind suppressions for Go runtime (#14823) Added some more Go runtime suppressions. I've added counterparts for all functions already on the list, as well as the false positives that have been recently spotted. 
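
Each entry uses the standard valgrind suppression layout; for example, the
newly added store counterpart to the existing atomic64 load suppression is:

{
   __tsan_go_atomic64_store
   Memcheck:Addr8
   fun:__tsan_go_atomic64_store
   fun:racecall
}
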
Signed-off-by: Kris Jacque --- src/cart/utils/memcheck-cart.supp | 45 +++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/src/cart/utils/memcheck-cart.supp b/src/cart/utils/memcheck-cart.supp index 7918d1f5909..a3d5653c46c 100644 --- a/src/cart/utils/memcheck-cart.supp +++ b/src/cart/utils/memcheck-cart.supp @@ -583,13 +583,19 @@ fun:racecall } { - go 1.22.3 race + __tsan_go_atomic64_load Memcheck:Addr8 fun:__tsan_go_atomic64_load fun:racecall } { - go 1.22.3 race + __tsan_go_atomic64_store + Memcheck:Addr8 + fun:__tsan_go_atomic64_store + fun:racecall +} +{ + __tsan_go_atomic64_compare_exchange Memcheck:Addr8 fun:__tsan_go_atomic64_compare_exchange fun:racecall @@ -605,11 +611,17 @@ fun:runtime.persistentalloc } { - go 1.22.3 race + __tsan_write_pc Memcheck:Value8 fun:__tsan_write_pc fun:racecall } +{ + __tsan_read_pc + Memcheck:Value8 + fun:__tsan_read_pc + fun:racecall +} { go 1.22.3 race Memcheck:Value8 @@ -623,31 +635,48 @@ fun:racecall } { - go 1.22.3 race + __tsan_read Memcheck:Value8 fun:__tsan_read fun:racecall } { - go 1.22.3 race + __tsan_write + Memcheck:Value8 + fun:__tsan_write + fun:racecall +} +{ + racecallatomic Memcheck:Addr8 fun:racecallatomic } { - go 1.22.3 race + racecalladdr + Memcheck:Addr8 + fun:racecalladdr +} +{ + Syscall6 param Memcheck:Param read(buf) fun:runtime/internal/syscall.Syscall6 } { - go 1.22.3 race + __tsan_go_atomic32_load Memcheck:Addr4 fun:__tsan_go_atomic32_load fun:racecall } { - go 1.22.3 race + __tsan_go_atomic32_store Memcheck:Addr4 fun:__tsan_go_atomic32_store fun:racecall } +{ + __tsan_go_atomic32_compare_exchange + Memcheck:Addr4 + fun:__tsan_go_atomic32_compare_exchange + fun:racecall +} From 0d9391812b28e2c4a66e422d404ed6bb5fef571a Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Tue, 30 Jul 2024 02:48:55 +0900 Subject: [PATCH 06/11] =?UTF-8?q?DAOS-16082=20test:=20Repeatedly=20query?= =?UTF-8?q?=20device=20after=20SysXS=20device=20is=20set=20to=E2=80=A6=20(?= =?UTF-8?q?#14736)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After SysXS device is set to faulty, the device state transition may take a while until its state becomes EVICTED. Wait for 10 sec before querying the device state. If the state isn't EVICTED, query again. Also set only one SysXS device faulty because there's no point of setting second SysXS device to faulty. (based on the developer feedback). 
Check whether any of the engines is down at the end of the test Signed-off-by: Makito Kano --- src/tests/ftest/control/dmg_storage_query.py | 44 ++++++++++++++------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/src/tests/ftest/control/dmg_storage_query.py b/src/tests/ftest/control/dmg_storage_query.py index fcc383fb59d..924709163b8 100644 --- a/src/tests/ftest/control/dmg_storage_query.py +++ b/src/tests/ftest/control/dmg_storage_query.py @@ -3,15 +3,15 @@ SPDX-License-Identifier: BSD-2-Clause-Patent """ - import re import traceback import avocado from control_test_base import ControlTestBase -from dmg_utils import get_storage_query_device_info, get_storage_query_pool_info +from dmg_utils import (check_system_query_status, get_storage_query_device_info, + get_storage_query_pool_info) from exception_utils import CommandFailure -from general_utils import dict_to_str, list_to_str +from general_utils import dict_to_str, list_to_str, wait_for_result class DmgStorageQuery(ControlTestBase): @@ -67,6 +67,15 @@ def check_dev_state(self, device_info, state): if errors: self.fail("Found {} device(s) not in the {} state".format(errors, state)) + def check_engine_down(self): + """Check if engine is down. + + Returns: + bool: True if any of the engines is down. False otherwise. + + """ + return not check_system_query_status(self.dmg.system_query()) + @avocado.fail_on(CommandFailure) def test_dmg_storage_query_devices(self): """ @@ -218,10 +227,9 @@ def test_dmg_storage_query_device_state(self): """ JIRA ID: DAOS-3925 - Test Description: Test 'dmg storage query list-devices' command. - - In addition this test also does a basic test of nvme-faulty cmd: - 'dmg storage set nvme-faulty' + 1. Call "dmg storage query list-devices" and check that the state is NORMAL. + 2. Set SysXS device to faulty with "dmg set nvme-faulty". + 3. Check that devices are in EVICTED state. :avocado: tags=all,daily_regression :avocado: tags=hw,medium @@ -230,11 +238,12 @@ def test_dmg_storage_query_device_state(self): """ expect_failed_engine = False - # Get device info and check state is NORMAL + msg = 'Call "dmg storage query list-devices" and check that the state is NORMAL.' + self.log_step(msg) device_info = get_storage_query_device_info(self.dmg) - self.check_dev_state(device_info, "NORMAL") + self.check_dev_state(device_info=device_info, state="NORMAL") - # Set device to faulty state and check that it's in FAULTY state + self.log_step('Set SysXS device to faulty with "dmg set nvme-faulty".') for device in device_info: if str(device['has_sys_xs']).lower() == 'true': # Setting a SysXS device faulty will kill the engine @@ -247,11 +256,14 @@ def test_dmg_storage_query_device_state(self): except CommandFailure: if not expect_failed_engine: self.fail("Error setting the faulty state for {}".format(device['uuid'])) + # Set only one SysXS device faulty. 
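+                # (A second faulty SysXS device would add no test coverage;
+                # it would only take down another engine.)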
+ if expect_failed_engine: + break - # Check that devices are in FAULTY state + self.log_step("Check that devices are in EVICTED state.") try: device_info = get_storage_query_device_info(self.dmg) - self.check_dev_state(device_info, "EVICTED") + self.check_dev_state(device_info=device_info, state="EVICTED") except CommandFailure as error: if not expect_failed_engine: raise @@ -262,4 +274,12 @@ def test_dmg_storage_query_device_state(self): self.log.debug(error) self.fail("dmg storage query list-devices failed for an unexpected reason") + if expect_failed_engine: + timeout = 30 + engine_down_detected = wait_for_result( + log=self.log, get_method=self.check_engine_down, timeout=timeout, delay=10, + add_log=False) + if not engine_down_detected: + self.fail(f"Engine down NOT detected after {timeout} sec!") + self.log.info("Test passed") From 18a7878da70a39a537461c45894aa21f25c04697 Mon Sep 17 00:00:00 2001 From: Liu Xuezhao Date: Tue, 30 Jul 2024 12:31:26 +0800 Subject: [PATCH 07/11] DAOS-16035 rebuild: create VOS cont when no record need to be rebuilt (#14819) For EC object rebuild, some ext not exist on some shards, for this case create VOS container when no record need to be rebuilt. To avoid following IO cannot find container and fail at obj_ioc_init() -> cont_child_lookup(). Another case is in cont_snap_update_one() create the vos cont if non-exist. Signed-off-by: Xuezhao Liu --- src/container/srv_target.c | 7 ++++++- src/object/srv_obj_migrate.c | 11 ++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/container/srv_target.c b/src/container/srv_target.c index 0b17f6619b8..7ebcabbca16 100644 --- a/src/container/srv_target.c +++ b/src/container/srv_target.c @@ -1989,9 +1989,14 @@ cont_snap_update_one(void *vin) struct ds_cont_child *cont; int rc; - rc = ds_cont_child_lookup(args->pool_uuid, args->cont_uuid, &cont); + /* The container should be exist on the system at this point, if non-exist on this target + * it should be the case of reintegrate the container was destroyed ahead, so just + * open_create the container here. + */ + rc = ds_cont_child_open_create(args->pool_uuid, args->cont_uuid, &cont); if (rc != 0) return rc; + if (args->snap_count == 0) { if (cont->sc_snapshots != NULL) { D_ASSERT(cont->sc_snapshots_nr > 0); diff --git a/src/object/srv_obj_migrate.c b/src/object/srv_obj_migrate.c index ea6684603c7..649f236e829 100644 --- a/src/object/srv_obj_migrate.c +++ b/src/object/srv_obj_migrate.c @@ -2718,9 +2718,18 @@ migrate_enum_unpack_cb(struct dc_obj_enum_unpack_io *io, void *data) } if (!create_migrate_one) { + struct ds_cont_child *cont = NULL; + D_DEBUG(DB_REBUILD, DF_UOID"/"DF_KEY" does not need rebuild.\n", DP_UOID(io->ui_oid), DP_KEY(&io->ui_dkey)); - D_GOTO(put, rc = 0); + + /* Create the vos container when no record need to be rebuilt for this shard, + * for the case of reintegrate the container was discarded ahead. + */ + rc = migrate_get_cont_child(tls, arg->arg->cont_uuid, &cont, true); + if (cont != NULL) + ds_cont_child_put(cont); + D_GOTO(put, rc); } /* Check if some IODs from this unpack can be merged to the exist mrone, mostly for EC From ed96c9d579579176e42b3d477a35f9cb61032242 Mon Sep 17 00:00:00 2001 From: Phil Henderson Date: Tue, 30 Jul 2024 12:15:36 -0400 Subject: [PATCH 08/11] DAOS-16217 test: Update run_local(). (#14748) Update the current run_local() command to return an object similar to run_remote() to allow them to be used interchangeably. increase verify_perms.py timeout. 
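
For illustration, local and remote command results can now be handled the
same way (sketch mirroring the updated harness/core_files.py caller):

    result = run_local(self.log, "cat /proc/sys/kernel/core_pattern")
    if not result.passed:
        self.fail("Unable to find local core file pattern")
    core_path = result.joined_stdout.splitlines()[-1]
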
Signed-off-by: Phil Henderson --- src/tests/ftest/dfuse/pil4dfs_fio.py | 2 +- src/tests/ftest/harness/core_files.py | 11 +- src/tests/ftest/harness/unit.py | 261 +++++++++++++--- src/tests/ftest/process_core_files.py | 62 ++-- .../ftest/server/multiengine_persocket.py | 5 +- src/tests/ftest/slurm_setup.py | 4 +- src/tests/ftest/util/agent_utils.py | 6 +- src/tests/ftest/util/collection_utils.py | 32 +- src/tests/ftest/util/dfuse_utils.py | 14 +- src/tests/ftest/util/fio_utils.py | 4 +- src/tests/ftest/util/general_utils.py | 11 +- src/tests/ftest/util/io_utilities.py | 2 +- src/tests/ftest/util/launch_utils.py | 100 +++---- src/tests/ftest/util/package_utils.py | 6 +- src/tests/ftest/util/run_utils.py | 279 ++++++++---------- src/tests/ftest/util/server_utils.py | 16 +- src/tests/ftest/util/slurm_utils.py | 35 +-- src/tests/ftest/util/soak_test_base.py | 30 +- src/tests/ftest/util/user_utils.py | 14 +- src/tests/ftest/verify_perms.py | 9 +- 20 files changed, 488 insertions(+), 415 deletions(-) diff --git a/src/tests/ftest/dfuse/pil4dfs_fio.py b/src/tests/ftest/dfuse/pil4dfs_fio.py index a5405fa3388..2aa3cd1b952 100644 --- a/src/tests/ftest/dfuse/pil4dfs_fio.py +++ b/src/tests/ftest/dfuse/pil4dfs_fio.py @@ -74,7 +74,7 @@ def _get_bandwidth(self, fio_result, rw): """Returns FIO bandwidth of a given I/O pattern Args: - fio_result (RemoteCommandResult): results of a FIO command. + fio_result (CommandResult): results of a FIO command. rw (str): Type of I/O pattern. Returns: diff --git a/src/tests/ftest/harness/core_files.py b/src/tests/ftest/harness/core_files.py index fed038a6a82..a017b8bba7a 100644 --- a/src/tests/ftest/harness/core_files.py +++ b/src/tests/ftest/harness/core_files.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2021-2023 Intel Corporation. + (C) Copyright 2021-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -9,7 +9,7 @@ from apricot import TestWithServers from ClusterShell.NodeSet import NodeSet -from run_utils import RunException, run_local, run_remote +from run_utils import run_local, run_remote class HarnessCoreFilesTest(TestWithServers): @@ -40,11 +40,10 @@ def test_core_files(self): """ # create a core.gdb file self.log.debug("Create a core.gdb.harness.advanced file in core_pattern dir.") - try: - results = run_local(self.log, "cat /proc/sys/kernel/core_pattern", check=True) - except RunException: + result = run_local(self.log, "cat /proc/sys/kernel/core_pattern") + if not result.passed: self.fail("Unable to find local core file pattern") - core_path = os.path.split(results.stdout.splitlines()[-1])[0] + core_path = os.path.split(result.joined_stdout.splitlines()[-1])[0] core_file = "{}/core.gdb.harness.advanced".format(core_path) self.log.debug("Creating %s", core_file) diff --git a/src/tests/ftest/harness/unit.py b/src/tests/ftest/harness/unit.py index 1f4524674f6..bd6b108e64b 100644 --- a/src/tests/ftest/harness/unit.py +++ b/src/tests/ftest/harness/unit.py @@ -1,12 +1,13 @@ """ - (C) Copyright 2023 Intel Corporation. + (C) Copyright 2023-2024 Intel Corporation. 
SPDX-License-Identifier: BSD-2-Clause-Patent """ from apricot import TestWithoutServers from ClusterShell.NodeSet import NodeSet from data_utils import dict_extract_values, dict_subtract, list_flatten, list_stats, list_unique -from run_utils import ResultData, run_remote +from host_utils import get_local_host +from run_utils import ResultData, run_local, run_remote class HarnessUnitTest(TestWithoutServers): @@ -15,12 +16,13 @@ class HarnessUnitTest(TestWithoutServers): :avocado: recursive """ - def _verify_remote_command_result(self, result, passed, expected, timeout, homogeneous, - passed_hosts, failed_hosts, all_stdout, all_stderr): - """Verify a RemoteCommandResult object. + def _verify_command_result(self, result, passed, expected, timeout, homogeneous, passed_hosts, + failed_hosts, all_stdout, all_stderr, join_stdout, join_stderr): + # pylint: disable=too-many-arguments + """Verify a CommandResult object. Args: - result (RemoteCommandResult): object to verify + result (CommandResult): object to verify passed (bool): expected passed command state expected (list): expected list of ResultData objects timeout (bool): expected command timeout state @@ -29,26 +31,25 @@ def _verify_remote_command_result(self, result, passed, expected, timeout, homog failed_hosts (NodeSet): expected set of hosts on which the command failed all_stdout (dict): expected stdout str per host key all_stderr (dict): expected stderr str per host key + join_stdout (str): expected all stdout joined into one string + join_stderr (str): expected all stderr joined into one string """ - self.assertEqual(passed, result.passed, 'Incorrect RemoteCommandResult.passed') - self.assertEqual( - len(expected), len(result.output), 'Incorrect RemoteCommandResult.output count') + self.assertEqual(passed, result.passed, 'Incorrect CommandResult.passed') + self.assertEqual(len(expected), len(result.output), 'Incorrect CommandResult.output count') sorted_output = sorted(result.output) for index, expect in enumerate(sorted(expected)): actual = sorted_output[index] for key in ('command', 'returncode', 'hosts', 'stdout', 'stderr', 'timeout'): self.assertEqual( - getattr(expect, key), getattr(actual, key), - 'Incorrect ResultData.{}'.format(key)) - self.assertEqual(timeout, result.timeout, 'Incorrect RemoteCommandResult.timeout') - self.assertEqual( - homogeneous, result.homogeneous, 'Incorrect RemoteCommandResult.homogeneous') - self.assertEqual( - passed_hosts, result.passed_hosts, 'Incorrect RemoteCommandResult.passed_hosts') - self.assertEqual( - failed_hosts, result.failed_hosts, 'Incorrect RemoteCommandResult.failed_hosts') - self.assertEqual(all_stdout, result.all_stdout, 'Incorrect RemoteCommandResult.all_stdout') - self.assertEqual(all_stderr, result.all_stderr, 'Incorrect RemoteCommandResult.all_stderr') + getattr(expect, key), getattr(actual, key), f'Incorrect ResultData.{key}') + self.assertEqual(timeout, result.timeout, 'Incorrect CommandResult.timeout') + self.assertEqual(homogeneous, result.homogeneous, 'Incorrect CommandResult.homogeneous') + self.assertEqual(passed_hosts, result.passed_hosts, 'Incorrect CommandResult.passed_hosts') + self.assertEqual(failed_hosts, result.failed_hosts, 'Incorrect CommandResult.failed_hosts') + self.assertEqual(all_stdout, result.all_stdout, 'Incorrect CommandResult.all_stdout') + self.assertEqual(all_stderr, result.all_stderr, 'Incorrect CommandResult.all_stderr') + self.assertEqual(join_stdout, result.joined_stdout, 'Incorrect CommandResult.joined_stdout') + self.assertEqual(join_stderr, 
            result.joined_stderr, 'Incorrect CommandResult.joined_stderr')
 
     def test_harness_unit_list_unique(self):
         """Verify list_unique().
@@ -233,6 +234,136 @@ def test_harness_unit_dict_subtract(self):
             })
         self.log_step('Unit Test Passed')
 
+    def test_harness_unit_run_local(self):
+        """Verify run_local().
+
+        :avocado: tags=all
+        :avocado: tags=vm
+        :avocado: tags=harness,run_utils
+        :avocado: tags=HarnessUnitTest,test_harness_unit_run_local
+        """
+        host = get_local_host()
+        command = 'uname -o'
+        self.log_step('Verify run_local()')
+        self._verify_command_result(
+            result=run_local(self.log, command),
+            passed=True,
+            expected=[ResultData(command, 0, host, ['GNU/Linux'], [], False)],
+            timeout=False,
+            homogeneous=True,
+            passed_hosts=host,
+            failed_hosts=NodeSet(),
+            all_stdout={str(host): 'GNU/Linux'},
+            all_stderr={str(host): ''},
+            join_stdout='GNU/Linux',
+            join_stderr='',
+        )
+        self.log_step('Unit Test Passed')
+
+    def test_harness_unit_run_local_separated(self):
+        """Verify run_local() with separate stdout and stderr.
+
+        :avocado: tags=all
+        :avocado: tags=vm
+        :avocado: tags=harness,run_utils
+        :avocado: tags=HarnessUnitTest,test_harness_unit_run_local_separated
+        """
+        host = get_local_host()
+        command = 'echo stdout; echo stderr 1>&2'
+        self.log_step('Verify run_local() w/ separated stdout and stderr')
+        self._verify_command_result(
+            result=run_local(self.log, command, stderr=True),
+            passed=True,
+            expected=[ResultData(command, 0, host, ['stdout'], ['stderr'], False)],
+            timeout=False,
+            homogeneous=True,
+            passed_hosts=host,
+            failed_hosts=NodeSet(),
+            all_stdout={str(host): 'stdout'},
+            all_stderr={str(host): 'stderr'},
+            join_stdout='stdout',
+            join_stderr='stderr',
+        )
+        self.log_step('Unit Test Passed')
+
+    def test_harness_unit_run_local_no_stdout(self):
+        """Verify run_local() with no stdout.
+
+        :avocado: tags=all
+        :avocado: tags=vm
+        :avocado: tags=harness,run_utils
+        :avocado: tags=HarnessUnitTest,test_harness_unit_run_local_no_stdout
+        """
+        host = get_local_host()
+        command = 'echo stderr 1>&2'
+        self.log_step('Verify run_local() w/ no stdout')
+        self._verify_command_result(
+            result=run_local(self.log, command, stderr=True),
+            passed=True,
+            expected=[ResultData(command, 0, host, [], ['stderr'], False)],
+            timeout=False,
+            homogeneous=True,
+            passed_hosts=host,
+            failed_hosts=NodeSet(),
+            all_stdout={str(host): ''},
+            all_stderr={str(host): 'stderr'},
+            join_stdout='',
+            join_stderr='stderr',
+        )
+        self.log_step('Unit Test Passed')
+
+    def test_harness_unit_run_local_failure(self):
+        """Verify run_local() with a failure.
+
+        :avocado: tags=all
+        :avocado: tags=vm
+        :avocado: tags=harness,run_utils
+        :avocado: tags=HarnessUnitTest,test_harness_unit_run_local_failure
+        """
+        host = get_local_host()
+        command = 'echo fail; exit 1'
+        self.log_step('Verify run_local() w/ a failure')
+        self._verify_command_result(
+            result=run_local(self.log, command),
+            passed=False,
+            expected=[ResultData(command, 1, host, ['fail'], [], False)],
+            timeout=False,
+            homogeneous=True,
+            passed_hosts=NodeSet(),
+            failed_hosts=host,
+            all_stdout={str(host): 'fail'},
+            all_stderr={str(host): ''},
+            join_stdout='fail',
+            join_stderr='',
+        )
+        self.log_step('Unit Test Passed')
+
+    def test_harness_unit_run_local_timeout(self):
+        """Verify run_local() with a timeout.
+ + :avocado: tags=all + :avocado: tags=vm + :avocado: tags=harness,run_utils + :avocado: tags=HarnessUnitTest,test_harness_unit_run_local_timeout + """ + host = get_local_host() + command = 'echo wait; sleep 5' + self.log_step('Verify run_local() w/ a timeout') + self._verify_command_result( + result=run_local(self.log, command, True, 2), + passed=False, + expected=[ResultData(command, 124, host, ['wait'], [], True)], + timeout=True, + homogeneous=True, + passed_hosts=NodeSet(), + failed_hosts=host, + all_stdout={str(host): 'wait'}, + all_stderr={str(host): ''}, + join_stdout='wait', + join_stderr='', + ) + self.log_step('Unit Test Passed') + def test_harness_unit_run_remote_single(self): """Verify run_remote() with a single host. @@ -244,7 +375,7 @@ def test_harness_unit_run_remote_single(self): hosts = self.get_hosts_from_yaml('test_clients', 'partition', 'reservation', '/run/hosts/*') command = 'uname -o' self.log_step('Verify run_remote() w/ single host') - self._verify_remote_command_result( + self._verify_command_result( result=run_remote(self.log, NodeSet(hosts[0]), command), passed=True, expected=[ResultData(command, 0, NodeSet(hosts[0]), ['GNU/Linux'], [], False)], @@ -253,7 +384,9 @@ def test_harness_unit_run_remote_single(self): passed_hosts=NodeSet(hosts[0]), failed_hosts=NodeSet(), all_stdout={hosts[0]: 'GNU/Linux'}, - all_stderr={hosts[0]: ''} + all_stderr={hosts[0]: ''}, + join_stdout='GNU/Linux', + join_stderr='', ) self.log_step('Unit Test Passed') @@ -268,7 +401,7 @@ def test_harness_unit_run_remote_homogeneous(self): hosts = self.get_hosts_from_yaml('test_clients', 'partition', 'reservation', '/run/hosts/*') command = 'uname -o' self.log_step('Verify run_remote() w/ homogeneous output') - self._verify_remote_command_result( + self._verify_command_result( result=run_remote(self.log, hosts, command), passed=True, expected=[ResultData(command, 0, hosts, ['GNU/Linux'], [], False)], @@ -277,7 +410,9 @@ def test_harness_unit_run_remote_homogeneous(self): passed_hosts=hosts, failed_hosts=NodeSet(), all_stdout={str(hosts): 'GNU/Linux'}, - all_stderr={str(hosts): ''} + all_stderr={str(hosts): ''}, + join_stdout='GNU/Linux', + join_stderr='', ) self.log_step('Unit Test Passed') @@ -292,7 +427,7 @@ def test_harness_unit_run_remote_heterogeneous(self): hosts = self.get_hosts_from_yaml('test_clients', 'partition', 'reservation', '/run/hosts/*') command = 'hostname -s' self.log_step('Verify run_remote() w/ heterogeneous output') - self._verify_remote_command_result( + self._verify_command_result( result=run_remote(self.log, hosts, command), passed=True, expected=[ @@ -311,6 +446,8 @@ def test_harness_unit_run_remote_heterogeneous(self): hosts[0]: '', hosts[1]: '' }, + join_stdout='\n'.join(hosts), + join_stderr='', ) self.log_step('Unit Test Passed') @@ -323,10 +460,9 @@ def test_harness_unit_run_remote_combined(self): :avocado: tags=HarnessUnitTest,test_harness_unit_run_remote_combined """ hosts = self.get_hosts_from_yaml('test_clients', 'partition', 'reservation', '/run/hosts/*') - command = 'echo stdout; if [ $(hostname -s) == \'{}\' ]; then echo stderr 1>&2; fi'.format( - hosts[1]) + command = f'echo stdout; if [ $(hostname -s) == \'{hosts[1]}\' ]; then echo stderr 1>&2; fi' self.log_step('Verify run_remote() w/ separated stdout and stderr') - self._verify_remote_command_result( + self._verify_command_result( result=run_remote(self.log, hosts, command, stderr=False), passed=True, expected=[ @@ -344,7 +480,9 @@ def test_harness_unit_run_remote_combined(self): all_stderr={ 
hosts[0]: '', hosts[1]: '' - } + }, + join_stdout='stdout\nstdout\nstderr', + join_stderr='', ) self.log_step('Unit Test Passed') @@ -357,10 +495,9 @@ def test_harness_unit_run_remote_separated(self): :avocado: tags=HarnessUnitTest,test_harness_unit_run_remote_separated """ hosts = self.get_hosts_from_yaml('test_clients', 'partition', 'reservation', '/run/hosts/*') - command = 'echo stdout; if [ $(hostname -s) == \'{}\' ]; then echo stderr 1>&2; fi'.format( - hosts[1]) + command = f'echo stdout; if [ $(hostname -s) == \'{hosts[1]}\' ]; then echo stderr 1>&2; fi' self.log_step('Verify run_remote() w/ separated stdout and stderr') - self._verify_remote_command_result( + self._verify_command_result( result=run_remote(self.log, hosts, command, stderr=True), passed=True, expected=[ @@ -378,12 +515,14 @@ def test_harness_unit_run_remote_separated(self): all_stderr={ hosts[0]: '', hosts[1]: 'stderr' - } + }, + join_stdout='stdout\nstdout', + join_stderr='stderr', ) self.log_step('Unit Test Passed') def test_harness_unit_run_remote_no_stdout(self): - """Verify run_remote() with separated stdout and stderr. + """Verify run_remote() with no stdout. :avocado: tags=all :avocado: tags=vm @@ -391,9 +530,9 @@ def test_harness_unit_run_remote_no_stdout(self): :avocado: tags=HarnessUnitTest,test_harness_unit_run_remote_no_stdout """ hosts = self.get_hosts_from_yaml('test_clients', 'partition', 'reservation', '/run/hosts/*') - command = 'if [ $(hostname -s) == \'{}\' ]; then echo stderr 1>&2; fi'.format(hosts[1]) + command = f'if [ $(hostname -s) == \'{hosts[1]}\' ]; then echo stderr 1>&2; fi' self.log_step('Verify run_remote() w/ no stdout') - self._verify_remote_command_result( + self._verify_command_result( result=run_remote(self.log, hosts, command, stderr=True), passed=True, expected=[ @@ -411,12 +550,14 @@ def test_harness_unit_run_remote_no_stdout(self): all_stderr={ hosts[0]: '', hosts[1]: 'stderr' - } + }, + join_stdout='', + join_stderr='stderr', ) self.log_step('Unit Test Passed') def test_harness_unit_run_remote_failure(self): - """Verify run_remote() with separated stdout and stderr. + """Verify run_remote() with a failure. :avocado: tags=all :avocado: tags=vm @@ -424,10 +565,9 @@ def test_harness_unit_run_remote_failure(self): :avocado: tags=HarnessUnitTest,test_harness_unit_run_remote_failure """ hosts = self.get_hosts_from_yaml('test_clients', 'partition', 'reservation', '/run/hosts/*') - command = 'if [ $(hostname -s) == \'{}\' ]; then echo fail; exit 1; fi; echo pass'.format( - hosts[1]) + command = f'if [ $(hostname -s) == \'{hosts[1]}\' ]; then echo fail; exit 1; fi; echo pass' self.log_step('Verify run_remote() w/ a failure') - self._verify_remote_command_result( + self._verify_command_result( result=run_remote(self.log, hosts, command, stderr=True), passed=False, expected=[ @@ -445,6 +585,43 @@ def test_harness_unit_run_remote_failure(self): all_stderr={ hosts[0]: '', hosts[1]: '' - } + }, + join_stdout='pass\nfail', + join_stderr='', + ) + self.log_step('Unit Test Passed') + + def test_harness_unit_run_remote_timeout(self): + """Verify run_remote() with a timeout. 
+ + :avocado: tags=all + :avocado: tags=vm + :avocado: tags=harness,run_utils + :avocado: tags=HarnessUnitTest,test_harness_unit_run_remote_timeout + """ + hosts = self.get_hosts_from_yaml('test_clients', 'partition', 'reservation', '/run/hosts/*') + command = f'if [ $(hostname -s) == \'{hosts[1]}\' ]; then echo wait; sleep 5; fi; echo pass' + self.log_step('Verify run_remote() w/ a timeout') + self._verify_command_result( + result=run_remote(self.log, hosts, command, stderr=True, timeout=2), + passed=False, + expected=[ + ResultData(command, 0, NodeSet(hosts[0]), ['pass'], [], False), + ResultData(command, 124, NodeSet(hosts[1]), ['wait'], [], True), + ], + timeout=True, + homogeneous=False, + passed_hosts=NodeSet(hosts[0]), + failed_hosts=NodeSet(hosts[1]), + all_stdout={ + hosts[0]: 'pass', + hosts[1]: 'wait' + }, + all_stderr={ + hosts[0]: '', + hosts[1]: '' + }, + join_stdout='pass\nwait', + join_stderr='', ) self.log_step('Unit Test Passed') diff --git a/src/tests/ftest/process_core_files.py b/src/tests/ftest/process_core_files.py index 9349cac6172..47fbf7a4ef4 100644 --- a/src/tests/ftest/process_core_files.py +++ b/src/tests/ftest/process_core_files.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2022-2023 Intel Corporation. + (C) Copyright 2022-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -141,7 +141,8 @@ def process_core_files(self, directory, delete, test=None): if os.path.splitext(core_name)[-1] == ".bz2": # Decompress the file command = f"lbzip2 -d -v '{os.path.join(core_dir, core_name)}'" - run_local(self.log, command) + if not run_local(self.log, command).passed: + raise CoreFileException(f"Error decompressing {core_name}") core_name = os.path.splitext(core_name)[0] exe_name = self._get_exe_name(os.path.join(core_dir, core_name)) self._create_stacktrace(core_dir, core_name, exe_name) @@ -187,22 +188,23 @@ def _create_stacktrace(self, core_dir, core_name, exe_name): stack_trace_file = os.path.join(core_dir, f"'{core_name}.stacktrace'") self.log.debug("Generating a stacktrace from the %s core file from %s", core_full, host) - run_local(self.log, f"ls -l '{core_full}'") + if not run_local(self.log, f"ls -l '{core_full}'").passed: + raise RunException(f"Error listing {core_full}") command = ( f"gdb -cd='{core_dir}' -ex 'set pagination off' -ex 'thread apply all bt full' -ex " f"detach -ex quit '{exe_name}' '{core_name}'") + result = run_local(self.log, command, verbose=False) + if not result.passed: + raise RunException(f"Error creating {stack_trace_file}") + try: - output = run_local(self.log, command, check=False, verbose=False) with open(stack_trace_file, "w", encoding="utf-8") as stack_trace: - stack_trace.writelines(output.stdout) + stack_trace.write(result.joined_stdout) except IOError as error: raise RunException(f"Error writing {stack_trace_file}") from error - except RunException as error: - raise RunException(f"Error creating {stack_trace_file}") from error - def _get_exe_name(self, core_file): """Get the executable name from the core file. 
@@ -219,7 +221,7 @@ def _get_exe_name(self, core_file): self.log.debug("Extracting the executable name from '%s'", core_file) command = f"gdb -c '{core_file}' -ex 'info proc exe' -ex quit" result = run_local(self.log, command, verbose=False) - last_line = result.stdout.splitlines()[-1] + last_line = result.joined_stdout.splitlines()[-1] self.log.debug(" last line: %s", last_line) cmd = last_line[7:] self.log.debug(" last_line[7:-1]: %s", cmd) @@ -277,7 +279,7 @@ def install_debuginfo_packages(self): cmds.append(["sudo", "rm", "-f", path]) if self.USE_DEBUGINFO_INSTALL: - dnf_args = ["--exclude", "ompi-debuginfo"] + dnf_args = ["--nobest", "--exclude", "ompi-debuginfo"] if os.getenv("TEST_RPMS", 'false') == 'true': if "suse" in self.distro_info.name.lower(): dnf_args.extend(["libpmemobj1", "python3", "openmpi3"]) @@ -291,9 +293,8 @@ def install_debuginfo_packages(self): else: raise RunException(f"Unsupported distro: {self.distro_info}") cmds.append(["sudo", "dnf", "-y", "install"] + dnf_args) - output = run_local( - self.log, " ".join(["rpm", "-q", "--qf", "'%{evr}'", "daos"]), check=False) - rpm_version = output.stdout + result = run_local(self.log, " ".join(["rpm", "-q", "--qf", "'%{evr}'", "daos"])) + rpm_version = result.joined_stdout cmds.append( ["sudo", "dnf", "debuginfo-install", "-y"] + dnf_args + ["daos-" + rpm_version, "daos-*-" + rpm_version]) @@ -324,9 +325,7 @@ def install_debuginfo_packages(self): retry = False for cmd in cmds: - try: - run_local(self.log, " ".join(cmd), check=True) - except RunException: + if not run_local(self.log, " ".join(cmd)).passed: # got an error, so abort this list of commands and re-run # it with a dnf clean, makecache first retry = True @@ -339,9 +338,7 @@ def install_debuginfo_packages(self): cmds.insert(0, cmd_prefix + ["clean", "all"]) cmds.insert(1, cmd_prefix + ["makecache"]) for cmd in cmds: - try: - run_local(self.log, " ".join(cmd)) - except RunException: + if not run_local(self.log, " ".join(cmd)).passed: break def is_el(self): @@ -380,14 +377,11 @@ def resolve_debuginfo(self, pkg): """ package_info = None - try: - # Eventually use python libraries for this rather than exec()ing out to rpm - output = run_local( - self.log, - " ".join( - ["rpm", "-q", "--qf", "'%{name} %{version} %{release} %{epoch}'", pkg]), - check=False) - name, version, release, epoch = output.stdout.split() + # Eventually use python libraries for this rather than exec()ing out to rpm + command = f"rpm -q --qf '%{{name}} %{{version}} %{{release}} %{{epoch}}' {pkg}" + result = run_local(self.log, command) + if result.passed: + name, version, release, epoch = result.joined_stdout.split() debuginfo_map = {"glibc": "glibc-debuginfo-common"} try: @@ -400,7 +394,7 @@ def resolve_debuginfo(self, pkg): "release": release, "epoch": epoch } - except ValueError: + else: self.log.debug("Package %s not installed, skipping debuginfo", pkg) return package_info @@ -413,20 +407,16 @@ def delete_gdb_core_files(self): """ self.log.debug("Checking core files generated by core file processing") - try: - results = run_local(self.log, "cat /proc/sys/kernel/core_pattern", check=True) - except RunException: + result = run_local(self.log, "cat /proc/sys/kernel/core_pattern") + if not result.passed: self.log.error("Unable to find local core file pattern") self.log.debug("Stacktrace", exc_info=True) return 1 - core_path = os.path.split(results.stdout.splitlines()[-1])[0] + core_path = os.path.split(result.joined_stdout.splitlines()[-1])[0] self.log.debug("Deleting core.gdb.*.* core files located 
in %s", core_path) other = ["-printf '%M %n %-12u %-12g %12k %t %p\n' -delete"] - try: - run_local( - self.log, find_command(core_path, "core.gdb.*.*", 1, other), check=True) - except RunException: + if not run_local(self.log, find_command(core_path, "core.gdb.*.*", 1, other)).passed: self.log.debug("core.gdb.*.* files could not be removed") return 1 return 0 diff --git a/src/tests/ftest/server/multiengine_persocket.py b/src/tests/ftest/server/multiengine_persocket.py index ff91186d618..8c92fdfbdad 100644 --- a/src/tests/ftest/server/multiengine_persocket.py +++ b/src/tests/ftest/server/multiengine_persocket.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2020-2023 Intel Corporation. + (C) Copyright 2020-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -181,7 +181,8 @@ def check_pmem(self, hosts, count): def storage_format(self): """Perform storage format.""" - run_local(self.log, "dmg storage format") + if not run_local(self.log, "dmg storage format").passed: + self.fail("dmg storage format failed") def cleanup(self): """Servers clean up after test complete.""" diff --git a/src/tests/ftest/slurm_setup.py b/src/tests/ftest/slurm_setup.py index d13ab537eb7..0c3d300d5ff 100755 --- a/src/tests/ftest/slurm_setup.py +++ b/src/tests/ftest/slurm_setup.py @@ -207,7 +207,7 @@ def _create_epilog_script(self, script): """ self.log.debug('Creating the slurm epilog script to run after each job.') try: - with open(script, 'w') as script_file: + with open(script, 'w', encoding='utf-8') as script_file: script_file.write('#!/bin/bash\n#\n') script_file.write('/usr/bin/bash -c \'pkill --signal 9 dfuse\'\n') script_file.write( @@ -364,7 +364,7 @@ def _append_config_file(self, echo_command): echo_command (str): command adding contents to the config file Returns: - RemoteCommandResult: the result from the echo | tee command + CommandResult: the result from the echo | tee command """ tee_command = command_as_user(f'tee -a {self.SLURM_CONF}', self.root) return run_remote(self.log, self.all_nodes, f'{echo_command} | {tee_command}') diff --git a/src/tests/ftest/util/agent_utils.py b/src/tests/ftest/util/agent_utils.py index 76c293f0e4f..b68b141676d 100644 --- a/src/tests/ftest/util/agent_utils.py +++ b/src/tests/ftest/util/agent_utils.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2019-2023 Intel Corporation. + (C) Copyright 2019-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -294,9 +294,7 @@ def support_collect_log(self, **kwargs): CommandFailure: if the daos_agent command fails. 
Returns: - RemoteCommandResult: a grouping of the command results from - the same hosts with the same return status - + CommandResult: groups of command results from the same hosts with the same return status """ cmd = DaosAgentCommand(self.manager.job.command_path) cmd.sudo = True diff --git a/src/tests/ftest/util/collection_utils.py b/src/tests/ftest/util/collection_utils.py index 15963e03446..3b054eb19e9 100644 --- a/src/tests/ftest/util/collection_utils.py +++ b/src/tests/ftest/util/collection_utils.py @@ -16,7 +16,7 @@ # pylint: disable=import-error,no-name-in-module from util.environment_utils import TestEnvironment from util.host_utils import get_local_host -from util.run_utils import RunException, find_command, run_local, run_remote, stop_processes +from util.run_utils import find_command, run_local, run_remote, stop_processes from util.user_utils import get_chown_command from util.yaml_utils import get_test_category @@ -562,20 +562,17 @@ def move_files(logger, hosts, source, pattern, destination, depth, timeout, test # Clush -rcopy the temporary remote directory to this host command = ["clush", "-w", str(hosts), "-pv", "--rcopy", f"'{tmp_copy_dir}'", "--dest", f"'{rcopy_dest}'"] - try: - run_local(logger, " ".join(command), check=True, timeout=timeout) - except RunException: + if not run_local(logger, " ".join(command), timeout=timeout).passed: message = f"Error copying remote files to {destination}" test_result.fail_test(logger, "Process", message, sys.exc_info()) return_code = 16 - finally: - # Remove the temporary remote directory on each host - command = f"{sudo_command}rm -fr '{tmp_copy_dir}'" - if not run_remote(logger, hosts, command).passed: - message = f"Error removing temporary remote copy directory '{tmp_copy_dir}'" - test_result.fail_test(logger, "Process", message) - return_code = 16 + # Remove the temporary remote directory on each host + command = f"{sudo_command}rm -fr '{tmp_copy_dir}'" + if not run_remote(logger, hosts, command).passed: + message = f"Error removing temporary remote copy directory '{tmp_copy_dir}'" + test_result.fail_test(logger, "Process", message) + return_code = 16 return return_code @@ -648,14 +645,13 @@ def create_steps_log(logger, job_results_dir, test_result): job_log = os.path.join(test_logs_dir, 'job.log') step_log = os.path.join(test_logs_dir, 'steps.log') command = rf"grep -E '(INFO |ERROR)\| (==> Step|START|PASS|FAIL|ERROR)' {job_log}" - try: - result = run_local(logger, command) - with open(step_log, 'w', encoding="utf-8") as file: - file.write(result.stdout) - except Exception: # pylint: disable=broad-except + result = run_local(logger, command) + if not result.passed: message = f"Error creating {step_log}" test_result.fail_test(logger, "Process", message, sys.exc_info()) return 8192 + with open(step_log, 'w', encoding="utf-8") as file: + file.write(result.joined_stdout) return 0 @@ -713,9 +709,7 @@ def rename_avocado_test_dir(logger, test, job_results_dir, test_result, jenkins_ return 1024 # Remove latest symlink directory to avoid inclusion in the Jenkins build artifacts - try: - run_local(logger, f"rm -fr '{test_logs_lnk}'") - except RunException: + if not run_local(logger, f"rm -fr '{test_logs_lnk}'").passed: message = f"Error removing {test_logs_lnk}" test_result.fail_test(logger, "Process", message, sys.exc_info()) return 1024 diff --git a/src/tests/ftest/util/dfuse_utils.py b/src/tests/ftest/util/dfuse_utils.py index 46109fdb8b0..a26f372e76d 100644 --- a/src/tests/ftest/util/dfuse_utils.py +++ 
b/src/tests/ftest/util/dfuse_utils.py @@ -92,8 +92,7 @@ def _run_as_owner(self, hosts, command, timeout=120): Defaults to 120 seconds. Returns: - RemoteCommandResult: result of the command - + CommandResult: result of the command """ return run_remote( self.log, hosts, command_as_user(command, self.run_user), timeout=timeout) @@ -233,7 +232,7 @@ def run(self, check=True, mount_callback=None): Args: check (bool): Check if dfuse mounted properly after mount is executed. - mount_callback (method, optional): method to pass RemoteCommandResult to + mount_callback (method, optional): method to pass CommandResult to after mount. Default simply raises an exception on failure. Raises: @@ -504,7 +503,7 @@ def __init__(self, hosts, namespace="/run/verify_perms/*"): # run options self.hosts = hosts.copy() - self.timeout = 120 + self.timeout = 240 # Most usage requires root permission self.run_user = 'root' @@ -517,8 +516,7 @@ def run(self): CommandFailure: If the command fails Returns: - RemoteCommandResult: result from run_remote - + CommandResult: result from run_remote """ self.log.info('Running verify_perms.py on %s', str(self.hosts)) result = run_remote(self.log, self.hosts, self.with_exports, timeout=self.timeout) @@ -568,9 +566,7 @@ def _run_process(self, raise_exception=None): CommandFailure: if there is an error running the command Returns: - RemoteCommandResult: a grouping of the command results from the same host with the - same return status - + CommandResult: groups of command results from the same hosts with the same return status """ if raise_exception is None: raise_exception = self.exit_status_exception diff --git a/src/tests/ftest/util/fio_utils.py b/src/tests/ftest/util/fio_utils.py index 983f2255841..2c34eda01af 100644 --- a/src/tests/ftest/util/fio_utils.py +++ b/src/tests/ftest/util/fio_utils.py @@ -173,9 +173,7 @@ def _run_process(self, raise_exception=None): CommandFailure: if there is an error running the command Returns: - RemoteCommandResult: a grouping of the command results from the same hosts with the - same return status - + CommandResult: groups of command results from the same hosts with the same return status """ if not self._hosts: raise CommandFailure('No hosts specified for fio command') diff --git a/src/tests/ftest/util/general_utils.py b/src/tests/ftest/util/general_utils.py index 2ba711bde4a..e745c69160d 100644 --- a/src/tests/ftest/util/general_utils.py +++ b/src/tests/ftest/util/general_utils.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2018-2023 Intel Corporation. + (C) Copyright 2018-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -23,7 +23,7 @@ from avocado.utils import process from ClusterShell.NodeSet import NodeSet from ClusterShell.Task import task_self -from run_utils import RunException, get_clush_command, run_local, run_remote +from run_utils import get_clush_command, run_local, run_remote from user_utils import get_chown_command, get_primary_group @@ -1356,11 +1356,8 @@ def check_ping(log, host, expected_ping=True, cmd_timeout=60, verbose=True): Returns: bool: True if the expected number of pings were returned; False otherwise. 
""" - log.debug("Checking for %s to be %sresponsive", host, "" if expected_ping else "un") - try: - run_local( - log, "ping -c 1 {}".format(host), check=True, timeout=cmd_timeout, verbose=verbose) - except RunException: + log.debug("Checking for %s to be %s", host, "responsive" if expected_ping else "unresponsive") + if not run_local(log, f"ping -c 1 {host}", timeout=cmd_timeout, verbose=verbose).passed: return not expected_ping return expected_ping diff --git a/src/tests/ftest/util/io_utilities.py b/src/tests/ftest/util/io_utilities.py index 56a727cfa6f..af7cd37e8ca 100644 --- a/src/tests/ftest/util/io_utilities.py +++ b/src/tests/ftest/util/io_utilities.py @@ -367,7 +367,7 @@ def run(self): """Run the command. Returns: - RemoteCommandResult: result from run_remote + CommandResult: groups of command results from the same hosts with the same return status """ self.log.info('Running directory_tree.py on %s', str(self.hosts)) return run_remote(self.log, self.hosts, self.with_exports, timeout=self.timeout) diff --git a/src/tests/ftest/util/launch_utils.py b/src/tests/ftest/util/launch_utils.py index f1c359d7f4f..6753db5b85f 100644 --- a/src/tests/ftest/util/launch_utils.py +++ b/src/tests/ftest/util/launch_utils.py @@ -44,13 +44,11 @@ def fault_injection_enabled(logger): """ logger.debug("-" * 80) logger.debug("Checking for fault injection enablement via 'fault_status':") - try: - run_local(logger, "fault_status", check=True) + if run_local(logger, "fault_status").passed: logger.debug(" Fault injection is enabled") return True - except RunException: - # Command failed or yielded a non-zero return status - logger.debug(" Fault injection is disabled") + # Command failed or yielded a non-zero return status + logger.debug(" Fault injection is disabled") return False @@ -88,10 +86,7 @@ def display_disk_space(logger, path): """ logger.debug("-" * 80) logger.debug("Current disk space usage of %s", path) - try: - run_local(logger, f"df -h {path}", check=False) - except RunException: - pass + run_local(logger, f"df -h {path}") def summarize_run(logger, mode, status): @@ -153,9 +148,11 @@ def get_test_tag_info(logger, directory): test_tag_info = {} for test_file in sorted(list(map(str, Path(directory).rglob("*.py")))): command = f"grep -ER '(^class .*:|:avocado: tags=| def test_)' {test_file}" - output = run_local(logger, command, check=False, verbose=False) + result = run_local(logger, command, verbose=False) + if not result.passed: + continue data = re.findall( - r'(?:class (.*)\(.*\):|def (test_.*)\(|:avocado: tags=(.*))', output.stdout) + r'(?:class (.*)\(.*\):|def (test_.*)\(|:avocado: tags=(.*))', result.joined_stdout) class_key = None method_key = None for match in data: @@ -392,28 +389,27 @@ def execute(self, logger, test, repeat, number, sparse, fail_fast): "[Test %s/%s] Running the %s test on repetition %s/%s", number, self.total_tests, test, repeat, self.total_repeats) start_time = int(time.time()) - - try: - return_code = run_local( - logger, " ".join(command), capture_output=False, check=False).returncode - if return_code == 0: - logger.debug("All avocado test variants passed") - elif return_code & 2 == 2: - logger.debug("At least one avocado test variant failed") - elif return_code & 4 == 4: - message = "Failed avocado commands detected" - self.test_result.fail_test(logger, "Execute", message) - elif return_code & 8 == 8: - logger.debug("At least one avocado test variant was interrupted") - if return_code: - self._collect_crash_files(logger) - - except RunException: - message = f"Error 
executing {test} on repeat {repeat}" + result = run_local(logger, " ".join(command)) + end_time = int(time.time()) + return_code = result.output[0].returncode + if return_code == 0: + logger.debug("All avocado test variants passed") + elif return_code & 1 == 1: + logger.debug("At least one avocado test variant failed") + elif return_code & 2 == 2: + logger.debug("At least one avocado job failed") + elif return_code & 4 == 4: + message = "Failed avocado commands detected" + self.test_result.fail_test(logger, "Execute", message) + elif return_code & 8 == 8: + logger.debug("At least one avocado test variant was interrupted") + else: + message = f"Unhandled rc={return_code} while executing {test} on repeat {repeat}" self.test_result.fail_test(logger, "Execute", message, sys.exc_info()) return_code = 1 + if return_code: + self._collect_crash_files(logger) - end_time = int(time.time()) logger.info("Total test time: %ss", end_time - start_time) return return_code @@ -801,10 +797,11 @@ def _generate_certs(self, logger): certgen_dir = os.path.abspath( os.path.join("..", "..", "..", "..", "lib64", "daos", "certgen")) command = os.path.join(certgen_dir, "gen_certificates.sh") - try: - run_local(logger, f"/usr/bin/rm -rf {certs_dir}") - run_local(logger, f"{command} {test_env.log_dir}") - except RunException: + if not run_local(logger, f"/usr/bin/rm -rf {certs_dir}").passed: + message = "Error removing old certificates" + self.test_result.fail_test(logger, "Prepare", message, sys.exc_info()) + return 128 + if not run_local(logger, f"{command} {test_env.log_dir}").passed: message = "Error generating certificates" self.test_result.fail_test(logger, "Prepare", message, sys.exc_info()) return 128 @@ -826,12 +823,13 @@ def _collect_crash_files(self, logger): if crash_files: latest_crash_dir = os.path.join(avocado_logs_dir, "latest", "crashes") - try: - run_local(logger, f"mkdir -p {latest_crash_dir}", check=True) + if run_local(logger, f"mkdir -p {latest_crash_dir}").passed: for crash_file in crash_files: - run_local(logger, f"mv {crash_file} {latest_crash_dir}", check=True) - except RunException: - message = "Error collecting crash files" + if not run_local(logger, f"mv {crash_file} {latest_crash_dir}").passed: + message = "Error collecting crash files: mv" + self.test_result.fail_test(logger, "Execute", message, sys.exc_info()) + else: + message = "Error collecting crash files: mkdir" self.test_result.fail_test(logger, "Execute", message, sys.exc_info()) else: logger.debug("No avocado crash files found in %s", crash_dir) @@ -927,8 +925,10 @@ def list_tests(self, logger, verbose): # Find all the test files that contain tests matching the tags logger.debug("-" * 80) logger.info("Detecting tests matching tags: %s", " ".join(command)) - output = run_local(logger, " ".join(command), check=True) - unique_test_files = set(re.findall(self._avocado.get_list_regex(), output.stdout)) + result = run_local(logger, " ".join(command)) + if not result.passed: + raise RunException("Error running avocado list") + unique_test_files = set(re.findall(self._avocado.get_list_regex(), result.joined_stdout)) for index, test_file in enumerate(unique_test_files): self.tests.append(TestInfo(test_file, index + 1, self._yaml_extension)) logger.info(" %s", self.tests[-1]) @@ -1015,7 +1015,8 @@ def update_test_yaml(self, logger, scm_size, scm_mount, extra_yaml, multiplier, if new_yaml_file: if verbose > 0: # Optionally display a diff of the yaml file - run_local(logger, f"diff -y {test.yaml_file} {new_yaml_file}", check=False) + if not 
run_local(logger, f"diff -y {test.yaml_file} {new_yaml_file}").passed: + raise RunException(f"Error diff'ing {test.yaml_file}") test.yaml_file = new_yaml_file # Display the modified yaml file variants with debug @@ -1023,7 +1024,8 @@ def update_test_yaml(self, logger, scm_size, scm_mount, extra_yaml, multiplier, if test.extra_yaml: command.extend(test.extra_yaml) command.extend(["--summary", "3"]) - run_local(logger, " ".join(command)) + if not run_local(logger, " ".join(command)).passed: + raise RunException(f"Error listing test variants for {test.yaml_file}") # Collect the host information from the updated test yaml test.set_yaml_info(logger, include_localhost) @@ -1144,13 +1146,11 @@ def _setup_application_directory(self, logger, result): logger.debug(" Copying applications from the '%s' directory", self._test_env.app_src) run_local(logger, f"ls -al '{self._test_env.app_src}'") for app in os.listdir(self._test_env.app_src): - try: - run_local( - logger, - f"cp -r '{os.path.join(self._test_env.app_src, app)}' " - f"'{self._test_env.app_dir}'", - check=True) - except RunException: + result = run_local( + logger, + f"cp -r '{os.path.join(self._test_env.app_src, app)}' " + f"'{self._test_env.app_dir}'") + if not result.passed: message = 'Error copying files to the application directory' result.tests[-1].fail_test(logger, 'Run', message, sys.exc_info()) return 128 diff --git a/src/tests/ftest/util/package_utils.py b/src/tests/ftest/util/package_utils.py index 22eafd2dc93..6823c2d4202 100644 --- a/src/tests/ftest/util/package_utils.py +++ b/src/tests/ftest/util/package_utils.py @@ -1,5 +1,5 @@ """ -(C) Copyright 2023 Intel Corporation. +(C) Copyright 2023-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -40,7 +40,7 @@ def install_packages(log, hosts, packages, user=None, timeout=600): timeout (int, optional): timeout for the dnf install command. Defaults to 600. Returns: - RemoteCommandResult: the 'dnf install' command results + CommandResult: the 'dnf install' command results """ log.info('Installing packages on %s: %s', hosts, ', '.join(packages)) command = command_as_user(' '.join(['dnf', 'install', '-y'] + packages), user) @@ -58,7 +58,7 @@ def remove_packages(log, hosts, packages, user=None, timeout=600): timeout (int, optional): timeout for the dnf remove command. Defaults to 600. Returns: - RemoteCommandResult: the 'dnf remove' command results + CommandResult: the 'dnf remove' command results """ log.info('Removing packages on %s: %s', hosts, ', '.join(packages)) command = command_as_user(' '.join(['dnf', 'remove', '-y'] + packages), user) diff --git a/src/tests/ftest/util/run_utils.py b/src/tests/ftest/util/run_utils.py index 23ac53c76d7..638c5784381 100644 --- a/src/tests/ftest/util/run_utils.py +++ b/src/tests/ftest/util/run_utils.py @@ -4,9 +4,8 @@ SPDX-License-Identifier: BSD-2-Clause-Patent """ import os -import shlex -import subprocess # nosec import time +from getpass import getuser from socket import gethostname from ClusterShell.NodeSet import NodeSet @@ -68,23 +67,94 @@ def passed(self): Returns: bool: if the command was successful - """ return self.returncode == 0 -class RemoteCommandResult(): - """Stores the command result from a Task object.""" +class CommandResult(): + """Groups of command results from the same hosts with the same return status.""" def __init__(self, command, task): - """Create a RemoteCommandResult object. + """Create a CommandResult object. 
        Args:
            command (str): command executed
            task (Task): object containing the results from an executed clush command
        """
        self.output = []
+
+        # Get a dictionary of host list values for each unique return code key
+        return_codes = dict(task.iter_retcodes())
+
+        # Add any hosts that timed out using a 124 return code
+        timed_out_hosts = list(task.iter_keys_timeout())
+        if timed_out_hosts and 124 in return_codes:
+            # To be on the safe side, even though we typically wouldn't see 124 from iter_retcodes()
+            return_codes[124].extend(timed_out_hosts)
+        elif timed_out_hosts:
+            return_codes[124] = timed_out_hosts
+
+        # Populate a list of unique output for each NodeSet
+        for code in sorted(return_codes):
+            stdout_data = self.__sanitize_iter_data(
+                return_codes[code], list(task.iter_buffers(return_codes[code])), '')
+
+            for stdout_raw, stdout_hosts in stdout_data:
+                # In run_remote(), task.run() is executed with the stderr=False default.
+                # As a result task.iter_buffers() will return combined stdout and stderr.
+                stdout = self.__msg_tree_elem_to_list(stdout_raw)
+                stderr_data = self.__sanitize_iter_data(
+                    stdout_hosts, list(task.iter_errors(stdout_hosts)), '')
+                for stderr_raw, stderr_hosts in stderr_data:
+                    stderr = self.__msg_tree_elem_to_list(stderr_raw)
+                    self.output.append(
+                        ResultData(
+                            command, code, NodeSet.fromlist(stderr_hosts), stdout, stderr,
+                            bool(code == 124)))
+
+    @staticmethod
+    def __sanitize_iter_data(hosts, data, default_entry):
+        """Ensure the data generated from an iter function has entries for each host.
+
+        Args:
+            hosts (list): list of hosts which generated the data
+            data (list): data from an iter function as a list
+            default_entry (object): entry to add to data for missing hosts in data
+
+        Returns:
+            list: a list of tuples of entries and list of hosts
+        """
+        if not data:
+            return [(default_entry, hosts)]
+
+        source_keys = NodeSet.fromlist(hosts)
+        data_keys = NodeSet()
+        for _, keys in data:
+            data_keys.add(NodeSet.fromlist(keys))
+
+        sanitized_data = data.copy()
+        missing_keys = source_keys - data_keys
+        if missing_keys:
+            sanitized_data.append((default_entry, list(missing_keys)))
+        return sanitized_data
+
+    @staticmethod
+    def __msg_tree_elem_to_list(msg_tree_elem):
+        """Convert a ClusterShell.MsgTree.MsgTreeElem to a list of strings.
+
+        Args:
+            msg_tree_elem (MsgTreeElem): output from Task.iter_* method.
+ + Returns: + list: list of strings + """ + msg_tree_elem_list = [] + for line in msg_tree_elem.splitlines(): + if isinstance(line, bytes): + msg_tree_elem_list.append(line.decode("utf-8")) + else: + msg_tree_elem_list.append(line) + return msg_tree_elem_list @property def homogeneous(self): @@ -92,7 +162,6 @@ def homogeneous(self): Returns: bool: if all the hosts produced the same output - """ return len(self.output) == 1 @@ -102,7 +171,6 @@ def passed(self): Returns: bool: if the command was successful on each host - """ all_zero = all(data.passed for data in self.output) return all_zero and not self.timeout @@ -113,7 +181,6 @@ def timeout(self): Returns: bool: True if the command timed out on at least one set of hosts; False otherwise - """ return any(data.timeout for data in self.output) @@ -123,7 +190,6 @@ def passed_hosts(self): Returns: NodeSet: all nodes where the command passed - """ return NodeSet.fromlist(data.hosts for data in self.output if data.returncode == 0) @@ -133,7 +199,6 @@ def failed_hosts(self): Returns: NodeSet: all nodes where the command failed - """ return NodeSet.fromlist(data.hosts for data in self.output if data.returncode != 0) @@ -143,7 +208,6 @@ def all_stdout(self): Returns: dict: the stdout (the values) from each set of hosts (the keys, as a str of the NodeSet) - """ stdout = {} for data in self.output: @@ -156,96 +220,37 @@ def all_stderr(self): Returns: dict: the stderr (the values) from each set of hosts (the keys, as a str of the NodeSet) - """ stderr = {} for data in self.output: stderr[str(data.hosts)] = '\n'.join(data.stderr) return stderr - def _process_task(self, task, command): - """Populate the output list and determine the passed result for the specified task. - - Args: - task (Task): a ClusterShell.Task.Task object for the executed command - command (str): the executed command - """ - # Get a dictionary of host list values for each unique return code key - results = dict(task.iter_retcodes()) - - # Get a list of any hosts that timed out - timed_out = [str(hosts) for hosts in task.iter_keys_timeout()] - - # Populate the a list of unique output for each NodeSet - for code in sorted(results): - stdout_data = self._sanitize_iter_data( - results[code], list(task.iter_buffers(results[code])), '') - - for stdout_raw, stdout_hosts in stdout_data: - # In run_remote(), task.run() is executed with the stderr=False default. - # As a result task.iter_buffers() will return combined stdout and stderr. - stdout = self._msg_tree_elem_to_list(stdout_raw) - stderr_data = self._sanitize_iter_data( - stdout_hosts, list(task.iter_errors(stdout_hosts)), '') - for stderr_raw, stderr_hosts in stderr_data: - stderr = self._msg_tree_elem_to_list(stderr_raw) - self.output.append( - ResultData( - command, code, NodeSet.fromlist(stderr_hosts), stdout, stderr, False)) - if timed_out: - self.output.append( - ResultData(command, 124, NodeSet.fromlist(timed_out), None, None, True)) - - @staticmethod - def _sanitize_iter_data(hosts, data, default_entry): - """Ensure the data generated from an iter function has entries for each host. - - Args: - hosts (list): lists of host which generated data - data (list): data from an iter function as a list - default_entry (object): entry to add to data for missing hosts in data + @property + def joined_stdout(self): + """Get all of the stdout from the issued command from each host joined by newlines. 
Returns: - list: a list of tuples of entries and list of hosts + str: all of the stdout from each host joined by newlines """ - if not data: - return [(default_entry, hosts)] + all_stdout = self.all_stdout + return '\n'.join(filter(None, [all_stdout[key] for key in sorted(all_stdout)])) - source_keys = NodeSet.fromlist(hosts) - data_keys = NodeSet() - for _, keys in data: - data_keys.add(NodeSet.fromlist(keys)) - - sanitized_data = data.copy() - missing_keys = source_keys - data_keys - if missing_keys: - sanitized_data.append((default_entry, list(missing_keys))) - return sanitized_data - - @staticmethod - def _msg_tree_elem_to_list(msg_tree_elem): - """Convert a ClusterShell.MsgTree.MsgTreeElem to a list of strings. - - Args: - msg_tree_elem (MsgTreeElem): output from Task.iter_* method. + @property + def joined_stderr(self): + """Get all of the stderr from the issued command from each host joined by newlines. Returns: - list: list of strings + str: all of the stderr from each host joined by newlines """ - msg_tree_elem_list = [] - for line in msg_tree_elem.splitlines(): - if isinstance(line, bytes): - msg_tree_elem_list.append(line.decode("utf-8")) - else: - msg_tree_elem_list.append(line) - return msg_tree_elem_list + all_stderr = self.all_stderr + return '\n'.join(filter(None, [all_stderr[key] for key in sorted(all_stderr)])) def log_output(self, log): """Log the command result. Args: log (logger): logger for the messages produced by this method - """ for data in self.output: log_result_data(log, data) @@ -289,7 +294,6 @@ def get_clush_command(hosts, args=None, command="", command_env=None, command_su Returns: str: the clush command - """ cmd_list = ["clush"] if args: @@ -301,84 +305,40 @@ def get_clush_command(hosts, args=None, command="", command_env=None, command_su return " ".join(cmd_list) -def run_local(log, command, capture_output=True, timeout=None, check=False, verbose=True): - """Run the command locally. +def run_local(log, command, verbose=True, timeout=None, task_debug=False, stderr=False): + """Run the command on the local host. Args: log (logger): logger for the messages produced by this method command (str): command from which to obtain the output - capture_output(bool, optional): whether or not to include the command output in the - subprocess.CompletedProcess.stdout returned by this method. Defaults to True. + verbose (bool, optional): log the command output. Defaults to True. timeout (int, optional): number of seconds to wait for the command to complete. Defaults to None. - check (bool, optional): if set the method will raise an exception if the command does not - yield a return code equal to zero. Defaults to False. - verbose (bool, optional): if set log the output of the command (capture_output must also be - set). Defaults to True. - - Raises: - RunException: if the command fails: times out (timeout must be specified), - yields a non-zero exit status (check must be True), is interrupted by the user, or - encounters some other exception. + task_debug (bool, optional): whether to enable debug for the task object. Defaults to False. + stderr (bool, optional): whether to enable stdout/stderr separation. Defaults to False. 
Returns: - subprocess.CompletedProcess: an object representing the result of the command execution with - the following properties: - - args (the command argument) - - returncode - - stdout (only set if capture_output=True) - - stderr (not used; included in stdout) - + CommandResult: groups of command results from the same hosts with the same return status """ - local_host = gethostname().split(".")[0] - kwargs = {"encoding": "utf-8", "shell": False, "check": check, "timeout": timeout} - if capture_output: - kwargs["stdout"] = subprocess.PIPE - kwargs["stderr"] = subprocess.STDOUT - if timeout and verbose: - log.debug("Running on %s with a %s timeout: %s", local_host, timeout, command) - elif verbose: - log.debug("Running on %s: %s", local_host, command) - - try: - # pylint: disable=subprocess-run-check - result = subprocess.run(shlex.split(command), **kwargs) # nosec - - except subprocess.TimeoutExpired as error: - # Raised if command times out - log.debug(str(error)) - log.debug(" output: %s", error.output) - log.debug(" stderr: %s", error.stderr) - raise RunException(f"Command '{command}' exceed {timeout}s timeout") from error - - except subprocess.CalledProcessError as error: - # Raised if command yields a non-zero return status with check=True - log.debug(str(error)) - log.debug(" output: %s", error.output) - log.debug(" stderr: %s", error.stderr) - raise RunException(f"Command '{command}' returned non-zero status") from error - - except KeyboardInterrupt as error: - # User Ctrl-C - message = f"Command '{command}' interrupted by user" - log.debug(message) - raise RunException(message) from error - - except Exception as error: - # Catch all - message = f"Command '{command}' encountered unknown error" - log.debug(message) - log.debug(str(error)) - raise RunException(message) from error - - if capture_output and verbose: - # Log the output of the command - log.debug(" %s (rc=%s):", local_host, result.returncode) - if result.stdout: - for line in result.stdout.splitlines(): - log.debug(" %s", line) - - return result + local_host = NodeSet(gethostname().split(".")[0]) + task = task_self() + task.set_info('debug', task_debug) + task.set_default("stderr", stderr) + if verbose: + if timeout is None: + log.debug("Running on %s without a timeout: %s", local_host, command) + else: + log.debug("Running on %s with a %s second timeout: %s", local_host, timeout, command) + task.run(command=command, key=str(local_host), timeout=timeout) + results = CommandResult(command, task) + if verbose: + results.log_output(log) + else: + # Always log any failed commands + for data in results.output: + if not data.passed: + log_result_data(log, data) + return results def run_remote(log, hosts, command, verbose=True, timeout=120, task_debug=False, stderr=False, @@ -398,9 +358,7 @@ def run_remote(log, hosts, command, verbose=True, timeout=120, task_debug=False, clush default (64) or available cores Returns: - RemoteCommandResult: a grouping of the command results from the same hosts with the same - return status - + CommandResult: groups of command results from the same hosts with the same return status """ task = task_self() task.set_info('debug', task_debug) @@ -417,7 +375,7 @@ def run_remote(log, hosts, command, verbose=True, timeout=120, task_debug=False, else: log.debug("Running on %s with a %s second timeout: %s", hosts, timeout, command) task.run(command=command, nodes=hosts, timeout=timeout) - results = RemoteCommandResult(command, task) + results = CommandResult(command, task) if verbose: 
results.log_output(log) else: @@ -439,9 +397,8 @@ def command_as_user(command, user, env=None): Returns: str: command adjusted to run as another user - """ - if not user: + if not user or user == getuser(): if not env: return command return " ".join([env.to_export_str(), command]).strip() @@ -469,7 +426,6 @@ def find_command(source, pattern, depth, other=None): Returns: str: the find command - """ command = ["find", source, "-maxdepth", str(depth), "-type", "f", "-name", f"'{pattern}'"] if isinstance(other, list): @@ -498,7 +454,6 @@ def stop_processes(log, hosts, pattern, verbose=True, timeout=60, exclude=None, matching the pattern were initially detected and the second NodeSet indicates on which hosts the processes matching the pattern are still running (will be empty if every process was killed or no process matching the pattern were found). - """ processes_detected = NodeSet() processes_running = NodeSet() diff --git a/src/tests/ftest/util/server_utils.py b/src/tests/ftest/util/server_utils.py index 15e9f08ae30..299103fd577 100644 --- a/src/tests/ftest/util/server_utils.py +++ b/src/tests/ftest/util/server_utils.py @@ -395,9 +395,7 @@ def scm_prepare(self, **kwargs): DaosServerCommand.ScmSubCommand.PrepareSubCommand object Raises: - RemoteCommandResult: a grouping of the command results from the same hosts with the same - return status - + CommandResult: groups of command results from the same hosts with the same return status """ cmd = DaosServerCommand(self.manager.job.command_path) cmd.sudo = False @@ -420,9 +418,7 @@ def scm_reset(self, **kwargs): DaosServerCommand.ScmSubCommand.ResetSubCommand object Raises: - RemoteCommandResult: a grouping of the command results from the same hosts with the same - return status - + CommandResult: groups of command results from the same hosts with the same return status """ cmd = DaosServerCommand(self.manager.job.command_path) cmd.sudo = False @@ -440,9 +436,7 @@ def nvme_prepare(self, **kwargs): DaosServerCommand.NvmeSubCommand.PrepareSubCommand object Returns: - RemoteCommandResult: a grouping of the command results from the same hosts with the same - return status - + CommandResult: groups of command results from the same hosts with the same return status """ cmd = DaosServerCommand(self.manager.job.command_path) cmd.sudo = False @@ -460,9 +454,7 @@ def support_collect_log(self, **kwargs): DaosServerCommand.SupportSubCommand.CollectLogSubCommand object Returns: - RemoteCommandResult: a grouping of the command results from the same hosts with the same - return status - + CommandResult: groups of command results from the same hosts with the same return status """ cmd = DaosServerCommand(self.manager.job.command_path) cmd.run_user = "daos_server" diff --git a/src/tests/ftest/util/slurm_utils.py b/src/tests/ftest/util/slurm_utils.py index 9ba92b0f8c9..f2b4de4b393 100644 --- a/src/tests/ftest/util/slurm_utils.py +++ b/src/tests/ftest/util/slurm_utils.py @@ -1,5 +1,5 @@ """ -(C) Copyright 2019-2023 Intel Corporation. +(C) Copyright 2019-2024 Intel Corporation. 
SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -11,7 +11,7 @@ from ClusterShell.NodeSet import NodeSet, NodeSetParseError # pylint: disable=import-error,no-name-in-module -from util.run_utils import RunException, run_local, run_remote +from util.run_utils import run_local, run_remote PACKAGES = ['slurm', 'slurm-example-configs', 'slurm-slurmctld', 'slurm-slurmd'] W_LOCK = threading.Lock() @@ -30,8 +30,7 @@ def cancel_jobs(log, control, job_id): job_id (int): slurm job id Returns: - RemoteCommandResult: results from the scancel command - + CommandResult: results from the scancel command """ command = ['scancel', str(job_id)] return run_remote(log, control, ' '.join(command)) @@ -52,8 +51,7 @@ def create_partition(log, control, name, hosts, default='yes', max_time='UNLIMIT state (str, optional): state of jobs that can be allocated. Defaults to 'up'. Returns: - RemoteCommandResult: results from the scontrol command - + CommandResult: results from the scontrol command """ command = ['scontrol', 'create'] command.append('='.join(['PartitionName', str(name)])) @@ -90,8 +88,7 @@ def show_partition(log, control, name): name (str): slurm partition name Returns: - RemoteCommandResult: results from the scontrol command - + CommandResult: results from the scontrol command """ command = ['scontrol', 'show', 'partition', str(name)] return run_remote(log, control, ' '.join(command)) @@ -106,8 +103,7 @@ def show_reservation(log, control, name): name (str): slurm reservation name Returns: - RemoteCommandResult: results from the scontrol command - + CommandResult: results from the scontrol command """ command = ['scontrol', 'show', 'reservation', str(name)] return run_remote(log, control, ' '.join(command)) @@ -121,8 +117,7 @@ def sinfo(log, control): control (NodeSet): slurm control host Returns: - RemoteCommandResult: results from the sinfo command - + CommandResult: results from the sinfo command """ return run_remote(log, control, 'sinfo') @@ -137,7 +132,6 @@ def sbatch(log, script, log_file=None): Returns: CommandResult: results from the sbatch command - """ command = ['sbatch'] if log_file: @@ -285,7 +279,7 @@ def run_slurm_script(log, script, logfile=None): """ job_id = None result = sbatch(log, script, logfile) - match = re.search(r"Submitted\s+batch\s+job\s+(\d+)", result.stdout) + match = re.search(r"Submitted\s+batch\s+job\s+(\d+)", result.joined_stdout) if match is not None: job_id = match.group(1) else: @@ -306,14 +300,12 @@ def check_slurm_job(log, handle): """ state = "UNKNOWN" - command = ["scontrol", "show", "job", handle] - try: - result = run_local(log, ' '.join(command), verbose=False, check=True) - match = re.search(r"JobState=([a-zA-Z]+)", result.stdout) + command = f"scontrol show job {handle}" + result = run_local(log, command, verbose=False) + if result.passed: + match = re.search(r"JobState=([a-zA-Z]+)", result.joined_stdout) if match is not None: state = match.group(1) - except RunException as error: - log.debug(str(error)) return state @@ -401,8 +393,7 @@ def srun(log, control, hosts, cmd, srun_params=None, timeout=60): timeout (int, optional): timeout for the srun command. Defaults to 60. 
     Returns:
-        RemoteCommandResult: results from the srun command
-
+        CommandResult: results from the srun command
     """
     srun_time = max(int(timeout / 60), 1)
     cmd = srun_str(hosts, cmd, srun_params, str(srun_time))
diff --git a/src/tests/ftest/util/soak_test_base.py b/src/tests/ftest/util/soak_test_base.py
index a97bb2094eb..b5e35949218 100644
--- a/src/tests/ftest/util/soak_test_base.py
+++ b/src/tests/ftest/util/soak_test_base.py
@@ -21,7 +21,7 @@
 from exception_utils import CommandFailure
 from general_utils import journalctl_time
 from host_utils import get_local_host
-from run_utils import RunException, run_local, run_remote
+from run_utils import run_local, run_remote
 from soak_utils import (SoakTestError, add_pools, build_job_script, cleanup_dfuse,
                         create_app_cmdline, create_dm_cmdline, create_fio_cmdline,
                         create_ior_cmdline, create_macsio_cmdline, create_mdtest_cmdline,
@@ -137,11 +137,9 @@ def pre_tear_down(self):
             self.log.info("<>", job_id)
             cmd = "scancel --partition {} -u {} {}".format(
                 self.host_info.clients.partition.name, self.username, job_id)
-            try:
-                run_local(self.log, cmd, timeout=120)
-            except RunException as error:
+            if not run_local(self.log, cmd, timeout=120).passed:
                 # The command failed with a non-zero exit status
-                errors.append(f"Failed to cancel jobs {self.failed_job_id_list}: {error}")
+                errors.append(f"Failed to cancel jobs {self.failed_job_id_list}")
         if self.all_failed_jobs:
             errors.append("SOAK FAILED: The following jobs failed {} ".format(
                 ", ".join(str(j_id) for j_id in self.all_failed_jobs)))
@@ -473,11 +471,10 @@ def job_completion(self, job_id_list):
                 if not result.passed:
                     self.log.error("Remote copy failed on %s", str(result.failed_hosts))
             # copy the local files; local host not included in hostlist_client
-            try:
-                run_local(self.log, cmd, timeout=600)
-                run_local(self.log, cmd2, timeout=600)
-            except RunException as error:
-                self.log.info("Local copy failed with %s", error)
+            if not run_local(self.log, cmd, timeout=600).passed:
+                self.log.info("Local copy failed: %s", cmd)
+            if not run_local(self.log, cmd2, timeout=600).passed:
+                self.log.info("Local copy failed: %s", cmd2)
         self.soak_results = {}
         return job_id_list
@@ -604,11 +601,8 @@ def run_soak(self, test_param):
             " ".join([pool.identifier for pool in self.pool]))
         # cleanup soak log directories before test
-        try:
-            run_local(self.log, f"rm -rf {self.soak_dir}/*", timeout=300)
-        except RunException as error:
-            raise SoakTestError(
-                f"<>") from error
+        if not run_local(self.log, f"rm -rf {self.soak_dir}/*", timeout=300).passed:
+            raise SoakTestError(f"<>")
         if self.enable_remote_logging:
             result = run_remote(
                 self.log, self.hostlist_clients, f"rm -rf {self.soak_dir}/*", timeout=300)
             if not result.passed:
@@ -616,11 +610,9 @@ def run_soak(self, test_param):
                 raise SoakTestError(
                     f"<> {str(result.failed_hosts)}")
         else:
-            try:
-                run_local(self.log, f"rm -rf {self.sharedsoak_dir}/*", timeout=300)
-            except RunException as error:
+            if not run_local(self.log, f"rm -rf {self.sharedsoak_dir}/*", timeout=300).passed:
                 raise SoakTestError(
-                    f"<>") from error
+                    f"<>")
         # Baseline metrics data
         run_metrics_check(self, prefix="initial")
         # Initialize time
diff --git a/src/tests/ftest/util/user_utils.py b/src/tests/ftest/util/user_utils.py
index 368a25862db..ce5a58ed4f7 100644
--- a/src/tests/ftest/util/user_utils.py
+++ b/src/tests/ftest/util/user_utils.py
@@ -1,5 +1,5 @@
 """
- (C) Copyright 2018-2023 Intel Corporation.
+ (C) Copyright 2018-2024 Intel Corporation.
SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -81,8 +81,7 @@ def getent(log, hosts, database, key, sudo=False): sudo (bool): whether to execute commands with sudo Returns: - RemoteCommandResult: result of run_remote() - + CommandResult: groups of command results from the same hosts with the same return status """ command = ' '.join(filter(None, [ 'sudo -n' if sudo else None, @@ -103,8 +102,7 @@ def groupadd(log, hosts, group, force=False, sudo=False): sudo (bool, optional): whether to execute commands with sudo. Default is False Returns: - RemoteCommandResult: result of run_remote() - + CommandResult: groups of command results from the same hosts with the same return status """ command = ' '.join(filter(None, [ 'sudo -n' if sudo else None, @@ -127,8 +125,7 @@ def useradd(log, hosts, user, group=None, parent_dir=None, sudo=False): sudo (bool): whether to execute commands with sudo. Default is False Returns: - RemoteCommandResult: result of run_remote() - + CommandResult: groups of command results from the same hosts with the same return status """ command = ' '.join(filter(None, [ 'sudo -n' if sudo else None, @@ -150,8 +147,7 @@ def userdel(log, hosts, user, sudo=False): sudo (bool): whether to execute commands with sudo. Default is False Returns: - RemoteCommandResult: result of run_remote() - + CommandResult: groups of command results from the same hosts with the same return status """ command = ' '.join(filter(None, [ 'sudo -n' if sudo else None, diff --git a/src/tests/ftest/verify_perms.py b/src/tests/ftest/verify_perms.py index 1596a5664e9..21b3a986c03 100755 --- a/src/tests/ftest/verify_perms.py +++ b/src/tests/ftest/verify_perms.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ - (C) Copyright 2022-2023 Intel Corporation. + (C) Copyright 2022-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -17,7 +17,7 @@ # pylint: disable=import-error,no-name-in-module from util.logger_utils import get_console_handler -from util.run_utils import RunException, run_local +from util.run_utils import run_local from util.user_utils import get_user_uid_gid # Set up a logger for the console messages @@ -290,10 +290,7 @@ def _real_x(entry_type, path): ''' if entry_type == 'file': - try: - return run_local(logger, path, check=True, verbose=False).returncode == 0 - except RunException: - return False + return run_local(logger, path, verbose=False).passed if entry_type == 'dir': try: os.chdir(path) From ce2e20e810c1221445af3c12c86a6b13e426bb4e Mon Sep 17 00:00:00 2001 From: Michael MacDonald Date: Tue, 30 Jul 2024 15:28:49 -0400 Subject: [PATCH 09/11] DAOS-16013 tools: Query all pool targets by default (#14783) Running `(daos|dmg) pool query-targets` with just a rank argument should query all targets on that rank. 
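
To make the intended behavior concrete: the per-rank target count falls out of a
standard pool query, since every engine hosts the same number of targets. A minimal
C sketch of the same derivation against the public libdaos API (illustrative only;
the helper name is not part of this patch, and poh is assumed to be an
already-connected pool handle):

#include <daos.h>

/* Illustrative helper (not part of this patch): derive the number of
 * targets per engine rank from a standard pool query. */
static int
targets_per_rank(daos_handle_t poh, uint32_t *tgt_count)
{
	daos_pool_info_t info = {0};
	int              rc;

	rc = daos_pool_query(poh, NULL, &info, NULL, NULL);
	if (rc != 0)
		return rc;

	if (info.pi_ntargets == 0 || info.pi_nnodes == 0)
		return -DER_INVAL;

	/* Every engine hosts the same number of targets, so the total
	 * divided by the engine count is the per-rank target count. */
	*tgt_count = info.pi_ntargets / info.pi_nnodes;
	return 0;
}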
Signed-off-by: Michael MacDonald --- src/control/cmd/daos/pool.go | 14 +++++++++++++ src/control/cmd/dmg/pool.go | 26 ++++++++++++++++++++---- src/control/cmd/dmg/pool_test.go | 35 ++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+), 4 deletions(-) diff --git a/src/control/cmd/daos/pool.go b/src/control/cmd/daos/pool.go index 275e0859005..2aae717766e 100644 --- a/src/control/cmd/daos/pool.go +++ b/src/control/cmd/daos/pool.go @@ -416,6 +416,20 @@ func (cmd *poolQueryTargetsCmd) Execute(_ []string) error { return errors.WithMessage(err, "parsing target list") } + if len(idxList) == 0 { + pi, err := queryPool(cmd.cPoolHandle, daos.HealthOnlyPoolQueryMask) + if err != nil || (pi.TotalTargets == 0 || pi.TotalEngines == 0) { + if err != nil { + return errors.Wrap(err, "pool query failed") + } + return errors.New("failed to derive target count from pool query") + } + tgtCount := pi.TotalTargets / pi.TotalEngines + for i := uint32(0); i < tgtCount; i++ { + idxList = append(idxList, i) + } + } + ptInfo := new(C.daos_target_info_t) var rc C.int diff --git a/src/control/cmd/dmg/pool.go b/src/control/cmd/dmg/pool.go index e75043873a2..612051c27cf 100644 --- a/src/control/cmd/dmg/pool.go +++ b/src/control/cmd/dmg/pool.go @@ -678,15 +678,33 @@ type PoolQueryTargetsCmd struct { poolCmd Rank uint32 `long:"rank" required:"1" description:"Engine rank of the targets to be queried"` - Targets string `long:"target-idx" required:"1" description:"Comma-separated list of target idx(s) to be queried"` + Targets string `long:"target-idx" description:"Comma-separated list of target idx(s) to be queried"` } // Execute is run when PoolQueryTargetsCmd subcommand is activated func (cmd *PoolQueryTargetsCmd) Execute(args []string) error { + ctx := cmd.MustLogCtx() var tgtsList []uint32 - if err := common.ParseNumberList(cmd.Targets, &tgtsList); err != nil { - return errors.WithMessage(err, "parsing target list") + if len(cmd.Targets) > 0 { + if err := common.ParseNumberList(cmd.Targets, &tgtsList); err != nil { + return errors.WithMessage(err, "parsing target list") + } + } else { + pi, err := control.PoolQuery(ctx, cmd.ctlInvoker, &control.PoolQueryReq{ + ID: cmd.PoolID().String(), + QueryMask: daos.DefaultPoolQueryMask, + }) + if err != nil || (pi.TotalTargets == 0 || pi.TotalEngines == 0) { + if err != nil { + return errors.Wrap(err, "pool query failed") + } + return errors.New("failed to derive target count from pool query") + } + tgtCount := pi.TotalTargets / pi.TotalEngines + for i := uint32(0); i < tgtCount; i++ { + tgtsList = append(tgtsList, i) + } } req := &control.PoolQueryTargetReq{ @@ -695,7 +713,7 @@ func (cmd *PoolQueryTargetsCmd) Execute(args []string) error { Targets: tgtsList, } - resp, err := control.PoolQueryTargets(cmd.MustLogCtx(), cmd.ctlInvoker, req) + resp, err := control.PoolQueryTargets(ctx, cmd.ctlInvoker, req) if cmd.JSONOutputEnabled() { return cmd.OutputJSON(resp, err) diff --git a/src/control/cmd/dmg/pool_test.go b/src/control/cmd/dmg/pool_test.go index d02579b3a7f..67bf72bf54e 100644 --- a/src/control/cmd/dmg/pool_test.go +++ b/src/control/cmd/dmg/pool_test.go @@ -957,6 +957,41 @@ func TestPoolCommands(t *testing.T) { }, " "), nil, }, + { + "Query pool targets no arguments", + "pool query-targets mypool", + "", + errMissingFlag, + }, + { + "Query pool targets no rank argument", + "pool query-targets mypool --target-idx=1", + "", + errMissingFlag, + }, + { + "Query pool targets specific indices", + "pool query-targets mypool --rank=1 --target-idx=1,3", + 
strings.Join([]string{ + printRequest(t, &control.PoolQueryTargetReq{ + ID: "mypool", + Rank: 1, + Targets: []uint32{1, 3}, + }), + }, " "), + nil, + }, + { + "Query pool targets all indices; pool query fails", + "pool query-targets mypool --rank=1", + strings.Join([]string{ + printRequest(t, &control.PoolQueryReq{ + ID: "mypool", + QueryMask: daos.DefaultPoolQueryMask, + }), + }, " "), + errors.New("pool query"), + }, { "Query pool with UUID", "pool query 12345678-1234-1234-1234-1234567890ab", From 26df5454529bbb8cb009af544894a4a2819864c0 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Wed, 31 Jul 2024 09:37:26 +0800 Subject: [PATCH 10/11] DAOS-16264 common: Fix incorrect assertion (#14809) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the LRU cache is performing eviction, new lookups should fail. Currently, this logic is implemented on the caller’s side. Let's move this logic to the DAOS LRU side to return DER_SHUTDOWN if LRU is evicting and remove incorrect assertion. Signed-off-by: Wang Shilong --- src/common/lru.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/common/lru.c b/src/common/lru.c index 98d3f32bcf5..bcd3f582a33 100644 --- a/src/common/lru.c +++ b/src/common/lru.c @@ -25,7 +25,6 @@ lru_hop_rec_addref(struct d_hash_table *htable, d_list_t *link) { struct daos_llink *llink = link2llink(link); - D_ASSERT(llink->ll_evicting == 0 && llink->ll_evicted == 0); llink->ll_ref++; } @@ -216,6 +215,10 @@ daos_lru_ref_hold(struct daos_lru_cache *lcache, void *key, if (link != NULL) { llink = link2llink(link); D_ASSERT(llink->ll_evicted == 0); + if (llink->ll_evicting) { + daos_lru_ref_release(lcache, llink); + D_GOTO(out, rc = -DER_SHUTDOWN); + } /* remove busy item from LRU */ if (!d_list_empty(&llink->ll_qlink)) d_list_del_init(&llink->ll_qlink); From a0af03d308f82365a7a11022e24dbf820b8868bd Mon Sep 17 00:00:00 2001 From: wiliamhuang Date: Wed, 31 Jul 2024 09:15:49 -0500 Subject: [PATCH 11/11] DAOS-16286 client: intercept fdatasync() (#14835) Signed-off-by: Lei Huang --- src/client/dfuse/pil4dfs/int_dfs.c | 24 ++++++++++++++++++++++++ src/tests/suite/dfuse_test.c | 4 ++++ 2 files changed, 28 insertions(+) diff --git a/src/client/dfuse/pil4dfs/int_dfs.c b/src/client/dfuse/pil4dfs/int_dfs.c index 95688b8f280..ad748852888 100644 --- a/src/client/dfuse/pil4dfs/int_dfs.c +++ b/src/client/dfuse/pil4dfs/int_dfs.c @@ -399,6 +399,8 @@ static int (*next_unlinkat)(int dirfd, const char *path, int flags); static int (*next_fsync)(int fd); +static int (*next_fdatasync)(int fd); + static int (*next_truncate)(const char *path, off_t length); static int (*next_ftruncate)(int fd, off_t length); @@ -5309,6 +5311,28 @@ fsync(int fd) return 0; } +int +fdatasync(int fd) +{ + int fd_directed; + + if (next_fdatasync == NULL) { + next_fdatasync = dlsym(RTLD_NEXT, "fdatasync"); + D_ASSERT(next_fdatasync != NULL); + } + if (!d_hook_enabled) + return next_fdatasync(fd); + + fd_directed = d_get_fd_redirected(fd); + if (fd_directed < FD_FILE_BASE) + return next_fdatasync(fd); + + if (fd < FD_DIR_BASE && d_compatible_mode) + return next_fdatasync(fd); + + return 0; +} + int ftruncate(int fd, off_t length) { diff --git a/src/tests/suite/dfuse_test.c b/src/tests/suite/dfuse_test.c index 6cddb317fa7..acf46e6b7e4 100644 --- a/src/tests/suite/dfuse_test.c +++ b/src/tests/suite/dfuse_test.c @@ -101,6 +101,10 @@ do_openat(void **state) rc = write(fd, input_buf, sizeof(input_buf)); assert_return_code(rc, errno); + /* test fdatasync() */ + rc = 
fdatasync(fd); + assert_return_code(rc, errno); + /* First fstat. IL will forward this to the kernel so it can save ino for future calls */ rc = fstat(fd, &stbuf0); assert_return_code(rc, errno);
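
The interception pattern above generalizes to any libc entry point: lazily resolve
the next symbol in the lookup chain with dlsym(RTLD_NEXT, ...), forward calls for
file descriptors the library does not manage, and short-circuit the rest. A
standalone sketch of that pattern (fd_is_dfs_backed() is a hypothetical stand-in
for the d_get_fd_redirected()/FD_FILE_BASE checks that pil4dfs performs):

#define _GNU_SOURCE
#include <dlfcn.h>

static int (*real_fdatasync)(int fd);

/* Hypothetical stand-in for the pil4dfs fd-table lookup. */
static int
fd_is_dfs_backed(int fd)
{
	(void)fd;
	return 0;
}

int
fdatasync(int fd)
{
	/* Resolve the real libc implementation once, on first use. */
	if (real_fdatasync == NULL)
		real_fdatasync = (int (*)(int))dlsym(RTLD_NEXT, "fdatasync");

	/* Forward descriptors that are not DFS-backed. */
	if (!fd_is_dfs_backed(fd))
		return real_fdatasync(fd);

	/* DFS-backed descriptors bypass the kernel page cache entirely,
	 * so there is nothing to flush; report success, mirroring the
	 * fsync() wrapper above. */
	return 0;
}

Compiled into a shared object and injected with LD_PRELOAD, such a wrapper takes
precedence over the libc symbol in dynamically linked programs, which is how
libpil4dfs installs itself.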