From ee81e40f3f959cfafd60af808a44b0be1defd961 Mon Sep 17 00:00:00 2001 From: Ashley Pittman Date: Mon, 29 Apr 2024 18:49:20 +0100 Subject: [PATCH 01/10] DAOS-15745 dfuse: Add the pre_read metrics whilst holding reference. (#14256) Increase the pre-read statistics before replying to the read, otherwise the oh might not be valid which can lead to unexpected behaviour. Signed-off-by: Ashley Pittman --- src/client/dfuse/ops/read.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/client/dfuse/ops/read.c b/src/client/dfuse/ops/read.c index 991755d461b..26c97204fbb 100644 --- a/src/client/dfuse/ops/read.c +++ b/src/client/dfuse/ops/read.c @@ -97,6 +97,7 @@ dfuse_readahead_reply(fuse_req_t req, size_t len, off_t position, struct dfuse_o position + reply_len - 1, position + reply_len, position + len - 1); } + DFUSE_IE_STAT_ADD(oh->doh_ie, DS_PRE_READ); DFUSE_REPLY_BUFQ(oh, req, oh->doh_readahead->dra_ev->de_iov.iov_buf + position, reply_len); return true; } @@ -143,10 +144,8 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct replied = dfuse_readahead_reply(req, len, position, oh); D_MUTEX_UNLOCK(&oh->doh_readahead->dra_lock); - if (replied) { - DFUSE_IE_STAT_ADD(oh->doh_ie, DS_PRE_READ); + if (replied) return; - } } eqt_idx = atomic_fetch_add_relaxed(&dfuse_info->di_eqt_idx, 1); From ae34616f10d2b6e561447c385d1017c1850f7cbe Mon Sep 17 00:00:00 2001 From: dinghwah <48604964+dinghwah@users.noreply.github.com> Date: Mon, 29 Apr 2024 14:45:59 -0400 Subject: [PATCH 02/10] DAOS-15628 test: Verify maximum containers create with and without dup metadata ops (#14243) implement more test to metadata with disable svc. Signed-off-by: Ding Ho --- src/tests/ftest/server/metadata.py | 125 +++++++++++++++++++-------- src/tests/ftest/server/metadata.yaml | 17 ++-- 2 files changed, 98 insertions(+), 44 deletions(-) diff --git a/src/tests/ftest/server/metadata.py b/src/tests/ftest/server/metadata.py index 1125003816a..e072ce5351b 100644 --- a/src/tests/ftest/server/metadata.py +++ b/src/tests/ftest/server/metadata.py @@ -1,5 +1,5 @@ """ - (C) Copyright 2019-2023 Intel Corporation. + (C) Copyright 2019-2024 Intel Corporation. SPDX-License-Identifier: BSD-2-Clause-Patent """ @@ -86,9 +86,19 @@ def pre_tear_down(self): self.log.debug("no pre-teardown steps defined") return error_list - def create_pool(self): - """Create a pool and display the svc ranks.""" - self.add_pool() + def create_pool(self, svc_ops_enabled=True): + """Create a pool and display the svc ranks. + + Args: + svc_ops_enabled (bool, optional): pool create with svc_ops_enabled. Defaults to True. + + """ + if svc_ops_enabled: + self.add_pool() + else: + params = {} + params['properties'] = "svc_ops_enabled:0" + self.add_pool(**params) self.log.info("Created pool %s: svc ranks:", self.pool.uuid) for index, rank in enumerate(self.pool.svc_ranks): self.log.info("[%d]: %d", index, rank) @@ -236,24 +246,21 @@ def run_dummy_metadata_workload(self, duration=150): return True - def test_metadata_fillup(self): - """JIRA ID: DAOS-1512. - - Test Description: - Test to verify no IO happens after metadata is full. + def metadata_fillup(self, svc_ops_enabled=True): + """Run test to verify number of resources that can be created until metadata is full. - Use Cases: - ? + Args: + svc_ops_enabled (bool): Pool create properties svc_ops_enabled. Defaults to True. - :avocado: tags=all,full_regression - :avocado: tags=hw,large - :avocado: tags=server,metadata - :avocado: tags=ObjectMetadata,test_metadata_fillup """ - self.create_pool() - svc_ops_entry_age = self.pool.get_property("svc_ops_entry_age") - if not self.run_dummy_metadata_workload(duration=svc_ops_entry_age): - self.fail("failed to run dummy metadata workload") + self.log_step("Create pool with properties svc_ops_enabled: {}".format(svc_ops_enabled)) + self.create_pool(svc_ops_enabled=svc_ops_enabled) + # Run dummy_metadata_workload when feature is enabled + if svc_ops_enabled: + self.log.info("svc_ops_enabled enabled, run dummy_metadata_workload") + svc_ops_entry_age = self.pool.get_property("svc_ops_entry_age") + if not self.run_dummy_metadata_workload(duration=svc_ops_entry_age): + self.fail("Failed to run dummy metadata workload") sequential_fail_max = self.params.get("fillup_seq_fail_max", "/run/metadata/*") num_cont_to_destroy = self.params.get("num_cont_to_destroy", "/run/metadata/*") @@ -266,10 +273,7 @@ def test_metadata_fillup(self): # Phase 2: if Phase 1 passed: # clean up several (not all) containers created (prove "critical" destroy # in rdb (and vos) works without cascading no space errors - - # Phase 1 sustained container creates even after no space error - self.log.info( - "Phase 1: sustained container creates: to no space and beyond") + self.log_step("Sustained container creates: to no space and beyond.") self.container = [] sequential_fail_counter = 0 in_failure = False @@ -285,43 +289,44 @@ def test_metadata_fillup(self): sequential_fail_counter += 1 if sequential_fail_counter >= sequential_fail_max: self.log.info( - "Phase 1: container %d - %d/%d sequential no space " + "Container %d - %d/%d sequential no space " "container create errors", sequential_fail_counter, sequential_fail_max, loop) break if status and in_failure: self.log.info( - "Phase 1: container: %d - no space -> available " + "Container: %d - no space -> available " "transition, sequential no space failures: %d", loop, sequential_fail_counter) in_failure = False elif not status and not in_failure: self.log.info( - "Phase 1: container: %d - available -> no space " + "Container: %d - available -> no space " "transition, sequential no space failures: %d", loop, sequential_fail_counter) in_failure = True except TestFail as error: self.log.error(str(error)) - self.fail("Phase 1: fail (unexpected container create error)") + self.fail("fail (unexpected container create error)") + self.log_step("Verify number of container within limit.") if len(self.container) >= self.created_containers_limit: - self.log.error("Phase 1: Created too many containers: %d > %d", len(self.container), + self.log.error("Created too many containers: %d > %d", len(self.container), self.created_containers_limit) - self.fail("Phase 1: Created too many containers") + self.fail("Created too many containers") if len(self.container) < self.created_containers_min: - self.log.info("Phase 1: Created too few containers: %d < %d", len(self.container), + self.log.info("Created too few containers: %d < %d", len(self.container), self.created_containers_min) - self.fail("Phase 1: Created too few containers") + self.fail("Created too few containers") self.log.info( - "Phase 1: passed (created %d / %d containers)", len(self.container), loop) + "Successfully created %d / %d containers)", len(self.container), loop) # Phase 2 clean up some containers (expected to succeed) - self.log.info("Phase 2: Cleaning up %d containers (expected to work)", num_cont_to_destroy) + msg = "Cleaning up {} containers after pool is full.".format(num_cont_to_destroy) + self.log_step(msg) if not self.destroy_num_containers(num_cont_to_destroy): - self.fail("Phase 2: fail (unexpected container destroy error)") - self.log.info("Phase 2: passed") + self.fail("Fail (unexpected container destroy error)") # Do not destroy containers in teardown (destroy pool while metadata rdb is full) for container in self.container: @@ -329,6 +334,46 @@ def test_metadata_fillup(self): self.log.info("Leaving pool metadata rdb full (containers will not be destroyed)") self.log.info("Test passed") + def test_metadata_fillup_svc_ops_disabled(self): + """JIRA ID: DAOS-15628. + + Test Description: + Test to verify number of resources that can be created until metadata is full, + when svc_ops disabled. + Use Cases: + 1. Create pool with properties svc_ops_enabled:0. + 2. Create container until no space. + 3. Verify number of container within limit. + 4. Cleaning up containers after pool is full. + + :avocado: tags=all,full_regression + :avocado: tags=hw,large + :avocado: tags=server,metadata + :avocado: tags=ObjectMetadata,test_metadata_fillup_svc_ops_disabled + """ + self.metadata_fillup(False) + + def test_metadata_fillup_svc_ops_enabled(self): + """JIRA ID: DAOS-15628. + + Test Description: + Test to verify number of resources that can be created until metadata is full, + when svc_ops_enabled. + + Use Cases: + 1. Create pool with properties svc_ops_enabled:1. + and run dummy metadata workload to fill up svc ops. + 2. Create container until no space. + 3. Verify number of container within limit. + 4. Cleaning up containers after pool is full. + + :avocado: tags=all,full_regression + :avocado: tags=hw,large + :avocado: tags=server,metadata + :avocado: tags=ObjectMetadata,test_metadata_fillup_svc_ops_enabled + """ + self.metadata_fillup(True) + def test_metadata_addremove(self): """JIRA ID: DAOS-1512. @@ -344,9 +389,13 @@ def test_metadata_addremove(self): :avocado: tags=ObjectMetadata,test_metadata_addremove """ self.create_pool() - if not self.run_dummy_metadata_workload(): - self.fail("failed to run dummy metadata workload") - + svc_ops_enabled = self.pool.get_property("svc_ops_enabled") + if svc_ops_enabled: + svc_ops_entry_age = self.pool.get_property("svc_ops_entry_age") + if not self.run_dummy_metadata_workload(duration=svc_ops_entry_age): + self.fail("failed to run dummy metadata workload") + else: + self.fail("svc_ops_enabled:0 is not supported for this testcase.") self.container = [] mean_cont_cnt = 0 percent_cont = self.params.get("mean_percent", "/run/metadata/*") diff --git a/src/tests/ftest/server/metadata.yaml b/src/tests/ftest/server/metadata.yaml index 80f37a5f128..e25ddc97170 100644 --- a/src/tests/ftest/server/metadata.yaml +++ b/src/tests/ftest/server/metadata.yaml @@ -2,9 +2,10 @@ hosts: test_servers: 4 test_clients: 1 timeouts: - test_metadata_fillup: 600 - test_metadata_addremove: 1600 - test_metadata_server_restart: 960 + test_metadata_fillup_svc_ops_disabled: 400 + test_metadata_fillup_svc_ops_enabled: 400 + test_metadata_addremove: 1300 + test_metadata_server_restart: 500 server_config: name: daos_server engines_per_host: 2 @@ -47,7 +48,11 @@ pool: svcn: 5 scm_size: 1G control_method: dmg - properties: svc_ops_enabled:1,svc_ops_entry_age:150 + properties: svc_ops_entry_age:60 +# Uncomment the following for manual test with different svc_ops_entry_age value +# properties: svc_ops_entry_age:150 +# properties: svc_ops_entry_age:300 +# properties: svc_ops_entry_age:600 container: control_method: API silent: true @@ -61,7 +66,7 @@ ior: metadata: mean_percent: 1 num_addremove_loops: 4 - created_cont_min: 25000 - created_cont_max: 36000 + created_cont_min: 30000 + created_cont_max: 39000 num_cont_to_destroy: 500 fillup_seq_fail_max: 512 From 1ce781b9bb330fe5e1310f43f7de0ffa84b73d49 Mon Sep 17 00:00:00 2001 From: Kris Jacque Date: Mon, 29 Apr 2024 12:56:24 -0600 Subject: [PATCH 03/10] DAOS-13520 control: Fix UUID filter for dmg check query (#13050) Use requested UUIDs to filter check reports for specific pools. Signed-off-by: Kris Jacque --- src/control/server/mgmt_check.go | 22 ++- src/control/server/mgmt_check_test.go | 208 ++++++++++++++++++++++++++ 2 files changed, 229 insertions(+), 1 deletion(-) diff --git a/src/control/server/mgmt_check.go b/src/control/server/mgmt_check.go index 606f4e2dad4..5312763cef8 100644 --- a/src/control/server/mgmt_check.go +++ b/src/control/server/mgmt_check.go @@ -331,6 +331,13 @@ func (svc *mgmtSvc) SystemCheckQuery(ctx context.Context, req *mgmtpb.CheckQuery req.Shallow = true } + uuids := common.NewStringSet(req.Uuids...) + wantUUID := func(uuid string) bool { + return len(uuids) == 0 || uuids.Has(uuid) + } + + reports := []*chkpb.CheckReport{} + if !req.Shallow { dResp, err := svc.makePoolCheckerCall(ctx, drpc.MethodCheckerQuery, req) if err != nil { @@ -340,16 +347,29 @@ func (svc *mgmtSvc) SystemCheckQuery(ctx context.Context, req *mgmtpb.CheckQuery if err = proto.Unmarshal(dResp.Body, resp); err != nil { return nil, errors.Wrap(err, "unmarshal CheckQuery response") } + + for _, r := range resp.Reports { + if wantUUID(r.PoolUuid) { + reports = append(reports, r) + } + } } + // Collect saved older reports cfList, err := svc.sysdb.GetCheckerFindings(req.GetSeqs()...) if err != nil { return nil, err } for _, f := range cfList { - resp.Reports = append(resp.Reports, &f.CheckReport) + if wantUUID(f.PoolUuid) { + reports = append(reports, &f.CheckReport) + } } + sort.Slice(reports, func(i, j int) bool { + return reports[i].Seq < reports[j].Seq + }) + resp.Reports = reports return resp, nil } diff --git a/src/control/server/mgmt_check_test.go b/src/control/server/mgmt_check_test.go index 0465dec5e19..b59daad699e 100644 --- a/src/control/server/mgmt_check_test.go +++ b/src/control/server/mgmt_check_test.go @@ -641,3 +641,211 @@ func TestServer_mgmtSvc_SystemCheckSetPolicy(t *testing.T) { }) } } + +func TestServer_mgmtSvc_SystemCheckQuery(t *testing.T) { + uuids := testPoolUUIDs(3) + testFindingsMS := []*chkpb.CheckReport{} + testFindingsDrpc := []*chkpb.CheckReport{} + drpcPools := []*mgmtpb.CheckQueryPool{} + for i, uuid := range uuids { + testFindingsMS = append(testFindingsMS, &chkpb.CheckReport{ + Seq: uint64(i + 1), + Class: chkpb.CheckInconsistClass_CIC_CONT_BAD_LABEL, + Action: chkpb.CheckInconsistAction_CIA_TRUST_MS, + PoolUuid: uuid, + }) + + testFindingsDrpc = append(testFindingsDrpc, &chkpb.CheckReport{ + Seq: uint64(i + 1 + len(uuids)), + Class: chkpb.CheckInconsistClass_CIC_POOL_NONEXIST_ON_ENGINE, + Action: chkpb.CheckInconsistAction_CIA_TRUST_MS, + PoolUuid: uuid, + }) + + drpcPools = append(drpcPools, &mgmtpb.CheckQueryPool{ + Uuid: uuid, + Status: chkpb.CheckPoolStatus(i), + Phase: chkpb.CheckScanPhase(i), + }) + } + + drpcResp := &mgmtpb.CheckQueryResp{ + InsStatus: chkpb.CheckInstStatus_CIS_RUNNING, + InsPhase: chkpb.CheckScanPhase_CSP_AGGREGATION, + Pools: drpcPools, + Reports: testFindingsDrpc, + } + + for name, tc := range map[string]struct { + createMS func(*testing.T, logging.Logger) *mgmtSvc + setupDrpc func(*testing.T, *mgmtSvc) + req *mgmtpb.CheckQueryReq + expResp *mgmtpb.CheckQueryResp + expErr error + }{ + "not MS replica": { + createMS: func(t *testing.T, log logging.Logger) *mgmtSvc { + svc := newTestMgmtSvc(t, log) + svc.sysdb = raft.MockDatabaseWithCfg(t, log, &raft.DatabaseConfig{ + SystemName: build.DefaultSystemName, + Replicas: []*net.TCPAddr{{IP: net.IP{111, 222, 1, 1}}}, + }) + return svc + }, + req: &mgmtpb.CheckQueryReq{ + Sys: "daos_server", + }, + expErr: errors.New("replica"), + }, + "checker is not enabled": { + createMS: func(t *testing.T, log logging.Logger) *mgmtSvc { + return testSvcWithMemberState(t, log, system.MemberStateCheckerStarted, uuids) + }, + req: &mgmtpb.CheckQueryReq{ + Sys: "daos_server", + }, + expErr: checker.FaultCheckerNotEnabled, + }, + "bad member states": { + createMS: func(t *testing.T, log logging.Logger) *mgmtSvc { + return testSvcCheckerEnabled(t, log, system.MemberStateJoined, uuids) + }, + req: &mgmtpb.CheckQueryReq{ + Sys: "daos_server", + }, + expErr: errors.New("expected states"), + }, + "dRPC fails": { + setupDrpc: func(t *testing.T, ms *mgmtSvc) { + setupMockDrpcClient(ms, nil, errors.New("mock dRPC")) + }, + req: &mgmtpb.CheckQueryReq{ + Sys: "daos_server", + }, + expErr: errors.New("mock dRPC"), + }, + "bad resp": { + setupDrpc: func(t *testing.T, ms *mgmtSvc) { + setupMockDrpcClientBytes(ms, []byte("garbage"), nil) + }, + req: &mgmtpb.CheckQueryReq{ + Sys: "daos_server", + }, + expErr: errors.New("unmarshal CheckQuery response"), + }, + "success": { + req: &mgmtpb.CheckQueryReq{ + Sys: "daos_server", + }, + expResp: &mgmtpb.CheckQueryResp{ + InsStatus: chkpb.CheckInstStatus_CIS_RUNNING, + InsPhase: chkpb.CheckScanPhase_CSP_AGGREGATION, + Pools: drpcPools, + Reports: append(testFindingsMS, testFindingsDrpc...), + }, + }, + "shallow": { + req: &mgmtpb.CheckQueryReq{ + Sys: "daos_server", + Shallow: true, + }, + setupDrpc: func(t *testing.T, ms *mgmtSvc) { + setupMockDrpcClient(ms, nil, errors.New("shouldn't call dRPC")) + }, + expResp: &mgmtpb.CheckQueryResp{ + Reports: testFindingsMS, + }, + }, + "request sequence numbers": { + req: &mgmtpb.CheckQueryReq{ + Sys: "daos_server", + Seqs: []uint64{2, 3}, + }, + setupDrpc: func(t *testing.T, ms *mgmtSvc) { + setupMockDrpcClient(ms, nil, errors.New("shouldn't call dRPC")) + }, + expResp: &mgmtpb.CheckQueryResp{ + Reports: []*chkpb.CheckReport{ + testFindingsMS[1], + testFindingsMS[2], + }, + }, + }, + "request invalid sequence number": { + req: &mgmtpb.CheckQueryReq{ + Sys: "daos_server", + Seqs: []uint64{2, 3, 25}, + }, + setupDrpc: func(t *testing.T, ms *mgmtSvc) { + setupMockDrpcClient(ms, nil, errors.New("shouldn't call dRPC")) + }, + expErr: errors.New("not found"), + }, + "request all uuids": { + req: &mgmtpb.CheckQueryReq{ + Sys: "daos_server", + Uuids: uuids, + }, + expResp: &mgmtpb.CheckQueryResp{ + InsStatus: chkpb.CheckInstStatus_CIS_RUNNING, + InsPhase: chkpb.CheckScanPhase_CSP_AGGREGATION, + Pools: drpcPools, + Reports: append(testFindingsMS, testFindingsDrpc...), + }, + }, + "filter uuids": { + req: &mgmtpb.CheckQueryReq{ + Sys: "daos_server", + Uuids: []string{uuids[0], uuids[2]}, + }, + expResp: &mgmtpb.CheckQueryResp{ + InsStatus: chkpb.CheckInstStatus_CIS_RUNNING, + InsPhase: chkpb.CheckScanPhase_CSP_AGGREGATION, + Pools: drpcPools, + Reports: []*chkpb.CheckReport{ + testFindingsMS[0], + testFindingsMS[2], + testFindingsDrpc[0], + testFindingsDrpc[2], + }, + }, + }, + } { + t.Run(name, func(t *testing.T) { + log, buf := logging.NewTestLogger(t.Name()) + defer test.ShowBufferOnFailure(t, buf) + + if tc.createMS == nil { + tc.createMS = func(t *testing.T, log logging.Logger) *mgmtSvc { + svc := testSvcCheckerEnabled(t, log, system.MemberStateCheckerStarted, uuids) + for _, f := range testFindingsMS { + if err := svc.sysdb.AddCheckerFinding(&checker.Finding{CheckReport: *f}); err != nil { + t.Fatalf("unable to add finding %+v: %s", f, err.Error()) + } + } + return svc + } + } + svc := tc.createMS(t, log) + + if tc.setupDrpc == nil { + tc.setupDrpc = func(t *testing.T, ms *mgmtSvc) { + setupMockDrpcClient(ms, drpcResp, nil) + } + } + tc.setupDrpc(t, svc) + + resp, err := svc.SystemCheckQuery(test.Context(t), tc.req) + + test.CmpErr(t, tc.expErr, err) + if diff := cmp.Diff(tc.expResp, resp, + cmpopts.IgnoreUnexported( + mgmtpb.CheckQueryResp{}, + mgmtpb.CheckQueryPool{}, + chkpb.CheckReport{}), + ); diff != "" { + t.Fatalf("want-, got+:\n%s", diff) + } + }) + } +} From cbf716c2462cdf125c29e6c02a81ffd83fa754ea Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Mon, 29 Apr 2024 12:27:46 -0700 Subject: [PATCH 04/10] DAOS-623 test: fix avocado run --failfast (#14253) avocado run --failfast on is now avocado run --failfast Signed-off-by: Dalton Bohning --- src/tests/ftest/util/avocado_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/ftest/util/avocado_utils.py b/src/tests/ftest/util/avocado_utils.py index 4e7d7b09e61..4038fc2c168 100644 --- a/src/tests/ftest/util/avocado_utils.py +++ b/src/tests/ftest/util/avocado_utils.py @@ -205,7 +205,7 @@ def get_run_command(self, test, tag_filters, sparse, failfast): if tag_filters: command.extend(tag_filters) if failfast: - command.extend(["--failfast", "on"]) + command.append("--failfast") command.extend(["--mux-yaml", test.yaml_file]) if test.extra_yaml: command.extend(test.extra_yaml) From 867c8eecc4531a43d37951d7cf96218b568009c9 Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Mon, 29 Apr 2024 12:39:51 -0700 Subject: [PATCH 05/10] DAOS-15684 test: add test case for custom server name (#14225) Add a case for changing the default name of "daos_server" Fix bug when setting config name Signed-off-by: Dalton Bohning Co-authored-by: Mohamad Chaarawi --- src/client/dfs/duns.c | 2 +- src/include/daos_pool.h | 2 -- src/tests/ftest/control/daos_system_query.py | 2 +- src/tests/ftest/control/daos_system_query.yaml | 3 ++- src/tests/ftest/util/apricot/apricot/test.py | 2 +- 5 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/client/dfs/duns.c b/src/client/dfs/duns.c index 1a1bdbda623..71ce57bf097 100644 --- a/src/client/dfs/duns.c +++ b/src/client/dfs/duns.c @@ -1377,7 +1377,7 @@ duns_set_sys_name(struct duns_attr_t *attrp, const char *sys) { if (attrp == NULL) return EINVAL; - D_STRNDUP(attrp->da_sys, sys, DAOS_SYS_NAME_MAX_LEN); + D_STRNDUP(attrp->da_sys, sys, DAOS_SYS_NAME_MAX); if (attrp->da_sys == NULL) return ENOMEM; diff --git a/src/include/daos_pool.h b/src/include/daos_pool.h index 5edc5aa3c2d..99173ea6638 100644 --- a/src/include/daos_pool.h +++ b/src/include/daos_pool.h @@ -210,8 +210,6 @@ struct daos_pool_cont_info2 { void *pci_reserved[2]; }; -#define DAOS_SYS_NAME_MAX_LEN 127 - /** * Connect to the DAOS pool identified by \a pool, a label or UUID string. * Upon a successful completion, \a poh returns the pool handle, and \a info diff --git a/src/tests/ftest/control/daos_system_query.py b/src/tests/ftest/control/daos_system_query.py index 8c91f47d8b3..29f279d2c6c 100644 --- a/src/tests/ftest/control/daos_system_query.py +++ b/src/tests/ftest/control/daos_system_query.py @@ -21,7 +21,7 @@ def test_daos_system_query(self): :avocado: tags=all,full_regression :avocado: tags=vm :avocado: tags=control,daos_cmd - :avocado: tags=DaosSystemQuery,daos_system_query,test_daos_system_query + :avocado: tags=DaosSystemQuery,test_daos_system_query """ daos_cmd = self.get_daos_command() diff --git a/src/tests/ftest/control/daos_system_query.yaml b/src/tests/ftest/control/daos_system_query.yaml index 05be8c5b096..3d1b9762231 100644 --- a/src/tests/ftest/control/daos_system_query.yaml +++ b/src/tests/ftest/control/daos_system_query.yaml @@ -1,8 +1,9 @@ hosts: test_servers: 1 + test_clients: 1 timeout: 80 server_config: - name: daos_server + name: other_dserver # Use a non-default name engines_per_host: 1 engines: 0: diff --git a/src/tests/ftest/util/apricot/apricot/test.py b/src/tests/ftest/util/apricot/apricot/test.py index ce15a430493..e2e838a8536 100644 --- a/src/tests/ftest/util/apricot/apricot/test.py +++ b/src/tests/ftest/util/apricot/apricot/test.py @@ -715,7 +715,7 @@ def setUp(self): # The server config name should be obtained from each ServerManager # object, but some tests still use this TestWithServers attribute. - self.server_group = self.params.get("name", "/server_config/", "daos_server") + self.server_group = self.params.get("name", "/run/server_config/*", "daos_server") # The optional namespace for the server configuration test yaml parameters. self.server_config_namespace = self.params.get("server_config_namespace", "/run/setup/*") From c859be0043cd0cca97066ebd9fd716594d7f4c86 Mon Sep 17 00:00:00 2001 From: saurabhtandan Date: Mon, 29 Apr 2024 14:01:43 -0700 Subject: [PATCH 06/10] DAOS-14823 test: Changing scm-size for pool create (#13871) Increasing scm-size for pool create from 128Mib to 256Mib Signed-off-by: Saurabh Tandan --- src/tests/ftest/deployment/io_sys_admin.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/ftest/deployment/io_sys_admin.yaml b/src/tests/ftest/deployment/io_sys_admin.yaml index 1e4f6b4dec0..cbee965200a 100644 --- a/src/tests/ftest/deployment/io_sys_admin.yaml +++ b/src/tests/ftest/deployment/io_sys_admin.yaml @@ -26,7 +26,7 @@ dmg: storage_sub_command: scan pool_1: control_method: dmg - scm_size: 128MiB + scm_size: 256MiB nvme_size: 16GiB pool_2: control_method: dmg From b593cea06800d0ad269615f6be9156fcf8badc1c Mon Sep 17 00:00:00 2001 From: Makito Kano Date: Tue, 30 Apr 2024 06:27:03 +0900 Subject: [PATCH 07/10] DAOS-15759 test: Remove utils/cr_demo (#14265) Catastrophic recovery demo scripts that are in utils/cr_demo introduces security vulnerabilities. The scripts are for the CR demo and no longer necessary, so remove them. Signed-off-by: Makito Kano --- utils/cr_demo/demo_utils.py | 379 ------------------------- utils/cr_demo/run_demo_aurora.py | 434 ----------------------------- utils/cr_demo/show_fixed_aurora.py | 117 -------- 3 files changed, 930 deletions(-) delete mode 100644 utils/cr_demo/demo_utils.py delete mode 100644 utils/cr_demo/run_demo_aurora.py delete mode 100644 utils/cr_demo/show_fixed_aurora.py diff --git a/utils/cr_demo/demo_utils.py b/utils/cr_demo/demo_utils.py deleted file mode 100644 index 1007ce77e24..00000000000 --- a/utils/cr_demo/demo_utils.py +++ /dev/null @@ -1,379 +0,0 @@ -""" - (C) Copyright 2023 Intel Corporation. - - SPDX-License-Identifier: BSD-2-Clause-Patent -""" -import subprocess # nosec - -import yaml - - -# Storage-related methods -def format_storage(host_list): - """Call dmg storage format. - - Args: - host_list (str): List of hosts to format. - """ - format_cmd = ["dmg", "storage", "format", "--host-list=" + host_list] - run_command(command=format_cmd) - - -def storage_query_usage(host_list): - """Call dmg storage query usage. - - Args: - host_list (str): List of hosts to query. - """ - storage_query_cmd = ["dmg", "storage", "query", "usage", "--host-list=" + host_list] - run_command(command=storage_query_cmd) - - -# Pool-related methods -def create_pool(pool_size, pool_label, ranks=None, nsvc=None): - """Call dmg pool create. - - Args: - pool_size (str): Pool size. - pool_label (str): Pool label. - ranks (str): Ranks to create pool. Defaults to None. - nsvc (str): Number of service replicas. Defaults to None. - """ - create_pool_cmd = ["dmg", "pool", "create", pool_label, "--size=" + pool_size] - if ranks: - create_pool_cmd.append("--ranks=" + ranks) - if nsvc: - create_pool_cmd.append("--nsvc=" + nsvc) - run_command(command=create_pool_cmd) - - -def list_pool(verbose=False, json=False, no_query=False): - """Call dmg pool list. - - Args: - verbose (bool): Whether to use --verbose. Defaults to False. - json (bool): Whether to use --json. If used, verbose value would be irrelevant. - Defaults to False. - no_query (bool): Whether to use --no-query. Defaults to False. - - Returns: - str: If --json is used, return stdout. Otherwise None. - - """ - list_pool_cmd = ["dmg", "pool", "list"] - if json: - list_pool_cmd.append("--json") - if verbose: - list_pool_cmd.append("--verbose") - if no_query: - list_pool_cmd.append("--no-query") - command = " ".join(list_pool_cmd) - print(f"Command: {command}") - - if json: - result = subprocess.run( - list_pool_cmd, stdout=subprocess.PIPE, universal_newlines=True, check=False) - return result.stdout - - subprocess.run(list_pool_cmd, check=False) - return None - - -def pool_get_prop(pool_label, properties): - """Call dmg pool get-prop - - Args: - pool_label (str): Pool label. - properties (str): Properties to query. Separate them with comma if there are - multiple properties. - """ - get_prop_cmd = ["dmg", "pool", "get-prop", pool_label, properties] - run_command(command=get_prop_cmd) - - -def pool_query(pool_label): - """Call dmg pool query - - Args: - pool_label (str): Pool label. - """ - pool_query_cmd = ["dmg", "pool", "query", pool_label] - run_command(command=pool_query_cmd) - - -# Container-related methods -def create_container(pool_label, cont_label): - """Call daos container create. - - Args: - pool_label (str): Pool label. - cont_label (str): Container label. - """ - cont_create_cmd = ["daos", "container", "create", pool_label, cont_label] - run_command(command=cont_create_cmd) - - -def cont_get_prop(pool_label, cont_label, properties=None): - """Call daos container get-prop - - Args: - pool_label (str): Pool label. - cont_label (str): Container label. - properties (str): Properties to query. Separate them with comma if there are - multiple properties. Defaults to None. - """ - get_prop_cmd = ["daos", "container", "get-prop", pool_label, cont_label] - if properties: - get_prop_cmd.append("--properties=" + properties) - run_command(command=get_prop_cmd) - - -# Fault-related methods -def inject_fault_mgmt(pool_label, fault_type): - """Call dmg faults mgmt-svc to inject fault. - - Args: - pool_label (str): Pool label. - fault_type (str): Fault type. - """ - inject_fault_cmd = ["dmg", "faults", "mgmt-svc", "pool", pool_label, fault_type] - run_command(command=inject_fault_cmd) - - -def inject_fault_pool(pool_label, fault_type): - """Call dmg faults pool-svc to inject fault. - - Args: - pool_label (str): Pool label. - fault_type (str): Fault type. - """ - inject_fault_cmd = ["dmg", "faults", "pool-svc", pool_label, fault_type] - run_command(command=inject_fault_cmd) - - -def inject_fault_daos(pool_label, cont_label, fault_type): - """Call daos faults to inject fault. - - Args: - pool_label (str): Pool label. - cont_label (str): Container label. - fault_type (str): Fault type. - """ - location = "--location=" + fault_type - inject_fault_cmd = ["daos", "faults", "container", pool_label, cont_label, location] - run_command(command=inject_fault_cmd) - - -# Check-related methods -def check_enable(): - """Call dmg check enable""" - check_enable_cmd = ["dmg", "check", "enable"] - run_command(command=check_enable_cmd) - - -def check_set_policy(reset_defaults=False, all_interactive=False): - """Call dmg check set-policy with --reset-defaults or --all-interactive. - - Args: - reset_defaults (bool): Set all policies to their default action. Defaults to - False. - all_interactive (bool): Set all policies to interactive. Defaults to False. - """ - if reset_defaults != all_interactive: - check_set_policy_cmd = ["dmg", "check", "set-policy"] - if reset_defaults: - check_set_policy_cmd.append("--reset-defaults") - if all_interactive: - check_set_policy_cmd.append("--all-interactive") - run_command(command=check_set_policy_cmd) - - -def check_start(policies=None): - """Call dmg check start - - Args: - policies (str): Repair policies such as POOL_BAD_LABEL:CIA_INTERACT - """ - check_start_cmd = ["dmg", "check", "start"] - if policies: - check_start_cmd.extend(["-p", policies]) - run_command(command=check_start_cmd) - - -def check_query(json=False): - """Call dmg check query - - Args: - json (bool): Whether to use --json. Defaults to False. - - Returns: - str: If --json is used, return stdout. Otherwise None. - - """ - if json: - check_query_cmd = ["dmg", "--json", "check", "query"] - else: - check_query_cmd = ["dmg", "check", "query"] - command = " ".join(check_query_cmd) - print(f"Command: {command}") - - if json: - result = subprocess.run( - check_query_cmd, stdout=subprocess.PIPE, universal_newlines=True, check=False) - return result.stdout - - subprocess.run(check_query_cmd, check=False) - return None - - -def check_disable(): - """Call dmg check disable""" - check_disable_cmd = ["dmg", "check", "disable"] - run_command(command=check_disable_cmd) - - -def repeat_check_query(): - """Allow user to repeatedly call dmg check query.""" - while True: - user_input = input("Hit y to query, n to proceed to next step: ") - if user_input == "y": - check_query() - elif user_input == "n": - break - else: - print("Please enter y or n.") - - -def check_repair(sequence_num, action): - """Call dmg check repair - - Args: - sequence_num (str): Sequence number for repair action. - action (str): Repair action number. - """ - check_repair_cmd = ["dmg", "check", "repair", sequence_num, action] - run_command(command=check_repair_cmd) - - -# System-related methods -def system_stop(force=False): - """Stop servers. - - Args: - force (bool): Whether to use --force. Defaults to None. - """ - system_stop_cmd = ["dmg", "system", "stop"] - if force: - system_stop_cmd.append("--force") - run_command(command=system_stop_cmd) - - -def system_start(): - """Start servers.""" - system_start_cmd = ["dmg", "system", "start"] - run_command(command=system_start_cmd) - - -def system_query(json=False, verbose=False): - """Call dmg system query - - Args: - json (bool): Whether to use --json. Defaults to False. - verbose (bool): Whether to use --verbose. Defaults to False. - - Returns: - str: Command output. - - """ - if json: - system_query_cmd = ["dmg", "--json", "system", "query"] - else: - system_query_cmd = ["dmg", "system", "query"] - if verbose: - system_query_cmd.append("--verbose") - command = " ".join(system_query_cmd) - print(f"Command: {command}") - - if json: - result = subprocess.run( - system_query_cmd, stdout=subprocess.PIPE, universal_newlines=True, - check=False) - return result.stdout - - subprocess.run(system_query_cmd, check=False) - return None - - -# Utility methods -def create_uuid_to_seqnum(): - """Create pool UUID to sequence number mapping. - - Returns: - dict: UUID to sequence number mapping for each pool. Sequence number will be used - during repair. - - """ - uuid_to_seqnum = {} - stdout = check_query(json=True) - generated_yaml = yaml.safe_load(stdout) - for report in generated_yaml["response"]["reports"]: - uuid_to_seqnum[report["pool_uuid"]] = report["seq"] - - return uuid_to_seqnum - - -def create_label_to_uuid(): - """Create label to UUID mapping. - - Returns: - dict: Pool label to UUID. - - """ - label_to_uuid = {} - stdout = list_pool(json=True) - generated_yaml = yaml.safe_load(stdout) - for pool in generated_yaml["response"]["pools"]: - label_to_uuid[pool["label"]] = pool["uuid"] - - return label_to_uuid - - -def get_current_labels(): - """Get current pool labels from MS. - - Returns: - list: Current pool labels. - - """ - pool_labels = [] - stdout = list_pool(json=True) - generated_yaml = yaml.safe_load(stdout) - for pool in generated_yaml["response"]["pools"]: - pool_labels.append(pool["label"]) - - return pool_labels - - -def convert_list_to_str(original_list, separator): - """Convert given list to a string with each item separated by separator. - - Args: - original_list (list): List of items. - separator (str): Separator to separate each item in the new string list. - - Returns: - str: String list. - - """ - return separator.join(map(str, original_list)) - - -def run_command(command): - """Print given command and run. - - Args: - command (list): List of characters that make up the command. - """ - cmd_str = " ".join(command) - print(f"Command: {cmd_str}") - subprocess.run(command, check=False) diff --git a/utils/cr_demo/run_demo_aurora.py b/utils/cr_demo/run_demo_aurora.py deleted file mode 100644 index c7f1962c2be..00000000000 --- a/utils/cr_demo/run_demo_aurora.py +++ /dev/null @@ -1,434 +0,0 @@ -""" - (C) Copyright 2023 Intel Corporation. - - SPDX-License-Identifier: BSD-2-Clause-Patent -""" -import argparse -import re -import subprocess # nosec -import time -from collections import defaultdict - -import yaml -from ClusterShell.NodeSet import NodeSet -from demo_utils import (check_disable, check_enable, check_repair, check_set_policy, check_start, - cont_get_prop, convert_list_to_str, create_container, create_pool, - create_uuid_to_seqnum, format_storage, inject_fault_daos, - inject_fault_mgmt, inject_fault_pool, list_pool, pool_get_prop, - repeat_check_query, storage_query_usage, system_query, system_start, - system_stop) - -# Run this script on Aurora node as user. e.g., -# python3 run_demo_aurora.py -l aurora-daos-[0001-0100] - -TEST_CMD = "sudo date" -test_cmd_list = TEST_CMD.split(" ") -print(f"Check sudo works by calling: {TEST_CMD}") -subprocess.run(test_cmd_list, check=False) - -POOL_SIZE = "5T" -POOL_SIZE_F5 = "3T" -POOL_LABEL = "tank" -CONT_LABEL = "bucket" -# Number of seconds to wait for engines to start for 1 group setup. -FORMAT_SLEEP_SEC = 35 - -print("\nF1: Dangling pool") -print("F2: Lost the majority of pool service replicas") -print("F3: Orphan pool") -print("F4: Inconsistent pool label between MS and PS") -print("F5: Orphan pool shard") -print("F6: Dangling pool map") -print("F7: Orphan container") -print("F8: Inconsistent container label between CS and container property") - -PARSER = argparse.ArgumentParser() -PARSER.add_argument( - "-l", "--hostlist", required=True, help="List of hosts to run the demo") -ARGS = vars(PARSER.parse_args()) - -HOSTLIST = ARGS["hostlist"] - -print(f"\n1. Format storage on {HOSTLIST}.") -format_storage(host_list=HOSTLIST) - -print(f"\nWait for {FORMAT_SLEEP_SEC} sec for format...") -time.sleep(FORMAT_SLEEP_SEC) - -# Call dmg system query to obtain the IP address of necessary ranks. -rank_to_ip = {} -stdout = system_query(json=True) -# Printing system query output helps, but the output will be long if there are many ranks. -# print(f"dmg system query stdout = {stdout}") -generated_yaml = yaml.safe_load(stdout) -RANK_COUNT = 0 -JOINED_COUNT = 0 -for member in generated_yaml["response"]["members"]: - rank_to_ip[member["rank"]] = member["addr"].split(":")[0] - RANK_COUNT += 1 - if member["state"] == "joined": - JOINED_COUNT += 1 -# Print the number of ranks and joined ranks as a reference. -node_set = NodeSet(HOSTLIST) -hostlist = list(node_set) -print(f"\n{len(hostlist)} nodes; {RANK_COUNT} ranks; {JOINED_COUNT} joined") - -# Create rank to mount point map and host to ranks map for F2 and F5. -# 1. scp daos_control.log from all nodes to here, where this script runs. scp the local -# file as well. Add hostname to the end of the file name. The log contains rank and PID. -# Number of nodes used for F2. -NODE_COUNT = 2 -for i in range(NODE_COUNT): - scp_cmd_list = ["scp", f"{hostlist[i]}:/var/tmp/daos_testing/daos_control.log", - f"/var/tmp/daos_testing/daos_control_{hostlist[i]}.log"] - subprocess.run(scp_cmd_list, check=False) - -# 2. Determine the rank to PID mapping from the control logs. In addition, determine the -# host to ranks mapping for creating the pool. We need to know the four ranks for the -# first two nodes. We'll use many nodes in Aurora, but only two nodes for F2. -rank_to_pid = {} -host_to_ranks = defaultdict(list) -SEARCH_STR = r"DAOS I/O Engine.*process (\d+) started on rank (\d+)" -for i in range(NODE_COUNT): - with open( - f"/var/tmp/daos_testing/daos_control_{hostlist[i]}.log", "r", - encoding="utf-8") as file: - for line in file: - match = re.findall(SEARCH_STR, line) - if match: - print(match) - pid = int(match[0][0]) - rank = int(match[0][1]) - rank_to_pid[rank] = pid - host_to_ranks[hostlist[i]].append(rank) - -# 3. Determine the PID to mount point mapping by calling ps ax and search for daos_engine. -# Sample line: -# 84877 ? SLl 102:04 /usr/bin/daos_engine -t 8 -x 1 -g daos_server -d -# /var/run/daos_server -T 2 -n /mnt/daos1/daos_nvme.conf -p 1 -I 1 -r 8192 -H 2 -s -# /mnt/daos1 -pid_to_mount = {} -MOUNT_0 = "/mnt/daos0" -MOUNT_1 = "/mnt/daos1" -for i in range(NODE_COUNT): - clush_ps_ax = ["clush", "-w", hostlist[i], "ps ax"] - result = subprocess.check_output(clush_ps_ax) - result_list = result.decode("utf-8").split("\n") - for result in result_list: - if "daos_engine" in result: - print(result) - if MOUNT_0 in result: - pid = re.split(r"\s+", result)[1] - pid = int(pid) - pid_to_mount[pid] = MOUNT_0 - elif MOUNT_1 in result: - pid = re.split(r"\s+", result)[1] - pid = int(pid) - pid_to_mount[pid] = MOUNT_1 - -# 4. Determine the four ranks in hostlist[0] and hostlist[1] to create F2 pool. -f2_ranks = [] -f2_ranks.extend(host_to_ranks[hostlist[0]]) -f2_ranks.extend(host_to_ranks[hostlist[1]]) -# Ranks in the map are int, so convert them to string and separate them with comma. -F2_RANKS_STR = convert_list_to_str(original_list=f2_ranks, separator=",") - -# 5. Determine the two ranks in hostlist[0] to create F5 pool. -f5_ranks = [] -f5_ranks.extend(host_to_ranks[hostlist[0]]) -# Ranks in the map are int, so convert them to string and separate them with comma. -F5_RANKS_STR = convert_list_to_str(original_list=f5_ranks, separator=",") - -# Add input here to make sure all ranks are joined before starting the script. -input("\n2. Create 8 pools and containers. Hit enter...") -POOL_LABEL_1 = POOL_LABEL + "_F1" -POOL_LABEL_2 = POOL_LABEL + "_F2" -POOL_LABEL_3 = POOL_LABEL + "_F3" -POOL_LABEL_4 = POOL_LABEL + "_F4" -POOL_LABEL_5 = POOL_LABEL + "_F5" -POOL_LABEL_6 = POOL_LABEL + "_F6" -POOL_LABEL_7 = POOL_LABEL + "_F7" -POOL_LABEL_8 = POOL_LABEL + "_F8" -CONT_LABEL_7 = CONT_LABEL + "_F7" -CONT_LABEL_8 = CONT_LABEL + "_F8" - -# F1. CIC_POOL_NONEXIST_ON_ENGINE - dangling pool -create_pool(pool_size=POOL_SIZE, pool_label=POOL_LABEL_1) -# F2. CIC_POOL_LESS_SVC_WITHOUT_QUORUM -create_pool(pool_size=POOL_SIZE, pool_label=POOL_LABEL_2, ranks=F2_RANKS_STR, nsvc="3") -# F3. CIC_POOL_NONEXIST_ON_MS - orphan pool -create_pool(pool_size=POOL_SIZE, pool_label=POOL_LABEL_3) -# F4. CIC_POOL_BAD_LABEL - inconsistent pool label between MS and PS -create_pool(pool_size=POOL_SIZE, pool_label=POOL_LABEL_4) -# F5. CIC_ENGINE_NONEXIST_IN_MAP - orphan pool shard -create_pool(pool_size=POOL_SIZE_F5, pool_label=POOL_LABEL_5, ranks=F5_RANKS_STR) -# F6. CIC_ENGINE_HAS_NO_STORAGE - dangling pool map -create_pool(pool_size=POOL_SIZE, pool_label=POOL_LABEL_6) -# F7. CIC_CONT_NONEXIST_ON_PS - orphan container -create_pool(pool_size=POOL_SIZE, pool_label=POOL_LABEL_7) -create_container(pool_label=POOL_LABEL_7, cont_label=CONT_LABEL_7) -print() -# F8. CIC_CONT_BAD_LABEL -create_pool(pool_size=POOL_SIZE, pool_label=POOL_LABEL_8) -create_container(pool_label=POOL_LABEL_8, cont_label=CONT_LABEL_8) - -print("(Create label to UUID mapping and obtain service replicas for F2.)") -label_to_uuid = {} -f2_service_replicas = [] -stdout = list_pool(json=True) -generated_yaml = yaml.safe_load(stdout) -for pool in generated_yaml["response"]["pools"]: - label_to_uuid[pool["label"]] = pool["uuid"] - # Collect service replicas for F2. - if pool["label"] == POOL_LABEL_2: - f2_service_replicas = pool["svc_reps"] - -print(f"\n(F2 service replicas = {f2_service_replicas})") - -print(f"\n3-F5. Print storage usage to show original usage of {POOL_LABEL_5}. " - f"Pool is created on {hostlist[0]}.") -# F5 pool is created on hostlist[0] ranks, but we'll copy the pool dir from there to one -# of the ranks in hostlist[1], so show both. -f5_host_list = f"{hostlist[0]},{hostlist[1]}" -storage_query_usage(host_list=f5_host_list) - -print("\n4. Inject fault with dmg for F1, F3, F4, F7, F8.") -# F1 -inject_fault_pool(pool_label=POOL_LABEL_1, fault_type="CIC_POOL_NONEXIST_ON_ENGINE") - -# F3 -inject_fault_mgmt(pool_label=POOL_LABEL_3, fault_type="CIC_POOL_NONEXIST_ON_MS") - -# F4 -inject_fault_mgmt(pool_label=POOL_LABEL_4, fault_type="CIC_POOL_BAD_LABEL") - -# F7 -inject_fault_daos( - pool_label=POOL_LABEL_7, cont_label=CONT_LABEL_7, fault_type="DAOS_CHK_CONT_ORPHAN") - -# F8 -inject_fault_daos( - pool_label=POOL_LABEL_8, cont_label=CONT_LABEL_8, - fault_type="DAOS_CHK_CONT_BAD_LABEL") - -input("\n5-1. Stop servers to manipulate for F2, F5, F6, F7. Hit enter...") -system_stop(force=True) - -# F2: Destroy tank_2 rdb-pool on two of the three service replicas. Call them rank a and -# b. Select the first two service replicas. -svc_rep_a = f2_service_replicas[0] -svc_rep_b = f2_service_replicas[1] -rank_a_ip = rank_to_ip[svc_rep_a] -rank_b_ip = rank_to_ip[svc_rep_b] -rank_a_mount = pid_to_mount[rank_to_pid[svc_rep_a]] -rank_b_mount = pid_to_mount[rank_to_pid[svc_rep_b]] -rm_rank_a = f"sudo rm {rank_a_mount}/{label_to_uuid[POOL_LABEL_2]}/rdb-pool" -rm_rank_b = f"sudo rm {rank_b_mount}/{label_to_uuid[POOL_LABEL_2]}/rdb-pool" -clush_rm_rank_a = ["clush", "-w", rank_a_ip, rm_rank_a] -clush_rm_rank_b = ["clush", "-w", rank_b_ip, rm_rank_b] -print("(F2: Destroy tank_F2 rdb-pool on rank a and b.)") -print(f"Command for rank a: {clush_rm_rank_a}\n") -print(f"Command for rank b: {clush_rm_rank_b}\n") -subprocess.run(clush_rm_rank_a, check=False) -subprocess.run(clush_rm_rank_b, check=False) - -# F5: Copy tank_5 pool directory from /mnt/daos1 in hostlist[0] to /mnt/daos0 in -# hostlist[1]. Match owner. (Mount points are arbitrary.) -# In order to copy the pool directory without password, there are two things to set up. -# 1. Since we're running rsync as user, update the mode of the source pool directory as -# below. -# Set 777 for /mnt/daos1 and /mnt/daos1//* i.e., -# chmod 777 /mnt/daos1; chmod -R 777 /mnt/daos1/ -# 2. Update mode of the destination mount point to 777. e.g., -# clush -w "sudo chmod 777 /mnt/daos0" - -# Alternatively, we can generate public-private key pair for root and call scp with sudo. -# Then we don't need to do step 2 (update mode to 777). - -print("(F5: Update mode of the source pool directory.)") -pool_uuid_5 = label_to_uuid[POOL_LABEL_5] -chmod_cmd = f"sudo chmod 777 /mnt/daos1; sudo chmod -R 777 /mnt/daos1/{pool_uuid_5}" -clush_chmod_cmd = ["clush", "-w", hostlist[0], chmod_cmd] -print(f"Command: {clush_chmod_cmd}\n") -subprocess.run(clush_chmod_cmd, check=False) - -print("(F5: Update mode of the destination mount point.)") -CHMOD_CMD = "sudo chmod 777 /mnt/daos0" -clush_chmod_cmd = ["clush", "-w", hostlist[1], CHMOD_CMD] -print(f"Command: {clush_chmod_cmd}\n") -subprocess.run(clush_chmod_cmd, check=False) - -# Since we're sending each file (vos-0 to 15 + rdb-pool) one at a time rather than the -# whole pool directory, we need to create the destination fake pool directory first. -print("(F5: Create a fake pool directory at the destination mount point.)") -mkdir_cmd = f"sudo mkdir /mnt/daos0/{pool_uuid_5}" -clush_mkdir_cmd = ["clush", "-w", hostlist[1], mkdir_cmd] -print(f"Command: {clush_mkdir_cmd}\n") -subprocess.run(clush_mkdir_cmd, check=False) - -print("(F5: Update mode of the fake pool directory at destination.)") -chmod_cmd = f"sudo chmod 777 /mnt/daos0/{pool_uuid_5}" -clush_chmod_cmd = ["clush", "-w", hostlist[1], chmod_cmd] -print(f"Command: {clush_chmod_cmd}\n") -subprocess.run(clush_chmod_cmd, check=False) - -# Run the following xargs + rsync command on hostlist[0] using clush: -# ls /mnt/daos1/ | xargs --max-procs=16 -I% \ -# rsync -avz /mnt/daos1//% hostlist[1]:/mnt/daos0/ - -# 1. The initial ls command lists the content of the pool directory, which contains 16 vos -# files (because there are 16 targets) and rdb-pool file. -# 2. By using xargs, each item of the ls output is passed into rsync and the rsync -# commands are executed in parallel. i.e., each file is sent by separate rsync process in -# parallel. - -# * We use --max-procs=16 to support at most 16 rsync processes to run in parallel. -# * -I% means replace % in the following rsync command by the output of ls. i.e., file -# name. -# * rsync -avz means archive, verbose, and compress. By using compress, we can -# significantly reduce the size of the data and the transfer time. -# * By running rsync in parallel, we can significantly reduce the transfer time. e.g., For -# a 2TB pool with 8 targets per engine, each vos file size is about 7G (rdb-pool is -# smaller). If we run a simple rsync, which runs serially, it takes 1 min 50 sec. -# However, if we run them in parallel, it's reduced to 24 sec. -print(f"(F5: Copy pool directory from {hostlist[0]} to {hostlist[1]}.)") -xargs_rsync_cmd = (f"ls /mnt/daos1/{pool_uuid_5} | xargs --max-procs=16 -I% " - f"rsync -avz /mnt/daos1/{pool_uuid_5}/% " - f"{hostlist[1]}:/mnt/daos0/{pool_uuid_5}") -clush_xargs_rsync_cmd = ["clush", "-w", hostlist[0], xargs_rsync_cmd] -print(f"Command: {clush_xargs_rsync_cmd}\n") -subprocess.run(clush_xargs_rsync_cmd, check=False) - -print("(F5: Set owner for the copied dir and files to daos_server:daos_server.)") -chown_cmd = f"sudo chown -R daos_server:daos_server /mnt/daos0/{pool_uuid_5}" -clush_chown_cmd = ["clush", "-w", hostlist[1], chown_cmd] -print(f"Command: {clush_chown_cmd}\n") -subprocess.run(clush_chown_cmd, check=False) - -print("(F6: Remove vos-0 from one of the nodes.)") -pool_uuid_6 = label_to_uuid[POOL_LABEL_6] -rm_cmd = f"sudo rm -rf /mnt/daos0/{pool_uuid_6}/vos-0" -# Remove vos-0 from /mnt/daos0 in rank 0 node. Note that /mnt/daos0 may not be mapped to -# rank 0. Rank 0 is mapped to either daos0 or daos1. However, we don't care for the -# purpose of testing dangling pool map. -clush_rm_cmd = ["clush", "-w", rank_to_ip[0], rm_cmd] -print(f"Command: {clush_rm_cmd}\n") -subprocess.run(clush_rm_cmd, check=False) - -print("F7: Use ddb to show that the container is left in shards.") -pool_uuid_7 = label_to_uuid[POOL_LABEL_7] -# Run ddb on /mnt/daos0 of rank 0 node. -ddb_cmd = f"sudo ddb /mnt/daos0/{pool_uuid_7}/vos-0 ls" -# ddb with clush causes some authentication error. tank_F7 is created across all ranks, so -# just run ddb locally as a workaround. -ddb_cmd_list = ddb_cmd.split(" ") -print(f"Command: {ddb_cmd}") -subprocess.run(ddb_cmd_list, check=False) - -# (optional) F3: Show pool directory at mount point to verify that the pool exists on -# engine. - -print("\n5-2. Restart servers.") -system_start() - -input("\n6. Show the faults injected for each pool/container for F1, F3, F4, F5, F8. " - "Hit enter...") -print(f"6-F1. Show dangling pool entry for {POOL_LABEL_1}.") -# F3 part 1 -print(f"6-F3. MS doesn't recognize {POOL_LABEL_3}.") -# F4 part 1 -print(f"6-F4-1. Label ({POOL_LABEL_4}) in MS is corrupted with -fault added.") -list_pool(no_query=True) - -# F2: (optional) Try to create a container, which will hang. - -# F4 part 2 -print(f"\n6-F4-2. Label ({POOL_LABEL_4}) in PS is still original.") -POOL_LABEL_4_FAULT = POOL_LABEL_4 + "-fault" -pool_get_prop(pool_label=POOL_LABEL_4_FAULT, properties="label") - -# F5: Call dmg storage query usage to show that the pool is using more space. -print(f"\n6-F5. Print storage usage to show that {POOL_LABEL_5} is using more space. " - f"Pool directory is copied to {hostlist[1]}.") -storage_query_usage(host_list=f5_host_list) - -# F8: Show inconsistency by getting the container label. -print("\n6-F8. Show container label inconsistency.") -cont_get_prop(pool_label=POOL_LABEL_8, cont_label=CONT_LABEL_8) -print(f"Error because container ({CONT_LABEL_8}) doesn't exist on container service.\n") - -print(f"Container ({CONT_LABEL_8}) exists on property.") -cont_get_prop(pool_label=POOL_LABEL_8, cont_label="new-label", properties="label") - -input("\n7. Enable checker. Hit enter...") -system_stop(force=True) -check_enable() - -input("\n8. Start checker with interactive mode. Hit enter...") -check_set_policy(all_interactive=True) -print() -check_start() -print() -repeat_check_query() - -input("\n8-1. Select repair options for F1 to F4. Hit enter...") -print("(Create UUID to sequence number.)") -uuid_to_seqnum = create_uuid_to_seqnum() -SEQ_NUM_1 = str(hex(uuid_to_seqnum[label_to_uuid[POOL_LABEL_1]])) -SEQ_NUM_2 = str(hex(uuid_to_seqnum[label_to_uuid[POOL_LABEL_2]])) -SEQ_NUM_3 = str(hex(uuid_to_seqnum[label_to_uuid[POOL_LABEL_3]])) -SEQ_NUM_4 = str(hex(uuid_to_seqnum[label_to_uuid[POOL_LABEL_4]])) -SEQ_NUM_5 = str(hex(uuid_to_seqnum[label_to_uuid[POOL_LABEL_5]])) -SEQ_NUM_6 = str(hex(uuid_to_seqnum[label_to_uuid[POOL_LABEL_6]])) -SEQ_NUM_7 = str(hex(uuid_to_seqnum[label_to_uuid[POOL_LABEL_7]])) -SEQ_NUM_8 = str(hex(uuid_to_seqnum[label_to_uuid[POOL_LABEL_8]])) - -# F1: 1: Discard the dangling pool entry from MS [suggested]. -print(f"\n{POOL_LABEL_1} - 1: Discard the dangling pool entry from MS [suggested].") -check_repair(sequence_num=SEQ_NUM_1, action="1") - -# F2: 2: Start pool service under DICTATE mode from rank 1 [suggested]. -print(f"\n{POOL_LABEL_2} - 2: Start pool service under DICTATE mode from rank 1 " - f"[suggested].") -check_repair(sequence_num=SEQ_NUM_2, action="2") - -# F3:2: Re-add the orphan pool back to MS [suggested]. -print(f"\n{POOL_LABEL_3} - 2: Re-add the orphan pool back to MS [suggested].") -check_repair(sequence_num=SEQ_NUM_3, action="2") - -# F4: 2: Trust PS pool label. -print(f"\n{POOL_LABEL_4} - 2: Trust PS pool label.") -check_repair(sequence_num=SEQ_NUM_4, action="2") - -print() -# Call dmg check query until n is entered. -repeat_check_query() - -input("\n8-2. Select repair options for F5 to F8. Hit enter...") -# F5: 1: Discard the orphan pool shard to release space [suggested]. -print(f"\n{POOL_LABEL_5} - 1: Discard the orphan pool shard to release space " - f"[suggested].") -check_repair(sequence_num=SEQ_NUM_5, action="1") - -# F6: 1: Change pool map for the dangling map entry [suggested]. -print(f"\n{POOL_LABEL_6} - 1: Change pool map for the dangling map entry as down " - f"[suggested].") -check_repair(sequence_num=SEQ_NUM_6, action="1") - -# F7: 1: Destroy the orphan container to release space [suggested]. -print(f"\n{POOL_LABEL_7} - 1: Destroy the orphan container to release space [suggested].") -check_repair(sequence_num=SEQ_NUM_7, action="1") - -# F8: 2: Trust the container label in container property. -print(f"\n{POOL_LABEL_8} - 2: Trust the container label in container property.") -check_repair(sequence_num=SEQ_NUM_8, action="2") - -print() -# Call dmg check query until n is entered. -repeat_check_query() - -print("\n9. Disable the checker.") -check_disable() -system_start() - -print("\nRun show_fixed_aurora.py to show the issues fixed...") diff --git a/utils/cr_demo/show_fixed_aurora.py b/utils/cr_demo/show_fixed_aurora.py deleted file mode 100644 index 6271023ac13..00000000000 --- a/utils/cr_demo/show_fixed_aurora.py +++ /dev/null @@ -1,117 +0,0 @@ -""" - (C) Copyright 2023 Intel Corporation. - - SPDX-License-Identifier: BSD-2-Clause-Patent -""" -import argparse -import subprocess # nosec - -import yaml -from ClusterShell.NodeSet import NodeSet -from demo_utils import (cont_get_prop, create_container, list_pool, pool_get_prop, pool_query, - storage_query_usage, system_query, system_stop) - -# Run this script on Aurora node as user after running run_demo_aurora.py. E.g., -# python3 show_fixed_aurora.py -l aurora-daos-[0001-0100] - -TEST_CMD = "sudo date" -test_cmd_list = TEST_CMD.split(" ") -print(f"Check sudo works by calling: {TEST_CMD}") -subprocess.run(test_cmd_list, check=False) - -POOL_LABEL = "tank" -CONT_LABEL = "bucket" -TARGET_PER_RANK = 16 - -PARSER = argparse.ArgumentParser() -PARSER.add_argument( - "-l", "--hostlist", required=True, help="List of hosts used for run_demo.py") -ARGS = vars(PARSER.parse_args()) -HOSTLIST = ARGS["hostlist"] -node_set = NodeSet(HOSTLIST) -hostlist = list(node_set) - -# Call dmg system query to obtain the IP address of necessary ranks. -rank_to_ip = {} -stdout = system_query(json=True) -# Printing system query output helps, but the output will be long if there are many ranks. -# print(f"dmg system query stdout = {stdout}") -generated_yaml = yaml.safe_load(stdout) -RANK_COUNT = 0 -JOINED_COUNT = 0 -for member in generated_yaml["response"]["members"]: - rank_to_ip[member["rank"]] = member["addr"].split(":")[0] - RANK_COUNT += 1 - if member["state"] == "joined": - JOINED_COUNT += 1 -# Print the number of ranks and joined ranks as a reference. -print(f"\n{RANK_COUNT} ranks; {JOINED_COUNT} joined") -TOTAL_TARGET = RANK_COUNT * TARGET_PER_RANK - -POOL_LABEL_1 = POOL_LABEL + "_F1" -POOL_LABEL_2 = POOL_LABEL + "_F2" -POOL_LABEL_3 = POOL_LABEL + "_F3" -POOL_LABEL_4 = POOL_LABEL + "_F4" -POOL_LABEL_5 = POOL_LABEL + "_F5" -POOL_LABEL_6 = POOL_LABEL + "_F6" -POOL_LABEL_7 = POOL_LABEL + "_F7" -POOL_LABEL_8 = POOL_LABEL + "_F8" -CONT_LABEL_8 = CONT_LABEL + "_F8" - -print("(Create label to UUID mapping.)") -label_to_uuid = {} -stdout = list_pool(json=True) -generated_yaml = yaml.safe_load(stdout) -for pool in generated_yaml["response"]["pools"]: - label_to_uuid[pool["label"]] = pool["uuid"] - -input("\n10. Show the issues fixed. Hit enter...") -print(f"10-F1. Dangling pool ({POOL_LABEL_1}) was removed.") -print(f"10-F3. Orphan pool ({POOL_LABEL_3}) was reconstructed.") -list_pool() - -print(f"10-F2. Create a container on {POOL_LABEL_2}. Pool can be started now, so it " - f"should succeed.") -CONT_LABEL_2 = CONT_LABEL + "_2" -create_container(pool_label=POOL_LABEL_2, cont_label=CONT_LABEL_2) -# (optional) Show that rdb-pool file in rank 0 and 2 are recovered. - -print(f"\n10-F4. Label inconsistency for {POOL_LABEL_4} was resolved. " - f"See pool list above.") -pool_get_prop(pool_label=POOL_LABEL_4, properties="label") - -# F5: Call dmg storage query usage to verify the storage was reclaimed. - Not working due -# to a bug. Instead, show that pool directory on dst node (rank 3 for 4-VM) was removed. -print(f"\n10-F5-1. Print storage usage to show that storage used by {POOL_LABEL_5} is " - f"reclaimed after pool directory is removed from {hostlist[1]}.") -f5_host_list = f"{hostlist[0]},{hostlist[1]}" -storage_query_usage(host_list=f5_host_list) - -print(f"\n10-F5-2. {label_to_uuid[POOL_LABEL_5]} pool directory on {hostlist[1]} " - f"at /mnt/daos0 was removed.") -LS_CMD = "ls /mnt/daos0" -clush_ls_cmd = ["clush", "-w", hostlist[1], LS_CMD] -print(f"Command: {clush_ls_cmd}\n") -subprocess.run(clush_ls_cmd, check=False) - -EXPECTED_TARGET = TOTAL_TARGET - 1 -print( - f"\n10-F6. {POOL_LABEL_6} has one less target ({TOTAL_TARGET} -> {EXPECTED_TARGET}).") -pool_query(pool_label=POOL_LABEL_6) -# (optional) Reintegrate rank 1 on pool 6. Wait for rebuild to finish. Then verify the -# target count. - -# F8: Verify that the inconsistency is fixed. The label is back to the original. -print(f"\n10-F8. Container label inconsistency for {CONT_LABEL_8} was fixed.") -cont_get_prop(pool_label=POOL_LABEL_8, cont_label=CONT_LABEL_8, properties="label") - -# F7: Stop server. Call the same ddb command to verify that the container is removed from -# shard. -print(f"\n10-F7. Use ddb to verify that the container in {POOL_LABEL_7} is removed " - f"from shards.") -system_stop(force=True) -pool_uuid_7 = label_to_uuid[POOL_LABEL_7] -ddb_cmd = f"sudo ddb /mnt/daos0/{pool_uuid_7}/vos-0 ls" -ddb_cmd_list = ddb_cmd.split(" ") -print(f"Command: {ddb_cmd}") -subprocess.run(ddb_cmd_list, check=False) From 40fab0cf42f3dafff33f25f5d7ebada9d7f0c260 Mon Sep 17 00:00:00 2001 From: Dalton Bohning Date: Mon, 29 Apr 2024 14:30:59 -0700 Subject: [PATCH 08/10] DAOS-15659 test: fix local ftest prefix (#14173) PR #13565 accidentally broke how ftest determines the prefix from .build_vars.json because it is no longer installed. Eliminate the need for .build_vars.json in ftest entirely by using shutil.which("daos") and support setting DAOS_TEST_PREFIX Signed-off-by: Dalton Bohning --- src/tests/ftest/util/apricot/apricot/test.py | 31 +--- src/tests/ftest/util/environment_utils.py | 146 ++++++++++--------- 2 files changed, 85 insertions(+), 92 deletions(-) diff --git a/src/tests/ftest/util/apricot/apricot/test.py b/src/tests/ftest/util/apricot/apricot/test.py index e2e838a8536..aa9949499c3 100644 --- a/src/tests/ftest/util/apricot/apricot/test.py +++ b/src/tests/ftest/util/apricot/apricot/test.py @@ -5,7 +5,6 @@ """ # pylint: disable=too-many-lines -import json import os import random import re @@ -23,6 +22,7 @@ from daos_utils import DaosCommand from distro_utils import detect from dmg_utils import get_dmg_command +from environment_utils import TestEnvironment from exception_utils import CommandFailure from fault_config_utils import FaultInjection from general_utils import (DaosTestError, dict_to_str, dump_engines_stacks, @@ -125,9 +125,7 @@ def __init__(self, *args, **kwargs): # use 'add_cancel_ticket()' to add to this set. self._teardown_cancel = set() self._teardown_errors = [] - self.basepath = None self.prefix = None - self.ofi_prefix = None self.cancel_file = os.path.join(os.sep, "scratch", "CI-skip-list-master") # List of methods to call during tearDown to cleanup after the steps @@ -150,22 +148,9 @@ def __init__(self, *args, **kwargs): def setUp(self): """Set up each test case.""" - # get paths from the build_vars generated by build - try: - with open('../../.build_vars.json', encoding="utf-8") as build_vars: - build_paths = json.load(build_vars) - self.basepath = os.path.normpath(os.path.join(build_paths['PREFIX'], - '..') + os.path.sep) - self.prefix = build_paths['PREFIX'] - try: - self.ofi_prefix = build_paths['OFI_PREFIX'] - except KeyError: - self.ofi_prefix = os.sep + "usr" - except FileNotFoundError: - self.prefix = "/usr" - self.basepath = "/" - self.ofi_prefix = os.sep + "usr" - self.log.info("No build vars file, assuming RPM install") + test_env = TestEnvironment() + self.prefix = test_env.daos_prefix + self.log.info("Using daos install prefix = %s", self.prefix) self.cancel_from_list() self.check_variant_skip() self.log.info("*** SETUP running on %s ***", str(detect())) @@ -536,13 +521,11 @@ def setUp(self): # set default shared dir for daos tests in case DAOS_TEST_SHARED_DIR # is not set, for RPM env and non-RPM env. - if self.prefix != "/usr": + if os.path.normpath(self.prefix) != os.path.join(os.sep, 'usr'): self.tmp = os.path.join(self.prefix, 'tmp') else: - self.tmp = os.getenv( - 'DAOS_TEST_SHARED_DIR', os.path.expanduser('~/daos_test')) - if not os.path.exists(self.tmp): - os.makedirs(self.tmp) + self.tmp = os.getenv('DAOS_TEST_SHARED_DIR', os.path.expanduser('~/daos_test')) + os.makedirs(self.tmp, exist_ok=True) self.log.debug("Shared test directory: %s", self.tmp) self.log.debug("Common test directory: %s", self.test_dir) diff --git a/src/tests/ftest/util/environment_utils.py b/src/tests/ftest/util/environment_utils.py index a6653418544..d8a6c0d6def 100644 --- a/src/tests/ftest/util/environment_utils.py +++ b/src/tests/ftest/util/environment_utils.py @@ -3,8 +3,8 @@ SPDX-License-Identifier: BSD-2-Clause-Patent """ -import json import os +import shutil import site from ClusterShell.NodeSet import NodeSet @@ -18,61 +18,25 @@ class TestEnvironmentException(Exception): """Exception for launch.py execution.""" -def _get_build_environment(logger, build_vars_file): - """Obtain DAOS build environment variables from the .build_vars.json file. - - Args: - logger (Logger): logger for the messages produced by this method - build_vars_file (str): the full path to the DAOS build_vars.json file - - Raises: - TestEnvironmentException: if there is an error obtaining the DAOS build environment - - Returns: - str: The prefix of the DAOS install. - None: If the file is not present. - """ - logger.debug("Obtaining DAOS build environment from %s", build_vars_file) - try: - with open(build_vars_file, encoding="utf-8") as vars_file: - return json.load(vars_file)["PREFIX"] - - except FileNotFoundError: - return None - - except Exception as error: # pylint: disable=broad-except - raise TestEnvironmentException("Error obtaining build environment:", str(error)) from error - - -def _update_path(logger, build_vars_file): +def _update_path(daos_prefix): """Update the PATH environment variable for functional testing. Args: - logger (Logger): logger for the messages produced by this method - build_vars_file (str): the full path to the DAOS build_vars.json file + daos_prefix (str): daos install prefix - Raises: - TestEnvironmentException: if there is an error obtaining the DAOS build environment """ - base_dir = _get_build_environment(logger, build_vars_file) - - path = os.environ.get("PATH") - - parts = path.split(":") - - # If a custom prefix is used for the daos installation then prepend that to the path so that - # any binaries provided are picked up from there, else do not modify the path. - if base_dir: - bin_dir = os.path.join(base_dir, "bin") - sbin_dir = os.path.join(base_dir, "sbin") + parts = os.environ.get("PATH").split(":") + # Insert bin and sbin at the beginning of PATH if prefix is not /usr + if daos_prefix != os.path.join(os.sep, "usr"): + bin_dir = os.path.join(daos_prefix, "bin") + sbin_dir = os.path.join(daos_prefix, "sbin") parts.insert(0, bin_dir) parts.insert(0, sbin_dir) # /usr/sbin is not setup on non-root user for CI nodes. # SCM formatting tool mkfs.ext4 is located under /usr/sbin directory. usr_sbin = os.path.join(os.sep, "usr", "sbin") - if usr_sbin not in parts: parts.append(usr_sbin) @@ -142,6 +106,7 @@ class TestEnvironment(): 'insecure_mode': 'DAOS_TEST_INSECURE_MODE', 'bullseye_src': 'DAOS_TEST_BULLSEYE_SRC', 'bullseye_file': 'COVFILE', + 'daos_prefix': 'DAOS_TEST_PREFIX' } def __init__(self): @@ -176,23 +141,25 @@ def set_defaults(self, logger, servers=None, clients=None, provider=None, insecu self.insecure_mode = insecure_mode if self.log_dir is None: - self.log_dir = self.default_log_dir() + self.log_dir = self._default_log_dir() if self.shared_dir is None: - self.shared_dir = self.default_shared_dir() + self.shared_dir = self._default_shared_dir() if self.app_dir is None: - self.app_dir = self.default_app_dir() + self.app_dir = self._default_app_dir() if self.user_dir is None: - self.user_dir = self.default_user_dir() + self.user_dir = self._default_user_dir() if self.interface is None: - self.interface = self.default_interface(logger, all_hosts) + self.interface = self._default_interface(logger, all_hosts) if self.provider is None: - self.provider = self.default_provider(logger, servers) + self.provider = self._default_provider(logger, servers) if self.insecure_mode is None: - self.insecure_mode = self.default_insecure_mode() + self.insecure_mode = self._default_insecure_mode() if self.bullseye_src is None: - self.bullseye_src = self.default_bullseye_src() + self.bullseye_src = self._default_bullseye_src() if self.bullseye_file is None: - self.bullseye_file = self.default_bullseye_file() + self.bullseye_file = self._default_bullseye_file() + if self.daos_prefix is None: + self.daos_prefix = self._default_daos_prefix(logger) def __set_value(self, key, value): """Set the test environment variable. @@ -224,7 +191,7 @@ def app_dir(self, value): """ self.__set_value('app_dir', value) - def default_app_dir(self): + def _default_app_dir(self): """Get the default application directory path. Returns: @@ -269,7 +236,7 @@ def log_dir(self, value): self.__set_value('log_dir', value) @staticmethod - def default_log_dir(): + def _default_log_dir(): """Get the default local log directory path. Returns: @@ -296,7 +263,7 @@ def shared_dir(self, value): self.__set_value('shared_dir', value) @staticmethod - def default_shared_dir(): + def _default_shared_dir(): """Get the default shared log directory path. Returns: @@ -322,7 +289,7 @@ def user_dir(self, value): """ self.__set_value('user_dir', value) - def default_user_dir(self): + def _default_user_dir(self): """Get the default user directory path. Returns: @@ -348,7 +315,7 @@ def interface(self, value): """ self.__set_value('interface', value) - def default_interface(self, logger, hosts): + def _default_interface(self, logger, hosts): """Get the default interface. Args: @@ -394,7 +361,7 @@ def provider(self, value): else: self.__set_value('provider', value) - def default_provider(self, logger, hosts): + def _default_provider(self, logger, hosts): """Get the default provider. Args: @@ -463,7 +430,7 @@ def insecure_mode(self, value): self.__set_value('insecure_mode', value) @staticmethod - def default_insecure_mode(): + def _default_insecure_mode(): """Get the default insecure mode. Returns: @@ -490,7 +457,7 @@ def bullseye_src(self, value): self.__set_value('bullseye_src', value) @staticmethod - def default_bullseye_src(): + def _default_bullseye_src(): """Get the default bullseye source file. Returns: @@ -517,7 +484,7 @@ def bullseye_file(self, value): self.__set_value('bullseye_file', value) @staticmethod - def default_bullseye_file(): + def _default_bullseye_file(): """Get the default bullseye file. Returns: @@ -525,6 +492,50 @@ def default_bullseye_file(): """ return os.path.join(os.sep, "tmp", "test.cov") + @property + def daos_prefix(self): + """Get the daos_prefix. + + Returns: + str: the daos_prefix + """ + return os.environ.get(self.__ENV_VAR_MAP['daos_prefix']) + + @daos_prefix.setter + def daos_prefix(self, value): + """Set the daos_prefix. + + Args: + value (str, bool): the daos_prefix + """ + self.__set_value('daos_prefix', value) + + def _default_daos_prefix(self, logger): + """Get the default daos_prefix. + + Args: + logger (Logger): logger for the messages produced by this method + + Raises: + TestEnvironmentException: if there is an error obtaining the default daos_prefix + + Returns: + str: the default daos_prefix + """ + if logger is None: + return None + + logger.debug( + "Detecting daos_prefix for %s - %s not set", + self.daos_prefix, self.__ENV_VAR_MAP['daos_prefix']) + + daos_bin_path = shutil.which('daos') + if not daos_bin_path: + raise TestEnvironmentException("Failed to find installed daos!") + + # E.g. /usr/bin/daos -> /usr + return os.path.dirname(os.path.dirname(daos_bin_path)) + def set_test_environment(logger, test_env=None, servers=None, clients=None, provider=None, insecure_mode=False, details=None): @@ -551,15 +562,14 @@ def set_test_environment(logger, test_env=None, servers=None, clients=None, prov logger.debug("Setting up the test environment variables") if test_env: - # Update the PATH environment variable - build_vars_file = os.path.join( - os.path.dirname(os.path.realpath(__file__)), "..", "..", "..", ".build_vars.json") - _update_path(logger, build_vars_file) - - # Get the default fabric interface and provider + # Get the default fabric interface, provider, and daos prefix test_env.set_defaults(logger, servers, clients, provider, insecure_mode) logger.info("Testing with interface: %s", test_env.interface) logger.info("Testing with provider: %s", test_env.provider) + logger.info("Testing with daos_prefix: %s", test_env.daos_prefix) + + # Update the PATH environment variable + _update_path(test_env.daos_prefix) if details: details["interface"] = test_env.interface From 6a2c3e4e47b2727d2f63fff8dd137243534dbdb0 Mon Sep 17 00:00:00 2001 From: Nasf-Fan Date: Tue, 30 Apr 2024 14:11:23 +0800 Subject: [PATCH 09/10] DAOS-15713 chk: fix kinds of coverity issues (#14242) CID: 2555541 2555529 2555524 2555517 2555545 2555527 Signed-off-by: Fan Yong --- src/chk/chk_common.c | 2 ++ src/chk/chk_engine.c | 2 +- src/chk/chk_leader.c | 3 +-- src/chk/chk_upcall.c | 31 ++++++++++++------------------- src/include/daos_srv/daos_chk.h | 2 +- src/mgmt/srv_drpc.c | 12 ++++++------ src/tests/suite/daos_cr.c | 3 +++ 7 files changed, 26 insertions(+), 29 deletions(-) diff --git a/src/chk/chk_common.c b/src/chk/chk_common.c index fda4efc9973..c5b0d044c7a 100644 --- a/src/chk/chk_common.c +++ b/src/chk/chk_common.c @@ -1238,6 +1238,8 @@ chk_ins_init(struct chk_instance **p_ins) out_init: if (rc == 0) *p_ins = ins; + else + D_FREE(ins); return rc; } diff --git a/src/chk/chk_engine.c b/src/chk/chk_engine.c index bdb142ea8bc..f9e9fad2a31 100644 --- a/src/chk/chk_engine.c +++ b/src/chk/chk_engine.c @@ -2933,7 +2933,7 @@ chk_engine_pool_start(uint64_t gen, uuid_t uuid, uint32_t phase, uint32_t flags) D_GOTO(put, rc = (rc == -DER_NONEXIST ? 1 : rc)); if (cbk->cb_phase < phase) { - cbk->cb_phase = cbk->cb_phase; + cbk->cb_phase = phase; /* QUEST: How to estimate the left time? */ cbk->cb_time.ct_left_time = CHK__CHECK_SCAN_PHASE__CSP_DONE - cbk->cb_phase; rc = chk_bk_update_pool(cbk, uuid_str); diff --git a/src/chk/chk_leader.c b/src/chk/chk_leader.c index b0f744b4bbb..f29fbe70f76 100644 --- a/src/chk/chk_leader.c +++ b/src/chk/chk_leader.c @@ -3385,8 +3385,7 @@ chk_leader_prop(chk_prop_cb_t prop_cb, void *buf) { struct chk_property *prop = &chk_leader->ci_prop; - return prop_cb(buf, (struct chk_policy *)prop->cp_policies, - CHK_POLICY_MAX - 1, prop->cp_flags); + return prop_cb(buf, prop->cp_policies, CHK_POLICY_MAX - 1, prop->cp_flags); } static int diff --git a/src/chk/chk_upcall.c b/src/chk/chk_upcall.c index 893b7d1ec32..bbc05db5f75 100644 --- a/src/chk/chk_upcall.c +++ b/src/chk/chk_upcall.c @@ -94,8 +94,6 @@ chk_report_upcall(uint64_t gen, uint64_t seq, uint32_t cla, uint32_t act, int re D_ASPRINTF(report.pool_uuid, DF_UUIDF, DP_UUID(*pool)); if (report.pool_uuid == NULL) D_GOTO(out, rc = -DER_NOMEM); - } else { - report.pool_uuid = NULL; } report.pool_label = pool_label; @@ -104,8 +102,6 @@ chk_report_upcall(uint64_t gen, uint64_t seq, uint32_t cla, uint32_t act, int re D_ASPRINTF(report.cont_uuid, DF_UUIDF, DP_UUID(*cont)); if (report.cont_uuid == NULL) D_GOTO(out, rc = -DER_NOMEM); - } else { - report.cont_uuid = NULL; } report.cont_label = cont_label; @@ -114,24 +110,18 @@ chk_report_upcall(uint64_t gen, uint64_t seq, uint32_t cla, uint32_t act, int re D_ASPRINTF(report.objid, DF_UOID, DP_UOID(*obj)); if (report.objid == NULL) D_GOTO(out, rc = -DER_NOMEM); - } else { - report.objid = NULL; } if (!daos_iov_empty(dkey)) { D_ASPRINTF(report.dkey, DF_KEY, DP_KEY(dkey)); if (report.dkey == NULL) D_GOTO(out, rc = -DER_NOMEM); - } else { - report.dkey = NULL; } if (!daos_iov_empty(akey)) { D_ASPRINTF(report.akey, DF_KEY, DP_KEY(akey)); if (report.akey == NULL) D_GOTO(out, rc = -DER_NOMEM); - } else { - report.akey = NULL; } D_ASPRINTF(report.timestamp, "%s", ctime(&tm)); @@ -150,20 +140,23 @@ chk_report_upcall(uint64_t gen, uint64_t seq, uint32_t cla, uint32_t act, int re goto out; report.n_act_details = rc; - } else { - report.n_act_details = 0; - report.act_details = NULL; } rc = ds_chk_report_upcall(&report); out: - D_FREE(report.pool_uuid); - D_FREE(report.cont_uuid); - D_FREE(report.objid); - D_FREE(report.dkey); - D_FREE(report.akey); - D_FREE(report.timestamp); + if (report.pool_uuid != protobuf_c_empty_string) + D_FREE(report.pool_uuid); + if (report.cont_uuid != protobuf_c_empty_string) + D_FREE(report.cont_uuid); + if (report.objid != protobuf_c_empty_string) + D_FREE(report.objid); + if (report.dkey != protobuf_c_empty_string) + D_FREE(report.dkey); + if (report.akey != protobuf_c_empty_string) + D_FREE(report.akey); + if (report.timestamp != protobuf_c_empty_string) + D_FREE(report.timestamp); chk_sg_free(report.act_details, report.n_act_details); D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO, diff --git a/src/include/daos_srv/daos_chk.h b/src/include/daos_srv/daos_chk.h index 93fc2a75c9c..756c5ec0cd8 100644 --- a/src/include/daos_srv/daos_chk.h +++ b/src/include/daos_srv/daos_chk.h @@ -71,7 +71,7 @@ typedef int (*chk_query_head_cb_t)(uint32_t ins_status, uint32_t ins_phase, typedef int (*chk_query_pool_cb_t)(struct chk_query_pool_shard *shard, uint32_t idx, void *buf); -typedef int (*chk_prop_cb_t)(void *buf, struct chk_policy *policies, int cnt, uint32_t flags); +typedef int (*chk_prop_cb_t)(void *buf, uint32_t policies[], int cnt, uint32_t flags); int chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr, struct chk_policy *policies, int pool_nr, uuid_t pools[], diff --git a/src/mgmt/srv_drpc.c b/src/mgmt/srv_drpc.c index a840aec93f2..013ad396699 100644 --- a/src/mgmt/srv_drpc.c +++ b/src/mgmt/srv_drpc.c @@ -2850,13 +2850,16 @@ ds_chk_prob_free(Mgmt__CheckInconsistPolicy **policies, uint32_t policy_nr) #define ALL_CHK_POLICY CHK__CHECK_INCONSIST_CLASS__CIC_UNKNOWN static int -ds_chk_prop_cb(void *buf, struct chk_policy *policies, int cnt, uint32_t flags) +ds_chk_prop_cb(void *buf, uint32_t policies[], int cnt, uint32_t flags) { Mgmt__CheckInconsistPolicy **ply = NULL; Mgmt__CheckPropResp *resp = buf; int rc = 0; int i = 0; + D_ASSERTF(cnt <= ALL_CHK_POLICY, "Too many inconsistency policies %u/%u\n", + cnt, ALL_CHK_POLICY); + D_ALLOC_ARRAY(ply, cnt); if (ply == NULL) return -DER_NOMEM; @@ -2867,11 +2870,8 @@ ds_chk_prop_cb(void *buf, struct chk_policy *policies, int cnt, uint32_t flags) D_GOTO(out, rc = -DER_NOMEM); mgmt__check_inconsist_policy__init(ply[i]); - if (policies[i].cp_class == 0 && cnt == ALL_CHK_POLICY) - ply[i]->inconsist_cas = i; - else - ply[i]->inconsist_cas = policies[i].cp_class; - ply[i]->inconsist_act = policies[i].cp_action; + ply[i]->inconsist_cas = i; + ply[i]->inconsist_act = policies[i]; } diff --git a/src/tests/suite/daos_cr.c b/src/tests/suite/daos_cr.c index 5339add7e23..1e1b0c29a13 100644 --- a/src/tests/suite/daos_cr.c +++ b/src/tests/suite/daos_cr.c @@ -835,13 +835,16 @@ cr_cont_create(void **state, struct test_pool *pool, struct test_cont *cont, int char uuid_str[DAOS_UUID_STR_SIZE]; test_arg_t *arg = *state; daos_prop_t *prop = NULL; + mode_t saved; daos_handle_t coh; int fd; int rc; int rc1; + saved = umask(0); strncpy(cont->label, "/tmp/cr_cont_XXXXXX", sizeof(cont->label) - 1); fd = mkstemp(cont->label); + umask(saved); if (fd < 0) { print_message("CR: cont generate label failed: %s\n", strerror(errno)); return d_errno2der(errno); From e7aa7a882f8a2abb7df6c93fbdbd672b215a0b24 Mon Sep 17 00:00:00 2001 From: Liu Xuezhao Date: Tue, 30 Apr 2024 15:18:05 +0800 Subject: [PATCH 10/10] DAOS-15661 object: set correct map version for layout create (#14222) In obj_layout_create, it get pl_map by pl_map_find() without holding dp_map_lock, and then set "omd_ver = dc_pool_get_version(pool)". The map version of the pl_map possibly not same as dc_pool_get_version() if another thread refreshed the dc_pool's pool map. Signed-off-by: Xuezhao Liu --- src/object/cli_obj.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index a93f84bc458..c209bfdeeed 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -304,7 +304,7 @@ obj_layout_create(struct dc_object *obj, unsigned int mode, bool refresh) D_GOTO(out, rc = -DER_INVAL); } - obj->cob_md.omd_ver = dc_pool_get_version(pool); + obj->cob_md.omd_ver = pool_map_get_version(map->pl_poolmap); obj->cob_md.omd_pdom_lvl = dc_obj_get_pdom(obj); obj->cob_md.omd_fdom_lvl = dc_obj_get_redun_lvl(obj); obj->cob_md.omd_pda = dc_obj_get_pda(obj);