From ee81e40f3f959cfafd60af808a44b0be1defd961 Mon Sep 17 00:00:00 2001
From: Ashley Pittman <ashley.m.pittman@intel.com>
Date: Mon, 29 Apr 2024 18:49:20 +0100
Subject: [PATCH 01/10] DAOS-15745 dfuse: Add the pre_read metrics whilst
 holding reference. (#14256)

Increment the pre-read statistic before replying to the read;
otherwise the oh might no longer be valid, which can lead to
unexpected behaviour.

Signed-off-by: Ashley Pittman <ashley.m.pittman@intel.com>
---
 src/client/dfuse/ops/read.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/client/dfuse/ops/read.c b/src/client/dfuse/ops/read.c
index 991755d461b..26c97204fbb 100644
--- a/src/client/dfuse/ops/read.c
+++ b/src/client/dfuse/ops/read.c
@@ -97,6 +97,7 @@ dfuse_readahead_reply(fuse_req_t req, size_t len, off_t position, struct dfuse_o
 				position + reply_len - 1, position + reply_len, position + len - 1);
 	}
 
+	DFUSE_IE_STAT_ADD(oh->doh_ie, DS_PRE_READ);
 	DFUSE_REPLY_BUFQ(oh, req, oh->doh_readahead->dra_ev->de_iov.iov_buf + position, reply_len);
 	return true;
 }
@@ -143,10 +144,8 @@ dfuse_cb_read(fuse_req_t req, fuse_ino_t ino, size_t len, off_t position, struct
 		replied = dfuse_readahead_reply(req, len, position, oh);
 		D_MUTEX_UNLOCK(&oh->doh_readahead->dra_lock);
 
-		if (replied) {
-			DFUSE_IE_STAT_ADD(oh->doh_ie, DS_PRE_READ);
+		if (replied)
 			return;
-		}
 	}
 
 	eqt_idx = atomic_fetch_add_relaxed(&dfuse_info->di_eqt_idx, 1);

From ae34616f10d2b6e561447c385d1017c1850f7cbe Mon Sep 17 00:00:00 2001
From: dinghwah <48604964+dinghwah@users.noreply.github.com>
Date: Mon, 29 Apr 2024 14:45:59 -0400
Subject: [PATCH 02/10] DAOS-15628 test: Verify maximum containers create with
 and without dup metadata ops (#14243)

Implement additional metadata tests that run with svc_ops disabled.

Signed-off-by: Ding Ho <ding-hwa.ho@intel.com>
---
 src/tests/ftest/server/metadata.py   | 125 +++++++++++++++++++--------
 src/tests/ftest/server/metadata.yaml |  17 ++--
 2 files changed, 98 insertions(+), 44 deletions(-)

diff --git a/src/tests/ftest/server/metadata.py b/src/tests/ftest/server/metadata.py
index 1125003816a..e072ce5351b 100644
--- a/src/tests/ftest/server/metadata.py
+++ b/src/tests/ftest/server/metadata.py
@@ -1,5 +1,5 @@
 """
-  (C) Copyright 2019-2023 Intel Corporation.
+  (C) Copyright 2019-2024 Intel Corporation.
 
   SPDX-License-Identifier: BSD-2-Clause-Patent
 """
@@ -86,9 +86,19 @@ def pre_tear_down(self):
             self.log.debug("no pre-teardown steps defined")
         return error_list
 
-    def create_pool(self):
-        """Create a pool and display the svc ranks."""
-        self.add_pool()
+    def create_pool(self, svc_ops_enabled=True):
+        """Create a pool and display the svc ranks.
+
+        Args:
+            svc_ops_enabled (bool, optional): pool create with svc_ops_enabled. Defaults to True.
+
+        """
+        if svc_ops_enabled:
+            self.add_pool()
+        else:
+            params = {}
+            params['properties'] = "svc_ops_enabled:0"
+            self.add_pool(**params)
         self.log.info("Created pool %s: svc ranks:", self.pool.uuid)
         for index, rank in enumerate(self.pool.svc_ranks):
             self.log.info("[%d]: %d", index, rank)
@@ -236,24 +246,21 @@ def run_dummy_metadata_workload(self, duration=150):
 
         return True
 
-    def test_metadata_fillup(self):
-        """JIRA ID: DAOS-1512.
-
-        Test Description:
-            Test to verify no IO happens after metadata is full.
+    def metadata_fillup(self, svc_ops_enabled=True):
+        """Run test to verify number of resources that can be created until metadata is full.
 
-        Use Cases:
-            ?
+        Args:
+            svc_ops_enabled (bool): Pool create properties svc_ops_enabled. Defaults to True.
 
-        :avocado: tags=all,full_regression
-        :avocado: tags=hw,large
-        :avocado: tags=server,metadata
-        :avocado: tags=ObjectMetadata,test_metadata_fillup
         """
-        self.create_pool()
-        svc_ops_entry_age = self.pool.get_property("svc_ops_entry_age")
-        if not self.run_dummy_metadata_workload(duration=svc_ops_entry_age):
-            self.fail("failed to run dummy metadata workload")
+        self.log_step("Create pool with properties svc_ops_enabled: {}".format(svc_ops_enabled))
+        self.create_pool(svc_ops_enabled=svc_ops_enabled)
+        # Run dummy_metadata_workload when feature is enabled
+        if svc_ops_enabled:
+            self.log.info("svc_ops_enabled enabled, run dummy_metadata_workload")
+            svc_ops_entry_age = self.pool.get_property("svc_ops_entry_age")
+            if not self.run_dummy_metadata_workload(duration=svc_ops_entry_age):
+                self.fail("Failed to run dummy metadata workload")
         sequential_fail_max = self.params.get("fillup_seq_fail_max", "/run/metadata/*")
         num_cont_to_destroy = self.params.get("num_cont_to_destroy", "/run/metadata/*")
 
@@ -266,10 +273,7 @@ def test_metadata_fillup(self):
         # Phase 2: if Phase 1 passed:
         #          clean up several (not all) containers created (prove "critical" destroy
         #          in rdb (and vos) works without cascading no space errors
-
-        # Phase 1 sustained container creates even after no space error
-        self.log.info(
-            "Phase 1: sustained container creates: to no space and beyond")
+        self.log_step("Sustained container creates: to no space and beyond.")
         self.container = []
         sequential_fail_counter = 0
         in_failure = False
@@ -285,43 +289,44 @@ def test_metadata_fillup(self):
                     sequential_fail_counter += 1
                 if sequential_fail_counter >= sequential_fail_max:
                     self.log.info(
-                        "Phase 1: container %d - %d/%d sequential no space "
+                        "Container %d - %d/%d sequential no space "
                         "container create errors", sequential_fail_counter,
                         sequential_fail_max, loop)
                     break
 
                 if status and in_failure:
                     self.log.info(
-                        "Phase 1: container: %d - no space -> available "
+                        "Container: %d - no space -> available "
                         "transition, sequential no space failures: %d",
                         loop, sequential_fail_counter)
                     in_failure = False
                 elif not status and not in_failure:
                     self.log.info(
-                        "Phase 1: container: %d - available -> no space "
+                        "Container: %d - available -> no space "
                         "transition, sequential no space failures: %d",
                         loop, sequential_fail_counter)
                     in_failure = True
 
             except TestFail as error:
                 self.log.error(str(error))
-                self.fail("Phase 1: fail (unexpected container create error)")
+                self.fail("fail (unexpected container create error)")
+        self.log_step("Verify number of container within limit.")
         if len(self.container) >= self.created_containers_limit:
-            self.log.error("Phase 1: Created too many containers: %d > %d", len(self.container),
+            self.log.error("Created too many containers: %d > %d", len(self.container),
                            self.created_containers_limit)
-            self.fail("Phase 1: Created too many containers")
+            self.fail("Created too many containers")
         if len(self.container) < self.created_containers_min:
-            self.log.info("Phase 1: Created too few containers: %d < %d", len(self.container),
+            self.log.info("Created too few containers: %d < %d", len(self.container),
                           self.created_containers_min)
-            self.fail("Phase 1: Created too few containers")
+            self.fail("Created too few containers")
         self.log.info(
-            "Phase 1: passed (created %d / %d containers)", len(self.container), loop)
+            "Successfully created %d / %d containers)", len(self.container), loop)
 
         # Phase 2 clean up some containers (expected to succeed)
-        self.log.info("Phase 2: Cleaning up %d containers (expected to work)", num_cont_to_destroy)
+        msg = "Cleaning up {} containers after pool is full.".format(num_cont_to_destroy)
+        self.log_step(msg)
         if not self.destroy_num_containers(num_cont_to_destroy):
-            self.fail("Phase 2: fail (unexpected container destroy error)")
-        self.log.info("Phase 2: passed")
+            self.fail("Fail (unexpected container destroy error)")
 
         # Do not destroy containers in teardown (destroy pool while metadata rdb is full)
         for container in self.container:
@@ -329,6 +334,46 @@ def test_metadata_fillup(self):
         self.log.info("Leaving pool metadata rdb full (containers will not be destroyed)")
         self.log.info("Test passed")
 
+    def test_metadata_fillup_svc_ops_disabled(self):
+        """JIRA ID: DAOS-15628.
+
+        Test Description:
+            Test to verify number of resources that can be created until metadata is full,
+            when svc_ops disabled.
+        Use Cases:
+            1. Create pool with properties svc_ops_enabled:0.
+            2. Create container until no space.
+            3. Verify number of container within limit.
+            4. Cleaning up containers after pool is full.
+
+        :avocado: tags=all,full_regression
+        :avocado: tags=hw,large
+        :avocado: tags=server,metadata
+        :avocado: tags=ObjectMetadata,test_metadata_fillup_svc_ops_disabled
+        """
+        self.metadata_fillup(False)
+
+    def test_metadata_fillup_svc_ops_enabled(self):
+        """JIRA ID: DAOS-15628.
+
+        Test Description:
+            Test to verify number of resources that can be created until metadata is full,
+            when svc_ops_enabled.
+
+        Use Cases:
+            1. Create pool with properties svc_ops_enabled:1.
+               and run dummy metadata workload to fill up svc ops.
+            2. Create container until no space.
+            3. Verify number of container within limit.
+            4. Cleaning up containers after pool is full.
+
+        :avocado: tags=all,full_regression
+        :avocado: tags=hw,large
+        :avocado: tags=server,metadata
+        :avocado: tags=ObjectMetadata,test_metadata_fillup_svc_ops_enabled
+        """
+        self.metadata_fillup(True)
+
     def test_metadata_addremove(self):
         """JIRA ID: DAOS-1512.
 
@@ -344,9 +389,13 @@ def test_metadata_addremove(self):
         :avocado: tags=ObjectMetadata,test_metadata_addremove
         """
         self.create_pool()
-        if not self.run_dummy_metadata_workload():
-            self.fail("failed to run dummy metadata workload")
-
+        svc_ops_enabled = self.pool.get_property("svc_ops_enabled")
+        if svc_ops_enabled:
+            svc_ops_entry_age = self.pool.get_property("svc_ops_entry_age")
+            if not self.run_dummy_metadata_workload(duration=svc_ops_entry_age):
+                self.fail("failed to run dummy metadata workload")
+        else:
+            self.fail("svc_ops_enabled:0 is not supported for this testcase.")
         self.container = []
         mean_cont_cnt = 0
         percent_cont = self.params.get("mean_percent", "/run/metadata/*")
diff --git a/src/tests/ftest/server/metadata.yaml b/src/tests/ftest/server/metadata.yaml
index 80f37a5f128..e25ddc97170 100644
--- a/src/tests/ftest/server/metadata.yaml
+++ b/src/tests/ftest/server/metadata.yaml
@@ -2,9 +2,10 @@ hosts:
   test_servers: 4
   test_clients: 1
 timeouts:
-  test_metadata_fillup: 600
-  test_metadata_addremove: 1600
-  test_metadata_server_restart: 960
+  test_metadata_fillup_svc_ops_disabled: 400
+  test_metadata_fillup_svc_ops_enabled: 400
+  test_metadata_addremove: 1300
+  test_metadata_server_restart: 500
 server_config:
   name: daos_server
   engines_per_host: 2
@@ -47,7 +48,11 @@ pool:
   svcn: 5
   scm_size: 1G
   control_method: dmg
-  properties: svc_ops_enabled:1,svc_ops_entry_age:150
+  properties: svc_ops_entry_age:60
+# Uncomment the following for manual test with different svc_ops_entry_age value
+#  properties: svc_ops_entry_age:150
+#  properties: svc_ops_entry_age:300
+#  properties: svc_ops_entry_age:600
 container:
   control_method: API
   silent: true
@@ -61,7 +66,7 @@ ior:
 metadata:
   mean_percent: 1
   num_addremove_loops: 4
-  created_cont_min: 25000
-  created_cont_max: 36000
+  created_cont_min: 30000
+  created_cont_max: 39000
   num_cont_to_destroy: 500
   fillup_seq_fail_max: 512

From 1ce781b9bb330fe5e1310f43f7de0ffa84b73d49 Mon Sep 17 00:00:00 2001
From: Kris Jacque <kris.jacque@intel.com>
Date: Mon, 29 Apr 2024 12:56:24 -0600
Subject: [PATCH 03/10] DAOS-13520 control: Fix UUID filter for dmg check query
 (#13050)

Use requested UUIDs to filter check reports for specific pools.

Signed-off-by: Kris Jacque <kris.jacque@intel.com>
---
 src/control/server/mgmt_check.go      |  22 ++-
 src/control/server/mgmt_check_test.go | 208 ++++++++++++++++++++++++++
 2 files changed, 229 insertions(+), 1 deletion(-)

diff --git a/src/control/server/mgmt_check.go b/src/control/server/mgmt_check.go
index 606f4e2dad4..5312763cef8 100644
--- a/src/control/server/mgmt_check.go
+++ b/src/control/server/mgmt_check.go
@@ -331,6 +331,13 @@ func (svc *mgmtSvc) SystemCheckQuery(ctx context.Context, req *mgmtpb.CheckQuery
 		req.Shallow = true
 	}
 
+	uuids := common.NewStringSet(req.Uuids...)
+	wantUUID := func(uuid string) bool {
+		return len(uuids) == 0 || uuids.Has(uuid)
+	}
+
+	reports := []*chkpb.CheckReport{}
+
 	if !req.Shallow {
 		dResp, err := svc.makePoolCheckerCall(ctx, drpc.MethodCheckerQuery, req)
 		if err != nil {
@@ -340,16 +347,29 @@ func (svc *mgmtSvc) SystemCheckQuery(ctx context.Context, req *mgmtpb.CheckQuery
 		if err = proto.Unmarshal(dResp.Body, resp); err != nil {
 			return nil, errors.Wrap(err, "unmarshal CheckQuery response")
 		}
+
+		for _, r := range resp.Reports {
+			if wantUUID(r.PoolUuid) {
+				reports = append(reports, r)
+			}
+		}
 	}
 
+	// Collect saved older reports
 	cfList, err := svc.sysdb.GetCheckerFindings(req.GetSeqs()...)
 	if err != nil {
 		return nil, err
 	}
 
 	for _, f := range cfList {
-		resp.Reports = append(resp.Reports, &f.CheckReport)
+		if wantUUID(f.PoolUuid) {
+			reports = append(reports, &f.CheckReport)
+		}
 	}
+	sort.Slice(reports, func(i, j int) bool {
+		return reports[i].Seq < reports[j].Seq
+	})
+	resp.Reports = reports
 
 	return resp, nil
 }
diff --git a/src/control/server/mgmt_check_test.go b/src/control/server/mgmt_check_test.go
index 0465dec5e19..b59daad699e 100644
--- a/src/control/server/mgmt_check_test.go
+++ b/src/control/server/mgmt_check_test.go
@@ -641,3 +641,211 @@ func TestServer_mgmtSvc_SystemCheckSetPolicy(t *testing.T) {
 		})
 	}
 }
+
+func TestServer_mgmtSvc_SystemCheckQuery(t *testing.T) {
+	uuids := testPoolUUIDs(3)
+	testFindingsMS := []*chkpb.CheckReport{}
+	testFindingsDrpc := []*chkpb.CheckReport{}
+	drpcPools := []*mgmtpb.CheckQueryPool{}
+	for i, uuid := range uuids {
+		testFindingsMS = append(testFindingsMS, &chkpb.CheckReport{
+			Seq:      uint64(i + 1),
+			Class:    chkpb.CheckInconsistClass_CIC_CONT_BAD_LABEL,
+			Action:   chkpb.CheckInconsistAction_CIA_TRUST_MS,
+			PoolUuid: uuid,
+		})
+
+		testFindingsDrpc = append(testFindingsDrpc, &chkpb.CheckReport{
+			Seq:      uint64(i + 1 + len(uuids)),
+			Class:    chkpb.CheckInconsistClass_CIC_POOL_NONEXIST_ON_ENGINE,
+			Action:   chkpb.CheckInconsistAction_CIA_TRUST_MS,
+			PoolUuid: uuid,
+		})
+
+		drpcPools = append(drpcPools, &mgmtpb.CheckQueryPool{
+			Uuid:   uuid,
+			Status: chkpb.CheckPoolStatus(i),
+			Phase:  chkpb.CheckScanPhase(i),
+		})
+	}
+
+	drpcResp := &mgmtpb.CheckQueryResp{
+		InsStatus: chkpb.CheckInstStatus_CIS_RUNNING,
+		InsPhase:  chkpb.CheckScanPhase_CSP_AGGREGATION,
+		Pools:     drpcPools,
+		Reports:   testFindingsDrpc,
+	}
+
+	for name, tc := range map[string]struct {
+		createMS  func(*testing.T, logging.Logger) *mgmtSvc
+		setupDrpc func(*testing.T, *mgmtSvc)
+		req       *mgmtpb.CheckQueryReq
+		expResp   *mgmtpb.CheckQueryResp
+		expErr    error
+	}{
+		"not MS replica": {
+			createMS: func(t *testing.T, log logging.Logger) *mgmtSvc {
+				svc := newTestMgmtSvc(t, log)
+				svc.sysdb = raft.MockDatabaseWithCfg(t, log, &raft.DatabaseConfig{
+					SystemName: build.DefaultSystemName,
+					Replicas:   []*net.TCPAddr{{IP: net.IP{111, 222, 1, 1}}},
+				})
+				return svc
+			},
+			req: &mgmtpb.CheckQueryReq{
+				Sys: "daos_server",
+			},
+			expErr: errors.New("replica"),
+		},
+		"checker is not enabled": {
+			createMS: func(t *testing.T, log logging.Logger) *mgmtSvc {
+				return testSvcWithMemberState(t, log, system.MemberStateCheckerStarted, uuids)
+			},
+			req: &mgmtpb.CheckQueryReq{
+				Sys: "daos_server",
+			},
+			expErr: checker.FaultCheckerNotEnabled,
+		},
+		"bad member states": {
+			createMS: func(t *testing.T, log logging.Logger) *mgmtSvc {
+				return testSvcCheckerEnabled(t, log, system.MemberStateJoined, uuids)
+			},
+			req: &mgmtpb.CheckQueryReq{
+				Sys: "daos_server",
+			},
+			expErr: errors.New("expected states"),
+		},
+		"dRPC fails": {
+			setupDrpc: func(t *testing.T, ms *mgmtSvc) {
+				setupMockDrpcClient(ms, nil, errors.New("mock dRPC"))
+			},
+			req: &mgmtpb.CheckQueryReq{
+				Sys: "daos_server",
+			},
+			expErr: errors.New("mock dRPC"),
+		},
+		"bad resp": {
+			setupDrpc: func(t *testing.T, ms *mgmtSvc) {
+				setupMockDrpcClientBytes(ms, []byte("garbage"), nil)
+			},
+			req: &mgmtpb.CheckQueryReq{
+				Sys: "daos_server",
+			},
+			expErr: errors.New("unmarshal CheckQuery response"),
+		},
+		"success": {
+			req: &mgmtpb.CheckQueryReq{
+				Sys: "daos_server",
+			},
+			expResp: &mgmtpb.CheckQueryResp{
+				InsStatus: chkpb.CheckInstStatus_CIS_RUNNING,
+				InsPhase:  chkpb.CheckScanPhase_CSP_AGGREGATION,
+				Pools:     drpcPools,
+				Reports:   append(testFindingsMS, testFindingsDrpc...),
+			},
+		},
+		"shallow": {
+			req: &mgmtpb.CheckQueryReq{
+				Sys:     "daos_server",
+				Shallow: true,
+			},
+			setupDrpc: func(t *testing.T, ms *mgmtSvc) {
+				setupMockDrpcClient(ms, nil, errors.New("shouldn't call dRPC"))
+			},
+			expResp: &mgmtpb.CheckQueryResp{
+				Reports: testFindingsMS,
+			},
+		},
+		"request sequence numbers": {
+			req: &mgmtpb.CheckQueryReq{
+				Sys:  "daos_server",
+				Seqs: []uint64{2, 3},
+			},
+			setupDrpc: func(t *testing.T, ms *mgmtSvc) {
+				setupMockDrpcClient(ms, nil, errors.New("shouldn't call dRPC"))
+			},
+			expResp: &mgmtpb.CheckQueryResp{
+				Reports: []*chkpb.CheckReport{
+					testFindingsMS[1],
+					testFindingsMS[2],
+				},
+			},
+		},
+		"request invalid sequence number": {
+			req: &mgmtpb.CheckQueryReq{
+				Sys:  "daos_server",
+				Seqs: []uint64{2, 3, 25},
+			},
+			setupDrpc: func(t *testing.T, ms *mgmtSvc) {
+				setupMockDrpcClient(ms, nil, errors.New("shouldn't call dRPC"))
+			},
+			expErr: errors.New("not found"),
+		},
+		"request all uuids": {
+			req: &mgmtpb.CheckQueryReq{
+				Sys:   "daos_server",
+				Uuids: uuids,
+			},
+			expResp: &mgmtpb.CheckQueryResp{
+				InsStatus: chkpb.CheckInstStatus_CIS_RUNNING,
+				InsPhase:  chkpb.CheckScanPhase_CSP_AGGREGATION,
+				Pools:     drpcPools,
+				Reports:   append(testFindingsMS, testFindingsDrpc...),
+			},
+		},
+		"filter uuids": {
+			req: &mgmtpb.CheckQueryReq{
+				Sys:   "daos_server",
+				Uuids: []string{uuids[0], uuids[2]},
+			},
+			expResp: &mgmtpb.CheckQueryResp{
+				InsStatus: chkpb.CheckInstStatus_CIS_RUNNING,
+				InsPhase:  chkpb.CheckScanPhase_CSP_AGGREGATION,
+				Pools:     drpcPools,
+				Reports: []*chkpb.CheckReport{
+					testFindingsMS[0],
+					testFindingsMS[2],
+					testFindingsDrpc[0],
+					testFindingsDrpc[2],
+				},
+			},
+		},
+	} {
+		t.Run(name, func(t *testing.T) {
+			log, buf := logging.NewTestLogger(t.Name())
+			defer test.ShowBufferOnFailure(t, buf)
+
+			if tc.createMS == nil {
+				tc.createMS = func(t *testing.T, log logging.Logger) *mgmtSvc {
+					svc := testSvcCheckerEnabled(t, log, system.MemberStateCheckerStarted, uuids)
+					for _, f := range testFindingsMS {
+						if err := svc.sysdb.AddCheckerFinding(&checker.Finding{CheckReport: *f}); err != nil {
+							t.Fatalf("unable to add finding %+v: %s", f, err.Error())
+						}
+					}
+					return svc
+				}
+			}
+			svc := tc.createMS(t, log)
+
+			if tc.setupDrpc == nil {
+				tc.setupDrpc = func(t *testing.T, ms *mgmtSvc) {
+					setupMockDrpcClient(ms, drpcResp, nil)
+				}
+			}
+			tc.setupDrpc(t, svc)
+
+			resp, err := svc.SystemCheckQuery(test.Context(t), tc.req)
+
+			test.CmpErr(t, tc.expErr, err)
+			if diff := cmp.Diff(tc.expResp, resp,
+				cmpopts.IgnoreUnexported(
+					mgmtpb.CheckQueryResp{},
+					mgmtpb.CheckQueryPool{},
+					chkpb.CheckReport{}),
+			); diff != "" {
+				t.Fatalf("want-, got+:\n%s", diff)
+			}
+		})
+	}
+}

From cbf716c2462cdf125c29e6c02a81ffd83fa754ea Mon Sep 17 00:00:00 2001
From: Dalton Bohning <dalton.bohning@intel.com>
Date: Mon, 29 Apr 2024 12:27:46 -0700
Subject: [PATCH 04/10] DAOS-623 test: fix avocado run --failfast (#14253)

The "avocado run --failfast on" option is now "avocado run --failfast",
with no argument.

Signed-off-by: Dalton Bohning <dalton.bohning@intel.com>
---
 src/tests/ftest/util/avocado_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tests/ftest/util/avocado_utils.py b/src/tests/ftest/util/avocado_utils.py
index 4e7d7b09e61..4038fc2c168 100644
--- a/src/tests/ftest/util/avocado_utils.py
+++ b/src/tests/ftest/util/avocado_utils.py
@@ -205,7 +205,7 @@ def get_run_command(self, test, tag_filters, sparse, failfast):
         if tag_filters:
             command.extend(tag_filters)
         if failfast:
-            command.extend(["--failfast", "on"])
+            command.append("--failfast")
         command.extend(["--mux-yaml", test.yaml_file])
         if test.extra_yaml:
             command.extend(test.extra_yaml)

From 867c8eecc4531a43d37951d7cf96218b568009c9 Mon Sep 17 00:00:00 2001
From: Dalton Bohning <dalton.bohning@intel.com>
Date: Mon, 29 Apr 2024 12:39:51 -0700
Subject: [PATCH 05/10] DAOS-15684 test: add test case for custom server name
 (#14225)

Add a case for changing the default name of "daos_server"
Fix bug when setting config name

Signed-off-by: Dalton Bohning <dalton.bohning@intel.com>
Co-authored-by: Mohamad Chaarawi <mohamad.chaarawi@intel.com>
---
 src/client/dfs/duns.c                          | 2 +-
 src/include/daos_pool.h                        | 2 --
 src/tests/ftest/control/daos_system_query.py   | 2 +-
 src/tests/ftest/control/daos_system_query.yaml | 3 ++-
 src/tests/ftest/util/apricot/apricot/test.py   | 2 +-
 5 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/client/dfs/duns.c b/src/client/dfs/duns.c
index 1a1bdbda623..71ce57bf097 100644
--- a/src/client/dfs/duns.c
+++ b/src/client/dfs/duns.c
@@ -1377,7 +1377,7 @@ duns_set_sys_name(struct duns_attr_t *attrp, const char *sys)
 {
 	if (attrp == NULL)
 		return EINVAL;
-	D_STRNDUP(attrp->da_sys, sys, DAOS_SYS_NAME_MAX_LEN);
+	D_STRNDUP(attrp->da_sys, sys, DAOS_SYS_NAME_MAX);
 	if (attrp->da_sys == NULL)
 		return ENOMEM;
 
diff --git a/src/include/daos_pool.h b/src/include/daos_pool.h
index 5edc5aa3c2d..99173ea6638 100644
--- a/src/include/daos_pool.h
+++ b/src/include/daos_pool.h
@@ -210,8 +210,6 @@ struct daos_pool_cont_info2 {
 	void				*pci_reserved[2];
 };
 
-#define DAOS_SYS_NAME_MAX_LEN 127
-
 /**
  * Connect to the DAOS pool identified by \a pool, a label or UUID string.
  * Upon a successful completion, \a poh returns the pool handle, and \a info
diff --git a/src/tests/ftest/control/daos_system_query.py b/src/tests/ftest/control/daos_system_query.py
index 8c91f47d8b3..29f279d2c6c 100644
--- a/src/tests/ftest/control/daos_system_query.py
+++ b/src/tests/ftest/control/daos_system_query.py
@@ -21,7 +21,7 @@ def test_daos_system_query(self):
         :avocado: tags=all,full_regression
         :avocado: tags=vm
         :avocado: tags=control,daos_cmd
-        :avocado: tags=DaosSystemQuery,daos_system_query,test_daos_system_query
+        :avocado: tags=DaosSystemQuery,test_daos_system_query
         """
         daos_cmd = self.get_daos_command()
 
diff --git a/src/tests/ftest/control/daos_system_query.yaml b/src/tests/ftest/control/daos_system_query.yaml
index 05be8c5b096..3d1b9762231 100644
--- a/src/tests/ftest/control/daos_system_query.yaml
+++ b/src/tests/ftest/control/daos_system_query.yaml
@@ -1,8 +1,9 @@
 hosts:
   test_servers: 1
+  test_clients: 1
 timeout: 80
 server_config:
-  name: daos_server
+  name: other_dserver  # Use a non-default name
   engines_per_host: 1
   engines:
     0:
diff --git a/src/tests/ftest/util/apricot/apricot/test.py b/src/tests/ftest/util/apricot/apricot/test.py
index ce15a430493..e2e838a8536 100644
--- a/src/tests/ftest/util/apricot/apricot/test.py
+++ b/src/tests/ftest/util/apricot/apricot/test.py
@@ -715,7 +715,7 @@ def setUp(self):
 
         # The server config name should be obtained from each ServerManager
         # object, but some tests still use this TestWithServers attribute.
-        self.server_group = self.params.get("name", "/server_config/", "daos_server")
+        self.server_group = self.params.get("name", "/run/server_config/*", "daos_server")
 
         # The optional namespace for the server configuration test yaml parameters.
         self.server_config_namespace = self.params.get("server_config_namespace", "/run/setup/*")

From c859be0043cd0cca97066ebd9fd716594d7f4c86 Mon Sep 17 00:00:00 2001
From: saurabhtandan <saurabh.tandan@intel.com>
Date: Mon, 29 Apr 2024 14:01:43 -0700
Subject: [PATCH 06/10] DAOS-14823 test: Changing scm-size for pool create
 (#13871)

Increase scm_size for pool create from 128MiB to 256MiB.

Signed-off-by: Saurabh Tandan <saurabh.tandan@intel.com>
---
 src/tests/ftest/deployment/io_sys_admin.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/tests/ftest/deployment/io_sys_admin.yaml b/src/tests/ftest/deployment/io_sys_admin.yaml
index 1e4f6b4dec0..cbee965200a 100644
--- a/src/tests/ftest/deployment/io_sys_admin.yaml
+++ b/src/tests/ftest/deployment/io_sys_admin.yaml
@@ -26,7 +26,7 @@ dmg:
     storage_sub_command: scan
 pool_1:
   control_method: dmg
-  scm_size: 128MiB
+  scm_size: 256MiB
   nvme_size: 16GiB
 pool_2:
   control_method: dmg

From b593cea06800d0ad269615f6be9156fcf8badc1c Mon Sep 17 00:00:00 2001
From: Makito Kano <makito.kano@intel.com>
Date: Tue, 30 Apr 2024 06:27:03 +0900
Subject: [PATCH 07/10] DAOS-15759 test: Remove utils/cr_demo (#14265)

The catastrophic recovery demo scripts in utils/cr_demo
introduce security vulnerabilities. The scripts were only for the
CR demo and are no longer necessary, so remove them.

Signed-off-by: Makito Kano <makito.kano@intel.com>
---
 utils/cr_demo/demo_utils.py        | 379 -------------------------
 utils/cr_demo/run_demo_aurora.py   | 434 -----------------------------
 utils/cr_demo/show_fixed_aurora.py | 117 --------
 3 files changed, 930 deletions(-)
 delete mode 100644 utils/cr_demo/demo_utils.py
 delete mode 100644 utils/cr_demo/run_demo_aurora.py
 delete mode 100644 utils/cr_demo/show_fixed_aurora.py

diff --git a/utils/cr_demo/demo_utils.py b/utils/cr_demo/demo_utils.py
deleted file mode 100644
index 1007ce77e24..00000000000
--- a/utils/cr_demo/demo_utils.py
+++ /dev/null
@@ -1,379 +0,0 @@
-"""
-  (C) Copyright 2023 Intel Corporation.
-
-  SPDX-License-Identifier: BSD-2-Clause-Patent
-"""
-import subprocess  # nosec
-
-import yaml
-
-
-# Storage-related methods
-def format_storage(host_list):
-    """Call dmg storage format.
-
-    Args:
-        host_list (str): List of hosts to format.
-    """
-    format_cmd = ["dmg", "storage", "format", "--host-list=" + host_list]
-    run_command(command=format_cmd)
-
-
-def storage_query_usage(host_list):
-    """Call dmg storage query usage.
-
-    Args:
-        host_list (str): List of hosts to query.
-    """
-    storage_query_cmd = ["dmg", "storage", "query", "usage", "--host-list=" + host_list]
-    run_command(command=storage_query_cmd)
-
-
-# Pool-related methods
-def create_pool(pool_size, pool_label, ranks=None, nsvc=None):
-    """Call dmg pool create.
-
-    Args:
-        pool_size (str): Pool size.
-        pool_label (str): Pool label.
-        ranks (str): Ranks to create pool. Defaults to None.
-        nsvc (str): Number of service replicas. Defaults to None.
-    """
-    create_pool_cmd = ["dmg", "pool", "create", pool_label, "--size=" + pool_size]
-    if ranks:
-        create_pool_cmd.append("--ranks=" + ranks)
-    if nsvc:
-        create_pool_cmd.append("--nsvc=" + nsvc)
-    run_command(command=create_pool_cmd)
-
-
-def list_pool(verbose=False, json=False, no_query=False):
-    """Call dmg pool list.
-
-    Args:
-        verbose (bool): Whether to use --verbose. Defaults to False.
-        json (bool): Whether to use --json. If used, verbose value would be irrelevant.
-            Defaults to False.
-        no_query (bool): Whether to use --no-query. Defaults to False.
-
-    Returns:
-        str: If --json is used, return stdout. Otherwise None.
-
-    """
-    list_pool_cmd = ["dmg", "pool", "list"]
-    if json:
-        list_pool_cmd.append("--json")
-    if verbose:
-        list_pool_cmd.append("--verbose")
-    if no_query:
-        list_pool_cmd.append("--no-query")
-    command = " ".join(list_pool_cmd)
-    print(f"Command: {command}")
-
-    if json:
-        result = subprocess.run(
-            list_pool_cmd, stdout=subprocess.PIPE, universal_newlines=True, check=False)
-        return result.stdout
-
-    subprocess.run(list_pool_cmd, check=False)
-    return None
-
-
-def pool_get_prop(pool_label, properties):
-    """Call dmg pool get-prop <pool_label> <properties>
-
-    Args:
-        pool_label (str): Pool label.
-        properties (str): Properties to query. Separate them with comma if there are
-            multiple properties.
-    """
-    get_prop_cmd = ["dmg", "pool", "get-prop", pool_label, properties]
-    run_command(command=get_prop_cmd)
-
-
-def pool_query(pool_label):
-    """Call dmg pool query
-
-    Args:
-        pool_label (str): Pool label.
-    """
-    pool_query_cmd = ["dmg", "pool", "query", pool_label]
-    run_command(command=pool_query_cmd)
-
-
-# Container-related methods
-def create_container(pool_label, cont_label):
-    """Call daos container create.
-
-    Args:
-        pool_label (str): Pool label.
-        cont_label (str): Container label.
-    """
-    cont_create_cmd = ["daos", "container", "create", pool_label, cont_label]
-    run_command(command=cont_create_cmd)
-
-
-def cont_get_prop(pool_label, cont_label, properties=None):
-    """Call daos container get-prop <pool_label> <cont_label> <properties>
-
-    Args:
-        pool_label (str): Pool label.
-        cont_label (str): Container label.
-        properties (str): Properties to query. Separate them with comma if there are
-            multiple properties. Defaults to None.
-    """
-    get_prop_cmd = ["daos", "container", "get-prop", pool_label, cont_label]
-    if properties:
-        get_prop_cmd.append("--properties=" + properties)
-    run_command(command=get_prop_cmd)
-
-
-# Fault-related methods
-def inject_fault_mgmt(pool_label, fault_type):
-    """Call dmg faults mgmt-svc to inject fault.
-
-    Args:
-        pool_label (str): Pool label.
-        fault_type (str): Fault type.
-    """
-    inject_fault_cmd = ["dmg", "faults", "mgmt-svc", "pool", pool_label, fault_type]
-    run_command(command=inject_fault_cmd)
-
-
-def inject_fault_pool(pool_label, fault_type):
-    """Call dmg faults pool-svc to inject fault.
-
-    Args:
-        pool_label (str): Pool label.
-        fault_type (str): Fault type.
-    """
-    inject_fault_cmd = ["dmg", "faults", "pool-svc", pool_label, fault_type]
-    run_command(command=inject_fault_cmd)
-
-
-def inject_fault_daos(pool_label, cont_label, fault_type):
-    """Call daos faults to inject fault.
-
-    Args:
-        pool_label (str): Pool label.
-        cont_label (str): Container label.
-        fault_type (str): Fault type.
-    """
-    location = "--location=" + fault_type
-    inject_fault_cmd = ["daos", "faults", "container", pool_label, cont_label, location]
-    run_command(command=inject_fault_cmd)
-
-
-# Check-related methods
-def check_enable():
-    """Call dmg check enable"""
-    check_enable_cmd = ["dmg", "check", "enable"]
-    run_command(command=check_enable_cmd)
-
-
-def check_set_policy(reset_defaults=False, all_interactive=False):
-    """Call dmg check set-policy with --reset-defaults or --all-interactive.
-
-    Args:
-        reset_defaults (bool): Set all policies to their default action. Defaults to
-            False.
-        all_interactive (bool): Set all policies to interactive. Defaults to False.
-    """
-    if reset_defaults != all_interactive:
-        check_set_policy_cmd = ["dmg", "check", "set-policy"]
-        if reset_defaults:
-            check_set_policy_cmd.append("--reset-defaults")
-        if all_interactive:
-            check_set_policy_cmd.append("--all-interactive")
-        run_command(command=check_set_policy_cmd)
-
-
-def check_start(policies=None):
-    """Call dmg check start
-
-    Args:
-        policies (str): Repair policies such as POOL_BAD_LABEL:CIA_INTERACT
-    """
-    check_start_cmd = ["dmg", "check", "start"]
-    if policies:
-        check_start_cmd.extend(["-p", policies])
-    run_command(command=check_start_cmd)
-
-
-def check_query(json=False):
-    """Call dmg check query
-
-    Args:
-        json (bool): Whether to use --json. Defaults to False.
-
-    Returns:
-        str: If --json is used, return stdout. Otherwise None.
-
-    """
-    if json:
-        check_query_cmd = ["dmg", "--json", "check", "query"]
-    else:
-        check_query_cmd = ["dmg", "check", "query"]
-    command = " ".join(check_query_cmd)
-    print(f"Command: {command}")
-
-    if json:
-        result = subprocess.run(
-            check_query_cmd, stdout=subprocess.PIPE, universal_newlines=True, check=False)
-        return result.stdout
-
-    subprocess.run(check_query_cmd, check=False)
-    return None
-
-
-def check_disable():
-    """Call dmg check disable"""
-    check_disable_cmd = ["dmg", "check", "disable"]
-    run_command(command=check_disable_cmd)
-
-
-def repeat_check_query():
-    """Allow user to repeatedly call dmg check query."""
-    while True:
-        user_input = input("Hit y to query, n to proceed to next step: ")
-        if user_input == "y":
-            check_query()
-        elif user_input == "n":
-            break
-        else:
-            print("Please enter y or n.")
-
-
-def check_repair(sequence_num, action):
-    """Call dmg check repair
-
-    Args:
-        sequence_num (str): Sequence number for repair action.
-        action (str): Repair action number.
-    """
-    check_repair_cmd = ["dmg", "check", "repair", sequence_num, action]
-    run_command(command=check_repair_cmd)
-
-
-# System-related methods
-def system_stop(force=False):
-    """Stop servers.
-
-    Args:
-        force (bool): Whether to use --force. Defaults to None.
-    """
-    system_stop_cmd = ["dmg", "system", "stop"]
-    if force:
-        system_stop_cmd.append("--force")
-    run_command(command=system_stop_cmd)
-
-
-def system_start():
-    """Start servers."""
-    system_start_cmd = ["dmg", "system", "start"]
-    run_command(command=system_start_cmd)
-
-
-def system_query(json=False, verbose=False):
-    """Call dmg system query
-
-    Args:
-        json (bool): Whether to use --json. Defaults to False.
-        verbose (bool): Whether to use --verbose. Defaults to False.
-
-    Returns:
-        str: Command output.
-
-    """
-    if json:
-        system_query_cmd = ["dmg", "--json", "system", "query"]
-    else:
-        system_query_cmd = ["dmg", "system", "query"]
-    if verbose:
-        system_query_cmd.append("--verbose")
-    command = " ".join(system_query_cmd)
-    print(f"Command: {command}")
-
-    if json:
-        result = subprocess.run(
-            system_query_cmd, stdout=subprocess.PIPE, universal_newlines=True,
-            check=False)
-        return result.stdout
-
-    subprocess.run(system_query_cmd, check=False)
-    return None
-
-
-# Utility methods
-def create_uuid_to_seqnum():
-    """Create pool UUID to sequence number mapping.
-
-    Returns:
-        dict: UUID to sequence number mapping for each pool. Sequence number will be used
-            during repair.
-
-    """
-    uuid_to_seqnum = {}
-    stdout = check_query(json=True)
-    generated_yaml = yaml.safe_load(stdout)
-    for report in generated_yaml["response"]["reports"]:
-        uuid_to_seqnum[report["pool_uuid"]] = report["seq"]
-
-    return uuid_to_seqnum
-
-
-def create_label_to_uuid():
-    """Create label to UUID mapping.
-
-    Returns:
-        dict: Pool label to UUID.
-
-    """
-    label_to_uuid = {}
-    stdout = list_pool(json=True)
-    generated_yaml = yaml.safe_load(stdout)
-    for pool in generated_yaml["response"]["pools"]:
-        label_to_uuid[pool["label"]] = pool["uuid"]
-
-    return label_to_uuid
-
-
-def get_current_labels():
-    """Get current pool labels from MS.
-
-    Returns:
-        list: Current pool labels.
-
-    """
-    pool_labels = []
-    stdout = list_pool(json=True)
-    generated_yaml = yaml.safe_load(stdout)
-    for pool in generated_yaml["response"]["pools"]:
-        pool_labels.append(pool["label"])
-
-    return pool_labels
-
-
-def convert_list_to_str(original_list, separator):
-    """Convert given list to a string with each item separated by separator.
-
-    Args:
-        original_list (list): List of items.
-        separator (str): Separator to separate each item in the new string list.
-
-    Returns:
-        str: String list.
-
-    """
-    return separator.join(map(str, original_list))
-
-
-def run_command(command):
-    """Print given command and run.
-
-    Args:
-        command (list): List of characters that make up the command.
-    """
-    cmd_str = " ".join(command)
-    print(f"Command: {cmd_str}")
-    subprocess.run(command, check=False)
diff --git a/utils/cr_demo/run_demo_aurora.py b/utils/cr_demo/run_demo_aurora.py
deleted file mode 100644
index c7f1962c2be..00000000000
--- a/utils/cr_demo/run_demo_aurora.py
+++ /dev/null
@@ -1,434 +0,0 @@
-"""
-  (C) Copyright 2023 Intel Corporation.
-
-  SPDX-License-Identifier: BSD-2-Clause-Patent
-"""
-import argparse
-import re
-import subprocess  # nosec
-import time
-from collections import defaultdict
-
-import yaml
-from ClusterShell.NodeSet import NodeSet
-from demo_utils import (check_disable, check_enable, check_repair, check_set_policy, check_start,
-                        cont_get_prop, convert_list_to_str, create_container, create_pool,
-                        create_uuid_to_seqnum, format_storage, inject_fault_daos,
-                        inject_fault_mgmt, inject_fault_pool, list_pool, pool_get_prop,
-                        repeat_check_query, storage_query_usage, system_query, system_start,
-                        system_stop)
-
-# Run this script on Aurora node as user. e.g.,
-# python3 run_demo_aurora.py -l aurora-daos-[0001-0100]
-
-TEST_CMD = "sudo date"
-test_cmd_list = TEST_CMD.split(" ")
-print(f"Check sudo works by calling: {TEST_CMD}")
-subprocess.run(test_cmd_list, check=False)
-
-POOL_SIZE = "5T"
-POOL_SIZE_F5 = "3T"
-POOL_LABEL = "tank"
-CONT_LABEL = "bucket"
-# Number of seconds to wait for engines to start for 1 group setup.
-FORMAT_SLEEP_SEC = 35
-
-print("\nF1: Dangling pool")
-print("F2: Lost the majority of pool service replicas")
-print("F3: Orphan pool")
-print("F4: Inconsistent pool label between MS and PS")
-print("F5: Orphan pool shard")
-print("F6: Dangling pool map")
-print("F7: Orphan container")
-print("F8: Inconsistent container label between CS and container property")
-
-PARSER = argparse.ArgumentParser()
-PARSER.add_argument(
-    "-l", "--hostlist", required=True, help="List of hosts to run the demo")
-ARGS = vars(PARSER.parse_args())
-
-HOSTLIST = ARGS["hostlist"]
-
-print(f"\n1. Format storage on {HOSTLIST}.")
-format_storage(host_list=HOSTLIST)
-
-print(f"\nWait for {FORMAT_SLEEP_SEC} sec for format...")
-time.sleep(FORMAT_SLEEP_SEC)
-
-# Call dmg system query to obtain the IP address of necessary ranks.
-rank_to_ip = {}
-stdout = system_query(json=True)
-# Printing system query output helps, but the output will be long if there are many ranks.
-# print(f"dmg system query stdout = {stdout}")
-generated_yaml = yaml.safe_load(stdout)
-RANK_COUNT = 0
-JOINED_COUNT = 0
-for member in generated_yaml["response"]["members"]:
-    rank_to_ip[member["rank"]] = member["addr"].split(":")[0]
-    RANK_COUNT += 1
-    if member["state"] == "joined":
-        JOINED_COUNT += 1
-# Print the number of ranks and joined ranks as a reference.
-node_set = NodeSet(HOSTLIST)
-hostlist = list(node_set)
-print(f"\n{len(hostlist)} nodes; {RANK_COUNT} ranks; {JOINED_COUNT} joined")
-
-# Create rank to mount point map and host to ranks map for F2 and F5.
-# 1. scp daos_control.log from all nodes to here, where this script runs. scp the local
-# file as well. Add hostname to the end of the file name. The log contains rank and PID.
-# Number of nodes used for F2.
-NODE_COUNT = 2
-for i in range(NODE_COUNT):
-    scp_cmd_list = ["scp", f"{hostlist[i]}:/var/tmp/daos_testing/daos_control.log",
-                    f"/var/tmp/daos_testing/daos_control_{hostlist[i]}.log"]
-    subprocess.run(scp_cmd_list, check=False)
-
-# 2. Determine the rank to PID mapping from the control logs. In addition, determine the
-# host to ranks mapping for creating the pool. We need to know the four ranks for the
-# first two nodes. We'll use many nodes in Aurora, but only two nodes for F2.
-rank_to_pid = {}
-host_to_ranks = defaultdict(list)
-SEARCH_STR = r"DAOS I/O Engine.*process (\d+) started on rank (\d+)"
-for i in range(NODE_COUNT):
-    with open(
-        f"/var/tmp/daos_testing/daos_control_{hostlist[i]}.log", "r",
-            encoding="utf-8") as file:
-        for line in file:
-            match = re.findall(SEARCH_STR, line)
-            if match:
-                print(match)
-                pid = int(match[0][0])
-                rank = int(match[0][1])
-                rank_to_pid[rank] = pid
-                host_to_ranks[hostlist[i]].append(rank)
-
-# 3. Determine the PID to mount point mapping by calling ps ax and search for daos_engine.
-# Sample line:
-# 84877 ?        SLl  102:04 /usr/bin/daos_engine -t 8 -x 1 -g daos_server -d
-# /var/run/daos_server -T 2 -n /mnt/daos1/daos_nvme.conf -p 1 -I 1 -r 8192 -H 2 -s
-# /mnt/daos1
-pid_to_mount = {}
-MOUNT_0 = "/mnt/daos0"
-MOUNT_1 = "/mnt/daos1"
-for i in range(NODE_COUNT):
-    clush_ps_ax = ["clush", "-w", hostlist[i], "ps ax"]
-    result = subprocess.check_output(clush_ps_ax)
-    result_list = result.decode("utf-8").split("\n")
-    for result in result_list:
-        if "daos_engine" in result:
-            print(result)
-            if MOUNT_0 in result:
-                pid = re.split(r"\s+", result)[1]
-                pid = int(pid)
-                pid_to_mount[pid] = MOUNT_0
-            elif MOUNT_1 in result:
-                pid = re.split(r"\s+", result)[1]
-                pid = int(pid)
-                pid_to_mount[pid] = MOUNT_1
-
-# 4. Determine the four ranks in hostlist[0] and hostlist[1] to create F2 pool.
-f2_ranks = []
-f2_ranks.extend(host_to_ranks[hostlist[0]])
-f2_ranks.extend(host_to_ranks[hostlist[1]])
-# Ranks in the map are int, so convert them to string and separate them with comma.
-F2_RANKS_STR = convert_list_to_str(original_list=f2_ranks, separator=",")
-
-# 5. Determine the two ranks in hostlist[0] to create F5 pool.
-f5_ranks = []
-f5_ranks.extend(host_to_ranks[hostlist[0]])
-# Ranks in the map are int, so convert them to string and separate them with comma.
-F5_RANKS_STR = convert_list_to_str(original_list=f5_ranks, separator=",")
-
-# Add input here to make sure all ranks are joined before starting the script.
-input("\n2. Create 8 pools and containers. Hit enter...")
-POOL_LABEL_1 = POOL_LABEL + "_F1"
-POOL_LABEL_2 = POOL_LABEL + "_F2"
-POOL_LABEL_3 = POOL_LABEL + "_F3"
-POOL_LABEL_4 = POOL_LABEL + "_F4"
-POOL_LABEL_5 = POOL_LABEL + "_F5"
-POOL_LABEL_6 = POOL_LABEL + "_F6"
-POOL_LABEL_7 = POOL_LABEL + "_F7"
-POOL_LABEL_8 = POOL_LABEL + "_F8"
-CONT_LABEL_7 = CONT_LABEL + "_F7"
-CONT_LABEL_8 = CONT_LABEL + "_F8"
-
-# F1. CIC_POOL_NONEXIST_ON_ENGINE - dangling pool
-create_pool(pool_size=POOL_SIZE, pool_label=POOL_LABEL_1)
-# F2. CIC_POOL_LESS_SVC_WITHOUT_QUORUM
-create_pool(pool_size=POOL_SIZE, pool_label=POOL_LABEL_2, ranks=F2_RANKS_STR, nsvc="3")
-# F3. CIC_POOL_NONEXIST_ON_MS - orphan pool
-create_pool(pool_size=POOL_SIZE, pool_label=POOL_LABEL_3)
-# F4. CIC_POOL_BAD_LABEL - inconsistent pool label between MS and PS
-create_pool(pool_size=POOL_SIZE, pool_label=POOL_LABEL_4)
-# F5. CIC_ENGINE_NONEXIST_IN_MAP - orphan pool shard
-create_pool(pool_size=POOL_SIZE_F5, pool_label=POOL_LABEL_5, ranks=F5_RANKS_STR)
-# F6. CIC_ENGINE_HAS_NO_STORAGE - dangling pool map
-create_pool(pool_size=POOL_SIZE, pool_label=POOL_LABEL_6)
-# F7. CIC_CONT_NONEXIST_ON_PS - orphan container
-create_pool(pool_size=POOL_SIZE, pool_label=POOL_LABEL_7)
-create_container(pool_label=POOL_LABEL_7, cont_label=CONT_LABEL_7)
-print()
-# F8. CIC_CONT_BAD_LABEL
-create_pool(pool_size=POOL_SIZE, pool_label=POOL_LABEL_8)
-create_container(pool_label=POOL_LABEL_8, cont_label=CONT_LABEL_8)
-
-print("(Create label to UUID mapping and obtain service replicas for F2.)")
-label_to_uuid = {}
-f2_service_replicas = []
-stdout = list_pool(json=True)
-generated_yaml = yaml.safe_load(stdout)
-for pool in generated_yaml["response"]["pools"]:
-    label_to_uuid[pool["label"]] = pool["uuid"]
-    # Collect service replicas for F2.
-    if pool["label"] == POOL_LABEL_2:
-        f2_service_replicas = pool["svc_reps"]
-
-print(f"\n(F2 service replicas = {f2_service_replicas})")
-
-print(f"\n3-F5. Print storage usage to show original usage of {POOL_LABEL_5}. "
-      f"Pool is created on {hostlist[0]}.")
-# F5 pool is created on hostlist[0] ranks, but we'll copy the pool dir from there to one
-# of the ranks in hostlist[1], so show both.
-f5_host_list = f"{hostlist[0]},{hostlist[1]}"
-storage_query_usage(host_list=f5_host_list)
-
-print("\n4. Inject fault with dmg for F1, F3, F4, F7, F8.")
-# F1
-inject_fault_pool(pool_label=POOL_LABEL_1, fault_type="CIC_POOL_NONEXIST_ON_ENGINE")
-
-# F3
-inject_fault_mgmt(pool_label=POOL_LABEL_3, fault_type="CIC_POOL_NONEXIST_ON_MS")
-
-# F4
-inject_fault_mgmt(pool_label=POOL_LABEL_4, fault_type="CIC_POOL_BAD_LABEL")
-
-# F7
-inject_fault_daos(
-    pool_label=POOL_LABEL_7, cont_label=CONT_LABEL_7, fault_type="DAOS_CHK_CONT_ORPHAN")
-
-# F8
-inject_fault_daos(
-    pool_label=POOL_LABEL_8, cont_label=CONT_LABEL_8,
-    fault_type="DAOS_CHK_CONT_BAD_LABEL")
-
-input("\n5-1. Stop servers to manipulate for F2, F5, F6, F7. Hit enter...")
-system_stop(force=True)
-
-# F2: Destroy tank_2 rdb-pool on two of the three service replicas. Call them rank a and
-# b. Select the first two service replicas.
-svc_rep_a = f2_service_replicas[0]
-svc_rep_b = f2_service_replicas[1]
-rank_a_ip = rank_to_ip[svc_rep_a]
-rank_b_ip = rank_to_ip[svc_rep_b]
-rank_a_mount = pid_to_mount[rank_to_pid[svc_rep_a]]
-rank_b_mount = pid_to_mount[rank_to_pid[svc_rep_b]]
-rm_rank_a = f"sudo rm {rank_a_mount}/{label_to_uuid[POOL_LABEL_2]}/rdb-pool"
-rm_rank_b = f"sudo rm {rank_b_mount}/{label_to_uuid[POOL_LABEL_2]}/rdb-pool"
-clush_rm_rank_a = ["clush", "-w", rank_a_ip, rm_rank_a]
-clush_rm_rank_b = ["clush", "-w", rank_b_ip, rm_rank_b]
-print("(F2: Destroy tank_F2 rdb-pool on rank a and b.)")
-print(f"Command for rank a: {clush_rm_rank_a}\n")
-print(f"Command for rank b: {clush_rm_rank_b}\n")
-subprocess.run(clush_rm_rank_a, check=False)
-subprocess.run(clush_rm_rank_b, check=False)
-
-# F5: Copy tank_5 pool directory from /mnt/daos1 in hostlist[0] to /mnt/daos0 in
-# hostlist[1]. Match owner. (Mount points are arbitrary.)
-# In order to copy the pool directory without password, there are two things to set up.
-# 1. Since we're running rsync as user, update the mode of the source pool directory as
-# below.
-# Set 777 for /mnt/daos1 and /mnt/daos1/<pool_5>/* i.e.,
-# chmod 777 /mnt/daos1; chmod -R 777 /mnt/daos1/<pool_5>
-# 2. Update mode of the destination mount point to 777. e.g.,
-# clush -w <dst_host> "sudo chmod 777 /mnt/daos0"
-
-# Alternatively, we can generate public-private key pair for root and call scp with sudo.
-# Then we don't need to do step 2 (update mode to 777).
-
-print("(F5: Update mode of the source pool directory.)")
-pool_uuid_5 = label_to_uuid[POOL_LABEL_5]
-chmod_cmd = f"sudo chmod 777 /mnt/daos1; sudo chmod -R 777 /mnt/daos1/{pool_uuid_5}"
-clush_chmod_cmd = ["clush", "-w", hostlist[0], chmod_cmd]
-print(f"Command: {clush_chmod_cmd}\n")
-subprocess.run(clush_chmod_cmd, check=False)
-
-print("(F5: Update mode of the destination mount point.)")
-CHMOD_CMD = "sudo chmod 777 /mnt/daos0"
-clush_chmod_cmd = ["clush", "-w", hostlist[1], CHMOD_CMD]
-print(f"Command: {clush_chmod_cmd}\n")
-subprocess.run(clush_chmod_cmd, check=False)
-
-# Since we're sending each file (vos-0 to 15 + rdb-pool) one at a time rather than the
-# whole pool directory, we need to create the destination fake pool directory first.
-print("(F5: Create a fake pool directory at the destination mount point.)")
-mkdir_cmd = f"sudo mkdir /mnt/daos0/{pool_uuid_5}"
-clush_mkdir_cmd = ["clush", "-w", hostlist[1], mkdir_cmd]
-print(f"Command: {clush_mkdir_cmd}\n")
-subprocess.run(clush_mkdir_cmd, check=False)
-
-print("(F5: Update mode of the fake pool directory at destination.)")
-chmod_cmd = f"sudo chmod 777 /mnt/daos0/{pool_uuid_5}"
-clush_chmod_cmd = ["clush", "-w", hostlist[1], chmod_cmd]
-print(f"Command: {clush_chmod_cmd}\n")
-subprocess.run(clush_chmod_cmd, check=False)
-
-# Run the following xargs + rsync command on hostlist[0] using clush:
-# ls /mnt/daos1/<pool_uuid_5> | xargs --max-procs=16 -I% \
-# rsync -avz /mnt/daos1/<pool_uuid_5>/% hostlist[1]:/mnt/daos0/<pool_uuid_5>
-
-# 1. The initial ls command lists the content of the pool directory, which contains 16 vos
-# files (because there are 16 targets) and rdb-pool file.
-# 2. By using xargs, each item of the ls output is passed into rsync and the rsync
-# commands are executed in parallel. i.e., each file is sent by separate rsync process in
-# parallel.
-
-# * We use --max-procs=16 to support at most 16 rsync processes to run in parallel.
-# * -I% means replace % in the following rsync command by the output of ls. i.e., file
-# name.
-# * rsync -avz means archive, verbose, and compress. By using compress, we can
-# significantly reduce the size of the data and the transfer time.
-# * By running rsync in parallel, we can significantly reduce the transfer time. e.g., For
-# a 2TB pool with 8 targets per engine, each vos file size is about 7G (rdb-pool is
-# smaller). If we run a simple rsync, which runs serially, it takes 1 min 50 sec.
-# However, if we run them in parallel, it's reduced to 24 sec.
-print(f"(F5: Copy pool directory from {hostlist[0]} to {hostlist[1]}.)")
-xargs_rsync_cmd = (f"ls /mnt/daos1/{pool_uuid_5} | xargs --max-procs=16 -I% "
-                   f"rsync -avz /mnt/daos1/{pool_uuid_5}/% "
-                   f"{hostlist[1]}:/mnt/daos0/{pool_uuid_5}")
-clush_xargs_rsync_cmd = ["clush", "-w", hostlist[0], xargs_rsync_cmd]
-print(f"Command: {clush_xargs_rsync_cmd}\n")
-subprocess.run(clush_xargs_rsync_cmd, check=False)
-
-print("(F5: Set owner for the copied dir and files to daos_server:daos_server.)")
-chown_cmd = f"sudo chown -R daos_server:daos_server /mnt/daos0/{pool_uuid_5}"
-clush_chown_cmd = ["clush", "-w", hostlist[1], chown_cmd]
-print(f"Command: {clush_chown_cmd}\n")
-subprocess.run(clush_chown_cmd, check=False)
-
-print("(F6: Remove vos-0 from one of the nodes.)")
-pool_uuid_6 = label_to_uuid[POOL_LABEL_6]
-rm_cmd = f"sudo rm -rf /mnt/daos0/{pool_uuid_6}/vos-0"
-# Remove vos-0 from /mnt/daos0 in rank 0 node. Note that /mnt/daos0 may not be mapped to
-# rank 0. Rank 0 is mapped to either daos0 or daos1. However, we don't care for the
-# purpose of testing dangling pool map.
-clush_rm_cmd = ["clush", "-w", rank_to_ip[0], rm_cmd]
-print(f"Command: {clush_rm_cmd}\n")
-subprocess.run(clush_rm_cmd, check=False)
-
-print("F7: Use ddb to show that the container is left in shards.")
-pool_uuid_7 = label_to_uuid[POOL_LABEL_7]
-# Run ddb on /mnt/daos0 of rank 0 node.
-ddb_cmd = f"sudo ddb /mnt/daos0/{pool_uuid_7}/vos-0 ls"
-# ddb with clush causes some authentication error. tank_F7 is created across all ranks, so
-# just run ddb locally as a workaround.
-ddb_cmd_list = ddb_cmd.split(" ")
-print(f"Command: {ddb_cmd}")
-subprocess.run(ddb_cmd_list, check=False)
-
-# (optional) F3: Show pool directory at mount point to verify that the pool exists on
-# engine.
-
-print("\n5-2. Restart servers.")
-system_start()
-
-input("\n6. Show the faults injected for each pool/container for F1, F3, F4, F5, F8. "
-      "Hit enter...")
-print(f"6-F1. Show dangling pool entry for {POOL_LABEL_1}.")
-# F3 part 1
-print(f"6-F3. MS doesn't recognize {POOL_LABEL_3}.")
-# F4 part 1
-print(f"6-F4-1. Label ({POOL_LABEL_4}) in MS is corrupted with -fault added.")
-list_pool(no_query=True)
-
-# F2: (optional) Try to create a container, which will hang.
-
-# F4 part 2
-print(f"\n6-F4-2. Label ({POOL_LABEL_4}) in PS is still original.")
-POOL_LABEL_4_FAULT = POOL_LABEL_4 + "-fault"
-pool_get_prop(pool_label=POOL_LABEL_4_FAULT, properties="label")
-
-# F5: Call dmg storage query usage to show that the pool is using more space.
-print(f"\n6-F5. Print storage usage to show that {POOL_LABEL_5} is using more space. "
-      f"Pool directory is copied to {hostlist[1]}.")
-storage_query_usage(host_list=f5_host_list)
-
-# F8: Show inconsistency by getting the container label.
-print("\n6-F8. Show container label inconsistency.")
-cont_get_prop(pool_label=POOL_LABEL_8, cont_label=CONT_LABEL_8)
-print(f"Error because container ({CONT_LABEL_8}) doesn't exist on container service.\n")
-
-print(f"Container ({CONT_LABEL_8}) exists on property.")
-cont_get_prop(pool_label=POOL_LABEL_8, cont_label="new-label", properties="label")
-
-input("\n7. Enable checker. Hit enter...")
-system_stop(force=True)
-check_enable()
-
-input("\n8. Start checker with interactive mode. Hit enter...")
-check_set_policy(all_interactive=True)
-print()
-check_start()
-print()
-repeat_check_query()
-
-input("\n8-1. Select repair options for F1 to F4. Hit enter...")
-print("(Create UUID to sequence number.)")
-uuid_to_seqnum = create_uuid_to_seqnum()
-SEQ_NUM_1 = str(hex(uuid_to_seqnum[label_to_uuid[POOL_LABEL_1]]))
-SEQ_NUM_2 = str(hex(uuid_to_seqnum[label_to_uuid[POOL_LABEL_2]]))
-SEQ_NUM_3 = str(hex(uuid_to_seqnum[label_to_uuid[POOL_LABEL_3]]))
-SEQ_NUM_4 = str(hex(uuid_to_seqnum[label_to_uuid[POOL_LABEL_4]]))
-SEQ_NUM_5 = str(hex(uuid_to_seqnum[label_to_uuid[POOL_LABEL_5]]))
-SEQ_NUM_6 = str(hex(uuid_to_seqnum[label_to_uuid[POOL_LABEL_6]]))
-SEQ_NUM_7 = str(hex(uuid_to_seqnum[label_to_uuid[POOL_LABEL_7]]))
-SEQ_NUM_8 = str(hex(uuid_to_seqnum[label_to_uuid[POOL_LABEL_8]]))
-
-# F1: 1: Discard the dangling pool entry from MS [suggested].
-print(f"\n{POOL_LABEL_1} - 1: Discard the dangling pool entry from MS [suggested].")
-check_repair(sequence_num=SEQ_NUM_1, action="1")
-
-# F2: 2: Start pool service under DICTATE mode from rank 1 [suggested].
-print(f"\n{POOL_LABEL_2} - 2: Start pool service under DICTATE mode from rank 1 "
-      f"[suggested].")
-check_repair(sequence_num=SEQ_NUM_2, action="2")
-
-# F3:2: Re-add the orphan pool back to MS [suggested].
-print(f"\n{POOL_LABEL_3} - 2: Re-add the orphan pool back to MS [suggested].")
-check_repair(sequence_num=SEQ_NUM_3, action="2")
-
-# F4: 2: Trust PS pool label.
-print(f"\n{POOL_LABEL_4} - 2: Trust PS pool label.")
-check_repair(sequence_num=SEQ_NUM_4, action="2")
-
-print()
-# Call dmg check query until n is entered.
-repeat_check_query()
-
-input("\n8-2. Select repair options for F5 to F8. Hit enter...")
-# F5: 1: Discard the orphan pool shard to release space [suggested].
-print(f"\n{POOL_LABEL_5} - 1: Discard the orphan pool shard to release space "
-      f"[suggested].")
-check_repair(sequence_num=SEQ_NUM_5, action="1")
-
-# F6: 1: Change pool map for the dangling map entry [suggested].
-print(f"\n{POOL_LABEL_6} - 1: Change pool map for the dangling map entry as down "
-      f"[suggested].")
-check_repair(sequence_num=SEQ_NUM_6, action="1")
-
-# F7: 1: Destroy the orphan container to release space [suggested].
-print(f"\n{POOL_LABEL_7} - 1: Destroy the orphan container to release space [suggested].")
-check_repair(sequence_num=SEQ_NUM_7, action="1")
-
-# F8: 2: Trust the container label in container property.
-print(f"\n{POOL_LABEL_8} - 2: Trust the container label in container property.")
-check_repair(sequence_num=SEQ_NUM_8, action="2")
-
-print()
-# Call dmg check query until n is entered.
-repeat_check_query()
-
-print("\n9. Disable the checker.")
-check_disable()
-system_start()
-
-print("\nRun show_fixed_aurora.py to show the issues fixed...")
diff --git a/utils/cr_demo/show_fixed_aurora.py b/utils/cr_demo/show_fixed_aurora.py
deleted file mode 100644
index 6271023ac13..00000000000
--- a/utils/cr_demo/show_fixed_aurora.py
+++ /dev/null
@@ -1,117 +0,0 @@
-"""
-  (C) Copyright 2023 Intel Corporation.
-
-  SPDX-License-Identifier: BSD-2-Clause-Patent
-"""
-import argparse
-import subprocess  # nosec
-
-import yaml
-from ClusterShell.NodeSet import NodeSet
-from demo_utils import (cont_get_prop, create_container, list_pool, pool_get_prop, pool_query,
-                        storage_query_usage, system_query, system_stop)
-
-# Run this script on Aurora node as user after running run_demo_aurora.py. E.g.,
-# python3 show_fixed_aurora.py -l aurora-daos-[0001-0100]
-
-TEST_CMD = "sudo date"
-test_cmd_list = TEST_CMD.split(" ")
-print(f"Check sudo works by calling: {TEST_CMD}")
-subprocess.run(test_cmd_list, check=False)
-
-POOL_LABEL = "tank"
-CONT_LABEL = "bucket"
-TARGET_PER_RANK = 16
-
-PARSER = argparse.ArgumentParser()
-PARSER.add_argument(
-    "-l", "--hostlist", required=True, help="List of hosts used for run_demo.py")
-ARGS = vars(PARSER.parse_args())
-HOSTLIST = ARGS["hostlist"]
-node_set = NodeSet(HOSTLIST)
-hostlist = list(node_set)
-
-# Call dmg system query to obtain the IP address of necessary ranks.
-rank_to_ip = {}
-stdout = system_query(json=True)
-# Printing system query output helps, but the output will be long if there are many ranks.
-# print(f"dmg system query stdout = {stdout}")
-generated_yaml = yaml.safe_load(stdout)
-RANK_COUNT = 0
-JOINED_COUNT = 0
-for member in generated_yaml["response"]["members"]:
-    rank_to_ip[member["rank"]] = member["addr"].split(":")[0]
-    RANK_COUNT += 1
-    if member["state"] == "joined":
-        JOINED_COUNT += 1
-# Print the number of ranks and joined ranks as a reference.
-print(f"\n{RANK_COUNT} ranks; {JOINED_COUNT} joined")
-TOTAL_TARGET = RANK_COUNT * TARGET_PER_RANK
-
-POOL_LABEL_1 = POOL_LABEL + "_F1"
-POOL_LABEL_2 = POOL_LABEL + "_F2"
-POOL_LABEL_3 = POOL_LABEL + "_F3"
-POOL_LABEL_4 = POOL_LABEL + "_F4"
-POOL_LABEL_5 = POOL_LABEL + "_F5"
-POOL_LABEL_6 = POOL_LABEL + "_F6"
-POOL_LABEL_7 = POOL_LABEL + "_F7"
-POOL_LABEL_8 = POOL_LABEL + "_F8"
-CONT_LABEL_8 = CONT_LABEL + "_F8"
-
-print("(Create label to UUID mapping.)")
-label_to_uuid = {}
-stdout = list_pool(json=True)
-generated_yaml = yaml.safe_load(stdout)
-for pool in generated_yaml["response"]["pools"]:
-    label_to_uuid[pool["label"]] = pool["uuid"]
-
-input("\n10. Show the issues fixed. Hit enter...")
-print(f"10-F1. Dangling pool ({POOL_LABEL_1}) was removed.")
-print(f"10-F3. Orphan pool ({POOL_LABEL_3}) was reconstructed.")
-list_pool()
-
-print(f"10-F2. Create a container on {POOL_LABEL_2}. Pool can be started now, so it "
-      f"should succeed.")
-CONT_LABEL_2 = CONT_LABEL + "_2"
-create_container(pool_label=POOL_LABEL_2, cont_label=CONT_LABEL_2)
-# (optional) Show that rdb-pool file in rank 0 and 2 are recovered.
-
-print(f"\n10-F4. Label inconsistency for {POOL_LABEL_4} was resolved. "
-      f"See pool list above.")
-pool_get_prop(pool_label=POOL_LABEL_4, properties="label")
-
-# F5: Call dmg storage query usage to verify the storage was reclaimed. - Not working due
-# to a bug. Instead, show that pool directory on dst node (rank 3 for 4-VM) was removed.
-print(f"\n10-F5-1. Print storage usage to show that storage used by {POOL_LABEL_5} is "
-      f"reclaimed after pool directory is removed from {hostlist[1]}.")
-f5_host_list = f"{hostlist[0]},{hostlist[1]}"
-storage_query_usage(host_list=f5_host_list)
-
-print(f"\n10-F5-2. {label_to_uuid[POOL_LABEL_5]} pool directory on {hostlist[1]} "
-      f"at /mnt/daos0 was removed.")
-LS_CMD = "ls /mnt/daos0"
-clush_ls_cmd = ["clush", "-w", hostlist[1], LS_CMD]
-print(f"Command: {clush_ls_cmd}\n")
-subprocess.run(clush_ls_cmd, check=False)
-
-EXPECTED_TARGET = TOTAL_TARGET - 1
-print(
-    f"\n10-F6. {POOL_LABEL_6} has one less target ({TOTAL_TARGET} -> {EXPECTED_TARGET}).")
-pool_query(pool_label=POOL_LABEL_6)
-# (optional) Reintegrate rank 1 on pool 6. Wait for rebuild to finish. Then verify the
-# target count.
-
-# F8: Verify that the inconsistency is fixed. The label is back to the original.
-print(f"\n10-F8. Container label inconsistency for {CONT_LABEL_8} was fixed.")
-cont_get_prop(pool_label=POOL_LABEL_8, cont_label=CONT_LABEL_8, properties="label")
-
-# F7: Stop server. Call the same ddb command to verify that the container is removed from
-# shard.
-print(f"\n10-F7. Use ddb to verify that the container in {POOL_LABEL_7} is removed "
-      f"from shards.")
-system_stop(force=True)
-pool_uuid_7 = label_to_uuid[POOL_LABEL_7]
-ddb_cmd = f"sudo ddb /mnt/daos0/{pool_uuid_7}/vos-0 ls"
-ddb_cmd_list = ddb_cmd.split(" ")
-print(f"Command: {ddb_cmd}")
-subprocess.run(ddb_cmd_list, check=False)

From 40fab0cf42f3dafff33f25f5d7ebada9d7f0c260 Mon Sep 17 00:00:00 2001
From: Dalton Bohning <dalton.bohning@intel.com>
Date: Mon, 29 Apr 2024 14:30:59 -0700
Subject: [PATCH 08/10] DAOS-15659 test: fix local ftest prefix (#14173)

PR #13565 accidentally broke how ftest determines the install prefix:
it was read from .build_vars.json, which is no longer installed.

Eliminate the need for .build_vars.json in ftest entirely by deriving
the prefix from shutil.which("daos"), and support overriding it by
setting DAOS_TEST_PREFIX.

Signed-off-by: Dalton Bohning <dalton.bohning@intel.com>
---
 src/tests/ftest/util/apricot/apricot/test.py |  31 +---
 src/tests/ftest/util/environment_utils.py    | 146 ++++++++++---------
 2 files changed, 85 insertions(+), 92 deletions(-)

diff --git a/src/tests/ftest/util/apricot/apricot/test.py b/src/tests/ftest/util/apricot/apricot/test.py
index e2e838a8536..aa9949499c3 100644
--- a/src/tests/ftest/util/apricot/apricot/test.py
+++ b/src/tests/ftest/util/apricot/apricot/test.py
@@ -5,7 +5,6 @@
 """
 # pylint: disable=too-many-lines
 
-import json
 import os
 import random
 import re
@@ -23,6 +22,7 @@
 from daos_utils import DaosCommand
 from distro_utils import detect
 from dmg_utils import get_dmg_command
+from environment_utils import TestEnvironment
 from exception_utils import CommandFailure
 from fault_config_utils import FaultInjection
 from general_utils import (DaosTestError, dict_to_str, dump_engines_stacks,
@@ -125,9 +125,7 @@ def __init__(self, *args, **kwargs):
         # use 'add_cancel_ticket(<ticket>)' to add to this set.
         self._teardown_cancel = set()
         self._teardown_errors = []
-        self.basepath = None
         self.prefix = None
-        self.ofi_prefix = None
         self.cancel_file = os.path.join(os.sep, "scratch", "CI-skip-list-master")
 
         # List of methods to call during tearDown to cleanup after the steps
@@ -150,22 +148,9 @@ def __init__(self, *args, **kwargs):
 
     def setUp(self):
         """Set up each test case."""
-        # get paths from the build_vars generated by build
-        try:
-            with open('../../.build_vars.json', encoding="utf-8") as build_vars:
-                build_paths = json.load(build_vars)
-            self.basepath = os.path.normpath(os.path.join(build_paths['PREFIX'],
-                                                          '..') + os.path.sep)
-            self.prefix = build_paths['PREFIX']
-            try:
-                self.ofi_prefix = build_paths['OFI_PREFIX']
-            except KeyError:
-                self.ofi_prefix = os.sep + "usr"
-        except FileNotFoundError:
-            self.prefix = "/usr"
-            self.basepath = "/"
-            self.ofi_prefix = os.sep + "usr"
-            self.log.info("No build vars file, assuming RPM install")
+        test_env = TestEnvironment()
+        self.prefix = test_env.daos_prefix
+        self.log.info("Using daos install prefix = %s", self.prefix)
         self.cancel_from_list()
         self.check_variant_skip()
         self.log.info("*** SETUP running on %s ***", str(detect()))
@@ -536,13 +521,11 @@ def setUp(self):
 
         # set default shared dir for daos tests in case DAOS_TEST_SHARED_DIR
         # is not set, for RPM env and non-RPM env.
-        if self.prefix != "/usr":
+        if os.path.normpath(self.prefix) != os.path.join(os.sep, 'usr'):
             self.tmp = os.path.join(self.prefix, 'tmp')
         else:
-            self.tmp = os.getenv(
-                'DAOS_TEST_SHARED_DIR', os.path.expanduser('~/daos_test'))
-        if not os.path.exists(self.tmp):
-            os.makedirs(self.tmp)
+            self.tmp = os.getenv('DAOS_TEST_SHARED_DIR', os.path.expanduser('~/daos_test'))
+        os.makedirs(self.tmp, exist_ok=True)
         self.log.debug("Shared test directory: %s", self.tmp)
         self.log.debug("Common test directory: %s", self.test_dir)
 
diff --git a/src/tests/ftest/util/environment_utils.py b/src/tests/ftest/util/environment_utils.py
index a6653418544..d8a6c0d6def 100644
--- a/src/tests/ftest/util/environment_utils.py
+++ b/src/tests/ftest/util/environment_utils.py
@@ -3,8 +3,8 @@
 
   SPDX-License-Identifier: BSD-2-Clause-Patent
 """
-import json
 import os
+import shutil
 import site
 
 from ClusterShell.NodeSet import NodeSet
@@ -18,61 +18,25 @@ class TestEnvironmentException(Exception):
     """Exception for launch.py execution."""
 
 
-def _get_build_environment(logger, build_vars_file):
-    """Obtain DAOS build environment variables from the .build_vars.json file.
-
-    Args:
-        logger (Logger): logger for the messages produced by this method
-        build_vars_file (str): the full path to the DAOS build_vars.json file
-
-    Raises:
-        TestEnvironmentException: if there is an error obtaining the DAOS build environment
-
-    Returns:
-        str: The prefix of the DAOS install.
-        None: If the file is not present.
-    """
-    logger.debug("Obtaining DAOS build environment from %s", build_vars_file)
-    try:
-        with open(build_vars_file, encoding="utf-8") as vars_file:
-            return json.load(vars_file)["PREFIX"]
-
-    except FileNotFoundError:
-        return None
-
-    except Exception as error:      # pylint: disable=broad-except
-        raise TestEnvironmentException("Error obtaining build environment:", str(error)) from error
-
-
-def _update_path(logger, build_vars_file):
+def _update_path(daos_prefix):
     """Update the PATH environment variable for functional testing.
 
     Args:
-        logger (Logger): logger for the messages produced by this method
-        build_vars_file (str): the full path to the DAOS build_vars.json file
+        daos_prefix (str): daos install prefix
 
-    Raises:
-        TestEnvironmentException: if there is an error obtaining the DAOS build environment
     """
-    base_dir = _get_build_environment(logger, build_vars_file)
-
-    path = os.environ.get("PATH")
-
-    parts = path.split(":")
-
-    # If a custom prefix is used for the daos installation then prepend that to the path so that
-    # any binaries provided are picked up from there, else do not modify the path.
-    if base_dir:
-        bin_dir = os.path.join(base_dir, "bin")
-        sbin_dir = os.path.join(base_dir, "sbin")
+    parts = os.environ.get("PATH").split(":")
 
+    # Insert bin and sbin at the beginning of PATH if prefix is not /usr
+    if daos_prefix != os.path.join(os.sep, "usr"):
+        bin_dir = os.path.join(daos_prefix, "bin")
+        sbin_dir = os.path.join(daos_prefix, "sbin")
         parts.insert(0, bin_dir)
         parts.insert(0, sbin_dir)
 
     # /usr/sbin is not setup on non-root user for CI nodes.
     # SCM formatting tool mkfs.ext4 is located under /usr/sbin directory.
     usr_sbin = os.path.join(os.sep, "usr", "sbin")
-
     if usr_sbin not in parts:
         parts.append(usr_sbin)
 
@@ -142,6 +106,7 @@ class TestEnvironment():
         'insecure_mode': 'DAOS_TEST_INSECURE_MODE',
         'bullseye_src': 'DAOS_TEST_BULLSEYE_SRC',
         'bullseye_file': 'COVFILE',
+        'daos_prefix': 'DAOS_TEST_PREFIX'
     }
 
     def __init__(self):
@@ -176,23 +141,25 @@ def set_defaults(self, logger, servers=None, clients=None, provider=None, insecu
             self.insecure_mode = insecure_mode
 
         if self.log_dir is None:
-            self.log_dir = self.default_log_dir()
+            self.log_dir = self._default_log_dir()
         if self.shared_dir is None:
-            self.shared_dir = self.default_shared_dir()
+            self.shared_dir = self._default_shared_dir()
         if self.app_dir is None:
-            self.app_dir = self.default_app_dir()
+            self.app_dir = self._default_app_dir()
         if self.user_dir is None:
-            self.user_dir = self.default_user_dir()
+            self.user_dir = self._default_user_dir()
         if self.interface is None:
-            self.interface = self.default_interface(logger, all_hosts)
+            self.interface = self._default_interface(logger, all_hosts)
         if self.provider is None:
-            self.provider = self.default_provider(logger, servers)
+            self.provider = self._default_provider(logger, servers)
         if self.insecure_mode is None:
-            self.insecure_mode = self.default_insecure_mode()
+            self.insecure_mode = self._default_insecure_mode()
         if self.bullseye_src is None:
-            self.bullseye_src = self.default_bullseye_src()
+            self.bullseye_src = self._default_bullseye_src()
         if self.bullseye_file is None:
-            self.bullseye_file = self.default_bullseye_file()
+            self.bullseye_file = self._default_bullseye_file()
+        if self.daos_prefix is None:
+            self.daos_prefix = self._default_daos_prefix(logger)
 
     def __set_value(self, key, value):
         """Set the test environment variable.
@@ -224,7 +191,7 @@ def app_dir(self, value):
         """
         self.__set_value('app_dir', value)
 
-    def default_app_dir(self):
+    def _default_app_dir(self):
         """Get the default application directory path.
 
         Returns:
@@ -269,7 +236,7 @@ def log_dir(self, value):
         self.__set_value('log_dir', value)
 
     @staticmethod
-    def default_log_dir():
+    def _default_log_dir():
         """Get the default local log directory path.
 
         Returns:
@@ -296,7 +263,7 @@ def shared_dir(self, value):
         self.__set_value('shared_dir', value)
 
     @staticmethod
-    def default_shared_dir():
+    def _default_shared_dir():
         """Get the default shared log directory path.
 
         Returns:
@@ -322,7 +289,7 @@ def user_dir(self, value):
         """
         self.__set_value('user_dir', value)
 
-    def default_user_dir(self):
+    def _default_user_dir(self):
         """Get the default user directory path.
 
         Returns:
@@ -348,7 +315,7 @@ def interface(self, value):
         """
         self.__set_value('interface', value)
 
-    def default_interface(self, logger, hosts):
+    def _default_interface(self, logger, hosts):
         """Get the default interface.
 
         Args:
@@ -394,7 +361,7 @@ def provider(self, value):
         else:
             self.__set_value('provider', value)
 
-    def default_provider(self, logger, hosts):
+    def _default_provider(self, logger, hosts):
         """Get the default provider.
 
         Args:
@@ -463,7 +430,7 @@ def insecure_mode(self, value):
         self.__set_value('insecure_mode', value)
 
     @staticmethod
-    def default_insecure_mode():
+    def _default_insecure_mode():
         """Get the default insecure mode.
 
         Returns:
@@ -490,7 +457,7 @@ def bullseye_src(self, value):
         self.__set_value('bullseye_src', value)
 
     @staticmethod
-    def default_bullseye_src():
+    def _default_bullseye_src():
         """Get the default bullseye source file.
 
         Returns:
@@ -517,7 +484,7 @@ def bullseye_file(self, value):
         self.__set_value('bullseye_file', value)
 
     @staticmethod
-    def default_bullseye_file():
+    def _default_bullseye_file():
         """Get the default bullseye file.
 
         Returns:
@@ -525,6 +492,50 @@ def default_bullseye_file():
         """
         return os.path.join(os.sep, "tmp", "test.cov")
 
+    @property
+    def daos_prefix(self):
+        """Get the daos_prefix.
+
+        Returns:
+            str: the daos_prefix
+        """
+        return os.environ.get(self.__ENV_VAR_MAP['daos_prefix'])
+
+    @daos_prefix.setter
+    def daos_prefix(self, value):
+        """Set the daos_prefix.
+
+        Args:
+            value (str, bool): the daos_prefix
+        """
+        self.__set_value('daos_prefix', value)
+
+    def _default_daos_prefix(self, logger):
+        """Get the default daos_prefix.
+
+        Args:
+            logger (Logger): logger for the messages produced by this method
+
+        Raises:
+            TestEnvironmentException: if there is an error obtaining the default daos_prefix
+
+        Returns:
+            str: the default daos_prefix
+        """
+        if logger is None:
+            return None
+
+        logger.debug(
+            "Detecting daos_prefix for %s - %s not set",
+            self.daos_prefix, self.__ENV_VAR_MAP['daos_prefix'])
+
+        daos_bin_path = shutil.which('daos')
+        if not daos_bin_path:
+            raise TestEnvironmentException("Failed to find installed daos!")
+
+        # E.g. /usr/bin/daos -> /usr
+        return os.path.dirname(os.path.dirname(daos_bin_path))
+
 
 def set_test_environment(logger, test_env=None, servers=None, clients=None, provider=None,
                          insecure_mode=False, details=None):
@@ -551,15 +562,14 @@ def set_test_environment(logger, test_env=None, servers=None, clients=None, prov
     logger.debug("Setting up the test environment variables")
 
     if test_env:
-        # Update the PATH environment variable
-        build_vars_file = os.path.join(
-            os.path.dirname(os.path.realpath(__file__)), "..", "..", "..", ".build_vars.json")
-        _update_path(logger, build_vars_file)
-
-        # Get the default fabric interface and provider
+        # Get the default fabric interface, provider, and daos prefix
         test_env.set_defaults(logger, servers, clients, provider, insecure_mode)
         logger.info("Testing with interface:   %s", test_env.interface)
         logger.info("Testing with provider:    %s", test_env.provider)
+        logger.info("Testing with daos_prefix: %s", test_env.daos_prefix)
+
+        # Update the PATH environment variable
+        _update_path(test_env.daos_prefix)
 
         if details:
             details["interface"] = test_env.interface

From 6a2c3e4e47b2727d2f63fff8dd137243534dbdb0 Mon Sep 17 00:00:00 2001
From: Nasf-Fan <fan.yong@intel.com>
Date: Tue, 30 Apr 2024 14:11:23 +0800
Subject: [PATCH 09/10] DAOS-15713 chk: fix kinds of coverity issues (#14242)

CID: 2555541 2555529 2555524 2555517 2555545 2555527

Signed-off-by: Fan Yong <fan.yong@intel.com>
---
 src/chk/chk_common.c            |  2 ++
 src/chk/chk_engine.c            |  2 +-
 src/chk/chk_leader.c            |  3 +--
 src/chk/chk_upcall.c            | 31 ++++++++++++-------------------
 src/include/daos_srv/daos_chk.h |  2 +-
 src/mgmt/srv_drpc.c             | 12 ++++++------
 src/tests/suite/daos_cr.c       |  3 +++
 7 files changed, 26 insertions(+), 29 deletions(-)

diff --git a/src/chk/chk_common.c b/src/chk/chk_common.c
index fda4efc9973..c5b0d044c7a 100644
--- a/src/chk/chk_common.c
+++ b/src/chk/chk_common.c
@@ -1238,6 +1238,8 @@ chk_ins_init(struct chk_instance **p_ins)
 out_init:
 	if (rc == 0)
 		*p_ins = ins;
+	else
+		D_FREE(ins);
 
 	return rc;
 }
diff --git a/src/chk/chk_engine.c b/src/chk/chk_engine.c
index bdb142ea8bc..f9e9fad2a31 100644
--- a/src/chk/chk_engine.c
+++ b/src/chk/chk_engine.c
@@ -2933,7 +2933,7 @@ chk_engine_pool_start(uint64_t gen, uuid_t uuid, uint32_t phase, uint32_t flags)
 		D_GOTO(put, rc = (rc == -DER_NONEXIST ? 1 : rc));
 
 	if (cbk->cb_phase < phase) {
-		cbk->cb_phase = cbk->cb_phase;
+		cbk->cb_phase = phase;
 		/* QUEST: How to estimate the left time? */
 		cbk->cb_time.ct_left_time = CHK__CHECK_SCAN_PHASE__CSP_DONE - cbk->cb_phase;
 		rc = chk_bk_update_pool(cbk, uuid_str);
diff --git a/src/chk/chk_leader.c b/src/chk/chk_leader.c
index b0f744b4bbb..f29fbe70f76 100644
--- a/src/chk/chk_leader.c
+++ b/src/chk/chk_leader.c
@@ -3385,8 +3385,7 @@ chk_leader_prop(chk_prop_cb_t prop_cb, void *buf)
 {
 	struct chk_property	*prop = &chk_leader->ci_prop;
 
-	return prop_cb(buf, (struct chk_policy *)prop->cp_policies,
-		       CHK_POLICY_MAX - 1, prop->cp_flags);
+	return prop_cb(buf, prop->cp_policies, CHK_POLICY_MAX - 1, prop->cp_flags);
 }
 
 static int
diff --git a/src/chk/chk_upcall.c b/src/chk/chk_upcall.c
index 893b7d1ec32..bbc05db5f75 100644
--- a/src/chk/chk_upcall.c
+++ b/src/chk/chk_upcall.c
@@ -94,8 +94,6 @@ chk_report_upcall(uint64_t gen, uint64_t seq, uint32_t cla, uint32_t act, int re
 		D_ASPRINTF(report.pool_uuid, DF_UUIDF, DP_UUID(*pool));
 		if (report.pool_uuid == NULL)
 			D_GOTO(out, rc = -DER_NOMEM);
-	} else {
-		report.pool_uuid = NULL;
 	}
 
 	report.pool_label = pool_label;
@@ -104,8 +102,6 @@ chk_report_upcall(uint64_t gen, uint64_t seq, uint32_t cla, uint32_t act, int re
 		D_ASPRINTF(report.cont_uuid, DF_UUIDF, DP_UUID(*cont));
 		if (report.cont_uuid == NULL)
 			D_GOTO(out, rc = -DER_NOMEM);
-	} else {
-		report.cont_uuid = NULL;
 	}
 
 	report.cont_label = cont_label;
@@ -114,24 +110,18 @@ chk_report_upcall(uint64_t gen, uint64_t seq, uint32_t cla, uint32_t act, int re
 		D_ASPRINTF(report.objid, DF_UOID, DP_UOID(*obj));
 		if (report.objid == NULL)
 			D_GOTO(out, rc = -DER_NOMEM);
-	} else {
-		report.objid = NULL;
 	}
 
 	if (!daos_iov_empty(dkey)) {
 		D_ASPRINTF(report.dkey, DF_KEY, DP_KEY(dkey));
 		if (report.dkey == NULL)
 			D_GOTO(out, rc = -DER_NOMEM);
-	} else {
-		report.dkey = NULL;
 	}
 
 	if (!daos_iov_empty(akey)) {
 		D_ASPRINTF(report.akey, DF_KEY, DP_KEY(akey));
 		if (report.akey == NULL)
 			D_GOTO(out, rc = -DER_NOMEM);
-	} else {
-		report.akey = NULL;
 	}
 
 	D_ASPRINTF(report.timestamp, "%s", ctime(&tm));
@@ -150,20 +140,23 @@ chk_report_upcall(uint64_t gen, uint64_t seq, uint32_t cla, uint32_t act, int re
 			goto out;
 
 		report.n_act_details = rc;
-	} else {
-		report.n_act_details = 0;
-		report.act_details = NULL;
 	}
 
 	rc = ds_chk_report_upcall(&report);
 
 out:
-	D_FREE(report.pool_uuid);
-	D_FREE(report.cont_uuid);
-	D_FREE(report.objid);
-	D_FREE(report.dkey);
-	D_FREE(report.akey);
-	D_FREE(report.timestamp);
+	if (report.pool_uuid != protobuf_c_empty_string)
+		D_FREE(report.pool_uuid);
+	if (report.cont_uuid != protobuf_c_empty_string)
+		D_FREE(report.cont_uuid);
+	if (report.objid != protobuf_c_empty_string)
+		D_FREE(report.objid);
+	if (report.dkey != protobuf_c_empty_string)
+		D_FREE(report.dkey);
+	if (report.akey != protobuf_c_empty_string)
+		D_FREE(report.akey);
+	if (report.timestamp != protobuf_c_empty_string)
+		D_FREE(report.timestamp);
 	chk_sg_free(report.act_details, report.n_act_details);
 
 	D_CDEBUG(rc != 0, DLOG_ERR, DLOG_INFO,
diff --git a/src/include/daos_srv/daos_chk.h b/src/include/daos_srv/daos_chk.h
index 93fc2a75c9c..756c5ec0cd8 100644
--- a/src/include/daos_srv/daos_chk.h
+++ b/src/include/daos_srv/daos_chk.h
@@ -71,7 +71,7 @@ typedef int (*chk_query_head_cb_t)(uint32_t ins_status, uint32_t ins_phase,
 
 typedef int (*chk_query_pool_cb_t)(struct chk_query_pool_shard *shard, uint32_t idx, void *buf);
 
-typedef int (*chk_prop_cb_t)(void *buf, struct chk_policy *policies, int cnt, uint32_t flags);
+typedef int (*chk_prop_cb_t)(void *buf, uint32_t policies[], int cnt, uint32_t flags);
 
 int chk_leader_start(uint32_t rank_nr, d_rank_t *ranks, uint32_t policy_nr,
 		     struct chk_policy *policies, int pool_nr, uuid_t pools[],
diff --git a/src/mgmt/srv_drpc.c b/src/mgmt/srv_drpc.c
index a840aec93f2..013ad396699 100644
--- a/src/mgmt/srv_drpc.c
+++ b/src/mgmt/srv_drpc.c
@@ -2850,13 +2850,16 @@ ds_chk_prob_free(Mgmt__CheckInconsistPolicy **policies, uint32_t policy_nr)
 #define ALL_CHK_POLICY	CHK__CHECK_INCONSIST_CLASS__CIC_UNKNOWN
 
 static int
-ds_chk_prop_cb(void *buf, struct chk_policy *policies, int cnt, uint32_t flags)
+ds_chk_prop_cb(void *buf, uint32_t policies[], int cnt, uint32_t flags)
 {
 	Mgmt__CheckInconsistPolicy	**ply = NULL;
 	Mgmt__CheckPropResp		 *resp = buf;
 	int				  rc = 0;
 	int				  i = 0;
 
+	D_ASSERTF(cnt <= ALL_CHK_POLICY, "Too many inconsistency policies %u/%u\n",
+		  cnt, ALL_CHK_POLICY);
+
 	D_ALLOC_ARRAY(ply, cnt);
 	if (ply == NULL)
 		return -DER_NOMEM;
@@ -2867,11 +2870,8 @@ ds_chk_prop_cb(void *buf, struct chk_policy *policies, int cnt, uint32_t flags)
 			D_GOTO(out, rc = -DER_NOMEM);
 
 		mgmt__check_inconsist_policy__init(ply[i]);
-		if (policies[i].cp_class == 0 && cnt == ALL_CHK_POLICY)
-			ply[i]->inconsist_cas = i;
-		else
-			ply[i]->inconsist_cas = policies[i].cp_class;
-		ply[i]->inconsist_act = policies[i].cp_action;
+		ply[i]->inconsist_cas = i;
+		ply[i]->inconsist_act = policies[i];
 	}
 
 
diff --git a/src/tests/suite/daos_cr.c b/src/tests/suite/daos_cr.c
index 5339add7e23..1e1b0c29a13 100644
--- a/src/tests/suite/daos_cr.c
+++ b/src/tests/suite/daos_cr.c
@@ -835,13 +835,16 @@ cr_cont_create(void **state, struct test_pool *pool, struct test_cont *cont, int
 	char		 uuid_str[DAOS_UUID_STR_SIZE];
 	test_arg_t	*arg = *state;
 	daos_prop_t	*prop = NULL;
+	mode_t		 saved;
 	daos_handle_t	 coh;
 	int		 fd;
 	int		 rc;
 	int		 rc1;
 
+	saved = umask(0);
 	strncpy(cont->label, "/tmp/cr_cont_XXXXXX", sizeof(cont->label) - 1);
 	fd = mkstemp(cont->label);
+	umask(saved);
 	if (fd < 0) {
 		print_message("CR: cont generate label failed: %s\n", strerror(errno));
 		return d_errno2der(errno);

From e7aa7a882f8a2abb7df6c93fbdbd672b215a0b24 Mon Sep 17 00:00:00 2001
From: Liu Xuezhao <xuezhao.liu@intel.com>
Date: Tue, 30 Apr 2024 15:18:05 +0800
Subject: [PATCH 10/10] DAOS-15661 object: set correct map version for layout
 create (#14222)

obj_layout_create() gets the pl_map via pl_map_find() without holding
dp_map_lock, and then sets "omd_ver = dc_pool_get_version(pool)".
The map version of that pl_map may differ from dc_pool_get_version()
if another thread has refreshed the dc_pool's pool map in between.
Take the version from the pl_map's own pool map instead.

Signed-off-by: Xuezhao Liu <xuezhao.liu@intel.com>
---
 src/object/cli_obj.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c
index a93f84bc458..c209bfdeeed 100644
--- a/src/object/cli_obj.c
+++ b/src/object/cli_obj.c
@@ -304,7 +304,7 @@ obj_layout_create(struct dc_object *obj, unsigned int mode, bool refresh)
 		D_GOTO(out, rc = -DER_INVAL);
 	}
 
-	obj->cob_md.omd_ver = dc_pool_get_version(pool);
+	obj->cob_md.omd_ver = pool_map_get_version(map->pl_poolmap);
 	obj->cob_md.omd_pdom_lvl = dc_obj_get_pdom(obj);
 	obj->cob_md.omd_fdom_lvl = dc_obj_get_redun_lvl(obj);
 	obj->cob_md.omd_pda = dc_obj_get_pda(obj);