Commit

Merge branch 'feature/fault_domain' into kjacque/fault_domain/fault-path-format

Features: control
kjacque committed Sep 23, 2024
2 parents 88abeaf + c500d95 commit 30de197
Showing 270 changed files with 19,490 additions and 13,942 deletions.
16 changes: 16 additions & 0 deletions Jenkinsfile
@@ -309,6 +309,9 @@ pipeline {
string(name: 'FUNCTIONAL_HARDWARE_MEDIUM_VERBS_PROVIDER_LABEL',
defaultValue: 'ci_nvme5',
description: 'Label to use for 5 node Functional Hardware Medium Verbs Provider (MD on SSD) stages')
string(name: 'FUNCTIONAL_HARDWARE_MEDIUM_VMD_LABEL',
defaultValue: 'ci_vmd5',
description: 'Label to use for the Functional Hardware Medium VMD stage')
string(name: 'FUNCTIONAL_HARDWARE_MEDIUM_UCX_PROVIDER_LABEL',
defaultValue: 'ci_ofed5',
description: 'Label to use for 5 node Functional Hardware Medium UCX Provider stage')
@@ -1182,6 +1185,19 @@ pipeline {
run_if_landing: false,
job_status: job_status_internal
),
'Functional Hardware Medium VMD': getFunctionalTestStage(
name: 'Functional Hardware Medium VMD',
pragma_suffix: '-hw-medium-vmd',
label: params.FUNCTIONAL_HARDWARE_MEDIUM_VMD_LABEL,
next_version: next_version,
stage_tags: 'hw_vmd,medium',
/* groovylint-disable-next-line UnnecessaryGetter */
default_tags: startedByTimer() ? 'pr daily_regression' : 'pr',
nvme: 'auto',
run_if_pr: false,
run_if_landing: false,
job_status: job_status_internal
),
'Functional Hardware Medium Verbs Provider': getFunctionalTestStage(
name: 'Functional Hardware Medium Verbs Provider',
pragma_suffix: '-hw-medium-verbs-provider',
30 changes: 8 additions & 22 deletions docs/admin/administration.md
@@ -620,21 +620,17 @@ Usage:
[nvme-faulty command options]
-u, --uuid= Device UUID to set
-f, --force Do not require confirmation
-l, --host= Single host address <ipv4addr/hostname> to connect to
```
To manually evict an NVMe SSD (auto eviction is covered later in this section),
the device state needs to be set faulty by running the following command:
```bash
$ dmg -l boro-11 storage set nvme-faulty --uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19
$ dmg storage set nvme-faulty --host=boro-11 --uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19
NOTICE: This command will permanently mark the device as unusable!
Are you sure you want to continue? (yes/no)
yes
-------
boro-11
-------
Devices
UUID:5bd91603-d3c7-4fb7-9a71-76bc25690c19 [TrAddr:]
Targets:[] Rank:0 State:EVICTED LED:ON
set-faulty operation performed successfully on the following host: wolf-310:10001
```
The device state will transition from "NORMAL" to "EVICTED" (shown above), during which time the
faulty device reaction will have been triggered (all targets on the SSD will be rebuilt).
@@ -693,19 +689,14 @@ Usage:
[nvme command options]
--old-uuid= Device UUID of hot-removed SSD
--new-uuid= Device UUID of new device
--no-reint Bypass reintegration of device and just bring back online.
-l, --host= Single host address <ipv4addr/hostname> to connect to
```
To replace an NVMe SSD with an evicted device and reintegrate it into use with
DAOS, run the following command:
```bash
$ dmg -l boro-11 storage replace nvme --old-uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19 --new-uuid=80c9f1be-84b9-4318-a1be-c416c96ca48b
-------
boro-11
-------
Devices
UUID:80c9f1be-84b9-4318-a1be-c416c96ca48b [TrAddr:]
Targets:[] Rank:1 State:NORMAL LED:OFF
$ dmg storage replace nvme --host=boro-11 --old-uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19 --new-uuid=80c9f1be-84b9-4318-a1be-c416c96ca48b
dev-replace operation performed successfully on the following host: boro-11:10001
```
The old, now replaced device will remain in an "EVICTED" state until it is unplugged.
The new device will transition from a "NEW" state to a "NORMAL" state (shown above).
@@ -716,14 +707,9 @@ In order to reuse a device that was previously set as FAULTY and evicted from the
system, an admin can run the following command (setting the old device UUID to be the
new device UUID):
```bash
$ dmg -l boro-11 storage replace nvme --old-uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19 --new-uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19
$ dmg storage replace nvme --host=boro-11 --old-uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19 --new-uuid=5bd91603-d3c7-4fb7-9a71-76bc25690c19
NOTICE: Attempting to reuse a previously set FAULTY device!
-------
boro-11
-------
Devices
UUID:5bd91603-d3c7-4fb7-9a71-76bc25690c19 [TrAddr:]
Targets:[] Rank:1 State:NORMAL LED:OFF
dev-replace operation performed successfully on the following host: boro-11:10001
```
The FAULTY device will transition from an "EVICTED" state back to a "NORMAL" state,
and will again be available for use with DAOS. The use case of this command will mainly
6 changes: 3 additions & 3 deletions src/bio/README.md
@@ -209,7 +209,7 @@ Devices:
<a id="82"></a>
- Manually Set Device State to FAULTY: **$dmg storage set nvme-faulty**
```
$ dmg storage set nvme-faulty --uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0
$ dmg storage set nvme-faulty --host=localhost --uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0
Devices
UUID:9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0 [TrAddr:0000:8d:00.0]
Targets:[0] Rank:0 State:EVICTED
@@ -219,7 +219,7 @@ Devices
<a id="83"></a>
- Replace an evicted device with a new device: **$dmg storage replace nvme**
```
$ dmg storage replace nvme --old-uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0 --new-uuid=8131fc39-4b1c-4662-bea1-734e728c434e
$ dmg storage replace nvme --host=localhost --old-uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0 --new-uuid=8131fc39-4b1c-4662-bea1-734e728c434e
Devices
UUID:8131fc39-4b1c-4662-bea1-734e728c434e [TrAddr:0000:8d:00.0]
Targets:[0] Rank:0 State:NORMAL
@@ -229,7 +229,7 @@ Devices
<a id="84"></a>
- Reuse a previously evicted device: **$dmg storage replace nvme**
```
$ dmg storage replace nvme --old-uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0 --new-uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0
$ dmg storage replace nvme --host=localhost --old-uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0 --new-uuid=9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0
Devices
UUID:9fb3ce57-1841-43e6-8b70-2a5e7fb2a1d0 [TrAddr:0000:8a:00.0]
Targets:[0] Rank:0 State:NORMAL
2 changes: 1 addition & 1 deletion src/client/java/daos-java/pom.xml
@@ -36,7 +36,7 @@
<dependency>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
<version>3.16.3</version>
<version>3.25.5</version>
</dependency>
<dependency>
<groupId>io.netty</groupId>
2 changes: 1 addition & 1 deletion src/common/tests_dmg_helpers.c
@@ -1393,7 +1393,7 @@ dmg_storage_set_nvme_fault(const char *dmg_config_file,
D_GOTO(out, rc = -DER_NOMEM);
}

args = cmd_push_arg(args, &argcount, " --host-list=%s ", host);
args = cmd_push_arg(args, &argcount, " --host=%s ", host);
if (args == NULL)
D_GOTO(out, rc = -DER_NOMEM);

107 changes: 51 additions & 56 deletions src/container/srv_target.c
@@ -929,9 +929,10 @@ cont_child_start(struct ds_pool_child *pool_child, const uuid_t co_uuid,
* 2. Pool is going to be destroyed, or;
* 3. Pool service is going to be stopped;
*/
if (cont_child->sc_stopping) {
D_ERROR(DF_CONT"[%d]: Container is in stopping\n",
DP_CONT(pool_child->spc_uuid, co_uuid), tgt_id);
if (cont_child->sc_stopping || cont_child->sc_destroying) {
D_ERROR(DF_CONT"[%d]: Container is being stopped or destroyed (s=%d, d=%d)\n",
DP_CONT(pool_child->spc_uuid, co_uuid), tgt_id,
cont_child->sc_stopping, cont_child->sc_destroying);
rc = -DER_SHUTDOWN;
} else if (!cont_child_started(cont_child)) {
if (!ds_pool_skip_for_check(pool_child->spc_pool)) {
@@ -1200,71 +1201,58 @@ cont_child_destroy_one(void *vin)
struct dsm_tls *tls = dsm_tls_get();
struct cont_tgt_destroy_in *in = vin;
struct ds_pool_child *pool;
int rc, retry_cnt = 0;
struct ds_cont_child *cont;
int rc;

pool = ds_pool_child_lookup(in->tdi_pool_uuid);
if (pool == NULL)
D_GOTO(out, rc = -DER_NO_HDL);

while (1) {
struct ds_cont_child *cont;
rc = cont_child_lookup(tls->dt_cont_cache, in->tdi_uuid,
in->tdi_pool_uuid, false /* create */, &cont);
if (rc == -DER_NONEXIST)
D_GOTO(out_pool, rc = 0);

rc = cont_child_lookup(tls->dt_cont_cache, in->tdi_uuid,
in->tdi_pool_uuid, false /* create */,
&cont);
if (rc == -DER_NONEXIST)
break;

if (rc != 0)
D_GOTO(out_pool, rc);
if (rc != 0)
D_GOTO(out_pool, rc);

if (cont->sc_open > 0) {
if (retry_cnt > 0)
D_ERROR(DF_CONT": Container is re-opened (%d) by race\n",
DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid),
cont->sc_open);
else
D_ERROR(DF_CONT": Container is still in open(%d)\n",
DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid),
cont->sc_open);
cont_child_put(tls->dt_cont_cache, cont);
D_GOTO(out_pool, rc = -DER_BUSY);
}
if (cont->sc_open > 0) {
D_ERROR(DF_CONT": Container is still in open(%d)\n",
DP_CONT(cont->sc_pool->spc_uuid, cont->sc_uuid), cont->sc_open);
cont_child_put(tls->dt_cont_cache, cont);
D_GOTO(out_pool, rc = -DER_BUSY);
}

cont_child_stop(cont);
if (cont->sc_destroying) {
cont_child_put(tls->dt_cont_cache, cont);
D_GOTO(out_pool, rc = -DER_BUSY);
}
cont->sc_destroying = 1; /* nobody can take refcount anymore */

ABT_mutex_lock(cont->sc_mutex);
if (cont->sc_dtx_resyncing)
ABT_cond_wait(cont->sc_dtx_resync_cond, cont->sc_mutex);
ABT_mutex_unlock(cont->sc_mutex);
cont_child_stop(cont);

/* Make sure checksum scrubbing has stopped */
ABT_mutex_lock(cont->sc_mutex);
if (cont->sc_scrubbing) {
sched_req_wakeup(cont->sc_pool->spc_scrubbing_req);
ABT_cond_wait(cont->sc_scrub_cond, cont->sc_mutex);
}
ABT_mutex_unlock(cont->sc_mutex);
ABT_mutex_lock(cont->sc_mutex);
if (cont->sc_dtx_resyncing)
ABT_cond_wait(cont->sc_dtx_resync_cond, cont->sc_mutex);
ABT_mutex_unlock(cont->sc_mutex);

/* Make sure rebuild has stopped */
ABT_mutex_lock(cont->sc_mutex);
if (cont->sc_rebuilding)
ABT_cond_wait(cont->sc_rebuild_cond, cont->sc_mutex);
ABT_mutex_unlock(cont->sc_mutex);
/* Make sure checksum scrubbing has stopped */
ABT_mutex_lock(cont->sc_mutex);
if (cont->sc_scrubbing) {
sched_req_wakeup(cont->sc_pool->spc_scrubbing_req);
ABT_cond_wait(cont->sc_scrub_cond, cont->sc_mutex);
}
ABT_mutex_unlock(cont->sc_mutex);

retry_cnt++;
if (retry_cnt > 1) {
D_ERROR("container is still in-use: open %u, resync %s, reindex %s\n",
cont->sc_open, cont->sc_dtx_resyncing ? "yes" : "no",
cont->sc_dtx_reindex ? "yes" : "no");
cont_child_put(tls->dt_cont_cache, cont);
D_GOTO(out_pool, rc = -DER_BUSY);
} /* else: resync should have completed, try again */
/* Make sure rebuild has stopped */
ABT_mutex_lock(cont->sc_mutex);
if (cont->sc_rebuilding)
ABT_cond_wait(cont->sc_rebuild_cond, cont->sc_mutex);
ABT_mutex_unlock(cont->sc_mutex);

/* nobody should see it again after eviction */
daos_lru_ref_evict_wait(tls->dt_cont_cache, &cont->sc_list);
daos_lru_ref_release(tls->dt_cont_cache, &cont->sc_list);
}
/* nobody should see it again after eviction */
daos_lru_ref_evict_wait(tls->dt_cont_cache, &cont->sc_list);
cont_child_put(tls->dt_cont_cache, cont);

D_DEBUG(DB_MD, DF_CONT": destroying vos container\n",
DP_CONT(pool->spc_uuid, in->tdi_uuid));
@@ -1375,7 +1363,7 @@ ds_cont_child_lookup(uuid_t pool_uuid, uuid_t cont_uuid,
if (rc != 0)
return rc;

if ((*ds_cont)->sc_stopping) {
if ((*ds_cont)->sc_stopping || (*ds_cont)->sc_destroying) {
cont_child_put(tls->dt_cont_cache, *ds_cont);
*ds_cont = NULL;
return -DER_SHUTDOWN;
@@ -2603,6 +2591,13 @@ cont_child_prop_update(void *data)
return rc;
}
D_ASSERT(child != NULL);
if (child->sc_stopping || child->sc_destroying) {
		D_ERROR(DF_CONT" is being stopped or destroyed (s=%d, d=%d)\n",
DP_CONT(arg->cpa_pool_uuid, arg->cpa_cont_uuid),
child->sc_stopping, child->sc_destroying);
rc = -DER_SHUTDOWN;
goto out;
}
daos_props_2cont_props(arg->cpa_prop, &child->sc_props);

iv_entry = daos_prop_entry_get(arg->cpa_prop, DAOS_PROP_CO_STATUS);
20 changes: 11 additions & 9 deletions src/control/go.mod
@@ -1,13 +1,15 @@
module github.com/daos-stack/daos/src/control

go 1.20
go 1.21

toolchain go1.22.3

require (
github.com/Jille/raft-grpc-transport v1.2.0
github.com/desertbit/grumble v1.1.3
github.com/dustin/go-humanize v1.0.0
github.com/google/go-cmp v0.5.9
github.com/google/uuid v1.3.0
github.com/google/go-cmp v0.6.0
github.com/google/uuid v1.6.0
github.com/hashicorp/go-hclog v1.2.2
github.com/hashicorp/raft v1.3.9
github.com/hashicorp/raft-boltdb/v2 v2.0.0-20210409134258-03c10cc3d4ea
@@ -18,18 +20,18 @@ require (
github.com/prometheus/client_model v0.2.0
github.com/prometheus/common v0.32.1
go.etcd.io/bbolt v1.3.5
golang.org/x/net v0.23.0
golang.org/x/sys v0.18.0
golang.org/x/net v0.26.0
golang.org/x/sys v0.21.0
google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1
google.golang.org/grpc v1.56.3
google.golang.org/protobuf v1.33.0
google.golang.org/grpc v1.66.2
google.golang.org/protobuf v1.34.1
gopkg.in/yaml.v2 v2.4.0
)

require (
github.com/armon/go-metrics v0.4.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.2.0 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/desertbit/closer/v3 v3.1.2 // indirect
github.com/desertbit/columnize v2.1.0+incompatible // indirect
github.com/desertbit/go-shlex v0.1.1 // indirect
@@ -46,5 +48,5 @@ require (
github.com/mattn/go-isatty v0.0.14 // indirect
github.com/matttproud/golang_protobuf_extensions v1.0.1 // indirect
github.com/prometheus/procfs v0.7.3 // indirect
golang.org/x/text v0.14.0 // indirect
golang.org/x/text v0.16.0 // indirect
)