diff --git a/src/engine/init.c b/src/engine/init.c
index c4dfb6e1997..5b2900d1615 100644
--- a/src/engine/init.c
+++ b/src/engine/init.c
@@ -75,11 +75,9 @@ int dss_core_nr;
 unsigned int dss_core_offset;
 /** NUMA node to bind to */
 int dss_numa_node = -1;
-hwloc_bitmap_t core_allocation_bitmap;
-/** a copy of the NUMA node object in the topology */
-hwloc_obj_t numa_obj;
-/** number of cores in the given NUMA node */
-int dss_num_cores_numa_node;
+/** Cached numa information */
+struct dss_numa_info *numa_info;
+
 /** Module facility bitmask */
 static uint64_t dss_mod_facs;
 /** Number of storage tiers: 2 for SCM and NVMe */
@@ -306,8 +304,11 @@ dss_topo_init()
 	int num_cores_visited;
 	char *cpuset;
 	int k;
+	int i;
+	int rc = 0;
 	hwloc_obj_t corenode;
 	bool tgt_oversub = false;
+	bool multi_socket = false;
 
 	hwloc_topology_init(&dss_topo);
 	hwloc_topology_load(dss_topo);
@@ -317,11 +318,14 @@ dss_topo_init()
 	depth = hwloc_get_type_depth(dss_topo, HWLOC_OBJ_NUMANODE);
 	numa_node_nr = hwloc_get_nbobjs_by_depth(dss_topo, depth);
 	d_getenv_bool("DAOS_TARGET_OVERSUBSCRIBE", &tgt_oversub);
+	d_getenv_bool("DAOS_MULTISOCKET", &multi_socket);
+
+	if (multi_socket && numa_node_nr > 1)
+		dss_numa_nr = numa_node_nr;
 
 	/* if no NUMA node was specified, or NUMA data unavailable */
 	/* fall back to the legacy core allocation algorithm */
 	if (dss_numa_node == -1 || numa_node_nr <= 0) {
-		D_PRINT("Using legacy core allocation algorithm\n");
 		dss_tgt_nr = dss_tgt_nr_get(dss_core_nr, nr_threads,
 					    tgt_oversub);
 		if (dss_core_offset >= dss_core_nr) {
@@ -332,62 +336,90 @@ dss_topo_init()
 				dss_core_offset, dss_core_nr - 1);
 			return -DER_INVAL;
 		}
-		return 0;
+
+		if (dss_numa_nr == 1) {
+			D_PRINT("Using legacy core allocation algorithm\n");
+			return 0;
+		}
+
+		if ((dss_tgt_offload_xs_nr % numa_node_nr) != 0) {
+			D_ERROR("helper count must be evenly divisible by numa count\n");
+			return -DER_INVAL;
+		}
+		if ((dss_tgt_nr % numa_node_nr) != 0) {
+			D_ERROR("tgt count must be evenly divisible by numa count\n");
+			return -DER_INVAL;
+		}
+		dss_tgt_offload_per_numa_xs_nr = dss_tgt_offload_xs_nr / numa_node_nr;
+		dss_tgt_per_numa_nr = dss_tgt_nr / numa_node_nr;
+		D_PRINT("Using multi-socket core allocation algorithm nr=%d target_per=%d "
+			"offload_per=%d\n",
+			numa_node_nr, dss_tgt_per_numa_nr, dss_tgt_offload_per_numa_xs_nr);
 	}
 
-	if (dss_numa_node > numa_node_nr) {
+	if (!multi_socket && dss_numa_node > numa_node_nr) {
 		D_ERROR("Invalid NUMA node selected. "
			"Must be no larger than %d\n", numa_node_nr);
 		return -DER_INVAL;
 	}
 
-	numa_obj = hwloc_get_obj_by_depth(dss_topo, depth, dss_numa_node);
-	if (numa_obj == NULL) {
-		D_ERROR("NUMA node %d was not found in the topology",
-			dss_numa_node);
-		return -DER_INVAL;
-	}
+	D_ALLOC_ARRAY(numa_info, numa_node_nr);
+	if (numa_info == NULL)
+		return -DER_NOMEM;
 
-	/* create an empty bitmap, then set each bit as we */
-	/* find a core that matches */
-	core_allocation_bitmap = hwloc_bitmap_alloc();
-	if (core_allocation_bitmap == NULL) {
-		D_ERROR("Unable to allocate core allocation bitmap\n");
-		return -DER_INVAL;
-	}
+	for (i = 0; i < numa_node_nr; i++) {
+		hwloc_obj_t numa_obj;
+
+		numa_info[i].ni_idx = i;
+		numa_obj = numa_info[i].ni_obj = hwloc_get_obj_by_depth(dss_topo, depth, i);
+		if (numa_obj == NULL) {
+			D_ERROR("NUMA node %d was not found in the topology\n", i);
+			D_GOTO(failed, rc = -DER_INVAL);
+		}
 
-	dss_num_cores_numa_node = 0;
-	num_cores_visited = 0;
+		/* create an empty bitmap, then set each bit as we */
+		/* find a core that matches */
+		numa_info[i].ni_core_allocation_bitmap = hwloc_bitmap_alloc();
+		if (numa_info[i].ni_core_allocation_bitmap == NULL) {
+			D_ERROR("Unable to allocate core allocation bitmap\n");
+			D_GOTO(failed, rc = -DER_INVAL);
+		}
 
-	for (k = 0; k < dss_core_nr; k++) {
-		corenode = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, k);
-		if (corenode == NULL)
-			continue;
-		if (hwloc_bitmap_isincluded(corenode->cpuset,
-					    numa_obj->cpuset) != 0) {
-			if (num_cores_visited++ >= dss_core_offset) {
-				hwloc_bitmap_set(core_allocation_bitmap, k);
-				hwloc_bitmap_asprintf(&cpuset,
-						      corenode->cpuset);
+		numa_info[i].ni_core_nr = 0;
+		num_cores_visited = 0;
+
+		for (k = 0; k < dss_core_nr; k++) {
+			corenode = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, k);
+			if (corenode == NULL)
+				continue;
+			if (hwloc_bitmap_isincluded(corenode->cpuset, numa_obj->cpuset) != 0) {
+				if (num_cores_visited++ >= dss_core_offset) {
+					hwloc_bitmap_set(numa_info[i].ni_core_allocation_bitmap, k);
+					hwloc_bitmap_asprintf(&cpuset, corenode->cpuset);
+				}
+				numa_info[i].ni_core_nr++;
 			}
-			dss_num_cores_numa_node++;
 		}
-	}
-	hwloc_bitmap_asprintf(&cpuset, core_allocation_bitmap);
-	free(cpuset);
-
-	dss_tgt_nr = dss_tgt_nr_get(dss_num_cores_numa_node, nr_threads,
-				    tgt_oversub);
-	if (dss_core_offset >= dss_num_cores_numa_node) {
-		D_ERROR("invalid dss_core_offset %d (set by \"-f\" option), "
-			"should within range [0, %d]", dss_core_offset,
-			dss_num_cores_numa_node - 1);
-		return -DER_INVAL;
+		hwloc_bitmap_asprintf(&cpuset, numa_info[i].ni_core_allocation_bitmap);
+		free(cpuset);
+
+		if (i == dss_numa_node) {
+			dss_tgt_nr =
+			    dss_tgt_nr_get(numa_info[i].ni_core_nr, nr_threads, tgt_oversub);
+			if (dss_core_offset >= numa_info[i].ni_core_nr) {
+				D_ERROR("invalid dss_core_offset %d (set by \"-f\" option), "
+					"should be within range [0, %d]",
+					dss_core_offset, numa_info[i].ni_core_nr - 1);
+				D_GOTO(failed, rc = -DER_INVAL);
+			}
+			D_PRINT("Using NUMA core allocation algorithm\n");
+		}
 	}
-	D_PRINT("Using NUMA core allocation algorithm\n");
 
 	return 0;
+failed:
+	D_FREE(numa_info);
+	return rc;
 }
 
 static ABT_mutex server_init_state_mutex;
@@ -825,7 +857,7 @@ server_init(int argc, char *argv[])
 		DAOS_VERSION, getpid(), dss_self_rank(), dss_tgt_nr,
 		dss_tgt_offload_xs_nr, dss_core_offset, dss_hostname);
 
-	if (numa_obj)
+	if (numa_info && dss_numa_node != -1)
 		D_PRINT("Using NUMA node: %d", dss_numa_node);
 
 	return 0;
@@ -904,6 +936,7 @@ server_fini(bool force)
 		pl_fini();
 		daos_hhash_fini();
 	}
+	D_FREE(numa_info);
 	D_INFO("daos_fini() or pl_fini() done\n");
 	crt_finalize();
 	D_INFO("crt_finalize() done\n");
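
The multi-socket branch added to dss_topo_init() above reduces to one arithmetic rule: the target pool and the helper pool must both divide evenly across the NUMA nodes, and each socket then receives a fixed-size block of each. A standalone sketch of that rule follows; split_per_numa() is a hypothetical helper and the counts fed to it are made up for illustration, not DAOS defaults.

#include <stdio.h>

/* Mirrors the divisibility checks in dss_topo_init(); returns -1 on a ragged split. */
static int
split_per_numa(unsigned int tgt_nr, unsigned int offload_xs_nr, unsigned int numa_node_nr,
	       unsigned int *tgt_per_numa, unsigned int *offload_per_numa)
{
	if ((tgt_nr % numa_node_nr) != 0 || (offload_xs_nr % numa_node_nr) != 0)
		return -1;
	*tgt_per_numa = tgt_nr / numa_node_nr;
	*offload_per_numa = offload_xs_nr / numa_node_nr;
	return 0;
}

int
main(void)
{
	unsigned int tgt_per, offload_per;

	/* 16 targets and 4 helpers over 2 sockets -> 8 targets and 2 helpers each */
	if (split_per_numa(16, 4, 2, &tgt_per, &offload_per) == 0)
		printf("target_per=%u offload_per=%u\n", tgt_per, offload_per);
	return 0;
}

With those inputs it prints target_per=8 offload_per=2, the same numbers the multi-socket D_PRINT above would report.
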
" "Must be no larger than %d\n", numa_node_nr); return -DER_INVAL; } - numa_obj = hwloc_get_obj_by_depth(dss_topo, depth, dss_numa_node); - if (numa_obj == NULL) { - D_ERROR("NUMA node %d was not found in the topology", - dss_numa_node); - return -DER_INVAL; - } + D_ALLOC_ARRAY(numa_info, numa_node_nr); + if (numa_info == NULL) + return -DER_NOMEM; - /* create an empty bitmap, then set each bit as we */ - /* find a core that matches */ - core_allocation_bitmap = hwloc_bitmap_alloc(); - if (core_allocation_bitmap == NULL) { - D_ERROR("Unable to allocate core allocation bitmap\n"); - return -DER_INVAL; - } + for (i = 0; i < numa_node_nr; i++) { + hwloc_obj_t numa_obj; + numa_info[i].ni_idx = i; + numa_obj = numa_info[i].ni_obj = hwloc_get_obj_by_depth(dss_topo, depth, 0); + if (numa_obj == NULL) { + D_ERROR("NUMA node %d was not found in the topology", i); + D_GOTO(failed, rc = -DER_INVAL); + } - dss_num_cores_numa_node = 0; - num_cores_visited = 0; + /* create an empty bitmap, then set each bit as we */ + /* find a core that matches */ + numa_info[i].ni_core_allocation_bitmap = hwloc_bitmap_alloc(); + if (numa_info[i].ni_core_allocation_bitmap == NULL) { + D_ERROR("Unable to allocate core allocation bitmap\n"); + D_GOTO(failed, rc = -DER_INVAL); + } - for (k = 0; k < dss_core_nr; k++) { - corenode = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, k); - if (corenode == NULL) - continue; - if (hwloc_bitmap_isincluded(corenode->cpuset, - numa_obj->cpuset) != 0) { - if (num_cores_visited++ >= dss_core_offset) { - hwloc_bitmap_set(core_allocation_bitmap, k); - hwloc_bitmap_asprintf(&cpuset, - corenode->cpuset); + numa_info[i].ni_core_nr = 0; + num_cores_visited = 0; + + for (k = 0; k < dss_core_nr; k++) { + corenode = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, k); + if (corenode == NULL) + continue; + if (hwloc_bitmap_isincluded(corenode->cpuset, numa_obj->cpuset) != 0) { + if (num_cores_visited++ >= dss_core_offset) { + hwloc_bitmap_set(numa_info[i].ni_core_allocation_bitmap, k); + hwloc_bitmap_asprintf(&cpuset, corenode->cpuset); + } + numa_info[i].ni_core_nr++; } - dss_num_cores_numa_node++; } - } - hwloc_bitmap_asprintf(&cpuset, core_allocation_bitmap); - free(cpuset); - - dss_tgt_nr = dss_tgt_nr_get(dss_num_cores_numa_node, nr_threads, - tgt_oversub); - if (dss_core_offset >= dss_num_cores_numa_node) { - D_ERROR("invalid dss_core_offset %d (set by \"-f\" option), " - "should within range [0, %d]", dss_core_offset, - dss_num_cores_numa_node - 1); - return -DER_INVAL; + hwloc_bitmap_asprintf(&cpuset, numa_info[i].ni_core_allocation_bitmap); + free(cpuset); + + if (i == dss_numa_node) { + dss_tgt_nr = + dss_tgt_nr_get(numa_info[i].ni_core_nr, nr_threads, tgt_oversub); + if (dss_core_offset >= numa_info[i].ni_core_nr) { + D_ERROR("invalid dss_core_offset %d (set by \"-f\" option), " + "should within range [0, %d]", + dss_core_offset, numa_info[i].ni_core_nr - 1); + D_GOTO(failed, rc = -DER_INVAL); + } + D_PRINT("Using NUMA core allocation algorithm\n"); + } } - D_PRINT("Using NUMA core allocation algorithm\n"); return 0; +failed: + D_FREE(numa_info); + return rc; } static ABT_mutex server_init_state_mutex; @@ -825,7 +857,7 @@ server_init(int argc, char *argv[]) DAOS_VERSION, getpid(), dss_self_rank(), dss_tgt_nr, dss_tgt_offload_xs_nr, dss_core_offset, dss_hostname); - if (numa_obj) + if (numa_info && dss_numa_node != -1) D_PRINT("Using NUMA node: %d", dss_numa_node); return 0; @@ -904,6 +936,7 @@ server_fini(bool force) pl_fini(); daos_hhash_fini(); } + D_FREE(numa_info); 
D_INFO("daos_fini() or pl_fini() done\n"); crt_finalize(); D_INFO("crt_finalize() done\n"); diff --git a/src/engine/srv.c b/src/engine/srv.c index 986d8ed04c4..0d71eddb1ed 100644 --- a/src/engine/srv.c +++ b/src/engine/srv.c @@ -75,10 +75,18 @@ #define DRPC_XS_NR (1) /** Number of offload XS */ unsigned int dss_tgt_offload_xs_nr; +/** Number of offload per socket */ +unsigned int dss_tgt_offload_per_numa_xs_nr; +/** Number of target per socket */ +unsigned int dss_tgt_per_numa_nr; /** Number of target (XS set) per engine */ unsigned int dss_tgt_nr; /** Number of system XS */ unsigned int dss_sys_xs_nr = DAOS_TGT0_OFFSET + DRPC_XS_NR; +/** Normally set to 1. In "multi-socket" mode, will be the number of + * numa nodes. + */ +unsigned int dss_numa_nr = 1; /** * Flag of helper XS as a pool. * false - the helper XS is near its main IO service XS. When there is one or @@ -965,14 +973,41 @@ dss_start_xs_id(int tag, int xs_id) { hwloc_obj_t obj; int rc; + int tgt; int xs_core_offset; - unsigned idx; + unsigned int idx; char *cpuset; + struct dss_numa_info *ninfo; + bool clear = false; D_DEBUG(DB_TRACE, "start xs_id called for %d. ", xs_id); /* if we are NUMA aware, use the NUMA information */ - if (numa_obj) { - idx = hwloc_bitmap_first(core_allocation_bitmap); + if (numa_info) { + if (dss_numa_node == -1) { + tgt = dss_xs2tgt(xs_id); + if (xs_id == 1) { + D_INFO("Swim\n"); + ninfo = &numa_info[1]; + } else if (tgt != -1) { + D_INFO("target #%d\n", tgt); + ninfo = &numa_info[tgt / dss_tgt_per_numa_nr]; + } else if (xs_id > 2) { + tgt = xs_id - dss_sys_xs_nr - dss_tgt_nr; + D_INFO("offload #%d\n", tgt); + ninfo = &numa_info[tgt / dss_tgt_offload_per_numa_xs_nr]; + } else { + D_INFO("system %d\n", xs_id); + ninfo = &numa_info[0]; + } + if (xs_id != 0) + clear = true; + } else { + ninfo = &numa_info[dss_numa_node]; + if (xs_id > 1 || (xs_id == 0 && dss_core_nr > dss_tgt_nr)) + clear = true; + } + + idx = hwloc_bitmap_first(ninfo->ni_core_allocation_bitmap); if (idx == -1) { D_ERROR("No core available for XS: %d", xs_id); return -DER_INVAL; @@ -983,8 +1018,8 @@ dss_start_xs_id(int tag, int xs_id) * All system XS will reuse the first XS' core, but * the SWIM and DRPC XS will use separate core if enough cores */ - if (xs_id > 1 || (xs_id == 0 && dss_core_nr > dss_tgt_nr)) - hwloc_bitmap_clr(core_allocation_bitmap, idx); + if (clear) + hwloc_bitmap_clr(ninfo->ni_core_allocation_bitmap, idx); obj = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, idx); if (obj == NULL) { @@ -993,7 +1028,7 @@ dss_start_xs_id(int tag, int xs_id) } hwloc_bitmap_asprintf(&cpuset, obj->cpuset); - D_DEBUG(DB_TRACE, "Using CPU set %s\n", cpuset); + D_DEBUG(DB_TRACE, "Using CPU set (numa %d) %s\n", ninfo->ni_idx, cpuset); free(cpuset); } else { D_DEBUG(DB_TRACE, "Using non-NUMA aware core allocation\n"); @@ -1076,9 +1111,8 @@ dss_xstreams_init(void) dss_core_nr, dss_tgt_nr); if (dss_numa_node != -1) { - D_DEBUG(DB_TRACE, - "Detected %d cores on NUMA node %d\n", - dss_num_cores_numa_node, dss_numa_node); + D_DEBUG(DB_TRACE, "Detected %d cores on NUMA node %d\n", + numa_info[dss_numa_node].ni_core_nr, dss_numa_node); } xstream_data.xd_xs_nr = DSS_XS_NR_TOTAL; diff --git a/src/engine/srv_internal.h b/src/engine/srv_internal.h index 8621175b44f..d641dc58a2c 100644 --- a/src/engine/srv_internal.h +++ b/src/engine/srv_internal.h @@ -132,18 +132,30 @@ extern int dss_core_nr; extern unsigned int dss_core_offset; /** NUMA node to bind to */ extern int dss_numa_node; -/** bitmap describing core allocation */ -extern hwloc_bitmap_t 
diff --git a/src/engine/ult.c b/src/engine/ult.c
index 47c3b504f8d..6a36e5b1721 100644
--- a/src/engine/ult.c
+++ b/src/engine/ult.c
@@ -324,6 +324,33 @@ dss_thread_collective(int (*func)(void *), void *arg, unsigned int flags)
 
 /* ============== ULT create functions =================================== */
 
+static inline int
+sched_ult2xs_multisocket(int xs_type, int tgt_id)
+{
+	int socket;
+
+	/** Keep it simple for now. Only support fewer helper threads than
+	 * io threads */
+	D_ASSERTF(dss_tgt_offload_xs_nr < dss_tgt_nr,
+		  "Must have fewer helper threads than targets in multi-socket mode\n");
+	switch (xs_type) {
+	case DSS_XS_IOFW:
+		/** Fall through */
+	case DSS_XS_OFFLOAD:
+		/* No helper threads */
+		if (dss_tgt_offload_xs_nr == 0)
+			return DSS_XS_SELF;
+		socket = tgt_id / dss_tgt_per_numa_nr;
+		/** tgt and offload xstreams are split among sockets evenly */
+		return dss_sys_xs_nr + dss_tgt_nr + (socket * dss_tgt_offload_per_numa_xs_nr) +
+		       tgt_id % dss_tgt_offload_per_numa_xs_nr;
+	default:
+		D_ASSERT(0);
+	}
+
+	return DSS_XS_SELF;
+}
+
 static inline int
 sched_ult2xs(int xs_type, int tgt_id)
 {
@@ -341,6 +368,10 @@ sched_ult2xs(int xs_type, int tgt_id)
 	case DSS_XS_DRPC:
 		return 2;
 	case DSS_XS_IOFW:
+		if (dss_numa_nr > 1) {
+			xs_id = sched_ult2xs_multisocket(xs_type, tgt_id);
+			break;
+		}
 		if (!dss_helper_pool) {
 			if (dss_tgt_offload_xs_nr > 0)
 				xs_id = DSS_MAIN_XS_ID(tgt_id) + 1;
@@ -379,6 +410,10 @@ sched_ult2xs(int xs_type, int tgt_id)
 			xs_id = (DSS_MAIN_XS_ID(tgt_id) + 1) % dss_tgt_nr;
 		break;
 	case DSS_XS_OFFLOAD:
+		if (dss_numa_nr > 1) {
+			xs_id = sched_ult2xs_multisocket(xs_type, tgt_id);
+			break;
+		}
 		if (!dss_helper_pool) {
 			if (dss_tgt_offload_xs_nr > 0)
 				xs_id = DSS_MAIN_XS_ID(tgt_id) + dss_tgt_offload_xs_nr / dss_tgt_nr;
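
The index arithmetic in sched_ult2xs_multisocket() is easiest to sanity-check outside the engine. This standalone sketch repeats it for an example geometry (3 system XS, 8 targets, 2 sockets, 2 helpers per socket; illustrative values, not DAOS defaults): helper xstreams sit after all the targets in per-socket blocks, and a target is always forwarded to a helper on its own socket.

#include <stdio.h>

enum { SYS_XS_NR = 3, TGT_NR = 8, NUMA_NR = 2 };
enum { TGT_PER_NUMA = TGT_NR / NUMA_NR, OFFLOAD_PER_NUMA = 2 };

/* Helper XS for tgt_id: stay on the target's socket, round-robin within it. */
static int
helper_xs(int tgt_id)
{
	int socket = tgt_id / TGT_PER_NUMA;

	return SYS_XS_NR + TGT_NR + socket * OFFLOAD_PER_NUMA + tgt_id % OFFLOAD_PER_NUMA;
}

int
main(void)
{
	int tgt_id;

	for (tgt_id = 0; tgt_id < TGT_NR; tgt_id++)
		printf("tgt %d -> helper xs %d\n", tgt_id, helper_xs(tgt_id));
	return 0;
}

With this geometry targets 0-3 alternate between helpers 11 and 12 while targets 4-7 alternate between 13 and 14, so offloaded work never crosses a socket boundary.
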