diff --git a/src/engine/init.c b/src/engine/init.c
index c4dfb6e1997..5b2900d1615 100644
--- a/src/engine/init.c
+++ b/src/engine/init.c
@@ -75,11 +75,9 @@ int dss_core_nr;
 unsigned int dss_core_offset;
 /** NUMA node to bind to */
 int dss_numa_node = -1;
-hwloc_bitmap_t core_allocation_bitmap;
-/** a copy of the NUMA node object in the topology */
-hwloc_obj_t numa_obj;
-/** number of cores in the given NUMA node */
-int dss_num_cores_numa_node;
+/** Cached numa information */
+struct dss_numa_info *numa_info;
+
 /** Module facility bitmask */
 static uint64_t dss_mod_facs;
 /** Number of storage tiers: 2 for SCM and NVMe */
@@ -306,8 +304,11 @@ dss_topo_init()
 	int num_cores_visited;
 	char *cpuset;
 	int k;
+	int i;
+	int rc = 0;
 	hwloc_obj_t corenode;
 	bool tgt_oversub = false;
+	bool multi_socket = false;
 
 	hwloc_topology_init(&dss_topo);
 	hwloc_topology_load(dss_topo);
@@ -317,11 +318,14 @@ dss_topo_init()
 	depth = hwloc_get_type_depth(dss_topo, HWLOC_OBJ_NUMANODE);
 	numa_node_nr = hwloc_get_nbobjs_by_depth(dss_topo, depth);
 	d_getenv_bool("DAOS_TARGET_OVERSUBSCRIBE", &tgt_oversub);
+	d_getenv_bool("DAOS_MULTISOCKET", &multi_socket);
+
+	if (multi_socket && numa_node_nr > 1)
+		dss_numa_nr = numa_node_nr;
 
 	/* if no NUMA node was specified, or NUMA data unavailable */
 	/* fall back to the legacy core allocation algorithm */
 	if (dss_numa_node == -1 || numa_node_nr <= 0) {
-		D_PRINT("Using legacy core allocation algorithm\n");
 		dss_tgt_nr = dss_tgt_nr_get(dss_core_nr, nr_threads,
 					    tgt_oversub);
 		if (dss_core_offset >= dss_core_nr) {
@@ -332,62 +336,90 @@ dss_topo_init()
 				dss_core_offset, dss_core_nr - 1);
 			return -DER_INVAL;
 		}
-		return 0;
+
+		if (dss_numa_nr == 1) {
+			D_PRINT("Using legacy core allocation algorithm\n");
+			return 0;
+		}
+
+		if ((dss_tgt_offload_xs_nr % numa_node_nr) != 0) {
+			D_ERROR("helper count must be evenly divisible by numa count\n");
+			return -DER_INVAL;
+		}
+		if ((dss_tgt_nr % numa_node_nr) != 0) {
+			D_ERROR("tgt count must be evenly divisible by numa count\n");
+			return -DER_INVAL;
+		}
+		dss_tgt_offload_per_numa_xs_nr = dss_tgt_offload_xs_nr / numa_node_nr;
+		dss_tgt_per_numa_nr = dss_tgt_nr / numa_node_nr;
+		D_PRINT("Using multi-socket core allocation algorithm nr=%d target_per=%d "
+			"offload_per=%d\n",
+			numa_node_nr, dss_tgt_per_numa_nr, dss_tgt_offload_per_numa_xs_nr);
 	}
 
-	if (dss_numa_node > numa_node_nr) {
+	if (!multi_socket && dss_numa_node > numa_node_nr) {
 		D_ERROR("Invalid NUMA node selected. "
			"Must be no larger than %d\n", numa_node_nr);
 		return -DER_INVAL;
 	}
 
-	numa_obj = hwloc_get_obj_by_depth(dss_topo, depth, dss_numa_node);
-	if (numa_obj == NULL) {
-		D_ERROR("NUMA node %d was not found in the topology",
-			dss_numa_node);
-		return -DER_INVAL;
-	}
+	D_ALLOC_ARRAY(numa_info, numa_node_nr);
+	if (numa_info == NULL)
+		return -DER_NOMEM;
 
-	/* create an empty bitmap, then set each bit as we */
-	/* find a core that matches */
-	core_allocation_bitmap = hwloc_bitmap_alloc();
-	if (core_allocation_bitmap == NULL) {
-		D_ERROR("Unable to allocate core allocation bitmap\n");
-		return -DER_INVAL;
-	}
+	for (i = 0; i < numa_node_nr; i++) {
+		hwloc_obj_t numa_obj;
+
+		numa_info[i].ni_idx = i;
+		numa_obj = numa_info[i].ni_obj = hwloc_get_obj_by_depth(dss_topo, depth, i);
+		if (numa_obj == NULL) {
+			D_ERROR("NUMA node %d was not found in the topology\n", i);
+			D_GOTO(failed, rc = -DER_INVAL);
+		}
 
-	dss_num_cores_numa_node = 0;
-	num_cores_visited = 0;
+		/* create an empty bitmap, then set each bit as we */
+		/* find a core that matches */
+		numa_info[i].ni_core_allocation_bitmap = hwloc_bitmap_alloc();
+		if (numa_info[i].ni_core_allocation_bitmap == NULL) {
+			D_ERROR("Unable to allocate core allocation bitmap\n");
+			D_GOTO(failed, rc = -DER_INVAL);
+		}
 
-	for (k = 0; k < dss_core_nr; k++) {
-		corenode = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, k);
-		if (corenode == NULL)
-			continue;
-		if (hwloc_bitmap_isincluded(corenode->cpuset,
-					    numa_obj->cpuset) != 0) {
-			if (num_cores_visited++ >= dss_core_offset) {
-				hwloc_bitmap_set(core_allocation_bitmap, k);
-				hwloc_bitmap_asprintf(&cpuset,
-						      corenode->cpuset);
+		numa_info[i].ni_core_nr = 0;
+		num_cores_visited = 0;
+
+		for (k = 0; k < dss_core_nr; k++) {
+			corenode = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, k);
+			if (corenode == NULL)
+				continue;
+			if (hwloc_bitmap_isincluded(corenode->cpuset, numa_obj->cpuset) != 0) {
+				if (num_cores_visited++ >= dss_core_offset) {
+					hwloc_bitmap_set(numa_info[i].ni_core_allocation_bitmap, k);
+					hwloc_bitmap_asprintf(&cpuset, corenode->cpuset);
+				}
+				numa_info[i].ni_core_nr++;
 			}
-			dss_num_cores_numa_node++;
 		}
-	}
-	hwloc_bitmap_asprintf(&cpuset, core_allocation_bitmap);
-	free(cpuset);
-
-	dss_tgt_nr = dss_tgt_nr_get(dss_num_cores_numa_node, nr_threads,
-				    tgt_oversub);
-	if (dss_core_offset >= dss_num_cores_numa_node) {
-		D_ERROR("invalid dss_core_offset %d (set by \"-f\" option), "
-			"should within range [0, %d]", dss_core_offset,
-			dss_num_cores_numa_node - 1);
-		return -DER_INVAL;
+		hwloc_bitmap_asprintf(&cpuset, numa_info[i].ni_core_allocation_bitmap);
+		free(cpuset);
+
+		if (i == dss_numa_node) {
+			dss_tgt_nr =
+			    dss_tgt_nr_get(numa_info[i].ni_core_nr, nr_threads, tgt_oversub);
+			if (dss_core_offset >= numa_info[i].ni_core_nr) {
+				D_ERROR("invalid dss_core_offset %d (set by \"-f\" option), "
+					"should be within range [0, %d]",
+					dss_core_offset, numa_info[i].ni_core_nr - 1);
+				D_GOTO(failed, rc = -DER_INVAL);
+			}
+			D_PRINT("Using NUMA core allocation algorithm\n");
+		}
 	}
-	D_PRINT("Using NUMA core allocation algorithm\n");
 
 	return 0;
+failed:
+	D_FREE(numa_info);
+	return rc;
 }
 
 static ABT_mutex server_init_state_mutex;
@@ -825,7 +857,7 @@ server_init(int argc, char *argv[])
 		DAOS_VERSION, getpid(), dss_self_rank(), dss_tgt_nr,
 		dss_tgt_offload_xs_nr, dss_core_offset, dss_hostname);
 
-	if (numa_obj)
+	if (numa_info && dss_numa_node != -1)
 		D_PRINT("Using NUMA node: %d", dss_numa_node);
 
 	return 0;
@@ -904,6 +936,7 @@ server_fini(bool force)
 		pl_fini();
 		daos_hhash_fini();
 	}
+	D_FREE(numa_info);
 	D_INFO("daos_fini() or pl_fini() done\n");
 	crt_finalize();
 	D_INFO("crt_finalize() done\n");
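
The multi-socket branch added to dss_topo_init() above reduces to one arithmetic rule: the target pool and the helper pool must both divide evenly across the NUMA nodes, and each socket then receives a fixed-size block of each. A standalone sketch of that rule follows; split_per_numa() is a hypothetical helper and the counts fed to it are made up for illustration, not DAOS defaults.

#include <stdio.h>

/* Mirrors the divisibility checks in dss_topo_init(); returns -1 on a ragged split. */
static int
split_per_numa(unsigned int tgt_nr, unsigned int offload_xs_nr, unsigned int numa_node_nr,
	       unsigned int *tgt_per_numa, unsigned int *offload_per_numa)
{
	if ((tgt_nr % numa_node_nr) != 0 || (offload_xs_nr % numa_node_nr) != 0)
		return -1;
	*tgt_per_numa = tgt_nr / numa_node_nr;
	*offload_per_numa = offload_xs_nr / numa_node_nr;
	return 0;
}

int
main(void)
{
	unsigned int tgt_per, offload_per;

	/* 16 targets and 4 helpers over 2 sockets -> 8 targets and 2 helpers each */
	if (split_per_numa(16, 4, 2, &tgt_per, &offload_per) == 0)
		printf("target_per=%u offload_per=%u\n", tgt_per, offload_per);
	return 0;
}

With those inputs it prints target_per=8 offload_per=2, the same numbers the multi-socket D_PRINT above would report.
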
" "Must be no larger than %d\n", numa_node_nr); return -DER_INVAL; } - numa_obj = hwloc_get_obj_by_depth(dss_topo, depth, dss_numa_node); - if (numa_obj == NULL) { - D_ERROR("NUMA node %d was not found in the topology", - dss_numa_node); - return -DER_INVAL; - } + D_ALLOC_ARRAY(numa_info, numa_node_nr); + if (numa_info == NULL) + return -DER_NOMEM; - /* create an empty bitmap, then set each bit as we */ - /* find a core that matches */ - core_allocation_bitmap = hwloc_bitmap_alloc(); - if (core_allocation_bitmap == NULL) { - D_ERROR("Unable to allocate core allocation bitmap\n"); - return -DER_INVAL; - } + for (i = 0; i < numa_node_nr; i++) { + hwloc_obj_t numa_obj; + numa_info[i].ni_idx = i; + numa_obj = numa_info[i].ni_obj = hwloc_get_obj_by_depth(dss_topo, depth, 0); + if (numa_obj == NULL) { + D_ERROR("NUMA node %d was not found in the topology", i); + D_GOTO(failed, rc = -DER_INVAL); + } - dss_num_cores_numa_node = 0; - num_cores_visited = 0; + /* create an empty bitmap, then set each bit as we */ + /* find a core that matches */ + numa_info[i].ni_core_allocation_bitmap = hwloc_bitmap_alloc(); + if (numa_info[i].ni_core_allocation_bitmap == NULL) { + D_ERROR("Unable to allocate core allocation bitmap\n"); + D_GOTO(failed, rc = -DER_INVAL); + } - for (k = 0; k < dss_core_nr; k++) { - corenode = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, k); - if (corenode == NULL) - continue; - if (hwloc_bitmap_isincluded(corenode->cpuset, - numa_obj->cpuset) != 0) { - if (num_cores_visited++ >= dss_core_offset) { - hwloc_bitmap_set(core_allocation_bitmap, k); - hwloc_bitmap_asprintf(&cpuset, - corenode->cpuset); + numa_info[i].ni_core_nr = 0; + num_cores_visited = 0; + + for (k = 0; k < dss_core_nr; k++) { + corenode = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, k); + if (corenode == NULL) + continue; + if (hwloc_bitmap_isincluded(corenode->cpuset, numa_obj->cpuset) != 0) { + if (num_cores_visited++ >= dss_core_offset) { + hwloc_bitmap_set(numa_info[i].ni_core_allocation_bitmap, k); + hwloc_bitmap_asprintf(&cpuset, corenode->cpuset); + } + numa_info[i].ni_core_nr++; } - dss_num_cores_numa_node++; } - } - hwloc_bitmap_asprintf(&cpuset, core_allocation_bitmap); - free(cpuset); - - dss_tgt_nr = dss_tgt_nr_get(dss_num_cores_numa_node, nr_threads, - tgt_oversub); - if (dss_core_offset >= dss_num_cores_numa_node) { - D_ERROR("invalid dss_core_offset %d (set by \"-f\" option), " - "should within range [0, %d]", dss_core_offset, - dss_num_cores_numa_node - 1); - return -DER_INVAL; + hwloc_bitmap_asprintf(&cpuset, numa_info[i].ni_core_allocation_bitmap); + free(cpuset); + + if (i == dss_numa_node) { + dss_tgt_nr = + dss_tgt_nr_get(numa_info[i].ni_core_nr, nr_threads, tgt_oversub); + if (dss_core_offset >= numa_info[i].ni_core_nr) { + D_ERROR("invalid dss_core_offset %d (set by \"-f\" option), " + "should within range [0, %d]", + dss_core_offset, numa_info[i].ni_core_nr - 1); + D_GOTO(failed, rc = -DER_INVAL); + } + D_PRINT("Using NUMA core allocation algorithm\n"); + } } - D_PRINT("Using NUMA core allocation algorithm\n"); return 0; +failed: + D_FREE(numa_info); + return rc; } static ABT_mutex server_init_state_mutex; @@ -825,7 +857,7 @@ server_init(int argc, char *argv[]) DAOS_VERSION, getpid(), dss_self_rank(), dss_tgt_nr, dss_tgt_offload_xs_nr, dss_core_offset, dss_hostname); - if (numa_obj) + if (numa_info && dss_numa_node != -1) D_PRINT("Using NUMA node: %d", dss_numa_node); return 0; @@ -904,6 +936,7 @@ server_fini(bool force) pl_fini(); daos_hhash_fini(); } + D_FREE(numa_info); 
D_INFO("daos_fini() or pl_fini() done\n"); crt_finalize(); D_INFO("crt_finalize() done\n"); diff --git a/src/engine/srv.c b/src/engine/srv.c index 986d8ed04c4..0d71eddb1ed 100644 --- a/src/engine/srv.c +++ b/src/engine/srv.c @@ -75,10 +75,18 @@ #define DRPC_XS_NR (1) /** Number of offload XS */ unsigned int dss_tgt_offload_xs_nr; +/** Number of offload per socket */ +unsigned int dss_tgt_offload_per_numa_xs_nr; +/** Number of target per socket */ +unsigned int dss_tgt_per_numa_nr; /** Number of target (XS set) per engine */ unsigned int dss_tgt_nr; /** Number of system XS */ unsigned int dss_sys_xs_nr = DAOS_TGT0_OFFSET + DRPC_XS_NR; +/** Normally set to 1. In "multi-socket" mode, will be the number of + * numa nodes. + */ +unsigned int dss_numa_nr = 1; /** * Flag of helper XS as a pool. * false - the helper XS is near its main IO service XS. When there is one or @@ -965,14 +973,41 @@ dss_start_xs_id(int tag, int xs_id) { hwloc_obj_t obj; int rc; + int tgt; int xs_core_offset; - unsigned idx; + unsigned int idx; char *cpuset; + struct dss_numa_info *ninfo; + bool clear = false; D_DEBUG(DB_TRACE, "start xs_id called for %d. ", xs_id); /* if we are NUMA aware, use the NUMA information */ - if (numa_obj) { - idx = hwloc_bitmap_first(core_allocation_bitmap); + if (numa_info) { + if (dss_numa_node == -1) { + tgt = dss_xs2tgt(xs_id); + if (xs_id == 1) { + D_INFO("Swim\n"); + ninfo = &numa_info[1]; + } else if (tgt != -1) { + D_INFO("target #%d\n", tgt); + ninfo = &numa_info[tgt / dss_tgt_per_numa_nr]; + } else if (xs_id > 2) { + tgt = xs_id - dss_sys_xs_nr - dss_tgt_nr; + D_INFO("offload #%d\n", tgt); + ninfo = &numa_info[tgt / dss_tgt_offload_per_numa_xs_nr]; + } else { + D_INFO("system %d\n", xs_id); + ninfo = &numa_info[0]; + } + if (xs_id != 0) + clear = true; + } else { + ninfo = &numa_info[dss_numa_node]; + if (xs_id > 1 || (xs_id == 0 && dss_core_nr > dss_tgt_nr)) + clear = true; + } + + idx = hwloc_bitmap_first(ninfo->ni_core_allocation_bitmap); if (idx == -1) { D_ERROR("No core available for XS: %d", xs_id); return -DER_INVAL; @@ -983,8 +1018,8 @@ dss_start_xs_id(int tag, int xs_id) * All system XS will reuse the first XS' core, but * the SWIM and DRPC XS will use separate core if enough cores */ - if (xs_id > 1 || (xs_id == 0 && dss_core_nr > dss_tgt_nr)) - hwloc_bitmap_clr(core_allocation_bitmap, idx); + if (clear) + hwloc_bitmap_clr(ninfo->ni_core_allocation_bitmap, idx); obj = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, idx); if (obj == NULL) { @@ -993,7 +1028,7 @@ dss_start_xs_id(int tag, int xs_id) } hwloc_bitmap_asprintf(&cpuset, obj->cpuset); - D_DEBUG(DB_TRACE, "Using CPU set %s\n", cpuset); + D_DEBUG(DB_TRACE, "Using CPU set (numa %d) %s\n", ninfo->ni_idx, cpuset); free(cpuset); } else { D_DEBUG(DB_TRACE, "Using non-NUMA aware core allocation\n"); @@ -1076,9 +1111,8 @@ dss_xstreams_init(void) dss_core_nr, dss_tgt_nr); if (dss_numa_node != -1) { - D_DEBUG(DB_TRACE, - "Detected %d cores on NUMA node %d\n", - dss_num_cores_numa_node, dss_numa_node); + D_DEBUG(DB_TRACE, "Detected %d cores on NUMA node %d\n", + numa_info[dss_numa_node].ni_core_nr, dss_numa_node); } xstream_data.xd_xs_nr = DSS_XS_NR_TOTAL; diff --git a/src/engine/srv_internal.h b/src/engine/srv_internal.h index 8621175b44f..d641dc58a2c 100644 --- a/src/engine/srv_internal.h +++ b/src/engine/srv_internal.h @@ -132,18 +132,30 @@ extern int dss_core_nr; extern unsigned int dss_core_offset; /** NUMA node to bind to */ extern int dss_numa_node; -/** bitmap describing core allocation */ -extern hwloc_bitmap_t 
diff --git a/src/engine/ult.c b/src/engine/ult.c
index 47c3b504f8d..6a36e5b1721 100644
--- a/src/engine/ult.c
+++ b/src/engine/ult.c
@@ -324,6 +324,33 @@ dss_thread_collective(int (*func)(void *), void *arg, unsigned int flags)
 
 /* ============== ULT create functions =================================== */
 
+static inline int
+sched_ult2xs_multisocket(int xs_type, int tgt_id)
+{
+	int socket;
+
+	/** Keep it simple for now. Only support fewer helper threads than
+	 * io threads */
+	D_ASSERTF(dss_tgt_offload_xs_nr < dss_tgt_nr,
+		  "Must have fewer helper threads than targets in multi-socket mode\n");
+	switch (xs_type) {
+	case DSS_XS_IOFW:
+		/** Fall through */
+	case DSS_XS_OFFLOAD:
+		/* No helper threads */
+		if (dss_tgt_offload_xs_nr == 0)
+			return DSS_XS_SELF;
+		socket = tgt_id / dss_tgt_per_numa_nr;
+		/** tgt and offload xstreams are split among sockets evenly */
+		return dss_sys_xs_nr + dss_tgt_nr + (socket * dss_tgt_offload_per_numa_xs_nr) +
+		       tgt_id % dss_tgt_offload_per_numa_xs_nr;
+	default:
+		D_ASSERT(0);
+	}
+
+	return DSS_XS_SELF;
+}
+
 static inline int
 sched_ult2xs(int xs_type, int tgt_id)
 {
@@ -341,6 +368,10 @@ sched_ult2xs(int xs_type, int tgt_id)
 	case DSS_XS_DRPC:
 		return 2;
 	case DSS_XS_IOFW:
+		if (dss_numa_nr > 1) {
+			xs_id = sched_ult2xs_multisocket(xs_type, tgt_id);
+			break;
+		}
 		if (!dss_helper_pool) {
 			if (dss_tgt_offload_xs_nr > 0)
 				xs_id = DSS_MAIN_XS_ID(tgt_id) + 1;
@@ -379,6 +410,10 @@ sched_ult2xs(int xs_type, int tgt_id)
 			xs_id = (DSS_MAIN_XS_ID(tgt_id) + 1) % dss_tgt_nr;
 		break;
 	case DSS_XS_OFFLOAD:
+		if (dss_numa_nr > 1) {
+			xs_id = sched_ult2xs_multisocket(xs_type, tgt_id);
+			break;
+		}
 		if (!dss_helper_pool) {
 			if (dss_tgt_offload_xs_nr > 0)
 				xs_id = DSS_MAIN_XS_ID(tgt_id) + dss_tgt_offload_xs_nr / dss_tgt_nr;
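
The index arithmetic in sched_ult2xs_multisocket() is easiest to sanity-check outside the engine. This standalone sketch repeats it for an example geometry (3 system XS, 8 targets, 2 sockets, 2 helpers per socket; illustrative values, not DAOS defaults): helper xstreams sit after all the targets in per-socket blocks, and a target is always forwarded to a helper on its own socket.

#include <stdio.h>

enum { SYS_XS_NR = 3, TGT_NR = 8, NUMA_NR = 2 };
enum { TGT_PER_NUMA = TGT_NR / NUMA_NR, OFFLOAD_PER_NUMA = 2 };

/* Helper XS for tgt_id: stay on the target's socket, round-robin within it. */
static int
helper_xs(int tgt_id)
{
	int socket = tgt_id / TGT_PER_NUMA;

	return SYS_XS_NR + TGT_NR + socket * OFFLOAD_PER_NUMA + tgt_id % OFFLOAD_PER_NUMA;
}

int
main(void)
{
	int tgt_id;

	for (tgt_id = 0; tgt_id < TGT_NR; tgt_id++)
		printf("tgt %d -> helper xs %d\n", tgt_id, helper_xs(tgt_id));
	return 0;
}

With this geometry targets 0-3 alternate between helpers 11 and 12 while targets 4-7 alternate between 13 and 14, so offloaded work never crosses a socket boundary.
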