Skip to content

Commit

Permalink
DAOS-XXX engine: Add single engine multi-socket mode
Browse files Browse the repository at this point in the history
For simplicity, target and helper count must be a multiple
of the number of sockets and there must be fewer helpers
than targets.

Let K be the number of xstreams of each type that maps to a single NUMA
node. Xstreams are assigned in order: the first K go to the first node,
the next K to the second node, and so on.

Enabled with DAOS_MULTISOCKET=1

If not enabled, it should work as it did before.

Required-githooks: true

Change-Id: Ia7801289f4cc9e2cca4649d5132a1a5c0ea2f299
Signed-off-by: Jeff Olivier <jeffolivier@google.com>
  • Loading branch information
jolivier23 committed Apr 22, 2024
1 parent 98c953a commit 15a116e
Show file tree
Hide file tree
Showing 4 changed files with 176 additions and 62 deletions.
127 changes: 80 additions & 47 deletions src/engine/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,9 @@ int dss_core_nr;
unsigned int dss_core_offset;
/** NUMA node to bind to */
int dss_numa_node = -1;
hwloc_bitmap_t core_allocation_bitmap;
/** a copy of the NUMA node object in the topology */
hwloc_obj_t numa_obj;
/** number of cores in the given NUMA node */
int dss_num_cores_numa_node;
/** Cached numa information */
struct dss_numa_info *numa_info;

/** Module facility bitmask */
static uint64_t dss_mod_facs;
/** Number of storage tiers: 2 for SCM and NVMe */
Expand Down Expand Up @@ -306,8 +304,11 @@ dss_topo_init()
int num_cores_visited;
char *cpuset;
int k;
int i;
int rc = 0;
hwloc_obj_t corenode;
bool tgt_oversub = false;
bool multi_socket = false;

hwloc_topology_init(&dss_topo);
hwloc_topology_load(dss_topo);
Expand All @@ -317,11 +318,14 @@ dss_topo_init()
depth = hwloc_get_type_depth(dss_topo, HWLOC_OBJ_NUMANODE);
numa_node_nr = hwloc_get_nbobjs_by_depth(dss_topo, depth);
d_getenv_bool("DAOS_TARGET_OVERSUBSCRIBE", &tgt_oversub);
d_getenv_bool("DAOS_MULTISOCKET", &multi_socket);

if (multi_socket && numa_node_nr > 1)
dss_numa_nr = numa_node_nr;

/* if no NUMA node was specified, or NUMA data unavailable */
/* fall back to the legacy core allocation algorithm */
if (dss_numa_node == -1 || numa_node_nr <= 0) {
D_PRINT("Using legacy core allocation algorithm\n");
dss_tgt_nr = dss_tgt_nr_get(dss_core_nr, nr_threads,
tgt_oversub);

Expand All @@ -332,62 +336,90 @@ dss_topo_init()
dss_core_offset, dss_core_nr - 1);
return -DER_INVAL;
}
return 0;

if (dss_numa_nr == 1) {
D_PRINT("Using legacy core allocation algorithm\n");
return 0;
}

if ((dss_tgt_offload_xs_nr % numa_node_nr) != 0) {
D_ERROR("helper count must be evenly divisible by numa count\n");
return -DER_INVAL;
}
if ((dss_tgt_nr % numa_node_nr) != 0) {
D_ERROR("tgt count must be evenly divisible by numa count\n");
return -DER_INVAL;
}
dss_tgt_offload_per_numa_xs_nr = dss_tgt_offload_xs_nr / numa_node_nr;
dss_tgt_per_numa_nr = dss_tgt_nr / numa_node_nr;
D_PRINT("Using multi-socket core allocation algorithm nr=%d target_per=%d "
"offload_per=%d\n",
numa_node_nr, dss_tgt_per_numa_nr, dss_tgt_offload_per_numa_xs_nr);
}

if (dss_numa_node > numa_node_nr) {
if (!multi_socket && dss_numa_node > numa_node_nr) {
D_ERROR("Invalid NUMA node selected. "
"Must be no larger than %d\n",
numa_node_nr);
return -DER_INVAL;
}

numa_obj = hwloc_get_obj_by_depth(dss_topo, depth, dss_numa_node);
if (numa_obj == NULL) {
D_ERROR("NUMA node %d was not found in the topology",
dss_numa_node);
return -DER_INVAL;
}
D_ALLOC_ARRAY(numa_info, numa_node_nr);
if (numa_info == NULL)
return -DER_NOMEM;

/* create an empty bitmap, then set each bit as we */
/* find a core that matches */
core_allocation_bitmap = hwloc_bitmap_alloc();
if (core_allocation_bitmap == NULL) {
D_ERROR("Unable to allocate core allocation bitmap\n");
return -DER_INVAL;
}
for (i = 0; i < numa_node_nr; i++) {
hwloc_obj_t numa_obj;
numa_info[i].ni_idx = i;
numa_obj = numa_info[i].ni_obj = hwloc_get_obj_by_depth(dss_topo, depth, 0);
if (numa_obj == NULL) {
D_ERROR("NUMA node %d was not found in the topology", i);
D_GOTO(failed, rc = -DER_INVAL);
}

dss_num_cores_numa_node = 0;
num_cores_visited = 0;
/* create an empty bitmap, then set each bit as we */
/* find a core that matches */
numa_info[i].ni_core_allocation_bitmap = hwloc_bitmap_alloc();
if (numa_info[i].ni_core_allocation_bitmap == NULL) {
D_ERROR("Unable to allocate core allocation bitmap\n");
D_GOTO(failed, rc = -DER_INVAL);
}

for (k = 0; k < dss_core_nr; k++) {
corenode = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, k);
if (corenode == NULL)
continue;
if (hwloc_bitmap_isincluded(corenode->cpuset,
numa_obj->cpuset) != 0) {
if (num_cores_visited++ >= dss_core_offset) {
hwloc_bitmap_set(core_allocation_bitmap, k);
hwloc_bitmap_asprintf(&cpuset,
corenode->cpuset);
numa_info[i].ni_core_nr = 0;
num_cores_visited = 0;

for (k = 0; k < dss_core_nr; k++) {
corenode = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, k);
if (corenode == NULL)
continue;
if (hwloc_bitmap_isincluded(corenode->cpuset, numa_obj->cpuset) != 0) {
if (num_cores_visited++ >= dss_core_offset) {
hwloc_bitmap_set(numa_info[i].ni_core_allocation_bitmap, k);
hwloc_bitmap_asprintf(&cpuset, corenode->cpuset);
}
numa_info[i].ni_core_nr++;
}
dss_num_cores_numa_node++;
}
}
hwloc_bitmap_asprintf(&cpuset, core_allocation_bitmap);
free(cpuset);

dss_tgt_nr = dss_tgt_nr_get(dss_num_cores_numa_node, nr_threads,
tgt_oversub);
if (dss_core_offset >= dss_num_cores_numa_node) {
D_ERROR("invalid dss_core_offset %d (set by \"-f\" option), "
"should within range [0, %d]", dss_core_offset,
dss_num_cores_numa_node - 1);
return -DER_INVAL;
hwloc_bitmap_asprintf(&cpuset, numa_info[i].ni_core_allocation_bitmap);
free(cpuset);

if (i == dss_numa_node) {
dss_tgt_nr =
dss_tgt_nr_get(numa_info[i].ni_core_nr, nr_threads, tgt_oversub);
if (dss_core_offset >= numa_info[i].ni_core_nr) {
D_ERROR("invalid dss_core_offset %d (set by \"-f\" option), "
"should within range [0, %d]",
dss_core_offset, numa_info[i].ni_core_nr - 1);
D_GOTO(failed, rc = -DER_INVAL);
}
D_PRINT("Using NUMA core allocation algorithm\n");
}
}

D_PRINT("Using NUMA core allocation algorithm\n");
return 0;
failed:
D_FREE(numa_info);
return rc;
}

static ABT_mutex server_init_state_mutex;
Expand Down Expand Up @@ -825,7 +857,7 @@ server_init(int argc, char *argv[])
DAOS_VERSION, getpid(), dss_self_rank(), dss_tgt_nr,
dss_tgt_offload_xs_nr, dss_core_offset, dss_hostname);

if (numa_obj)
if (numa_info && dss_numa_node != -1)
D_PRINT("Using NUMA node: %d", dss_numa_node);

return 0;
Expand Down Expand Up @@ -904,6 +936,7 @@ server_fini(bool force)
pl_fini();
daos_hhash_fini();
}
D_FREE(numa_info);
D_INFO("daos_fini() or pl_fini() done\n");
crt_finalize();
D_INFO("crt_finalize() done\n");
Expand Down
52 changes: 43 additions & 9 deletions src/engine/srv.c
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,18 @@
#define DRPC_XS_NR (1)
/** Number of offload XS */
unsigned int dss_tgt_offload_xs_nr;
/** Number of offload per socket */
unsigned int dss_tgt_offload_per_numa_xs_nr;
/** Number of target per socket */
unsigned int dss_tgt_per_numa_nr;
/** Number of target (XS set) per engine */
unsigned int dss_tgt_nr;
/** Number of system XS */
unsigned int dss_sys_xs_nr = DAOS_TGT0_OFFSET + DRPC_XS_NR;
/** Normally set to 1. In "multi-socket" mode, will be the number of
* numa nodes.
*/
unsigned int dss_numa_nr = 1;
/**
* Flag of helper XS as a pool.
* false - the helper XS is near its main IO service XS. When there is one or
Expand Down Expand Up @@ -965,14 +973,41 @@ dss_start_xs_id(int tag, int xs_id)
{
hwloc_obj_t obj;
int rc;
int tgt;
int xs_core_offset;
unsigned idx;
unsigned int idx;
char *cpuset;
struct dss_numa_info *ninfo;
bool clear = false;

D_DEBUG(DB_TRACE, "start xs_id called for %d. ", xs_id);
/* if we are NUMA aware, use the NUMA information */
if (numa_obj) {
idx = hwloc_bitmap_first(core_allocation_bitmap);
if (numa_info) {
if (dss_numa_node == -1) {
tgt = dss_xs2tgt(xs_id);
if (xs_id == 1) {
D_INFO("Swim\n");
ninfo = &numa_info[1];
} else if (tgt != -1) {
D_INFO("target #%d\n", tgt);
ninfo = &numa_info[tgt / dss_tgt_per_numa_nr];
} else if (xs_id > 2) {
tgt = xs_id - dss_sys_xs_nr - dss_tgt_nr;
D_INFO("offload #%d\n", tgt);
ninfo = &numa_info[tgt / dss_tgt_offload_per_numa_xs_nr];
} else {
D_INFO("system %d\n", xs_id);
ninfo = &numa_info[0];
}
if (xs_id != 0)
clear = true;
} else {
ninfo = &numa_info[dss_numa_node];
if (xs_id > 1 || (xs_id == 0 && dss_core_nr > dss_tgt_nr))
clear = true;
}

idx = hwloc_bitmap_first(ninfo->ni_core_allocation_bitmap);
if (idx == -1) {
D_ERROR("No core available for XS: %d", xs_id);
return -DER_INVAL;
Expand All @@ -983,8 +1018,8 @@ dss_start_xs_id(int tag, int xs_id)
* All system XS will reuse the first XS' core, but
* the SWIM and DRPC XS will use separate core if enough cores
*/
if (xs_id > 1 || (xs_id == 0 && dss_core_nr > dss_tgt_nr))
hwloc_bitmap_clr(core_allocation_bitmap, idx);
if (clear)
hwloc_bitmap_clr(ninfo->ni_core_allocation_bitmap, idx);

obj = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, idx);
if (obj == NULL) {
Expand All @@ -993,7 +1028,7 @@ dss_start_xs_id(int tag, int xs_id)
}

hwloc_bitmap_asprintf(&cpuset, obj->cpuset);
D_DEBUG(DB_TRACE, "Using CPU set %s\n", cpuset);
D_DEBUG(DB_TRACE, "Using CPU set (numa %d) %s\n", ninfo->ni_idx, cpuset);
free(cpuset);
} else {
D_DEBUG(DB_TRACE, "Using non-NUMA aware core allocation\n");
Expand Down Expand Up @@ -1076,9 +1111,8 @@ dss_xstreams_init(void)
dss_core_nr, dss_tgt_nr);

if (dss_numa_node != -1) {
D_DEBUG(DB_TRACE,
"Detected %d cores on NUMA node %d\n",
dss_num_cores_numa_node, dss_numa_node);
D_DEBUG(DB_TRACE, "Detected %d cores on NUMA node %d\n",
numa_info[dss_numa_node].ni_core_nr, dss_numa_node);
}

xstream_data.xd_xs_nr = DSS_XS_NR_TOTAL;
Expand Down
24 changes: 18 additions & 6 deletions src/engine/srv_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,18 +132,30 @@ extern int dss_core_nr;
extern unsigned int dss_core_offset;
/** NUMA node to bind to */
extern int dss_numa_node;
/** bitmap describing core allocation */
extern hwloc_bitmap_t core_allocation_bitmap;
/** a copy of the NUMA node object in the topology */
extern hwloc_obj_t numa_obj;
/** number of cores in the given NUMA node */
extern int dss_num_cores_numa_node;
/**
 * Per-NUMA-node topology cache. One entry is kept for each NUMA node
 * (see the numa_info array declared below).
 */
struct dss_numa_info {
/** a copy of the NUMA node object in the topology */
hwloc_obj_t ni_obj;
/** numa index for this node */
int ni_idx;
/** number of cores in the given NUMA node */
unsigned int ni_core_nr;
/** Core allocation bitmap for this numa node: a bit is set for each
 * selectable core and cleared as xstreams claim cores at startup.
 */
hwloc_bitmap_t ni_core_allocation_bitmap;
};
/** Cached numa information */
extern struct dss_numa_info *numa_info;
/** Number of offload XS */
extern unsigned int dss_tgt_offload_xs_nr;
/** Number of offload XS per socket */
extern unsigned int dss_tgt_offload_per_numa_xs_nr;
/** Number of tgt XS per socket */
extern unsigned int dss_tgt_per_numa_nr;
/** number of system XS */
extern unsigned int dss_sys_xs_nr;
/** Flag of helper XS as a pool */
extern bool dss_helper_pool;
/** Number of numa nodes in multi-socket mode (always 1 otherwise) */
extern unsigned int dss_numa_nr;

/** Shadow dss_get_module_info */
struct dss_module_info *get_module_info(void);
Expand Down
35 changes: 35 additions & 0 deletions src/engine/ult.c
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,33 @@ dss_thread_collective(int (*func)(void *), void *arg, unsigned int flags)

/* ============== ULT create functions =================================== */

static inline int
sched_ult2xs_multisocket(int xs_type, int tgt_id)
{
int socket;

/** Keep it simple for now. Only support fewer helper threads than
* io threads */
D_ASSERTF(dss_tgt_offload_xs_nr < dss_tgt_nr,
"Must have fewer helper threads than targets in multi-socket mode");
switch (xs_type) {
case DSS_XS_IOFW:
/** Fall through */
case DSS_XS_OFFLOAD:
/* No helper threads */
if (dss_tgt_offload_xs_nr == 0)
return DSS_XS_SELF;
socket = tgt_id / dss_numa_nr;
/** tgt and offload xstreams are split among sockets evenly */
return dss_sys_xs_nr + dss_tgt_nr + (socket * dss_tgt_offload_per_numa_xs_nr) +
tgt_id % dss_tgt_offload_per_numa_xs_nr;
default:
D_ASSERT(0);
};

return DSS_XS_SELF;
}

static inline int
sched_ult2xs(int xs_type, int tgt_id)
{
Expand All @@ -341,6 +368,10 @@ sched_ult2xs(int xs_type, int tgt_id)
case DSS_XS_DRPC:
return 2;
case DSS_XS_IOFW:
if (dss_numa_nr > 1) {
xs_id = sched_ult2xs_multisocket(xs_type, tgt_id);
break;
}
if (!dss_helper_pool) {
if (dss_tgt_offload_xs_nr > 0)
xs_id = DSS_MAIN_XS_ID(tgt_id) + 1;
Expand Down Expand Up @@ -379,6 +410,10 @@ sched_ult2xs(int xs_type, int tgt_id)
xs_id = (DSS_MAIN_XS_ID(tgt_id) + 1) % dss_tgt_nr;
break;
case DSS_XS_OFFLOAD:
if (dss_numa_nr > 1) {
xs_id = sched_ult2xs_multisocket(xs_type, tgt_id);
break;
}
if (!dss_helper_pool) {
if (dss_tgt_offload_xs_nr > 0)
xs_id = DSS_MAIN_XS_ID(tgt_id) + dss_tgt_offload_xs_nr / dss_tgt_nr;
Expand Down

0 comments on commit 15a116e

Please sign in to comment.