Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DAOS-XXX engine: Add single engine multi-socket mode #14160

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 80 additions & 47 deletions src/engine/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,9 @@ int dss_core_nr;
unsigned int dss_core_offset;
/** NUMA node to bind to */
int dss_numa_node = -1;
hwloc_bitmap_t core_allocation_bitmap;
/** a copy of the NUMA node object in the topology */
hwloc_obj_t numa_obj;
/** number of cores in the given NUMA node */
int dss_num_cores_numa_node;
/** Cached numa information */
struct dss_numa_info *numa_info;

/** Module facility bitmask */
static uint64_t dss_mod_facs;
/** Number of storage tiers: 2 for SCM and NVMe */
Expand Down Expand Up @@ -306,8 +304,11 @@ dss_topo_init()
int num_cores_visited;
char *cpuset;
int k;
int i;
int rc = 0;
hwloc_obj_t corenode;
bool tgt_oversub = false;
bool multi_socket = false;

hwloc_topology_init(&dss_topo);
hwloc_topology_load(dss_topo);
Expand All @@ -317,11 +318,14 @@ dss_topo_init()
depth = hwloc_get_type_depth(dss_topo, HWLOC_OBJ_NUMANODE);
numa_node_nr = hwloc_get_nbobjs_by_depth(dss_topo, depth);
d_getenv_bool("DAOS_TARGET_OVERSUBSCRIBE", &tgt_oversub);
d_getenv_bool("DAOS_MULTISOCKET", &multi_socket);

if (multi_socket && numa_node_nr > 1)
dss_numa_nr = numa_node_nr;

/* if no NUMA node was specified, or NUMA data unavailable */
/* fall back to the legacy core allocation algorithm */
if (dss_numa_node == -1 || numa_node_nr <= 0) {
D_PRINT("Using legacy core allocation algorithm\n");
dss_tgt_nr = dss_tgt_nr_get(dss_core_nr, nr_threads,
tgt_oversub);

Expand All @@ -332,62 +336,90 @@ dss_topo_init()
dss_core_offset, dss_core_nr - 1);
return -DER_INVAL;
}
return 0;

if (dss_numa_nr == 1) {
D_PRINT("Using legacy core allocation algorithm\n");
return 0;
}

if ((dss_tgt_offload_xs_nr % numa_node_nr) != 0) {
D_ERROR("helper count must be evenly divisible by numa count\n");
return -DER_INVAL;
}
if ((dss_tgt_nr % numa_node_nr) != 0) {
D_ERROR("tgt count must be evenly divisible by numa count\n");
return -DER_INVAL;
}
dss_tgt_offload_per_numa_xs_nr = dss_tgt_offload_xs_nr / numa_node_nr;
dss_tgt_per_numa_nr = dss_tgt_nr / numa_node_nr;
D_PRINT("Using multi-socket core allocation algorithm nr=%d target_per=%d "
"offload_per=%d\n",
numa_node_nr, dss_tgt_per_numa_nr, dss_tgt_offload_per_numa_xs_nr);
}

if (dss_numa_node > numa_node_nr) {
if (!multi_socket && dss_numa_node > numa_node_nr) {
D_ERROR("Invalid NUMA node selected. "
"Must be no larger than %d\n",
numa_node_nr);
return -DER_INVAL;
}

numa_obj = hwloc_get_obj_by_depth(dss_topo, depth, dss_numa_node);
if (numa_obj == NULL) {
D_ERROR("NUMA node %d was not found in the topology",
dss_numa_node);
return -DER_INVAL;
}
D_ALLOC_ARRAY(numa_info, numa_node_nr);
if (numa_info == NULL)
return -DER_NOMEM;

/* create an empty bitmap, then set each bit as we */
/* find a core that matches */
core_allocation_bitmap = hwloc_bitmap_alloc();
if (core_allocation_bitmap == NULL) {
D_ERROR("Unable to allocate core allocation bitmap\n");
return -DER_INVAL;
}
for (i = 0; i < numa_node_nr; i++) {
hwloc_obj_t numa_obj;
numa_info[i].ni_idx = i;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
numa_info[i].ni_idx = i;
hwloc_obj_t numa_obj;

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
numa_info[i].ni_idx = i;
hwloc_obj_t numa_obj;

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
numa_info[i].ni_idx = i;
hwloc_obj_t numa_obj;

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
numa_info[i].ni_idx = i;
hwloc_obj_t numa_obj;

numa_obj = numa_info[i].ni_obj = hwloc_get_obj_by_depth(dss_topo, depth, 0);
if (numa_obj == NULL) {
D_ERROR("NUMA node %d was not found in the topology", i);
D_GOTO(failed, rc = -DER_INVAL);
}

dss_num_cores_numa_node = 0;
num_cores_visited = 0;
/* create an empty bitmap, then set each bit as we */
/* find a core that matches */
numa_info[i].ni_core_allocation_bitmap = hwloc_bitmap_alloc();
if (numa_info[i].ni_core_allocation_bitmap == NULL) {
D_ERROR("Unable to allocate core allocation bitmap\n");
D_GOTO(failed, rc = -DER_INVAL);
}

for (k = 0; k < dss_core_nr; k++) {
corenode = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, k);
if (corenode == NULL)
continue;
if (hwloc_bitmap_isincluded(corenode->cpuset,
numa_obj->cpuset) != 0) {
if (num_cores_visited++ >= dss_core_offset) {
hwloc_bitmap_set(core_allocation_bitmap, k);
hwloc_bitmap_asprintf(&cpuset,
corenode->cpuset);
numa_info[i].ni_core_nr = 0;
num_cores_visited = 0;

for (k = 0; k < dss_core_nr; k++) {
corenode = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, k);
if (corenode == NULL)
continue;
if (hwloc_bitmap_isincluded(corenode->cpuset, numa_obj->cpuset) != 0) {
if (num_cores_visited++ >= dss_core_offset) {
hwloc_bitmap_set(numa_info[i].ni_core_allocation_bitmap, k);
hwloc_bitmap_asprintf(&cpuset, corenode->cpuset);
}
numa_info[i].ni_core_nr++;
}
dss_num_cores_numa_node++;
}
}
hwloc_bitmap_asprintf(&cpuset, core_allocation_bitmap);
free(cpuset);

dss_tgt_nr = dss_tgt_nr_get(dss_num_cores_numa_node, nr_threads,
tgt_oversub);
if (dss_core_offset >= dss_num_cores_numa_node) {
D_ERROR("invalid dss_core_offset %d (set by \"-f\" option), "
"should within range [0, %d]", dss_core_offset,
dss_num_cores_numa_node - 1);
return -DER_INVAL;
hwloc_bitmap_asprintf(&cpuset, numa_info[i].ni_core_allocation_bitmap);
free(cpuset);

if (i == dss_numa_node) {
dss_tgt_nr =
dss_tgt_nr_get(numa_info[i].ni_core_nr, nr_threads, tgt_oversub);
if (dss_core_offset >= numa_info[i].ni_core_nr) {
D_ERROR("invalid dss_core_offset %d (set by \"-f\" option), "
"should within range [0, %d]",
dss_core_offset, numa_info[i].ni_core_nr - 1);
D_GOTO(failed, rc = -DER_INVAL);
}
D_PRINT("Using NUMA core allocation algorithm\n");
}
}

D_PRINT("Using NUMA core allocation algorithm\n");
return 0;
failed:
D_FREE(numa_info);
return rc;
}

static ABT_mutex server_init_state_mutex;
Expand Down Expand Up @@ -825,7 +857,7 @@ server_init(int argc, char *argv[])
DAOS_VERSION, getpid(), dss_self_rank(), dss_tgt_nr,
dss_tgt_offload_xs_nr, dss_core_offset, dss_hostname);

if (numa_obj)
if (numa_info && dss_numa_node != -1)
D_PRINT("Using NUMA node: %d", dss_numa_node);

return 0;
Expand Down Expand Up @@ -904,6 +936,7 @@ server_fini(bool force)
pl_fini();
daos_hhash_fini();
}
D_FREE(numa_info);
D_INFO("daos_fini() or pl_fini() done\n");
crt_finalize();
D_INFO("crt_finalize() done\n");
Expand Down
52 changes: 43 additions & 9 deletions src/engine/srv.c
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,18 @@
#define DRPC_XS_NR (1)
/** Number of offload XS */
unsigned int dss_tgt_offload_xs_nr;
/** Number of offload XS per socket (NUMA node) in multi-socket mode */
unsigned int dss_tgt_offload_per_numa_xs_nr;
/** Number of targets per socket (NUMA node) in multi-socket mode */
unsigned int dss_tgt_per_numa_nr;
/** Number of target (XS set) per engine */
unsigned int dss_tgt_nr;
/** Number of system XS */
unsigned int dss_sys_xs_nr = DAOS_TGT0_OFFSET + DRPC_XS_NR;
/** Normally set to 1. In "multi-socket" mode, will be the number of
* numa nodes.
*/
unsigned int dss_numa_nr = 1;
/**
* Flag of helper XS as a pool.
* false - the helper XS is near its main IO service XS. When there is one or
Expand Down Expand Up @@ -965,14 +973,41 @@ dss_start_xs_id(int tag, int xs_id)
{
hwloc_obj_t obj;
int rc;
int tgt;
int xs_core_offset;
unsigned idx;
unsigned int idx;
char *cpuset;
struct dss_numa_info *ninfo;
bool clear = false;

D_DEBUG(DB_TRACE, "start xs_id called for %d. ", xs_id);
/* if we are NUMA aware, use the NUMA information */
if (numa_obj) {
idx = hwloc_bitmap_first(core_allocation_bitmap);
if (numa_info) {
if (dss_numa_node == -1) {
tgt = dss_xs2tgt(xs_id);
if (xs_id == 1) {
D_INFO("Swim\n");
ninfo = &numa_info[1];
} else if (tgt != -1) {
D_INFO("target #%d\n", tgt);
ninfo = &numa_info[tgt / dss_tgt_per_numa_nr];
} else if (xs_id > 2) {
tgt = xs_id - dss_sys_xs_nr - dss_tgt_nr;
D_INFO("offload #%d\n", tgt);
ninfo = &numa_info[tgt / dss_tgt_offload_per_numa_xs_nr];
} else {
D_INFO("system %d\n", xs_id);
ninfo = &numa_info[0];
}
if (xs_id != 0)
clear = true;
} else {
ninfo = &numa_info[dss_numa_node];
if (xs_id > 1 || (xs_id == 0 && dss_core_nr > dss_tgt_nr))
clear = true;
}

idx = hwloc_bitmap_first(ninfo->ni_core_allocation_bitmap);
if (idx == -1) {
D_ERROR("No core available for XS: %d", xs_id);
return -DER_INVAL;
Expand All @@ -983,8 +1018,8 @@ dss_start_xs_id(int tag, int xs_id)
* All system XS will reuse the first XS' core, but
* the SWIM and DRPC XS will use separate core if enough cores
*/
if (xs_id > 1 || (xs_id == 0 && dss_core_nr > dss_tgt_nr))
hwloc_bitmap_clr(core_allocation_bitmap, idx);
if (clear)
hwloc_bitmap_clr(ninfo->ni_core_allocation_bitmap, idx);

obj = hwloc_get_obj_by_depth(dss_topo, dss_core_depth, idx);
if (obj == NULL) {
Expand All @@ -993,7 +1028,7 @@ dss_start_xs_id(int tag, int xs_id)
}

hwloc_bitmap_asprintf(&cpuset, obj->cpuset);
D_DEBUG(DB_TRACE, "Using CPU set %s\n", cpuset);
D_DEBUG(DB_TRACE, "Using CPU set (numa %d) %s\n", ninfo->ni_idx, cpuset);
free(cpuset);
} else {
D_DEBUG(DB_TRACE, "Using non-NUMA aware core allocation\n");
Expand Down Expand Up @@ -1076,9 +1111,8 @@ dss_xstreams_init(void)
dss_core_nr, dss_tgt_nr);

if (dss_numa_node != -1) {
D_DEBUG(DB_TRACE,
"Detected %d cores on NUMA node %d\n",
dss_num_cores_numa_node, dss_numa_node);
D_DEBUG(DB_TRACE, "Detected %d cores on NUMA node %d\n",
numa_info[dss_numa_node].ni_core_nr, dss_numa_node);
}

xstream_data.xd_xs_nr = DSS_XS_NR_TOTAL;
Expand Down
24 changes: 18 additions & 6 deletions src/engine/srv_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -132,18 +132,30 @@ extern int dss_core_nr;
extern unsigned int dss_core_offset;
/** NUMA node to bind to */
extern int dss_numa_node;
/** bitmap describing core allocation */
extern hwloc_bitmap_t core_allocation_bitmap;
/** a copy of the NUMA node object in the topology */
extern hwloc_obj_t numa_obj;
/** number of cores in the given NUMA node */
extern int dss_num_cores_numa_node;
/**
 * Cached per-NUMA-node information. The engine keeps an array of these,
 * one entry per NUMA node found in the hwloc topology; each entry tracks
 * which cores of that node can still be handed out to xstreams.
 */
struct dss_numa_info {
/** hwloc object for this NUMA node, as looked up in the engine topology */
hwloc_obj_t ni_obj;
/** index of this NUMA node, i.e. its position in the numa_info array */
int ni_idx;
/** number of cores whose cpuset is included in this NUMA node's cpuset */
unsigned int ni_core_nr;
/**
 * Core allocation bitmap for this NUMA node: a bit is set for every core
 * of the node that is available for xstream placement and is cleared
 * once an xstream takes exclusive use of that core.
 */
hwloc_bitmap_t ni_core_allocation_bitmap;
};
/** Cached numa information */
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
/** Cached numa information */
};

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
/** Cached numa information */
};

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
/** Cached numa information */
};

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
/** Cached numa information */
};

extern struct dss_numa_info *numa_info;
/** Number of offload XS */
extern unsigned int dss_tgt_offload_xs_nr;
/** Number of offload XS per socket */
extern unsigned int dss_tgt_offload_per_numa_xs_nr;
/** Number of tgt XS per socket */
extern unsigned int dss_tgt_per_numa_nr;
/** number of system XS */
extern unsigned int dss_sys_xs_nr;
/** Flag of helper XS as a pool */
extern bool dss_helper_pool;
/** Number of numa nodes in multi-socket mode (always 1 otherwise) */
extern unsigned int dss_numa_nr;

/** Shadow dss_get_module_info */
struct dss_module_info *get_module_info(void);
Expand Down
35 changes: 35 additions & 0 deletions src/engine/ult.c
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,33 @@ dss_thread_collective(int (*func)(void *), void *arg, unsigned int flags)

/* ============== ULT create functions =================================== */

/**
 * Map a ULT of type \a xs_type created on behalf of target \a tgt_id to the
 * helper (offload) xstream it should run on when the engine is in
 * multi-socket mode (dss_numa_nr > 1).
 *
 * Targets and helper xstreams are both split evenly among the sockets, so a
 * ULT is always routed to a helper that lives on the same socket as its
 * target. Helper xstream IDs start right after the system and target
 * xstreams and are grouped by socket.
 *
 * \param[in] xs_type	xstream type, only DSS_XS_IOFW and DSS_XS_OFFLOAD are
 *			supported here
 * \param[in] tgt_id	ID of the target the ULT belongs to
 *
 * \return		xstream ID of the selected helper, or DSS_XS_SELF when
 *			no helper xstreams are configured.
 */
static inline int
sched_ult2xs_multisocket(int xs_type, int tgt_id)
{
	unsigned int	tgt_per_socket;
	unsigned int	socket;

	/** Keep it simple for now. Only support fewer helper threads than
	 * io threads */
	D_ASSERTF(dss_tgt_offload_xs_nr < dss_tgt_nr,
		  "Must have fewer helper threads than targets in multi-socket mode");
	switch (xs_type) {
	case DSS_XS_IOFW:
		/** Fall through */
	case DSS_XS_OFFLOAD:
		/* No helper threads */
		if (dss_tgt_offload_xs_nr == 0)
			return DSS_XS_SELF;
		/*
		 * The socket of a target is its ID divided by the number of
		 * targets per socket (not by the number of sockets), which
		 * matches how targets are bound to NUMA nodes at xstream
		 * start. Dividing by the socket count would yield a socket
		 * index past the last socket for high target IDs.
		 */
		tgt_per_socket = dss_tgt_nr / dss_numa_nr;
		D_ASSERT(tgt_per_socket > 0);
		socket = tgt_id / tgt_per_socket;
		/** tgt and offload xstreams are split among sockets evenly */
		return dss_sys_xs_nr + dss_tgt_nr + (socket * dss_tgt_offload_per_numa_xs_nr) +
		       tgt_id % dss_tgt_offload_per_numa_xs_nr;
	default:
		D_ASSERT(0);
	}

	return DSS_XS_SELF;
}

static inline int
sched_ult2xs(int xs_type, int tgt_id)
{
Expand All @@ -341,6 +368,10 @@ sched_ult2xs(int xs_type, int tgt_id)
case DSS_XS_DRPC:
return 2;
case DSS_XS_IOFW:
if (dss_numa_nr > 1) {
xs_id = sched_ult2xs_multisocket(xs_type, tgt_id);
break;
}
if (!dss_helper_pool) {
if (dss_tgt_offload_xs_nr > 0)
xs_id = DSS_MAIN_XS_ID(tgt_id) + 1;
Expand Down Expand Up @@ -379,6 +410,10 @@ sched_ult2xs(int xs_type, int tgt_id)
xs_id = (DSS_MAIN_XS_ID(tgt_id) + 1) % dss_tgt_nr;
break;
case DSS_XS_OFFLOAD:
if (dss_numa_nr > 1) {
xs_id = sched_ult2xs_multisocket(xs_type, tgt_id);
break;
}
if (!dss_helper_pool) {
if (dss_tgt_offload_xs_nr > 0)
xs_id = DSS_MAIN_XS_ID(tgt_id) + dss_tgt_offload_xs_nr / dss_tgt_nr;
Expand Down
Loading