From f1e02d2089bc8b5fc4ad48865811d3c7bdbdd268 Mon Sep 17 00:00:00 2001 From: Jerome Soumagne Date: Thu, 3 Oct 2024 13:07:56 -0500 Subject: [PATCH 1/2] NA OFI: fix cxi domain ops settings Also disable PROV_KEY_CACHE Add NA_OFI_SKIP_DOMAIN_OPS env variable to skip domain ops --- src/na/na_ofi.c | 134 +++++++++++++++++++++++++++--------------------- 1 file changed, 75 insertions(+), 59 deletions(-) diff --git a/src/na/na_ofi.c b/src/na/na_ofi.c index c833dc14..098ac8b6 100644 --- a/src/na/na_ofi.c +++ b/src/na/na_ofi.c @@ -97,9 +97,6 @@ # define FI_ADDR_CXI (FI_ADDR_OPX + 1) # define FI_PROTO_CXI (FI_PROTO_OPX + 1) #endif -#ifndef NA_OFI_HAS_EXT_CXI_H -# define FI_CXI_DOM_OPS_6 "dom_ops_v6" -#endif /* Fallback for undefined XNET values */ #if FI_VERSION_LT(FI_COMPILE_VERSION, FI_VERSION(1, 17)) @@ -717,19 +714,25 @@ struct fi_gni_auth_key { #endif #ifndef NA_OFI_HAS_EXT_CXI_H -/* CXI domain ops (v6) */ -struct fi_cxi_dom_ops { - int (*cntr_read)(struct fid *fid, unsigned int cntr, uint64_t *value, - struct timespec *ts); - int (*topology)(struct fid *fid, unsigned int *group_id, - unsigned int *switch_id, unsigned int *port_id); - int (*enable_hybrid_mr_desc)(struct fid *fid, bool enable); - size_t (*ep_get_unexp_msgs)(struct fid_ep *fid_ep, - struct fi_cq_tagged_entry *entry, size_t count, fi_addr_t *src_addr, - size_t *ux_count); - int (*get_dwq_depth)(struct fid *fid, size_t *depth); - int (*enable_mr_match_events)(struct fid *fid, bool enable); - int (*enable_optimized_mrs)(struct fid *fid, bool enable); +/* + * TODO: The following should be integrated into the include/rdma/fi_ext.h + * and are use for provider specific fi_control() operations. + */ +# define FI_PROV_SPECIFIC_CXI (0xccc << 16) + +enum { + FI_OPT_CXI_SET_TCLASS = -FI_PROV_SPECIFIC_CXI, /* uint32_t */ + FI_OPT_CXI_SET_MSG_ORDER, /* uint64_t */ + + /* fid_nic control operation to refresh NIC attributes. */ + FI_OPT_CXI_NIC_REFRESH_ATTR, + + FI_OPT_CXI_SET_MR_MATCH_EVENTS, /* bool */ + FI_OPT_CXI_GET_MR_MATCH_EVENTS, /* bool */ + FI_OPT_CXI_SET_OPTIMIZED_MRS, /* bool */ + FI_OPT_CXI_GET_OPTIMIZED_MRS, /* bool */ + FI_OPT_CXI_SET_PROV_KEY_CACHE, /* bool */ + FI_OPT_CXI_GET_PROV_KEY_CACHE, /* bool */ }; /* CXI Authorization Key */ @@ -1152,13 +1155,11 @@ static na_return_t na_ofi_gni_set_domain_ops(struct na_ofi_domain *na_ofi_domain); #endif -#ifdef FI_CXI_DOM_OPS_6 /** * Set CXI specific domain ops. */ -static na_return_t +static void na_ofi_cxi_set_domain_ops(struct na_ofi_domain *na_ofi_domain); -#endif /** * Parse auth key. 
@@ -4240,11 +4241,8 @@ na_ofi_set_domain_ops( return NA_PROTONOSUPPORT; #endif case NA_OFI_PROV_CXI: -#ifdef FI_CXI_DOM_OPS_6 - return na_ofi_cxi_set_domain_ops(na_ofi_domain); -#else - (void) na_ofi_domain; -#endif + na_ofi_cxi_set_domain_ops(na_ofi_domain); + break; case NA_OFI_PROV_SHM: case NA_OFI_PROV_SOCKETS: case NA_OFI_PROV_TCP: @@ -4252,7 +4250,7 @@ na_ofi_set_domain_ops( case NA_OFI_PROV_PSM2: case NA_OFI_PROV_OPX: case NA_OFI_PROV_VERBS_RXM: - return NA_SUCCESS; + break; case NA_OFI_PROV_NULL: default: NA_LOG_SUBSYS_ERROR(fatal, @@ -4260,6 +4258,8 @@ na_ofi_set_domain_ops( na_ofi_prov_name[prov_type]); return NA_PROTONOSUPPORT; } + + return NA_SUCCESS; } /*---------------------------------------------------------------------------*/ @@ -4346,43 +4346,52 @@ na_ofi_gni_set_domain_ops(struct na_ofi_domain *na_ofi_domain) #endif /*---------------------------------------------------------------------------*/ -#ifdef FI_CXI_DOM_OPS_6 -static na_return_t +static void na_ofi_cxi_set_domain_ops(struct na_ofi_domain *na_ofi_domain) { - struct fi_cxi_dom_ops *dom_ops; - na_return_t ret; + bool val = false; int rc; - rc = fi_open_ops(&na_ofi_domain->fi_domain->fid, FI_CXI_DOM_OPS_6, 0, - (void **) &dom_ops, NULL); - if (rc != 0) { - /* Silently ignore */ - NA_LOG_SUBSYS_DEBUG( - cls, "fi_open_ops() failed, rc: %d (%s)", rc, fi_strerror(-rc)); - return NA_SUCCESS; - } - - /* Prevent potential memory corruption by ensuring that memory backing MRs - * cannot be accessed after invoking fi_close() even if that memory remains - * in the MR cache. */ - rc = dom_ops->enable_mr_match_events(&na_ofi_domain->fi_domain->fid, true); - NA_CHECK_SUBSYS_ERROR(cls, rc != 0, error, ret, na_ofi_errno_to_na(-rc), - "enable_mr_match_events() failed, rc: %d (%s)", rc, fi_strerror(-rc)); - - /* Disable use of optimized MRs to prevent quick recycle of MR keys (when - * using FI_MR_PROV_KEY). This prevents potential memory corruptions when - * multiple regions could end up using the same key. */ - rc = dom_ops->enable_optimized_mrs(&na_ofi_domain->fi_domain->fid, false); - NA_CHECK_SUBSYS_ERROR(cls, rc != 0, error, ret, na_ofi_errno_to_na(-rc), - "enable_optimized_mrs() failed, rc: %d (%s)", rc, fi_strerror(-rc)); - - return NA_SUCCESS; - -error: - return ret; + /* PROV_KEY_CACHE: The provider key cache is a performance optimization for + * FI_MR_PROV_KEY. The performance gain is fi_mr_close() becomes a no-op but + * at the cost of the corresponding MR being left exposed to the network. + * This is intended to be used for applications where fi_mr_close() is on + * the critical path. For storage use-cases, leaving MRs exposed is an + * issue. This could result in MR operations unexpectedly completing and + * reading/writing to unknown memory. */ + rc = fi_control( + &na_ofi_domain->fi_domain->fid, FI_OPT_CXI_SET_PROV_KEY_CACHE, &val); + NA_CHECK_SUBSYS_WARNING(cls, rc != 0, + "could not set CXI PROV_KEY_CACHE property (%s)", fi_strerror(-rc)); + + /* OPTIMIZED_MRS: Optimized MRs offer a higher operation rate over + * standard/unoptimized MRs. Because optimized MR allocation/deallocation is + * expensive (i.e., it always requires calls into the kernel), optimized MRs + * should only be used for persistent MRs. This typically maps to MPI/SHMEM + * RMA windows which are persistent. For Mercury, since MRs are ephemeral + * and allocation/deallocation may be on the critical path, optimized MRs + * should be disabled. 
Optimized MRs also present a risk for the + * recycling of MR keys (when using FI_MR_PROV_KEY) where multiple regions + * could end up using the same key by allocating/deallocating the MR, + * leading to potential memory corruptions. */ + rc = fi_control( + &na_ofi_domain->fi_domain->fid, FI_OPT_CXI_SET_OPTIMIZED_MRS, &val); + NA_CHECK_SUBSYS_WARNING(cls, rc != 0, + "could not set CXI OPTIMIZED_MRS property (%s)", fi_strerror(-rc)); + + /* MR_MATCH_EVENTS: While standard/unoptimized MRs do not have a call into + * the kernel for MR allocation, there is still a call into the kernel for + * MR deallocation. To avoid this kernel call, MR_MATCH_EVENTS needs to be + * enabled. The cost MR_MATCH_EVENTS introduces is where the target of an + * RMA operation was previously passive (i.e., no events), this will enable + * MR events. This requires the owner of the MR to process event queues in a + * timely manner or have large event queue buffers. */ + val = true; + rc = fi_control( + &na_ofi_domain->fi_domain->fid, FI_OPT_CXI_SET_MR_MATCH_EVENTS, &val); + NA_CHECK_SUBSYS_WARNING(cls, rc != 0, + "could not set CXI MR_MATCH_EVENTS property (%s)", fi_strerror(-rc)); } -#endif /*---------------------------------------------------------------------------*/ static na_return_t @@ -4749,6 +4758,8 @@ na_ofi_domain_open(const struct na_ofi_fabric *na_ofi_fabric, struct fi_domain_attr *domain_attr = fi_info->domain_attr; union na_ofi_auth_key base_auth_key; const union na_ofi_auth_key *base_auth_key_p = NULL; + char *env; + bool skip_domain_ops = false; na_return_t ret; int rc; @@ -4842,8 +4853,13 @@ na_ofi_domain_open(const struct na_ofi_fabric *na_ofi_fabric, fi_tostr(domain_attr, FI_TYPE_DOMAIN_ATTR)); /* Set optional domain ops */ - ret = na_ofi_set_domain_ops(na_ofi_fabric->prov_type, na_ofi_domain); - NA_CHECK_SUBSYS_NA_ERROR(cls, error, ret, "Could not set domain ops"); + env = getenv("NA_OFI_SKIP_DOMAIN_OPS"); + if (env != NULL) + skip_domain_ops = (atoi(env) != 0); + if (!skip_domain_ops) { + ret = na_ofi_set_domain_ops(na_ofi_fabric->prov_type, na_ofi_domain); + NA_CHECK_SUBSYS_NA_ERROR(cls, error, ret, "Could not set domain ops"); + } #if FI_VERSION_GE(FI_COMPILE_VERSION, FI_VERSION(1, 20)) /* Check if we can use FI_AV_USER_ID */ From a4597b19ca2841685596595a8859269c833eeb14 Mon Sep 17 00:00:00 2001 From: Jerome Soumagne Date: Fri, 4 Oct 2024 17:13:03 -0500 Subject: [PATCH 2/2] HG Perf: support forced registration in hg_bw_read/write --- Testing/perf/hg/hg_bw_read.c | 21 +++++++++++++++++ Testing/perf/hg/hg_bw_write.c | 21 +++++++++++++++++ Testing/perf/hg/mercury_perf.c | 43 +++++++++++++++++++++++----------- Testing/perf/hg/mercury_perf.h | 1 + 4 files changed, 72 insertions(+), 14 deletions(-) diff --git a/Testing/perf/hg/hg_bw_read.c b/Testing/perf/hg/hg_bw_read.c index 5756aa21..6cbe0ef2 100644 --- a/Testing/perf/hg/hg_bw_read.c +++ b/Testing/perf/hg/hg_bw_read.c @@ -54,8 +54,22 @@ hg_perf_run(const struct hg_test_info *hg_test_info, hg_time_get_current(&t1); } + if (hg_test_info->na_test_info.force_register) { + for (j = 0; j < info->handle_max; j++) { + hg_size_t bulk_size = info->buf_size_max * info->bulk_count; + ret = HG_Bulk_create(info->hg_class, 1, &info->bulk_bufs[j], + &bulk_size, HG_BULK_READ_ONLY, + &info->local_bulk_handles[j]); + HG_TEST_CHECK_HG_ERROR(error, ret, + "HG_Bulk_create() failed (%s)", HG_Error_to_string(ret)); + } + } + for (j = 0; j < info->handle_max; j++) { struct hg_perf_bulk_info in_struct = { + .bulk = (hg_test_info->na_test_info.force_register) + ? 
info->local_bulk_handles[j] + : HG_BULK_NULL, .handle_id = (uint32_t) ((comm_rank + j * comm_size) / info->target_addr_max), .size = (uint32_t) buf_size}; @@ -83,6 +97,13 @@ hg_perf_run(const struct hg_test_info *hg_test_info, } } } + + if (hg_test_info->na_test_info.force_register) { + for (j = 0; j < info->handle_max; j++) { + (void) HG_Bulk_free(info->local_bulk_handles[j]); + info->local_bulk_handles[j] = HG_BULK_NULL; + } + } } if (comm_size > 1) diff --git a/Testing/perf/hg/hg_bw_write.c b/Testing/perf/hg/hg_bw_write.c index 91eceb9a..625e808d 100644 --- a/Testing/perf/hg/hg_bw_write.c +++ b/Testing/perf/hg/hg_bw_write.c @@ -54,8 +54,22 @@ hg_perf_run(const struct hg_test_info *hg_test_info, hg_time_get_current(&t1); } + if (hg_test_info->na_test_info.force_register) { + for (j = 0; j < info->handle_max; j++) { + hg_size_t bulk_size = info->buf_size_max * info->bulk_count; + ret = HG_Bulk_create(info->hg_class, 1, &info->bulk_bufs[j], + &bulk_size, HG_BULK_READ_ONLY, + &info->local_bulk_handles[j]); + HG_TEST_CHECK_HG_ERROR(error, ret, + "HG_Bulk_create() failed (%s)", HG_Error_to_string(ret)); + } + } + for (j = 0; j < info->handle_max; j++) { struct hg_perf_bulk_info in_struct = { + .bulk = (hg_test_info->na_test_info.force_register) + ? info->local_bulk_handles[j] + : HG_BULK_NULL, .handle_id = (uint32_t) ((comm_rank + j * comm_size) / info->target_addr_max), .size = (uint32_t) buf_size}; @@ -69,6 +83,13 @@ hg_perf_run(const struct hg_test_info *hg_test_info, ret = hg_perf_request_wait(info, &request, HG_MAX_IDLE_TIME, NULL); HG_TEST_CHECK_HG_ERROR(error, ret, "hg_perf_request_wait() failed (%s)", HG_Error_to_string(ret)); + + if (hg_test_info->na_test_info.force_register) { + for (j = 0; j < info->handle_max; j++) { + (void) HG_Bulk_free(info->local_bulk_handles[j]); + info->local_bulk_handles[j] = HG_BULK_NULL; + } + } } if (comm_size > 1) diff --git a/Testing/perf/hg/mercury_perf.c b/Testing/perf/hg/mercury_perf.c index c60f944a..1d715abe 100644 --- a/Testing/perf/hg/mercury_perf.c +++ b/Testing/perf/hg/mercury_perf.c @@ -60,8 +60,8 @@ static void hg_perf_class_cleanup(struct hg_perf_class_info *info); static hg_return_t -hg_perf_bulk_buf_alloc( - struct hg_perf_class_info *info, uint8_t bulk_flags, bool init_data); +hg_perf_bulk_buf_alloc(struct hg_perf_class_info *info, uint8_t bulk_flags, + bool init_data, bool bulk_create); static void hg_perf_bulk_buf_free(struct hg_perf_class_info *info); @@ -413,8 +413,8 @@ hg_perf_class_cleanup(struct hg_perf_class_info *info) /*---------------------------------------------------------------------------*/ static hg_return_t -hg_perf_bulk_buf_alloc( - struct hg_perf_class_info *info, uint8_t bulk_flags, bool init_data) +hg_perf_bulk_buf_alloc(struct hg_perf_class_info *info, uint8_t bulk_flags, + bool init_data, bool bulk_create) { size_t page_size = (size_t) hg_mem_get_page_size(); hg_return_t ret; @@ -442,10 +442,12 @@ hg_perf_bulk_buf_alloc( if (init_data) hg_perf_init_data(info->bulk_bufs[i], alloc_size); - ret = HG_Bulk_create(info->hg_class, 1, &info->bulk_bufs[i], - &alloc_size, bulk_flags, &info->local_bulk_handles[i]); - HG_TEST_CHECK_HG_ERROR(error, ret, "HG_Bulk_create() failed (%s)", - HG_Error_to_string(ret)); + if (bulk_create) { + ret = HG_Bulk_create(info->hg_class, 1, &info->bulk_bufs[i], + &alloc_size, bulk_flags, &info->local_bulk_handles[i]); + HG_TEST_CHECK_HG_ERROR(error, ret, "HG_Bulk_create() failed (%s)", + HG_Error_to_string(ret)); + } } return HG_SUCCESS; @@ -640,7 +642,8 @@ hg_perf_bulk_buf_init(const struct 
hg_test_info *hg_test_info, hg_return_t ret; size_t i; - ret = hg_perf_bulk_buf_alloc(info, bulk_flags, bulk_op == HG_BULK_PULL); + ret = hg_perf_bulk_buf_alloc(info, bulk_flags, bulk_op == HG_BULK_PULL, + !hg_test_info->na_test_info.force_register); HG_TEST_CHECK_HG_ERROR(error, ret, "hg_perf_bulk_buf_alloc() failed (%s)", HG_Error_to_string(ret)); @@ -648,7 +651,9 @@ hg_perf_bulk_buf_init(const struct hg_test_info *hg_test_info, size_t handle_global_id = comm_rank + i * comm_size, target_rank = handle_global_id % info->target_addr_max; struct hg_perf_bulk_init_info bulk_info = { - .bulk = info->local_bulk_handles[i], + .bulk = (hg_test_info->na_test_info.force_register) + ? HG_BULK_NULL + : info->local_bulk_handles[i], .bulk_op = bulk_op, .handle_id = (uint32_t) (handle_global_id / info->target_addr_max), .bulk_count = (uint32_t) info->bulk_count, @@ -786,6 +791,8 @@ hg_perf_print_header_bw(const struct hg_test_info *hg_test_info, info->handle_max, (size_t) info->bulk_count); if (info->verify) printf("# WARNING verifying data, output will be slower\n"); + if (hg_test_info->na_test_info.force_register) + printf("# WARNING forcing registration on every iteration\n"); if (hg_test_info->na_test_info.mbps) printf("%-*s%*s%*s\n", 10, "# Size", NWIDTH, "Bandwidth (MB/s)", NWIDTH, "Time (us)"); @@ -906,6 +913,10 @@ hg_perf_proc_bulk_info(hg_proc_t proc, void *arg) struct hg_perf_bulk_info *info = (struct hg_perf_bulk_info *) arg; hg_return_t ret; + ret = hg_proc_hg_bulk_t(proc, &info->bulk); + HG_TEST_CHECK_HG_ERROR( + error, ret, "hg_proc_hg_bulk_t() failed (%s)", HG_Error_to_string(ret)); + ret = hg_proc_uint32_t(proc, &info->handle_id); HG_TEST_CHECK_HG_ERROR( error, ret, "hg_proc_uint32_t() failed (%s)", HG_Error_to_string(ret)); @@ -1044,7 +1055,7 @@ hg_perf_bulk_init_cb(hg_handle_t handle) info->buf_size_max = bulk_info.size_max; ret = hg_perf_bulk_buf_alloc( - info, bulk_flags, bulk_info.bulk_op == HG_BULK_PUSH); + info, bulk_flags, bulk_info.bulk_op == HG_BULK_PUSH, true); HG_TEST_CHECK_HG_ERROR(error_free, ret, "hg_perf_bulk_buf_alloc() failed (%s)", HG_Error_to_string(ret)); @@ -1060,7 +1071,8 @@ hg_perf_bulk_init_cb(hg_handle_t handle) info->class_id, bulk_info.target_rank, bulk_info.handle_id, info->handle_max); info->remote_bulk_handles[bulk_info.handle_id] = bulk_info.bulk; - HG_Bulk_ref_incr(bulk_info.bulk); + if (bulk_info.bulk != HG_BULK_NULL) + HG_Bulk_ref_incr(bulk_info.bulk); /* Send response back */ ret = HG_Respond(handle, NULL, NULL, NULL); @@ -1103,6 +1115,7 @@ hg_perf_bulk_common(hg_handle_t handle, hg_bulk_op_t op) struct hg_perf_request *request = (struct hg_perf_request *) HG_Get_data(handle); struct hg_perf_bulk_info bulk_info; + hg_bulk_t remote_bulk; hg_return_t ret; size_t i; @@ -1110,6 +1123,9 @@ hg_perf_bulk_common(hg_handle_t handle, hg_bulk_op_t op) ret = HG_Get_input(handle, &bulk_info); HG_TEST_CHECK_HG_ERROR( error, ret, "HG_Get_input() failed (%s)", HG_Error_to_string(ret)); + remote_bulk = bulk_info.bulk != HG_BULK_NULL + ? 
bulk_info.bulk + : info->remote_bulk_handles[bulk_info.handle_id]; /* Initialize request */ *request = (struct hg_perf_request){.complete_count = 0, @@ -1119,8 +1135,7 @@ hg_perf_bulk_common(hg_handle_t handle, hg_bulk_op_t op) /* Post bulk push */ for (i = 0; i < info->bulk_count; i++) { ret = HG_Bulk_transfer(info->context, hg_perf_bulk_transfer_cb, handle, - op, hg_info->addr, info->remote_bulk_handles[bulk_info.handle_id], - i * info->buf_size_max, + op, hg_info->addr, remote_bulk, i * info->buf_size_max, info->local_bulk_handles[bulk_info.handle_id], i * info->buf_size_max, bulk_info.size, HG_OP_ID_IGNORE); HG_TEST_CHECK_HG_ERROR(error_free, ret, diff --git a/Testing/perf/hg/mercury_perf.h b/Testing/perf/hg/mercury_perf.h index 8e7eee46..8285a946 100644 --- a/Testing/perf/hg/mercury_perf.h +++ b/Testing/perf/hg/mercury_perf.h @@ -80,6 +80,7 @@ struct hg_perf_bulk_init_info { }; struct hg_perf_bulk_info { + hg_bulk_t bulk; /* Bulk handle */ uint32_t handle_id; /* Source handle ID */ uint32_t size; /* Transfer size*/ };
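
Reviewer note (not part of the patches): patch 1 drops the fi_open_ops()-based
fi_cxi_dom_ops vtable and drives the CXI domain settings through plain
fi_control() calls on the domain fid. A minimal sketch of that pattern is
below, assuming the FI_OPT_CXI_* values defined by the patch (or provided by
<rdma/fi_cxi_ext.h> when NA_OFI_HAS_EXT_CXI_H is available); the helper name
cxi_domain_set_bool() is illustrative only.

#include <stdbool.h>
#include <stdio.h>

#include <rdma/fabric.h>
#include <rdma/fi_domain.h>

/* Toggle a boolean CXI domain property; failures are only warned about so
 * that providers that do not know the option keep working (same policy as
 * NA_CHECK_SUBSYS_WARNING in the patch). */
static void
cxi_domain_set_bool(struct fid_domain *domain, int opt, bool val)
{
    int rc = fi_control(&domain->fid, opt, &val);

    if (rc != 0)
        fprintf(stderr, "warning: fi_control(%d) failed: %s\n", opt,
            fi_strerror(-rc));
}

/* Usage mirroring na_ofi_cxi_set_domain_ops():
 *     cxi_domain_set_bool(domain, FI_OPT_CXI_SET_PROV_KEY_CACHE, false);
 *     cxi_domain_set_bool(domain, FI_OPT_CXI_SET_OPTIMIZED_MRS, false);
 *     cxi_domain_set_bool(domain, FI_OPT_CXI_SET_MR_MATCH_EVENTS, true);
 */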
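
Reviewer note (not part of the patches): the new NA_OFI_SKIP_DOMAIN_OPS
variable is read once in na_ofi_domain_open(), and any non-zero integer value
skips na_ofi_set_domain_ops() entirely. A minimal usage sketch follows; the
"ofi+cxi://" init string and the bare HG_Init()/HG_Finalize() pairing are
illustrative only, the variable just has to be set before the NA OFI domain
is opened.

#include <stdlib.h>

#include <mercury.h>

int
main(void)
{
    hg_class_t *hg_class;

    /* Any non-zero integer value disables the optional GNI/CXI domain ops
     * (e.g., when the CXI MR settings applied by patch 1 are not wanted). */
    setenv("NA_OFI_SKIP_DOMAIN_OPS", "1", 1);

    hg_class = HG_Init("ofi+cxi://", HG_FALSE);
    if (hg_class == NULL)
        return EXIT_FAILURE;

    /* ... issue RPCs / bulk transfers ... */

    (void) HG_Finalize(hg_class);

    return EXIT_SUCCESS;
}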
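
Reviewer note (not part of the patches): with force_register set, hg_bw_read
and hg_bw_write now pay the HG_Bulk_create()/HG_Bulk_free() cost on every
timed iteration and ship the resulting handle in the RPC input
(hg_perf_bulk_info.bulk), which is exactly the ephemeral MR
allocation/deallocation path that the CXI settings in patch 1 are tuned for.
A condensed sketch of one such iteration is below; send_rpc_and_wait() is a
hypothetical stand-in for the HG_Forward() + hg_perf_request_wait() machinery
of the perf tests.

#include <mercury.h>
#include <mercury_bulk.h>

/* Hypothetical placeholder for HG_Forward() + request wait in the harness */
extern hg_return_t
send_rpc_and_wait(hg_handle_t handle, hg_bulk_t origin_bulk);

static hg_return_t
bw_iteration_force_register(
    hg_class_t *hg_class, hg_handle_t handle, void *buf, hg_size_t buf_size)
{
    hg_bulk_t bulk = HG_BULK_NULL;
    hg_return_t ret;

    /* Register the buffer for this iteration only */
    ret = HG_Bulk_create(
        hg_class, 1, &buf, &buf_size, HG_BULK_READ_ONLY, &bulk);
    if (ret != HG_SUCCESS)
        return ret;

    /* The handle travels in the RPC input; HG_BULK_NULL would tell the
     * target to fall back to the handle exchanged at init time instead. */
    ret = send_rpc_and_wait(handle, bulk);

    /* Deregister so the next iteration pays the registration cost again */
    (void) HG_Bulk_free(bulk);

    return ret;
}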
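
Reviewer note (not part of the patches): on the target side the contract is
now two-sided: a non-NULL bulk received in the one-time bulk_init RPC is
cached (with its reference count incremented), while a non-NULL bulk received
in a per-iteration bandwidth RPC overrides the cached handle. The selection
added to hg_perf_bulk_common() condenses to the hypothetical helper below.

#include <mercury_bulk.h>

/* Pick the origin bulk handle to pass to HG_Bulk_transfer(): the one shipped
 * with this RPC (force_register mode) if present, otherwise the one cached
 * during hg_perf_bulk_init(). */
static hg_bulk_t
select_remote_bulk(hg_bulk_t rpc_bulk, hg_bulk_t cached_bulk)
{
    return (rpc_bulk != HG_BULK_NULL) ? rpc_bulk : cached_bulk;
}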