From f1e02d2089bc8b5fc4ad48865811d3c7bdbdd268 Mon Sep 17 00:00:00 2001 From: Jerome Soumagne Date: Thu, 3 Oct 2024 13:07:56 -0500 Subject: [PATCH 1/2] NA OFI: fix cxi domain ops settings Also disable PROV_KEY_CACHE Add NA_OFI_SKIP_DOMAIN_OPS env variable to skip domain ops --- src/na/na_ofi.c | 134 +++++++++++++++++++++++++++--------------------- 1 file changed, 75 insertions(+), 59 deletions(-) diff --git a/src/na/na_ofi.c b/src/na/na_ofi.c index c833dc14..098ac8b6 100644 --- a/src/na/na_ofi.c +++ b/src/na/na_ofi.c @@ -97,9 +97,6 @@ # define FI_ADDR_CXI (FI_ADDR_OPX + 1) # define FI_PROTO_CXI (FI_PROTO_OPX + 1) #endif -#ifndef NA_OFI_HAS_EXT_CXI_H -# define FI_CXI_DOM_OPS_6 "dom_ops_v6" -#endif /* Fallback for undefined XNET values */ #if FI_VERSION_LT(FI_COMPILE_VERSION, FI_VERSION(1, 17)) @@ -717,19 +714,25 @@ struct fi_gni_auth_key { #endif #ifndef NA_OFI_HAS_EXT_CXI_H -/* CXI domain ops (v6) */ -struct fi_cxi_dom_ops { - int (*cntr_read)(struct fid *fid, unsigned int cntr, uint64_t *value, - struct timespec *ts); - int (*topology)(struct fid *fid, unsigned int *group_id, - unsigned int *switch_id, unsigned int *port_id); - int (*enable_hybrid_mr_desc)(struct fid *fid, bool enable); - size_t (*ep_get_unexp_msgs)(struct fid_ep *fid_ep, - struct fi_cq_tagged_entry *entry, size_t count, fi_addr_t *src_addr, - size_t *ux_count); - int (*get_dwq_depth)(struct fid *fid, size_t *depth); - int (*enable_mr_match_events)(struct fid *fid, bool enable); - int (*enable_optimized_mrs)(struct fid *fid, bool enable); +/* + * TODO: The following should be integrated into the include/rdma/fi_ext.h + * and are use for provider specific fi_control() operations. + */ +# define FI_PROV_SPECIFIC_CXI (0xccc << 16) + +enum { + FI_OPT_CXI_SET_TCLASS = -FI_PROV_SPECIFIC_CXI, /* uint32_t */ + FI_OPT_CXI_SET_MSG_ORDER, /* uint64_t */ + + /* fid_nic control operation to refresh NIC attributes. */ + FI_OPT_CXI_NIC_REFRESH_ATTR, + + FI_OPT_CXI_SET_MR_MATCH_EVENTS, /* bool */ + FI_OPT_CXI_GET_MR_MATCH_EVENTS, /* bool */ + FI_OPT_CXI_SET_OPTIMIZED_MRS, /* bool */ + FI_OPT_CXI_GET_OPTIMIZED_MRS, /* bool */ + FI_OPT_CXI_SET_PROV_KEY_CACHE, /* bool */ + FI_OPT_CXI_GET_PROV_KEY_CACHE, /* bool */ }; /* CXI Authorization Key */ @@ -1152,13 +1155,11 @@ static na_return_t na_ofi_gni_set_domain_ops(struct na_ofi_domain *na_ofi_domain); #endif -#ifdef FI_CXI_DOM_OPS_6 /** * Set CXI specific domain ops. */ -static na_return_t +static void na_ofi_cxi_set_domain_ops(struct na_ofi_domain *na_ofi_domain); -#endif /** * Parse auth key. 
@@ -4240,11 +4241,8 @@ na_ofi_set_domain_ops( return NA_PROTONOSUPPORT; #endif case NA_OFI_PROV_CXI: -#ifdef FI_CXI_DOM_OPS_6 - return na_ofi_cxi_set_domain_ops(na_ofi_domain); -#else - (void) na_ofi_domain; -#endif + na_ofi_cxi_set_domain_ops(na_ofi_domain); + break; case NA_OFI_PROV_SHM: case NA_OFI_PROV_SOCKETS: case NA_OFI_PROV_TCP: @@ -4252,7 +4250,7 @@ na_ofi_set_domain_ops( case NA_OFI_PROV_PSM2: case NA_OFI_PROV_OPX: case NA_OFI_PROV_VERBS_RXM: - return NA_SUCCESS; + break; case NA_OFI_PROV_NULL: default: NA_LOG_SUBSYS_ERROR(fatal, @@ -4260,6 +4258,8 @@ na_ofi_set_domain_ops( na_ofi_prov_name[prov_type]); return NA_PROTONOSUPPORT; } + + return NA_SUCCESS; } /*---------------------------------------------------------------------------*/ @@ -4346,43 +4346,52 @@ na_ofi_gni_set_domain_ops(struct na_ofi_domain *na_ofi_domain) #endif /*---------------------------------------------------------------------------*/ -#ifdef FI_CXI_DOM_OPS_6 -static na_return_t +static void na_ofi_cxi_set_domain_ops(struct na_ofi_domain *na_ofi_domain) { - struct fi_cxi_dom_ops *dom_ops; - na_return_t ret; + bool val = false; int rc; - rc = fi_open_ops(&na_ofi_domain->fi_domain->fid, FI_CXI_DOM_OPS_6, 0, - (void **) &dom_ops, NULL); - if (rc != 0) { - /* Silently ignore */ - NA_LOG_SUBSYS_DEBUG( - cls, "fi_open_ops() failed, rc: %d (%s)", rc, fi_strerror(-rc)); - return NA_SUCCESS; - } - - /* Prevent potential memory corruption by ensuring that memory backing MRs - * cannot be accessed after invoking fi_close() even if that memory remains - * in the MR cache. */ - rc = dom_ops->enable_mr_match_events(&na_ofi_domain->fi_domain->fid, true); - NA_CHECK_SUBSYS_ERROR(cls, rc != 0, error, ret, na_ofi_errno_to_na(-rc), - "enable_mr_match_events() failed, rc: %d (%s)", rc, fi_strerror(-rc)); - - /* Disable use of optimized MRs to prevent quick recycle of MR keys (when - * using FI_MR_PROV_KEY). This prevents potential memory corruptions when - * multiple regions could end up using the same key. */ - rc = dom_ops->enable_optimized_mrs(&na_ofi_domain->fi_domain->fid, false); - NA_CHECK_SUBSYS_ERROR(cls, rc != 0, error, ret, na_ofi_errno_to_na(-rc), - "enable_optimized_mrs() failed, rc: %d (%s)", rc, fi_strerror(-rc)); - - return NA_SUCCESS; - -error: - return ret; + /* PROV_KEY_CACHE: The provider key cache is a performance optimization for + * FI_MR_PROV_KEY. The performance gain is fi_mr_close() becomes a no-op but + * at the cost of the corresponding MR being left exposed to the network. + * This is intended to be used for applications where fi_mr_close() is on + * the critical path. For storage use-cases, leaving MRs exposed is an + * issue. This could result in MR operations unexpectedly completing and + * reading/writing to unknown memory. */ + rc = fi_control( + &na_ofi_domain->fi_domain->fid, FI_OPT_CXI_SET_PROV_KEY_CACHE, &val); + NA_CHECK_SUBSYS_WARNING(cls, rc != 0, + "could not set CXI PROV_KEY_CACHE property (%s)", fi_strerror(-rc)); + + /* OPTIMIZED_MRS: Optimized MRs offer a higher operation rate over + * standard/unoptimized MRs. Because optimized MR allocation/deallocation is + * expensive (i.e., it always requires calls into the kernel), optimized MRs + * should only be used for persistent MRs. This typically maps to MPI/SHMEM + * RMA windows which are persistent. For Mercury, since MRs are ephemeral + * and allocation/deallocation may be on the critical path, optimized MRs + * should be disabled. 
Optimized MRs also present a risk for the + * recycling of MR keys (when using FI_MR_PROV_KEY) where multiple regions + * could end up using the same key by allocating/deallocating the MR, + * leading to potential memory corruptions. */ + rc = fi_control( + &na_ofi_domain->fi_domain->fid, FI_OPT_CXI_SET_OPTIMIZED_MRS, &val); + NA_CHECK_SUBSYS_WARNING(cls, rc != 0, + "could not set CXI OPTIMIZED_MRS property (%s)", fi_strerror(-rc)); + + /* MR_MATCH_EVENTS: While standard/unoptimized MRs do not have a call into + * the kernel for MR allocation, there is still a call into the kernel for + * MR deallocation. To avoid this kernel call, MR_MATCH_EVENTS needs to be + * enabled. The cost MR_MATCH_EVENTS introduces is where the target of an + * RMA operation was previously passive (i.e., no events), this will enable + * MR events. This requires the owner of the MR to process event queues in a + * timely manner or have large event queue buffers. */ + val = true; + rc = fi_control( + &na_ofi_domain->fi_domain->fid, FI_OPT_CXI_SET_MR_MATCH_EVENTS, &val); + NA_CHECK_SUBSYS_WARNING(cls, rc != 0, + "could not set CXI MR_MATCH_EVENTS property (%s)", fi_strerror(-rc)); } -#endif /*---------------------------------------------------------------------------*/ static na_return_t @@ -4749,6 +4758,8 @@ na_ofi_domain_open(const struct na_ofi_fabric *na_ofi_fabric, struct fi_domain_attr *domain_attr = fi_info->domain_attr; union na_ofi_auth_key base_auth_key; const union na_ofi_auth_key *base_auth_key_p = NULL; + char *env; + bool skip_domain_ops = false; na_return_t ret; int rc; @@ -4842,8 +4853,13 @@ na_ofi_domain_open(const struct na_ofi_fabric *na_ofi_fabric, fi_tostr(domain_attr, FI_TYPE_DOMAIN_ATTR)); /* Set optional domain ops */ - ret = na_ofi_set_domain_ops(na_ofi_fabric->prov_type, na_ofi_domain); - NA_CHECK_SUBSYS_NA_ERROR(cls, error, ret, "Could not set domain ops"); + env = getenv("NA_OFI_SKIP_DOMAIN_OPS"); + if (env != NULL) + skip_domain_ops = (atoi(env) != 0); + if (!skip_domain_ops) { + ret = na_ofi_set_domain_ops(na_ofi_fabric->prov_type, na_ofi_domain); + NA_CHECK_SUBSYS_NA_ERROR(cls, error, ret, "Could not set domain ops"); + } #if FI_VERSION_GE(FI_COMPILE_VERSION, FI_VERSION(1, 20)) /* Check if we can use FI_AV_USER_ID */ From a4597b19ca2841685596595a8859269c833eeb14 Mon Sep 17 00:00:00 2001 From: Jerome Soumagne Date: Fri, 4 Oct 2024 17:13:03 -0500 Subject: [PATCH 2/2] HG Perf: support forced registration in hg_bw_read/write --- Testing/perf/hg/hg_bw_read.c | 21 +++++++++++++++++ Testing/perf/hg/hg_bw_write.c | 21 +++++++++++++++++ Testing/perf/hg/mercury_perf.c | 43 +++++++++++++++++++++++----------- Testing/perf/hg/mercury_perf.h | 1 + 4 files changed, 72 insertions(+), 14 deletions(-) diff --git a/Testing/perf/hg/hg_bw_read.c b/Testing/perf/hg/hg_bw_read.c index 5756aa21..6cbe0ef2 100644 --- a/Testing/perf/hg/hg_bw_read.c +++ b/Testing/perf/hg/hg_bw_read.c @@ -54,8 +54,22 @@ hg_perf_run(const struct hg_test_info *hg_test_info, hg_time_get_current(&t1); } + if (hg_test_info->na_test_info.force_register) { + for (j = 0; j < info->handle_max; j++) { + hg_size_t bulk_size = info->buf_size_max * info->bulk_count; + ret = HG_Bulk_create(info->hg_class, 1, &info->bulk_bufs[j], + &bulk_size, HG_BULK_READ_ONLY, + &info->local_bulk_handles[j]); + HG_TEST_CHECK_HG_ERROR(error, ret, + "HG_Bulk_create() failed (%s)", HG_Error_to_string(ret)); + } + } + for (j = 0; j < info->handle_max; j++) { struct hg_perf_bulk_info in_struct = { + .bulk = (hg_test_info->na_test_info.force_register) + ? 
info->local_bulk_handles[j] + : HG_BULK_NULL, .handle_id = (uint32_t) ((comm_rank + j * comm_size) / info->target_addr_max), .size = (uint32_t) buf_size}; @@ -83,6 +97,13 @@ hg_perf_run(const struct hg_test_info *hg_test_info, } } } + + if (hg_test_info->na_test_info.force_register) { + for (j = 0; j < info->handle_max; j++) { + (void) HG_Bulk_free(info->local_bulk_handles[j]); + info->local_bulk_handles[j] = HG_BULK_NULL; + } + } } if (comm_size > 1) diff --git a/Testing/perf/hg/hg_bw_write.c b/Testing/perf/hg/hg_bw_write.c index 91eceb9a..625e808d 100644 --- a/Testing/perf/hg/hg_bw_write.c +++ b/Testing/perf/hg/hg_bw_write.c @@ -54,8 +54,22 @@ hg_perf_run(const struct hg_test_info *hg_test_info, hg_time_get_current(&t1); } + if (hg_test_info->na_test_info.force_register) { + for (j = 0; j < info->handle_max; j++) { + hg_size_t bulk_size = info->buf_size_max * info->bulk_count; + ret = HG_Bulk_create(info->hg_class, 1, &info->bulk_bufs[j], + &bulk_size, HG_BULK_READ_ONLY, + &info->local_bulk_handles[j]); + HG_TEST_CHECK_HG_ERROR(error, ret, + "HG_Bulk_create() failed (%s)", HG_Error_to_string(ret)); + } + } + for (j = 0; j < info->handle_max; j++) { struct hg_perf_bulk_info in_struct = { + .bulk = (hg_test_info->na_test_info.force_register) + ? info->local_bulk_handles[j] + : HG_BULK_NULL, .handle_id = (uint32_t) ((comm_rank + j * comm_size) / info->target_addr_max), .size = (uint32_t) buf_size}; @@ -69,6 +83,13 @@ hg_perf_run(const struct hg_test_info *hg_test_info, ret = hg_perf_request_wait(info, &request, HG_MAX_IDLE_TIME, NULL); HG_TEST_CHECK_HG_ERROR(error, ret, "hg_perf_request_wait() failed (%s)", HG_Error_to_string(ret)); + + if (hg_test_info->na_test_info.force_register) { + for (j = 0; j < info->handle_max; j++) { + (void) HG_Bulk_free(info->local_bulk_handles[j]); + info->local_bulk_handles[j] = HG_BULK_NULL; + } + } } if (comm_size > 1) diff --git a/Testing/perf/hg/mercury_perf.c b/Testing/perf/hg/mercury_perf.c index c60f944a..1d715abe 100644 --- a/Testing/perf/hg/mercury_perf.c +++ b/Testing/perf/hg/mercury_perf.c @@ -60,8 +60,8 @@ static void hg_perf_class_cleanup(struct hg_perf_class_info *info); static hg_return_t -hg_perf_bulk_buf_alloc( - struct hg_perf_class_info *info, uint8_t bulk_flags, bool init_data); +hg_perf_bulk_buf_alloc(struct hg_perf_class_info *info, uint8_t bulk_flags, + bool init_data, bool bulk_create); static void hg_perf_bulk_buf_free(struct hg_perf_class_info *info); @@ -413,8 +413,8 @@ hg_perf_class_cleanup(struct hg_perf_class_info *info) /*---------------------------------------------------------------------------*/ static hg_return_t -hg_perf_bulk_buf_alloc( - struct hg_perf_class_info *info, uint8_t bulk_flags, bool init_data) +hg_perf_bulk_buf_alloc(struct hg_perf_class_info *info, uint8_t bulk_flags, + bool init_data, bool bulk_create) { size_t page_size = (size_t) hg_mem_get_page_size(); hg_return_t ret; @@ -442,10 +442,12 @@ hg_perf_bulk_buf_alloc( if (init_data) hg_perf_init_data(info->bulk_bufs[i], alloc_size); - ret = HG_Bulk_create(info->hg_class, 1, &info->bulk_bufs[i], - &alloc_size, bulk_flags, &info->local_bulk_handles[i]); - HG_TEST_CHECK_HG_ERROR(error, ret, "HG_Bulk_create() failed (%s)", - HG_Error_to_string(ret)); + if (bulk_create) { + ret = HG_Bulk_create(info->hg_class, 1, &info->bulk_bufs[i], + &alloc_size, bulk_flags, &info->local_bulk_handles[i]); + HG_TEST_CHECK_HG_ERROR(error, ret, "HG_Bulk_create() failed (%s)", + HG_Error_to_string(ret)); + } } return HG_SUCCESS; @@ -640,7 +642,8 @@ hg_perf_bulk_buf_init(const struct 
hg_test_info *hg_test_info, hg_return_t ret; size_t i; - ret = hg_perf_bulk_buf_alloc(info, bulk_flags, bulk_op == HG_BULK_PULL); + ret = hg_perf_bulk_buf_alloc(info, bulk_flags, bulk_op == HG_BULK_PULL, + !hg_test_info->na_test_info.force_register); HG_TEST_CHECK_HG_ERROR(error, ret, "hg_perf_bulk_buf_alloc() failed (%s)", HG_Error_to_string(ret)); @@ -648,7 +651,9 @@ hg_perf_bulk_buf_init(const struct hg_test_info *hg_test_info, size_t handle_global_id = comm_rank + i * comm_size, target_rank = handle_global_id % info->target_addr_max; struct hg_perf_bulk_init_info bulk_info = { - .bulk = info->local_bulk_handles[i], + .bulk = (hg_test_info->na_test_info.force_register) + ? HG_BULK_NULL + : info->local_bulk_handles[i], .bulk_op = bulk_op, .handle_id = (uint32_t) (handle_global_id / info->target_addr_max), .bulk_count = (uint32_t) info->bulk_count, @@ -786,6 +791,8 @@ hg_perf_print_header_bw(const struct hg_test_info *hg_test_info, info->handle_max, (size_t) info->bulk_count); if (info->verify) printf("# WARNING verifying data, output will be slower\n"); + if (hg_test_info->na_test_info.force_register) + printf("# WARNING forcing registration on every iteration\n"); if (hg_test_info->na_test_info.mbps) printf("%-*s%*s%*s\n", 10, "# Size", NWIDTH, "Bandwidth (MB/s)", NWIDTH, "Time (us)"); @@ -906,6 +913,10 @@ hg_perf_proc_bulk_info(hg_proc_t proc, void *arg) struct hg_perf_bulk_info *info = (struct hg_perf_bulk_info *) arg; hg_return_t ret; + ret = hg_proc_hg_bulk_t(proc, &info->bulk); + HG_TEST_CHECK_HG_ERROR( + error, ret, "hg_proc_hg_bulk_t() failed (%s)", HG_Error_to_string(ret)); + ret = hg_proc_uint32_t(proc, &info->handle_id); HG_TEST_CHECK_HG_ERROR( error, ret, "hg_proc_uint32_t() failed (%s)", HG_Error_to_string(ret)); @@ -1044,7 +1055,7 @@ hg_perf_bulk_init_cb(hg_handle_t handle) info->buf_size_max = bulk_info.size_max; ret = hg_perf_bulk_buf_alloc( - info, bulk_flags, bulk_info.bulk_op == HG_BULK_PUSH); + info, bulk_flags, bulk_info.bulk_op == HG_BULK_PUSH, true); HG_TEST_CHECK_HG_ERROR(error_free, ret, "hg_perf_bulk_buf_alloc() failed (%s)", HG_Error_to_string(ret)); @@ -1060,7 +1071,8 @@ hg_perf_bulk_init_cb(hg_handle_t handle) info->class_id, bulk_info.target_rank, bulk_info.handle_id, info->handle_max); info->remote_bulk_handles[bulk_info.handle_id] = bulk_info.bulk; - HG_Bulk_ref_incr(bulk_info.bulk); + if (bulk_info.bulk != HG_BULK_NULL) + HG_Bulk_ref_incr(bulk_info.bulk); /* Send response back */ ret = HG_Respond(handle, NULL, NULL, NULL); @@ -1103,6 +1115,7 @@ hg_perf_bulk_common(hg_handle_t handle, hg_bulk_op_t op) struct hg_perf_request *request = (struct hg_perf_request *) HG_Get_data(handle); struct hg_perf_bulk_info bulk_info; + hg_bulk_t remote_bulk; hg_return_t ret; size_t i; @@ -1110,6 +1123,9 @@ hg_perf_bulk_common(hg_handle_t handle, hg_bulk_op_t op) ret = HG_Get_input(handle, &bulk_info); HG_TEST_CHECK_HG_ERROR( error, ret, "HG_Get_input() failed (%s)", HG_Error_to_string(ret)); + remote_bulk = bulk_info.bulk != HG_BULK_NULL + ? 
bulk_info.bulk + : info->remote_bulk_handles[bulk_info.handle_id]; /* Initialize request */ *request = (struct hg_perf_request){.complete_count = 0, @@ -1119,8 +1135,7 @@ hg_perf_bulk_common(hg_handle_t handle, hg_bulk_op_t op) /* Post bulk push */ for (i = 0; i < info->bulk_count; i++) { ret = HG_Bulk_transfer(info->context, hg_perf_bulk_transfer_cb, handle, - op, hg_info->addr, info->remote_bulk_handles[bulk_info.handle_id], - i * info->buf_size_max, + op, hg_info->addr, remote_bulk, i * info->buf_size_max, info->local_bulk_handles[bulk_info.handle_id], i * info->buf_size_max, bulk_info.size, HG_OP_ID_IGNORE); HG_TEST_CHECK_HG_ERROR(error_free, ret, diff --git a/Testing/perf/hg/mercury_perf.h b/Testing/perf/hg/mercury_perf.h index 8e7eee46..8285a946 100644 --- a/Testing/perf/hg/mercury_perf.h +++ b/Testing/perf/hg/mercury_perf.h @@ -80,6 +80,7 @@ struct hg_perf_bulk_init_info { }; struct hg_perf_bulk_info { + hg_bulk_t bulk; /* Bulk handle */ uint32_t handle_id; /* Source handle ID */ uint32_t size; /* Transfer size*/ };
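
Reviewer note (not part of the patches): patch 1 drops the fi_open_ops()-based
fi_cxi_dom_ops vtable and drives the CXI domain settings through plain
fi_control() calls on the domain fid. A minimal sketch of that pattern is
below, assuming the FI_OPT_CXI_* values defined by the patch (or provided by
<rdma/fi_cxi_ext.h> when NA_OFI_HAS_EXT_CXI_H is available); the helper name
cxi_domain_set_bool() is illustrative only.

#include <stdbool.h>
#include <stdio.h>

#include <rdma/fabric.h>
#include <rdma/fi_domain.h>

/* Toggle a boolean CXI domain property; failures are only warned about so
 * that providers that do not know the option keep working (same policy as
 * NA_CHECK_SUBSYS_WARNING in the patch). */
static void
cxi_domain_set_bool(struct fid_domain *domain, int opt, bool val)
{
    int rc = fi_control(&domain->fid, opt, &val);

    if (rc != 0)
        fprintf(stderr, "warning: fi_control(%d) failed: %s\n", opt,
            fi_strerror(-rc));
}

/* Usage mirroring na_ofi_cxi_set_domain_ops():
 *     cxi_domain_set_bool(domain, FI_OPT_CXI_SET_PROV_KEY_CACHE, false);
 *     cxi_domain_set_bool(domain, FI_OPT_CXI_SET_OPTIMIZED_MRS, false);
 *     cxi_domain_set_bool(domain, FI_OPT_CXI_SET_MR_MATCH_EVENTS, true);
 */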
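
Reviewer note (not part of the patches): the new NA_OFI_SKIP_DOMAIN_OPS
variable is read once in na_ofi_domain_open(), and any non-zero integer value
skips na_ofi_set_domain_ops() entirely. A minimal usage sketch follows; the
"ofi+cxi://" init string and the bare HG_Init()/HG_Finalize() pairing are
illustrative only, the variable just has to be set before the NA OFI domain
is opened.

#include <stdlib.h>

#include <mercury.h>

int
main(void)
{
    hg_class_t *hg_class;

    /* Any non-zero integer value disables the optional GNI/CXI domain ops
     * (e.g., when the CXI MR settings applied by patch 1 are not wanted). */
    setenv("NA_OFI_SKIP_DOMAIN_OPS", "1", 1);

    hg_class = HG_Init("ofi+cxi://", HG_FALSE);
    if (hg_class == NULL)
        return EXIT_FAILURE;

    /* ... issue RPCs / bulk transfers ... */

    (void) HG_Finalize(hg_class);

    return EXIT_SUCCESS;
}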
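
Reviewer note (not part of the patches): with force_register set, hg_bw_read
and hg_bw_write now pay the HG_Bulk_create()/HG_Bulk_free() cost on every
timed iteration and ship the resulting handle in the RPC input
(hg_perf_bulk_info.bulk), which is exactly the ephemeral MR
allocation/deallocation path that the CXI settings in patch 1 are tuned for.
A condensed sketch of one such iteration is below; send_rpc_and_wait() is a
hypothetical stand-in for the HG_Forward() + hg_perf_request_wait() machinery
of the perf tests.

#include <mercury.h>
#include <mercury_bulk.h>

/* Hypothetical placeholder for HG_Forward() + request wait in the harness */
extern hg_return_t
send_rpc_and_wait(hg_handle_t handle, hg_bulk_t origin_bulk);

static hg_return_t
bw_iteration_force_register(
    hg_class_t *hg_class, hg_handle_t handle, void *buf, hg_size_t buf_size)
{
    hg_bulk_t bulk = HG_BULK_NULL;
    hg_return_t ret;

    /* Register the buffer for this iteration only */
    ret = HG_Bulk_create(
        hg_class, 1, &buf, &buf_size, HG_BULK_READ_ONLY, &bulk);
    if (ret != HG_SUCCESS)
        return ret;

    /* The handle travels in the RPC input; HG_BULK_NULL would tell the
     * target to fall back to the handle exchanged at init time instead. */
    ret = send_rpc_and_wait(handle, bulk);

    /* Deregister so the next iteration pays the registration cost again */
    (void) HG_Bulk_free(bulk);

    return ret;
}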
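
Reviewer note (not part of the patches): on the target side the contract is
now two-sided: a non-NULL bulk received in the one-time bulk_init RPC is
cached (with its reference count incremented), while a non-NULL bulk received
in a per-iteration bandwidth RPC overrides the cached handle. The selection
added to hg_perf_bulk_common() condenses to the hypothetical helper below.

#include <mercury_bulk.h>

/* Pick the origin bulk handle to pass to HG_Bulk_transfer(): the one shipped
 * with this RPC (force_register mode) if present, otherwise the one cached
 * during hg_perf_bulk_init(). */
static hg_bulk_t
select_remote_bulk(hg_bulk_t rpc_bulk, hg_bulk_t cached_bulk)
{
    return (rpc_bulk != HG_BULK_NULL) ? rpc_bulk : cached_bulk;
}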