From 4632550f67ce2c32431c3be77944f423fc71f777 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Mon, 5 Feb 2024 22:27:16 -0600 Subject: [PATCH] ch4/ofi: use internal tag for pipeline chunk match_bits Follow a similar approach as nonblocking collectives, internal pipeline chunks use separate tag space (MPIDI_OFI_GPU_PIPELINE_SEND) and incrementing tags to avoid mismatch with regular messages. --- src/mpid/ch4/netmod/ofi/ofi_comm.c | 3 ++ src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c | 45 +++++++++++++--------- src/mpid/ch4/netmod/ofi/ofi_impl.h | 2 +- src/mpid/ch4/netmod/ofi/ofi_pre.h | 3 ++ src/mpid/ch4/netmod/ofi/ofi_send.h | 3 +- src/mpid/ch4/netmod/ofi/ofi_types.h | 30 +-------------- 6 files changed, 38 insertions(+), 48 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_comm.c b/src/mpid/ch4/netmod/ofi/ofi_comm.c index 57b9cb131de..8936941498a 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_comm.c +++ b/src/mpid/ch4/netmod/ofi/ofi_comm.c @@ -145,6 +145,9 @@ int MPIDI_OFI_mpi_comm_commit_pre_hook(MPIR_Comm * comm) MPIDI_OFI_COMM(comm).enable_hashing = 0; MPIDI_OFI_COMM(comm).pref_nic = NULL; + /* Initialize tag for gpu_pipeline chunks; incremented by sender. */ + MPIDI_OFI_COMM(comm).pipeline_tag = 0; + if (comm->hints[MPIR_COMM_HINT_ENABLE_MULTI_NIC_STRIPING] == -1) { comm->hints[MPIR_COMM_HINT_ENABLE_MULTI_NIC_STRIPING] = MPIR_CVAR_CH4_OFI_ENABLE_MULTI_NIC_STRIPING; diff --git a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c index b2c11b00c8f..b442890775b 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c +++ b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c @@ -14,6 +14,11 @@ struct chunk_req { void *buf; }; +struct pipeline_header { + int n_chunks; + int pipeline_tag; +}; + static void spawn_send_copy(MPIR_Async_thing * thing, MPIR_Request * sreq, MPIR_async_req * areq, const void *buf, MPI_Aint chunk_sz); static int start_recv_chunk(MPIR_Request * rreq, int idx, int n_chunks); @@ -39,12 +44,11 @@ int MPIDI_OFI_gpu_pipeline_send(MPIR_Request * sreq, const void *send_buf, MPI_Aint count, MPI_Datatype datatype, MPL_pointer_attr_t attr, MPI_Aint data_sz, uint64_t cq_data, fi_addr_t remote_addr, - int vci_local, int ctx_idx, uint64_t match_bits) + int vci_local, int ctx_idx, uint64_t match_bits, int pipeline_tag) { int mpi_errno = MPI_SUCCESS; uint32_t n_chunks = 0; - uint64_t is_packed = 0; /* always 0 ? */ MPI_Aint chunk_sz = MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ; if (data_sz <= chunk_sz) { /* data fits in a single chunk */ @@ -56,8 +60,6 @@ int MPIDI_OFI_gpu_pipeline_send(MPIR_Request * sreq, const void *send_buf, n_chunks++; } } - MPIDI_OFI_idata_set_gpuchunk_bits(&cq_data, n_chunks); - MPIDI_OFI_idata_set_gpu_packed_bit(&cq_data, is_packed); MPIDI_OFI_REQUEST(sreq, pipeline_info.send.num_remain) = n_chunks; MPIDI_OFI_REQUEST(sreq, pipeline_info.send.cq_data) = cq_data; @@ -65,9 +67,15 @@ int MPIDI_OFI_gpu_pipeline_send(MPIR_Request * sreq, const void *send_buf, MPIDI_OFI_REQUEST(sreq, pipeline_info.send.vci_local) = vci_local; MPIDI_OFI_REQUEST(sreq, pipeline_info.send.ctx_idx) = ctx_idx; MPIDI_OFI_REQUEST(sreq, pipeline_info.send.match_bits) = match_bits; + MPIDI_OFI_REQUEST(sreq, pipeline_info.send.pipeline_tag) = pipeline_tag; + + struct pipeline_header hdr; + hdr.n_chunks = n_chunks; + hdr.pipeline_tag = pipeline_tag; /* Send the initial empty packet for matching */ - MPIDI_OFI_CALL_RETRY(fi_tinjectdata(MPIDI_OFI_global.ctx[ctx_idx].tx, NULL, 0, cq_data, + MPIDI_OFI_CALL_RETRY(fi_tinjectdata(MPIDI_OFI_global.ctx[ctx_idx].tx, + &hdr, sizeof(hdr), cq_data | MPIDI_OFI_IDATA_PIPELINE, remote_addr, match_bits), vci_local, tinjectdata); struct send_alloc *p; @@ -197,7 +205,7 @@ static void send_copy_complete(MPIR_Request * sreq, const void *buf, MPI_Aint ch int ctx_idx = MPIDI_OFI_REQUEST(sreq, pipeline_info.send.ctx_idx); fi_addr_t remote_addr = MPIDI_OFI_REQUEST(sreq, pipeline_info.send.remote_addr); uint64_t cq_data = MPIDI_OFI_REQUEST(sreq, pipeline_info.send.cq_data); - uint64_t match_bits = MPIDI_OFI_REQUEST(sreq, pipeline_info.send.match_bits) | + uint64_t match_bits = MPIDI_OFI_REQUEST(sreq, pipeline_info.send.pipeline_tag) | MPIDI_OFI_GPU_PIPELINE_SEND; MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(vci_local).lock); MPIDI_OFI_CALL_RETRY(fi_tsenddata(MPIDI_OFI_global.ctx[ctx_idx].tx, @@ -318,7 +326,6 @@ static int recv_alloc_poll(MPIR_Async_thing * thing) fi_addr_t remote_addr = MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.remote_addr); int ctx_idx = MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.ctx_idx); int vci = MPIDI_Request_get_vci(rreq); - uint64_t match_bits = MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.match_bits); uint64_t mask_bits = MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.mask_bits); struct chunk_req *chunk_req; @@ -327,10 +334,14 @@ static int recv_alloc_poll(MPIR_Async_thing * thing) chunk_req->parent = rreq; chunk_req->buf = host_buf; + + uint64_t match_bits; if (p->n_chunks == -1) { + match_bits = MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.match_bits); chunk_req->event_id = MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT; } else { - match_bits |= MPIDI_OFI_GPU_PIPELINE_SEND; + match_bits = MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.pipeline_tag) | + MPIDI_OFI_GPU_PIPELINE_SEND; chunk_req->event_id = MPIDI_OFI_EVENT_RECV_GPU_PIPELINE; } MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(vci).lock); @@ -380,24 +391,22 @@ int MPIDI_OFI_gpu_pipeline_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Reques MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.is_sync) = true; } - uint32_t packed = MPIDI_OFI_idata_get_gpu_packed_bit(wc->data); - uint32_t n_chunks = MPIDI_OFI_idata_get_gpuchunk_bits(wc->data); - /* ? - Not sure why sender cannot send packed data */ - MPIR_Assert(packed == 0); - if (wc->len > 0) { + bool is_pipeline = (wc->data & MPIDI_OFI_IDATA_PIPELINE); + if (!is_pipeline) { /* message from a normal send */ - MPIR_Assert(n_chunks == 0); MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_remain) = 1; mpi_errno = start_recv_copy(rreq, host_buf, wc->len, recv_buf, recv_count, datatype); MPIR_ERR_CHECK(mpi_errno); } else { - MPIR_Assert(n_chunks > 0); - MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_remain) = n_chunks; + struct pipeline_header *p_hdr = host_buf; + MPIR_Assert(p_hdr->n_chunks > 0); + MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_remain) = p_hdr->n_chunks; + MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.pipeline_tag) = p_hdr->pipeline_tag; /* There is no data in the init chunk, free the buffer */ MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, host_buf); /* Post recv for the remaining chunks. */ - for (int i = 0; i < n_chunks; i++) { - mpi_errno = start_recv_chunk(rreq, i, n_chunks); + for (int i = 0; i < p_hdr->n_chunks; i++) { + mpi_errno = start_recv_chunk(rreq, i, p_hdr->n_chunks); MPIR_ERR_CHECK(mpi_errno); } } diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h index b4c959b8f35..698d68fac8e 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_impl.h +++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h @@ -831,7 +831,7 @@ int MPIDI_OFI_gpu_pipeline_send(MPIR_Request * sreq, const void *send_buf, MPI_Aint count, MPI_Datatype datatype, MPL_pointer_attr_t attr, MPI_Aint data_sz, uint64_t cq_data, fi_addr_t remote_addr, - int vci_local, int ctx_idx, uint64_t match_bits); + int vci_local, int ctx_idx, uint64_t match_bits, int pipeline_tag); int MPIDI_OFI_gpu_pipeline_recv(MPIR_Request * rreq, void *recv_buf, MPI_Aint count, MPI_Datatype datatype, fi_addr_t remote_addr, int vci_local, diff --git a/src/mpid/ch4/netmod/ofi/ofi_pre.h b/src/mpid/ch4/netmod/ofi/ofi_pre.h index 07b999ca808..48beae17fe2 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_pre.h +++ b/src/mpid/ch4/netmod/ofi/ofi_pre.h @@ -48,6 +48,7 @@ typedef struct { int enable_striping; /* Flag to enable striping per communicator. */ int enable_hashing; /* Flag to enable hashing per communicator. */ int *pref_nic; /* Array to specify the preferred NIC for each rank (if needed) */ + int pipeline_tag; /* match_bits for gpu_pipeline chunks */ } MPIDI_OFI_comm_t; enum { MPIDI_AMTYPE_NONE = 0, @@ -223,6 +224,7 @@ typedef struct { fi_addr_t remote_addr; uint64_t cq_data; uint64_t match_bits; + int pipeline_tag; int num_remain; } send; struct { @@ -232,6 +234,7 @@ typedef struct { uint64_t match_bits; uint64_t mask_bits; MPI_Aint offset; + int pipeline_tag; int num_inrecv; int num_remain; bool is_sync; diff --git a/src/mpid/ch4/netmod/ofi/ofi_send.h b/src/mpid/ch4/netmod/ofi/ofi_send.h index 855970e8927..dc30330a84c 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_send.h +++ b/src/mpid/ch4/netmod/ofi/ofi_send.h @@ -275,9 +275,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_normal(const void *buf, MPI_Aint cou data_sz >= MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD) { /* Pipeline path */ fi_addr_t remote_addr = MPIDI_OFI_av_to_phys(addr, receiver_nic, vci_remote); + MPIDI_OFI_COMM(comm).pipeline_tag += 1; mpi_errno = MPIDI_OFI_gpu_pipeline_send(sreq, buf, count, datatype, attr, data_sz, cq_data, remote_addr, vci_local, ctx_idx, - match_bits); + match_bits, MPIDI_OFI_COMM(comm).pipeline_tag); MPIR_ERR_CHECK(mpi_errno); MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_sent_bytes_count[sender_nic], data_sz); diff --git a/src/mpid/ch4/netmod/ofi/ofi_types.h b/src/mpid/ch4/netmod/ofi/ofi_types.h index 46f49ea4478..657fe08261a 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_types.h +++ b/src/mpid/ch4/netmod/ofi/ofi_types.h @@ -37,14 +37,12 @@ #define MPIDI_OFI_IDATA_ERROR_BITS (2) /* The number of bits in the immediate data field allocated to the source rank and error propagation. */ #define MPIDI_OFI_IDATA_SRC_ERROR_BITS (MPIDI_OFI_IDATA_SRC_BITS + MPIDI_OFI_IDATA_ERROR_BITS) -/* The number of bits in the immediate data field allocated to MPI_Packed datatype for GPU. */ -#define MPIDI_OFI_IDATA_GPU_PACKED_BITS (1) -/* The offset of bits in the immediate data field allocated to number of message chunks. */ -#define MPIDI_OFI_IDATA_GPUCHUNK_OFFSET (MPIDI_OFI_IDATA_SRC_ERROR_BITS + MPIDI_OFI_IDATA_GPU_PACKED_BITS) /* Bit mask for MPIR_ERR_OTHER */ #define MPIDI_OFI_ERR_OTHER (0x1ULL) /* Bit mask for MPIR_PROC_FAILED */ #define MPIDI_OFI_ERR_PROC_FAILED (0x2ULL) +/* Bit mask for gpu pipeline */ +#define MPIDI_OFI_IDATA_PIPELINE (1ULL << 32) /* Set the error bits */ MPL_STATIC_INLINE_PREFIX void MPIDI_OFI_idata_set_error_bits(uint64_t * data_field, @@ -75,30 +73,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_idata_get_error_bits(uint64_t idata) } } -/* Set the gpu packed bit */ -static inline void MPIDI_OFI_idata_set_gpu_packed_bit(uint64_t * data_field, uint64_t is_packed) -{ - *data_field = (*data_field) | (is_packed << MPIDI_OFI_IDATA_SRC_ERROR_BITS); -} - -/* Get the gpu packed bit from the OFI data field. */ -static inline uint32_t MPIDI_OFI_idata_get_gpu_packed_bit(uint64_t idata) -{ - return (idata >> MPIDI_OFI_IDATA_SRC_ERROR_BITS) & 0x1ULL; -} - -/* Set gpu chunk bits */ -static inline void MPIDI_OFI_idata_set_gpuchunk_bits(uint64_t * data_field, uint64_t n_chunks) -{ - *data_field = (*data_field) | (n_chunks << MPIDI_OFI_IDATA_GPUCHUNK_OFFSET); -} - -/* Get gpu chunks from the OFI data field. */ -static inline uint32_t MPIDI_OFI_idata_get_gpuchunk_bits(uint64_t idata) -{ - return (idata >> MPIDI_OFI_IDATA_GPUCHUNK_OFFSET); -} - /* There are 4 protocol bits: * - MPIDI_DYNPROC_SEND * - MPIDI_OFI_HUGE_SEND