From 0f74170454e827cbff74457c3e934dd9fc114325 Mon Sep 17 00:00:00 2001 From: Yanfei Guo Date: Thu, 1 Aug 2024 12:48:30 -0500 Subject: [PATCH 1/5] typerep: add fastpath for H2H typerep copy/pack/unpack Add new flag MPIR_TYPEREP_FLAG_H2H can be passed to choose this path. If typerep user knows the buffers are all host, they can using this to bypass buffer attribute checking and related branches. This can be useful for SHM pipelined transfer where send/recv buffer attribute can be cached in request. --- src/include/mpir_typerep.h | 1 + .../datatype/typerep/src/typerep_yaksa_pack.c | 251 ++++++++++++++++-- 2 files changed, 224 insertions(+), 28 deletions(-) diff --git a/src/include/mpir_typerep.h b/src/include/mpir_typerep.h index cde3794484f..86c0a6ecb05 100644 --- a/src/include/mpir_typerep.h +++ b/src/include/mpir_typerep.h @@ -62,6 +62,7 @@ int MPIR_Typerep_iov_len(MPI_Aint count, MPI_Datatype type, MPI_Aint max_iov_byt #define MPIR_TYPEREP_FLAG_NONE 0x0UL #define MPIR_TYPEREP_FLAG_STREAM 0x1UL +#define MPIR_TYPEREP_FLAG_H2H 0x2UL int MPIR_Typerep_copy(void *outbuf, const void *inbuf, MPI_Aint num_bytes, uint32_t flags); int MPIR_Typerep_pack(const void *inbuf, MPI_Aint incount, MPI_Datatype datatype, diff --git a/src/mpi/datatype/typerep/src/typerep_yaksa_pack.c b/src/mpi/datatype/typerep/src/typerep_yaksa_pack.c index a6f03201e02..e8c0985a5f8 100644 --- a/src/mpi/datatype/typerep/src/typerep_yaksa_pack.c +++ b/src/mpi/datatype/typerep/src/typerep_yaksa_pack.c @@ -87,31 +87,35 @@ static int typerep_do_copy(void *outbuf, const void *inbuf, MPI_Aint num_bytes, goto fn_exit; } + if (flags & MPIR_TYPEREP_FLAG_H2H) { + if (flags & MPIR_TYPEREP_FLAG_STREAM) { + MPIR_Memcpy(outbuf, inbuf, num_bytes); + } else { + MPIR_Memcpy_stream(outbuf, inbuf, num_bytes); + } + } + MPL_pointer_attr_t inattr, outattr; MPIR_GPU_query_pointer_attr(inbuf, &inattr); MPIR_GPU_query_pointer_attr(outbuf, &outattr); - if (IS_HOST(inattr) && IS_HOST(outattr)) { - MPIR_Memcpy(outbuf, inbuf, num_bytes); + uintptr_t actual_pack_bytes; + + yaksa_info_t info = MPII_yaksa_get_info(&inattr, &outattr); + if (typerep_req) { + typerep_req->info = info; + rc = yaksa_ipack(inbuf, num_bytes, YAKSA_TYPE__BYTE, 0, outbuf, num_bytes, + &actual_pack_bytes, info, YAKSA_OP__REPLACE, + (yaksa_request_t *) & typerep_req->req); + MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); + MPIR_Assert(actual_pack_bytes == num_bytes); } else { - uintptr_t actual_pack_bytes; - - yaksa_info_t info = MPII_yaksa_get_info(&inattr, &outattr); - if (typerep_req) { - typerep_req->info = info; - rc = yaksa_ipack(inbuf, num_bytes, YAKSA_TYPE__BYTE, 0, outbuf, num_bytes, - &actual_pack_bytes, info, YAKSA_OP__REPLACE, - (yaksa_request_t *) & typerep_req->req); - MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); - MPIR_Assert(actual_pack_bytes == num_bytes); - } else { - rc = yaksa_pack(inbuf, num_bytes, YAKSA_TYPE__BYTE, 0, outbuf, num_bytes, - &actual_pack_bytes, info, YAKSA_OP__REPLACE); - MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); - MPIR_Assert(actual_pack_bytes == num_bytes); - rc = MPII_yaksa_free_info(info); - MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); - } + rc = yaksa_pack(inbuf, num_bytes, YAKSA_TYPE__BYTE, 0, outbuf, num_bytes, + &actual_pack_bytes, info, YAKSA_OP__REPLACE); + MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); + MPIR_Assert(actual_pack_bytes == num_bytes); + rc = MPII_yaksa_free_info(info); + MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); } fn_exit: @@ -213,6 +217,94 @@ static int typerep_do_pack(const void *inbuf, MPI_Aint incount, MPI_Datatype dat goto fn_exit; } +static int typerep_do_pack_h2h(const void *inbuf, MPI_Aint incount, MPI_Datatype datatype, + MPI_Aint inoffset, void *outbuf, MPI_Aint max_pack_bytes, + MPI_Aint * actual_pack_bytes, MPIR_Typerep_req * typerep_req, + uint32_t flags) +{ + MPIR_FUNC_ENTER; + + int mpi_errno = MPI_SUCCESS; + int rc; + + if (typerep_req) { + typerep_req->req = MPIR_TYPEREP_REQ_NULL; + } + + if (incount == 0) { + *actual_pack_bytes = 0; + goto fn_exit; + } + + MPIR_Assert(datatype != MPI_DATATYPE_NULL); + + int is_contig = 0; + int element_size = -1; + const void *inbuf_ptr; /* adjusted by true_lb */ + MPI_Aint total_size = 0; + if (HANDLE_IS_BUILTIN(datatype)) { + is_contig = 1; + element_size = MPIR_Datatype_get_basic_size(datatype); + inbuf_ptr = inbuf; + total_size = incount * element_size; + } else { + MPIR_Datatype *dtp; + MPIR_Datatype_get_ptr(datatype, dtp); + is_contig = dtp->is_contig; + element_size = dtp->builtin_element_size; + inbuf_ptr = MPIR_get_contig_ptr(inbuf, dtp->true_lb); + total_size = incount * dtp->size; + } + + /* only query the pointer attributes in case of relative addressing */ + // bool rel_addressing = (inbuf != MPI_BOTTOM); + // if (rel_addressing) { + // MPIR_GPU_query_pointer_attr(inbuf_ptr, &inattr); + // MPIR_GPU_query_pointer_attr(outbuf, &outattr); + // } + + if (is_contig) { + MPI_Aint real_bytes = MPL_MIN(total_size - inoffset, max_pack_bytes); + /* Make sure we never pack partial element */ + real_bytes -= real_bytes % element_size; + if (flags & MPIR_TYPEREP_FLAG_STREAM) { + MPIR_Memcpy_stream(outbuf, MPIR_get_contig_ptr(inbuf_ptr, inoffset), real_bytes); + } else { + MPIR_Memcpy(outbuf, MPIR_get_contig_ptr(inbuf_ptr, inoffset), real_bytes); + } + *actual_pack_bytes = real_bytes; + goto fn_exit; + } + + yaksa_type_t type = MPII_Typerep_get_yaksa_type(datatype); + yaksa_info_t info = MPII_yaksa_info_nogpu; + + uintptr_t real_pack_bytes; + if (typerep_req) { + typerep_req->info = info; + rc = yaksa_ipack(inbuf, incount, type, inoffset, outbuf, max_pack_bytes, + &real_pack_bytes, info, YAKSA_OP__REPLACE, + (yaksa_request_t *) & typerep_req->req); + MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); + } else { + rc = yaksa_pack(inbuf, incount, type, inoffset, outbuf, max_pack_bytes, + &real_pack_bytes, info, YAKSA_OP__REPLACE); + MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); + if (info) { + rc = MPII_yaksa_free_info(info); + MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); + } + } + + *actual_pack_bytes = (MPI_Aint) real_pack_bytes; + + fn_exit: + MPIR_FUNC_EXIT; + return mpi_errno; + fn_fail: + goto fn_exit; +} + /* This function checks whether the operation is supported in yaksa for the * provided datatype */ int MPIR_Typerep_reduce_is_supported(MPI_Op op, MPI_Aint count, MPI_Datatype datatype) @@ -372,6 +464,89 @@ static int typerep_do_unpack(const void *inbuf, MPI_Aint insize, void *outbuf, M goto fn_exit; } +static int typerep_do_unpack_h2h(const void *inbuf, MPI_Aint insize, void *outbuf, + MPI_Aint outcount, MPI_Datatype datatype, MPI_Aint outoffset, + MPI_Aint * actual_unpack_bytes, MPIR_Typerep_req * typerep_req, + uint32_t flags) +{ + MPIR_FUNC_ENTER; + + int mpi_errno = MPI_SUCCESS; + int rc; + + if (typerep_req) { + typerep_req->req = MPIR_TYPEREP_REQ_NULL; + } + + if (insize == 0) { + *actual_unpack_bytes = 0; + goto fn_exit; + } + + MPIR_Assert(datatype != MPI_DATATYPE_NULL); + + int is_contig = 0; + int element_size = -1; + const void *outbuf_ptr; /* adjusted by true_lb */ + MPI_Aint total_size = 0; + if (HANDLE_IS_BUILTIN(datatype)) { + is_contig = 1; + element_size = MPIR_Datatype_get_basic_size(datatype); + outbuf_ptr = outbuf; + total_size = outcount * element_size; + } else { + MPIR_Datatype *dtp; + MPIR_Datatype_get_ptr(datatype, dtp); + is_contig = dtp->is_contig; + element_size = dtp->builtin_element_size; + outbuf_ptr = MPIR_get_contig_ptr(outbuf, dtp->true_lb); + total_size = outcount * dtp->size; + } + + if (is_contig) { + *actual_unpack_bytes = MPL_MIN(total_size - outoffset, insize); + /* We assume the amount we unpack is multiple of element_size */ + MPIR_Assert(element_size < 0 || *actual_unpack_bytes % element_size == 0); + if (flags & MPIR_TYPEREP_FLAG_STREAM) { + MPIR_Memcpy_stream(MPIR_get_contig_ptr(outbuf_ptr, outoffset), inbuf, + *actual_unpack_bytes); + } else { + MPIR_Memcpy(MPIR_get_contig_ptr(outbuf_ptr, outoffset), inbuf, *actual_unpack_bytes); + } + goto fn_exit; + } + + yaksa_type_t type = MPII_Typerep_get_yaksa_type(datatype); + yaksa_info_t info = (outbuf != MPI_BOTTOM) ? MPII_yaksa_info_nogpu : NULL; + + uintptr_t real_insize = MPL_MIN(total_size - outoffset, insize); + + uintptr_t real_unpack_bytes; + if (typerep_req) { + typerep_req->info = info; + rc = yaksa_iunpack(inbuf, real_insize, outbuf, outcount, type, outoffset, + &real_unpack_bytes, info, YAKSA_OP__REPLACE, + (yaksa_request_t *) & typerep_req->req); + MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); + } else { + rc = yaksa_unpack(inbuf, real_insize, outbuf, outcount, type, outoffset, &real_unpack_bytes, + info, YAKSA_OP__REPLACE); + MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); + if (info) { + rc = MPII_yaksa_free_info(info); + MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); + } + } + + *actual_unpack_bytes = (MPI_Aint) real_unpack_bytes; + + fn_exit: + MPIR_FUNC_EXIT; + return mpi_errno; + fn_fail: + goto fn_exit; +} + int MPIR_Typerep_icopy(void *outbuf, const void *inbuf, MPI_Aint num_bytes, MPIR_Typerep_req * typerep_req, uint32_t flags) { @@ -402,8 +577,13 @@ int MPIR_Typerep_ipack(const void *inbuf, MPI_Aint incount, MPI_Datatype datatyp MPIR_FUNC_ENTER; int mpi_errno = MPI_SUCCESS; - mpi_errno = typerep_do_pack(inbuf, incount, datatype, inoffset, outbuf, max_pack_bytes, - actual_pack_bytes, typerep_req, flags); + if (flags & MPIR_TYPEREP_FLAG_H2H) { + mpi_errno = typerep_do_pack_h2h(inbuf, incount, datatype, inoffset, outbuf, max_pack_bytes, + actual_pack_bytes, typerep_req, flags); + } else { + mpi_errno = typerep_do_pack(inbuf, incount, datatype, inoffset, outbuf, max_pack_bytes, + actual_pack_bytes, typerep_req, flags); + } MPIR_FUNC_EXIT; return mpi_errno; @@ -416,8 +596,13 @@ int MPIR_Typerep_pack(const void *inbuf, MPI_Aint incount, MPI_Datatype datatype MPIR_FUNC_ENTER; int mpi_errno = MPI_SUCCESS; - mpi_errno = typerep_do_pack(inbuf, incount, datatype, inoffset, outbuf, max_pack_bytes, - actual_pack_bytes, NULL, flags); + if (flags & MPIR_TYPEREP_FLAG_H2H) { + mpi_errno = typerep_do_pack_h2h(inbuf, incount, datatype, inoffset, outbuf, max_pack_bytes, + actual_pack_bytes, NULL, flags); + } else { + mpi_errno = typerep_do_pack(inbuf, incount, datatype, inoffset, outbuf, max_pack_bytes, + actual_pack_bytes, NULL, flags); + } MPIR_FUNC_EXIT; return mpi_errno; @@ -430,8 +615,13 @@ int MPIR_Typerep_iunpack(const void *inbuf, MPI_Aint insize, void *outbuf, MPI_A MPIR_FUNC_ENTER; int mpi_errno = MPI_SUCCESS; - mpi_errno = typerep_do_unpack(inbuf, insize, outbuf, outcount, datatype, outoffset, - actual_unpack_bytes, typerep_req, flags); + if (flags & MPIR_TYPEREP_FLAG_H2H) { + mpi_errno = typerep_do_unpack_h2h(inbuf, insize, outbuf, outcount, datatype, outoffset, + actual_unpack_bytes, typerep_req, flags); + } else { + mpi_errno = typerep_do_unpack(inbuf, insize, outbuf, outcount, datatype, outoffset, + actual_unpack_bytes, typerep_req, flags); + } MPIR_FUNC_EXIT; return mpi_errno; @@ -444,8 +634,13 @@ int MPIR_Typerep_unpack(const void *inbuf, MPI_Aint insize, void *outbuf, MPI_Ai MPIR_FUNC_ENTER; int mpi_errno = MPI_SUCCESS; - mpi_errno = typerep_do_unpack(inbuf, insize, outbuf, outcount, datatype, outoffset, - actual_unpack_bytes, NULL, flags); + if (flags & MPIR_TYPEREP_FLAG_H2H) { + mpi_errno = typerep_do_unpack_h2h(inbuf, insize, outbuf, outcount, datatype, outoffset, + actual_unpack_bytes, NULL, flags); + } else { + mpi_errno = typerep_do_unpack(inbuf, insize, outbuf, outcount, datatype, outoffset, + actual_unpack_bytes, NULL, flags); + } MPIR_FUNC_EXIT; return mpi_errno; From 85ce189fc7d682c03e00ce16eec18352f58ead4d Mon Sep 17 00:00:00 2001 From: Yanfei Guo Date: Thu, 1 Aug 2024 13:11:06 -0500 Subject: [PATCH 2/5] mpl/gpu: make pointer attribute type flag like The checking on whether a buffer is on host can be simplified to one check because the enum of unregistered host and registered host can be combined. --- src/mpi/datatype/typerep/src/typerep_yaksa_pack.c | 3 +-- src/mpl/include/mpl_gpu.h | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/mpi/datatype/typerep/src/typerep_yaksa_pack.c b/src/mpi/datatype/typerep/src/typerep_yaksa_pack.c index e8c0985a5f8..dfa9323beed 100644 --- a/src/mpi/datatype/typerep/src/typerep_yaksa_pack.c +++ b/src/mpi/datatype/typerep/src/typerep_yaksa_pack.c @@ -66,8 +66,7 @@ */ #define IS_HOST(attr) \ - ((attr).type == MPL_GPU_POINTER_UNREGISTERED_HOST || \ - (attr).type == MPL_GPU_POINTER_REGISTERED_HOST) + ((attr).type & (MPL_GPU_POINTER_UNREGISTERED_HOST | MPL_GPU_POINTER_REGISTERED_HOST)) /* When a returned typerep_req is expected, using the nonblocking yaksa routine and * return the request; otherwise use the blocking yaksa routine. */ diff --git a/src/mpl/include/mpl_gpu.h b/src/mpl/include/mpl_gpu.h index 16b35ac8bf6..ba25f411b3b 100644 --- a/src/mpl/include/mpl_gpu.h +++ b/src/mpl/include/mpl_gpu.h @@ -23,10 +23,10 @@ #endif typedef enum { - MPL_GPU_POINTER_UNREGISTERED_HOST = 0, - MPL_GPU_POINTER_REGISTERED_HOST, - MPL_GPU_POINTER_DEV, - MPL_GPU_POINTER_MANAGED + MPL_GPU_POINTER_UNREGISTERED_HOST = 0x1, + MPL_GPU_POINTER_REGISTERED_HOST = 0x2, + MPL_GPU_POINTER_DEV = 0x4, + MPL_GPU_POINTER_MANAGED = 0x8 } MPL_pointer_type_t; typedef enum { From b75dc37ab44f9ebd23873d3a01612e5a36aee6da Mon Sep 17 00:00:00 2001 From: Yanfei Guo Date: Thu, 1 Aug 2024 13:14:16 -0500 Subject: [PATCH 3/5] mpl/gpu: code format cleanup --- src/mpl/include/mpl_gpu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mpl/include/mpl_gpu.h b/src/mpl/include/mpl_gpu.h index ba25f411b3b..7d48e3e061d 100644 --- a/src/mpl/include/mpl_gpu.h +++ b/src/mpl/include/mpl_gpu.h @@ -61,7 +61,7 @@ typedef enum { MPL_GPU_COPY_H2D, MPL_GPU_COPY_D2D_INCOMING, /* copy from remote to local */ MPL_GPU_COPY_D2D_OUTGOING, /* copy from local to remote */ - MPL_GPU_COPY_DIRECTION_NONE, /* copy in any direction and to/from any buffer type */ + MPL_GPU_COPY_DIRECTION_NONE, /* copy in any direction and to/from any buffer type */ } MPL_gpu_copy_direction_t; #define MPL_GPU_COPY_DIRECTION_TYPES 4 From 174033a21dd4ba498b759e6acf9d38b83442ac34 Mon Sep 17 00:00:00 2001 From: Yanfei Guo Date: Thu, 1 Aug 2024 13:18:45 -0500 Subject: [PATCH 4/5] ch4/mpidig: cache destination buffer attribute in request We check and cache the recv buffer attribute when posting recv request. This info is used to use the H2H fast path in typerep. --- src/mpid/ch4/include/mpidpre.h | 2 ++ src/mpid/ch4/src/mpidig_recv.h | 3 +++ src/mpid/ch4/src/mpidig_recv_utils.h | 20 ++++++++++++++------ 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/src/mpid/ch4/include/mpidpre.h b/src/mpid/ch4/include/mpidpre.h index 6fc6bb6ec0b..a5c8526000d 100644 --- a/src/mpid/ch4/include/mpidpre.h +++ b/src/mpid/ch4/include/mpidpre.h @@ -164,6 +164,7 @@ typedef struct MPIDIG_req_async { struct iovec iov_one; /* used with MPIDIG_RECV_CONTIG */ MPIDIG_recv_data_copy_cb data_copy_cb; /* called in recv_init/recv_type_init for async * data copying */ + int typerep_flags; } MPIDIG_rreq_async_t; typedef struct MPIDIG_sreq_async { @@ -209,6 +210,7 @@ typedef struct MPIDIG_req_t { void *buffer; MPI_Aint count; MPI_Datatype datatype; + MPL_pointer_attr_t buf_attr; union { struct { int dest; diff --git a/src/mpid/ch4/src/mpidig_recv.h b/src/mpid/ch4/src/mpidig_recv.h index 56c30112f9e..d992d13414b 100644 --- a/src/mpid/ch4/src/mpidig_recv.h +++ b/src/mpid/ch4/src/mpidig_recv.h @@ -151,6 +151,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDIG_handle_unexpected(void *buf, MPI_Aint count, MPIDIG_REQUEST(rreq, datatype) = datatype; MPIDIG_REQUEST(rreq, buffer) = buf; MPIDIG_REQUEST(rreq, count) = count; + MPIR_GPU_query_pointer_attr(buf, &MPIDIG_REQUEST(rreq, buf_attr)); MPIDIG_recv_type_init(unexp_data_sz, rreq); } } @@ -256,6 +257,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDIG_do_irecv(void *buf, MPI_Aint count, MPI_Data MPIDIG_REQUEST(unexp_req, buffer) = buf; MPIDIG_REQUEST(unexp_req, count) = count; MPIDIG_REQUEST(unexp_req, req->status) &= ~MPIDIG_REQ_UNEXPECTED; + MPIR_GPU_query_pointer_attr(buf, &MPIDIG_REQUEST(rreq, buf_attr)); /* MPIDIG_recv_type_init will call the callback to finish the rndv protocol */ mpi_errno = MPIDIG_recv_type_init(data_sz, unexp_req); } else { @@ -284,6 +286,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDIG_do_irecv(void *buf, MPI_Aint count, MPI_Data MPIR_Datatype_add_ref_if_not_builtin(datatype); MPIDIG_prepare_recv_req(rank, tag, context_id, buf, count, datatype, rreq); + MPIR_GPU_query_pointer_attr(buf, &MPIDIG_REQUEST(rreq, buf_attr)); MPIDIG_enqueue_request(rreq, &MPIDI_global.per_vci[vci].posted_list, MPIDIG_PT2PT_POSTED); } diff --git a/src/mpid/ch4/src/mpidig_recv_utils.h b/src/mpid/ch4/src/mpidig_recv_utils.h index c72029016bb..ade0071107b 100644 --- a/src/mpid/ch4/src/mpidig_recv_utils.h +++ b/src/mpid/ch4/src/mpidig_recv_utils.h @@ -219,6 +219,9 @@ MPL_STATIC_INLINE_PREFIX void MPIDIG_recv_copy(void *in_data, MPIR_Request * rre { MPIDIG_rreq_async_t *p = &(MPIDIG_REQUEST(rreq, req->recv_async)); MPI_Aint in_data_sz = p->in_data_sz; + int flags = (MPIDIG_REQUEST(rreq, buf_attr).type & + (MPL_GPU_POINTER_UNREGISTERED_HOST | MPL_GPU_POINTER_REGISTERED_HOST)) + ? MPIR_TYPEREP_FLAG_H2H : MPIR_TYPEREP_FLAG_NONE; if (in_data_sz == 0) { /* otherwise if recv size = 0, it is at least a truncation error */ MPIR_STATUS_SET_COUNT(rreq->status, 0); @@ -228,7 +231,7 @@ MPL_STATIC_INLINE_PREFIX void MPIDIG_recv_copy(void *in_data, MPIR_Request * rre MPIDIG_REQUEST(rreq, buffer), MPIDIG_REQUEST(rreq, count), MPIDIG_REQUEST(rreq, datatype), - 0, &actual_unpack_bytes, MPIR_TYPEREP_FLAG_NONE); + 0, &actual_unpack_bytes, flags & MPIR_TYPEREP_FLAG_NONE); if (!rreq->status.MPI_ERROR && in_data_sz > actual_unpack_bytes) { /* Truncation error has been checked at MPIDIG_recv_type_init. * If the receive buffer had enough space, but we still @@ -251,7 +254,7 @@ MPL_STATIC_INLINE_PREFIX void MPIDIG_recv_copy(void *in_data, MPIR_Request * rre } data_sz = MPL_MIN(data_sz, in_data_sz); - MPIR_Typerep_copy(data, in_data, data_sz, MPIR_TYPEREP_FLAG_NONE); + MPIR_Typerep_copy(data, in_data, data_sz, flags & MPIR_TYPEREP_FLAG_NONE); MPIR_STATUS_SET_COUNT(rreq->status, data_sz); } else { /* noncontig case */ @@ -263,7 +266,7 @@ MPL_STATIC_INLINE_PREFIX void MPIDIG_recv_copy(void *in_data, MPIR_Request * rre for (int i = 0; i < iov_len && rem > 0; i++) { int curr_len = MPL_MIN(rem, iov[i].iov_len); MPIR_Typerep_copy(iov[i].iov_base, (char *) in_data + done, curr_len, - MPIR_TYPEREP_FLAG_NONE); + flags & MPIR_TYPEREP_FLAG_NONE); rem -= curr_len; done += curr_len; } @@ -282,6 +285,10 @@ MPL_STATIC_INLINE_PREFIX void MPIDIG_recv_copy(void *in_data, MPIR_Request * rre MPL_STATIC_INLINE_PREFIX void MPIDIG_recv_setup(MPIR_Request * rreq) { MPIDIG_rreq_async_t *p = &(MPIDIG_REQUEST(rreq, req->recv_async)); + MPIDIG_REQUEST(rreq, req->recv_async).typerep_flags = + (MPIDIG_REQUEST(rreq, buf_attr).type & + (MPL_GPU_POINTER_UNREGISTERED_HOST | MPL_GPU_POINTER_REGISTERED_HOST)) ? + MPIR_TYPEREP_FLAG_H2H : MPIR_TYPEREP_FLAG_NONE; p->offset = 0; if (p->recv_type == MPIDIG_RECV_DATATYPE) { /* it's ready, rreq status to be set */ @@ -330,7 +337,8 @@ MPL_STATIC_INLINE_PREFIX int MPIDIG_recv_copy_seg(void *payload, MPI_Aint payloa MPIDIG_REQUEST(rreq, buffer), MPIDIG_REQUEST(rreq, count), MPIDIG_REQUEST(rreq, datatype), - p->offset, &actual_unpack_bytes, MPIR_TYPEREP_FLAG_NONE); + p->offset, &actual_unpack_bytes, + p->typerep_flags & MPIR_TYPEREP_FLAG_NONE); p->offset += payload_sz; if (payload_sz > actual_unpack_bytes) { /* basic element size mismatch */ @@ -353,7 +361,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDIG_recv_copy_seg(void *payload, MPI_Aint payloa for (int i = 0; i < p->iov_num; i++) { if (payload_sz < p->iov_ptr[i].iov_len) { MPIR_Typerep_copy(p->iov_ptr[i].iov_base, payload, payload_sz, - MPIR_TYPEREP_FLAG_NONE); + p->typerep_flags & MPIR_TYPEREP_FLAG_NONE); p->iov_ptr[i].iov_base = (char *) p->iov_ptr[i].iov_base + payload_sz; p->iov_ptr[i].iov_len -= payload_sz; /* not done */ @@ -361,7 +369,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDIG_recv_copy_seg(void *payload, MPI_Aint payloa } else { /* fill one iov */ MPIR_Typerep_copy(p->iov_ptr[i].iov_base, payload, p->iov_ptr[i].iov_len, - MPIR_TYPEREP_FLAG_NONE); + p->typerep_flags & MPIR_TYPEREP_FLAG_NONE); payload = (char *) payload + p->iov_ptr[i].iov_len; payload_sz -= p->iov_ptr[i].iov_len; iov_done++; From c4d1e02aafeca4ace44140c70a3d541f01b0fef8 Mon Sep 17 00:00:00 2001 From: Yanfei Guo Date: Thu, 1 Aug 2024 13:56:58 -0500 Subject: [PATCH 5/5] ch4/shm: add fast path for host buffer Check if source buffer is on host and choose typerep fast path for H2H. --- .../ch4/shm/posix/eager/iqueue/iqueue_send.h | 21 ++++++++++++------- src/mpid/ch4/shm/posix/posix_am.h | 16 ++++++++++++-- src/mpid/ch4/shm/posix/posix_pre.h | 4 +++- src/mpid/ch4/shm/posix/posix_progress.h | 2 ++ src/mpid/ch4/shm/posix/posix_send.h | 12 ++++++++++- src/mpid/ch4/src/mpidig_send_utils.h | 4 ++++ 6 files changed, 47 insertions(+), 12 deletions(-) diff --git a/src/mpid/ch4/shm/posix/eager/iqueue/iqueue_send.h b/src/mpid/ch4/shm/posix/eager/iqueue/iqueue_send.h index 0ec3931cd93..8193a00b7a6 100644 --- a/src/mpid/ch4/shm/posix/eager/iqueue/iqueue_send.h +++ b/src/mpid/ch4/shm/posix/eager/iqueue/iqueue_send.h @@ -95,9 +95,11 @@ MPIDI_POSIX_eager_send(int grank, MPIDI_POSIX_am_header_t * msg_hdr, const void cell->type = MPIDI_POSIX_EAGER_IQUEUE_CELL_TYPE_HDR; /* send am_hdr if this is the first segment */ if (is_topo_local) { - MPIR_Typerep_copy(payload, am_hdr, am_hdr_sz, MPIR_TYPEREP_FLAG_NONE); + MPIR_Typerep_copy(payload, am_hdr, am_hdr_sz, + MPIR_TYPEREP_FLAG_H2H | MPIR_TYPEREP_FLAG_NONE); } else { - MPIR_Typerep_copy(payload, am_hdr, am_hdr_sz, MPIR_TYPEREP_FLAG_STREAM); + MPIR_Typerep_copy(payload, am_hdr, am_hdr_sz, + MPIR_TYPEREP_FLAG_H2H | MPIR_TYPEREP_FLAG_STREAM); } /* make sure the data region starts at the boundary of MAX_ALIGNMENT */ payload = payload + resized_am_hdr_sz; @@ -114,13 +116,16 @@ MPIDI_POSIX_eager_send(int grank, MPIDI_POSIX_am_header_t * msg_hdr, const void * not reliable because the derived datatype could have zero block size which contains no * data. */ if (bytes_sent) { - if (is_topo_local) { - MPIR_Typerep_pack(buf, count, datatype, offset, payload, available, &packed_size, - MPIR_TYPEREP_FLAG_NONE); - } else { - MPIR_Typerep_pack(buf, count, datatype, offset, payload, available, &packed_size, - MPIR_TYPEREP_FLAG_STREAM); + int typerep_flags = MPIR_TYPEREP_NONE; + if (msg_hdr == MPIDI_POSIX_AM_TYPE__SHORT_HOST + || msg_hdr == MPIDI_POSIX_AM_TYPE__PIPELINE_HOST) { + typerep_flags |= MPIR_TYPEREP_FLAG_H2H; + } + if (!is_topo_local) { + typerep_flags |= MPIR_TYPEREP_FLAG_STREAM; } + MPIR_Typerep_pack(buf, count, datatype, offset, payload, available, &packed_size, + typerep_flags); cell->payload_size += packed_size; *bytes_sent = packed_size; } diff --git a/src/mpid/ch4/shm/posix/posix_am.h b/src/mpid/ch4/shm/posix/posix_am.h index 5ddc2a45ec1..2b29b3801cb 100644 --- a/src/mpid/ch4/shm/posix/posix_am.h +++ b/src/mpid/ch4/shm/posix/posix_am.h @@ -11,6 +11,10 @@ #include "posix_eager.h" #include "mpidu_genq.h" +#undef IS_HOST +#define IS_HOST(attr) \ + ((attr).type & (MPL_GPU_POINTER_UNREGISTERED_HOST | MPL_GPU_POINTER_REGISTERED_HOST)) + MPL_STATIC_INLINE_PREFIX MPI_Aint MPIDI_POSIX_am_eager_limit(void) { return MPIDI_POSIX_eager_payload_limit() - MAX_ALIGNMENT; @@ -284,9 +288,17 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_do_am_isend(int grank, msg_hdr_p = msg_hdr; if (data_sz + am_hdr_sz <= MPIDI_POSIX_am_eager_limit()) { - msg_hdr_p->am_type = MPIDI_POSIX_AM_TYPE__SHORT; + if (IS_HOST(MPIDIG_REQUEST(sreq, buf_attr))) { + msg_hdr_p->am_type = MPIDI_POSIX_AM_TYPE__SHORT_HOST; + } else { + msg_hdr_p->am_type = MPIDI_POSIX_AM_TYPE__SHORT; + } } else { - msg_hdr_p->am_type = MPIDI_POSIX_AM_TYPE__PIPELINE; + if (IS_HOST(MPIDIG_REQUEST(sreq, buf_attr))) { + msg_hdr_p->am_type = MPIDI_POSIX_AM_TYPE__PIPELINE_HOST; + } else { + msg_hdr_p->am_type = MPIDI_POSIX_AM_TYPE__PIPELINE; + } } MPIDIG_am_send_async_init(sreq, datatype, data_sz); diff --git a/src/mpid/ch4/shm/posix/posix_pre.h b/src/mpid/ch4/shm/posix/posix_pre.h index 1357eed5895..8e25e5be9af 100644 --- a/src/mpid/ch4/shm/posix/posix_pre.h +++ b/src/mpid/ch4/shm/posix/posix_pre.h @@ -22,7 +22,9 @@ typedef enum { typedef enum { MPIDI_POSIX_AM_TYPE__HDR, MPIDI_POSIX_AM_TYPE__SHORT, - MPIDI_POSIX_AM_TYPE__PIPELINE + MPIDI_POSIX_AM_TYPE__SHORT_HOST, + MPIDI_POSIX_AM_TYPE__PIPELINE, + MPIDI_POSIX_AM_TYPE__PIPELINE_HOST } MPIDI_POSIX_am_type_t; struct MPIR_Request; diff --git a/src/mpid/ch4/shm/posix/posix_progress.h b/src/mpid/ch4/shm/posix/posix_progress.h index f0096bb6e04..e65ccc70c7a 100644 --- a/src/mpid/ch4/shm/posix/posix_progress.h +++ b/src/mpid/ch4/shm/posix/posix_progress.h @@ -57,11 +57,13 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_progress_recv(int vci, int *made_progre switch (msg_hdr->am_type) { case MPIDI_POSIX_AM_TYPE__HDR: case MPIDI_POSIX_AM_TYPE__SHORT: + case MPIDI_POSIX_AM_TYPE__SHORT_HOST: MPIDIG_global.target_msg_cbs[msg_hdr->handler_id] (am_hdr, payload, payload_left, attr, NULL); MPIDI_POSIX_eager_recv_commit(&transaction); goto fn_exit; case MPIDI_POSIX_AM_TYPE__PIPELINE: + case MPIDI_POSIX_AM_TYPE__PIPELINE_HOST: MPIDIG_global.target_msg_cbs[msg_hdr->handler_id] (am_hdr, NULL, payload_left, attr | MPIDIG_AM_ATTR__IS_ASYNC, &rreq); diff --git a/src/mpid/ch4/shm/posix/posix_send.h b/src/mpid/ch4/shm/posix/posix_send.h index 0796f671568..6a7e390de01 100644 --- a/src/mpid/ch4/shm/posix/posix_send.h +++ b/src/mpid/ch4/shm/posix/posix_send.h @@ -16,6 +16,10 @@ #include "posix_impl.h" +#undef IS_HOST +#define IS_HOST(attr) \ + ((attr).type & (MPL_GPU_POINTER_UNREGISTERED_HOST | MPL_GPU_POINTER_REGISTERED_HOST)) + #define MPIDI_POSIX_SEND_VSIS(vci_src_, vci_dst_) \ do { \ MPIDI_EXPLICIT_VCIS(comm, attr, comm->rank, rank, vci_src_, vci_dst_); \ @@ -52,7 +56,13 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_mpi_isend(const void *buf, MPI_Aint cou MPIDI_POSIX_am_header_t msg_hdr; msg_hdr.handler_id = MPIDIG_SEND; msg_hdr.am_hdr_sz = sizeof(MPIDIG_hdr_t); - msg_hdr.am_type = MPIDI_POSIX_AM_TYPE__SHORT; + MPL_pointer_attr_t attr; + MPIR_GPU_query_pointer_attr(buf, &attr); + if (IS_HOST(attr)) { + msg_hdr.am_type = MPIDI_POSIX_AM_TYPE__SHORT_HOST; + } else { + msg_hdr.am_type = MPIDI_POSIX_AM_TYPE__SHORT; + } MPIDIG_hdr_t am_hdr; am_hdr.src_rank = comm->rank; diff --git a/src/mpid/ch4/src/mpidig_send_utils.h b/src/mpid/ch4/src/mpidig_send_utils.h index f950aecbeef..c8018d424b3 100644 --- a/src/mpid/ch4/src/mpidig_send_utils.h +++ b/src/mpid/ch4/src/mpidig_send_utils.h @@ -6,6 +6,10 @@ #ifndef MPIDIG_SEND_UTILS_H_INCLUDED #define MPIDIG_SEND_UTILS_H_INCLUDED +#undef IS_HOST +#define IS_HOST(attr) \ + ((attr).type & (MPL_GPU_POINTER_UNREGISTERED_HOST | MPL_GPU_POINTER_REGISTERED_HOST)) + /* This file is for supporting routines used for pipelined data send. These routines mainly is for * managing the send request counters, completion counters and DT refcount */