diff --git a/src/include/mpir_typerep.h b/src/include/mpir_typerep.h index cde3794484f..86c0a6ecb05 100644 --- a/src/include/mpir_typerep.h +++ b/src/include/mpir_typerep.h @@ -62,6 +62,7 @@ int MPIR_Typerep_iov_len(MPI_Aint count, MPI_Datatype type, MPI_Aint max_iov_byt #define MPIR_TYPEREP_FLAG_NONE 0x0UL #define MPIR_TYPEREP_FLAG_STREAM 0x1UL +#define MPIR_TYPEREP_FLAG_H2H 0x2UL int MPIR_Typerep_copy(void *outbuf, const void *inbuf, MPI_Aint num_bytes, uint32_t flags); int MPIR_Typerep_pack(const void *inbuf, MPI_Aint incount, MPI_Datatype datatype, diff --git a/src/mpi/datatype/typerep/src/typerep_yaksa_pack.c b/src/mpi/datatype/typerep/src/typerep_yaksa_pack.c index a6f03201e02..dfa9323beed 100644 --- a/src/mpi/datatype/typerep/src/typerep_yaksa_pack.c +++ b/src/mpi/datatype/typerep/src/typerep_yaksa_pack.c @@ -66,8 +66,7 @@ */ #define IS_HOST(attr) \ - ((attr).type == MPL_GPU_POINTER_UNREGISTERED_HOST || \ - (attr).type == MPL_GPU_POINTER_REGISTERED_HOST) + ((attr).type & (MPL_GPU_POINTER_UNREGISTERED_HOST | MPL_GPU_POINTER_REGISTERED_HOST)) /* When a returned typerep_req is expected, using the nonblocking yaksa routine and * return the request; otherwise use the blocking yaksa routine. */ @@ -87,31 +86,35 @@ static int typerep_do_copy(void *outbuf, const void *inbuf, MPI_Aint num_bytes, goto fn_exit; } + if (flags & MPIR_TYPEREP_FLAG_H2H) { + if (flags & MPIR_TYPEREP_FLAG_STREAM) { + MPIR_Memcpy(outbuf, inbuf, num_bytes); + } else { + MPIR_Memcpy_stream(outbuf, inbuf, num_bytes); + } + } + MPL_pointer_attr_t inattr, outattr; MPIR_GPU_query_pointer_attr(inbuf, &inattr); MPIR_GPU_query_pointer_attr(outbuf, &outattr); - if (IS_HOST(inattr) && IS_HOST(outattr)) { - MPIR_Memcpy(outbuf, inbuf, num_bytes); + uintptr_t actual_pack_bytes; + + yaksa_info_t info = MPII_yaksa_get_info(&inattr, &outattr); + if (typerep_req) { + typerep_req->info = info; + rc = yaksa_ipack(inbuf, num_bytes, YAKSA_TYPE__BYTE, 0, outbuf, num_bytes, + &actual_pack_bytes, info, YAKSA_OP__REPLACE, + (yaksa_request_t *) & typerep_req->req); + MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); + MPIR_Assert(actual_pack_bytes == num_bytes); } else { - uintptr_t actual_pack_bytes; - - yaksa_info_t info = MPII_yaksa_get_info(&inattr, &outattr); - if (typerep_req) { - typerep_req->info = info; - rc = yaksa_ipack(inbuf, num_bytes, YAKSA_TYPE__BYTE, 0, outbuf, num_bytes, - &actual_pack_bytes, info, YAKSA_OP__REPLACE, - (yaksa_request_t *) & typerep_req->req); - MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); - MPIR_Assert(actual_pack_bytes == num_bytes); - } else { - rc = yaksa_pack(inbuf, num_bytes, YAKSA_TYPE__BYTE, 0, outbuf, num_bytes, - &actual_pack_bytes, info, YAKSA_OP__REPLACE); - MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); - MPIR_Assert(actual_pack_bytes == num_bytes); - rc = MPII_yaksa_free_info(info); - MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); - } + rc = yaksa_pack(inbuf, num_bytes, YAKSA_TYPE__BYTE, 0, outbuf, num_bytes, + &actual_pack_bytes, info, YAKSA_OP__REPLACE); + MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); + MPIR_Assert(actual_pack_bytes == num_bytes); + rc = MPII_yaksa_free_info(info); + MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); } fn_exit: @@ -213,6 +216,94 @@ static int typerep_do_pack(const void *inbuf, MPI_Aint incount, MPI_Datatype dat goto fn_exit; } +static int typerep_do_pack_h2h(const void *inbuf, MPI_Aint incount, MPI_Datatype datatype, + MPI_Aint inoffset, void *outbuf, MPI_Aint max_pack_bytes, + MPI_Aint * actual_pack_bytes, MPIR_Typerep_req * typerep_req, + uint32_t flags) +{ + MPIR_FUNC_ENTER; + + int mpi_errno = MPI_SUCCESS; + int rc; + + if (typerep_req) { + typerep_req->req = MPIR_TYPEREP_REQ_NULL; + } + + if (incount == 0) { + *actual_pack_bytes = 0; + goto fn_exit; + } + + MPIR_Assert(datatype != MPI_DATATYPE_NULL); + + int is_contig = 0; + int element_size = -1; + const void *inbuf_ptr; /* adjusted by true_lb */ + MPI_Aint total_size = 0; + if (HANDLE_IS_BUILTIN(datatype)) { + is_contig = 1; + element_size = MPIR_Datatype_get_basic_size(datatype); + inbuf_ptr = inbuf; + total_size = incount * element_size; + } else { + MPIR_Datatype *dtp; + MPIR_Datatype_get_ptr(datatype, dtp); + is_contig = dtp->is_contig; + element_size = dtp->builtin_element_size; + inbuf_ptr = MPIR_get_contig_ptr(inbuf, dtp->true_lb); + total_size = incount * dtp->size; + } + + /* only query the pointer attributes in case of relative addressing */ + // bool rel_addressing = (inbuf != MPI_BOTTOM); + // if (rel_addressing) { + // MPIR_GPU_query_pointer_attr(inbuf_ptr, &inattr); + // MPIR_GPU_query_pointer_attr(outbuf, &outattr); + // } + + if (is_contig) { + MPI_Aint real_bytes = MPL_MIN(total_size - inoffset, max_pack_bytes); + /* Make sure we never pack partial element */ + real_bytes -= real_bytes % element_size; + if (flags & MPIR_TYPEREP_FLAG_STREAM) { + MPIR_Memcpy_stream(outbuf, MPIR_get_contig_ptr(inbuf_ptr, inoffset), real_bytes); + } else { + MPIR_Memcpy(outbuf, MPIR_get_contig_ptr(inbuf_ptr, inoffset), real_bytes); + } + *actual_pack_bytes = real_bytes; + goto fn_exit; + } + + yaksa_type_t type = MPII_Typerep_get_yaksa_type(datatype); + yaksa_info_t info = MPII_yaksa_info_nogpu; + + uintptr_t real_pack_bytes; + if (typerep_req) { + typerep_req->info = info; + rc = yaksa_ipack(inbuf, incount, type, inoffset, outbuf, max_pack_bytes, + &real_pack_bytes, info, YAKSA_OP__REPLACE, + (yaksa_request_t *) & typerep_req->req); + MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); + } else { + rc = yaksa_pack(inbuf, incount, type, inoffset, outbuf, max_pack_bytes, + &real_pack_bytes, info, YAKSA_OP__REPLACE); + MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); + if (info) { + rc = MPII_yaksa_free_info(info); + MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); + } + } + + *actual_pack_bytes = (MPI_Aint) real_pack_bytes; + + fn_exit: + MPIR_FUNC_EXIT; + return mpi_errno; + fn_fail: + goto fn_exit; +} + /* This function checks whether the operation is supported in yaksa for the * provided datatype */ int MPIR_Typerep_reduce_is_supported(MPI_Op op, MPI_Aint count, MPI_Datatype datatype) @@ -372,6 +463,89 @@ static int typerep_do_unpack(const void *inbuf, MPI_Aint insize, void *outbuf, M goto fn_exit; } +static int typerep_do_unpack_h2h(const void *inbuf, MPI_Aint insize, void *outbuf, + MPI_Aint outcount, MPI_Datatype datatype, MPI_Aint outoffset, + MPI_Aint * actual_unpack_bytes, MPIR_Typerep_req * typerep_req, + uint32_t flags) +{ + MPIR_FUNC_ENTER; + + int mpi_errno = MPI_SUCCESS; + int rc; + + if (typerep_req) { + typerep_req->req = MPIR_TYPEREP_REQ_NULL; + } + + if (insize == 0) { + *actual_unpack_bytes = 0; + goto fn_exit; + } + + MPIR_Assert(datatype != MPI_DATATYPE_NULL); + + int is_contig = 0; + int element_size = -1; + const void *outbuf_ptr; /* adjusted by true_lb */ + MPI_Aint total_size = 0; + if (HANDLE_IS_BUILTIN(datatype)) { + is_contig = 1; + element_size = MPIR_Datatype_get_basic_size(datatype); + outbuf_ptr = outbuf; + total_size = outcount * element_size; + } else { + MPIR_Datatype *dtp; + MPIR_Datatype_get_ptr(datatype, dtp); + is_contig = dtp->is_contig; + element_size = dtp->builtin_element_size; + outbuf_ptr = MPIR_get_contig_ptr(outbuf, dtp->true_lb); + total_size = outcount * dtp->size; + } + + if (is_contig) { + *actual_unpack_bytes = MPL_MIN(total_size - outoffset, insize); + /* We assume the amount we unpack is multiple of element_size */ + MPIR_Assert(element_size < 0 || *actual_unpack_bytes % element_size == 0); + if (flags & MPIR_TYPEREP_FLAG_STREAM) { + MPIR_Memcpy_stream(MPIR_get_contig_ptr(outbuf_ptr, outoffset), inbuf, + *actual_unpack_bytes); + } else { + MPIR_Memcpy(MPIR_get_contig_ptr(outbuf_ptr, outoffset), inbuf, *actual_unpack_bytes); + } + goto fn_exit; + } + + yaksa_type_t type = MPII_Typerep_get_yaksa_type(datatype); + yaksa_info_t info = (outbuf != MPI_BOTTOM) ? MPII_yaksa_info_nogpu : NULL; + + uintptr_t real_insize = MPL_MIN(total_size - outoffset, insize); + + uintptr_t real_unpack_bytes; + if (typerep_req) { + typerep_req->info = info; + rc = yaksa_iunpack(inbuf, real_insize, outbuf, outcount, type, outoffset, + &real_unpack_bytes, info, YAKSA_OP__REPLACE, + (yaksa_request_t *) & typerep_req->req); + MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); + } else { + rc = yaksa_unpack(inbuf, real_insize, outbuf, outcount, type, outoffset, &real_unpack_bytes, + info, YAKSA_OP__REPLACE); + MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); + if (info) { + rc = MPII_yaksa_free_info(info); + MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_INTERN, "**yaksa"); + } + } + + *actual_unpack_bytes = (MPI_Aint) real_unpack_bytes; + + fn_exit: + MPIR_FUNC_EXIT; + return mpi_errno; + fn_fail: + goto fn_exit; +} + int MPIR_Typerep_icopy(void *outbuf, const void *inbuf, MPI_Aint num_bytes, MPIR_Typerep_req * typerep_req, uint32_t flags) { @@ -402,8 +576,13 @@ int MPIR_Typerep_ipack(const void *inbuf, MPI_Aint incount, MPI_Datatype datatyp MPIR_FUNC_ENTER; int mpi_errno = MPI_SUCCESS; - mpi_errno = typerep_do_pack(inbuf, incount, datatype, inoffset, outbuf, max_pack_bytes, - actual_pack_bytes, typerep_req, flags); + if (flags & MPIR_TYPEREP_FLAG_H2H) { + mpi_errno = typerep_do_pack_h2h(inbuf, incount, datatype, inoffset, outbuf, max_pack_bytes, + actual_pack_bytes, typerep_req, flags); + } else { + mpi_errno = typerep_do_pack(inbuf, incount, datatype, inoffset, outbuf, max_pack_bytes, + actual_pack_bytes, typerep_req, flags); + } MPIR_FUNC_EXIT; return mpi_errno; @@ -416,8 +595,13 @@ int MPIR_Typerep_pack(const void *inbuf, MPI_Aint incount, MPI_Datatype datatype MPIR_FUNC_ENTER; int mpi_errno = MPI_SUCCESS; - mpi_errno = typerep_do_pack(inbuf, incount, datatype, inoffset, outbuf, max_pack_bytes, - actual_pack_bytes, NULL, flags); + if (flags & MPIR_TYPEREP_FLAG_H2H) { + mpi_errno = typerep_do_pack_h2h(inbuf, incount, datatype, inoffset, outbuf, max_pack_bytes, + actual_pack_bytes, NULL, flags); + } else { + mpi_errno = typerep_do_pack(inbuf, incount, datatype, inoffset, outbuf, max_pack_bytes, + actual_pack_bytes, NULL, flags); + } MPIR_FUNC_EXIT; return mpi_errno; @@ -430,8 +614,13 @@ int MPIR_Typerep_iunpack(const void *inbuf, MPI_Aint insize, void *outbuf, MPI_A MPIR_FUNC_ENTER; int mpi_errno = MPI_SUCCESS; - mpi_errno = typerep_do_unpack(inbuf, insize, outbuf, outcount, datatype, outoffset, - actual_unpack_bytes, typerep_req, flags); + if (flags & MPIR_TYPEREP_FLAG_H2H) { + mpi_errno = typerep_do_unpack_h2h(inbuf, insize, outbuf, outcount, datatype, outoffset, + actual_unpack_bytes, typerep_req, flags); + } else { + mpi_errno = typerep_do_unpack(inbuf, insize, outbuf, outcount, datatype, outoffset, + actual_unpack_bytes, typerep_req, flags); + } MPIR_FUNC_EXIT; return mpi_errno; @@ -444,8 +633,13 @@ int MPIR_Typerep_unpack(const void *inbuf, MPI_Aint insize, void *outbuf, MPI_Ai MPIR_FUNC_ENTER; int mpi_errno = MPI_SUCCESS; - mpi_errno = typerep_do_unpack(inbuf, insize, outbuf, outcount, datatype, outoffset, - actual_unpack_bytes, NULL, flags); + if (flags & MPIR_TYPEREP_FLAG_H2H) { + mpi_errno = typerep_do_unpack_h2h(inbuf, insize, outbuf, outcount, datatype, outoffset, + actual_unpack_bytes, NULL, flags); + } else { + mpi_errno = typerep_do_unpack(inbuf, insize, outbuf, outcount, datatype, outoffset, + actual_unpack_bytes, NULL, flags); + } MPIR_FUNC_EXIT; return mpi_errno; diff --git a/src/mpid/ch4/include/mpidpre.h b/src/mpid/ch4/include/mpidpre.h index 6fc6bb6ec0b..a5c8526000d 100644 --- a/src/mpid/ch4/include/mpidpre.h +++ b/src/mpid/ch4/include/mpidpre.h @@ -164,6 +164,7 @@ typedef struct MPIDIG_req_async { struct iovec iov_one; /* used with MPIDIG_RECV_CONTIG */ MPIDIG_recv_data_copy_cb data_copy_cb; /* called in recv_init/recv_type_init for async * data copying */ + int typerep_flags; } MPIDIG_rreq_async_t; typedef struct MPIDIG_sreq_async { @@ -209,6 +210,7 @@ typedef struct MPIDIG_req_t { void *buffer; MPI_Aint count; MPI_Datatype datatype; + MPL_pointer_attr_t buf_attr; union { struct { int dest; diff --git a/src/mpid/ch4/shm/posix/eager/iqueue/iqueue_send.h b/src/mpid/ch4/shm/posix/eager/iqueue/iqueue_send.h index 0ec3931cd93..8193a00b7a6 100644 --- a/src/mpid/ch4/shm/posix/eager/iqueue/iqueue_send.h +++ b/src/mpid/ch4/shm/posix/eager/iqueue/iqueue_send.h @@ -95,9 +95,11 @@ MPIDI_POSIX_eager_send(int grank, MPIDI_POSIX_am_header_t * msg_hdr, const void cell->type = MPIDI_POSIX_EAGER_IQUEUE_CELL_TYPE_HDR; /* send am_hdr if this is the first segment */ if (is_topo_local) { - MPIR_Typerep_copy(payload, am_hdr, am_hdr_sz, MPIR_TYPEREP_FLAG_NONE); + MPIR_Typerep_copy(payload, am_hdr, am_hdr_sz, + MPIR_TYPEREP_FLAG_H2H | MPIR_TYPEREP_FLAG_NONE); } else { - MPIR_Typerep_copy(payload, am_hdr, am_hdr_sz, MPIR_TYPEREP_FLAG_STREAM); + MPIR_Typerep_copy(payload, am_hdr, am_hdr_sz, + MPIR_TYPEREP_FLAG_H2H | MPIR_TYPEREP_FLAG_STREAM); } /* make sure the data region starts at the boundary of MAX_ALIGNMENT */ payload = payload + resized_am_hdr_sz; @@ -114,13 +116,16 @@ MPIDI_POSIX_eager_send(int grank, MPIDI_POSIX_am_header_t * msg_hdr, const void * not reliable because the derived datatype could have zero block size which contains no * data. */ if (bytes_sent) { - if (is_topo_local) { - MPIR_Typerep_pack(buf, count, datatype, offset, payload, available, &packed_size, - MPIR_TYPEREP_FLAG_NONE); - } else { - MPIR_Typerep_pack(buf, count, datatype, offset, payload, available, &packed_size, - MPIR_TYPEREP_FLAG_STREAM); + int typerep_flags = MPIR_TYPEREP_NONE; + if (msg_hdr == MPIDI_POSIX_AM_TYPE__SHORT_HOST + || msg_hdr == MPIDI_POSIX_AM_TYPE__PIPELINE_HOST) { + typerep_flags |= MPIR_TYPEREP_FLAG_H2H; + } + if (!is_topo_local) { + typerep_flags |= MPIR_TYPEREP_FLAG_STREAM; } + MPIR_Typerep_pack(buf, count, datatype, offset, payload, available, &packed_size, + typerep_flags); cell->payload_size += packed_size; *bytes_sent = packed_size; } diff --git a/src/mpid/ch4/shm/posix/posix_am.h b/src/mpid/ch4/shm/posix/posix_am.h index 5ddc2a45ec1..2b29b3801cb 100644 --- a/src/mpid/ch4/shm/posix/posix_am.h +++ b/src/mpid/ch4/shm/posix/posix_am.h @@ -11,6 +11,10 @@ #include "posix_eager.h" #include "mpidu_genq.h" +#undef IS_HOST +#define IS_HOST(attr) \ + ((attr).type & (MPL_GPU_POINTER_UNREGISTERED_HOST | MPL_GPU_POINTER_REGISTERED_HOST)) + MPL_STATIC_INLINE_PREFIX MPI_Aint MPIDI_POSIX_am_eager_limit(void) { return MPIDI_POSIX_eager_payload_limit() - MAX_ALIGNMENT; @@ -284,9 +288,17 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_do_am_isend(int grank, msg_hdr_p = msg_hdr; if (data_sz + am_hdr_sz <= MPIDI_POSIX_am_eager_limit()) { - msg_hdr_p->am_type = MPIDI_POSIX_AM_TYPE__SHORT; + if (IS_HOST(MPIDIG_REQUEST(sreq, buf_attr))) { + msg_hdr_p->am_type = MPIDI_POSIX_AM_TYPE__SHORT_HOST; + } else { + msg_hdr_p->am_type = MPIDI_POSIX_AM_TYPE__SHORT; + } } else { - msg_hdr_p->am_type = MPIDI_POSIX_AM_TYPE__PIPELINE; + if (IS_HOST(MPIDIG_REQUEST(sreq, buf_attr))) { + msg_hdr_p->am_type = MPIDI_POSIX_AM_TYPE__PIPELINE_HOST; + } else { + msg_hdr_p->am_type = MPIDI_POSIX_AM_TYPE__PIPELINE; + } } MPIDIG_am_send_async_init(sreq, datatype, data_sz); diff --git a/src/mpid/ch4/shm/posix/posix_pre.h b/src/mpid/ch4/shm/posix/posix_pre.h index 1357eed5895..8e25e5be9af 100644 --- a/src/mpid/ch4/shm/posix/posix_pre.h +++ b/src/mpid/ch4/shm/posix/posix_pre.h @@ -22,7 +22,9 @@ typedef enum { typedef enum { MPIDI_POSIX_AM_TYPE__HDR, MPIDI_POSIX_AM_TYPE__SHORT, - MPIDI_POSIX_AM_TYPE__PIPELINE + MPIDI_POSIX_AM_TYPE__SHORT_HOST, + MPIDI_POSIX_AM_TYPE__PIPELINE, + MPIDI_POSIX_AM_TYPE__PIPELINE_HOST } MPIDI_POSIX_am_type_t; struct MPIR_Request; diff --git a/src/mpid/ch4/shm/posix/posix_progress.h b/src/mpid/ch4/shm/posix/posix_progress.h index f0096bb6e04..e65ccc70c7a 100644 --- a/src/mpid/ch4/shm/posix/posix_progress.h +++ b/src/mpid/ch4/shm/posix/posix_progress.h @@ -57,11 +57,13 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_progress_recv(int vci, int *made_progre switch (msg_hdr->am_type) { case MPIDI_POSIX_AM_TYPE__HDR: case MPIDI_POSIX_AM_TYPE__SHORT: + case MPIDI_POSIX_AM_TYPE__SHORT_HOST: MPIDIG_global.target_msg_cbs[msg_hdr->handler_id] (am_hdr, payload, payload_left, attr, NULL); MPIDI_POSIX_eager_recv_commit(&transaction); goto fn_exit; case MPIDI_POSIX_AM_TYPE__PIPELINE: + case MPIDI_POSIX_AM_TYPE__PIPELINE_HOST: MPIDIG_global.target_msg_cbs[msg_hdr->handler_id] (am_hdr, NULL, payload_left, attr | MPIDIG_AM_ATTR__IS_ASYNC, &rreq); diff --git a/src/mpid/ch4/shm/posix/posix_send.h b/src/mpid/ch4/shm/posix/posix_send.h index 0796f671568..6a7e390de01 100644 --- a/src/mpid/ch4/shm/posix/posix_send.h +++ b/src/mpid/ch4/shm/posix/posix_send.h @@ -16,6 +16,10 @@ #include "posix_impl.h" +#undef IS_HOST +#define IS_HOST(attr) \ + ((attr).type & (MPL_GPU_POINTER_UNREGISTERED_HOST | MPL_GPU_POINTER_REGISTERED_HOST)) + #define MPIDI_POSIX_SEND_VSIS(vci_src_, vci_dst_) \ do { \ MPIDI_EXPLICIT_VCIS(comm, attr, comm->rank, rank, vci_src_, vci_dst_); \ @@ -52,7 +56,13 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_POSIX_mpi_isend(const void *buf, MPI_Aint cou MPIDI_POSIX_am_header_t msg_hdr; msg_hdr.handler_id = MPIDIG_SEND; msg_hdr.am_hdr_sz = sizeof(MPIDIG_hdr_t); - msg_hdr.am_type = MPIDI_POSIX_AM_TYPE__SHORT; + MPL_pointer_attr_t attr; + MPIR_GPU_query_pointer_attr(buf, &attr); + if (IS_HOST(attr)) { + msg_hdr.am_type = MPIDI_POSIX_AM_TYPE__SHORT_HOST; + } else { + msg_hdr.am_type = MPIDI_POSIX_AM_TYPE__SHORT; + } MPIDIG_hdr_t am_hdr; am_hdr.src_rank = comm->rank; diff --git a/src/mpid/ch4/src/mpidig_recv.h b/src/mpid/ch4/src/mpidig_recv.h index 56c30112f9e..d992d13414b 100644 --- a/src/mpid/ch4/src/mpidig_recv.h +++ b/src/mpid/ch4/src/mpidig_recv.h @@ -151,6 +151,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDIG_handle_unexpected(void *buf, MPI_Aint count, MPIDIG_REQUEST(rreq, datatype) = datatype; MPIDIG_REQUEST(rreq, buffer) = buf; MPIDIG_REQUEST(rreq, count) = count; + MPIR_GPU_query_pointer_attr(buf, &MPIDIG_REQUEST(rreq, buf_attr)); MPIDIG_recv_type_init(unexp_data_sz, rreq); } } @@ -256,6 +257,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDIG_do_irecv(void *buf, MPI_Aint count, MPI_Data MPIDIG_REQUEST(unexp_req, buffer) = buf; MPIDIG_REQUEST(unexp_req, count) = count; MPIDIG_REQUEST(unexp_req, req->status) &= ~MPIDIG_REQ_UNEXPECTED; + MPIR_GPU_query_pointer_attr(buf, &MPIDIG_REQUEST(rreq, buf_attr)); /* MPIDIG_recv_type_init will call the callback to finish the rndv protocol */ mpi_errno = MPIDIG_recv_type_init(data_sz, unexp_req); } else { @@ -284,6 +286,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDIG_do_irecv(void *buf, MPI_Aint count, MPI_Data MPIR_Datatype_add_ref_if_not_builtin(datatype); MPIDIG_prepare_recv_req(rank, tag, context_id, buf, count, datatype, rreq); + MPIR_GPU_query_pointer_attr(buf, &MPIDIG_REQUEST(rreq, buf_attr)); MPIDIG_enqueue_request(rreq, &MPIDI_global.per_vci[vci].posted_list, MPIDIG_PT2PT_POSTED); } diff --git a/src/mpid/ch4/src/mpidig_recv_utils.h b/src/mpid/ch4/src/mpidig_recv_utils.h index c72029016bb..ade0071107b 100644 --- a/src/mpid/ch4/src/mpidig_recv_utils.h +++ b/src/mpid/ch4/src/mpidig_recv_utils.h @@ -219,6 +219,9 @@ MPL_STATIC_INLINE_PREFIX void MPIDIG_recv_copy(void *in_data, MPIR_Request * rre { MPIDIG_rreq_async_t *p = &(MPIDIG_REQUEST(rreq, req->recv_async)); MPI_Aint in_data_sz = p->in_data_sz; + int flags = (MPIDIG_REQUEST(rreq, buf_attr).type & + (MPL_GPU_POINTER_UNREGISTERED_HOST | MPL_GPU_POINTER_REGISTERED_HOST)) + ? MPIR_TYPEREP_FLAG_H2H : MPIR_TYPEREP_FLAG_NONE; if (in_data_sz == 0) { /* otherwise if recv size = 0, it is at least a truncation error */ MPIR_STATUS_SET_COUNT(rreq->status, 0); @@ -228,7 +231,7 @@ MPL_STATIC_INLINE_PREFIX void MPIDIG_recv_copy(void *in_data, MPIR_Request * rre MPIDIG_REQUEST(rreq, buffer), MPIDIG_REQUEST(rreq, count), MPIDIG_REQUEST(rreq, datatype), - 0, &actual_unpack_bytes, MPIR_TYPEREP_FLAG_NONE); + 0, &actual_unpack_bytes, flags & MPIR_TYPEREP_FLAG_NONE); if (!rreq->status.MPI_ERROR && in_data_sz > actual_unpack_bytes) { /* Truncation error has been checked at MPIDIG_recv_type_init. * If the receive buffer had enough space, but we still @@ -251,7 +254,7 @@ MPL_STATIC_INLINE_PREFIX void MPIDIG_recv_copy(void *in_data, MPIR_Request * rre } data_sz = MPL_MIN(data_sz, in_data_sz); - MPIR_Typerep_copy(data, in_data, data_sz, MPIR_TYPEREP_FLAG_NONE); + MPIR_Typerep_copy(data, in_data, data_sz, flags & MPIR_TYPEREP_FLAG_NONE); MPIR_STATUS_SET_COUNT(rreq->status, data_sz); } else { /* noncontig case */ @@ -263,7 +266,7 @@ MPL_STATIC_INLINE_PREFIX void MPIDIG_recv_copy(void *in_data, MPIR_Request * rre for (int i = 0; i < iov_len && rem > 0; i++) { int curr_len = MPL_MIN(rem, iov[i].iov_len); MPIR_Typerep_copy(iov[i].iov_base, (char *) in_data + done, curr_len, - MPIR_TYPEREP_FLAG_NONE); + flags & MPIR_TYPEREP_FLAG_NONE); rem -= curr_len; done += curr_len; } @@ -282,6 +285,10 @@ MPL_STATIC_INLINE_PREFIX void MPIDIG_recv_copy(void *in_data, MPIR_Request * rre MPL_STATIC_INLINE_PREFIX void MPIDIG_recv_setup(MPIR_Request * rreq) { MPIDIG_rreq_async_t *p = &(MPIDIG_REQUEST(rreq, req->recv_async)); + MPIDIG_REQUEST(rreq, req->recv_async).typerep_flags = + (MPIDIG_REQUEST(rreq, buf_attr).type & + (MPL_GPU_POINTER_UNREGISTERED_HOST | MPL_GPU_POINTER_REGISTERED_HOST)) ? + MPIR_TYPEREP_FLAG_H2H : MPIR_TYPEREP_FLAG_NONE; p->offset = 0; if (p->recv_type == MPIDIG_RECV_DATATYPE) { /* it's ready, rreq status to be set */ @@ -330,7 +337,8 @@ MPL_STATIC_INLINE_PREFIX int MPIDIG_recv_copy_seg(void *payload, MPI_Aint payloa MPIDIG_REQUEST(rreq, buffer), MPIDIG_REQUEST(rreq, count), MPIDIG_REQUEST(rreq, datatype), - p->offset, &actual_unpack_bytes, MPIR_TYPEREP_FLAG_NONE); + p->offset, &actual_unpack_bytes, + p->typerep_flags & MPIR_TYPEREP_FLAG_NONE); p->offset += payload_sz; if (payload_sz > actual_unpack_bytes) { /* basic element size mismatch */ @@ -353,7 +361,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDIG_recv_copy_seg(void *payload, MPI_Aint payloa for (int i = 0; i < p->iov_num; i++) { if (payload_sz < p->iov_ptr[i].iov_len) { MPIR_Typerep_copy(p->iov_ptr[i].iov_base, payload, payload_sz, - MPIR_TYPEREP_FLAG_NONE); + p->typerep_flags & MPIR_TYPEREP_FLAG_NONE); p->iov_ptr[i].iov_base = (char *) p->iov_ptr[i].iov_base + payload_sz; p->iov_ptr[i].iov_len -= payload_sz; /* not done */ @@ -361,7 +369,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDIG_recv_copy_seg(void *payload, MPI_Aint payloa } else { /* fill one iov */ MPIR_Typerep_copy(p->iov_ptr[i].iov_base, payload, p->iov_ptr[i].iov_len, - MPIR_TYPEREP_FLAG_NONE); + p->typerep_flags & MPIR_TYPEREP_FLAG_NONE); payload = (char *) payload + p->iov_ptr[i].iov_len; payload_sz -= p->iov_ptr[i].iov_len; iov_done++; diff --git a/src/mpid/ch4/src/mpidig_send_utils.h b/src/mpid/ch4/src/mpidig_send_utils.h index f950aecbeef..c8018d424b3 100644 --- a/src/mpid/ch4/src/mpidig_send_utils.h +++ b/src/mpid/ch4/src/mpidig_send_utils.h @@ -6,6 +6,10 @@ #ifndef MPIDIG_SEND_UTILS_H_INCLUDED #define MPIDIG_SEND_UTILS_H_INCLUDED +#undef IS_HOST +#define IS_HOST(attr) \ + ((attr).type & (MPL_GPU_POINTER_UNREGISTERED_HOST | MPL_GPU_POINTER_REGISTERED_HOST)) + /* This file is for supporting routines used for pipelined data send. These routines mainly is for * managing the send request counters, completion counters and DT refcount */ diff --git a/src/mpl/include/mpl_gpu.h b/src/mpl/include/mpl_gpu.h index 16b35ac8bf6..7d48e3e061d 100644 --- a/src/mpl/include/mpl_gpu.h +++ b/src/mpl/include/mpl_gpu.h @@ -23,10 +23,10 @@ #endif typedef enum { - MPL_GPU_POINTER_UNREGISTERED_HOST = 0, - MPL_GPU_POINTER_REGISTERED_HOST, - MPL_GPU_POINTER_DEV, - MPL_GPU_POINTER_MANAGED + MPL_GPU_POINTER_UNREGISTERED_HOST = 0x1, + MPL_GPU_POINTER_REGISTERED_HOST = 0x2, + MPL_GPU_POINTER_DEV = 0x4, + MPL_GPU_POINTER_MANAGED = 0x8 } MPL_pointer_type_t; typedef enum { @@ -61,7 +61,7 @@ typedef enum { MPL_GPU_COPY_H2D, MPL_GPU_COPY_D2D_INCOMING, /* copy from remote to local */ MPL_GPU_COPY_D2D_OUTGOING, /* copy from local to remote */ - MPL_GPU_COPY_DIRECTION_NONE, /* copy in any direction and to/from any buffer type */ + MPL_GPU_COPY_DIRECTION_NONE, /* copy in any direction and to/from any buffer type */ } MPL_gpu_copy_direction_t; #define MPL_GPU_COPY_DIRECTION_TYPES 4