Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ch4/ofi: refactor MPIDI_OFI_request_t #6895

Open
wants to merge 26 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
7ab6c80
misc: rename MPIR_gpu_req to MPIR_async_req
hzhou Feb 2, 2024
0cc53d6
misc: add MPIR_async_test
hzhou Jan 31, 2024
ac6aa4c
ch4/ipc: refactor gpu_ipc_async_poll to use MPIR_async_test
hzhou Jan 31, 2024
b510cad
ch4/ofi: refactor pipeline recv async copy
hzhou Jan 31, 2024
6727ffc
ch4/ofi: refactor pipeline send async copy
hzhou Jan 31, 2024
6486c72
ch4/ofi: remove MPIDI_OFI_gpu_progress_task
hzhou Jan 31, 2024
5b21c7c
ch4/ofi: refactor pipeline send
hzhou Jan 31, 2024
3519d1d
ch4/ofi: refactor pipeline recv
hzhou Feb 1, 2024
eb317db
ch4/ofi: move gpu pipeline events into ofi_gpu_pipeline.c
hzhou Feb 5, 2024
b910e10
ch4/ofi: move all gpu pipeline code into ofi_gpu_pipeline.c
hzhou Feb 5, 2024
c41c4be
ch4/ofi: refactor pipeline_info into a union
hzhou Feb 5, 2024
1c4a9f0
ch4/ofi: use explicit counters to track gpu pipeline
hzhou Feb 6, 2024
ffafce1
ch4/ofi: use internal tag for pipeline chunk match_bits
hzhou Feb 6, 2024
4cc74c6
ch4/ofi: refactor gpu pipeline recv_alloc
hzhou Feb 8, 2024
c24a4a9
ch4/ofi: include ofi_impl.h in ofi_gpu_pipeline.c
hzhou Feb 8, 2024
bdd903e
ch4/ofi: move some inline util functions
hzhou Feb 7, 2024
207c5d3
---- START HERE ----
hzhou Feb 6, 2024
fa962d7
ch4/ofi: re-organize MPIDI_OFI_request_t noncontig union
hzhou Feb 6, 2024
59dc0fb
ch4/ofi: merge pipeline_info in MPIDI_OFI_request_t
hzhou Feb 6, 2024
e8ab242
ch4/ofi: add huge_send in MPIDI_OFI_request_t union
hzhou Feb 6, 2024
a297352
ch4/ofi: move huge.remote_info in MPIDI_OFI_request_t
hzhou Feb 6, 2024
fe6b73c
ch4/ofi: move inject_buf to MPIDI_OFI_request_t.u
hzhou Feb 6, 2024
ccd0b27
ch4/ofi: move the util.iov into MPIDI_OFI_request_t.u
hzhou Feb 6, 2024
79ae1fe
ch4/ofi: refactor MPIDI_OFI_dispatch_function to a big switch
hzhou Feb 7, 2024
66bfcc6
ch4/ofi: assert for unexpected pipeline data
hzhou Feb 7, 2024
647751e
ch4/ofi: Remove a redundant assignment
hzhou Feb 7, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions dummy
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1
2 changes: 1 addition & 1 deletion src/include/mpiimpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,6 @@ typedef struct MPIR_Stream MPIR_Stream;
/******************* PART 3: DEVICE INDEPENDENT HEADERS **********************/
/*****************************************************************************/

#include "mpir_misc.h"
#include "mpir_dbg.h"
#include "mpir_objects.h"
#include "mpir_strerror.h"
Expand All @@ -166,6 +165,7 @@ typedef struct MPIR_Stream MPIR_Stream;
#include "mpir_mem.h"
#include "mpir_info.h"
#include "mpir_errcodes.h"
#include "mpir_misc.h"
#include "mpir_errhandler.h"
#include "mpir_attr_generic.h"
#include "mpir_contextid.h"
Expand Down
27 changes: 22 additions & 5 deletions src/include/mpir_misc.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,6 @@ extern MPL_initlock_t MPIR_init_lock;

#include "typerep_pre.h" /* needed for MPIR_Typerep_req */

/* FIXME: bad names. Not gpu-specific, confusing with MPIR_Request.
* It's a general async handle.
*/
typedef enum {
MPIR_NULL_REQUEST = 0,
MPIR_TYPEREP_REQUEST,
Expand All @@ -64,7 +61,27 @@ typedef struct {
MPL_gpu_request gpu_req;
} u;
MPIR_request_type_t type;
} MPIR_gpu_req;
} MPIR_async_req;

/* Poll an async handle once, without blocking.
 *
 * areq    - handle to poll; areq->type selects which member of areq->u is live.
 * is_done - output flag. Set to 1 directly for a NULL request; otherwise it is
 *           handed to the underlying test routine, which is expected to fill it in.
 */
MPL_STATIC_INLINE_PREFIX void MPIR_async_test(MPIR_async_req * areq, int *is_done)
{
    if (areq->type == MPIR_NULL_REQUEST) {
        /* placeholder request: nothing outstanding, report completion right away */
        *is_done = 1;
    } else if (areq->type == MPIR_TYPEREP_REQUEST) {
        /* NOTE(review): the int returned by MPIR_Typerep_test is discarded here,
         * unlike the GPU branch below - confirm that is intentional. */
        MPIR_Typerep_test(areq->u.y_req, is_done);
    } else if (areq->type == MPIR_GPU_REQUEST) {
        int mpl_rc = MPL_gpu_test(&areq->u.gpu_req, is_done);
        /* a failed GPU query is treated as fatal */
        MPIR_Assertp(mpl_rc == MPL_SUCCESS);
    } else {
        /* unknown request type */
        MPIR_Assert(0);
    }
}

int MPIR_Localcopy(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype sendtype,
void *recvbuf, MPI_Aint recvcount, MPI_Datatype recvtype);
Expand All @@ -82,7 +99,7 @@ int MPIR_Ilocalcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype se
MPI_Aint sendoffset, MPL_pointer_attr_t * sendattr, void *recvbuf,
MPI_Aint recvcount, MPI_Datatype recvtype, MPI_Aint recvoffset,
MPL_pointer_attr_t * recvattr, MPL_gpu_copy_direction_t dir,
MPL_gpu_engine_type_t enginetype, bool commit, MPIR_gpu_req * req);
MPL_gpu_engine_type_t enginetype, bool commit, MPIR_async_req * req);

/* Contiguous datatype calculates buffer address with `(char *) buf + dt_true_lb`.
* However, dt_true_lb is treated as ptrdiff_t (signed), and when buf is MPI_BOTTOM
Expand Down
2 changes: 0 additions & 2 deletions src/include/mpir_typerep.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,6 @@ int MPIR_Typerep_ipack(const void *inbuf, MPI_Aint incount, MPI_Datatype datatyp
int MPIR_Typerep_iunpack(const void *inbuf, MPI_Aint insize, void *outbuf, MPI_Aint outcount,
MPI_Datatype datatype, MPI_Aint outoffset, MPI_Aint * actual_unpack_bytes,
MPIR_Typerep_req * typerep_req, uint32_t flags);
int MPIR_Typerep_wait(MPIR_Typerep_req typerep_req);
int MPIR_Typerep_test(MPIR_Typerep_req typerep_req, int *completed);

int MPIR_Typerep_size_external32(MPI_Datatype type);
int MPIR_Typerep_pack_external(const void *inbuf, MPI_Aint incount, MPI_Datatype datatype,
Expand Down
3 changes: 3 additions & 0 deletions src/mpi/datatype/typerep/src/typerep_pre.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,7 @@ typedef struct {
#define MPIR_TYPEREP_HANDLE_NULL NULL
#endif

int MPIR_Typerep_wait(MPIR_Typerep_req typerep_req);
int MPIR_Typerep_test(MPIR_Typerep_req typerep_req, int *completed);

#endif /* TYPEREP_PRE_H_INCLUDED */
31 changes: 16 additions & 15 deletions src/mpi/misc/utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,8 @@ static int do_localcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatyp
MPI_Aint sendoffset, MPL_pointer_attr_t * send_attr, void *recvbuf,
MPI_Aint recvcount, MPI_Datatype recvtype, MPI_Aint recvoffset,
MPL_pointer_attr_t * recv_attr, MPL_gpu_copy_direction_t dir,
MPL_gpu_engine_type_t enginetype, bool commit, MPIR_gpu_req * gpu_req)
MPL_gpu_engine_type_t enginetype, bool commit,
MPIR_async_req * async_req)
{
int mpi_errno = MPI_SUCCESS;
int mpl_errno = MPL_SUCCESS;
Expand All @@ -200,8 +201,8 @@ static int do_localcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatyp

MPIR_FUNC_ENTER;

if (gpu_req)
gpu_req->type = MPIR_NULL_REQUEST;
if (async_req)
async_req->type = MPIR_NULL_REQUEST;

MPIR_Datatype_get_size_macro(sendtype, sendsize);
MPIR_Datatype_get_size_macro(recvtype, recvsize);
Expand Down Expand Up @@ -260,7 +261,7 @@ static int do_localcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatyp
MPIR_ERR_CHKANDJUMP(dev_id == -1, mpi_errno, MPI_ERR_OTHER,
"**mpl_gpu_get_dev_id_from_attr");

if (gpu_req == NULL) {
if (async_req == NULL) {
MPL_gpu_request req;
mpl_errno =
MPL_gpu_imemcpy((char *) MPIR_get_contig_ptr(recvbuf, recvtype_true_lb) +
Expand All @@ -281,8 +282,8 @@ static int do_localcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatyp
recvoffset, (char *) MPIR_get_contig_ptr(sendbuf,
sendtype_true_lb) +
sendoffset, copy_sz, dev_id, dir, enginetype,
&gpu_req->u.gpu_req, commit);
gpu_req->type = MPIR_GPU_REQUEST;
&async_req->u.gpu_req, commit);
async_req->type = MPIR_GPU_REQUEST;
}
}
#else
Expand All @@ -300,15 +301,15 @@ static int do_localcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatyp
fn_fail:
goto fn_exit;
fn_fallback:
if (gpu_req) {
if (async_req) {
mpi_errno =
do_localcopy(sendbuf, sendcount, sendtype, sendoffset, recvbuf, recvcount, recvtype,
recvoffset, LOCALCOPY_NONBLOCKING, &gpu_req->u.y_req);
recvoffset, LOCALCOPY_NONBLOCKING, &async_req->u.y_req);
MPIR_ERR_CHECK(mpi_errno);
if (gpu_req->u.y_req.req == MPIR_TYPEREP_REQ_NULL) {
gpu_req->type = MPIR_NULL_REQUEST;
if (async_req->u.y_req.req == MPIR_TYPEREP_REQ_NULL) {
async_req->type = MPIR_NULL_REQUEST;
} else {
gpu_req->type = MPIR_TYPEREP_REQUEST;
async_req->type = MPIR_TYPEREP_REQUEST;
}
} else {
mpi_errno =
Expand Down Expand Up @@ -414,7 +415,7 @@ int MPIR_Ilocalcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype se
MPI_Aint sendoffset, MPL_pointer_attr_t * sendattr, void *recvbuf,
MPI_Aint recvcount, MPI_Datatype recvtype, MPI_Aint recvoffset,
MPL_pointer_attr_t * recvattr, MPL_gpu_copy_direction_t dir,
MPL_gpu_engine_type_t enginetype, bool commit, MPIR_gpu_req * req)
MPL_gpu_engine_type_t enginetype, bool commit, MPIR_async_req * async_req)
{
int mpi_errno = MPI_SUCCESS;

Expand All @@ -423,14 +424,14 @@ int MPIR_Ilocalcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype se
#ifdef MPL_HAVE_GPU
mpi_errno =
do_localcopy_gpu(sendbuf, sendcount, sendtype, sendoffset, sendattr, recvbuf, recvcount,
recvtype, recvoffset, recvattr, dir, enginetype, commit, req);
recvtype, recvoffset, recvattr, dir, enginetype, commit, async_req);
MPIR_ERR_CHECK(mpi_errno);
#else
mpi_errno =
do_localcopy(sendbuf, sendcount, sendtype, sendoffset, recvbuf, recvcount, recvtype,
recvoffset, LOCALCOPY_NONBLOCKING, &req->u.y_req);
recvoffset, LOCALCOPY_NONBLOCKING, &async_req->u.y_req);
MPIR_ERR_CHECK(mpi_errno);
req->type = MPIR_TYPEREP_REQUEST;
async_req->type = MPIR_TYPEREP_REQUEST;
#endif

fn_exit:
Expand Down
1 change: 1 addition & 0 deletions src/mpid/ch4/netmod/ofi/Makefile.mk
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ mpi_core_sources += src/mpid/ch4/netmod/ofi/func_table.c \
src/mpid/ch4/netmod/ofi/ofi_progress.c \
src/mpid/ch4/netmod/ofi/ofi_am_events.c \
src/mpid/ch4/netmod/ofi/ofi_nic.c \
src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c \
src/mpid/ch4/netmod/ofi/globals.c \
src/mpid/ch4/netmod/ofi/init_provider.c \
src/mpid/ch4/netmod/ofi/init_settings.c \
Expand Down
4 changes: 2 additions & 2 deletions src/mpid/ch4/netmod/ofi/ofi_am_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_progress_do_queue(int vci_idx);
* The seq needs to be tracked between local (rank, vci) and remote (rank, vci).
* We don't need local rank since it is implicit on each process.
*
* LOCAL_ID is send to remote precess to identify self.
* LOCAL_ID is sent to the remote process to identify self.
* REMOTE_ID is used locally to track remote process.
* I realize the confusing part of the naming.
*
Expand Down Expand Up @@ -568,7 +568,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_emulated_inject(MPIR_Comm * comm, fi_a
memcpy(ibuf + sizeof(*msg_hdrp), am_hdr, am_hdr_sz);

MPIDI_OFI_REQUEST(sreq, event_id) = MPIDI_OFI_EVENT_INJECT_EMU;
MPIDI_OFI_REQUEST(sreq, util.inject_buf) = ibuf;
MPIDI_OFI_REQUEST(sreq, u.am_inject_emu.inject_buf) = ibuf;
MPIDI_OFI_global.per_vci[vci_src].am_inflight_inject_emus += 1;

MPIDI_OFI_CALL_RETRY_AM(fi_send(MPIDI_OFI_global.ctx[ctx_idx].tx, ibuf, len,
Expand Down
3 changes: 3 additions & 0 deletions src/mpid/ch4/netmod/ofi/ofi_comm.c
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,9 @@ int MPIDI_OFI_mpi_comm_commit_pre_hook(MPIR_Comm * comm)
MPIDI_OFI_COMM(comm).enable_hashing = 0;
MPIDI_OFI_COMM(comm).pref_nic = NULL;

/* Initialize tag for gpu_pipeline chunks; incremented by sender. */
MPIDI_OFI_COMM(comm).pipeline_tag = 0;

if (comm->hints[MPIR_COMM_HINT_ENABLE_MULTI_NIC_STRIPING] == -1) {
comm->hints[MPIR_COMM_HINT_ENABLE_MULTI_NIC_STRIPING] =
MPIR_CVAR_CH4_OFI_ENABLE_MULTI_NIC_STRIPING;
Expand Down
Loading