From 44e14624b1d4ed3961234f87c94b800517a2092e Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 18 Aug 2023 16:00:34 +0800 Subject: [PATCH 01/28] Added win create library node and its test --- dace/libraries/mpi/nodes/__init__.py | 1 + dace/libraries/mpi/nodes/win_create.py | 82 ++++++++++++++++++++++++++ tests/library/mpi/win_create_test.py | 76 ++++++++++++++++++++++++ 3 files changed, 159 insertions(+) create mode 100644 dace/libraries/mpi/nodes/win_create.py create mode 100644 tests/library/mpi/win_create_test.py diff --git a/dace/libraries/mpi/nodes/__init__.py b/dace/libraries/mpi/nodes/__init__.py index 0cd36cc82f..3a0b2e3348 100644 --- a/dace/libraries/mpi/nodes/__init__.py +++ b/dace/libraries/mpi/nodes/__init__.py @@ -13,3 +13,4 @@ from .alltoall import Alltoall from .dummy import Dummy from .redistribute import Redistribute +from .win_create import Win_create diff --git a/dace/libraries/mpi/nodes/win_create.py b/dace/libraries/mpi/nodes/win_create.py new file mode 100644 index 0000000000..5d1bff89c6 --- /dev/null +++ b/dace/libraries/mpi/nodes/win_create.py @@ -0,0 +1,82 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library +import dace.properties +import dace.sdfg.nodes +from dace.transformation.transformation import ExpandTransformation +from .. import environments +from dace.libraries.mpi.nodes.node import MPINode + + +@dace.library.expansion +class ExpandWinCreateMPI(ExpandTransformation): + + environments = [environments.mpi.MPI] + + + @staticmethod + def expansion(node, parent_state, parent_sdfg, **kwargs): + win_buffer, win_buf_count_str = node.validate(parent_sdfg, parent_state) + win_buffer_dtype = dace.libraries.mpi.utils.MPI_DDT(win_buffer.dtype.base_type) + window_name = node.name + + node.fields = [ + f"MPI_Win {window_name}_window;" + ] + + comm = "MPI_COMM_WORLD" + if node.grid: + comm = f"__state->{node.grid}_comm" + + code = f""" + MPI_Win_create(_win_buffer, + {win_buf_count_str}, + sizeof({win_buffer_dtype}), + MPI_INFO_NULL, + {comm}, + &__state->{window_name}_window); + """ + + tasklet = dace.sdfg.nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + state_fields=node.fields, + language=dace.dtypes.Language.CPP, + side_effects=True) + + return tasklet + + +@dace.library.node +class Win_create(MPINode): + + # Global properties + implementations = { + "MPI": ExpandWinCreateMPI, + } + default_implementation = "MPI" + + grid = dace.properties.Property(dtype=str, allow_none=True, default=None) + + def __init__(self, name, grid=None, *args, **kwargs): + super().__init__(name, *args, inputs={"_win_buffer"}, outputs={"_out"}, **kwargs) + self.grid = grid + + def validate(self, sdfg, state): + """ + :return: A three-tuple (buffer, root) of the three data descriptors in the + parent SDFG. + """ + + win_buffer = None + for e in state.in_edges(self): + if e.dst_conn == "_win_buffer": + win_buffer = sdfg.arrays[e.data.data] + + win_buf_count_str = "XXX" + for _, _, _, dst_conn, data in state.in_edges(self): + if dst_conn == '_win_buffer': + dims = [str(e) for e in data.subset.size_exact()] + win_buf_count_str = "*".join(dims) + + return win_buffer, win_buf_count_str diff --git a/tests/library/mpi/win_create_test.py b/tests/library/mpi/win_create_test.py new file mode 100644 index 0000000000..f5f8b58f78 --- /dev/null +++ b/tests/library/mpi/win_create_test.py @@ -0,0 +1,76 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
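+#
+# Builds an SDFG with a single Win_create library node: the window buffer is
+# wired to the node's "_win_buffer" connector, and a transient scalar written
+# through "_out" stands in for the created window so that later RMA nodes can
+# depend on it.  The MPI expansion emits, roughly,
+#     MPI_Win_create(buf, count, sizeof(dtype), MPI_INFO_NULL, comm, &window);
+# The test requires at least two ranks and is typically launched through MPI,
+# for example:
+#     mpirun -np 2 python -m pytest -m mpi tests/library/mpi/win_create_test.py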
+import dace +from dace.sdfg import utils +import dace.dtypes as dtypes +from dace.memlet import Memlet +import dace.libraries.mpi as mpi +import dace.frontend.common.distr as comm +import numpy as np +import pytest + + +############################################################################### + + +def make_sdfg(dtype): + n = dace.symbol("n") + + sdfg = dace.SDFG("mpi_win_create") + state = sdfg.add_state("start") + + sdfg.add_array("win_buffer", [n], dtype=dtype, transient=False) + win_buffer = state.add_access("win_buffer") + + window_name = sdfg.add_window() + win_create_node = mpi.nodes.win_create.Win_create(window_name) + + state.add_edge(win_buffer, + None, + win_create_node, + '_win_buffer', + Memlet.simple(win_buffer, "0:n", num_accesses=n)) + + # for other nodes depends this window to connect + _, scal = sdfg.add_scalar(window_name, dace.int32, transient=True) + wnode = state.add_write(window_name) + state.add_edge(win_create_node, + "_out", + wnode, + None, + Memlet.from_array(window_name, scal)) + + return sdfg + + +############################################################################### + + +@pytest.mark.parametrize("implementation, dtype", [ + pytest.param("MPI", dace.float32, marks=pytest.mark.mpi), + pytest.param("MPI", dace.int32, marks=pytest.mark.mpi) +]) +def test_win_create(dtype): + from mpi4py import MPI + np_dtype = getattr(np, dtype.to_string()) + comm_world = MPI.COMM_WORLD + comm_rank = comm_world.Get_rank() + comm_size = comm_world.Get_size() + + if comm_size < 2: + raise ValueError("This test is supposed to be run with at least two processes!") + + sdfg = make_sdfg(dtype) + mpi_func = utils.distributed_compile(sdfg, comm_world) + + window_size = 10 + win_buffer = np.arange(0, window_size, dtype=np_dtype) + + mpi_func(win_buffer=win_buffer, n=window_size) + + +############################################################################### + + +if __name__ == "__main__": + test_win_create(dace.float32) + test_win_create(dace.int32) From 4ff33c2ba86ae1aa8b3c776c670c4b35d064c94d Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 18 Aug 2023 16:01:21 +0800 Subject: [PATCH 02/28] Added MPI RMA fence library node and its test --- dace/libraries/mpi/nodes/__init__.py | 1 + dace/libraries/mpi/nodes/win_fence.py | 43 ++++++++++ tests/library/mpi/win_fence_test.py | 108 ++++++++++++++++++++++++++ 3 files changed, 152 insertions(+) create mode 100644 dace/libraries/mpi/nodes/win_fence.py create mode 100644 tests/library/mpi/win_fence_test.py diff --git a/dace/libraries/mpi/nodes/__init__.py b/dace/libraries/mpi/nodes/__init__.py index 3a0b2e3348..53097461d6 100644 --- a/dace/libraries/mpi/nodes/__init__.py +++ b/dace/libraries/mpi/nodes/__init__.py @@ -14,3 +14,4 @@ from .dummy import Dummy from .redistribute import Redistribute from .win_create import Win_create +from .win_fence import Win_fence diff --git a/dace/libraries/mpi/nodes/win_fence.py b/dace/libraries/mpi/nodes/win_fence.py new file mode 100644 index 0000000000..ae2d0a0dda --- /dev/null +++ b/dace/libraries/mpi/nodes/win_fence.py @@ -0,0 +1,43 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library +import dace.properties +import dace.sdfg.nodes +from dace.transformation.transformation import ExpandTransformation +from .. 
import environments +from dace.libraries.mpi.nodes.node import MPINode + + +@dace.library.expansion +class ExpandWinFenceMPI(ExpandTransformation): + + environments = [environments.mpi.MPI] + + @staticmethod + def expansion(node, parent_state, parent_sdfg, **kwargs): + window_name = node.window_name + code = f""" + MPI_Win_fence(_assertion, __state->{window_name}_window); + """ + tasklet = dace.sdfg.nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + language=dace.dtypes.Language.CPP, + side_effects=True) + return tasklet + + +@dace.library.node +class Win_fence(MPINode): + + # Global properties + implementations = { + "MPI": ExpandWinFenceMPI, + } + default_implementation = "MPI" + + window_name = dace.properties.Property(dtype=str, default=None) + + def __init__(self, name, window_name, *args, **kwargs): + super().__init__(name, *args, inputs={"_assertion"}, outputs={"_out"}, **kwargs) + self.window_name = window_name diff --git a/tests/library/mpi/win_fence_test.py b/tests/library/mpi/win_fence_test.py new file mode 100644 index 0000000000..348945527e --- /dev/null +++ b/tests/library/mpi/win_fence_test.py @@ -0,0 +1,108 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace +from dace.sdfg import utils +import dace.dtypes as dtypes +from dace.memlet import Memlet +import dace.libraries.mpi as mpi +import dace.frontend.common.distr as comm +import numpy as np +import pytest + + +############################################################################### + + +def make_sdfg(dtype): + n = dace.symbol("n") + + sdfg = dace.SDFG("mpi_win_fence") + window_state = sdfg.add_state("create_window") + + sdfg.add_array("win_buffer", [n], dtype=dtype, transient=False) + win_buffer = window_state.add_access("win_buffer") + + window_name = sdfg.add_window() + win_create_node = mpi.nodes.win_create.Win_create(window_name) + + window_state.add_edge(win_buffer, + None, + win_create_node, + '_win_buffer', + Memlet.simple(win_buffer, "0:n", num_accesses=n)) + + # for other nodes depends this window to connect + _, scal = sdfg.add_scalar(window_name, dace.int32, transient=True) + wnode = window_state.add_write(window_name) + window_state.add_edge(win_create_node, + "_out", + wnode, + None, + Memlet.from_array(window_name, scal)) + +############################################################################### + + fence_state = sdfg.add_state("win_fence") + + sdfg.add_edge(window_state, fence_state, dace.InterstateEdge()) + + fence_name = sdfg.add_fence() + win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) + + # pseudo access for ordering + window_node = fence_state.add_access(window_name) + window_desc = sdfg.arrays[window_name] + + fence_state.add_edge(window_node, + None, + win_fence_node, + None, + Memlet.from_array(window_name, window_desc)) + + sdfg.add_array("assertion", [1], dtype=dace.int32, transient=False) + assertion_node = fence_state.add_access("assertion") + + fence_state.add_edge(assertion_node, + None, + win_fence_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(fence_name, dace.int32, transient=True) + wnode = fence_state.add_write(fence_name) + fence_state.add_edge(win_fence_node, + "_out", + wnode, + None, + Memlet.from_array(fence_name, scal)) + + return sdfg + + +############################################################################### + +@pytest.mark.parametrize("implementation, dtype", [ + pytest.param("MPI", dace.float32, 
marks=pytest.mark.mpi), + pytest.param("MPI", dace.int32, marks=pytest.mark.mpi) +]) +def test_win_fence(dtype): + from mpi4py import MPI + np_dtype = getattr(np, dtype.to_string()) + comm_world = MPI.COMM_WORLD + comm_rank = comm_world.Get_rank() + comm_size = comm_world.Get_size() + + if comm_size < 2: + raise ValueError("This test is supposed to be run with at least two processes!") + + sdfg = make_sdfg(dtype) + mpi_func = utils.distributed_compile(sdfg, comm_world) + + window_size = 10 + win_buffer = np.arange(0, window_size, dtype=np_dtype) + assertion = np.full([1], 0, dtype=np.int32) + + mpi_func(assertion=assertion, win_buffer=win_buffer, n=window_size) + +if __name__ == "__main__": + test_win_fence(dace.int32) + test_win_fence(dace.float32) From d3696e020bee85c7acab5ab90f23438cfafeff0a Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 18 Aug 2023 16:02:07 +0800 Subject: [PATCH 03/28] Added RMA put library node and its test --- dace/libraries/mpi/nodes/__init__.py | 1 + dace/libraries/mpi/nodes/win_put.py | 80 +++++++++++ tests/library/mpi/win_put_test.py | 205 +++++++++++++++++++++++++++ 3 files changed, 286 insertions(+) create mode 100644 dace/libraries/mpi/nodes/win_put.py create mode 100644 tests/library/mpi/win_put_test.py diff --git a/dace/libraries/mpi/nodes/__init__.py b/dace/libraries/mpi/nodes/__init__.py index 53097461d6..ba18c77bad 100644 --- a/dace/libraries/mpi/nodes/__init__.py +++ b/dace/libraries/mpi/nodes/__init__.py @@ -15,3 +15,4 @@ from .redistribute import Redistribute from .win_create import Win_create from .win_fence import Win_fence +from .win_put import Win_put diff --git a/dace/libraries/mpi/nodes/win_put.py b/dace/libraries/mpi/nodes/win_put.py new file mode 100644 index 0000000000..815e1b5a12 --- /dev/null +++ b/dace/libraries/mpi/nodes/win_put.py @@ -0,0 +1,80 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library +import dace.properties +import dace.sdfg.nodes +from dace.transformation.transformation import ExpandTransformation +from .. import environments +from dace.libraries.mpi.nodes.node import MPINode + + +@dace.library.expansion +class ExpandWinPutMPI(ExpandTransformation): + + environments = [environments.mpi.MPI] + + @staticmethod + def expansion(node, parent_state, parent_sdfg, **kwargs): + inbuffer, in_count_str = node.validate(parent_sdfg, parent_state) + mpi_dtype_str = dace.libraries.mpi.utils.MPI_DDT(inbuffer.dtype.base_type) + + window_name = node.window_name + + code = f""" + MPI_Put(_inbuffer, {in_count_str}, {mpi_dtype_str}, \ + _target_rank, 0, {in_count_str}, {mpi_dtype_str}, \ + __state->{window_name}_window); + """ + + tasklet = dace.sdfg.nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + language=dace.dtypes.Language.CPP, + side_effects=True) + return tasklet + + +@dace.library.node +class Win_put(MPINode): + + # Global properties + implementations = { + "MPI": ExpandWinPutMPI, + } + default_implementation = "MPI" + + window_name = dace.properties.Property(dtype=str, default=None) + + def __init__(self, name, window_name, *args, **kwargs): + super().__init__(name, *args, inputs={"_inbuffer", "_target_rank"}, outputs={"_out"}, **kwargs) + self.window_name = window_name + + def validate(self, sdfg, state): + """ + :return: A three-tuple (buffer, root) of the three data descriptors in the + parent SDFG. 
+ """ + + inbuffer, target_rank = None, None + for e in state.in_edges(self): + if e.dst_conn == "_inbuffer": + inbuffer = sdfg.arrays[e.data.data] + + in_count_str = "XXX" + for _, _, _, dst_conn, data in state.in_edges(self): + if dst_conn == '_inbuffer': + dims = [str(e) for e in data.subset.size_exact()] + in_count_str = "*".join(dims) + + # outbuffer = None + # for e in state.out_edges(self): + # if e.src_conn == "_outbuffer": + # outbuffer = sdfg.arrays[e.data.data] + # out_count_str = "XXX" + # for _, src_conn, _, _, data in state.out_edges(self): + # if src_conn == '_outbuffer': + # dims = [str(e) for e in data.subset.size_exact()] + # out_count_str = "*".join(dims) + + return inbuffer, in_count_str + diff --git a/tests/library/mpi/win_put_test.py b/tests/library/mpi/win_put_test.py new file mode 100644 index 0000000000..73b31be9b4 --- /dev/null +++ b/tests/library/mpi/win_put_test.py @@ -0,0 +1,205 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace +from dace.sdfg import utils +import dace.dtypes as dtypes +from dace.memlet import Memlet +import dace.libraries.mpi as mpi +import dace.frontend.common.distr as comm +import numpy as np +import pytest + + +############################################################################### + + +def make_sdfg(dtype): + n = dace.symbol("n") + + sdfg = dace.SDFG("mpi_win_put") + window_state = sdfg.add_state("create_window") + + sdfg.add_array("assertion", [1], dtype=dace.int32, transient=False) + sdfg.add_array("win_buffer", [n], dtype=dtype, transient=False) + sdfg.add_array("send_buffer", [n], dtype=dtype, transient=False) + sdfg.add_array("target_rank", [1], dace.dtypes.int32, transient=False) + + win_buffer = window_state.add_access("win_buffer") + + window_name = sdfg.add_window() + win_create_node = mpi.nodes.win_create.Win_create(window_name) + + window_state.add_edge(win_buffer, + None, + win_create_node, + '_win_buffer', + Memlet.simple(win_buffer, "0:n", num_accesses=n)) + + # for other nodes depends this window to connect + _, scal = sdfg.add_scalar(window_name, dace.int32, transient=True) + wnode = window_state.add_write(window_name) + window_state.add_edge(win_create_node, + "_out", + wnode, + None, + Memlet.from_array(window_name, scal)) + +############################################################################### + + fence_state_1 = sdfg.add_state("win_fence_1") + + sdfg.add_edge(window_state, fence_state_1, dace.InterstateEdge()) + + fence_name = sdfg.add_fence() + win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) + + # pseudo access for ordering + window_node = fence_state_1.add_access(window_name) + window_desc = sdfg.arrays[window_name] + + fence_state_1.add_edge(window_node, + None, + win_fence_node, + None, + Memlet.from_array(window_name, window_desc)) + + assertion_node = fence_state_1.add_access("assertion") + + fence_state_1.add_edge(assertion_node, + None, + win_fence_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(fence_name, dace.int32, transient=True) + wnode = fence_state_1.add_write(fence_name) + fence_state_1.add_edge(win_fence_node, + "_out", + wnode, + None, + Memlet.from_array(fence_name, scal)) + +############################################################################### + + put_state = sdfg.add_state("win_put") + + sdfg.add_edge(fence_state_1, put_state, dace.InterstateEdge()) + + put_name = sdfg.add_fence() + win_put_node = mpi.nodes.win_put.Win_put(put_name, window_name) 
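+    # The put node expands to an MPI_Put of the whole send buffer at target
+    # displacement 0 on this window; the edge from the preceding fence below
+    # exists only to order the put after that fence.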
+ + # pseudo access for ordering + fence_node = put_state.add_access(fence_name) + fence_desc = sdfg.arrays[fence_name] + + send_buffer = put_state.add_access("send_buffer") + + target_rank = put_state.add_access("target_rank") + + put_state.add_edge(fence_node, + None, + win_put_node, + None, + Memlet.from_array(fence_name, fence_desc)) + + put_state.add_edge(send_buffer, + None, + win_put_node, + "_inbuffer", + Memlet.simple(send_buffer, "0:n", num_accesses=n)) + + put_state.add_edge(target_rank, + None, + win_put_node, + "_target_rank", + Memlet.simple(target_rank, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(put_name, dace.int32, transient=True) + wnode = put_state.add_write(put_name) + put_state.add_edge(win_put_node, + "_out", + wnode, + None, + Memlet.from_array(put_name, scal)) + +############################################################################### + + fence_state_2 = sdfg.add_state("win_fence_2") + + sdfg.add_edge(put_state, fence_state_2, dace.InterstateEdge()) + + fence_name = sdfg.add_fence() + win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) + + # pseudo access for ordering + put_node = fence_state_2.add_access(put_name) + put_desc = sdfg.arrays[put_name] + + fence_state_2.add_edge(put_node, + None, + win_fence_node, + None, + Memlet.from_array(put_name, put_desc)) + + assertion_node = fence_state_2.add_access("assertion") + + fence_state_2.add_edge(assertion_node, + None, + win_fence_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(fence_name, dace.int32, transient=True) + wnode = fence_state_2.add_write(fence_name) + fence_state_2.add_edge(win_fence_node, + "_out", + wnode, + None, + Memlet.from_array(fence_name, scal)) + + return sdfg + + +############################################################################### + +@pytest.mark.parametrize("implementation, dtype", [ + pytest.param("MPI", dace.float32, marks=pytest.mark.mpi), + pytest.param("MPI", dace.int32, marks=pytest.mark.mpi) +]) +def test_win_put(dtype): + from mpi4py import MPI + np_dtype = getattr(np, dtype.to_string()) + comm_world = MPI.COMM_WORLD + comm_rank = comm_world.Get_rank() + comm_size = comm_world.Get_size() + + if comm_size < 2: + raise ValueError("This test is supposed to be run with at least two processes!") + + sdfg = make_sdfg(dtype) + mpi_func = utils.distributed_compile(sdfg, comm_world) + + window_size = 10 + win_buffer = np.full(window_size, comm_rank, dtype=np_dtype) + send_buffer = np.full(window_size, comm_rank, dtype=np_dtype) + + target_rank = np.array([(comm_rank + 1) % comm_size], dtype=np.int32) + + assertion = np.full([1], 0, dtype=np.int32) + + # print(comm_rank, win_buffer) + + mpi_func(assertion=assertion, + win_buffer=win_buffer, + send_buffer=send_buffer, + target_rank=target_rank, + n=window_size) + + # print(comm_rank, win_buffer) + + correct_data = np.full(window_size, (comm_rank - 1) % comm_size, dtype=np_dtype) + if (not np.allclose(win_buffer, correct_data)): + raise (ValueError("The received values are not what I expected on root.")) + +if __name__ == "__main__": + test_win_put(dace.int32) + test_win_put(dace.float32) From b92ccbb1bf6d76815359e138aa53148455f6ae50 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 18 Aug 2023 16:21:20 +0800 Subject: [PATCH 04/28] Added RMA get library node and its test --- dace/libraries/mpi/nodes/__init__.py | 1 + dace/libraries/mpi/nodes/win_get.py | 68 +++++++++ dace/libraries/mpi/nodes/win_put.py | 13 +- 
tests/library/mpi/win_get_test.py | 205 +++++++++++++++++++++++++++ 4 files changed, 275 insertions(+), 12 deletions(-) create mode 100644 dace/libraries/mpi/nodes/win_get.py create mode 100644 tests/library/mpi/win_get_test.py diff --git a/dace/libraries/mpi/nodes/__init__.py b/dace/libraries/mpi/nodes/__init__.py index ba18c77bad..5400bd45de 100644 --- a/dace/libraries/mpi/nodes/__init__.py +++ b/dace/libraries/mpi/nodes/__init__.py @@ -16,3 +16,4 @@ from .win_create import Win_create from .win_fence import Win_fence from .win_put import Win_put +from .win_get import Win_get diff --git a/dace/libraries/mpi/nodes/win_get.py b/dace/libraries/mpi/nodes/win_get.py new file mode 100644 index 0000000000..e05a5d6195 --- /dev/null +++ b/dace/libraries/mpi/nodes/win_get.py @@ -0,0 +1,68 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library +import dace.properties +import dace.sdfg.nodes +from dace.transformation.transformation import ExpandTransformation +from .. import environments +from dace.libraries.mpi.nodes.node import MPINode + + +@dace.library.expansion +class ExpandWinGetMPI(ExpandTransformation): + + environments = [environments.mpi.MPI] + + @staticmethod + def expansion(node, parent_state, parent_sdfg, **kwargs): + outbuffer, out_count_str = node.validate(parent_sdfg, parent_state) + mpi_dtype_str = dace.libraries.mpi.utils.MPI_DDT(outbuffer.dtype.base_type) + + window_name = node.window_name + + code = f""" + MPI_Get(_outbuffer, {out_count_str}, {mpi_dtype_str}, \ + _target_rank, 0, {out_count_str}, {mpi_dtype_str}, \ + __state->{window_name}_window); + """ + + tasklet = dace.sdfg.nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + language=dace.dtypes.Language.CPP, + side_effects=True) + return tasklet + + +@dace.library.node +class Win_get(MPINode): + + # Global properties + implementations = { + "MPI": ExpandWinGetMPI, + } + default_implementation = "MPI" + + window_name = dace.properties.Property(dtype=str, default=None) + + def __init__(self, name, window_name, *args, **kwargs): + super().__init__(name, *args, inputs={"_target_rank"}, outputs={"_out", "_outbuffer"}, **kwargs) + self.window_name = window_name + + def validate(self, sdfg, state): + """ + :return: A three-tuple (buffer, root) of the three data descriptors in the + parent SDFG. + """ + + outbuffer = None + for e in state.out_edges(self): + if e.src_conn == "_outbuffer": + outbuffer = sdfg.arrays[e.data.data] + out_count_str = "XXX" + for _, src_conn, _, _, data in state.out_edges(self): + if src_conn == '_outbuffer': + dims = [str(e) for e in data.subset.size_exact()] + out_count_str = "*".join(dims) + + return outbuffer, out_count_str diff --git a/dace/libraries/mpi/nodes/win_put.py b/dace/libraries/mpi/nodes/win_put.py index 815e1b5a12..6dd23a7324 100644 --- a/dace/libraries/mpi/nodes/win_put.py +++ b/dace/libraries/mpi/nodes/win_put.py @@ -55,7 +55,7 @@ def validate(self, sdfg, state): parent SDFG. 
""" - inbuffer, target_rank = None, None + inbuffer = None for e in state.in_edges(self): if e.dst_conn == "_inbuffer": inbuffer = sdfg.arrays[e.data.data] @@ -65,16 +65,5 @@ def validate(self, sdfg, state): if dst_conn == '_inbuffer': dims = [str(e) for e in data.subset.size_exact()] in_count_str = "*".join(dims) - - # outbuffer = None - # for e in state.out_edges(self): - # if e.src_conn == "_outbuffer": - # outbuffer = sdfg.arrays[e.data.data] - # out_count_str = "XXX" - # for _, src_conn, _, _, data in state.out_edges(self): - # if src_conn == '_outbuffer': - # dims = [str(e) for e in data.subset.size_exact()] - # out_count_str = "*".join(dims) return inbuffer, in_count_str - diff --git a/tests/library/mpi/win_get_test.py b/tests/library/mpi/win_get_test.py new file mode 100644 index 0000000000..656c129f80 --- /dev/null +++ b/tests/library/mpi/win_get_test.py @@ -0,0 +1,205 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace +from dace.sdfg import utils +import dace.dtypes as dtypes +from dace.memlet import Memlet +import dace.libraries.mpi as mpi +import dace.frontend.common.distr as comm +import numpy as np +import pytest + + +############################################################################### + + +def make_sdfg(dtype): + n = dace.symbol("n") + + sdfg = dace.SDFG("mpi_win_put") + window_state = sdfg.add_state("create_window") + + sdfg.add_array("assertion", [1], dtype=dace.int32, transient=False) + sdfg.add_array("win_buffer", [n], dtype=dtype, transient=False) + sdfg.add_array("receive_buffer", [n], dtype=dtype, transient=False) + sdfg.add_array("target_rank", [1], dace.dtypes.int32, transient=False) + + win_buffer = window_state.add_access("win_buffer") + + window_name = sdfg.add_window() + win_create_node = mpi.nodes.win_create.Win_create(window_name) + + window_state.add_edge(win_buffer, + None, + win_create_node, + '_win_buffer', + Memlet.simple(win_buffer, "0:n", num_accesses=n)) + + # for other nodes depends this window to connect + _, scal = sdfg.add_scalar(window_name, dace.int32, transient=True) + wnode = window_state.add_write(window_name) + window_state.add_edge(win_create_node, + "_out", + wnode, + None, + Memlet.from_array(window_name, scal)) + +############################################################################### + + fence_state_1 = sdfg.add_state("win_fence_1") + + sdfg.add_edge(window_state, fence_state_1, dace.InterstateEdge()) + + fence_name = sdfg.add_fence() + win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) + + # pseudo access for ordering + window_node = fence_state_1.add_access(window_name) + window_desc = sdfg.arrays[window_name] + + fence_state_1.add_edge(window_node, + None, + win_fence_node, + None, + Memlet.from_array(window_name, window_desc)) + + assertion_node = fence_state_1.add_access("assertion") + + fence_state_1.add_edge(assertion_node, + None, + win_fence_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(fence_name, dace.int32, transient=True) + wnode = fence_state_1.add_write(fence_name) + fence_state_1.add_edge(win_fence_node, + "_out", + wnode, + None, + Memlet.from_array(fence_name, scal)) + +############################################################################### + + get_state = sdfg.add_state("win_get") + + sdfg.add_edge(fence_state_1, get_state, dace.InterstateEdge()) + + get_name = sdfg.add_fence() + win_put_node = mpi.nodes.win_get.Win_get(get_name, window_name) + + # pseudo access for 
ordering + fence_node = get_state.add_access(fence_name) + fence_desc = sdfg.arrays[fence_name] + + target_rank = get_state.add_access("target_rank") + + get_state.add_edge(fence_node, + None, + win_put_node, + None, + Memlet.from_array(fence_name, fence_desc)) + + get_state.add_edge(target_rank, + None, + win_put_node, + "_target_rank", + Memlet.simple(target_rank, "0:1", num_accesses=1)) + + + receive_buffer = get_state.add_write("receive_buffer") + get_state.add_edge(win_put_node, + "_outbuffer", + receive_buffer, + None, + Memlet.simple(receive_buffer, "0:n", num_accesses=n)) + + _, scal = sdfg.add_scalar(get_name, dace.int32, transient=True) + wnode = get_state.add_write(get_name) + get_state.add_edge(win_put_node, + "_out", + wnode, + None, + Memlet.from_array(get_name, scal)) + +############################################################################### + + fence_state_2 = sdfg.add_state("win_fence_2") + + sdfg.add_edge(get_state, fence_state_2, dace.InterstateEdge()) + + fence_name = sdfg.add_fence() + win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) + + # pseudo access for ordering + put_node = fence_state_2.add_access(get_name) + put_desc = sdfg.arrays[get_name] + + fence_state_2.add_edge(put_node, + None, + win_fence_node, + None, + Memlet.from_array(get_name, put_desc)) + + assertion_node = fence_state_2.add_access("assertion") + + fence_state_2.add_edge(assertion_node, + None, + win_fence_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(fence_name, dace.int32, transient=True) + wnode = fence_state_2.add_write(fence_name) + fence_state_2.add_edge(win_fence_node, + "_out", + wnode, + None, + Memlet.from_array(fence_name, scal)) + + return sdfg + + +############################################################################### + +@pytest.mark.parametrize("implementation, dtype", [ + pytest.param("MPI", dace.float32, marks=pytest.mark.mpi), + pytest.param("MPI", dace.int32, marks=pytest.mark.mpi) +]) +def test_win_put(dtype): + from mpi4py import MPI + np_dtype = getattr(np, dtype.to_string()) + comm_world = MPI.COMM_WORLD + comm_rank = comm_world.Get_rank() + comm_size = comm_world.Get_size() + + if comm_size < 2: + raise ValueError("This test is supposed to be run with at least two processes!") + + sdfg = make_sdfg(dtype) + mpi_func = utils.distributed_compile(sdfg, comm_world) + + window_size = 10 + win_buffer = np.full(window_size, comm_rank, dtype=np_dtype) + receive_buffer = np.full(window_size, comm_rank, dtype=np_dtype) + + target_rank = np.array([(comm_rank + 1) % comm_size], dtype=np.int32) + + assertion = np.full([1], 0, dtype=np.int32) + + print(comm_rank, receive_buffer) + + mpi_func(assertion=assertion, + win_buffer=win_buffer, + receive_buffer=receive_buffer, + target_rank=target_rank, + n=window_size) + + print(comm_rank, receive_buffer) + + correct_data = np.full(window_size, (comm_rank + 1) % comm_size, dtype=np_dtype) + if (not np.allclose(receive_buffer, correct_data)): + raise (ValueError("The received values are not what I expected on root.")) + +if __name__ == "__main__": + test_win_put(dace.int32) + # test_win_put(dace.float32) From 6ebaed5e1fa23c58b29cfe2e23706675e6296855 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 18 Aug 2023 20:17:28 +0800 Subject: [PATCH 05/28] Removed debug msg in put/get tests Removed debug msg in put/get tests Renamed sdfg --- tests/library/mpi/win_get_test.py | 8 ++------ tests/library/mpi/win_put_test.py | 4 ---- 2 files changed, 
2 insertions(+), 10 deletions(-) diff --git a/tests/library/mpi/win_get_test.py b/tests/library/mpi/win_get_test.py index 656c129f80..697f4f6d54 100644 --- a/tests/library/mpi/win_get_test.py +++ b/tests/library/mpi/win_get_test.py @@ -15,7 +15,7 @@ def make_sdfg(dtype): n = dace.symbol("n") - sdfg = dace.SDFG("mpi_win_put") + sdfg = dace.SDFG("mpi_win_get") window_state = sdfg.add_state("create_window") sdfg.add_array("assertion", [1], dtype=dace.int32, transient=False) @@ -186,20 +186,16 @@ def test_win_put(dtype): assertion = np.full([1], 0, dtype=np.int32) - print(comm_rank, receive_buffer) - mpi_func(assertion=assertion, win_buffer=win_buffer, receive_buffer=receive_buffer, target_rank=target_rank, n=window_size) - print(comm_rank, receive_buffer) - correct_data = np.full(window_size, (comm_rank + 1) % comm_size, dtype=np_dtype) if (not np.allclose(receive_buffer, correct_data)): raise (ValueError("The received values are not what I expected on root.")) if __name__ == "__main__": test_win_put(dace.int32) - # test_win_put(dace.float32) + test_win_put(dace.float32) diff --git a/tests/library/mpi/win_put_test.py b/tests/library/mpi/win_put_test.py index 73b31be9b4..ce1c6ba2e3 100644 --- a/tests/library/mpi/win_put_test.py +++ b/tests/library/mpi/win_put_test.py @@ -186,16 +186,12 @@ def test_win_put(dtype): assertion = np.full([1], 0, dtype=np.int32) - # print(comm_rank, win_buffer) - mpi_func(assertion=assertion, win_buffer=win_buffer, send_buffer=send_buffer, target_rank=target_rank, n=window_size) - # print(comm_rank, win_buffer) - correct_data = np.full(window_size, (comm_rank - 1) % comm_size, dtype=np_dtype) if (not np.allclose(win_buffer, correct_data)): raise (ValueError("The received values are not what I expected on root.")) From b4e9e6579735e8cc0c8d6a7523a6eb6c7d33cb27 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 18 Aug 2023 21:27:04 +0800 Subject: [PATCH 06/28] Updated RMA ops administration in sdfg.py Removed a space Updated to remove put in get --- dace/distr_types.py | 24 ++++++++++++++ dace/sdfg/sdfg.py | 49 +++++++++++++++++++++++++++-- tests/library/mpi/win_fence_test.py | 2 +- tests/library/mpi/win_get_test.py | 31 +++++++++--------- tests/library/mpi/win_put_test.py | 6 ++-- 5 files changed, 89 insertions(+), 23 deletions(-) diff --git a/dace/distr_types.py b/dace/distr_types.py index 1b595a1b84..77e4730ad1 100644 --- a/dace/distr_types.py +++ b/dace/distr_types.py @@ -598,3 +598,27 @@ def exit_code(self, sdfg): delete[] __state->{self.name}_self_dst; delete[] __state->{self.name}_self_size; """ + +@make_properties +class RMA_window(object): + """ + RMA_window is the descriptor class for MPI Remote Memory Access window + Real window creation is implemented in mpi.nodes.win_create.Win_create + """ + + name = Property(dtype=str, desc="The name of new window.") + def __init__(self, + name: str): + self.name = name + self._validate() + + def validate(self): + """ Validate the correctness of this object. + Raises an exception on error. """ + self._validate() + + # Validation of this class is in a separate function, so that this + # class can call `_validate()` without calling the subclasses' + # `validate` function. 
+ def _validate(self): + return True diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index f3a37ef08c..1fb32cdbf8 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -32,7 +32,7 @@ from dace.sdfg.graph import OrderedDiGraph, Edge, SubgraphView from dace.sdfg.state import SDFGState from dace.sdfg.propagation import propagate_memlets_sdfg -from dace.distr_types import ProcessGrid, SubArray, RedistrArray +from dace.distr_types import ProcessGrid, SubArray, RedistrArray, RMA_window from dace.dtypes import validate_name from dace.properties import (DebugInfoProperty, EnumProperty, ListProperty, make_properties, Property, CodeProperty, TransformationHistProperty, OptionalSDFGReferenceProperty, DictProperty, CodeBlock) @@ -409,6 +409,16 @@ class SDFG(OrderedDiGraph[SDFGState, InterstateEdge]): desc="Process-grid descriptors for this SDFG", to_json=_arrays_to_json, from_json=_arrays_from_json) + _windows = DictProperty(str, + RMA_window, + desc="MPI RMA window descriptors for this SDFG", + to_json=_arrays_to_json, + from_json=_arrays_from_json) + _rma_ops = DictProperty(str, + str, + desc="MPI RMA fence descriptors for this SDFG", + to_json=_arrays_to_json, + from_json=_arrays_from_json) _subarrays = DictProperty(str, SubArray, desc="Sub-array descriptors for this SDFG", @@ -477,6 +487,8 @@ def __init__(self, # Grid-distribution-related fields self._pgrids = {} + self._windows = {} + self._rma_ops = {} self._subarrays = {} self._rdistrarrays = {} @@ -647,6 +659,16 @@ def process_grids(self): """ Returns a dictionary of process-grid descriptors (`ProcessGrid` objects) used in this SDFG. """ return self._pgrids + @property + def rma_windows(self): + """ Returns a dictionary of RMA window descriptors (`RMA_window` objects) used in this SDFG. """ + return self._windows + + @property + def rma_ops(self): + """ Returns a dictionary of RMA operations descriptors (an empty string) used in this SDFG. """ + return self._rma_ops + @property def subarrays(self): """ Returns a dictionary of sub-array descriptors (`SubArray` objects) used in this SDFG. """ @@ -1666,8 +1688,9 @@ def add_state_after(self, state: 'SDFGState', label=None, is_start_state=False) def _find_new_name(self, name: str): """ Tries to find a new name by adding an underscore and a number. """ - names = (self._arrays.keys() | self.constants_prop.keys() | self._pgrids.keys() | self._subarrays.keys() - | self._rdistrarrays.keys()) + names = (self._arrays.keys() | self.constants_prop.keys() | self._pgrids.keys() | + self._subarrays.keys() | self._rdistrarrays.keys() | self._windows.keys() | + self._rma_ops.keys()) return dt.find_new_name(name, names) def find_new_constant(self, name: str): @@ -2043,6 +2066,26 @@ def add_pgrid(self, return grid_name + def add_window(self): + """ Adds a RMA window to the RMA window descriptor store. + """ + + window_name = self._find_new_name('__win') + + self._windows[window_name] = RMA_window(window_name) + + return window_name + + def add_rma_ops(self): + """ Adds a RMA op to the RMA ops descriptor store. 
+ """ + + rma_op_name = self._find_new_name('__win_op') + + self._rma_ops[rma_op_name] = "" + + return rma_op_name + def add_subarray(self, dtype: dtypes.typeclass, shape: ShapeType, diff --git a/tests/library/mpi/win_fence_test.py b/tests/library/mpi/win_fence_test.py index 348945527e..e114e3c7cf 100644 --- a/tests/library/mpi/win_fence_test.py +++ b/tests/library/mpi/win_fence_test.py @@ -45,7 +45,7 @@ def make_sdfg(dtype): sdfg.add_edge(window_state, fence_state, dace.InterstateEdge()) - fence_name = sdfg.add_fence() + fence_name = sdfg.add_rma_ops() win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) # pseudo access for ordering diff --git a/tests/library/mpi/win_get_test.py b/tests/library/mpi/win_get_test.py index 697f4f6d54..18ac9d7cfb 100644 --- a/tests/library/mpi/win_get_test.py +++ b/tests/library/mpi/win_get_test.py @@ -49,7 +49,7 @@ def make_sdfg(dtype): sdfg.add_edge(window_state, fence_state_1, dace.InterstateEdge()) - fence_name = sdfg.add_fence() + fence_name = sdfg.add_rma_ops() win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) # pseudo access for ordering @@ -84,8 +84,8 @@ def make_sdfg(dtype): sdfg.add_edge(fence_state_1, get_state, dace.InterstateEdge()) - get_name = sdfg.add_fence() - win_put_node = mpi.nodes.win_get.Win_get(get_name, window_name) + get_name = sdfg.add_rma_ops() + win_get_node = mpi.nodes.win_get.Win_get(get_name, window_name) # pseudo access for ordering fence_node = get_state.add_access(fence_name) @@ -95,19 +95,18 @@ def make_sdfg(dtype): get_state.add_edge(fence_node, None, - win_put_node, + win_get_node, None, Memlet.from_array(fence_name, fence_desc)) get_state.add_edge(target_rank, None, - win_put_node, + win_get_node, "_target_rank", Memlet.simple(target_rank, "0:1", num_accesses=1)) - receive_buffer = get_state.add_write("receive_buffer") - get_state.add_edge(win_put_node, + get_state.add_edge(win_get_node, "_outbuffer", receive_buffer, None, @@ -115,7 +114,7 @@ def make_sdfg(dtype): _, scal = sdfg.add_scalar(get_name, dace.int32, transient=True) wnode = get_state.add_write(get_name) - get_state.add_edge(win_put_node, + get_state.add_edge(win_get_node, "_out", wnode, None, @@ -127,18 +126,18 @@ def make_sdfg(dtype): sdfg.add_edge(get_state, fence_state_2, dace.InterstateEdge()) - fence_name = sdfg.add_fence() + fence_name = sdfg.add_rma_ops() win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) # pseudo access for ordering - put_node = fence_state_2.add_access(get_name) - put_desc = sdfg.arrays[get_name] + get_node = fence_state_2.add_access(get_name) + get_desc = sdfg.arrays[get_name] - fence_state_2.add_edge(put_node, + fence_state_2.add_edge(get_node, None, win_fence_node, None, - Memlet.from_array(get_name, put_desc)) + Memlet.from_array(get_name, get_desc)) assertion_node = fence_state_2.add_access("assertion") @@ -165,7 +164,7 @@ def make_sdfg(dtype): pytest.param("MPI", dace.float32, marks=pytest.mark.mpi), pytest.param("MPI", dace.int32, marks=pytest.mark.mpi) ]) -def test_win_put(dtype): +def test_win_get(dtype): from mpi4py import MPI np_dtype = getattr(np, dtype.to_string()) comm_world = MPI.COMM_WORLD @@ -197,5 +196,5 @@ def test_win_put(dtype): raise (ValueError("The received values are not what I expected on root.")) if __name__ == "__main__": - test_win_put(dace.int32) - test_win_put(dace.float32) + test_win_get(dace.int32) + test_win_get(dace.float32) diff --git a/tests/library/mpi/win_put_test.py b/tests/library/mpi/win_put_test.py index ce1c6ba2e3..4352e7294d 
100644 --- a/tests/library/mpi/win_put_test.py +++ b/tests/library/mpi/win_put_test.py @@ -49,7 +49,7 @@ def make_sdfg(dtype): sdfg.add_edge(window_state, fence_state_1, dace.InterstateEdge()) - fence_name = sdfg.add_fence() + fence_name = sdfg.add_rma_ops() win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) # pseudo access for ordering @@ -84,7 +84,7 @@ def make_sdfg(dtype): sdfg.add_edge(fence_state_1, put_state, dace.InterstateEdge()) - put_name = sdfg.add_fence() + put_name = sdfg.add_rma_ops() win_put_node = mpi.nodes.win_put.Win_put(put_name, window_name) # pseudo access for ordering @@ -127,7 +127,7 @@ def make_sdfg(dtype): sdfg.add_edge(put_state, fence_state_2, dace.InterstateEdge()) - fence_name = sdfg.add_fence() + fence_name = sdfg.add_rma_ops() win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) # pseudo access for ordering From 5be096288852e6b401f5ab390d3a9c1e3897f986 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 18 Aug 2023 21:52:54 +0800 Subject: [PATCH 07/28] Implemented replacement for MPI RMA win_create, fence, put, get --- dace/frontend/common/distr.py | 241 +++++++++++++++++++++++++++++++ dace/frontend/python/newast.py | 5 + tests/library/mpi/mpi4py_test.py | 68 +++++++++ 3 files changed, 314 insertions(+) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index d6f22da358..e3107457a4 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -15,6 +15,27 @@ RankType = Union[Integral, str, symbolic.symbol, symbolic.SymExpr, symbolic.sympy.Basic] ProgramVisitor = 'dace.frontend.python.newast.ProgramVisitor' +# a helper function for getting an access node by argument name +# creates a scalar if it's a number +def _get_int_arg_node(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + argument: Union[str, sp.Expr, Number] + ): + if isinstance(argument, str) and argument in sdfg.arrays.keys(): + arg_name = argument + arg_node = state.add_read(arg_name) + else: + # create a transient scalar and take its name + arg_name = _define_local_scalar(pv, sdfg, state, dace.int32) + arg_node = state.add_access(arg_name) + # every tasklet is in different scope, no need to worry about name confilct + color_tasklet = state.add_tasklet(f'_set_{arg_name}_', {}, {'__out'}, f'__out = {argument}') + state.add_edge(color_tasklet, '__out', arg_node, None, Memlet.simple(arg_node, '0')) + + return arg_name, arg_node + + ##### MPI Cartesian Communicators @@ -894,6 +915,226 @@ def _wait(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, request: str): return None +@oprepo.replaces('mpi4py.MPI.Win.Create') +@oprepo.replaces('dace.Win.Create') +def _rma_window_create(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + buffer: str, + comm: Union[str, ShapeType], + grid: str = None): + """ Adds a RMA window to the DaCe Program. + + :param buffer: The name of window buffer. + :param comm: A dummy input for compatibility with mpi4py + :process_grid: Name of the process-grid for collective scatter/gather operations. + :return: Name of the window. 
+ """ + + from dace.libraries.mpi.nodes.win_create import Win_create + + # fine a new window name + window_name = sdfg.add_window() + + window_node = Win_create(window_name, grid) + + buf_desc = sdfg.arrays[buffer] + buf_node = state.add_read(buffer) + state.add_edge(buf_node, + None, + window_node, + '_win_buffer', + Memlet.from_array(buffer, buf_desc)) + + # Pseudo-writing for newast.py #3195 check and complete Processcomm creation + _, scal = sdfg.add_scalar(window_name, dace.int32, transient=True) + wnode = state.add_write(window_name) + state.add_edge(window_node, + "_out", + wnode, + None, + Memlet.from_array(window_name, scal)) + + return window_name + + +@oprepo.replaces_method('RMA_window', 'Fence') +def _rma_fence(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + window_name: str, + assertion: Union[str, sp.Expr, Number] = 0): + """ Adds a RMA fence to the DaCe Program. + + :param window_name: The name of the window to be sychronized. + :param assertion: A value or scalar for fence assertion. + :return: Name of the fence. + """ + + from dace.libraries.mpi.nodes.win_fence import Win_fence + + # fine a new fence name + fence_name = sdfg.add_rma_ops() + + _, assertion_node = _get_int_arg_node(pv, sdfg, state, assertion) + + fence_node = Win_fence(fence_name, window_name) + + # check for the last RMA operation + all_rma_ops_name = list(sdfg._rma_ops.keys()) + if len(all_rma_ops_name) == 1: + last_rma_op_name = window_name + else: + last_rma_op_name = all_rma_ops_name[all_rma_ops_name.index(fence_name) - 1] + + last_rma_op_node = state.add_read(last_rma_op_name) + last_rma_op_desc = sdfg.arrays[last_rma_op_name] + + # for window fence ordering + state.add_edge(last_rma_op_node, + None, + fence_node, + None, + Memlet.from_array(last_rma_op_name, last_rma_op_desc)) + + state.add_edge(assertion_node, + None, + fence_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + # Pseudo-writing for newast.py #3195 check and complete Processcomm creation + _, scal = sdfg.add_scalar(fence_name, dace.int32, transient=True) + wnode = state.add_write(fence_name) + state.add_edge(fence_node, + "_out", + wnode, + None, + Memlet.from_array(fence_name, scal)) + + return window_name + + +@oprepo.replaces_method('RMA_window', 'Put') +def _rma_put(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + window_name: str, + origin: str, + target_rank: Union[str, sp.Expr, Number] = 0): + """ Initiate a RMA put for the DaCe Program. + + :param window_name: The name of the window to be sychronized. + :param origin: The name of origin buffer. + :target_rank: A value or scalar of the target rank. + :return: Name of the new RMA put descriptor. 
+ """ + + from dace.libraries.mpi.nodes.win_put import Win_put + + put_name = sdfg.add_rma_ops() + + # check for the last RMA operation + all_rma_ops_name = list(sdfg._rma_ops.keys()) + last_rma_op_name = all_rma_ops_name[all_rma_ops_name.index(put_name) - 1] + + put_node = Win_put(put_name, window_name) + + last_rma_op_node = state.add_read(last_rma_op_name) + last_rma_op_desc = sdfg.arrays[last_rma_op_name] + state.add_edge(last_rma_op_node, + None, + put_node, + None, + Memlet.from_array(last_rma_op_name, last_rma_op_desc)) + + origin_node = state.add_read(origin) + origin_desc = sdfg.arrays[origin] + state.add_edge(origin_node, + None, + put_node, + '_inbuffer', + Memlet.from_array(origin, origin_desc)) + + _, target_rank_node = _get_int_arg_node(pv, sdfg, state, target_rank) + state.add_edge(target_rank_node, + None, + put_node, + '_target_rank', + Memlet.simple(target_rank_node, "0:1", num_accesses=1)) + + # Pseudo-writing for newast.py #3195 check and complete Processcomm creation + _, scal = sdfg.add_scalar(put_name, dace.int32, transient=True) + wnode = state.add_write(put_name) + state.add_edge(put_node, + "_out", + wnode, + None, + Memlet.from_array(put_name, scal)) + + return put_name + + +@oprepo.replaces_method('RMA_window', 'Get') +def _rma_get(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + window_name: str, + origin: str, + target_rank: Union[str, sp.Expr, Number] = 0): + """ Initiate a RMA get for the DaCe Program. + + :param window_name: The name of the window to be sychronized. + :param origin: The name of origin buffer. + :target_rank: A value or scalar of the target rank. + :return: Name of the new RMA get descriptor. + """ + + from dace.libraries.mpi.nodes.win_get import Win_get + + get_name = sdfg.add_rma_ops() + + # check for the last RMA operation + all_rma_ops_name = list(sdfg._rma_ops.keys()) + last_rma_op_name = all_rma_ops_name[all_rma_ops_name.index(get_name) - 1] + + get_node = Win_get(get_name, window_name) + + last_rma_op_node = state.add_read(last_rma_op_name) + last_rma_op_desc = sdfg.arrays[last_rma_op_name] + state.add_edge(last_rma_op_node, + None, + get_node, + None, + Memlet.from_array(last_rma_op_name, last_rma_op_desc)) + + _, target_rank_node = _get_int_arg_node(pv, sdfg, state, target_rank) + state.add_edge(target_rank_node, + None, + get_node, + '_target_rank', + Memlet.simple(target_rank_node, "0:1", num_accesses=1)) + + origin_node = state.add_write(origin) + origin_desc = sdfg.arrays[origin] + state.add_edge(get_node, + '_outbuffer', + origin_node, + None, + Memlet.from_array(origin, origin_desc)) + + # Pseudo-writing for newast.py #3195 check and complete Processcomm creation + _, scal = sdfg.add_scalar(get_name, dace.int32, transient=True) + wnode = state.add_write(get_name) + state.add_edge(get_node, + '_out', + wnode, + None, + Memlet.from_array(get_name, scal)) + + return get_name + + @oprepo.replaces('dace.comm.Subarray') def _subarray(pv: ProgramVisitor, sdfg: SDFG, diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index c9d92b7860..5e92f6b487 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -1307,6 +1307,9 @@ def defined(self): result.update( {k: self.sdfg.process_grids[v] for k, v in self.variables.items() if v in self.sdfg.process_grids}) + result.update( + {k: self.sdfg.rma_windows[v] + for k, v in self.variables.items() if v in self.sdfg.rma_windows}) try: from mpi4py import MPI result.update({k: v for k, v in self.globals.items() if isinstance(v, MPI.Comm)}) 
@@ -4686,6 +4689,8 @@ def _gettype(self, opnode: ast.AST) -> List[Tuple[str, str]]: for operand in operands: if isinstance(operand, str) and operand in self.sdfg.process_grids: result.append((operand, type(self.sdfg.process_grids[operand]).__name__)) + elif isinstance(operand, str) and operand in self.sdfg.rma_windows: + result.append((operand, type(self.sdfg.rma_windows[operand]).__name__)) elif isinstance(operand, str) and operand in self.sdfg.arrays: result.append((operand, type(self.sdfg.arrays[operand]))) elif isinstance(operand, str) and operand in self.scope_arrays: diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index 52b5deb7a8..ca185077b8 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -39,6 +39,72 @@ def comm_world_bcast(A: dace.int32[10]): assert (np.array_equal(A, A_ref)) +@pytest.mark.mpi +def test_RMA_put(): + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + @dace.program + def mpi4py_rma_put(win_buf: dace.int32[10], send_buf: dace.int32[10], rank: dace.int32): + win = MPI.Win.Create(win_buf, comm=commworld) + win.Fence(0) + win.Put(send_buf, target_rank=rank) + win.Fence(0) + + if size < 2: + raise ValueError("Please run this test with at least two processes.") + + sdfg = None + if rank == 0: + sdfg = mpi4py_rma_put.to_sdfg() + func = utils.distributed_compile(sdfg, commworld) + + window_size = 10 + win_buffer = np.full(window_size, rank, dtype=np.int32) + win_buffer_ref = np.full(window_size, rank, dtype=np.int32) + send_buffer = np.full(window_size, rank, dtype=np.int32) + + func(win_buf=win_buffer, send_buf=send_buffer, rank=((rank + 1) % size)) + mpi4py_rma_put.f(win_buf=win_buffer_ref, send_buf=send_buffer, rank=((rank + 1) % size)) + + assert (np.array_equal(win_buffer, win_buffer_ref)) + + +@pytest.mark.mpi +def test_RMA_get(): + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + @dace.program + def mpi4py_rma_get(win_buf: dace.int32[10], recv_buf: dace.int32[10], rank: dace.int32): + win = MPI.Win.Create(win_buf, comm=commworld) + win.Fence(0) + win.Get(recv_buf, target_rank=rank) + win.Fence(0) + + if size < 2: + raise ValueError("Please run this test with at least two processes.") + + sdfg = None + if rank == 0: + sdfg = mpi4py_rma_get.to_sdfg() + func = utils.distributed_compile(sdfg, commworld) + + window_size = 10 + win_buffer = np.full(window_size, rank, dtype=np.int32) + recv_buf = np.full(window_size, rank, dtype=np.int32) + recv_buf_ref = np.full(window_size, rank, dtype=np.int32) + + func(win_buf=win_buffer, recv_buf=recv_buf, rank=((rank + 1) % size)) + mpi4py_rma_get.f(win_buf=win_buffer, recv_buf=recv_buf_ref, rank=((rank + 1) % size)) + + assert (np.array_equal(recv_buf, recv_buf_ref)) + + @pytest.mark.mpi def test_external_comm_bcast(): @@ -348,3 +414,5 @@ def mpi4py_alltoall(rank: dace.int32, size: dace.compiletime): test_isend_irecv() test_send_recv() test_alltoall() + test_RMA_put() + test_RMA_get() From ebf8eeaa5617f9fef64e3f9e4fc50cdb45fe91c5 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 25 Aug 2023 14:04:20 +0800 Subject: [PATCH 08/28] Added an extra connector to RMA put,get for ordering --- dace/frontend/common/distr.py | 4 ++-- dace/libraries/mpi/nodes/win_get.py | 2 +- dace/libraries/mpi/nodes/win_put.py | 2 +- tests/library/mpi/win_get_test.py | 2 +- tests/library/mpi/win_put_test.py | 2 +- 5 files changed, 6 insertions(+), 6 
deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index e3107457a4..7b79556eb2 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -1045,7 +1045,7 @@ def _rma_put(pv: ProgramVisitor, state.add_edge(last_rma_op_node, None, put_node, - None, + "_in", Memlet.from_array(last_rma_op_name, last_rma_op_desc)) origin_node = state.add_read(origin) @@ -1105,7 +1105,7 @@ def _rma_get(pv: ProgramVisitor, state.add_edge(last_rma_op_node, None, get_node, - None, + "_in", Memlet.from_array(last_rma_op_name, last_rma_op_desc)) _, target_rank_node = _get_int_arg_node(pv, sdfg, state, target_rank) diff --git a/dace/libraries/mpi/nodes/win_get.py b/dace/libraries/mpi/nodes/win_get.py index e05a5d6195..fb8f6bacb9 100644 --- a/dace/libraries/mpi/nodes/win_get.py +++ b/dace/libraries/mpi/nodes/win_get.py @@ -46,7 +46,7 @@ class Win_get(MPINode): window_name = dace.properties.Property(dtype=str, default=None) def __init__(self, name, window_name, *args, **kwargs): - super().__init__(name, *args, inputs={"_target_rank"}, outputs={"_out", "_outbuffer"}, **kwargs) + super().__init__(name, *args, inputs={"_in", "_target_rank"}, outputs={"_out", "_outbuffer"}, **kwargs) self.window_name = window_name def validate(self, sdfg, state): diff --git a/dace/libraries/mpi/nodes/win_put.py b/dace/libraries/mpi/nodes/win_put.py index 6dd23a7324..de3811cd7c 100644 --- a/dace/libraries/mpi/nodes/win_put.py +++ b/dace/libraries/mpi/nodes/win_put.py @@ -46,7 +46,7 @@ class Win_put(MPINode): window_name = dace.properties.Property(dtype=str, default=None) def __init__(self, name, window_name, *args, **kwargs): - super().__init__(name, *args, inputs={"_inbuffer", "_target_rank"}, outputs={"_out"}, **kwargs) + super().__init__(name, *args, inputs={"_in", "_inbuffer", "_target_rank"}, outputs={"_out"}, **kwargs) self.window_name = window_name def validate(self, sdfg, state): diff --git a/tests/library/mpi/win_get_test.py b/tests/library/mpi/win_get_test.py index 18ac9d7cfb..8be9245278 100644 --- a/tests/library/mpi/win_get_test.py +++ b/tests/library/mpi/win_get_test.py @@ -96,7 +96,7 @@ def make_sdfg(dtype): get_state.add_edge(fence_node, None, win_get_node, - None, + "_in", Memlet.from_array(fence_name, fence_desc)) get_state.add_edge(target_rank, diff --git a/tests/library/mpi/win_put_test.py b/tests/library/mpi/win_put_test.py index 4352e7294d..56fd7bdc67 100644 --- a/tests/library/mpi/win_put_test.py +++ b/tests/library/mpi/win_put_test.py @@ -98,7 +98,7 @@ def make_sdfg(dtype): put_state.add_edge(fence_node, None, win_put_node, - None, + "_in", Memlet.from_array(fence_name, fence_desc)) put_state.add_edge(send_buffer, From 6fa9603ad44f8777106ff4467760513aa4c2e0c4 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 25 Aug 2023 14:10:12 +0800 Subject: [PATCH 09/28] Updated amd_rma_ops for better readability --- dace/frontend/common/distr.py | 6 +++--- dace/sdfg/sdfg.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index 7b79556eb2..e1e47a466a 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -974,7 +974,7 @@ def _rma_fence(pv: ProgramVisitor, from dace.libraries.mpi.nodes.win_fence import Win_fence # fine a new fence name - fence_name = sdfg.add_rma_ops() + fence_name = sdfg.add_rma_ops("fence") _, assertion_node = _get_int_arg_node(pv, sdfg, state, assertion) @@ -1032,7 +1032,7 @@ def _rma_put(pv: ProgramVisitor, from 
dace.libraries.mpi.nodes.win_put import Win_put - put_name = sdfg.add_rma_ops() + put_name = sdfg.add_rma_ops("put") # check for the last RMA operation all_rma_ops_name = list(sdfg._rma_ops.keys()) @@ -1092,7 +1092,7 @@ def _rma_get(pv: ProgramVisitor, from dace.libraries.mpi.nodes.win_get import Win_get - get_name = sdfg.add_rma_ops() + get_name = sdfg.add_rma_ops("get") # check for the last RMA operation all_rma_ops_name = list(sdfg._rma_ops.keys()) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 1fb32cdbf8..dc0ca05132 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -2076,11 +2076,11 @@ def add_window(self): return window_name - def add_rma_ops(self): + def add_rma_ops(self, op:str): """ Adds a RMA op to the RMA ops descriptor store. """ - rma_op_name = self._find_new_name('__win_op') + rma_op_name = self._find_new_name(f'__win_{op}') self._rma_ops[rma_op_name] = "" From ce8c173d83a97eacb75a40361de73bd061eba652 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 25 Aug 2023 14:44:29 +0800 Subject: [PATCH 10/28] Added support of different comm world for win_create --- dace/frontend/common/distr.py | 9 +++++++-- dace/libraries/mpi/nodes/win_create.py | 10 +++++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index e1e47a466a..d4951a2257 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -926,17 +926,22 @@ def _rma_window_create(pv: ProgramVisitor, """ Adds a RMA window to the DaCe Program. :param buffer: The name of window buffer. - :param comm: A dummy input for compatibility with mpi4py + :param comm: The comm world name of this window :process_grid: Name of the process-grid for collective scatter/gather operations. :return: Name of the window. 
""" from dace.libraries.mpi.nodes.win_create import Win_create + # if 'comm' is not a 'str' means it's using mpi4py objects + # which can only be deafult the comm world + if not isinstance(comm, str): + comm = None + # fine a new window name window_name = sdfg.add_window() - window_node = Win_create(window_name, grid) + window_node = Win_create(window_name, comm) buf_desc = sdfg.arrays[buffer] buf_node = state.add_read(buffer) diff --git a/dace/libraries/mpi/nodes/win_create.py b/dace/libraries/mpi/nodes/win_create.py index 5d1bff89c6..e3f7ba10d0 100644 --- a/dace/libraries/mpi/nodes/win_create.py +++ b/dace/libraries/mpi/nodes/win_create.py @@ -24,8 +24,8 @@ def expansion(node, parent_state, parent_sdfg, **kwargs): ] comm = "MPI_COMM_WORLD" - if node.grid: - comm = f"__state->{node.grid}_comm" + if node.comm: + comm = f"__state->{node.comm}_comm" code = f""" MPI_Win_create(_win_buffer, @@ -56,11 +56,11 @@ class Win_create(MPINode): } default_implementation = "MPI" - grid = dace.properties.Property(dtype=str, allow_none=True, default=None) + comm = dace.properties.Property(dtype=str, allow_none=True, default=None) - def __init__(self, name, grid=None, *args, **kwargs): + def __init__(self, name, comm=None, *args, **kwargs): super().__init__(name, *args, inputs={"_win_buffer"}, outputs={"_out"}, **kwargs) - self.grid = grid + self.comm = comm def validate(self, sdfg, state): """ From 1e8606fcc3f0902ef7aa5170218251cf4b633570 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 25 Aug 2023 17:05:40 +0800 Subject: [PATCH 11/28] Added a synchronization check for RMA put/get --- dace/frontend/common/distr.py | 40 +++++++++++++++++++++++++++++------ dace/sdfg/sdfg.py | 6 +++--- 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index d4951a2257..e27b24fe30 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -979,7 +979,7 @@ def _rma_fence(pv: ProgramVisitor, from dace.libraries.mpi.nodes.win_fence import Win_fence # fine a new fence name - fence_name = sdfg.add_rma_ops("fence") + fence_name = sdfg.add_rma_ops(window_name, "fence") _, assertion_node = _get_int_arg_node(pv, sdfg, state, assertion) @@ -987,10 +987,12 @@ def _rma_fence(pv: ProgramVisitor, # check for the last RMA operation all_rma_ops_name = list(sdfg._rma_ops.keys()) - if len(all_rma_ops_name) == 1: + cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name + if f"{window_name}_" in rma_op] + if len(cur_window_rma_ops) == 1: last_rma_op_name = window_name else: - last_rma_op_name = all_rma_ops_name[all_rma_ops_name.index(fence_name) - 1] + last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(fence_name) - 1] last_rma_op_node = state.add_read(last_rma_op_name) last_rma_op_desc = sdfg.arrays[last_rma_op_name] @@ -1037,11 +1039,23 @@ def _rma_put(pv: ProgramVisitor, from dace.libraries.mpi.nodes.win_put import Win_put - put_name = sdfg.add_rma_ops("put") + put_name = sdfg.add_rma_ops(window_name, "put") # check for the last RMA operation all_rma_ops_name = list(sdfg._rma_ops.keys()) - last_rma_op_name = all_rma_ops_name[all_rma_ops_name.index(put_name) - 1] + cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name + if f"{window_name}_" in rma_op] + cur_window_fences = [rma_op for rma_op in cur_window_rma_ops + if f"{window_name}_fence" in rma_op] + + if len(cur_window_fences) % 2: + # if only odd number of fences, + # that means we're in a ongoing epoch + last_rma_op_name = 
cur_window_rma_ops[cur_window_rma_ops.index(put_name) - 1] + else: + # if even number of fences, + # that means this operation is either a passive sync. one or a corrupted one + raise ValueError("Wrong synchronization of RMA calls!") put_node = Win_put(put_name, window_name) @@ -1097,11 +1111,23 @@ def _rma_get(pv: ProgramVisitor, from dace.libraries.mpi.nodes.win_get import Win_get - get_name = sdfg.add_rma_ops("get") + get_name = sdfg.add_rma_ops(window_name, "get") # check for the last RMA operation all_rma_ops_name = list(sdfg._rma_ops.keys()) - last_rma_op_name = all_rma_ops_name[all_rma_ops_name.index(get_name) - 1] + cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name + if f"{window_name}_" in rma_op] + cur_window_fences = [rma_op for rma_op in cur_window_rma_ops + if f"{window_name}_fence" in rma_op] + + if len(cur_window_fences) % 2: + # if only odd number of fences, + # that means we're in a ongoing epoch + last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(get_name) - 1] + else: + # if even number of fences, + # that means this operation is either a passive sync. one or a corrupted one + raise ValueError("Wrong synchronization of RMA calls!") get_node = Win_get(get_name, window_name) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index dc0ca05132..edf71bcb4b 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -416,7 +416,7 @@ class SDFG(OrderedDiGraph[SDFGState, InterstateEdge]): from_json=_arrays_from_json) _rma_ops = DictProperty(str, str, - desc="MPI RMA fence descriptors for this SDFG", + desc="MPI RMA ops descriptors for this SDFG", to_json=_arrays_to_json, from_json=_arrays_from_json) _subarrays = DictProperty(str, @@ -2076,11 +2076,11 @@ def add_window(self): return window_name - def add_rma_ops(self, op:str): + def add_rma_ops(self, window_name:str, op:str): """ Adds a RMA op to the RMA ops descriptor store. 
""" - rma_op_name = self._find_new_name(f'__win_{op}') + rma_op_name = self._find_new_name(f'{window_name}_{op}') self._rma_ops[rma_op_name] = "" From c4d2ab90a6fc0b3f106e351ddcb59b297ace5184 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 25 Aug 2023 21:28:44 +0800 Subject: [PATCH 12/28] Updated fence, get, and put tests --- tests/library/mpi/win_fence_test.py | 2 +- tests/library/mpi/win_get_test.py | 6 +++--- tests/library/mpi/win_put_test.py | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/library/mpi/win_fence_test.py b/tests/library/mpi/win_fence_test.py index e114e3c7cf..4c2b9beb9a 100644 --- a/tests/library/mpi/win_fence_test.py +++ b/tests/library/mpi/win_fence_test.py @@ -45,7 +45,7 @@ def make_sdfg(dtype): sdfg.add_edge(window_state, fence_state, dace.InterstateEdge()) - fence_name = sdfg.add_rma_ops() + fence_name = sdfg.add_rma_ops(window_name, "fence") win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) # pseudo access for ordering diff --git a/tests/library/mpi/win_get_test.py b/tests/library/mpi/win_get_test.py index 8be9245278..50c3d5d7e2 100644 --- a/tests/library/mpi/win_get_test.py +++ b/tests/library/mpi/win_get_test.py @@ -49,7 +49,7 @@ def make_sdfg(dtype): sdfg.add_edge(window_state, fence_state_1, dace.InterstateEdge()) - fence_name = sdfg.add_rma_ops() + fence_name = sdfg.add_rma_ops(window_name, "fence") win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) # pseudo access for ordering @@ -84,7 +84,7 @@ def make_sdfg(dtype): sdfg.add_edge(fence_state_1, get_state, dace.InterstateEdge()) - get_name = sdfg.add_rma_ops() + get_name = sdfg.add_rma_ops(window_name, "get") win_get_node = mpi.nodes.win_get.Win_get(get_name, window_name) # pseudo access for ordering @@ -126,7 +126,7 @@ def make_sdfg(dtype): sdfg.add_edge(get_state, fence_state_2, dace.InterstateEdge()) - fence_name = sdfg.add_rma_ops() + fence_name = sdfg.add_rma_ops(window_name, "fence") win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) # pseudo access for ordering diff --git a/tests/library/mpi/win_put_test.py b/tests/library/mpi/win_put_test.py index 56fd7bdc67..d08001a646 100644 --- a/tests/library/mpi/win_put_test.py +++ b/tests/library/mpi/win_put_test.py @@ -49,7 +49,7 @@ def make_sdfg(dtype): sdfg.add_edge(window_state, fence_state_1, dace.InterstateEdge()) - fence_name = sdfg.add_rma_ops() + fence_name = sdfg.add_rma_ops(window_name, "fence") win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) # pseudo access for ordering @@ -84,7 +84,7 @@ def make_sdfg(dtype): sdfg.add_edge(fence_state_1, put_state, dace.InterstateEdge()) - put_name = sdfg.add_rma_ops() + put_name = sdfg.add_rma_ops(window_name, "put") win_put_node = mpi.nodes.win_put.Win_put(put_name, window_name) # pseudo access for ordering @@ -127,7 +127,7 @@ def make_sdfg(dtype): sdfg.add_edge(put_state, fence_state_2, dace.InterstateEdge()) - fence_name = sdfg.add_rma_ops() + fence_name = sdfg.add_rma_ops(window_name, "fence") win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) # pseudo access for ordering From 0a564bb483601dc4711dbba72076be67f20549a2 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Mon, 28 Aug 2023 16:37:32 +0800 Subject: [PATCH 13/28] Replaced SDFG compile to avoid RuntimeError: Could not load library --- tests/library/mpi/win_create_test.py | 10 +++++++--- tests/library/mpi/win_fence_test.py | 10 +++++++--- tests/library/mpi/win_get_test.py | 10 +++++++--- 
tests/library/mpi/win_put_test.py | 10 +++++++--- 4 files changed, 28 insertions(+), 12 deletions(-) diff --git a/tests/library/mpi/win_create_test.py b/tests/library/mpi/win_create_test.py index f5f8b58f78..db9d356c74 100644 --- a/tests/library/mpi/win_create_test.py +++ b/tests/library/mpi/win_create_test.py @@ -58,9 +58,13 @@ def test_win_create(dtype): if comm_size < 2: raise ValueError("This test is supposed to be run with at least two processes!") - - sdfg = make_sdfg(dtype) - mpi_func = utils.distributed_compile(sdfg, comm_world) + + mpi_func = None + for r in range(0, comm_size): + if r == comm_rank: + sdfg = make_sdfg(dtype) + mpi_func = sdfg.compile() + comm_world.Barrier() window_size = 10 win_buffer = np.arange(0, window_size, dtype=np_dtype) diff --git a/tests/library/mpi/win_fence_test.py b/tests/library/mpi/win_fence_test.py index 4c2b9beb9a..20a6b11f0f 100644 --- a/tests/library/mpi/win_fence_test.py +++ b/tests/library/mpi/win_fence_test.py @@ -93,9 +93,13 @@ def test_win_fence(dtype): if comm_size < 2: raise ValueError("This test is supposed to be run with at least two processes!") - - sdfg = make_sdfg(dtype) - mpi_func = utils.distributed_compile(sdfg, comm_world) + + mpi_func = None + for r in range(0, comm_size): + if r == comm_rank: + sdfg = make_sdfg(dtype) + mpi_func = sdfg.compile() + comm_world.Barrier() window_size = 10 win_buffer = np.arange(0, window_size, dtype=np_dtype) diff --git a/tests/library/mpi/win_get_test.py b/tests/library/mpi/win_get_test.py index 50c3d5d7e2..9f2e780d69 100644 --- a/tests/library/mpi/win_get_test.py +++ b/tests/library/mpi/win_get_test.py @@ -173,9 +173,13 @@ def test_win_get(dtype): if comm_size < 2: raise ValueError("This test is supposed to be run with at least two processes!") - - sdfg = make_sdfg(dtype) - mpi_func = utils.distributed_compile(sdfg, comm_world) + + mpi_func = None + for r in range(0, comm_size): + if r == comm_rank: + sdfg = make_sdfg(dtype) + mpi_func = sdfg.compile() + comm_world.Barrier() window_size = 10 win_buffer = np.full(window_size, comm_rank, dtype=np_dtype) diff --git a/tests/library/mpi/win_put_test.py b/tests/library/mpi/win_put_test.py index d08001a646..0e8af8487b 100644 --- a/tests/library/mpi/win_put_test.py +++ b/tests/library/mpi/win_put_test.py @@ -174,9 +174,13 @@ def test_win_put(dtype): if comm_size < 2: raise ValueError("This test is supposed to be run with at least two processes!") - - sdfg = make_sdfg(dtype) - mpi_func = utils.distributed_compile(sdfg, comm_world) + + mpi_func = None + for r in range(0, comm_size): + if r == comm_rank: + sdfg = make_sdfg(dtype) + mpi_func = sdfg.compile() + comm_world.Barrier() window_size = 10 win_buffer = np.full(window_size, comm_rank, dtype=np_dtype) From d1f6ba2a87d4d501afcc1d7be1bafc86a8807392 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Mon, 28 Aug 2023 17:37:15 +0800 Subject: [PATCH 14/28] Added RMA accumulate library node, test, and replacement --- dace/frontend/common/distr.py | 81 +++++++- dace/libraries/mpi/nodes/__init__.py | 1 + dace/libraries/mpi/nodes/win_accumulate.py | 72 ++++++++ tests/library/mpi/mpi4py_test.py | 34 ++++ tests/library/mpi/win_accumulate_test.py | 205 +++++++++++++++++++++ 5 files changed, 391 insertions(+), 2 deletions(-) create mode 100644 dace/libraries/mpi/nodes/win_accumulate.py create mode 100644 tests/library/mpi/win_accumulate_test.py diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index e27b24fe30..63acc4a449 100644 --- a/dace/frontend/common/distr.py +++ 
b/dace/frontend/common/distr.py @@ -1028,7 +1028,7 @@ def _rma_put(pv: ProgramVisitor, state: SDFGState, window_name: str, origin: str, - target_rank: Union[str, sp.Expr, Number] = 0): + target_rank: Union[str, sp.Expr, Number]): """ Initiate a RMA put for the DaCe Program. :param window_name: The name of the window to be sychronized. @@ -1100,7 +1100,7 @@ def _rma_get(pv: ProgramVisitor, state: SDFGState, window_name: str, origin: str, - target_rank: Union[str, sp.Expr, Number] = 0): + target_rank: Union[str, sp.Expr, Number]): """ Initiate a RMA get for the DaCe Program. :param window_name: The name of the window to be sychronized. @@ -1166,6 +1166,83 @@ def _rma_get(pv: ProgramVisitor, return get_name +@oprepo.replaces_method('RMA_window', 'Accumulate') +def _rma_accumulate(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + window_name: str, + origin: str, + target_rank: Union[str, sp.Expr, Number], + op: str = "MPI_SUM"): + """ Initiate a RMA accumulate for the DaCe Program. + + :param window_name: The name of the window to be sychronized. + :param origin: The name of origin buffer. + :target_rank: A value or scalar of the target rank. + :op: The name of MPI reduction + :return: Name of the new RMA accumulate descriptor. + """ + from mpi4py import MPI + from dace.libraries.mpi.nodes.win_accumulate import Win_accumulate + + accumulate_name = sdfg.add_rma_ops(window_name, "accumulate") + + if isinstance(op, MPI.Op): + op = _mpi4py_to_MPI(MPI, op) + + # check for the last RMA operation + all_rma_ops_name = list(sdfg._rma_ops.keys()) + cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name + if f"{window_name}_" in rma_op] + cur_window_fences = [rma_op for rma_op in cur_window_rma_ops + if f"{window_name}_fence" in rma_op] + + if len(cur_window_fences) % 2: + # if only odd number of fences, + # that means we're in a ongoing epoch + last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(accumulate_name) - 1] + else: + # if even number of fences, + # that means this operation is either a passive sync. 
one or a corrupted one + raise ValueError("Wrong synchronization of RMA calls!") + + accumulate_node = Win_accumulate(accumulate_name, window_name, op) + + last_rma_op_node = state.add_read(last_rma_op_name) + last_rma_op_desc = sdfg.arrays[last_rma_op_name] + state.add_edge(last_rma_op_node, + None, + accumulate_node, + "_in", + Memlet.from_array(last_rma_op_name, last_rma_op_desc)) + + origin_node = state.add_read(origin) + origin_desc = sdfg.arrays[origin] + state.add_edge(origin_node, + None, + accumulate_node, + '_inbuffer', + Memlet.from_array(origin, origin_desc)) + + _, target_rank_node = _get_int_arg_node(pv, sdfg, state, target_rank) + state.add_edge(target_rank_node, + None, + accumulate_node, + '_target_rank', + Memlet.simple(target_rank_node, "0:1", num_accesses=1)) + + # Pseudo-writing for newast.py #3195 check and complete Processcomm creation + _, scal = sdfg.add_scalar(accumulate_name, dace.int32, transient=True) + wnode = state.add_write(accumulate_name) + state.add_edge(accumulate_node, + "_out", + wnode, + None, + Memlet.from_array(accumulate_name, scal)) + + return accumulate_name + + @oprepo.replaces('dace.comm.Subarray') def _subarray(pv: ProgramVisitor, sdfg: SDFG, diff --git a/dace/libraries/mpi/nodes/__init__.py b/dace/libraries/mpi/nodes/__init__.py index 5400bd45de..9a1bd77730 100644 --- a/dace/libraries/mpi/nodes/__init__.py +++ b/dace/libraries/mpi/nodes/__init__.py @@ -17,3 +17,4 @@ from .win_fence import Win_fence from .win_put import Win_put from .win_get import Win_get +from .win_accumulate import Win_accumulate diff --git a/dace/libraries/mpi/nodes/win_accumulate.py b/dace/libraries/mpi/nodes/win_accumulate.py new file mode 100644 index 0000000000..6cc13b4bcd --- /dev/null +++ b/dace/libraries/mpi/nodes/win_accumulate.py @@ -0,0 +1,72 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library +import dace.properties +import dace.sdfg.nodes +from dace.transformation.transformation import ExpandTransformation +from .. import environments +from dace.libraries.mpi.nodes.node import MPINode + + +@dace.library.expansion +class ExpandWinAccumulateMPI(ExpandTransformation): + + environments = [environments.mpi.MPI] + + @staticmethod + def expansion(node, parent_state, parent_sdfg, **kwargs): + inbuffer, in_count_str = node.validate(parent_sdfg, parent_state) + mpi_dtype_str = dace.libraries.mpi.utils.MPI_DDT(inbuffer.dtype.base_type) + + window_name = node.window_name + op = node.op + + code = f""" + MPI_Accumulate(_inbuffer, {in_count_str}, {mpi_dtype_str}, \ + _target_rank, 0, {in_count_str}, {mpi_dtype_str}, \ + {op}, __state->{window_name}_window); + """ + + tasklet = dace.sdfg.nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + language=dace.dtypes.Language.CPP, + side_effects=True) + return tasklet + + +@dace.library.node +class Win_accumulate(MPINode): + + # Global properties + implementations = { + "MPI": ExpandWinAccumulateMPI, + } + default_implementation = "MPI" + + window_name = dace.properties.Property(dtype=str, default=None) + op = dace.properties.Property(dtype=str, default='MPI_SUM') + + def __init__(self, name, window_name, op="MPI_SUM", *args, **kwargs): + super().__init__(name, *args, inputs={"_in", "_inbuffer", "_target_rank"}, outputs={"_out"}, **kwargs) + self.window_name = window_name + self.op = op + + def validate(self, sdfg, state): + """ + :return: A three-tuple (buffer, root) of the three data descriptors in the + parent SDFG. 
+ """ + + inbuffer = None + for e in state.in_edges(self): + if e.dst_conn == "_inbuffer": + inbuffer = sdfg.arrays[e.data.data] + + in_count_str = "XXX" + for _, _, _, dst_conn, data in state.in_edges(self): + if dst_conn == '_inbuffer': + dims = [str(e) for e in data.subset.size_exact()] + in_count_str = "*".join(dims) + + return inbuffer, in_count_str diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index ca185077b8..df34bb6e65 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -105,6 +105,39 @@ def mpi4py_rma_get(win_buf: dace.int32[10], recv_buf: dace.int32[10], rank: dace assert (np.array_equal(recv_buf, recv_buf_ref)) +@pytest.mark.mpi +def test_RMA_accumulate(): + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + # sum all rank at rank 0 + @dace.program + def mpi4py_rma_accumulate(win_buf: dace.int32[10], send_buf: dace.int32[10], rank: dace.int32): + win = MPI.Win.Create(win_buf, comm=commworld) + win.Fence(0) + win.Accumulate(send_buf, target_rank=rank, op=MPI.SUM) + win.Fence(0) + + if size < 2: + raise ValueError("Please run this test with at least two processes.") + + sdfg = None + if rank == 0: + sdfg = mpi4py_rma_accumulate.to_sdfg() + func = utils.distributed_compile(sdfg, commworld) + + window_size = 10 + win_buffer = np.full(window_size, rank, dtype=np.int32) + win_buffer_ref = np.full(window_size, rank, dtype=np.int32) + send_buffer = np.full(window_size, rank, dtype=np.int32) + + func(win_buf=win_buffer, send_buf=send_buffer, rank=0) + mpi4py_rma_accumulate.f(win_buf=win_buffer_ref, send_buf=send_buffer, rank=0) + + assert (np.array_equal(win_buffer, win_buffer_ref)) + @pytest.mark.mpi def test_external_comm_bcast(): @@ -416,3 +449,4 @@ def mpi4py_alltoall(rank: dace.int32, size: dace.compiletime): test_alltoall() test_RMA_put() test_RMA_get() + test_RMA_accumulate() diff --git a/tests/library/mpi/win_accumulate_test.py b/tests/library/mpi/win_accumulate_test.py new file mode 100644 index 0000000000..c5338e12ac --- /dev/null +++ b/tests/library/mpi/win_accumulate_test.py @@ -0,0 +1,205 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
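+# What this test builds, for quick reference: an SDFG with the state chain
+#   create_window -> win_fence_1 -> win_accumulate -> win_fence_2
+# in which every rank accumulates its send buffer (filled with its own rank id)
+# into rank 0's window with MPI_SUM, so after the closing fence rank 0's
+# window buffer should hold comm_size * (comm_size - 1) / 2 in every element.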
+import dace +from dace.sdfg import utils +import dace.dtypes as dtypes +from dace.memlet import Memlet +import dace.libraries.mpi as mpi +import dace.frontend.common.distr as comm +import numpy as np +import pytest + + +############################################################################### + + +def make_sdfg(dtype): + n = dace.symbol("n") + + sdfg = dace.SDFG("mpi_win_accumulate") + window_state = sdfg.add_state("create_window") + + sdfg.add_array("assertion", [1], dtype=dace.int32, transient=False) + sdfg.add_array("win_buffer", [n], dtype=dtype, transient=False) + sdfg.add_array("send_buffer", [n], dtype=dtype, transient=False) + sdfg.add_array("target_rank", [1], dace.dtypes.int32, transient=False) + + win_buffer = window_state.add_access("win_buffer") + + window_name = sdfg.add_window() + win_create_node = mpi.nodes.win_create.Win_create(window_name) + + window_state.add_edge(win_buffer, + None, + win_create_node, + '_win_buffer', + Memlet.simple(win_buffer, "0:n", num_accesses=n)) + + # for other nodes depends this window to connect + _, scal = sdfg.add_scalar(window_name, dace.int32, transient=True) + wnode = window_state.add_write(window_name) + window_state.add_edge(win_create_node, + "_out", + wnode, + None, + Memlet.from_array(window_name, scal)) + +############################################################################### + + fence_state_1 = sdfg.add_state("win_fence_1") + + sdfg.add_edge(window_state, fence_state_1, dace.InterstateEdge()) + + fence_name = sdfg.add_rma_ops(window_name, "fence") + win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) + + # pseudo access for ordering + window_node = fence_state_1.add_access(window_name) + window_desc = sdfg.arrays[window_name] + + fence_state_1.add_edge(window_node, + None, + win_fence_node, + None, + Memlet.from_array(window_name, window_desc)) + + assertion_node = fence_state_1.add_access("assertion") + + fence_state_1.add_edge(assertion_node, + None, + win_fence_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(fence_name, dace.int32, transient=True) + wnode = fence_state_1.add_write(fence_name) + fence_state_1.add_edge(win_fence_node, + "_out", + wnode, + None, + Memlet.from_array(fence_name, scal)) + +############################################################################### + + accumulate_state = sdfg.add_state("win_accumulate") + + sdfg.add_edge(fence_state_1, accumulate_state, dace.InterstateEdge()) + + accumulate_name = sdfg.add_rma_ops(window_name, "accumulate") + win_accumulate_node = mpi.nodes.win_accumulate.Win_accumulate(accumulate_name, window_name) + + # pseudo access for ordering + fence_node = accumulate_state.add_access(fence_name) + fence_desc = sdfg.arrays[fence_name] + + send_buffer = accumulate_state.add_access("send_buffer") + + target_rank = accumulate_state.add_access("target_rank") + + accumulate_state.add_edge(fence_node, + None, + win_accumulate_node, + "_in", + Memlet.from_array(fence_name, fence_desc)) + + accumulate_state.add_edge(send_buffer, + None, + win_accumulate_node, + "_inbuffer", + Memlet.simple(send_buffer, "0:n", num_accesses=n)) + + accumulate_state.add_edge(target_rank, + None, + win_accumulate_node, + "_target_rank", + Memlet.simple(target_rank, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(accumulate_name, dace.int32, transient=True) + wnode = accumulate_state.add_write(accumulate_name) + accumulate_state.add_edge(win_accumulate_node, + "_out", + wnode, + None, + 
Memlet.from_array(accumulate_name, scal)) + +############################################################################### + + fence_state_2 = sdfg.add_state("win_fence_2") + + sdfg.add_edge(accumulate_state, fence_state_2, dace.InterstateEdge()) + + fence_name = sdfg.add_rma_ops(window_name, "fence") + win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) + + # pseudo access for ordering + accumulate_node = fence_state_2.add_access(accumulate_name) + accumulate_desc = sdfg.arrays[accumulate_name] + + fence_state_2.add_edge(accumulate_node, + None, + win_fence_node, + None, + Memlet.from_array(accumulate_name, accumulate_desc)) + + assertion_node = fence_state_2.add_access("assertion") + + fence_state_2.add_edge(assertion_node, + None, + win_fence_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(fence_name, dace.int32, transient=True) + wnode = fence_state_2.add_write(fence_name) + fence_state_2.add_edge(win_fence_node, + "_out", + wnode, + None, + Memlet.from_array(fence_name, scal)) + + return sdfg + + +############################################################################### + +@pytest.mark.parametrize("implementation, dtype", [ + pytest.param("MPI", dace.float32, marks=pytest.mark.mpi), + pytest.param("MPI", dace.int32, marks=pytest.mark.mpi) +]) +def test_win_accumulate(dtype): + from mpi4py import MPI + np_dtype = getattr(np, dtype.to_string()) + comm_world = MPI.COMM_WORLD + comm_rank = comm_world.Get_rank() + comm_size = comm_world.Get_size() + + if comm_size < 2: + raise ValueError("This test is supposed to be run with at least two processes!") + + mpi_func = None + for r in range(0, comm_size): + if r == comm_rank: + sdfg = make_sdfg(dtype) + mpi_func = sdfg.compile() + comm_world.Barrier() + + window_size = 10 + win_buffer = np.full(window_size, comm_rank, dtype=np_dtype) + send_buffer = np.full(window_size, comm_rank, dtype=np_dtype) + + # accumulate all ranks in rank 0 + target_rank = np.full([1], 0, dtype=np.int32) + assertion = np.full([1], 0, dtype=np.int32) + + mpi_func(assertion=assertion, + win_buffer=win_buffer, + send_buffer=send_buffer, + target_rank=target_rank, + n=window_size) + + correct_data = np.full(window_size, comm_size * (comm_size - 1) / 2, dtype=np_dtype) + if (comm_rank == 0 and not np.allclose(win_buffer, correct_data)): + raise (ValueError("The received values are not what I expected on root.")) + +if __name__ == "__main__": + test_win_accumulate(dace.int32) + test_win_accumulate(dace.float32) From e4cc8e3ee2b21c014af540b086457146f64ac2d6 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Tue, 29 Aug 2023 00:30:38 +0800 Subject: [PATCH 15/28] Added MPI RMA flush, lock, unlock library nodes and their tests for passive sync. 
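A minimal mpi4py sketch of the passive-target access epoch these nodes are
meant to express, for reference while reviewing (plain mpi4py without DaCe;
the buffer size, the shared-lock mode and the closing barrier are illustrative
assumptions rather than anything generated by the new nodes):

    # Passive-target epoch: Lock -> Put -> Flush -> Unlock
    from mpi4py import MPI
    import numpy as np

    comm = MPI.COMM_WORLD
    rank, size = comm.Get_rank(), comm.Get_size()
    target = (rank + 1) % size

    win_buf = np.full(10, rank, dtype=np.int32)   # memory exposed through the window
    send_buf = np.full(10, rank, dtype=np.int32)  # data written into the target's window

    win = MPI.Win.Create(win_buf, comm=comm)
    win.Lock(target, MPI.LOCK_SHARED)      # open an access epoch at the target rank
    win.Put(send_buf, target_rank=target)  # one-sided write, no matching receive
    win.Flush(target)                      # complete all outstanding RMA calls to the target
    win.Unlock(target)                     # close the epoch
    comm.Barrier()                         # keep ranks alive until every transfer is done
    win.Free()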
--- dace/libraries/mpi/nodes/__init__.py | 3 + dace/libraries/mpi/nodes/win_flush.py | 43 +++ dace/libraries/mpi/nodes/win_lock.py | 46 +++ dace/libraries/mpi/nodes/win_unlock.py | 43 +++ tests/library/mpi/win_passive_sync_test.py | 332 +++++++++++++++++++++ 5 files changed, 467 insertions(+) create mode 100644 dace/libraries/mpi/nodes/win_flush.py create mode 100644 dace/libraries/mpi/nodes/win_lock.py create mode 100644 dace/libraries/mpi/nodes/win_unlock.py create mode 100644 tests/library/mpi/win_passive_sync_test.py diff --git a/dace/libraries/mpi/nodes/__init__.py b/dace/libraries/mpi/nodes/__init__.py index 9a1bd77730..3d3e0ac8f9 100644 --- a/dace/libraries/mpi/nodes/__init__.py +++ b/dace/libraries/mpi/nodes/__init__.py @@ -18,3 +18,6 @@ from .win_put import Win_put from .win_get import Win_get from .win_accumulate import Win_accumulate +from .win_lock import Win_lock +from .win_unlock import Win_unlock +from .win_flush import Win_flush diff --git a/dace/libraries/mpi/nodes/win_flush.py b/dace/libraries/mpi/nodes/win_flush.py new file mode 100644 index 0000000000..70e2ac1905 --- /dev/null +++ b/dace/libraries/mpi/nodes/win_flush.py @@ -0,0 +1,43 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library +import dace.properties +import dace.sdfg.nodes +from dace.transformation.transformation import ExpandTransformation +from .. import environments +from dace.libraries.mpi.nodes.node import MPINode + + +@dace.library.expansion +class ExpandWinFlushMPI(ExpandTransformation): + + environments = [environments.mpi.MPI] + + @staticmethod + def expansion(node, parent_state, parent_sdfg, **kwargs): + window_name = node.window_name + code = f""" + MPI_Win_flush(_rank, __state->{window_name}_window); + """ + tasklet = dace.sdfg.nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + language=dace.dtypes.Language.CPP, + side_effects=True) + return tasklet + + +@dace.library.node +class Win_flush(MPINode): + + # Global properties + implementations = { + "MPI": ExpandWinFlushMPI, + } + default_implementation = "MPI" + + window_name = dace.properties.Property(dtype=str, default=None) + + def __init__(self, name, window_name, *args, **kwargs): + super().__init__(name, *args, inputs={"_rank"}, outputs={"_out"}, **kwargs) + self.window_name = window_name diff --git a/dace/libraries/mpi/nodes/win_lock.py b/dace/libraries/mpi/nodes/win_lock.py new file mode 100644 index 0000000000..48a5fe6fd4 --- /dev/null +++ b/dace/libraries/mpi/nodes/win_lock.py @@ -0,0 +1,46 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library +import dace.properties +import dace.sdfg.nodes +from dace.transformation.transformation import ExpandTransformation +from .. 
import environments +from dace.libraries.mpi.nodes.node import MPINode + + +@dace.library.expansion +class ExpandWinLockMPI(ExpandTransformation): + + environments = [environments.mpi.MPI] + + @staticmethod + def expansion(node, parent_state, parent_sdfg, **kwargs): + window_name = node.window_name + code = f""" + MPI_Win_lock(_lock_type, + _rank, + _assertion, + __state->{window_name}_window); + """ + tasklet = dace.sdfg.nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + language=dace.dtypes.Language.CPP, + side_effects=True) + return tasklet + + +@dace.library.node +class Win_lock(MPINode): + + # Global properties + implementations = { + "MPI": ExpandWinLockMPI, + } + default_implementation = "MPI" + + window_name = dace.properties.Property(dtype=str, default=None) + + def __init__(self, name, window_name, *args, **kwargs): + super().__init__(name, *args, inputs={"_rank", "_lock_type", "_assertion"}, outputs={"_out"}, **kwargs) + self.window_name = window_name diff --git a/dace/libraries/mpi/nodes/win_unlock.py b/dace/libraries/mpi/nodes/win_unlock.py new file mode 100644 index 0000000000..7bd6963fa9 --- /dev/null +++ b/dace/libraries/mpi/nodes/win_unlock.py @@ -0,0 +1,43 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library +import dace.properties +import dace.sdfg.nodes +from dace.transformation.transformation import ExpandTransformation +from .. import environments +from dace.libraries.mpi.nodes.node import MPINode + + +@dace.library.expansion +class ExpandWinUnlockMPI(ExpandTransformation): + + environments = [environments.mpi.MPI] + + @staticmethod + def expansion(node, parent_state, parent_sdfg, **kwargs): + window_name = node.window_name + code = f""" + MPI_Win_unlock(_rank, __state->{window_name}_window); + """ + tasklet = dace.sdfg.nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + language=dace.dtypes.Language.CPP, + side_effects=True) + return tasklet + + +@dace.library.node +class Win_unlock(MPINode): + + # Global properties + implementations = { + "MPI": ExpandWinUnlockMPI, + } + default_implementation = "MPI" + + window_name = dace.properties.Property(dtype=str, default=None) + + def __init__(self, name, window_name, *args, **kwargs): + super().__init__(name, *args, inputs={"_rank"}, outputs={"_out"}, **kwargs) + self.window_name = window_name diff --git a/tests/library/mpi/win_passive_sync_test.py b/tests/library/mpi/win_passive_sync_test.py new file mode 100644 index 0000000000..8a0ac7c3d7 --- /dev/null +++ b/tests/library/mpi/win_passive_sync_test.py @@ -0,0 +1,332 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
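+# What this test builds, for quick reference: an SDFG with the state chain
+#   create_window -> win_lock -> win_put -> win_flush -> win_unlock -> win_fence -> win_fence
+# where every rank puts its send buffer (filled with its own rank id) into the
+# window of rank (comm_rank + 1) % comm_size under a passive-target shared lock;
+# the two trailing fences act as a barrier, so each rank should end up holding
+# (comm_rank - 1) % comm_size in every element of its window buffer.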
+import dace +from dace.sdfg import utils +import dace.dtypes as dtypes +from dace.memlet import Memlet +import dace.libraries.mpi as mpi +import dace.frontend.common.distr as comm +import numpy as np +import pytest + + +############################################################################### + + +def make_sdfg(dtype): + n = dace.symbol("n") + + sdfg = dace.SDFG("mpi_win_passive_sync") + window_state = sdfg.add_state("create_window") + + sdfg.add_array("lock_type", [1], dtype=dace.int32, transient=False) + sdfg.add_array("assertion", [1], dtype=dace.int32, transient=False) + sdfg.add_array("win_buffer", [n], dtype=dtype, transient=False) + sdfg.add_array("send_buffer", [n], dtype=dtype, transient=False) + sdfg.add_array("target_rank", [1], dace.dtypes.int32, transient=False) + + win_buffer = window_state.add_access("win_buffer") + + window_name = sdfg.add_window() + win_create_node = mpi.nodes.win_create.Win_create(window_name) + + window_state.add_edge(win_buffer, + None, + win_create_node, + '_win_buffer', + Memlet.simple(win_buffer, "0:n", num_accesses=n)) + + # for other nodes depends this window to connect + _, scal = sdfg.add_scalar(window_name, dace.int32, transient=True) + wnode = window_state.add_write(window_name) + window_state.add_edge(win_create_node, + "_out", + wnode, + None, + Memlet.from_array(window_name, scal)) + +############################################################################### + + lock_state = sdfg.add_state("win_lock") + + sdfg.add_edge(window_state, lock_state, dace.InterstateEdge()) + + lock_name = sdfg.add_rma_ops(window_name, "lock") + win_lock_node = mpi.nodes.win_lock.Win_lock(lock_name, window_name) + + # pseudo access for ordering + window_node = lock_state.add_access(window_name) + window_desc = sdfg.arrays[window_name] + + lock_state.add_edge(window_node, + None, + win_lock_node, + None, + Memlet.from_array(window_name, window_desc)) + + lock_type_node = lock_state.add_access("lock_type") + + target_rank_node = lock_state.add_access("target_rank") + + assertion_node = lock_state.add_access("assertion") + + lock_state.add_edge(lock_type_node, + None, + win_lock_node, + '_lock_type', + Memlet.simple(lock_type_node, "0:1", num_accesses=1)) + + lock_state.add_edge(target_rank_node, + None, + win_lock_node, + '_rank', + Memlet.simple(target_rank_node, "0:1", num_accesses=1)) + + lock_state.add_edge(assertion_node, + None, + win_lock_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(lock_name, dace.int32, transient=True) + wnode = lock_state.add_write(lock_name) + lock_state.add_edge(win_lock_node, + "_out", + wnode, + None, + Memlet.from_array(lock_name, scal)) + +############################################################################### + + put_state = sdfg.add_state("win_put") + + sdfg.add_edge(lock_state, put_state, dace.InterstateEdge()) + + put_name = sdfg.add_rma_ops(window_name, "put") + win_put_node = mpi.nodes.win_put.Win_put(put_name, window_name) + + # pseudo access for ordering + lock_node = put_state.add_access(lock_name) + lock_desc = sdfg.arrays[lock_name] + + send_buffer = put_state.add_access("send_buffer") + + target_rank = put_state.add_access("target_rank") + + put_state.add_edge(lock_node, + None, + win_put_node, + "_in", + Memlet.from_array(lock_name, lock_desc)) + + put_state.add_edge(send_buffer, + None, + win_put_node, + "_inbuffer", + Memlet.simple(send_buffer, "0:n", num_accesses=n)) + + put_state.add_edge(target_rank, + None, + win_put_node, + 
"_target_rank", + Memlet.simple(target_rank, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(put_name, dace.int32, transient=True) + wnode = put_state.add_write(put_name) + put_state.add_edge(win_put_node, + "_out", + wnode, + None, + Memlet.from_array(put_name, scal)) + +############################################################################### + + flush_state = sdfg.add_state("win_flush") + + sdfg.add_edge(put_state, flush_state, dace.InterstateEdge()) + + flush_name = sdfg.add_rma_ops(window_name, "flush") + win_flush_node = mpi.nodes.win_flush.Win_flush(flush_name, window_name) + + # pseudo access for ordering + put_node = flush_state.add_access(put_name) + put_desc = sdfg.arrays[put_name] + + flush_state.add_edge(put_node, + None, + win_flush_node, + None, + Memlet.from_array(put_name, put_desc)) + + target_rank_node = flush_state.add_access("target_rank") + + flush_state.add_edge(target_rank_node, + None, + win_flush_node, + '_rank', + Memlet.simple(target_rank_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(flush_name, dace.int32, transient=True) + wnode = flush_state.add_write(flush_name) + flush_state.add_edge(win_flush_node, + "_out", + wnode, + None, + Memlet.from_array(flush_name, scal)) + +############################################################################### + + unlock_state = sdfg.add_state("win_unlock") + + sdfg.add_edge(flush_state, unlock_state, dace.InterstateEdge()) + + unlock_name = sdfg.add_rma_ops(window_name, "unlock") + win_unlock_node = mpi.nodes.win_unlock.Win_unlock(unlock_name, window_name) + + # pseudo access for ordering + flush_node = unlock_state.add_access(flush_name) + flush_desc = sdfg.arrays[flush_name] + + unlock_state.add_edge(flush_node, + None, + win_unlock_node, + None, + Memlet.from_array(flush_name, flush_desc)) + + target_rank_node = unlock_state.add_access("target_rank") + + unlock_state.add_edge(target_rank_node, + None, + win_unlock_node, + '_rank', + Memlet.simple(target_rank_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(unlock_name, dace.int32, transient=True) + wnode = unlock_state.add_write(unlock_name) + unlock_state.add_edge(win_unlock_node, + "_out", + wnode, + None, + Memlet.from_array(unlock_name, scal)) + +# added these two fences as Barrier to ensure that every rank has completed +# since every rank are running independently +# some ranks might exit(since they completed) the transmission +# while others are still transmitting +############################################################################### + + fence_state_1 = sdfg.add_state("win_fence") + + sdfg.add_edge(unlock_state, fence_state_1, dace.InterstateEdge()) + + fence_name_1 = sdfg.add_rma_ops(window_name, "fence") + win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name_1, window_name) + + # pseudo access for ordering + unlock_node = fence_state_1.add_access(unlock_name) + unlock_desc = sdfg.arrays[unlock_name] + + fence_state_1.add_edge(unlock_node, + None, + win_fence_node, + None, + Memlet.from_array(unlock_name, unlock_desc)) + + assertion_node = fence_state_1.add_access("assertion") + + fence_state_1.add_edge(assertion_node, + None, + win_fence_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(fence_name_1, dace.int32, transient=True) + wnode = fence_state_1.add_write(fence_name_1) + fence_state_1.add_edge(win_fence_node, + "_out", + wnode, + None, + Memlet.from_array(fence_name_1, scal)) + 
+############################################################################### + + fence_state_2 = sdfg.add_state("win_fence") + + sdfg.add_edge(fence_state_1, fence_state_2, dace.InterstateEdge()) + + fence_name_2 = sdfg.add_rma_ops(window_name, "fence") + win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name_2, window_name) + + # pseudo access for ordering + fence_node = fence_state_2.add_access(fence_name_1) + fence_desc = sdfg.arrays[fence_name_1] + + fence_state_2.add_edge(fence_node, + None, + win_fence_node, + None, + Memlet.from_array(fence_name_1, fence_desc)) + + assertion_node = fence_state_2.add_access("assertion") + + fence_state_2.add_edge(assertion_node, + None, + win_fence_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(fence_name_2, dace.int32, transient=True) + wnode = fence_state_2.add_write(fence_name_2) + fence_state_2.add_edge(win_fence_node, + "_out", + wnode, + None, + Memlet.from_array(fence_name_2, scal)) + + return sdfg + + +############################################################################### + +@pytest.mark.parametrize("implementation, dtype", [ + pytest.param("MPI", dace.float32, marks=pytest.mark.mpi), + pytest.param("MPI", dace.int32, marks=pytest.mark.mpi) +]) +def test_win_put(dtype): + from mpi4py import MPI + np_dtype = getattr(np, dtype.to_string()) + comm_world = MPI.COMM_WORLD + comm_rank = comm_world.Get_rank() + comm_size = comm_world.Get_size() + + if comm_size < 2: + raise ValueError("This test is supposed to be run with at least two processes!") + + mpi_func = None + for r in range(0, comm_size): + if r == comm_rank: + sdfg = make_sdfg(dtype) + mpi_func = sdfg.compile() + comm_world.Barrier() + + window_size = 10 + win_buffer = np.full(window_size, comm_rank, dtype=np_dtype) + send_buffer = np.full(window_size, comm_rank, dtype=np_dtype) + + target_rank = np.array([(comm_rank + 1) % comm_size], dtype=np.int32) + lock_type = np.full([1], MPI.LOCK_SHARED, dtype=np.int32) + assertion = np.full([1], 0, dtype=np.int32) + + mpi_func(lock_type=lock_type, + assertion=assertion, + win_buffer=win_buffer, + send_buffer=send_buffer, + target_rank=target_rank, + n=window_size) + + correct_data = np.full(window_size, (comm_rank - 1) % comm_size, dtype=np_dtype) + if (not np.allclose(win_buffer, correct_data)): + raise (ValueError("The received values are not what I expected on root.")) + +if __name__ == "__main__": + test_win_put(dace.int32) + test_win_put(dace.float32) From f8feae9278e99bc643a862f50ac3167466f56543 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Tue, 29 Aug 2023 15:48:19 +0800 Subject: [PATCH 16/28] Implemented replacement and their tests for passive sync. nodes --- dace/frontend/common/distr.py | 246 ++++++++++++++++++++++++++++--- tests/library/mpi/mpi4py_test.py | 121 +++++++++++++++ 2 files changed, 349 insertions(+), 18 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index 63acc4a449..e6f50bec6d 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -1022,6 +1022,200 @@ def _rma_fence(pv: ProgramVisitor, return window_name +@oprepo.replaces_method('RMA_window', 'Flush') +def _rma_flush(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + window_name: str, + rank: Union[str, sp.Expr, Number]): + """ Adds a RMA flush to the DaCe Program. + flush will completes all outdtanding RMA operations + + :param window_name: The name of the window to be sychronized. 
+ :param rank: A value or scalar to specify the target rank. + :return: Name of the flush. + """ + + from dace.libraries.mpi.nodes.win_flush import Win_flush + + # fine a new flush name + flush_name = sdfg.add_rma_ops(window_name, "flush") + + _, rank_node = _get_int_arg_node(pv, sdfg, state, rank) + + flush_node = Win_flush(flush_name, window_name) + + # check for the last RMA operation + all_rma_ops_name = list(sdfg._rma_ops.keys()) + cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name + if f"{window_name}_" in rma_op] + if len(cur_window_rma_ops) == 1: + last_rma_op_name = window_name + else: + last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(flush_name) - 1] + + last_rma_op_node = state.add_read(last_rma_op_name) + last_rma_op_desc = sdfg.arrays[last_rma_op_name] + + # for ordering + state.add_edge(last_rma_op_node, + None, + flush_node, + None, + Memlet.from_array(last_rma_op_name, last_rma_op_desc)) + + state.add_edge(rank_node, + None, + flush_node, + '_rank', + Memlet.simple(rank_node, "0:1", num_accesses=1)) + + # Pseudo-writing for newast.py #3195 check and complete Processcomm creation + _, scal = sdfg.add_scalar(flush_name, dace.int32, transient=True) + wnode = state.add_write(flush_name) + state.add_edge(flush_node, + "_out", + wnode, + None, + Memlet.from_array(flush_name, scal)) + + return window_name + + +@oprepo.replaces_method('RMA_window', 'Lock') +def _rma_lock(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + window_name: str, + rank: Union[str, sp.Expr, Number], + lock_type: Union[str, sp.Expr, Number] = 234, # MPI.LOCK_EXCLUSIVE = 234 + assertion: Union[str, sp.Expr, Number] = 0): + """ Adds a RMA lock to the DaCe Program. + + :param window_name: The name of the window to be sychronized. + :param assertion: A value or scalar for lock assertion. + :return: Name of the lock. 
+ """ + + from dace.libraries.mpi.nodes.win_lock import Win_lock + + # fine a new lock name + lock_name = sdfg.add_rma_ops(window_name, "lock") + lock_node = Win_lock(lock_name, window_name) + + _, rank_node = _get_int_arg_node(pv, sdfg, state, rank) + _, lock_type_node = _get_int_arg_node(pv, sdfg, state, lock_type) + _, assertion_node = _get_int_arg_node(pv, sdfg, state, assertion) + + # check for the last RMA operation + all_rma_ops_name = list(sdfg._rma_ops.keys()) + cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name + if f"{window_name}_" in rma_op] + if len(cur_window_rma_ops) == 1: + last_rma_op_name = window_name + else: + last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(lock_name) - 1] + + last_rma_op_node = state.add_read(last_rma_op_name) + last_rma_op_desc = sdfg.arrays[last_rma_op_name] + + # for window lock ordering + state.add_edge(last_rma_op_node, + None, + lock_node, + None, + Memlet.from_array(last_rma_op_name, last_rma_op_desc)) + + state.add_edge(rank_node, + None, + lock_node, + '_rank', + Memlet.simple(rank_node, "0:1", num_accesses=1)) + + state.add_edge(lock_type_node, + None, + lock_node, + '_lock_type', + Memlet.simple(lock_type_node, "0:1", num_accesses=1)) + + state.add_edge(assertion_node, + None, + lock_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + # Pseudo-writing for newast.py #3195 check and complete Processcomm creation + _, scal = sdfg.add_scalar(lock_name, dace.int32, transient=True) + wnode = state.add_write(lock_name) + state.add_edge(lock_node, + "_out", + wnode, + None, + Memlet.from_array(lock_name, scal)) + + return window_name + + +@oprepo.replaces_method('RMA_window', 'Unlock') +def _rma_unlock(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + window_name: str, + rank: Union[str, sp.Expr, Number]): + """ Adds a RMA unlock to the DaCe Program. + Completes an RMA access epoch at the target process + + :param window_name: The name of the window to be sychronized. + :param rank: A value or scalar to specify the target rank. + :return: Name of the Unlock. 
+ """ + + from dace.libraries.mpi.nodes.win_unlock import Win_unlock + + # fine a new unlock name + unlock_name = sdfg.add_rma_ops(window_name, "unlock") + + _, rank_node = _get_int_arg_node(pv, sdfg, state, rank) + + unlock_node = Win_unlock(unlock_name, window_name) + + # check for the last RMA operation + all_rma_ops_name = list(sdfg._rma_ops.keys()) + cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name + if f"{window_name}_" in rma_op] + if len(cur_window_rma_ops) == 1: + last_rma_op_name = window_name + else: + last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(unlock_name) - 1] + + last_rma_op_node = state.add_read(last_rma_op_name) + last_rma_op_desc = sdfg.arrays[last_rma_op_name] + + # for ordering + state.add_edge(last_rma_op_node, + None, + unlock_node, + None, + Memlet.from_array(last_rma_op_name, last_rma_op_desc)) + + state.add_edge(rank_node, + None, + unlock_node, + '_rank', + Memlet.simple(rank_node, "0:1", num_accesses=1)) + + # Pseudo-writing for newast.py #3195 check and complete Processcomm creation + _, scal = sdfg.add_scalar(unlock_name, dace.int32, transient=True) + wnode = state.add_write(unlock_name) + state.add_edge(unlock_node, + "_out", + wnode, + None, + Memlet.from_array(unlock_name, scal)) + + return window_name + + @oprepo.replaces_method('RMA_window', 'Put') def _rma_put(pv: ProgramVisitor, sdfg: SDFG, @@ -1048,14 +1242,20 @@ def _rma_put(pv: ProgramVisitor, cur_window_fences = [rma_op for rma_op in cur_window_rma_ops if f"{window_name}_fence" in rma_op] - if len(cur_window_fences) % 2: - # if only odd number of fences, - # that means we're in a ongoing epoch - last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(put_name) - 1] - else: + last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(put_name) - 1] + + # if only odd number of fences, + # that means we're in a ongoing epoch + if len(cur_window_fences) % 2 == 0: # if even number of fences, # that means this operation is either a passive sync. one or a corrupted one - raise ValueError("Wrong synchronization of RMA calls!") + # same logic applies to passive sync. + cur_window_passive_syncs = [rma_op for rma_op in cur_window_rma_ops + if "lock" in rma_op] + # if we don't have even number of syncs, give user a warning + if len(cur_window_passive_syncs) % 2 == 0: + print("You might have a bad synchronization of RMA calls!") + put_node = Win_put(put_name, window_name) @@ -1120,14 +1320,19 @@ def _rma_get(pv: ProgramVisitor, cur_window_fences = [rma_op for rma_op in cur_window_rma_ops if f"{window_name}_fence" in rma_op] - if len(cur_window_fences) % 2: - # if only odd number of fences, - # that means we're in a ongoing epoch - last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(get_name) - 1] - else: + last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(get_name) - 1] + + # if only odd number of fences, + # that means we're in a ongoing epoch + if len(cur_window_fences) % 2 == 0: # if even number of fences, # that means this operation is either a passive sync. one or a corrupted one - raise ValueError("Wrong synchronization of RMA calls!") + # same logic applies to passive sync. 
+ cur_window_passive_syncs = [rma_op for rma_op in cur_window_rma_ops + if "lock" in rma_op] + # if we don't have even number of syncs, give user a warning + if len(cur_window_passive_syncs) % 2 == 0: + print("You might have a bad synchronization of RMA calls!") get_node = Win_get(get_name, window_name) @@ -1197,14 +1402,19 @@ def _rma_accumulate(pv: ProgramVisitor, cur_window_fences = [rma_op for rma_op in cur_window_rma_ops if f"{window_name}_fence" in rma_op] - if len(cur_window_fences) % 2: - # if only odd number of fences, - # that means we're in a ongoing epoch - last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(accumulate_name) - 1] - else: + last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(accumulate_name) - 1] + + # if only odd number of fences, + # that means we're in a ongoing epoch + if len(cur_window_fences) % 2 == 0: # if even number of fences, # that means this operation is either a passive sync. one or a corrupted one - raise ValueError("Wrong synchronization of RMA calls!") + # same logic applies to passive sync. + cur_window_passive_syncs = [rma_op for rma_op in cur_window_rma_ops + if "lock" in rma_op] + # if we don't have even number of syncs, give user a warning + if len(cur_window_passive_syncs) % 2 == 0: + print("You might have a bad synchronization of RMA calls!") accumulate_node = Win_accumulate(accumulate_name, window_name, op) diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index df34bb6e65..55cbff160b 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -138,6 +138,124 @@ def mpi4py_rma_accumulate(win_buf: dace.int32[10], send_buf: dace.int32[10], ran assert (np.array_equal(win_buffer, win_buffer_ref)) + +@pytest.mark.mpi +def test_passive_RMA_put(): + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + @dace.program + def mpi4py_passive_rma_put(win_buf: dace.int32[10], send_buf: dace.int32[10], rank: dace.int32): + win = MPI.Win.Create(win_buf, comm=commworld) + win.Lock(rank) + win.Put(send_buf, target_rank=rank) + win.Flush(rank) + win.Unlock(rank) + + # as MPI barrier + win.Fence(0) + win.Fence(0) + + if size < 2: + raise ValueError("Please run this test with at least two processes.") + + sdfg = None + if rank == 0: + sdfg = mpi4py_passive_rma_put.to_sdfg() + func = utils.distributed_compile(sdfg, commworld) + + window_size = 10 + win_buffer = np.full(window_size, rank, dtype=np.int32) + win_buffer_ref = np.full(window_size, rank, dtype=np.int32) + send_buffer = np.full(window_size, rank, dtype=np.int32) + + + func(win_buf=win_buffer, send_buf=send_buffer, rank=((rank + 1) % size)) + mpi4py_passive_rma_put.f(win_buf=win_buffer_ref, send_buf=send_buffer, rank=((rank + 1) % size)) + + assert (np.array_equal(win_buffer, win_buffer_ref)) + + +@pytest.mark.mpi +def test_passive_RMA_get(): + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + @dace.program + def mpi4py_passive_rma_get(win_buf: dace.int32[10], recv_buf: dace.int32[10], rank: dace.int32): + win = MPI.Win.Create(win_buf, comm=commworld) + win.Lock(rank) + win.Get(recv_buf, target_rank=rank) + win.Flush(rank) + win.Unlock(rank) + + # as MPI barrier + win.Fence(0) + win.Fence(0) + + if size < 2: + raise ValueError("Please run this test with at least two processes.") + + sdfg = None + if rank == 0: + sdfg = mpi4py_passive_rma_get.to_sdfg() + func = utils.distributed_compile(sdfg, 
commworld) + + window_size = 10 + win_buffer = np.full(window_size, rank, dtype=np.int32) + recv_buf = np.full(window_size, rank, dtype=np.int32) + recv_buf_ref = np.full(window_size, rank, dtype=np.int32) + + func(win_buf=win_buffer, recv_buf=recv_buf, rank=((rank + 1) % size)) + mpi4py_passive_rma_get.f(win_buf=win_buffer, recv_buf=recv_buf_ref, rank=((rank + 1) % size)) + + assert (np.array_equal(recv_buf, recv_buf_ref)) + + +@pytest.mark.mpi +def test_RMA_passive_accumulate(): + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + # sum all rank at rank 0 + @dace.program + def mpi4py_passive_rma_accumulate(win_buf: dace.int32[10], send_buf: dace.int32[10], rank: dace.int32): + win = MPI.Win.Create(win_buf, comm=commworld) + win.Lock(rank) + win.Accumulate(send_buf, target_rank=rank, op=MPI.SUM) + win.Flush(rank) + win.Unlock(rank) + + # as MPI barrier + win.Fence(0) + win.Fence(0) + + if size < 2: + raise ValueError("Please run this test with at least two processes.") + + sdfg = None + if rank == 0: + sdfg = mpi4py_passive_rma_accumulate.to_sdfg() + func = utils.distributed_compile(sdfg, commworld) + + window_size = 10 + win_buffer = np.full(window_size, rank, dtype=np.int32) + win_buffer_ref = np.full(window_size, rank, dtype=np.int32) + send_buffer = np.full(window_size, rank, dtype=np.int32) + + func(win_buf=win_buffer, send_buf=send_buffer, rank=0) + mpi4py_passive_rma_accumulate.f(win_buf=win_buffer_ref, send_buf=send_buffer, rank=0) + + if rank == 0: + assert (np.array_equal(win_buffer, win_buffer_ref)) + + @pytest.mark.mpi def test_external_comm_bcast(): @@ -450,3 +568,6 @@ def mpi4py_alltoall(rank: dace.int32, size: dace.compiletime): test_RMA_put() test_RMA_get() test_RMA_accumulate() + test_passive_RMA_put() + test_passive_RMA_get() + test_RMA_passive_accumulate() From ada74b333a1544856335b67ebd70d1fec9873a32 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Thu, 31 Aug 2023 22:29:09 +0800 Subject: [PATCH 17/28] Added lock type validator for better MPI compatibility --- dace/frontend/common/distr.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index e6f50bec6d..85d0753f3a 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -1088,7 +1088,7 @@ def _rma_lock(pv: ProgramVisitor, state: SDFGState, window_name: str, rank: Union[str, sp.Expr, Number], - lock_type: Union[str, sp.Expr, Number] = 234, # MPI.LOCK_EXCLUSIVE = 234 + lock_type: Union[str, sp.Expr, Number] = 234, # in intel MPI MPI.LOCK_EXCLUSIVE = 234 assertion: Union[str, sp.Expr, Number] = 0): """ Adds a RMA lock to the DaCe Program. 
@@ -1103,6 +1103,11 @@ def _rma_lock(pv: ProgramVisitor, lock_name = sdfg.add_rma_ops(window_name, "lock") lock_node = Win_lock(lock_name, window_name) + # different MPI might get other value + if lock_type == 234: + from mpi4py import MPI + lock_type = MPI.LOCK_EXCLUSIVE + _, rank_node = _get_int_arg_node(pv, sdfg, state, rank) _, lock_type_node = _get_int_arg_node(pv, sdfg, state, lock_type) _, assertion_node = _get_int_arg_node(pv, sdfg, state, assertion) From 892dda54c35a100ae1398ae593f6dc3078ecc91b Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Thu, 7 Sep 2023 00:17:48 +0800 Subject: [PATCH 18/28] Added a GEMM sample for RMA --- samples/mpi/mat_mul.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 samples/mpi/mat_mul.py diff --git a/samples/mpi/mat_mul.py b/samples/mpi/mat_mul.py new file mode 100644 index 0000000000..8e3396c793 --- /dev/null +++ b/samples/mpi/mat_mul.py @@ -0,0 +1,29 @@ +import numpy as np + +dim_1 = 200 +dim_2 = 300 + +a = np.arange(dim_1 * dim_2).reshape(dim_1, dim_2) +b = np.arange(dim_1 * dim_2).reshape(dim_2, dim_1) + +def matrix_mul(a, b): + a_mat = np.array(a) + b_mat = np.array(b) + c_mat = np.zeros((a_mat.shape[0], b_mat.shape[1])) + + if a_mat.shape[1] != b_mat.shape[0]: + raise ValueError("A, B matrix dimension mismatched!") + + # more or less like C stationary + for i in range(a_mat.shape[0]): + for j in range(b_mat.shape[1]): + for k in range(a_mat.shape[1]): + c_mat[i][j] += a_mat[i][k] * b_mat[k][j] + + return c_mat + + +# print(matrix_mul(a,b)) +# print(np.matmul(a,b)) + +print("Result correctness:", np.allclose(matrix_mul(a,b), np.matmul(a,b))) \ No newline at end of file From 8149219ad88a42cde819b0bdc63a6b99142e6b5f Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Thu, 7 Sep 2023 23:57:22 +0800 Subject: [PATCH 19/28] Updated mat_mul to tiled version --- samples/mpi/mat_mul.py | 63 ++++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 17 deletions(-) diff --git a/samples/mpi/mat_mul.py b/samples/mpi/mat_mul.py index 8e3396c793..47d42e7f06 100644 --- a/samples/mpi/mat_mul.py +++ b/samples/mpi/mat_mul.py @@ -1,29 +1,58 @@ import numpy as np +import dace +from dace.sdfg import utils +import dace.dtypes as dtypes +import time -dim_1 = 200 -dim_2 = 300 +dim_1 = 1024 +dim_2 = 1024 + +tile = 128 a = np.arange(dim_1 * dim_2).reshape(dim_1, dim_2) b = np.arange(dim_1 * dim_2).reshape(dim_2, dim_1) def matrix_mul(a, b): - a_mat = np.array(a) - b_mat = np.array(b) - c_mat = np.zeros((a_mat.shape[0], b_mat.shape[1])) + a_mat = np.array(a, dtype=np.int64) + b_mat = np.array(b, dtype=np.int64) + c_mat = np.zeros((a_mat.shape[0], b_mat.shape[1]), dtype=np.int64) if a_mat.shape[1] != b_mat.shape[0]: raise ValueError("A, B matrix dimension mismatched!") # more or less like C stationary - for i in range(a_mat.shape[0]): - for j in range(b_mat.shape[1]): - for k in range(a_mat.shape[1]): - c_mat[i][j] += a_mat[i][k] * b_mat[k][j] - - return c_mat - - -# print(matrix_mul(a,b)) -# print(np.matmul(a,b)) - -print("Result correctness:", np.allclose(matrix_mul(a,b), np.matmul(a,b))) \ No newline at end of file + # for i in range(a_mat.shape[0]): + # for j in range(b_mat.shape[1]): + # for k in range(a_mat.shape[1]): + # c_mat[i][j] += a_mat[i][k] * b_mat[k][j] + + @dace.program + def mpi4py_passive_rma_put(a_mat: dace.int64[dim_1,dim_2], b_mat: dace.int64[dim_1,dim_2], c_mat: dace.int64[dim_1,dim_2], tile: dace.int64): + for i_tile in range(a_mat.shape[0] // tile): + for j_tile in 
range(b_mat.shape[1] // tile): + for k_tile in range(a_mat.shape[1] // tile): + for i in range(i_tile * tile, min((i_tile + 1) * tile, a_mat.shape[0])): + for j in range(j_tile * tile, min((j_tile + 1) * tile, b_mat.shape[1])): + for k in range(k_tile * tile, min((k_tile + 1) * tile, a_mat.shape[1])): + c_mat[i][j] += a_mat[i][k] * b_mat[k][j] + + sdfg = None + sdfg = mpi4py_passive_rma_put.to_sdfg() + sdfg.openmp_sections = False + func = sdfg.compile() + + start = time.time() + func(a_mat, b_mat, c_mat, tile) + time_con = time.time() - start + + return c_mat, time_con + +c_mat, time_con = matrix_mul(a,b) +print(c_mat, time_con) + +start = time.time() +c_np = np.matmul(a,b) +time_con = time.time() - start +print(c_np, time_con) + +print("Result correctness:", np.allclose(c_mat, c_np)) From 380b5e462e0a5d0a450aba1f003303c7673c5fac Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 8 Sep 2023 14:51:24 +0800 Subject: [PATCH 20/28] Implemented distributed version mat_mul.py --- samples/mpi/mat_mul.py | 134 +++++++++++++++++++++++++++++------------ 1 file changed, 95 insertions(+), 39 deletions(-) diff --git a/samples/mpi/mat_mul.py b/samples/mpi/mat_mul.py index 47d42e7f06..dc6bc6f80c 100644 --- a/samples/mpi/mat_mul.py +++ b/samples/mpi/mat_mul.py @@ -2,57 +2,113 @@ import dace from dace.sdfg import utils import dace.dtypes as dtypes +from mpi4py import MPI import time -dim_1 = 1024 -dim_2 = 1024 -tile = 128 +# to check if this process owns this chunk of data +# compare given i and j with grid_i and grid_j +def owner(i, j, grid_i, grid_j): + if i == grid_i and j == grid_j: + return True + else: + return False -a = np.arange(dim_1 * dim_2).reshape(dim_1, dim_2) -b = np.arange(dim_1 * dim_2).reshape(dim_2, dim_1) -def matrix_mul(a, b): - a_mat = np.array(a, dtype=np.int64) - b_mat = np.array(b, dtype=np.int64) - c_mat = np.zeros((a_mat.shape[0], b_mat.shape[1]), dtype=np.int64) +# get matrix form remote rank +def get_mat(win, buffer, dim_0, dim_1, grid_dim): + rank = dim_0 * grid_dim + dim_1 + win.Lock(rank) + win.Get(buffer, target_rank=rank) + win.Flush(rank) + win.Unlock(rank) - if a_mat.shape[1] != b_mat.shape[0]: + +def matrix_mul(comm_world, a, b): + # check if matrix multiplication is valid + if a.shape[1] != b.shape[0]: raise ValueError("A, B matrix dimension mismatched!") - # more or less like C stationary - # for i in range(a_mat.shape[0]): - # for j in range(b_mat.shape[1]): - # for k in range(a_mat.shape[1]): - # c_mat[i][j] += a_mat[i][k] * b_mat[k][j] - - @dace.program - def mpi4py_passive_rma_put(a_mat: dace.int64[dim_1,dim_2], b_mat: dace.int64[dim_1,dim_2], c_mat: dace.int64[dim_1,dim_2], tile: dace.int64): - for i_tile in range(a_mat.shape[0] // tile): - for j_tile in range(b_mat.shape[1] // tile): - for k_tile in range(a_mat.shape[1] // tile): - for i in range(i_tile * tile, min((i_tile + 1) * tile, a_mat.shape[0])): - for j in range(j_tile * tile, min((j_tile + 1) * tile, b_mat.shape[1])): - for k in range(k_tile * tile, min((k_tile + 1) * tile, a_mat.shape[1])): - c_mat[i][j] += a_mat[i][k] * b_mat[k][j] - - sdfg = None - sdfg = mpi4py_passive_rma_put.to_sdfg() - sdfg.openmp_sections = False - func = sdfg.compile() - + # comm init + comm_rank = comm_world.Get_rank() + comm_size = comm_world.Get_size() + + grid_dim = int(np.floor(np.sqrt(comm_size))) + grid_i = comm_rank // grid_dim + grid_j = comm_rank % grid_dim + + local_i_dim = a.shape[0] + local_j_dim = b.shape[1] + local_k_dim = a.shape[1] + + whole_i_dim = grid_dim * a.shape[0] + whole_j_dim = grid_dim * 
b.shape[1] + whole_k_dim = grid_dim * a.shape[1] + + a_mat = np.array(a + comm_rank, dtype=np.int32) + b_mat = np.array(b + comm_rank, dtype=np.int32) + c_mat = np.zeros((a_mat.shape[0], b_mat.shape[1]), dtype=np.int32) + + # local buffers for remote fetching + foreign_a_mat = np.zeros(a.shape, dtype=np.int32) + foreign_b_mat = np.zeros(b.shape, dtype=np.int32) + + # RMA windows + a_win = MPI.Win.Create(a_mat, comm=comm_world) + b_win = MPI.Win.Create(b_mat, comm=comm_world) + start = time.time() - func(a_mat, b_mat, c_mat, tile) + for i in range(whole_i_dim // local_i_dim): + for j in range(whole_j_dim // local_j_dim): + for k in range(whole_k_dim // local_k_dim): + if owner(i, j, grid_i, grid_j): + get_mat(a_win, foreign_a_mat, i, k, grid_dim) + get_mat(b_win, foreign_b_mat, k, j, grid_dim) + + c_mat += np.matmul(foreign_a_mat, foreign_b_mat) time_con = time.time() - start + # to ensure every process completed the calculation + comm_world.Barrier() + return c_mat, time_con -c_mat, time_con = matrix_mul(a,b) -print(c_mat, time_con) -start = time.time() -c_np = np.matmul(a,b) -time_con = time.time() - start -print(c_np, time_con) +if __name__ == "__main__": + comm_world = MPI.COMM_WORLD + comm_rank = comm_world.Get_rank() + comm_size = comm_world.Get_size() + + grid_dim = int(np.floor(np.sqrt(comm_size))) + grid_i = comm_rank // grid_dim + grid_j = comm_rank % grid_dim + + dim_1 = 256 + dim_2 = 256 + + a = np.ones((dim_1, dim_2), dtype=np.int32) + b = np.ones((dim_2, dim_1), dtype=np.int32) + + c_mat, time_con = matrix_mul(comm_world, a, b) + # print(comm_rank, c_mat) + # print(comm_rank, "matrix_mul time:", time_con) + + whole_a = np.ones((dim_1 * grid_dim, dim_2 * grid_dim), dtype=np.int32) + for i in range(grid_dim): + for j in range(grid_dim): + whole_a[i * dim_1:(i+1) * dim_1, j * dim_2:(j+1) * dim_2] += i * grid_dim + j + + whole_b = np.ones((dim_2 * grid_dim, dim_1 * grid_dim), dtype=np.int32) + for i in range(grid_dim): + for j in range(grid_dim): + whole_b[i * dim_2:(i+1) * dim_2, j * dim_1:(j+1) * dim_1] += i * grid_dim + j + + start = time.time() + c_np = np.matmul(whole_a, whole_b) + time_con = time.time() - start + + # print(comm_rank, c_np[grid_i * dim_1:(grid_i+1) * dim_1, grid_j* dim_2:(grid_j+1) * dim_2]) + # print(comm_rank, "np.matmul time:", time_con) -print("Result correctness:", np.allclose(c_mat, c_np)) + # print("Result correctness:", np.allclose(c_mat, c_np[grid_i * dim_1:(grid_i+1) * dim_1, grid_j* dim_2:(grid_j+1) * dim_2])) + assert(np.allclose(c_mat, c_np[grid_i * dim_1:(grid_i+1) * dim_1, grid_j* dim_2:(grid_j+1) * dim_2])) From c8477179ff14a9fe1b956b8045498b6a5705fb04 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 8 Sep 2023 15:25:18 +0800 Subject: [PATCH 21/28] Functionized the distibuted computation in mat_mul.py --- samples/mpi/mat_mul.py | 80 +++++++++++++------------- samples/mpi/ping_pong.py | 117 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 155 insertions(+), 42 deletions(-) create mode 100644 samples/mpi/ping_pong.py diff --git a/samples/mpi/mat_mul.py b/samples/mpi/mat_mul.py index dc6bc6f80c..c5df0c6c32 100644 --- a/samples/mpi/mat_mul.py +++ b/samples/mpi/mat_mul.py @@ -5,23 +5,46 @@ from mpi4py import MPI import time +def dist_mat_mult(a_mat, b_mat, c_mat, comm_rank, comm_size): + grid_dim = int(np.floor(np.sqrt(comm_size))) + grid_i = comm_rank // grid_dim + grid_j = comm_rank % grid_dim -# to check if this process owns this chunk of data -# compare given i and j with grid_i and grid_j -def owner(i, j, grid_i, grid_j): - 
if i == grid_i and j == grid_j: - return True - else: - return False + local_i_dim = a_mat.shape[0] + local_j_dim = b_mat.shape[1] + local_k_dim = a_mat.shape[1] + whole_i_dim = grid_dim * a_mat.shape[0] + whole_j_dim = grid_dim * b_mat.shape[1] + whole_k_dim = grid_dim * a_mat.shape[1] -# get matrix form remote rank -def get_mat(win, buffer, dim_0, dim_1, grid_dim): - rank = dim_0 * grid_dim + dim_1 - win.Lock(rank) - win.Get(buffer, target_rank=rank) - win.Flush(rank) - win.Unlock(rank) + # local buffers for remote fetching + foreign_a_mat = np.zeros(a_mat.shape, dtype=np.int32) + foreign_b_mat = np.zeros(b_mat.shape, dtype=np.int32) + + # RMA windows + a_win = MPI.Win.Create(a_mat, comm=comm_world) + b_win = MPI.Win.Create(b_mat, comm=comm_world) + for i in range(whole_i_dim // local_i_dim): + for j in range(whole_j_dim // local_j_dim): + for k in range(whole_k_dim // local_k_dim): + # check if this process owns this chunk of data + if i == grid_i and j == grid_j: + target_rank_a = i * grid_dim + k + target_rank_b = k * grid_dim + j + a_win.Lock(target_rank_a) + b_win.Lock(target_rank_b) + + a_win.Get(foreign_a_mat, target_rank=target_rank_a) + b_win.Get(foreign_b_mat, target_rank=target_rank_b) + + a_win.Flush(target_rank_a) + b_win.Flush(target_rank_b) + + a_win.Unlock(target_rank_a) + b_win.Unlock(target_rank_b) + + c_mat += np.matmul(foreign_a_mat, foreign_b_mat) def matrix_mul(comm_world, a, b): @@ -33,39 +56,12 @@ def matrix_mul(comm_world, a, b): comm_rank = comm_world.Get_rank() comm_size = comm_world.Get_size() - grid_dim = int(np.floor(np.sqrt(comm_size))) - grid_i = comm_rank // grid_dim - grid_j = comm_rank % grid_dim - - local_i_dim = a.shape[0] - local_j_dim = b.shape[1] - local_k_dim = a.shape[1] - - whole_i_dim = grid_dim * a.shape[0] - whole_j_dim = grid_dim * b.shape[1] - whole_k_dim = grid_dim * a.shape[1] - a_mat = np.array(a + comm_rank, dtype=np.int32) b_mat = np.array(b + comm_rank, dtype=np.int32) c_mat = np.zeros((a_mat.shape[0], b_mat.shape[1]), dtype=np.int32) - # local buffers for remote fetching - foreign_a_mat = np.zeros(a.shape, dtype=np.int32) - foreign_b_mat = np.zeros(b.shape, dtype=np.int32) - - # RMA windows - a_win = MPI.Win.Create(a_mat, comm=comm_world) - b_win = MPI.Win.Create(b_mat, comm=comm_world) - start = time.time() - for i in range(whole_i_dim // local_i_dim): - for j in range(whole_j_dim // local_j_dim): - for k in range(whole_k_dim // local_k_dim): - if owner(i, j, grid_i, grid_j): - get_mat(a_win, foreign_a_mat, i, k, grid_dim) - get_mat(b_win, foreign_b_mat, k, j, grid_dim) - - c_mat += np.matmul(foreign_a_mat, foreign_b_mat) + dist_mat_mult(a_mat, b_mat, c_mat, comm_rank, comm_size) time_con = time.time() - start # to ensure every process completed the calculation diff --git a/samples/mpi/ping_pong.py b/samples/mpi/ping_pong.py new file mode 100644 index 0000000000..d8cf490f62 --- /dev/null +++ b/samples/mpi/ping_pong.py @@ -0,0 +1,117 @@ +import numpy as np +import dace +from dace.sdfg import utils +import dace.dtypes as dtypes +from mpi4py import MPI +import time + +dim_1 = 128 +dim_2 = 128 + +a = np.arange(dim_1 * dim_2).reshape(dim_1, dim_2) +b = np.arange(dim_1 * dim_2).reshape(dim_2, dim_1) + +# to check if this process owns this chunk of data +# compare given i and j with grid_i and grid_j +@dace.program +def owner(i, j, grid_i, grid_j): + if i == grid_i and j == grid_j: + return True + else: + return False + +# get matrix form remote rank +@dace.program +def get_mat(win: dace.RMA_window, buffer: dace.int32[dim_1,dim_2], dim_0: 
dace.int32, dim_1: dace.int32, grid_dim: dace.int32): + rank = dim_0 * grid_dim + dim_1 + win.Lock(rank) + win.Get(buffer, target_rank=rank) + win.Flush(rank) + win.Unlock(rank) + +def matrix_mul(a, b): + # check if matrix multiplication is valid + if a.shape[1] != b.shape[0]: + raise ValueError("A, B matrix dimension mismatched!") + + # comm init + comm_world = MPI.COMM_WORLD + comm_rank = comm_world.Get_rank() + comm_size = comm_world.Get_size() + + grid_dim = 2 + grid_i = comm_rank // grid_dim + grid_j = comm_rank % grid_dim + + if comm_size != 2: + raise ValueError("Please run this test with two processes.") + + a_mat = np.array(a + comm_rank, dtype=np.int64) + b_mat = np.array(b + comm_rank, dtype=np.int64) + foreign_a_mat = np.zeros(a.shape, dtype=np.int64) + foreign_b_mat = np.zeros(b.shape, dtype=np.int64) + c_mat = np.zeros((a_mat.shape[0], b_mat.shape[1]), dtype=np.int64) + + + # more or less like C stationary + # for i in range(a_mat.shape[0]): + # for j in range(b_mat.shape[1]): + # for k in range(a_mat.shape[1]): + # c_mat[i][j] += a_mat[i][k] * b_mat[k][j] + + + @dace.program + def mpi4py_send_recv(comm_rank: dace.int32, a_mat: dace.int32[dim_1,dim_2], foreign_a_mat: dace.int32[dim_1,dim_2], grid_dim: dace.int32): + a_win = MPI.Win.Create(a_mat, comm=comm_world) + if comm_rank == 0: + get_mat(a_win, foreign_a_mat, 0, 1, grid_dim) + else: + get_mat(a_win, foreign_a_mat, 0, 0, grid_dim) + return foreign_a_mat + + sdfg = None + if comm_rank == 0: + sdfg = mpi4py_send_recv.to_sdfg(simplify=True) + func = utils.distributed_compile(sdfg, comm_world) + + + start = time.time() + + foreign_a_mat = func(comm_rank=comm_rank, a_mat=a_mat, foreign_a_mat=foreign_a_mat, grid_dim=grid_dim) + if comm_rank == 0: + if(np.allclose(a_mat+1, foreign_a_mat)): + print("Good") + else: + if(np.allclose(a_mat-1, foreign_a_mat)): + print("Good") + + time_con = time.time() - start + + + # to ensure every process completed the calculation + comm_world.Barrier() + +matrix_mul(a,b) + + # more or less like C stationary + # for i in range(a_mat.shape[0]): + # for j in range(b_mat.shape[1]): + # for k in range(a_mat.shape[1]): + # c_mat[i][j] += a_mat[i][k] * b_mat[k][j] + + # @dace.program + # def mpi4py_passive_rma_put(a_mat: dace.int32[dim_1,dim_2], b_mat: dace.int32[dim_1,dim_2], c_mat: dace.int32[dim_1,dim_2], tile: dace.int32): + # for i_tile in range(a_mat.shape[0] // tile): + # for j_tile in range(b_mat.shape[1] // tile): + # for k_tile in range(a_mat.shape[1] // tile): + # for i in range(i_tile * tile, min((i_tile + 1) * tile, a_mat.shape[0])): + # for j in range(j_tile * tile, min((j_tile + 1) * tile, b_mat.shape[1])): + # for k in range(k_tile * tile, min((k_tile + 1) * tile, a_mat.shape[1])): + # c_mat[i][j] += a_mat[i][k] * b_mat[k][j] + + # sdfg = None + # sdfg = mpi4py_passive_rma_put.to_sdfg() + # sdfg.openmp_sections = False + # func = sdfg.compile() + + # func(a_mat, b_mat, c_mat, tile) \ No newline at end of file From ca65d87d84868aa739f0cdb841af6705b9ee9c56 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Mon, 11 Sep 2023 14:46:48 +0800 Subject: [PATCH 22/28] Enabled dace acceleration for mat_mul.py --- samples/mpi/mat_mul.py | 103 +++++++++++++++++++++++------------------ 1 file changed, 58 insertions(+), 45 deletions(-) diff --git a/samples/mpi/mat_mul.py b/samples/mpi/mat_mul.py index c5df0c6c32..dd8fcbee2c 100644 --- a/samples/mpi/mat_mul.py +++ b/samples/mpi/mat_mul.py @@ -5,47 +5,6 @@ from mpi4py import MPI import time -def dist_mat_mult(a_mat, b_mat, c_mat, comm_rank, 
comm_size): - grid_dim = int(np.floor(np.sqrt(comm_size))) - grid_i = comm_rank // grid_dim - grid_j = comm_rank % grid_dim - - local_i_dim = a_mat.shape[0] - local_j_dim = b_mat.shape[1] - local_k_dim = a_mat.shape[1] - - whole_i_dim = grid_dim * a_mat.shape[0] - whole_j_dim = grid_dim * b_mat.shape[1] - whole_k_dim = grid_dim * a_mat.shape[1] - - # local buffers for remote fetching - foreign_a_mat = np.zeros(a_mat.shape, dtype=np.int32) - foreign_b_mat = np.zeros(b_mat.shape, dtype=np.int32) - - # RMA windows - a_win = MPI.Win.Create(a_mat, comm=comm_world) - b_win = MPI.Win.Create(b_mat, comm=comm_world) - for i in range(whole_i_dim // local_i_dim): - for j in range(whole_j_dim // local_j_dim): - for k in range(whole_k_dim // local_k_dim): - # check if this process owns this chunk of data - if i == grid_i and j == grid_j: - target_rank_a = i * grid_dim + k - target_rank_b = k * grid_dim + j - a_win.Lock(target_rank_a) - b_win.Lock(target_rank_b) - - a_win.Get(foreign_a_mat, target_rank=target_rank_a) - b_win.Get(foreign_b_mat, target_rank=target_rank_b) - - a_win.Flush(target_rank_a) - b_win.Flush(target_rank_b) - - a_win.Unlock(target_rank_a) - b_win.Unlock(target_rank_b) - - c_mat += np.matmul(foreign_a_mat, foreign_b_mat) - def matrix_mul(comm_world, a, b): # check if matrix multiplication is valid @@ -60,12 +19,66 @@ def matrix_mul(comm_world, a, b): b_mat = np.array(b + comm_rank, dtype=np.int32) c_mat = np.zeros((a_mat.shape[0], b_mat.shape[1]), dtype=np.int32) + @dace.program + def dist_mat_mult(a_mat: dace.int32[a_mat.shape[0], a_mat.shape[1]], + b_mat: dace.int32[b_mat.shape[0], b_mat.shape[1]], + c_mat: dace.int32[a_mat.shape[0], b_mat.shape[1]], + comm_rank: dace.int32, + comm_size: dace.int32): + grid_dim = int(np.floor(np.sqrt(comm_size))) + grid_i = comm_rank // grid_dim + grid_j = comm_rank % grid_dim + + local_i_dim = a_mat.shape[0] + local_j_dim = b_mat.shape[1] + local_k_dim = a_mat.shape[1] + + whole_i_dim = grid_dim * a_mat.shape[0] + whole_j_dim = grid_dim * b_mat.shape[1] + whole_k_dim = grid_dim * a_mat.shape[1] + + # local buffers for remote fetching + foreign_a_mat = np.zeros(a_mat.shape, dtype=np.int32) + foreign_b_mat = np.zeros(b_mat.shape, dtype=np.int32) + + # RMA windows + a_win = MPI.Win.Create(a_mat, comm=comm_world) + b_win = MPI.Win.Create(b_mat, comm=comm_world) + for i in range(whole_i_dim // local_i_dim): + for j in range(whole_j_dim // local_j_dim): + for k in range(whole_k_dim // local_k_dim): + # check if this process owns this chunk of data + if i == grid_i and j == grid_j: + target_rank_a = i * grid_dim + k + target_rank_b = k * grid_dim + j + a_win.Lock(target_rank_a) + a_win.Get(foreign_a_mat, target_rank=target_rank_a) + a_win.Flush(target_rank_a) + a_win.Unlock(target_rank_a) + + b_win.Lock(target_rank_b) + b_win.Get(foreign_b_mat, target_rank=target_rank_b) + b_win.Flush(target_rank_b) + b_win.Unlock(target_rank_b) + + c_mat += foreign_a_mat @ foreign_b_mat + + # as MPI barrier + # to ensure every process completed the calculation + a_win.Fence(0) + a_win.Fence(0) + + sdfg = None + if comm_rank == 0: + # ValueError: Node type "Win_lock" not supported for promotion + sdfg = dist_mat_mult.to_sdfg(simplify=False) + func = utils.distributed_compile(sdfg, comm_world) + start = time.time() - dist_mat_mult(a_mat, b_mat, c_mat, comm_rank, comm_size) - time_con = time.time() - start - # to ensure every process completed the calculation - comm_world.Barrier() + func(a_mat=a_mat, b_mat=b_mat, c_mat=c_mat, comm_rank=comm_rank, comm_size=comm_size) + + 
time_con = time.time() - start return c_mat, time_con From f7231a08572b3aa71313cb3d346df0827e7de618 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Mon, 11 Sep 2023 16:06:44 +0800 Subject: [PATCH 23/28] Added MPI RMA free library node, replacement, and tests for both frontend and backend --- dace/frontend/common/distr.py | 52 ++++++++++++++ dace/libraries/mpi/nodes/__init__.py | 1 + dace/libraries/mpi/nodes/win_free.py | 43 +++++++++++ tests/library/mpi/mpi4py_test.py | 3 + tests/library/mpi/mpi_free_test.py | 102 +++++++++++++++++++++++++++ 5 files changed, 201 insertions(+) create mode 100644 dace/libraries/mpi/nodes/win_free.py create mode 100644 tests/library/mpi/mpi_free_test.py diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index 85d0753f3a..fb40a22d4c 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -1082,6 +1082,58 @@ def _rma_flush(pv: ProgramVisitor, return window_name +@oprepo.replaces_method('RMA_window', 'Free') +def _rma_free(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + window_name: str, + assertion: Union[str, sp.Expr, Number] = 0): + """ Adds a RMA free to the DaCe Program. + + :param window_name: The name of the window to be freed. + :return: Name of the free. + """ + + from dace.libraries.mpi.nodes.win_free import Win_free + + # fine a new free name + free_name = sdfg.add_rma_ops(window_name, "free") + + _, assertion_node = _get_int_arg_node(pv, sdfg, state, assertion) + + free_node = Win_free(free_name, window_name) + + # check for the last RMA operation + all_rma_ops_name = list(sdfg._rma_ops.keys()) + cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name + if f"{window_name}_" in rma_op] + if len(cur_window_rma_ops) == 1: + last_rma_op_name = window_name + else: + last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(free_name) - 1] + + last_rma_op_node = state.add_read(last_rma_op_name) + last_rma_op_desc = sdfg.arrays[last_rma_op_name] + + # for window free ordering + state.add_edge(last_rma_op_node, + None, + free_node, + "_in", + Memlet.from_array(last_rma_op_name, last_rma_op_desc)) + + # Pseudo-writing for newast.py #3195 check and complete Processcomm creation + _, scal = sdfg.add_scalar(free_name, dace.int32, transient=True) + wnode = state.add_write(free_name) + state.add_edge(free_node, + "_out", + wnode, + None, + Memlet.from_array(free_name, scal)) + + return window_name + + @oprepo.replaces_method('RMA_window', 'Lock') def _rma_lock(pv: ProgramVisitor, sdfg: SDFG, diff --git a/dace/libraries/mpi/nodes/__init__.py b/dace/libraries/mpi/nodes/__init__.py index 3d3e0ac8f9..91d97091ac 100644 --- a/dace/libraries/mpi/nodes/__init__.py +++ b/dace/libraries/mpi/nodes/__init__.py @@ -21,3 +21,4 @@ from .win_lock import Win_lock from .win_unlock import Win_unlock from .win_flush import Win_flush +from .win_free import Win_free diff --git a/dace/libraries/mpi/nodes/win_free.py b/dace/libraries/mpi/nodes/win_free.py new file mode 100644 index 0000000000..81009093fc --- /dev/null +++ b/dace/libraries/mpi/nodes/win_free.py @@ -0,0 +1,43 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library +import dace.properties +import dace.sdfg.nodes +from dace.transformation.transformation import ExpandTransformation +from .. 
import environments +from dace.libraries.mpi.nodes.node import MPINode + + +@dace.library.expansion +class ExpandWinFreeMPI(ExpandTransformation): + + environments = [environments.mpi.MPI] + + @staticmethod + def expansion(node, parent_state, parent_sdfg, **kwargs): + window_name = node.window_name + code = f""" + MPI_Win_free(&__state->{window_name}_window); + """ + tasklet = dace.sdfg.nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + language=dace.dtypes.Language.CPP, + side_effects=True) + return tasklet + + +@dace.library.node +class Win_free(MPINode): + + # Global properties + implementations = { + "MPI": ExpandWinFreeMPI, + } + default_implementation = "MPI" + + window_name = dace.properties.Property(dtype=str, default=None) + + def __init__(self, name, window_name, *args, **kwargs): + super().__init__(name, *args, inputs={"_in"}, outputs={"_out"}, **kwargs) + self.window_name = window_name diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index 55cbff160b..124f4299dd 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -52,6 +52,7 @@ def mpi4py_rma_put(win_buf: dace.int32[10], send_buf: dace.int32[10], rank: dace win.Fence(0) win.Put(send_buf, target_rank=rank) win.Fence(0) + win.Free() if size < 2: raise ValueError("Please run this test with at least two processes.") @@ -158,6 +159,8 @@ def mpi4py_passive_rma_put(win_buf: dace.int32[10], send_buf: dace.int32[10], ra win.Fence(0) win.Fence(0) + win.Free() + if size < 2: raise ValueError("Please run this test with at least two processes.") diff --git a/tests/library/mpi/mpi_free_test.py b/tests/library/mpi/mpi_free_test.py new file mode 100644 index 0000000000..f87220e4bc --- /dev/null +++ b/tests/library/mpi/mpi_free_test.py @@ -0,0 +1,102 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
+import dace
+from dace.sdfg import utils
+import dace.dtypes as dtypes
+from dace.memlet import Memlet
+import dace.libraries.mpi as mpi
+import dace.frontend.common.distr as comm
+import numpy as np
+import pytest
+
+
+###############################################################################
+
+
+def make_sdfg(dtype):
+    n = dace.symbol("n")
+
+    sdfg = dace.SDFG("mpi_win_free")
+    window_state = sdfg.add_state("create_window")
+
+    sdfg.add_array("win_buffer", [n], dtype=dtype, transient=False)
+    win_buffer = window_state.add_access("win_buffer")
+
+    window_name = sdfg.add_window()
+    win_create_node = mpi.nodes.win_create.Win_create(window_name)
+
+    window_state.add_edge(win_buffer,
+                          None,
+                          win_create_node,
+                          '_win_buffer',
+                          Memlet.simple(win_buffer, "0:n", num_accesses=n))
+
+    # so that other nodes that depend on this window can connect to it
+    _, scal = sdfg.add_scalar(window_name, dace.int32, transient=True)
+    wnode = window_state.add_write(window_name)
+    window_state.add_edge(win_create_node,
+                          "_out",
+                          wnode,
+                          None,
+                          Memlet.from_array(window_name, scal))
+
+###############################################################################
+
+    free_state = sdfg.add_state("win_free")
+
+    sdfg.add_edge(window_state, free_state, dace.InterstateEdge())
+
+    free_name = sdfg.add_rma_ops(window_name, "free")
+    win_free_node = mpi.nodes.win_free.Win_free(free_name, window_name)
+
+    # pseudo access for ordering
+    window_node = free_state.add_access(window_name)
+    window_desc = sdfg.arrays[window_name]
+
+    free_state.add_edge(window_node,
+                        None,
+                        win_free_node,
+                        "_in",
+                        Memlet.from_array(window_name, window_desc))
+
+    _, scal = sdfg.add_scalar(free_name, dace.int32, transient=True)
+    wnode = free_state.add_write(free_name)
+    free_state.add_edge(win_free_node,
+                        "_out",
+                        wnode,
+                        None,
+                        Memlet.from_array(free_name, scal))
+
+    return sdfg
+
+
+###############################################################################
+
+@pytest.mark.parametrize("implementation, dtype", [
+    pytest.param("MPI", dace.float32, marks=pytest.mark.mpi),
+    pytest.param("MPI", dace.int32, marks=pytest.mark.mpi)
+])
+def test_win_free(implementation, dtype):
+    from mpi4py import MPI
+    np_dtype = getattr(np, dtype.to_string())
+    comm_world = MPI.COMM_WORLD
+    comm_rank = comm_world.Get_rank()
+    comm_size = comm_world.Get_size()
+
+    if comm_size < 2:
+        raise ValueError("This test is supposed to be run with at least two processes!")
+
+    mpi_func = None
+    for r in range(0, comm_size):
+        if r == comm_rank:
+            sdfg = make_sdfg(dtype)
+            mpi_func = sdfg.compile()
+        comm_world.Barrier()
+
+    window_size = 10
+    win_buffer = np.arange(0, window_size, dtype=np_dtype)
+
+    mpi_func(win_buffer=win_buffer, n=window_size)
+
+if __name__ == "__main__":
+    test_win_free("MPI", dace.int32)
+    test_win_free("MPI", dace.float32)

From da62e8e201cd0184104c2eb6d54fb68b9915b37d Mon Sep 17 00:00:00 2001
From: "Fu-Chiang, Chang" 
Date: Mon, 11 Sep 2023 17:01:05 +0800
Subject: [PATCH 24/28] Refactored RMA last op checker to a function

---
 dace/frontend/common/distr.py | 54 ++++++++++++-----------------------
 1 file changed, 19 insertions(+), 35 deletions(-)

diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py
index fb40a22d4c..a64647d453 100644
--- a/dace/frontend/common/distr.py
+++ b/dace/frontend/common/distr.py
@@ -915,6 +915,20 @@ def _wait(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, request: str):
     return None
 
 
+def get_last_rma_op(sdfg: SDFG,
+                    cur_op_name: str,
+                    window_name: str):
+    all_rma_ops_name = list(sdfg._rma_ops.keys())
+    cur_window_rma_ops = 
[rma_op for rma_op in all_rma_ops_name + if f"{window_name}_" in rma_op] + if len(cur_window_rma_ops) == 1: + last_rma_op_name = window_name + else: + last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(cur_op_name) - 1] + + return last_rma_op_name + + @oprepo.replaces('mpi4py.MPI.Win.Create') @oprepo.replaces('dace.Win.Create') def _rma_window_create(pv: ProgramVisitor, @@ -986,13 +1000,7 @@ def _rma_fence(pv: ProgramVisitor, fence_node = Win_fence(fence_name, window_name) # check for the last RMA operation - all_rma_ops_name = list(sdfg._rma_ops.keys()) - cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name - if f"{window_name}_" in rma_op] - if len(cur_window_rma_ops) == 1: - last_rma_op_name = window_name - else: - last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(fence_name) - 1] + last_rma_op_name = get_last_rma_op(sdfg, fence_name, window_name) last_rma_op_node = state.add_read(last_rma_op_name) last_rma_op_desc = sdfg.arrays[last_rma_op_name] @@ -1046,13 +1054,7 @@ def _rma_flush(pv: ProgramVisitor, flush_node = Win_flush(flush_name, window_name) # check for the last RMA operation - all_rma_ops_name = list(sdfg._rma_ops.keys()) - cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name - if f"{window_name}_" in rma_op] - if len(cur_window_rma_ops) == 1: - last_rma_op_name = window_name - else: - last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(flush_name) - 1] + last_rma_op_name = get_last_rma_op(sdfg, flush_name, window_name) last_rma_op_node = state.add_read(last_rma_op_name) last_rma_op_desc = sdfg.arrays[last_rma_op_name] @@ -1104,13 +1106,7 @@ def _rma_free(pv: ProgramVisitor, free_node = Win_free(free_name, window_name) # check for the last RMA operation - all_rma_ops_name = list(sdfg._rma_ops.keys()) - cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name - if f"{window_name}_" in rma_op] - if len(cur_window_rma_ops) == 1: - last_rma_op_name = window_name - else: - last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(free_name) - 1] + last_rma_op_name = get_last_rma_op(sdfg, free_name, window_name) last_rma_op_node = state.add_read(last_rma_op_name) last_rma_op_desc = sdfg.arrays[last_rma_op_name] @@ -1165,13 +1161,7 @@ def _rma_lock(pv: ProgramVisitor, _, assertion_node = _get_int_arg_node(pv, sdfg, state, assertion) # check for the last RMA operation - all_rma_ops_name = list(sdfg._rma_ops.keys()) - cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name - if f"{window_name}_" in rma_op] - if len(cur_window_rma_ops) == 1: - last_rma_op_name = window_name - else: - last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(lock_name) - 1] + last_rma_op_name = get_last_rma_op(sdfg, lock_name, window_name) last_rma_op_node = state.add_read(last_rma_op_name) last_rma_op_desc = sdfg.arrays[last_rma_op_name] @@ -1237,13 +1227,7 @@ def _rma_unlock(pv: ProgramVisitor, unlock_node = Win_unlock(unlock_name, window_name) # check for the last RMA operation - all_rma_ops_name = list(sdfg._rma_ops.keys()) - cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name - if f"{window_name}_" in rma_op] - if len(cur_window_rma_ops) == 1: - last_rma_op_name = window_name - else: - last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(unlock_name) - 1] + last_rma_op_name = get_last_rma_op(sdfg, unlock_name, window_name) last_rma_op_node = state.add_read(last_rma_op_name) last_rma_op_desc = sdfg.arrays[last_rma_op_name] From 54fab2e8a0dee31e87e4203638f8dab6360e54e3 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" 
Date: Tue, 12 Sep 2023 23:55:20 +0800
Subject: [PATCH 25/28] Added RMA sync logic checking to last op checker

---
 dace/frontend/common/distr.py | 101 ++++++++++++----------------------
 1 file changed, 35 insertions(+), 66 deletions(-)

diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py
index a64647d453..26130e0560 100644
--- a/dace/frontend/common/distr.py
+++ b/dace/frontend/common/distr.py
@@ -915,9 +915,20 @@ def _wait(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, request: str):
     return None
 
 
-def get_last_rma_op(sdfg: SDFG,
-                    cur_op_name: str,
-                    window_name: str):
+def _get_last_rma_op(sdfg: SDFG,
+                     cur_op_name: str,
+                     window_name: str,
+                     is_trans: bool = False):
+    """ Get the name of the last RMA operation on a window from the SDFG.
+        If is_trans is True, also check the synchronization of the RMA calls.
+
+        :param sdfg: The SDFG to search.
+        :param cur_op_name: The current operation on the window.
+        :param window_name: The name of the RMA window to search.
+        :param is_trans: If True, check that a synchronization epoch is open before this operation.
+        :return: Name of the last RMA operation.
+    """
+
     all_rma_ops_name = list(sdfg._rma_ops.keys())
     cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name
                           if f"{window_name}_" in rma_op]
@@ -926,6 +937,19 @@ def get_last_rma_op(sdfg: SDFG,
     else:
         last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(cur_op_name) - 1]
 
+    if is_trans:
+        # an odd number of fences or of lock/unlock calls means
+        # that we are inside an ongoing epoch;
+        # an even number of both means this operation is not enclosed
+        # by any synchronization and might be corrupted
+        cur_window_fences = [rma_op for rma_op in cur_window_rma_ops
+                             if f"{window_name}_fence" in rma_op]
+        cur_window_passive_syncs = [rma_op for rma_op in cur_window_rma_ops
+                                    if "lock" in rma_op]
+        if len(cur_window_fences) % 2 == 0 and len(cur_window_passive_syncs) % 2 == 0:
+            # neither a fence epoch nor a lock epoch is open, so warn the user
+            print("You might have a bad synchronization of RMA calls!")
+
     return last_rma_op_name
 
 
@@ -1000,7 +1024,7 @@ def _rma_fence(pv: ProgramVisitor,
     fence_node = Win_fence(fence_name, window_name)
 
     # check for the last RMA operation
-    last_rma_op_name = get_last_rma_op(sdfg, fence_name, window_name)
+    last_rma_op_name = _get_last_rma_op(sdfg, fence_name, window_name)
 
     last_rma_op_node = state.add_read(last_rma_op_name)
     last_rma_op_desc = sdfg.arrays[last_rma_op_name]
@@ -1054,7 +1078,7 @@ def _rma_flush(pv: ProgramVisitor,
     flush_node = Win_flush(flush_name, window_name)
 
     # check for the last RMA operation
-    last_rma_op_name = get_last_rma_op(sdfg, flush_name, window_name)
+    last_rma_op_name = _get_last_rma_op(sdfg, flush_name, window_name)
 
     last_rma_op_node = state.add_read(last_rma_op_name)
     last_rma_op_desc = sdfg.arrays[last_rma_op_name]
@@ -1106,7 +1130,7 @@ def _rma_free(pv: ProgramVisitor,
     free_node = Win_free(free_name, window_name)
 
     # check for the last RMA operation
-    last_rma_op_name = get_last_rma_op(sdfg, free_name, window_name)
+    last_rma_op_name = _get_last_rma_op(sdfg, free_name, window_name)
 
     last_rma_op_node = state.add_read(last_rma_op_name)
     last_rma_op_desc = sdfg.arrays[last_rma_op_name]
@@ -1161,7 +1185,7 @@ def _rma_lock(pv: ProgramVisitor,
     _, assertion_node = _get_int_arg_node(pv, sdfg, state, assertion)
 
     # check for the last RMA operation
-    last_rma_op_name = get_last_rma_op(sdfg, lock_name, window_name)
+    last_rma_op_name = _get_last_rma_op(sdfg, lock_name, window_name)
 
     last_rma_op_node = state.add_read(last_rma_op_name)
     last_rma_op_desc = sdfg.arrays[last_rma_op_name]
@@ -1227,7 +1251,7 @@ def _rma_unlock(pv: ProgramVisitor,
unlock_node = Win_unlock(unlock_name, window_name) # check for the last RMA operation - last_rma_op_name = get_last_rma_op(sdfg, unlock_name, window_name) + last_rma_op_name = _get_last_rma_op(sdfg, unlock_name, window_name) last_rma_op_node = state.add_read(last_rma_op_name) last_rma_op_desc = sdfg.arrays[last_rma_op_name] @@ -1277,26 +1301,7 @@ def _rma_put(pv: ProgramVisitor, put_name = sdfg.add_rma_ops(window_name, "put") # check for the last RMA operation - all_rma_ops_name = list(sdfg._rma_ops.keys()) - cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name - if f"{window_name}_" in rma_op] - cur_window_fences = [rma_op for rma_op in cur_window_rma_ops - if f"{window_name}_fence" in rma_op] - - last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(put_name) - 1] - - # if only odd number of fences, - # that means we're in a ongoing epoch - if len(cur_window_fences) % 2 == 0: - # if even number of fences, - # that means this operation is either a passive sync. one or a corrupted one - # same logic applies to passive sync. - cur_window_passive_syncs = [rma_op for rma_op in cur_window_rma_ops - if "lock" in rma_op] - # if we don't have even number of syncs, give user a warning - if len(cur_window_passive_syncs) % 2 == 0: - print("You might have a bad synchronization of RMA calls!") - + last_rma_op_name = _get_last_rma_op(sdfg, put_name, window_name, is_trans=True) put_node = Win_put(put_name, window_name) @@ -1355,25 +1360,7 @@ def _rma_get(pv: ProgramVisitor, get_name = sdfg.add_rma_ops(window_name, "get") # check for the last RMA operation - all_rma_ops_name = list(sdfg._rma_ops.keys()) - cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name - if f"{window_name}_" in rma_op] - cur_window_fences = [rma_op for rma_op in cur_window_rma_ops - if f"{window_name}_fence" in rma_op] - - last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(get_name) - 1] - - # if only odd number of fences, - # that means we're in a ongoing epoch - if len(cur_window_fences) % 2 == 0: - # if even number of fences, - # that means this operation is either a passive sync. one or a corrupted one - # same logic applies to passive sync. - cur_window_passive_syncs = [rma_op for rma_op in cur_window_rma_ops - if "lock" in rma_op] - # if we don't have even number of syncs, give user a warning - if len(cur_window_passive_syncs) % 2 == 0: - print("You might have a bad synchronization of RMA calls!") + last_rma_op_name = _get_last_rma_op(sdfg, get_name, window_name, is_trans=True) get_node = Win_get(get_name, window_name) @@ -1437,25 +1424,7 @@ def _rma_accumulate(pv: ProgramVisitor, op = _mpi4py_to_MPI(MPI, op) # check for the last RMA operation - all_rma_ops_name = list(sdfg._rma_ops.keys()) - cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name - if f"{window_name}_" in rma_op] - cur_window_fences = [rma_op for rma_op in cur_window_rma_ops - if f"{window_name}_fence" in rma_op] - - last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(accumulate_name) - 1] - - # if only odd number of fences, - # that means we're in a ongoing epoch - if len(cur_window_fences) % 2 == 0: - # if even number of fences, - # that means this operation is either a passive sync. one or a corrupted one - # same logic applies to passive sync. 
- cur_window_passive_syncs = [rma_op for rma_op in cur_window_rma_ops - if "lock" in rma_op] - # if we don't have even number of syncs, give user a warning - if len(cur_window_passive_syncs) % 2 == 0: - print("You might have a bad synchronization of RMA calls!") + last_rma_op_name = _get_last_rma_op(sdfg, accumulate_name, window_name, is_trans=True) accumulate_node = Win_accumulate(accumulate_name, window_name, op) From 59f8e72f7a1a0159ee8b9115456d4ca982b1ca4c Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Thu, 14 Sep 2023 22:13:19 +0800 Subject: [PATCH 26/28] changed the data type of matrix from int32 to float32 --- samples/mpi/mat_mul.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/samples/mpi/mat_mul.py b/samples/mpi/mat_mul.py index dd8fcbee2c..d2399720b7 100644 --- a/samples/mpi/mat_mul.py +++ b/samples/mpi/mat_mul.py @@ -15,14 +15,14 @@ def matrix_mul(comm_world, a, b): comm_rank = comm_world.Get_rank() comm_size = comm_world.Get_size() - a_mat = np.array(a + comm_rank, dtype=np.int32) - b_mat = np.array(b + comm_rank, dtype=np.int32) - c_mat = np.zeros((a_mat.shape[0], b_mat.shape[1]), dtype=np.int32) + a_mat = np.array(a + comm_rank, dtype=np.float32) + b_mat = np.array(b + comm_rank, dtype=np.float32) + c_mat = np.zeros((a_mat.shape[0], b_mat.shape[1]), dtype=np.float32) @dace.program - def dist_mat_mult(a_mat: dace.int32[a_mat.shape[0], a_mat.shape[1]], - b_mat: dace.int32[b_mat.shape[0], b_mat.shape[1]], - c_mat: dace.int32[a_mat.shape[0], b_mat.shape[1]], + def dist_mat_mult(a_mat: dace.float32[a_mat.shape[0], a_mat.shape[1]], + b_mat: dace.float32[b_mat.shape[0], b_mat.shape[1]], + c_mat: dace.float32[a_mat.shape[0], b_mat.shape[1]], comm_rank: dace.int32, comm_size: dace.int32): grid_dim = int(np.floor(np.sqrt(comm_size))) @@ -38,8 +38,8 @@ def dist_mat_mult(a_mat: dace.int32[a_mat.shape[0], a_mat.shape[1]], whole_k_dim = grid_dim * a_mat.shape[1] # local buffers for remote fetching - foreign_a_mat = np.zeros(a_mat.shape, dtype=np.int32) - foreign_b_mat = np.zeros(b_mat.shape, dtype=np.int32) + foreign_a_mat = np.zeros(a_mat.shape, dtype=np.float32) + foreign_b_mat = np.zeros(b_mat.shape, dtype=np.float32) # RMA windows a_win = MPI.Win.Create(a_mat, comm=comm_world) @@ -92,22 +92,22 @@ def dist_mat_mult(a_mat: dace.int32[a_mat.shape[0], a_mat.shape[1]], grid_i = comm_rank // grid_dim grid_j = comm_rank % grid_dim - dim_1 = 256 - dim_2 = 256 + dim_1 = 1024 + dim_2 = 1024 - a = np.ones((dim_1, dim_2), dtype=np.int32) - b = np.ones((dim_2, dim_1), dtype=np.int32) + a = np.ones((dim_1, dim_2), dtype=np.float32) + b = np.ones((dim_2, dim_1), dtype=np.float32) c_mat, time_con = matrix_mul(comm_world, a, b) # print(comm_rank, c_mat) # print(comm_rank, "matrix_mul time:", time_con) - whole_a = np.ones((dim_1 * grid_dim, dim_2 * grid_dim), dtype=np.int32) + whole_a = np.ones((dim_1 * grid_dim, dim_2 * grid_dim), dtype=np.float32) for i in range(grid_dim): for j in range(grid_dim): whole_a[i * dim_1:(i+1) * dim_1, j * dim_2:(j+1) * dim_2] += i * grid_dim + j - whole_b = np.ones((dim_2 * grid_dim, dim_1 * grid_dim), dtype=np.int32) + whole_b = np.ones((dim_2 * grid_dim, dim_1 * grid_dim), dtype=np.float32) for i in range(grid_dim): for j in range(grid_dim): whole_b[i * dim_2:(i+1) * dim_2, j * dim_1:(j+1) * dim_1] += i * grid_dim + j From 1241422fb4f227ed9a4e32bef7a106ab5fb84123 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 15 Sep 2023 00:17:50 +0800 Subject: [PATCH 27/28] Implemented strong scaling benchmark 
for mat_mul.py --- samples/mpi/mat_mul.py | 76 ++++++++++++++++++++++++++++++++---------- 1 file changed, 59 insertions(+), 17 deletions(-) diff --git a/samples/mpi/mat_mul.py b/samples/mpi/mat_mul.py index d2399720b7..94a67139df 100644 --- a/samples/mpi/mat_mul.py +++ b/samples/mpi/mat_mul.py @@ -6,18 +6,14 @@ import time -def matrix_mul(comm_world, a, b): - # check if matrix multiplication is valid - if a.shape[1] != b.shape[0]: - raise ValueError("A, B matrix dimension mismatched!") - +def matrix_mul(comm_world, dim_1, dim_2): # comm init comm_rank = comm_world.Get_rank() comm_size = comm_world.Get_size() - a_mat = np.array(a + comm_rank, dtype=np.float32) - b_mat = np.array(b + comm_rank, dtype=np.float32) - c_mat = np.zeros((a_mat.shape[0], b_mat.shape[1]), dtype=np.float32) + a_mat = np.full((dim_1, dim_2), 1 + comm_rank, dtype=np.float32) + b_mat = np.full((dim_2, dim_1), 1 + comm_rank, dtype=np.float32) + c_mat = np.zeros((dim_1, dim_1), dtype=np.float32) @dace.program def dist_mat_mult(a_mat: dace.float32[a_mat.shape[0], a_mat.shape[1]], @@ -83,11 +79,7 @@ def dist_mat_mult(a_mat: dace.float32[a_mat.shape[0], a_mat.shape[1]], return c_mat, time_con -if __name__ == "__main__": - comm_world = MPI.COMM_WORLD - comm_rank = comm_world.Get_rank() - comm_size = comm_world.Get_size() - +def weak_scaling(comm_world, comm_rank, comm_size): grid_dim = int(np.floor(np.sqrt(comm_size))) grid_i = comm_rank // grid_dim grid_j = comm_rank % grid_dim @@ -95,10 +87,7 @@ def dist_mat_mult(a_mat: dace.float32[a_mat.shape[0], a_mat.shape[1]], dim_1 = 1024 dim_2 = 1024 - a = np.ones((dim_1, dim_2), dtype=np.float32) - b = np.ones((dim_2, dim_1), dtype=np.float32) - - c_mat, time_con = matrix_mul(comm_world, a, b) + c_mat, time_con = matrix_mul(comm_world, dim_1, dim_2) # print(comm_rank, c_mat) # print(comm_rank, "matrix_mul time:", time_con) @@ -121,3 +110,56 @@ def dist_mat_mult(a_mat: dace.float32[a_mat.shape[0], a_mat.shape[1]], # print("Result correctness:", np.allclose(c_mat, c_np[grid_i * dim_1:(grid_i+1) * dim_1, grid_j* dim_2:(grid_j+1) * dim_2])) assert(np.allclose(c_mat, c_np[grid_i * dim_1:(grid_i+1) * dim_1, grid_j* dim_2:(grid_j+1) * dim_2])) + + +def strong_scaling(comm_world, comm_rank, comm_size): + grid_dim = int(np.floor(np.sqrt(comm_size))) + grid_i = comm_rank // grid_dim + grid_j = comm_rank % grid_dim + + total_dim = 8192 + dim_1 = total_dim + dim_2 = total_dim + if total_dim % comm_size > 0: + dim_1 += comm_size - total_dim % comm_size + dim_2 += comm_size - total_dim % comm_size + + local_dim_1 = dim_1 // grid_dim + local_dim_2 = dim_2 // grid_dim + + a = np.ones((local_dim_1, local_dim_2), dtype=np.float32) + b = np.ones((local_dim_2, local_dim_1), dtype=np.float32) + + c_mat, time_con = matrix_mul(comm_world, local_dim_1, local_dim_2) + # print(comm_rank, c_mat) + # print(comm_rank, "matrix_mul time:", time_con) + + # validation, since it will compute the whole matrix in the edge + # whole_a = np.ones((local_dim_1 * grid_dim, local_dim_2 * grid_dim), dtype=np.float32) + # for i in range(grid_dim): + # for j in range(grid_dim): + # whole_a[i * local_dim_1:(i+1) * local_dim_1, j * local_dim_2:(j+1) * local_dim_2] += i * grid_dim + j + + # whole_b = np.ones((local_dim_2 * grid_dim, local_dim_1 * grid_dim), dtype=np.float32) + # for i in range(grid_dim): + # for j in range(grid_dim): + # whole_b[i * local_dim_2:(i+1) * local_dim_2, j * local_dim_1:(j+1) * local_dim_1] += i * grid_dim + j + + # start = time.time() + # c_np = np.matmul(whole_a, whole_b) + # time_con = time.time() - 
start + # # print("Result correctness:", np.allclose(c_mat, c_np[grid_i * local_dim_1:(grid_i+1) * local_dim_1, grid_j* local_dim_2:(grid_j+1) * local_dim_2])) + # assert(np.allclose(c_mat, c_np[grid_i * local_dim_1:(grid_i+1) * local_dim_1, grid_j* local_dim_2:(grid_j+1) * local_dim_2])) + +if __name__ == "__main__": + comm_world = MPI.COMM_WORLD + comm_rank = comm_world.Get_rank() + comm_size = comm_world.Get_size() + + grid_dim = int(np.floor(np.sqrt(comm_size))) + + if comm_size != grid_dim ** 2: + raise ValueError("Please run this test with a square number of processes.") + + # weak_scaling(comm_world, comm_rank, comm_size) + strong_scaling(comm_world, comm_rank, comm_size) From e7baaf73b47166d70e26dbe2e229bdcd078a56c4 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 15 Sep 2023 23:27:18 +0800 Subject: [PATCH 28/28] Fixed the window size configuration in window creation --- dace/libraries/mpi/nodes/win_create.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/libraries/mpi/nodes/win_create.py b/dace/libraries/mpi/nodes/win_create.py index e3f7ba10d0..7abfc02b96 100644 --- a/dace/libraries/mpi/nodes/win_create.py +++ b/dace/libraries/mpi/nodes/win_create.py @@ -29,7 +29,7 @@ def expansion(node, parent_state, parent_sdfg, **kwargs): code = f""" MPI_Win_create(_win_buffer, - {win_buf_count_str}, + {win_buf_count_str} * sizeof({win_buffer_dtype}), sizeof({win_buffer_dtype}), MPI_INFO_NULL, {comm},