From 44e14624b1d4ed3961234f87c94b800517a2092e Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 18 Aug 2023 16:00:34 +0800 Subject: [PATCH 01/28] Added win create library node and its test --- dace/libraries/mpi/nodes/__init__.py | 1 + dace/libraries/mpi/nodes/win_create.py | 82 ++++++++++++++++++++++++++ tests/library/mpi/win_create_test.py | 76 ++++++++++++++++++++++++ 3 files changed, 159 insertions(+) create mode 100644 dace/libraries/mpi/nodes/win_create.py create mode 100644 tests/library/mpi/win_create_test.py diff --git a/dace/libraries/mpi/nodes/__init__.py b/dace/libraries/mpi/nodes/__init__.py index 0cd36cc82f..3a0b2e3348 100644 --- a/dace/libraries/mpi/nodes/__init__.py +++ b/dace/libraries/mpi/nodes/__init__.py @@ -13,3 +13,4 @@ from .alltoall import Alltoall from .dummy import Dummy from .redistribute import Redistribute +from .win_create import Win_create diff --git a/dace/libraries/mpi/nodes/win_create.py b/dace/libraries/mpi/nodes/win_create.py new file mode 100644 index 0000000000..5d1bff89c6 --- /dev/null +++ b/dace/libraries/mpi/nodes/win_create.py @@ -0,0 +1,82 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library +import dace.properties +import dace.sdfg.nodes +from dace.transformation.transformation import ExpandTransformation +from .. import environments +from dace.libraries.mpi.nodes.node import MPINode + + +@dace.library.expansion +class ExpandWinCreateMPI(ExpandTransformation): + + environments = [environments.mpi.MPI] + + + @staticmethod + def expansion(node, parent_state, parent_sdfg, **kwargs): + win_buffer, win_buf_count_str = node.validate(parent_sdfg, parent_state) + win_buffer_dtype = dace.libraries.mpi.utils.MPI_DDT(win_buffer.dtype.base_type) + window_name = node.name + + node.fields = [ + f"MPI_Win {window_name}_window;" + ] + + comm = "MPI_COMM_WORLD" + if node.grid: + comm = f"__state->{node.grid}_comm" + + code = f""" + MPI_Win_create(_win_buffer, + {win_buf_count_str}, + sizeof({win_buffer_dtype}), + MPI_INFO_NULL, + {comm}, + &__state->{window_name}_window); + """ + + tasklet = dace.sdfg.nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + state_fields=node.fields, + language=dace.dtypes.Language.CPP, + side_effects=True) + + return tasklet + + +@dace.library.node +class Win_create(MPINode): + + # Global properties + implementations = { + "MPI": ExpandWinCreateMPI, + } + default_implementation = "MPI" + + grid = dace.properties.Property(dtype=str, allow_none=True, default=None) + + def __init__(self, name, grid=None, *args, **kwargs): + super().__init__(name, *args, inputs={"_win_buffer"}, outputs={"_out"}, **kwargs) + self.grid = grid + + def validate(self, sdfg, state): + """ + :return: A three-tuple (buffer, root) of the three data descriptors in the + parent SDFG. + """ + + win_buffer = None + for e in state.in_edges(self): + if e.dst_conn == "_win_buffer": + win_buffer = sdfg.arrays[e.data.data] + + win_buf_count_str = "XXX" + for _, _, _, dst_conn, data in state.in_edges(self): + if dst_conn == '_win_buffer': + dims = [str(e) for e in data.subset.size_exact()] + win_buf_count_str = "*".join(dims) + + return win_buffer, win_buf_count_str diff --git a/tests/library/mpi/win_create_test.py b/tests/library/mpi/win_create_test.py new file mode 100644 index 0000000000..f5f8b58f78 --- /dev/null +++ b/tests/library/mpi/win_create_test.py @@ -0,0 +1,76 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
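+#
+# Builds an SDFG with a single Win_create library node: the window buffer is
+# wired to the node's "_win_buffer" connector, and a transient scalar written
+# through "_out" stands in for the created window so that later RMA nodes can
+# depend on it.  The MPI expansion emits, roughly,
+#     MPI_Win_create(buf, count, sizeof(dtype), MPI_INFO_NULL, comm, &window);
+# The test requires at least two ranks and is typically launched through MPI,
+# for example:
+#     mpirun -np 2 python -m pytest -m mpi tests/library/mpi/win_create_test.py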
+import dace +from dace.sdfg import utils +import dace.dtypes as dtypes +from dace.memlet import Memlet +import dace.libraries.mpi as mpi +import dace.frontend.common.distr as comm +import numpy as np +import pytest + + +############################################################################### + + +def make_sdfg(dtype): + n = dace.symbol("n") + + sdfg = dace.SDFG("mpi_win_create") + state = sdfg.add_state("start") + + sdfg.add_array("win_buffer", [n], dtype=dtype, transient=False) + win_buffer = state.add_access("win_buffer") + + window_name = sdfg.add_window() + win_create_node = mpi.nodes.win_create.Win_create(window_name) + + state.add_edge(win_buffer, + None, + win_create_node, + '_win_buffer', + Memlet.simple(win_buffer, "0:n", num_accesses=n)) + + # for other nodes depends this window to connect + _, scal = sdfg.add_scalar(window_name, dace.int32, transient=True) + wnode = state.add_write(window_name) + state.add_edge(win_create_node, + "_out", + wnode, + None, + Memlet.from_array(window_name, scal)) + + return sdfg + + +############################################################################### + + +@pytest.mark.parametrize("implementation, dtype", [ + pytest.param("MPI", dace.float32, marks=pytest.mark.mpi), + pytest.param("MPI", dace.int32, marks=pytest.mark.mpi) +]) +def test_win_create(dtype): + from mpi4py import MPI + np_dtype = getattr(np, dtype.to_string()) + comm_world = MPI.COMM_WORLD + comm_rank = comm_world.Get_rank() + comm_size = comm_world.Get_size() + + if comm_size < 2: + raise ValueError("This test is supposed to be run with at least two processes!") + + sdfg = make_sdfg(dtype) + mpi_func = utils.distributed_compile(sdfg, comm_world) + + window_size = 10 + win_buffer = np.arange(0, window_size, dtype=np_dtype) + + mpi_func(win_buffer=win_buffer, n=window_size) + + +############################################################################### + + +if __name__ == "__main__": + test_win_create(dace.float32) + test_win_create(dace.int32) From 4ff33c2ba86ae1aa8b3c776c670c4b35d064c94d Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 18 Aug 2023 16:01:21 +0800 Subject: [PATCH 02/28] Added MPI RMA fence library node and its test --- dace/libraries/mpi/nodes/__init__.py | 1 + dace/libraries/mpi/nodes/win_fence.py | 43 ++++++++++ tests/library/mpi/win_fence_test.py | 108 ++++++++++++++++++++++++++ 3 files changed, 152 insertions(+) create mode 100644 dace/libraries/mpi/nodes/win_fence.py create mode 100644 tests/library/mpi/win_fence_test.py diff --git a/dace/libraries/mpi/nodes/__init__.py b/dace/libraries/mpi/nodes/__init__.py index 3a0b2e3348..53097461d6 100644 --- a/dace/libraries/mpi/nodes/__init__.py +++ b/dace/libraries/mpi/nodes/__init__.py @@ -14,3 +14,4 @@ from .dummy import Dummy from .redistribute import Redistribute from .win_create import Win_create +from .win_fence import Win_fence diff --git a/dace/libraries/mpi/nodes/win_fence.py b/dace/libraries/mpi/nodes/win_fence.py new file mode 100644 index 0000000000..ae2d0a0dda --- /dev/null +++ b/dace/libraries/mpi/nodes/win_fence.py @@ -0,0 +1,43 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library +import dace.properties +import dace.sdfg.nodes +from dace.transformation.transformation import ExpandTransformation +from .. 
import environments +from dace.libraries.mpi.nodes.node import MPINode + + +@dace.library.expansion +class ExpandWinFenceMPI(ExpandTransformation): + + environments = [environments.mpi.MPI] + + @staticmethod + def expansion(node, parent_state, parent_sdfg, **kwargs): + window_name = node.window_name + code = f""" + MPI_Win_fence(_assertion, __state->{window_name}_window); + """ + tasklet = dace.sdfg.nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + language=dace.dtypes.Language.CPP, + side_effects=True) + return tasklet + + +@dace.library.node +class Win_fence(MPINode): + + # Global properties + implementations = { + "MPI": ExpandWinFenceMPI, + } + default_implementation = "MPI" + + window_name = dace.properties.Property(dtype=str, default=None) + + def __init__(self, name, window_name, *args, **kwargs): + super().__init__(name, *args, inputs={"_assertion"}, outputs={"_out"}, **kwargs) + self.window_name = window_name diff --git a/tests/library/mpi/win_fence_test.py b/tests/library/mpi/win_fence_test.py new file mode 100644 index 0000000000..348945527e --- /dev/null +++ b/tests/library/mpi/win_fence_test.py @@ -0,0 +1,108 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace +from dace.sdfg import utils +import dace.dtypes as dtypes +from dace.memlet import Memlet +import dace.libraries.mpi as mpi +import dace.frontend.common.distr as comm +import numpy as np +import pytest + + +############################################################################### + + +def make_sdfg(dtype): + n = dace.symbol("n") + + sdfg = dace.SDFG("mpi_win_fence") + window_state = sdfg.add_state("create_window") + + sdfg.add_array("win_buffer", [n], dtype=dtype, transient=False) + win_buffer = window_state.add_access("win_buffer") + + window_name = sdfg.add_window() + win_create_node = mpi.nodes.win_create.Win_create(window_name) + + window_state.add_edge(win_buffer, + None, + win_create_node, + '_win_buffer', + Memlet.simple(win_buffer, "0:n", num_accesses=n)) + + # for other nodes depends this window to connect + _, scal = sdfg.add_scalar(window_name, dace.int32, transient=True) + wnode = window_state.add_write(window_name) + window_state.add_edge(win_create_node, + "_out", + wnode, + None, + Memlet.from_array(window_name, scal)) + +############################################################################### + + fence_state = sdfg.add_state("win_fence") + + sdfg.add_edge(window_state, fence_state, dace.InterstateEdge()) + + fence_name = sdfg.add_fence() + win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) + + # pseudo access for ordering + window_node = fence_state.add_access(window_name) + window_desc = sdfg.arrays[window_name] + + fence_state.add_edge(window_node, + None, + win_fence_node, + None, + Memlet.from_array(window_name, window_desc)) + + sdfg.add_array("assertion", [1], dtype=dace.int32, transient=False) + assertion_node = fence_state.add_access("assertion") + + fence_state.add_edge(assertion_node, + None, + win_fence_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(fence_name, dace.int32, transient=True) + wnode = fence_state.add_write(fence_name) + fence_state.add_edge(win_fence_node, + "_out", + wnode, + None, + Memlet.from_array(fence_name, scal)) + + return sdfg + + +############################################################################### + +@pytest.mark.parametrize("implementation, dtype", [ + pytest.param("MPI", dace.float32, 
marks=pytest.mark.mpi), + pytest.param("MPI", dace.int32, marks=pytest.mark.mpi) +]) +def test_win_fence(dtype): + from mpi4py import MPI + np_dtype = getattr(np, dtype.to_string()) + comm_world = MPI.COMM_WORLD + comm_rank = comm_world.Get_rank() + comm_size = comm_world.Get_size() + + if comm_size < 2: + raise ValueError("This test is supposed to be run with at least two processes!") + + sdfg = make_sdfg(dtype) + mpi_func = utils.distributed_compile(sdfg, comm_world) + + window_size = 10 + win_buffer = np.arange(0, window_size, dtype=np_dtype) + assertion = np.full([1], 0, dtype=np.int32) + + mpi_func(assertion=assertion, win_buffer=win_buffer, n=window_size) + +if __name__ == "__main__": + test_win_fence(dace.int32) + test_win_fence(dace.float32) From d3696e020bee85c7acab5ab90f23438cfafeff0a Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 18 Aug 2023 16:02:07 +0800 Subject: [PATCH 03/28] Added RMA put library node and its test --- dace/libraries/mpi/nodes/__init__.py | 1 + dace/libraries/mpi/nodes/win_put.py | 80 +++++++++++ tests/library/mpi/win_put_test.py | 205 +++++++++++++++++++++++++++ 3 files changed, 286 insertions(+) create mode 100644 dace/libraries/mpi/nodes/win_put.py create mode 100644 tests/library/mpi/win_put_test.py diff --git a/dace/libraries/mpi/nodes/__init__.py b/dace/libraries/mpi/nodes/__init__.py index 53097461d6..ba18c77bad 100644 --- a/dace/libraries/mpi/nodes/__init__.py +++ b/dace/libraries/mpi/nodes/__init__.py @@ -15,3 +15,4 @@ from .redistribute import Redistribute from .win_create import Win_create from .win_fence import Win_fence +from .win_put import Win_put diff --git a/dace/libraries/mpi/nodes/win_put.py b/dace/libraries/mpi/nodes/win_put.py new file mode 100644 index 0000000000..815e1b5a12 --- /dev/null +++ b/dace/libraries/mpi/nodes/win_put.py @@ -0,0 +1,80 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library +import dace.properties +import dace.sdfg.nodes +from dace.transformation.transformation import ExpandTransformation +from .. import environments +from dace.libraries.mpi.nodes.node import MPINode + + +@dace.library.expansion +class ExpandWinPutMPI(ExpandTransformation): + + environments = [environments.mpi.MPI] + + @staticmethod + def expansion(node, parent_state, parent_sdfg, **kwargs): + inbuffer, in_count_str = node.validate(parent_sdfg, parent_state) + mpi_dtype_str = dace.libraries.mpi.utils.MPI_DDT(inbuffer.dtype.base_type) + + window_name = node.window_name + + code = f""" + MPI_Put(_inbuffer, {in_count_str}, {mpi_dtype_str}, \ + _target_rank, 0, {in_count_str}, {mpi_dtype_str}, \ + __state->{window_name}_window); + """ + + tasklet = dace.sdfg.nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + language=dace.dtypes.Language.CPP, + side_effects=True) + return tasklet + + +@dace.library.node +class Win_put(MPINode): + + # Global properties + implementations = { + "MPI": ExpandWinPutMPI, + } + default_implementation = "MPI" + + window_name = dace.properties.Property(dtype=str, default=None) + + def __init__(self, name, window_name, *args, **kwargs): + super().__init__(name, *args, inputs={"_inbuffer", "_target_rank"}, outputs={"_out"}, **kwargs) + self.window_name = window_name + + def validate(self, sdfg, state): + """ + :return: A three-tuple (buffer, root) of the three data descriptors in the + parent SDFG. 
+ """ + + inbuffer, target_rank = None, None + for e in state.in_edges(self): + if e.dst_conn == "_inbuffer": + inbuffer = sdfg.arrays[e.data.data] + + in_count_str = "XXX" + for _, _, _, dst_conn, data in state.in_edges(self): + if dst_conn == '_inbuffer': + dims = [str(e) for e in data.subset.size_exact()] + in_count_str = "*".join(dims) + + # outbuffer = None + # for e in state.out_edges(self): + # if e.src_conn == "_outbuffer": + # outbuffer = sdfg.arrays[e.data.data] + # out_count_str = "XXX" + # for _, src_conn, _, _, data in state.out_edges(self): + # if src_conn == '_outbuffer': + # dims = [str(e) for e in data.subset.size_exact()] + # out_count_str = "*".join(dims) + + return inbuffer, in_count_str + diff --git a/tests/library/mpi/win_put_test.py b/tests/library/mpi/win_put_test.py new file mode 100644 index 0000000000..73b31be9b4 --- /dev/null +++ b/tests/library/mpi/win_put_test.py @@ -0,0 +1,205 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace +from dace.sdfg import utils +import dace.dtypes as dtypes +from dace.memlet import Memlet +import dace.libraries.mpi as mpi +import dace.frontend.common.distr as comm +import numpy as np +import pytest + + +############################################################################### + + +def make_sdfg(dtype): + n = dace.symbol("n") + + sdfg = dace.SDFG("mpi_win_put") + window_state = sdfg.add_state("create_window") + + sdfg.add_array("assertion", [1], dtype=dace.int32, transient=False) + sdfg.add_array("win_buffer", [n], dtype=dtype, transient=False) + sdfg.add_array("send_buffer", [n], dtype=dtype, transient=False) + sdfg.add_array("target_rank", [1], dace.dtypes.int32, transient=False) + + win_buffer = window_state.add_access("win_buffer") + + window_name = sdfg.add_window() + win_create_node = mpi.nodes.win_create.Win_create(window_name) + + window_state.add_edge(win_buffer, + None, + win_create_node, + '_win_buffer', + Memlet.simple(win_buffer, "0:n", num_accesses=n)) + + # for other nodes depends this window to connect + _, scal = sdfg.add_scalar(window_name, dace.int32, transient=True) + wnode = window_state.add_write(window_name) + window_state.add_edge(win_create_node, + "_out", + wnode, + None, + Memlet.from_array(window_name, scal)) + +############################################################################### + + fence_state_1 = sdfg.add_state("win_fence_1") + + sdfg.add_edge(window_state, fence_state_1, dace.InterstateEdge()) + + fence_name = sdfg.add_fence() + win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) + + # pseudo access for ordering + window_node = fence_state_1.add_access(window_name) + window_desc = sdfg.arrays[window_name] + + fence_state_1.add_edge(window_node, + None, + win_fence_node, + None, + Memlet.from_array(window_name, window_desc)) + + assertion_node = fence_state_1.add_access("assertion") + + fence_state_1.add_edge(assertion_node, + None, + win_fence_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(fence_name, dace.int32, transient=True) + wnode = fence_state_1.add_write(fence_name) + fence_state_1.add_edge(win_fence_node, + "_out", + wnode, + None, + Memlet.from_array(fence_name, scal)) + +############################################################################### + + put_state = sdfg.add_state("win_put") + + sdfg.add_edge(fence_state_1, put_state, dace.InterstateEdge()) + + put_name = sdfg.add_fence() + win_put_node = mpi.nodes.win_put.Win_put(put_name, window_name) 
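+    # The put node expands to an MPI_Put of the whole send buffer at target
+    # displacement 0 on this window; the edge from the preceding fence below
+    # exists only to order the put after that fence.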
+ + # pseudo access for ordering + fence_node = put_state.add_access(fence_name) + fence_desc = sdfg.arrays[fence_name] + + send_buffer = put_state.add_access("send_buffer") + + target_rank = put_state.add_access("target_rank") + + put_state.add_edge(fence_node, + None, + win_put_node, + None, + Memlet.from_array(fence_name, fence_desc)) + + put_state.add_edge(send_buffer, + None, + win_put_node, + "_inbuffer", + Memlet.simple(send_buffer, "0:n", num_accesses=n)) + + put_state.add_edge(target_rank, + None, + win_put_node, + "_target_rank", + Memlet.simple(target_rank, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(put_name, dace.int32, transient=True) + wnode = put_state.add_write(put_name) + put_state.add_edge(win_put_node, + "_out", + wnode, + None, + Memlet.from_array(put_name, scal)) + +############################################################################### + + fence_state_2 = sdfg.add_state("win_fence_2") + + sdfg.add_edge(put_state, fence_state_2, dace.InterstateEdge()) + + fence_name = sdfg.add_fence() + win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) + + # pseudo access for ordering + put_node = fence_state_2.add_access(put_name) + put_desc = sdfg.arrays[put_name] + + fence_state_2.add_edge(put_node, + None, + win_fence_node, + None, + Memlet.from_array(put_name, put_desc)) + + assertion_node = fence_state_2.add_access("assertion") + + fence_state_2.add_edge(assertion_node, + None, + win_fence_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(fence_name, dace.int32, transient=True) + wnode = fence_state_2.add_write(fence_name) + fence_state_2.add_edge(win_fence_node, + "_out", + wnode, + None, + Memlet.from_array(fence_name, scal)) + + return sdfg + + +############################################################################### + +@pytest.mark.parametrize("implementation, dtype", [ + pytest.param("MPI", dace.float32, marks=pytest.mark.mpi), + pytest.param("MPI", dace.int32, marks=pytest.mark.mpi) +]) +def test_win_put(dtype): + from mpi4py import MPI + np_dtype = getattr(np, dtype.to_string()) + comm_world = MPI.COMM_WORLD + comm_rank = comm_world.Get_rank() + comm_size = comm_world.Get_size() + + if comm_size < 2: + raise ValueError("This test is supposed to be run with at least two processes!") + + sdfg = make_sdfg(dtype) + mpi_func = utils.distributed_compile(sdfg, comm_world) + + window_size = 10 + win_buffer = np.full(window_size, comm_rank, dtype=np_dtype) + send_buffer = np.full(window_size, comm_rank, dtype=np_dtype) + + target_rank = np.array([(comm_rank + 1) % comm_size], dtype=np.int32) + + assertion = np.full([1], 0, dtype=np.int32) + + # print(comm_rank, win_buffer) + + mpi_func(assertion=assertion, + win_buffer=win_buffer, + send_buffer=send_buffer, + target_rank=target_rank, + n=window_size) + + # print(comm_rank, win_buffer) + + correct_data = np.full(window_size, (comm_rank - 1) % comm_size, dtype=np_dtype) + if (not np.allclose(win_buffer, correct_data)): + raise (ValueError("The received values are not what I expected on root.")) + +if __name__ == "__main__": + test_win_put(dace.int32) + test_win_put(dace.float32) From b92ccbb1bf6d76815359e138aa53148455f6ae50 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 18 Aug 2023 16:21:20 +0800 Subject: [PATCH 04/28] Added RMA get library node and its test --- dace/libraries/mpi/nodes/__init__.py | 1 + dace/libraries/mpi/nodes/win_get.py | 68 +++++++++ dace/libraries/mpi/nodes/win_put.py | 13 +- 
tests/library/mpi/win_get_test.py | 205 +++++++++++++++++++++++++++ 4 files changed, 275 insertions(+), 12 deletions(-) create mode 100644 dace/libraries/mpi/nodes/win_get.py create mode 100644 tests/library/mpi/win_get_test.py diff --git a/dace/libraries/mpi/nodes/__init__.py b/dace/libraries/mpi/nodes/__init__.py index ba18c77bad..5400bd45de 100644 --- a/dace/libraries/mpi/nodes/__init__.py +++ b/dace/libraries/mpi/nodes/__init__.py @@ -16,3 +16,4 @@ from .win_create import Win_create from .win_fence import Win_fence from .win_put import Win_put +from .win_get import Win_get diff --git a/dace/libraries/mpi/nodes/win_get.py b/dace/libraries/mpi/nodes/win_get.py new file mode 100644 index 0000000000..e05a5d6195 --- /dev/null +++ b/dace/libraries/mpi/nodes/win_get.py @@ -0,0 +1,68 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library +import dace.properties +import dace.sdfg.nodes +from dace.transformation.transformation import ExpandTransformation +from .. import environments +from dace.libraries.mpi.nodes.node import MPINode + + +@dace.library.expansion +class ExpandWinGetMPI(ExpandTransformation): + + environments = [environments.mpi.MPI] + + @staticmethod + def expansion(node, parent_state, parent_sdfg, **kwargs): + outbuffer, out_count_str = node.validate(parent_sdfg, parent_state) + mpi_dtype_str = dace.libraries.mpi.utils.MPI_DDT(outbuffer.dtype.base_type) + + window_name = node.window_name + + code = f""" + MPI_Get(_outbuffer, {out_count_str}, {mpi_dtype_str}, \ + _target_rank, 0, {out_count_str}, {mpi_dtype_str}, \ + __state->{window_name}_window); + """ + + tasklet = dace.sdfg.nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + language=dace.dtypes.Language.CPP, + side_effects=True) + return tasklet + + +@dace.library.node +class Win_get(MPINode): + + # Global properties + implementations = { + "MPI": ExpandWinGetMPI, + } + default_implementation = "MPI" + + window_name = dace.properties.Property(dtype=str, default=None) + + def __init__(self, name, window_name, *args, **kwargs): + super().__init__(name, *args, inputs={"_target_rank"}, outputs={"_out", "_outbuffer"}, **kwargs) + self.window_name = window_name + + def validate(self, sdfg, state): + """ + :return: A three-tuple (buffer, root) of the three data descriptors in the + parent SDFG. + """ + + outbuffer = None + for e in state.out_edges(self): + if e.src_conn == "_outbuffer": + outbuffer = sdfg.arrays[e.data.data] + out_count_str = "XXX" + for _, src_conn, _, _, data in state.out_edges(self): + if src_conn == '_outbuffer': + dims = [str(e) for e in data.subset.size_exact()] + out_count_str = "*".join(dims) + + return outbuffer, out_count_str diff --git a/dace/libraries/mpi/nodes/win_put.py b/dace/libraries/mpi/nodes/win_put.py index 815e1b5a12..6dd23a7324 100644 --- a/dace/libraries/mpi/nodes/win_put.py +++ b/dace/libraries/mpi/nodes/win_put.py @@ -55,7 +55,7 @@ def validate(self, sdfg, state): parent SDFG. 
""" - inbuffer, target_rank = None, None + inbuffer = None for e in state.in_edges(self): if e.dst_conn == "_inbuffer": inbuffer = sdfg.arrays[e.data.data] @@ -65,16 +65,5 @@ def validate(self, sdfg, state): if dst_conn == '_inbuffer': dims = [str(e) for e in data.subset.size_exact()] in_count_str = "*".join(dims) - - # outbuffer = None - # for e in state.out_edges(self): - # if e.src_conn == "_outbuffer": - # outbuffer = sdfg.arrays[e.data.data] - # out_count_str = "XXX" - # for _, src_conn, _, _, data in state.out_edges(self): - # if src_conn == '_outbuffer': - # dims = [str(e) for e in data.subset.size_exact()] - # out_count_str = "*".join(dims) return inbuffer, in_count_str - diff --git a/tests/library/mpi/win_get_test.py b/tests/library/mpi/win_get_test.py new file mode 100644 index 0000000000..656c129f80 --- /dev/null +++ b/tests/library/mpi/win_get_test.py @@ -0,0 +1,205 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace +from dace.sdfg import utils +import dace.dtypes as dtypes +from dace.memlet import Memlet +import dace.libraries.mpi as mpi +import dace.frontend.common.distr as comm +import numpy as np +import pytest + + +############################################################################### + + +def make_sdfg(dtype): + n = dace.symbol("n") + + sdfg = dace.SDFG("mpi_win_put") + window_state = sdfg.add_state("create_window") + + sdfg.add_array("assertion", [1], dtype=dace.int32, transient=False) + sdfg.add_array("win_buffer", [n], dtype=dtype, transient=False) + sdfg.add_array("receive_buffer", [n], dtype=dtype, transient=False) + sdfg.add_array("target_rank", [1], dace.dtypes.int32, transient=False) + + win_buffer = window_state.add_access("win_buffer") + + window_name = sdfg.add_window() + win_create_node = mpi.nodes.win_create.Win_create(window_name) + + window_state.add_edge(win_buffer, + None, + win_create_node, + '_win_buffer', + Memlet.simple(win_buffer, "0:n", num_accesses=n)) + + # for other nodes depends this window to connect + _, scal = sdfg.add_scalar(window_name, dace.int32, transient=True) + wnode = window_state.add_write(window_name) + window_state.add_edge(win_create_node, + "_out", + wnode, + None, + Memlet.from_array(window_name, scal)) + +############################################################################### + + fence_state_1 = sdfg.add_state("win_fence_1") + + sdfg.add_edge(window_state, fence_state_1, dace.InterstateEdge()) + + fence_name = sdfg.add_fence() + win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) + + # pseudo access for ordering + window_node = fence_state_1.add_access(window_name) + window_desc = sdfg.arrays[window_name] + + fence_state_1.add_edge(window_node, + None, + win_fence_node, + None, + Memlet.from_array(window_name, window_desc)) + + assertion_node = fence_state_1.add_access("assertion") + + fence_state_1.add_edge(assertion_node, + None, + win_fence_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(fence_name, dace.int32, transient=True) + wnode = fence_state_1.add_write(fence_name) + fence_state_1.add_edge(win_fence_node, + "_out", + wnode, + None, + Memlet.from_array(fence_name, scal)) + +############################################################################### + + get_state = sdfg.add_state("win_get") + + sdfg.add_edge(fence_state_1, get_state, dace.InterstateEdge()) + + get_name = sdfg.add_fence() + win_put_node = mpi.nodes.win_get.Win_get(get_name, window_name) + + # pseudo access for 
ordering + fence_node = get_state.add_access(fence_name) + fence_desc = sdfg.arrays[fence_name] + + target_rank = get_state.add_access("target_rank") + + get_state.add_edge(fence_node, + None, + win_put_node, + None, + Memlet.from_array(fence_name, fence_desc)) + + get_state.add_edge(target_rank, + None, + win_put_node, + "_target_rank", + Memlet.simple(target_rank, "0:1", num_accesses=1)) + + + receive_buffer = get_state.add_write("receive_buffer") + get_state.add_edge(win_put_node, + "_outbuffer", + receive_buffer, + None, + Memlet.simple(receive_buffer, "0:n", num_accesses=n)) + + _, scal = sdfg.add_scalar(get_name, dace.int32, transient=True) + wnode = get_state.add_write(get_name) + get_state.add_edge(win_put_node, + "_out", + wnode, + None, + Memlet.from_array(get_name, scal)) + +############################################################################### + + fence_state_2 = sdfg.add_state("win_fence_2") + + sdfg.add_edge(get_state, fence_state_2, dace.InterstateEdge()) + + fence_name = sdfg.add_fence() + win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) + + # pseudo access for ordering + put_node = fence_state_2.add_access(get_name) + put_desc = sdfg.arrays[get_name] + + fence_state_2.add_edge(put_node, + None, + win_fence_node, + None, + Memlet.from_array(get_name, put_desc)) + + assertion_node = fence_state_2.add_access("assertion") + + fence_state_2.add_edge(assertion_node, + None, + win_fence_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(fence_name, dace.int32, transient=True) + wnode = fence_state_2.add_write(fence_name) + fence_state_2.add_edge(win_fence_node, + "_out", + wnode, + None, + Memlet.from_array(fence_name, scal)) + + return sdfg + + +############################################################################### + +@pytest.mark.parametrize("implementation, dtype", [ + pytest.param("MPI", dace.float32, marks=pytest.mark.mpi), + pytest.param("MPI", dace.int32, marks=pytest.mark.mpi) +]) +def test_win_put(dtype): + from mpi4py import MPI + np_dtype = getattr(np, dtype.to_string()) + comm_world = MPI.COMM_WORLD + comm_rank = comm_world.Get_rank() + comm_size = comm_world.Get_size() + + if comm_size < 2: + raise ValueError("This test is supposed to be run with at least two processes!") + + sdfg = make_sdfg(dtype) + mpi_func = utils.distributed_compile(sdfg, comm_world) + + window_size = 10 + win_buffer = np.full(window_size, comm_rank, dtype=np_dtype) + receive_buffer = np.full(window_size, comm_rank, dtype=np_dtype) + + target_rank = np.array([(comm_rank + 1) % comm_size], dtype=np.int32) + + assertion = np.full([1], 0, dtype=np.int32) + + print(comm_rank, receive_buffer) + + mpi_func(assertion=assertion, + win_buffer=win_buffer, + receive_buffer=receive_buffer, + target_rank=target_rank, + n=window_size) + + print(comm_rank, receive_buffer) + + correct_data = np.full(window_size, (comm_rank + 1) % comm_size, dtype=np_dtype) + if (not np.allclose(receive_buffer, correct_data)): + raise (ValueError("The received values are not what I expected on root.")) + +if __name__ == "__main__": + test_win_put(dace.int32) + # test_win_put(dace.float32) From 6ebaed5e1fa23c58b29cfe2e23706675e6296855 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 18 Aug 2023 20:17:28 +0800 Subject: [PATCH 05/28] Removed debug msg in put/get tests Removed debug msg in put/get tests Renamed sdfg --- tests/library/mpi/win_get_test.py | 8 ++------ tests/library/mpi/win_put_test.py | 4 ---- 2 files changed, 
2 insertions(+), 10 deletions(-) diff --git a/tests/library/mpi/win_get_test.py b/tests/library/mpi/win_get_test.py index 656c129f80..697f4f6d54 100644 --- a/tests/library/mpi/win_get_test.py +++ b/tests/library/mpi/win_get_test.py @@ -15,7 +15,7 @@ def make_sdfg(dtype): n = dace.symbol("n") - sdfg = dace.SDFG("mpi_win_put") + sdfg = dace.SDFG("mpi_win_get") window_state = sdfg.add_state("create_window") sdfg.add_array("assertion", [1], dtype=dace.int32, transient=False) @@ -186,20 +186,16 @@ def test_win_put(dtype): assertion = np.full([1], 0, dtype=np.int32) - print(comm_rank, receive_buffer) - mpi_func(assertion=assertion, win_buffer=win_buffer, receive_buffer=receive_buffer, target_rank=target_rank, n=window_size) - print(comm_rank, receive_buffer) - correct_data = np.full(window_size, (comm_rank + 1) % comm_size, dtype=np_dtype) if (not np.allclose(receive_buffer, correct_data)): raise (ValueError("The received values are not what I expected on root.")) if __name__ == "__main__": test_win_put(dace.int32) - # test_win_put(dace.float32) + test_win_put(dace.float32) diff --git a/tests/library/mpi/win_put_test.py b/tests/library/mpi/win_put_test.py index 73b31be9b4..ce1c6ba2e3 100644 --- a/tests/library/mpi/win_put_test.py +++ b/tests/library/mpi/win_put_test.py @@ -186,16 +186,12 @@ def test_win_put(dtype): assertion = np.full([1], 0, dtype=np.int32) - # print(comm_rank, win_buffer) - mpi_func(assertion=assertion, win_buffer=win_buffer, send_buffer=send_buffer, target_rank=target_rank, n=window_size) - # print(comm_rank, win_buffer) - correct_data = np.full(window_size, (comm_rank - 1) % comm_size, dtype=np_dtype) if (not np.allclose(win_buffer, correct_data)): raise (ValueError("The received values are not what I expected on root.")) From b4e9e6579735e8cc0c8d6a7523a6eb6c7d33cb27 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 18 Aug 2023 21:27:04 +0800 Subject: [PATCH 06/28] Updated RMA ops administration in sdfg.py Removed a space Updated to remove put in get --- dace/distr_types.py | 24 ++++++++++++++ dace/sdfg/sdfg.py | 49 +++++++++++++++++++++++++++-- tests/library/mpi/win_fence_test.py | 2 +- tests/library/mpi/win_get_test.py | 31 +++++++++--------- tests/library/mpi/win_put_test.py | 6 ++-- 5 files changed, 89 insertions(+), 23 deletions(-) diff --git a/dace/distr_types.py b/dace/distr_types.py index 1b595a1b84..77e4730ad1 100644 --- a/dace/distr_types.py +++ b/dace/distr_types.py @@ -598,3 +598,27 @@ def exit_code(self, sdfg): delete[] __state->{self.name}_self_dst; delete[] __state->{self.name}_self_size; """ + +@make_properties +class RMA_window(object): + """ + RMA_window is the descriptor class for MPI Remote Memory Access window + Real window creation is implemented in mpi.nodes.win_create.Win_create + """ + + name = Property(dtype=str, desc="The name of new window.") + def __init__(self, + name: str): + self.name = name + self._validate() + + def validate(self): + """ Validate the correctness of this object. + Raises an exception on error. """ + self._validate() + + # Validation of this class is in a separate function, so that this + # class can call `_validate()` without calling the subclasses' + # `validate` function. 
+ def _validate(self): + return True diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index f3a37ef08c..1fb32cdbf8 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -32,7 +32,7 @@ from dace.sdfg.graph import OrderedDiGraph, Edge, SubgraphView from dace.sdfg.state import SDFGState from dace.sdfg.propagation import propagate_memlets_sdfg -from dace.distr_types import ProcessGrid, SubArray, RedistrArray +from dace.distr_types import ProcessGrid, SubArray, RedistrArray, RMA_window from dace.dtypes import validate_name from dace.properties import (DebugInfoProperty, EnumProperty, ListProperty, make_properties, Property, CodeProperty, TransformationHistProperty, OptionalSDFGReferenceProperty, DictProperty, CodeBlock) @@ -409,6 +409,16 @@ class SDFG(OrderedDiGraph[SDFGState, InterstateEdge]): desc="Process-grid descriptors for this SDFG", to_json=_arrays_to_json, from_json=_arrays_from_json) + _windows = DictProperty(str, + RMA_window, + desc="MPI RMA window descriptors for this SDFG", + to_json=_arrays_to_json, + from_json=_arrays_from_json) + _rma_ops = DictProperty(str, + str, + desc="MPI RMA fence descriptors for this SDFG", + to_json=_arrays_to_json, + from_json=_arrays_from_json) _subarrays = DictProperty(str, SubArray, desc="Sub-array descriptors for this SDFG", @@ -477,6 +487,8 @@ def __init__(self, # Grid-distribution-related fields self._pgrids = {} + self._windows = {} + self._rma_ops = {} self._subarrays = {} self._rdistrarrays = {} @@ -647,6 +659,16 @@ def process_grids(self): """ Returns a dictionary of process-grid descriptors (`ProcessGrid` objects) used in this SDFG. """ return self._pgrids + @property + def rma_windows(self): + """ Returns a dictionary of RMA window descriptors (`RMA_window` objects) used in this SDFG. """ + return self._windows + + @property + def rma_ops(self): + """ Returns a dictionary of RMA operations descriptors (an empty string) used in this SDFG. """ + return self._rma_ops + @property def subarrays(self): """ Returns a dictionary of sub-array descriptors (`SubArray` objects) used in this SDFG. """ @@ -1666,8 +1688,9 @@ def add_state_after(self, state: 'SDFGState', label=None, is_start_state=False) def _find_new_name(self, name: str): """ Tries to find a new name by adding an underscore and a number. """ - names = (self._arrays.keys() | self.constants_prop.keys() | self._pgrids.keys() | self._subarrays.keys() - | self._rdistrarrays.keys()) + names = (self._arrays.keys() | self.constants_prop.keys() | self._pgrids.keys() | + self._subarrays.keys() | self._rdistrarrays.keys() | self._windows.keys() | + self._rma_ops.keys()) return dt.find_new_name(name, names) def find_new_constant(self, name: str): @@ -2043,6 +2066,26 @@ def add_pgrid(self, return grid_name + def add_window(self): + """ Adds a RMA window to the RMA window descriptor store. + """ + + window_name = self._find_new_name('__win') + + self._windows[window_name] = RMA_window(window_name) + + return window_name + + def add_rma_ops(self): + """ Adds a RMA op to the RMA ops descriptor store. 
+ """ + + rma_op_name = self._find_new_name('__win_op') + + self._rma_ops[rma_op_name] = "" + + return rma_op_name + def add_subarray(self, dtype: dtypes.typeclass, shape: ShapeType, diff --git a/tests/library/mpi/win_fence_test.py b/tests/library/mpi/win_fence_test.py index 348945527e..e114e3c7cf 100644 --- a/tests/library/mpi/win_fence_test.py +++ b/tests/library/mpi/win_fence_test.py @@ -45,7 +45,7 @@ def make_sdfg(dtype): sdfg.add_edge(window_state, fence_state, dace.InterstateEdge()) - fence_name = sdfg.add_fence() + fence_name = sdfg.add_rma_ops() win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) # pseudo access for ordering diff --git a/tests/library/mpi/win_get_test.py b/tests/library/mpi/win_get_test.py index 697f4f6d54..18ac9d7cfb 100644 --- a/tests/library/mpi/win_get_test.py +++ b/tests/library/mpi/win_get_test.py @@ -49,7 +49,7 @@ def make_sdfg(dtype): sdfg.add_edge(window_state, fence_state_1, dace.InterstateEdge()) - fence_name = sdfg.add_fence() + fence_name = sdfg.add_rma_ops() win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) # pseudo access for ordering @@ -84,8 +84,8 @@ def make_sdfg(dtype): sdfg.add_edge(fence_state_1, get_state, dace.InterstateEdge()) - get_name = sdfg.add_fence() - win_put_node = mpi.nodes.win_get.Win_get(get_name, window_name) + get_name = sdfg.add_rma_ops() + win_get_node = mpi.nodes.win_get.Win_get(get_name, window_name) # pseudo access for ordering fence_node = get_state.add_access(fence_name) @@ -95,19 +95,18 @@ def make_sdfg(dtype): get_state.add_edge(fence_node, None, - win_put_node, + win_get_node, None, Memlet.from_array(fence_name, fence_desc)) get_state.add_edge(target_rank, None, - win_put_node, + win_get_node, "_target_rank", Memlet.simple(target_rank, "0:1", num_accesses=1)) - receive_buffer = get_state.add_write("receive_buffer") - get_state.add_edge(win_put_node, + get_state.add_edge(win_get_node, "_outbuffer", receive_buffer, None, @@ -115,7 +114,7 @@ def make_sdfg(dtype): _, scal = sdfg.add_scalar(get_name, dace.int32, transient=True) wnode = get_state.add_write(get_name) - get_state.add_edge(win_put_node, + get_state.add_edge(win_get_node, "_out", wnode, None, @@ -127,18 +126,18 @@ def make_sdfg(dtype): sdfg.add_edge(get_state, fence_state_2, dace.InterstateEdge()) - fence_name = sdfg.add_fence() + fence_name = sdfg.add_rma_ops() win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) # pseudo access for ordering - put_node = fence_state_2.add_access(get_name) - put_desc = sdfg.arrays[get_name] + get_node = fence_state_2.add_access(get_name) + get_desc = sdfg.arrays[get_name] - fence_state_2.add_edge(put_node, + fence_state_2.add_edge(get_node, None, win_fence_node, None, - Memlet.from_array(get_name, put_desc)) + Memlet.from_array(get_name, get_desc)) assertion_node = fence_state_2.add_access("assertion") @@ -165,7 +164,7 @@ def make_sdfg(dtype): pytest.param("MPI", dace.float32, marks=pytest.mark.mpi), pytest.param("MPI", dace.int32, marks=pytest.mark.mpi) ]) -def test_win_put(dtype): +def test_win_get(dtype): from mpi4py import MPI np_dtype = getattr(np, dtype.to_string()) comm_world = MPI.COMM_WORLD @@ -197,5 +196,5 @@ def test_win_put(dtype): raise (ValueError("The received values are not what I expected on root.")) if __name__ == "__main__": - test_win_put(dace.int32) - test_win_put(dace.float32) + test_win_get(dace.int32) + test_win_get(dace.float32) diff --git a/tests/library/mpi/win_put_test.py b/tests/library/mpi/win_put_test.py index ce1c6ba2e3..4352e7294d 
100644 --- a/tests/library/mpi/win_put_test.py +++ b/tests/library/mpi/win_put_test.py @@ -49,7 +49,7 @@ def make_sdfg(dtype): sdfg.add_edge(window_state, fence_state_1, dace.InterstateEdge()) - fence_name = sdfg.add_fence() + fence_name = sdfg.add_rma_ops() win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) # pseudo access for ordering @@ -84,7 +84,7 @@ def make_sdfg(dtype): sdfg.add_edge(fence_state_1, put_state, dace.InterstateEdge()) - put_name = sdfg.add_fence() + put_name = sdfg.add_rma_ops() win_put_node = mpi.nodes.win_put.Win_put(put_name, window_name) # pseudo access for ordering @@ -127,7 +127,7 @@ def make_sdfg(dtype): sdfg.add_edge(put_state, fence_state_2, dace.InterstateEdge()) - fence_name = sdfg.add_fence() + fence_name = sdfg.add_rma_ops() win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) # pseudo access for ordering From 5be096288852e6b401f5ab390d3a9c1e3897f986 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 18 Aug 2023 21:52:54 +0800 Subject: [PATCH 07/28] Implemented replacement for MPI RMA win_create, fence, put, get --- dace/frontend/common/distr.py | 241 +++++++++++++++++++++++++++++++ dace/frontend/python/newast.py | 5 + tests/library/mpi/mpi4py_test.py | 68 +++++++++ 3 files changed, 314 insertions(+) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index d6f22da358..e3107457a4 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -15,6 +15,27 @@ RankType = Union[Integral, str, symbolic.symbol, symbolic.SymExpr, symbolic.sympy.Basic] ProgramVisitor = 'dace.frontend.python.newast.ProgramVisitor' +# a helper function for getting an access node by argument name +# creates a scalar if it's a number +def _get_int_arg_node(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + argument: Union[str, sp.Expr, Number] + ): + if isinstance(argument, str) and argument in sdfg.arrays.keys(): + arg_name = argument + arg_node = state.add_read(arg_name) + else: + # create a transient scalar and take its name + arg_name = _define_local_scalar(pv, sdfg, state, dace.int32) + arg_node = state.add_access(arg_name) + # every tasklet is in different scope, no need to worry about name confilct + color_tasklet = state.add_tasklet(f'_set_{arg_name}_', {}, {'__out'}, f'__out = {argument}') + state.add_edge(color_tasklet, '__out', arg_node, None, Memlet.simple(arg_node, '0')) + + return arg_name, arg_node + + ##### MPI Cartesian Communicators @@ -894,6 +915,226 @@ def _wait(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, request: str): return None +@oprepo.replaces('mpi4py.MPI.Win.Create') +@oprepo.replaces('dace.Win.Create') +def _rma_window_create(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + buffer: str, + comm: Union[str, ShapeType], + grid: str = None): + """ Adds a RMA window to the DaCe Program. + + :param buffer: The name of window buffer. + :param comm: A dummy input for compatibility with mpi4py + :process_grid: Name of the process-grid for collective scatter/gather operations. + :return: Name of the window. 
+ """ + + from dace.libraries.mpi.nodes.win_create import Win_create + + # fine a new window name + window_name = sdfg.add_window() + + window_node = Win_create(window_name, grid) + + buf_desc = sdfg.arrays[buffer] + buf_node = state.add_read(buffer) + state.add_edge(buf_node, + None, + window_node, + '_win_buffer', + Memlet.from_array(buffer, buf_desc)) + + # Pseudo-writing for newast.py #3195 check and complete Processcomm creation + _, scal = sdfg.add_scalar(window_name, dace.int32, transient=True) + wnode = state.add_write(window_name) + state.add_edge(window_node, + "_out", + wnode, + None, + Memlet.from_array(window_name, scal)) + + return window_name + + +@oprepo.replaces_method('RMA_window', 'Fence') +def _rma_fence(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + window_name: str, + assertion: Union[str, sp.Expr, Number] = 0): + """ Adds a RMA fence to the DaCe Program. + + :param window_name: The name of the window to be sychronized. + :param assertion: A value or scalar for fence assertion. + :return: Name of the fence. + """ + + from dace.libraries.mpi.nodes.win_fence import Win_fence + + # fine a new fence name + fence_name = sdfg.add_rma_ops() + + _, assertion_node = _get_int_arg_node(pv, sdfg, state, assertion) + + fence_node = Win_fence(fence_name, window_name) + + # check for the last RMA operation + all_rma_ops_name = list(sdfg._rma_ops.keys()) + if len(all_rma_ops_name) == 1: + last_rma_op_name = window_name + else: + last_rma_op_name = all_rma_ops_name[all_rma_ops_name.index(fence_name) - 1] + + last_rma_op_node = state.add_read(last_rma_op_name) + last_rma_op_desc = sdfg.arrays[last_rma_op_name] + + # for window fence ordering + state.add_edge(last_rma_op_node, + None, + fence_node, + None, + Memlet.from_array(last_rma_op_name, last_rma_op_desc)) + + state.add_edge(assertion_node, + None, + fence_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + # Pseudo-writing for newast.py #3195 check and complete Processcomm creation + _, scal = sdfg.add_scalar(fence_name, dace.int32, transient=True) + wnode = state.add_write(fence_name) + state.add_edge(fence_node, + "_out", + wnode, + None, + Memlet.from_array(fence_name, scal)) + + return window_name + + +@oprepo.replaces_method('RMA_window', 'Put') +def _rma_put(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + window_name: str, + origin: str, + target_rank: Union[str, sp.Expr, Number] = 0): + """ Initiate a RMA put for the DaCe Program. + + :param window_name: The name of the window to be sychronized. + :param origin: The name of origin buffer. + :target_rank: A value or scalar of the target rank. + :return: Name of the new RMA put descriptor. 
+ """ + + from dace.libraries.mpi.nodes.win_put import Win_put + + put_name = sdfg.add_rma_ops() + + # check for the last RMA operation + all_rma_ops_name = list(sdfg._rma_ops.keys()) + last_rma_op_name = all_rma_ops_name[all_rma_ops_name.index(put_name) - 1] + + put_node = Win_put(put_name, window_name) + + last_rma_op_node = state.add_read(last_rma_op_name) + last_rma_op_desc = sdfg.arrays[last_rma_op_name] + state.add_edge(last_rma_op_node, + None, + put_node, + None, + Memlet.from_array(last_rma_op_name, last_rma_op_desc)) + + origin_node = state.add_read(origin) + origin_desc = sdfg.arrays[origin] + state.add_edge(origin_node, + None, + put_node, + '_inbuffer', + Memlet.from_array(origin, origin_desc)) + + _, target_rank_node = _get_int_arg_node(pv, sdfg, state, target_rank) + state.add_edge(target_rank_node, + None, + put_node, + '_target_rank', + Memlet.simple(target_rank_node, "0:1", num_accesses=1)) + + # Pseudo-writing for newast.py #3195 check and complete Processcomm creation + _, scal = sdfg.add_scalar(put_name, dace.int32, transient=True) + wnode = state.add_write(put_name) + state.add_edge(put_node, + "_out", + wnode, + None, + Memlet.from_array(put_name, scal)) + + return put_name + + +@oprepo.replaces_method('RMA_window', 'Get') +def _rma_get(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + window_name: str, + origin: str, + target_rank: Union[str, sp.Expr, Number] = 0): + """ Initiate a RMA get for the DaCe Program. + + :param window_name: The name of the window to be sychronized. + :param origin: The name of origin buffer. + :target_rank: A value or scalar of the target rank. + :return: Name of the new RMA get descriptor. + """ + + from dace.libraries.mpi.nodes.win_get import Win_get + + get_name = sdfg.add_rma_ops() + + # check for the last RMA operation + all_rma_ops_name = list(sdfg._rma_ops.keys()) + last_rma_op_name = all_rma_ops_name[all_rma_ops_name.index(get_name) - 1] + + get_node = Win_get(get_name, window_name) + + last_rma_op_node = state.add_read(last_rma_op_name) + last_rma_op_desc = sdfg.arrays[last_rma_op_name] + state.add_edge(last_rma_op_node, + None, + get_node, + None, + Memlet.from_array(last_rma_op_name, last_rma_op_desc)) + + _, target_rank_node = _get_int_arg_node(pv, sdfg, state, target_rank) + state.add_edge(target_rank_node, + None, + get_node, + '_target_rank', + Memlet.simple(target_rank_node, "0:1", num_accesses=1)) + + origin_node = state.add_write(origin) + origin_desc = sdfg.arrays[origin] + state.add_edge(get_node, + '_outbuffer', + origin_node, + None, + Memlet.from_array(origin, origin_desc)) + + # Pseudo-writing for newast.py #3195 check and complete Processcomm creation + _, scal = sdfg.add_scalar(get_name, dace.int32, transient=True) + wnode = state.add_write(get_name) + state.add_edge(get_node, + '_out', + wnode, + None, + Memlet.from_array(get_name, scal)) + + return get_name + + @oprepo.replaces('dace.comm.Subarray') def _subarray(pv: ProgramVisitor, sdfg: SDFG, diff --git a/dace/frontend/python/newast.py b/dace/frontend/python/newast.py index c9d92b7860..5e92f6b487 100644 --- a/dace/frontend/python/newast.py +++ b/dace/frontend/python/newast.py @@ -1307,6 +1307,9 @@ def defined(self): result.update( {k: self.sdfg.process_grids[v] for k, v in self.variables.items() if v in self.sdfg.process_grids}) + result.update( + {k: self.sdfg.rma_windows[v] + for k, v in self.variables.items() if v in self.sdfg.rma_windows}) try: from mpi4py import MPI result.update({k: v for k, v in self.globals.items() if isinstance(v, MPI.Comm)}) 
@@ -4686,6 +4689,8 @@ def _gettype(self, opnode: ast.AST) -> List[Tuple[str, str]]: for operand in operands: if isinstance(operand, str) and operand in self.sdfg.process_grids: result.append((operand, type(self.sdfg.process_grids[operand]).__name__)) + elif isinstance(operand, str) and operand in self.sdfg.rma_windows: + result.append((operand, type(self.sdfg.rma_windows[operand]).__name__)) elif isinstance(operand, str) and operand in self.sdfg.arrays: result.append((operand, type(self.sdfg.arrays[operand]))) elif isinstance(operand, str) and operand in self.scope_arrays: diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index 52b5deb7a8..ca185077b8 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -39,6 +39,72 @@ def comm_world_bcast(A: dace.int32[10]): assert (np.array_equal(A, A_ref)) +@pytest.mark.mpi +def test_RMA_put(): + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + @dace.program + def mpi4py_rma_put(win_buf: dace.int32[10], send_buf: dace.int32[10], rank: dace.int32): + win = MPI.Win.Create(win_buf, comm=commworld) + win.Fence(0) + win.Put(send_buf, target_rank=rank) + win.Fence(0) + + if size < 2: + raise ValueError("Please run this test with at least two processes.") + + sdfg = None + if rank == 0: + sdfg = mpi4py_rma_put.to_sdfg() + func = utils.distributed_compile(sdfg, commworld) + + window_size = 10 + win_buffer = np.full(window_size, rank, dtype=np.int32) + win_buffer_ref = np.full(window_size, rank, dtype=np.int32) + send_buffer = np.full(window_size, rank, dtype=np.int32) + + func(win_buf=win_buffer, send_buf=send_buffer, rank=((rank + 1) % size)) + mpi4py_rma_put.f(win_buf=win_buffer_ref, send_buf=send_buffer, rank=((rank + 1) % size)) + + assert (np.array_equal(win_buffer, win_buffer_ref)) + + +@pytest.mark.mpi +def test_RMA_get(): + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + @dace.program + def mpi4py_rma_get(win_buf: dace.int32[10], recv_buf: dace.int32[10], rank: dace.int32): + win = MPI.Win.Create(win_buf, comm=commworld) + win.Fence(0) + win.Get(recv_buf, target_rank=rank) + win.Fence(0) + + if size < 2: + raise ValueError("Please run this test with at least two processes.") + + sdfg = None + if rank == 0: + sdfg = mpi4py_rma_get.to_sdfg() + func = utils.distributed_compile(sdfg, commworld) + + window_size = 10 + win_buffer = np.full(window_size, rank, dtype=np.int32) + recv_buf = np.full(window_size, rank, dtype=np.int32) + recv_buf_ref = np.full(window_size, rank, dtype=np.int32) + + func(win_buf=win_buffer, recv_buf=recv_buf, rank=((rank + 1) % size)) + mpi4py_rma_get.f(win_buf=win_buffer, recv_buf=recv_buf_ref, rank=((rank + 1) % size)) + + assert (np.array_equal(recv_buf, recv_buf_ref)) + + @pytest.mark.mpi def test_external_comm_bcast(): @@ -348,3 +414,5 @@ def mpi4py_alltoall(rank: dace.int32, size: dace.compiletime): test_isend_irecv() test_send_recv() test_alltoall() + test_RMA_put() + test_RMA_get() From ebf8eeaa5617f9fef64e3f9e4fc50cdb45fe91c5 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 25 Aug 2023 14:04:20 +0800 Subject: [PATCH 08/28] Added an extra connector to RMA put,get for ordering --- dace/frontend/common/distr.py | 4 ++-- dace/libraries/mpi/nodes/win_get.py | 2 +- dace/libraries/mpi/nodes/win_put.py | 2 +- tests/library/mpi/win_get_test.py | 2 +- tests/library/mpi/win_put_test.py | 2 +- 5 files changed, 6 insertions(+), 6 
deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index e3107457a4..7b79556eb2 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -1045,7 +1045,7 @@ def _rma_put(pv: ProgramVisitor, state.add_edge(last_rma_op_node, None, put_node, - None, + "_in", Memlet.from_array(last_rma_op_name, last_rma_op_desc)) origin_node = state.add_read(origin) @@ -1105,7 +1105,7 @@ def _rma_get(pv: ProgramVisitor, state.add_edge(last_rma_op_node, None, get_node, - None, + "_in", Memlet.from_array(last_rma_op_name, last_rma_op_desc)) _, target_rank_node = _get_int_arg_node(pv, sdfg, state, target_rank) diff --git a/dace/libraries/mpi/nodes/win_get.py b/dace/libraries/mpi/nodes/win_get.py index e05a5d6195..fb8f6bacb9 100644 --- a/dace/libraries/mpi/nodes/win_get.py +++ b/dace/libraries/mpi/nodes/win_get.py @@ -46,7 +46,7 @@ class Win_get(MPINode): window_name = dace.properties.Property(dtype=str, default=None) def __init__(self, name, window_name, *args, **kwargs): - super().__init__(name, *args, inputs={"_target_rank"}, outputs={"_out", "_outbuffer"}, **kwargs) + super().__init__(name, *args, inputs={"_in", "_target_rank"}, outputs={"_out", "_outbuffer"}, **kwargs) self.window_name = window_name def validate(self, sdfg, state): diff --git a/dace/libraries/mpi/nodes/win_put.py b/dace/libraries/mpi/nodes/win_put.py index 6dd23a7324..de3811cd7c 100644 --- a/dace/libraries/mpi/nodes/win_put.py +++ b/dace/libraries/mpi/nodes/win_put.py @@ -46,7 +46,7 @@ class Win_put(MPINode): window_name = dace.properties.Property(dtype=str, default=None) def __init__(self, name, window_name, *args, **kwargs): - super().__init__(name, *args, inputs={"_inbuffer", "_target_rank"}, outputs={"_out"}, **kwargs) + super().__init__(name, *args, inputs={"_in", "_inbuffer", "_target_rank"}, outputs={"_out"}, **kwargs) self.window_name = window_name def validate(self, sdfg, state): diff --git a/tests/library/mpi/win_get_test.py b/tests/library/mpi/win_get_test.py index 18ac9d7cfb..8be9245278 100644 --- a/tests/library/mpi/win_get_test.py +++ b/tests/library/mpi/win_get_test.py @@ -96,7 +96,7 @@ def make_sdfg(dtype): get_state.add_edge(fence_node, None, win_get_node, - None, + "_in", Memlet.from_array(fence_name, fence_desc)) get_state.add_edge(target_rank, diff --git a/tests/library/mpi/win_put_test.py b/tests/library/mpi/win_put_test.py index 4352e7294d..56fd7bdc67 100644 --- a/tests/library/mpi/win_put_test.py +++ b/tests/library/mpi/win_put_test.py @@ -98,7 +98,7 @@ def make_sdfg(dtype): put_state.add_edge(fence_node, None, win_put_node, - None, + "_in", Memlet.from_array(fence_name, fence_desc)) put_state.add_edge(send_buffer, From 6fa9603ad44f8777106ff4467760513aa4c2e0c4 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 25 Aug 2023 14:10:12 +0800 Subject: [PATCH 09/28] Updated amd_rma_ops for better readability --- dace/frontend/common/distr.py | 6 +++--- dace/sdfg/sdfg.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index 7b79556eb2..e1e47a466a 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -974,7 +974,7 @@ def _rma_fence(pv: ProgramVisitor, from dace.libraries.mpi.nodes.win_fence import Win_fence # fine a new fence name - fence_name = sdfg.add_rma_ops() + fence_name = sdfg.add_rma_ops("fence") _, assertion_node = _get_int_arg_node(pv, sdfg, state, assertion) @@ -1032,7 +1032,7 @@ def _rma_put(pv: ProgramVisitor, from 
dace.libraries.mpi.nodes.win_put import Win_put - put_name = sdfg.add_rma_ops() + put_name = sdfg.add_rma_ops("put") # check for the last RMA operation all_rma_ops_name = list(sdfg._rma_ops.keys()) @@ -1092,7 +1092,7 @@ def _rma_get(pv: ProgramVisitor, from dace.libraries.mpi.nodes.win_get import Win_get - get_name = sdfg.add_rma_ops() + get_name = sdfg.add_rma_ops("get") # check for the last RMA operation all_rma_ops_name = list(sdfg._rma_ops.keys()) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index 1fb32cdbf8..dc0ca05132 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -2076,11 +2076,11 @@ def add_window(self): return window_name - def add_rma_ops(self): + def add_rma_ops(self, op:str): """ Adds a RMA op to the RMA ops descriptor store. """ - rma_op_name = self._find_new_name('__win_op') + rma_op_name = self._find_new_name(f'__win_{op}') self._rma_ops[rma_op_name] = "" From ce8c173d83a97eacb75a40361de73bd061eba652 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 25 Aug 2023 14:44:29 +0800 Subject: [PATCH 10/28] Added support of different comm world for win_create --- dace/frontend/common/distr.py | 9 +++++++-- dace/libraries/mpi/nodes/win_create.py | 10 +++++----- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index e1e47a466a..d4951a2257 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -926,17 +926,22 @@ def _rma_window_create(pv: ProgramVisitor, """ Adds a RMA window to the DaCe Program. :param buffer: The name of window buffer. - :param comm: A dummy input for compatibility with mpi4py + :param comm: The comm world name of this window :process_grid: Name of the process-grid for collective scatter/gather operations. :return: Name of the window. 
""" from dace.libraries.mpi.nodes.win_create import Win_create + # if 'comm' is not a 'str' means it's using mpi4py objects + # which can only be deafult the comm world + if not isinstance(comm, str): + comm = None + # fine a new window name window_name = sdfg.add_window() - window_node = Win_create(window_name, grid) + window_node = Win_create(window_name, comm) buf_desc = sdfg.arrays[buffer] buf_node = state.add_read(buffer) diff --git a/dace/libraries/mpi/nodes/win_create.py b/dace/libraries/mpi/nodes/win_create.py index 5d1bff89c6..e3f7ba10d0 100644 --- a/dace/libraries/mpi/nodes/win_create.py +++ b/dace/libraries/mpi/nodes/win_create.py @@ -24,8 +24,8 @@ def expansion(node, parent_state, parent_sdfg, **kwargs): ] comm = "MPI_COMM_WORLD" - if node.grid: - comm = f"__state->{node.grid}_comm" + if node.comm: + comm = f"__state->{node.comm}_comm" code = f""" MPI_Win_create(_win_buffer, @@ -56,11 +56,11 @@ class Win_create(MPINode): } default_implementation = "MPI" - grid = dace.properties.Property(dtype=str, allow_none=True, default=None) + comm = dace.properties.Property(dtype=str, allow_none=True, default=None) - def __init__(self, name, grid=None, *args, **kwargs): + def __init__(self, name, comm=None, *args, **kwargs): super().__init__(name, *args, inputs={"_win_buffer"}, outputs={"_out"}, **kwargs) - self.grid = grid + self.comm = comm def validate(self, sdfg, state): """ From 1e8606fcc3f0902ef7aa5170218251cf4b633570 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 25 Aug 2023 17:05:40 +0800 Subject: [PATCH 11/28] Added a synchronization check for RMA put/get --- dace/frontend/common/distr.py | 40 +++++++++++++++++++++++++++++------ dace/sdfg/sdfg.py | 6 +++--- 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index d4951a2257..e27b24fe30 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -979,7 +979,7 @@ def _rma_fence(pv: ProgramVisitor, from dace.libraries.mpi.nodes.win_fence import Win_fence # fine a new fence name - fence_name = sdfg.add_rma_ops("fence") + fence_name = sdfg.add_rma_ops(window_name, "fence") _, assertion_node = _get_int_arg_node(pv, sdfg, state, assertion) @@ -987,10 +987,12 @@ def _rma_fence(pv: ProgramVisitor, # check for the last RMA operation all_rma_ops_name = list(sdfg._rma_ops.keys()) - if len(all_rma_ops_name) == 1: + cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name + if f"{window_name}_" in rma_op] + if len(cur_window_rma_ops) == 1: last_rma_op_name = window_name else: - last_rma_op_name = all_rma_ops_name[all_rma_ops_name.index(fence_name) - 1] + last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(fence_name) - 1] last_rma_op_node = state.add_read(last_rma_op_name) last_rma_op_desc = sdfg.arrays[last_rma_op_name] @@ -1037,11 +1039,23 @@ def _rma_put(pv: ProgramVisitor, from dace.libraries.mpi.nodes.win_put import Win_put - put_name = sdfg.add_rma_ops("put") + put_name = sdfg.add_rma_ops(window_name, "put") # check for the last RMA operation all_rma_ops_name = list(sdfg._rma_ops.keys()) - last_rma_op_name = all_rma_ops_name[all_rma_ops_name.index(put_name) - 1] + cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name + if f"{window_name}_" in rma_op] + cur_window_fences = [rma_op for rma_op in cur_window_rma_ops + if f"{window_name}_fence" in rma_op] + + if len(cur_window_fences) % 2: + # if only odd number of fences, + # that means we're in a ongoing epoch + last_rma_op_name = 
cur_window_rma_ops[cur_window_rma_ops.index(put_name) - 1] + else: + # if even number of fences, + # that means this operation is either a passive sync. one or a corrupted one + raise ValueError("Wrong synchronization of RMA calls!") put_node = Win_put(put_name, window_name) @@ -1097,11 +1111,23 @@ def _rma_get(pv: ProgramVisitor, from dace.libraries.mpi.nodes.win_get import Win_get - get_name = sdfg.add_rma_ops("get") + get_name = sdfg.add_rma_ops(window_name, "get") # check for the last RMA operation all_rma_ops_name = list(sdfg._rma_ops.keys()) - last_rma_op_name = all_rma_ops_name[all_rma_ops_name.index(get_name) - 1] + cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name + if f"{window_name}_" in rma_op] + cur_window_fences = [rma_op for rma_op in cur_window_rma_ops + if f"{window_name}_fence" in rma_op] + + if len(cur_window_fences) % 2: + # if only odd number of fences, + # that means we're in a ongoing epoch + last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(get_name) - 1] + else: + # if even number of fences, + # that means this operation is either a passive sync. one or a corrupted one + raise ValueError("Wrong synchronization of RMA calls!") get_node = Win_get(get_name, window_name) diff --git a/dace/sdfg/sdfg.py b/dace/sdfg/sdfg.py index dc0ca05132..edf71bcb4b 100644 --- a/dace/sdfg/sdfg.py +++ b/dace/sdfg/sdfg.py @@ -416,7 +416,7 @@ class SDFG(OrderedDiGraph[SDFGState, InterstateEdge]): from_json=_arrays_from_json) _rma_ops = DictProperty(str, str, - desc="MPI RMA fence descriptors for this SDFG", + desc="MPI RMA ops descriptors for this SDFG", to_json=_arrays_to_json, from_json=_arrays_from_json) _subarrays = DictProperty(str, @@ -2076,11 +2076,11 @@ def add_window(self): return window_name - def add_rma_ops(self, op:str): + def add_rma_ops(self, window_name:str, op:str): """ Adds a RMA op to the RMA ops descriptor store. 
""" - rma_op_name = self._find_new_name(f'__win_{op}') + rma_op_name = self._find_new_name(f'{window_name}_{op}') self._rma_ops[rma_op_name] = "" From c4d2ab90a6fc0b3f106e351ddcb59b297ace5184 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 25 Aug 2023 21:28:44 +0800 Subject: [PATCH 12/28] Updated fence, get, and put tests --- tests/library/mpi/win_fence_test.py | 2 +- tests/library/mpi/win_get_test.py | 6 +++--- tests/library/mpi/win_put_test.py | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/library/mpi/win_fence_test.py b/tests/library/mpi/win_fence_test.py index e114e3c7cf..4c2b9beb9a 100644 --- a/tests/library/mpi/win_fence_test.py +++ b/tests/library/mpi/win_fence_test.py @@ -45,7 +45,7 @@ def make_sdfg(dtype): sdfg.add_edge(window_state, fence_state, dace.InterstateEdge()) - fence_name = sdfg.add_rma_ops() + fence_name = sdfg.add_rma_ops(window_name, "fence") win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) # pseudo access for ordering diff --git a/tests/library/mpi/win_get_test.py b/tests/library/mpi/win_get_test.py index 8be9245278..50c3d5d7e2 100644 --- a/tests/library/mpi/win_get_test.py +++ b/tests/library/mpi/win_get_test.py @@ -49,7 +49,7 @@ def make_sdfg(dtype): sdfg.add_edge(window_state, fence_state_1, dace.InterstateEdge()) - fence_name = sdfg.add_rma_ops() + fence_name = sdfg.add_rma_ops(window_name, "fence") win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) # pseudo access for ordering @@ -84,7 +84,7 @@ def make_sdfg(dtype): sdfg.add_edge(fence_state_1, get_state, dace.InterstateEdge()) - get_name = sdfg.add_rma_ops() + get_name = sdfg.add_rma_ops(window_name, "get") win_get_node = mpi.nodes.win_get.Win_get(get_name, window_name) # pseudo access for ordering @@ -126,7 +126,7 @@ def make_sdfg(dtype): sdfg.add_edge(get_state, fence_state_2, dace.InterstateEdge()) - fence_name = sdfg.add_rma_ops() + fence_name = sdfg.add_rma_ops(window_name, "fence") win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) # pseudo access for ordering diff --git a/tests/library/mpi/win_put_test.py b/tests/library/mpi/win_put_test.py index 56fd7bdc67..d08001a646 100644 --- a/tests/library/mpi/win_put_test.py +++ b/tests/library/mpi/win_put_test.py @@ -49,7 +49,7 @@ def make_sdfg(dtype): sdfg.add_edge(window_state, fence_state_1, dace.InterstateEdge()) - fence_name = sdfg.add_rma_ops() + fence_name = sdfg.add_rma_ops(window_name, "fence") win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) # pseudo access for ordering @@ -84,7 +84,7 @@ def make_sdfg(dtype): sdfg.add_edge(fence_state_1, put_state, dace.InterstateEdge()) - put_name = sdfg.add_rma_ops() + put_name = sdfg.add_rma_ops(window_name, "put") win_put_node = mpi.nodes.win_put.Win_put(put_name, window_name) # pseudo access for ordering @@ -127,7 +127,7 @@ def make_sdfg(dtype): sdfg.add_edge(put_state, fence_state_2, dace.InterstateEdge()) - fence_name = sdfg.add_rma_ops() + fence_name = sdfg.add_rma_ops(window_name, "fence") win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) # pseudo access for ordering From 0a564bb483601dc4711dbba72076be67f20549a2 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Mon, 28 Aug 2023 16:37:32 +0800 Subject: [PATCH 13/28] Replaced SDFG compile to avoid RuntimeError: Could not load library --- tests/library/mpi/win_create_test.py | 10 +++++++--- tests/library/mpi/win_fence_test.py | 10 +++++++--- tests/library/mpi/win_get_test.py | 10 +++++++--- 
tests/library/mpi/win_put_test.py | 10 +++++++--- 4 files changed, 28 insertions(+), 12 deletions(-) diff --git a/tests/library/mpi/win_create_test.py b/tests/library/mpi/win_create_test.py index f5f8b58f78..db9d356c74 100644 --- a/tests/library/mpi/win_create_test.py +++ b/tests/library/mpi/win_create_test.py @@ -58,9 +58,13 @@ def test_win_create(dtype): if comm_size < 2: raise ValueError("This test is supposed to be run with at least two processes!") - - sdfg = make_sdfg(dtype) - mpi_func = utils.distributed_compile(sdfg, comm_world) + + mpi_func = None + for r in range(0, comm_size): + if r == comm_rank: + sdfg = make_sdfg(dtype) + mpi_func = sdfg.compile() + comm_world.Barrier() window_size = 10 win_buffer = np.arange(0, window_size, dtype=np_dtype) diff --git a/tests/library/mpi/win_fence_test.py b/tests/library/mpi/win_fence_test.py index 4c2b9beb9a..20a6b11f0f 100644 --- a/tests/library/mpi/win_fence_test.py +++ b/tests/library/mpi/win_fence_test.py @@ -93,9 +93,13 @@ def test_win_fence(dtype): if comm_size < 2: raise ValueError("This test is supposed to be run with at least two processes!") - - sdfg = make_sdfg(dtype) - mpi_func = utils.distributed_compile(sdfg, comm_world) + + mpi_func = None + for r in range(0, comm_size): + if r == comm_rank: + sdfg = make_sdfg(dtype) + mpi_func = sdfg.compile() + comm_world.Barrier() window_size = 10 win_buffer = np.arange(0, window_size, dtype=np_dtype) diff --git a/tests/library/mpi/win_get_test.py b/tests/library/mpi/win_get_test.py index 50c3d5d7e2..9f2e780d69 100644 --- a/tests/library/mpi/win_get_test.py +++ b/tests/library/mpi/win_get_test.py @@ -173,9 +173,13 @@ def test_win_get(dtype): if comm_size < 2: raise ValueError("This test is supposed to be run with at least two processes!") - - sdfg = make_sdfg(dtype) - mpi_func = utils.distributed_compile(sdfg, comm_world) + + mpi_func = None + for r in range(0, comm_size): + if r == comm_rank: + sdfg = make_sdfg(dtype) + mpi_func = sdfg.compile() + comm_world.Barrier() window_size = 10 win_buffer = np.full(window_size, comm_rank, dtype=np_dtype) diff --git a/tests/library/mpi/win_put_test.py b/tests/library/mpi/win_put_test.py index d08001a646..0e8af8487b 100644 --- a/tests/library/mpi/win_put_test.py +++ b/tests/library/mpi/win_put_test.py @@ -174,9 +174,13 @@ def test_win_put(dtype): if comm_size < 2: raise ValueError("This test is supposed to be run with at least two processes!") - - sdfg = make_sdfg(dtype) - mpi_func = utils.distributed_compile(sdfg, comm_world) + + mpi_func = None + for r in range(0, comm_size): + if r == comm_rank: + sdfg = make_sdfg(dtype) + mpi_func = sdfg.compile() + comm_world.Barrier() window_size = 10 win_buffer = np.full(window_size, comm_rank, dtype=np_dtype) From d1f6ba2a87d4d501afcc1d7be1bafc86a8807392 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Mon, 28 Aug 2023 17:37:15 +0800 Subject: [PATCH 14/28] Added RMA accumulate library node, test, and replacement --- dace/frontend/common/distr.py | 81 +++++++- dace/libraries/mpi/nodes/__init__.py | 1 + dace/libraries/mpi/nodes/win_accumulate.py | 72 ++++++++ tests/library/mpi/mpi4py_test.py | 34 ++++ tests/library/mpi/win_accumulate_test.py | 205 +++++++++++++++++++++ 5 files changed, 391 insertions(+), 2 deletions(-) create mode 100644 dace/libraries/mpi/nodes/win_accumulate.py create mode 100644 tests/library/mpi/win_accumulate_test.py diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index e27b24fe30..63acc4a449 100644 --- a/dace/frontend/common/distr.py +++ 
b/dace/frontend/common/distr.py @@ -1028,7 +1028,7 @@ def _rma_put(pv: ProgramVisitor, state: SDFGState, window_name: str, origin: str, - target_rank: Union[str, sp.Expr, Number] = 0): + target_rank: Union[str, sp.Expr, Number]): """ Initiate a RMA put for the DaCe Program. :param window_name: The name of the window to be sychronized. @@ -1100,7 +1100,7 @@ def _rma_get(pv: ProgramVisitor, state: SDFGState, window_name: str, origin: str, - target_rank: Union[str, sp.Expr, Number] = 0): + target_rank: Union[str, sp.Expr, Number]): """ Initiate a RMA get for the DaCe Program. :param window_name: The name of the window to be sychronized. @@ -1166,6 +1166,83 @@ def _rma_get(pv: ProgramVisitor, return get_name +@oprepo.replaces_method('RMA_window', 'Accumulate') +def _rma_accumulate(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + window_name: str, + origin: str, + target_rank: Union[str, sp.Expr, Number], + op: str = "MPI_SUM"): + """ Initiate a RMA accumulate for the DaCe Program. + + :param window_name: The name of the window to be sychronized. + :param origin: The name of origin buffer. + :target_rank: A value or scalar of the target rank. + :op: The name of MPI reduction + :return: Name of the new RMA accumulate descriptor. + """ + from mpi4py import MPI + from dace.libraries.mpi.nodes.win_accumulate import Win_accumulate + + accumulate_name = sdfg.add_rma_ops(window_name, "accumulate") + + if isinstance(op, MPI.Op): + op = _mpi4py_to_MPI(MPI, op) + + # check for the last RMA operation + all_rma_ops_name = list(sdfg._rma_ops.keys()) + cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name + if f"{window_name}_" in rma_op] + cur_window_fences = [rma_op for rma_op in cur_window_rma_ops + if f"{window_name}_fence" in rma_op] + + if len(cur_window_fences) % 2: + # if only odd number of fences, + # that means we're in a ongoing epoch + last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(accumulate_name) - 1] + else: + # if even number of fences, + # that means this operation is either a passive sync. 
one or a corrupted one + raise ValueError("Wrong synchronization of RMA calls!") + + accumulate_node = Win_accumulate(accumulate_name, window_name, op) + + last_rma_op_node = state.add_read(last_rma_op_name) + last_rma_op_desc = sdfg.arrays[last_rma_op_name] + state.add_edge(last_rma_op_node, + None, + accumulate_node, + "_in", + Memlet.from_array(last_rma_op_name, last_rma_op_desc)) + + origin_node = state.add_read(origin) + origin_desc = sdfg.arrays[origin] + state.add_edge(origin_node, + None, + accumulate_node, + '_inbuffer', + Memlet.from_array(origin, origin_desc)) + + _, target_rank_node = _get_int_arg_node(pv, sdfg, state, target_rank) + state.add_edge(target_rank_node, + None, + accumulate_node, + '_target_rank', + Memlet.simple(target_rank_node, "0:1", num_accesses=1)) + + # Pseudo-writing for newast.py #3195 check and complete Processcomm creation + _, scal = sdfg.add_scalar(accumulate_name, dace.int32, transient=True) + wnode = state.add_write(accumulate_name) + state.add_edge(accumulate_node, + "_out", + wnode, + None, + Memlet.from_array(accumulate_name, scal)) + + return accumulate_name + + @oprepo.replaces('dace.comm.Subarray') def _subarray(pv: ProgramVisitor, sdfg: SDFG, diff --git a/dace/libraries/mpi/nodes/__init__.py b/dace/libraries/mpi/nodes/__init__.py index 5400bd45de..9a1bd77730 100644 --- a/dace/libraries/mpi/nodes/__init__.py +++ b/dace/libraries/mpi/nodes/__init__.py @@ -17,3 +17,4 @@ from .win_fence import Win_fence from .win_put import Win_put from .win_get import Win_get +from .win_accumulate import Win_accumulate diff --git a/dace/libraries/mpi/nodes/win_accumulate.py b/dace/libraries/mpi/nodes/win_accumulate.py new file mode 100644 index 0000000000..6cc13b4bcd --- /dev/null +++ b/dace/libraries/mpi/nodes/win_accumulate.py @@ -0,0 +1,72 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library +import dace.properties +import dace.sdfg.nodes +from dace.transformation.transformation import ExpandTransformation +from .. import environments +from dace.libraries.mpi.nodes.node import MPINode + + +@dace.library.expansion +class ExpandWinAccumulateMPI(ExpandTransformation): + + environments = [environments.mpi.MPI] + + @staticmethod + def expansion(node, parent_state, parent_sdfg, **kwargs): + inbuffer, in_count_str = node.validate(parent_sdfg, parent_state) + mpi_dtype_str = dace.libraries.mpi.utils.MPI_DDT(inbuffer.dtype.base_type) + + window_name = node.window_name + op = node.op + + code = f""" + MPI_Accumulate(_inbuffer, {in_count_str}, {mpi_dtype_str}, \ + _target_rank, 0, {in_count_str}, {mpi_dtype_str}, \ + {op}, __state->{window_name}_window); + """ + + tasklet = dace.sdfg.nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + language=dace.dtypes.Language.CPP, + side_effects=True) + return tasklet + + +@dace.library.node +class Win_accumulate(MPINode): + + # Global properties + implementations = { + "MPI": ExpandWinAccumulateMPI, + } + default_implementation = "MPI" + + window_name = dace.properties.Property(dtype=str, default=None) + op = dace.properties.Property(dtype=str, default='MPI_SUM') + + def __init__(self, name, window_name, op="MPI_SUM", *args, **kwargs): + super().__init__(name, *args, inputs={"_in", "_inbuffer", "_target_rank"}, outputs={"_out"}, **kwargs) + self.window_name = window_name + self.op = op + + def validate(self, sdfg, state): + """ + :return: A three-tuple (buffer, root) of the three data descriptors in the + parent SDFG. 
+ """ + + inbuffer = None + for e in state.in_edges(self): + if e.dst_conn == "_inbuffer": + inbuffer = sdfg.arrays[e.data.data] + + in_count_str = "XXX" + for _, _, _, dst_conn, data in state.in_edges(self): + if dst_conn == '_inbuffer': + dims = [str(e) for e in data.subset.size_exact()] + in_count_str = "*".join(dims) + + return inbuffer, in_count_str diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index ca185077b8..df34bb6e65 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -105,6 +105,39 @@ def mpi4py_rma_get(win_buf: dace.int32[10], recv_buf: dace.int32[10], rank: dace assert (np.array_equal(recv_buf, recv_buf_ref)) +@pytest.mark.mpi +def test_RMA_accumulate(): + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + # sum all rank at rank 0 + @dace.program + def mpi4py_rma_accumulate(win_buf: dace.int32[10], send_buf: dace.int32[10], rank: dace.int32): + win = MPI.Win.Create(win_buf, comm=commworld) + win.Fence(0) + win.Accumulate(send_buf, target_rank=rank, op=MPI.SUM) + win.Fence(0) + + if size < 2: + raise ValueError("Please run this test with at least two processes.") + + sdfg = None + if rank == 0: + sdfg = mpi4py_rma_accumulate.to_sdfg() + func = utils.distributed_compile(sdfg, commworld) + + window_size = 10 + win_buffer = np.full(window_size, rank, dtype=np.int32) + win_buffer_ref = np.full(window_size, rank, dtype=np.int32) + send_buffer = np.full(window_size, rank, dtype=np.int32) + + func(win_buf=win_buffer, send_buf=send_buffer, rank=0) + mpi4py_rma_accumulate.f(win_buf=win_buffer_ref, send_buf=send_buffer, rank=0) + + assert (np.array_equal(win_buffer, win_buffer_ref)) + @pytest.mark.mpi def test_external_comm_bcast(): @@ -416,3 +449,4 @@ def mpi4py_alltoall(rank: dace.int32, size: dace.compiletime): test_alltoall() test_RMA_put() test_RMA_get() + test_RMA_accumulate() diff --git a/tests/library/mpi/win_accumulate_test.py b/tests/library/mpi/win_accumulate_test.py new file mode 100644 index 0000000000..c5338e12ac --- /dev/null +++ b/tests/library/mpi/win_accumulate_test.py @@ -0,0 +1,205 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
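+# What this test builds, for quick reference: an SDFG with the state chain
+#   create_window -> win_fence_1 -> win_accumulate -> win_fence_2
+# in which every rank accumulates its send buffer (filled with its own rank id)
+# into rank 0's window with MPI_SUM, so after the closing fence rank 0's
+# window buffer should hold comm_size * (comm_size - 1) / 2 in every element.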
+import dace +from dace.sdfg import utils +import dace.dtypes as dtypes +from dace.memlet import Memlet +import dace.libraries.mpi as mpi +import dace.frontend.common.distr as comm +import numpy as np +import pytest + + +############################################################################### + + +def make_sdfg(dtype): + n = dace.symbol("n") + + sdfg = dace.SDFG("mpi_win_accumulate") + window_state = sdfg.add_state("create_window") + + sdfg.add_array("assertion", [1], dtype=dace.int32, transient=False) + sdfg.add_array("win_buffer", [n], dtype=dtype, transient=False) + sdfg.add_array("send_buffer", [n], dtype=dtype, transient=False) + sdfg.add_array("target_rank", [1], dace.dtypes.int32, transient=False) + + win_buffer = window_state.add_access("win_buffer") + + window_name = sdfg.add_window() + win_create_node = mpi.nodes.win_create.Win_create(window_name) + + window_state.add_edge(win_buffer, + None, + win_create_node, + '_win_buffer', + Memlet.simple(win_buffer, "0:n", num_accesses=n)) + + # for other nodes depends this window to connect + _, scal = sdfg.add_scalar(window_name, dace.int32, transient=True) + wnode = window_state.add_write(window_name) + window_state.add_edge(win_create_node, + "_out", + wnode, + None, + Memlet.from_array(window_name, scal)) + +############################################################################### + + fence_state_1 = sdfg.add_state("win_fence_1") + + sdfg.add_edge(window_state, fence_state_1, dace.InterstateEdge()) + + fence_name = sdfg.add_rma_ops(window_name, "fence") + win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) + + # pseudo access for ordering + window_node = fence_state_1.add_access(window_name) + window_desc = sdfg.arrays[window_name] + + fence_state_1.add_edge(window_node, + None, + win_fence_node, + None, + Memlet.from_array(window_name, window_desc)) + + assertion_node = fence_state_1.add_access("assertion") + + fence_state_1.add_edge(assertion_node, + None, + win_fence_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(fence_name, dace.int32, transient=True) + wnode = fence_state_1.add_write(fence_name) + fence_state_1.add_edge(win_fence_node, + "_out", + wnode, + None, + Memlet.from_array(fence_name, scal)) + +############################################################################### + + accumulate_state = sdfg.add_state("win_accumulate") + + sdfg.add_edge(fence_state_1, accumulate_state, dace.InterstateEdge()) + + accumulate_name = sdfg.add_rma_ops(window_name, "accumulate") + win_accumulate_node = mpi.nodes.win_accumulate.Win_accumulate(accumulate_name, window_name) + + # pseudo access for ordering + fence_node = accumulate_state.add_access(fence_name) + fence_desc = sdfg.arrays[fence_name] + + send_buffer = accumulate_state.add_access("send_buffer") + + target_rank = accumulate_state.add_access("target_rank") + + accumulate_state.add_edge(fence_node, + None, + win_accumulate_node, + "_in", + Memlet.from_array(fence_name, fence_desc)) + + accumulate_state.add_edge(send_buffer, + None, + win_accumulate_node, + "_inbuffer", + Memlet.simple(send_buffer, "0:n", num_accesses=n)) + + accumulate_state.add_edge(target_rank, + None, + win_accumulate_node, + "_target_rank", + Memlet.simple(target_rank, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(accumulate_name, dace.int32, transient=True) + wnode = accumulate_state.add_write(accumulate_name) + accumulate_state.add_edge(win_accumulate_node, + "_out", + wnode, + None, + 
Memlet.from_array(accumulate_name, scal)) + +############################################################################### + + fence_state_2 = sdfg.add_state("win_fence_2") + + sdfg.add_edge(accumulate_state, fence_state_2, dace.InterstateEdge()) + + fence_name = sdfg.add_rma_ops(window_name, "fence") + win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name, window_name) + + # pseudo access for ordering + accumulate_node = fence_state_2.add_access(accumulate_name) + accumulate_desc = sdfg.arrays[accumulate_name] + + fence_state_2.add_edge(accumulate_node, + None, + win_fence_node, + None, + Memlet.from_array(accumulate_name, accumulate_desc)) + + assertion_node = fence_state_2.add_access("assertion") + + fence_state_2.add_edge(assertion_node, + None, + win_fence_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(fence_name, dace.int32, transient=True) + wnode = fence_state_2.add_write(fence_name) + fence_state_2.add_edge(win_fence_node, + "_out", + wnode, + None, + Memlet.from_array(fence_name, scal)) + + return sdfg + + +############################################################################### + +@pytest.mark.parametrize("implementation, dtype", [ + pytest.param("MPI", dace.float32, marks=pytest.mark.mpi), + pytest.param("MPI", dace.int32, marks=pytest.mark.mpi) +]) +def test_win_accumulate(dtype): + from mpi4py import MPI + np_dtype = getattr(np, dtype.to_string()) + comm_world = MPI.COMM_WORLD + comm_rank = comm_world.Get_rank() + comm_size = comm_world.Get_size() + + if comm_size < 2: + raise ValueError("This test is supposed to be run with at least two processes!") + + mpi_func = None + for r in range(0, comm_size): + if r == comm_rank: + sdfg = make_sdfg(dtype) + mpi_func = sdfg.compile() + comm_world.Barrier() + + window_size = 10 + win_buffer = np.full(window_size, comm_rank, dtype=np_dtype) + send_buffer = np.full(window_size, comm_rank, dtype=np_dtype) + + # accumulate all ranks in rank 0 + target_rank = np.full([1], 0, dtype=np.int32) + assertion = np.full([1], 0, dtype=np.int32) + + mpi_func(assertion=assertion, + win_buffer=win_buffer, + send_buffer=send_buffer, + target_rank=target_rank, + n=window_size) + + correct_data = np.full(window_size, comm_size * (comm_size - 1) / 2, dtype=np_dtype) + if (comm_rank == 0 and not np.allclose(win_buffer, correct_data)): + raise (ValueError("The received values are not what I expected on root.")) + +if __name__ == "__main__": + test_win_accumulate(dace.int32) + test_win_accumulate(dace.float32) From e4cc8e3ee2b21c014af540b086457146f64ac2d6 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Tue, 29 Aug 2023 00:30:38 +0800 Subject: [PATCH 15/28] Added MPI RMA flush, lock, unlock library nodes and their tests for passive sync. 
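A minimal mpi4py sketch of the passive-target access epoch these nodes are
meant to express, for reference while reviewing (plain mpi4py without DaCe;
the buffer size, the shared-lock mode and the closing barrier are illustrative
assumptions rather than anything generated by the new nodes):

    # Passive-target epoch: Lock -> Put -> Flush -> Unlock
    from mpi4py import MPI
    import numpy as np

    comm = MPI.COMM_WORLD
    rank, size = comm.Get_rank(), comm.Get_size()
    target = (rank + 1) % size

    win_buf = np.full(10, rank, dtype=np.int32)   # memory exposed through the window
    send_buf = np.full(10, rank, dtype=np.int32)  # data written into the target's window

    win = MPI.Win.Create(win_buf, comm=comm)
    win.Lock(target, MPI.LOCK_SHARED)      # open an access epoch at the target rank
    win.Put(send_buf, target_rank=target)  # one-sided write, no matching receive
    win.Flush(target)                      # complete all outstanding RMA calls to the target
    win.Unlock(target)                     # close the epoch
    comm.Barrier()                         # keep ranks alive until every transfer is done
    win.Free()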
--- dace/libraries/mpi/nodes/__init__.py | 3 + dace/libraries/mpi/nodes/win_flush.py | 43 +++ dace/libraries/mpi/nodes/win_lock.py | 46 +++ dace/libraries/mpi/nodes/win_unlock.py | 43 +++ tests/library/mpi/win_passive_sync_test.py | 332 +++++++++++++++++++++ 5 files changed, 467 insertions(+) create mode 100644 dace/libraries/mpi/nodes/win_flush.py create mode 100644 dace/libraries/mpi/nodes/win_lock.py create mode 100644 dace/libraries/mpi/nodes/win_unlock.py create mode 100644 tests/library/mpi/win_passive_sync_test.py diff --git a/dace/libraries/mpi/nodes/__init__.py b/dace/libraries/mpi/nodes/__init__.py index 9a1bd77730..3d3e0ac8f9 100644 --- a/dace/libraries/mpi/nodes/__init__.py +++ b/dace/libraries/mpi/nodes/__init__.py @@ -18,3 +18,6 @@ from .win_put import Win_put from .win_get import Win_get from .win_accumulate import Win_accumulate +from .win_lock import Win_lock +from .win_unlock import Win_unlock +from .win_flush import Win_flush diff --git a/dace/libraries/mpi/nodes/win_flush.py b/dace/libraries/mpi/nodes/win_flush.py new file mode 100644 index 0000000000..70e2ac1905 --- /dev/null +++ b/dace/libraries/mpi/nodes/win_flush.py @@ -0,0 +1,43 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library +import dace.properties +import dace.sdfg.nodes +from dace.transformation.transformation import ExpandTransformation +from .. import environments +from dace.libraries.mpi.nodes.node import MPINode + + +@dace.library.expansion +class ExpandWinFlushMPI(ExpandTransformation): + + environments = [environments.mpi.MPI] + + @staticmethod + def expansion(node, parent_state, parent_sdfg, **kwargs): + window_name = node.window_name + code = f""" + MPI_Win_flush(_rank, __state->{window_name}_window); + """ + tasklet = dace.sdfg.nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + language=dace.dtypes.Language.CPP, + side_effects=True) + return tasklet + + +@dace.library.node +class Win_flush(MPINode): + + # Global properties + implementations = { + "MPI": ExpandWinFlushMPI, + } + default_implementation = "MPI" + + window_name = dace.properties.Property(dtype=str, default=None) + + def __init__(self, name, window_name, *args, **kwargs): + super().__init__(name, *args, inputs={"_rank"}, outputs={"_out"}, **kwargs) + self.window_name = window_name diff --git a/dace/libraries/mpi/nodes/win_lock.py b/dace/libraries/mpi/nodes/win_lock.py new file mode 100644 index 0000000000..48a5fe6fd4 --- /dev/null +++ b/dace/libraries/mpi/nodes/win_lock.py @@ -0,0 +1,46 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library +import dace.properties +import dace.sdfg.nodes +from dace.transformation.transformation import ExpandTransformation +from .. 
import environments +from dace.libraries.mpi.nodes.node import MPINode + + +@dace.library.expansion +class ExpandWinLockMPI(ExpandTransformation): + + environments = [environments.mpi.MPI] + + @staticmethod + def expansion(node, parent_state, parent_sdfg, **kwargs): + window_name = node.window_name + code = f""" + MPI_Win_lock(_lock_type, + _rank, + _assertion, + __state->{window_name}_window); + """ + tasklet = dace.sdfg.nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + language=dace.dtypes.Language.CPP, + side_effects=True) + return tasklet + + +@dace.library.node +class Win_lock(MPINode): + + # Global properties + implementations = { + "MPI": ExpandWinLockMPI, + } + default_implementation = "MPI" + + window_name = dace.properties.Property(dtype=str, default=None) + + def __init__(self, name, window_name, *args, **kwargs): + super().__init__(name, *args, inputs={"_rank", "_lock_type", "_assertion"}, outputs={"_out"}, **kwargs) + self.window_name = window_name diff --git a/dace/libraries/mpi/nodes/win_unlock.py b/dace/libraries/mpi/nodes/win_unlock.py new file mode 100644 index 0000000000..7bd6963fa9 --- /dev/null +++ b/dace/libraries/mpi/nodes/win_unlock.py @@ -0,0 +1,43 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library +import dace.properties +import dace.sdfg.nodes +from dace.transformation.transformation import ExpandTransformation +from .. import environments +from dace.libraries.mpi.nodes.node import MPINode + + +@dace.library.expansion +class ExpandWinUnlockMPI(ExpandTransformation): + + environments = [environments.mpi.MPI] + + @staticmethod + def expansion(node, parent_state, parent_sdfg, **kwargs): + window_name = node.window_name + code = f""" + MPI_Win_unlock(_rank, __state->{window_name}_window); + """ + tasklet = dace.sdfg.nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + language=dace.dtypes.Language.CPP, + side_effects=True) + return tasklet + + +@dace.library.node +class Win_unlock(MPINode): + + # Global properties + implementations = { + "MPI": ExpandWinUnlockMPI, + } + default_implementation = "MPI" + + window_name = dace.properties.Property(dtype=str, default=None) + + def __init__(self, name, window_name, *args, **kwargs): + super().__init__(name, *args, inputs={"_rank"}, outputs={"_out"}, **kwargs) + self.window_name = window_name diff --git a/tests/library/mpi/win_passive_sync_test.py b/tests/library/mpi/win_passive_sync_test.py new file mode 100644 index 0000000000..8a0ac7c3d7 --- /dev/null +++ b/tests/library/mpi/win_passive_sync_test.py @@ -0,0 +1,332 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
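+# What this test builds, for quick reference: an SDFG with the state chain
+#   create_window -> win_lock -> win_put -> win_flush -> win_unlock -> win_fence -> win_fence
+# where every rank puts its send buffer (filled with its own rank id) into the
+# window of rank (comm_rank + 1) % comm_size under a passive-target shared lock;
+# the two trailing fences act as a barrier, so each rank should end up holding
+# (comm_rank - 1) % comm_size in every element of its window buffer.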
+import dace +from dace.sdfg import utils +import dace.dtypes as dtypes +from dace.memlet import Memlet +import dace.libraries.mpi as mpi +import dace.frontend.common.distr as comm +import numpy as np +import pytest + + +############################################################################### + + +def make_sdfg(dtype): + n = dace.symbol("n") + + sdfg = dace.SDFG("mpi_win_passive_sync") + window_state = sdfg.add_state("create_window") + + sdfg.add_array("lock_type", [1], dtype=dace.int32, transient=False) + sdfg.add_array("assertion", [1], dtype=dace.int32, transient=False) + sdfg.add_array("win_buffer", [n], dtype=dtype, transient=False) + sdfg.add_array("send_buffer", [n], dtype=dtype, transient=False) + sdfg.add_array("target_rank", [1], dace.dtypes.int32, transient=False) + + win_buffer = window_state.add_access("win_buffer") + + window_name = sdfg.add_window() + win_create_node = mpi.nodes.win_create.Win_create(window_name) + + window_state.add_edge(win_buffer, + None, + win_create_node, + '_win_buffer', + Memlet.simple(win_buffer, "0:n", num_accesses=n)) + + # for other nodes depends this window to connect + _, scal = sdfg.add_scalar(window_name, dace.int32, transient=True) + wnode = window_state.add_write(window_name) + window_state.add_edge(win_create_node, + "_out", + wnode, + None, + Memlet.from_array(window_name, scal)) + +############################################################################### + + lock_state = sdfg.add_state("win_lock") + + sdfg.add_edge(window_state, lock_state, dace.InterstateEdge()) + + lock_name = sdfg.add_rma_ops(window_name, "lock") + win_lock_node = mpi.nodes.win_lock.Win_lock(lock_name, window_name) + + # pseudo access for ordering + window_node = lock_state.add_access(window_name) + window_desc = sdfg.arrays[window_name] + + lock_state.add_edge(window_node, + None, + win_lock_node, + None, + Memlet.from_array(window_name, window_desc)) + + lock_type_node = lock_state.add_access("lock_type") + + target_rank_node = lock_state.add_access("target_rank") + + assertion_node = lock_state.add_access("assertion") + + lock_state.add_edge(lock_type_node, + None, + win_lock_node, + '_lock_type', + Memlet.simple(lock_type_node, "0:1", num_accesses=1)) + + lock_state.add_edge(target_rank_node, + None, + win_lock_node, + '_rank', + Memlet.simple(target_rank_node, "0:1", num_accesses=1)) + + lock_state.add_edge(assertion_node, + None, + win_lock_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(lock_name, dace.int32, transient=True) + wnode = lock_state.add_write(lock_name) + lock_state.add_edge(win_lock_node, + "_out", + wnode, + None, + Memlet.from_array(lock_name, scal)) + +############################################################################### + + put_state = sdfg.add_state("win_put") + + sdfg.add_edge(lock_state, put_state, dace.InterstateEdge()) + + put_name = sdfg.add_rma_ops(window_name, "put") + win_put_node = mpi.nodes.win_put.Win_put(put_name, window_name) + + # pseudo access for ordering + lock_node = put_state.add_access(lock_name) + lock_desc = sdfg.arrays[lock_name] + + send_buffer = put_state.add_access("send_buffer") + + target_rank = put_state.add_access("target_rank") + + put_state.add_edge(lock_node, + None, + win_put_node, + "_in", + Memlet.from_array(lock_name, lock_desc)) + + put_state.add_edge(send_buffer, + None, + win_put_node, + "_inbuffer", + Memlet.simple(send_buffer, "0:n", num_accesses=n)) + + put_state.add_edge(target_rank, + None, + win_put_node, + 
"_target_rank", + Memlet.simple(target_rank, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(put_name, dace.int32, transient=True) + wnode = put_state.add_write(put_name) + put_state.add_edge(win_put_node, + "_out", + wnode, + None, + Memlet.from_array(put_name, scal)) + +############################################################################### + + flush_state = sdfg.add_state("win_flush") + + sdfg.add_edge(put_state, flush_state, dace.InterstateEdge()) + + flush_name = sdfg.add_rma_ops(window_name, "flush") + win_flush_node = mpi.nodes.win_flush.Win_flush(flush_name, window_name) + + # pseudo access for ordering + put_node = flush_state.add_access(put_name) + put_desc = sdfg.arrays[put_name] + + flush_state.add_edge(put_node, + None, + win_flush_node, + None, + Memlet.from_array(put_name, put_desc)) + + target_rank_node = flush_state.add_access("target_rank") + + flush_state.add_edge(target_rank_node, + None, + win_flush_node, + '_rank', + Memlet.simple(target_rank_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(flush_name, dace.int32, transient=True) + wnode = flush_state.add_write(flush_name) + flush_state.add_edge(win_flush_node, + "_out", + wnode, + None, + Memlet.from_array(flush_name, scal)) + +############################################################################### + + unlock_state = sdfg.add_state("win_unlock") + + sdfg.add_edge(flush_state, unlock_state, dace.InterstateEdge()) + + unlock_name = sdfg.add_rma_ops(window_name, "unlock") + win_unlock_node = mpi.nodes.win_unlock.Win_unlock(unlock_name, window_name) + + # pseudo access for ordering + flush_node = unlock_state.add_access(flush_name) + flush_desc = sdfg.arrays[flush_name] + + unlock_state.add_edge(flush_node, + None, + win_unlock_node, + None, + Memlet.from_array(flush_name, flush_desc)) + + target_rank_node = unlock_state.add_access("target_rank") + + unlock_state.add_edge(target_rank_node, + None, + win_unlock_node, + '_rank', + Memlet.simple(target_rank_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(unlock_name, dace.int32, transient=True) + wnode = unlock_state.add_write(unlock_name) + unlock_state.add_edge(win_unlock_node, + "_out", + wnode, + None, + Memlet.from_array(unlock_name, scal)) + +# added these two fences as Barrier to ensure that every rank has completed +# since every rank are running independently +# some ranks might exit(since they completed) the transmission +# while others are still transmitting +############################################################################### + + fence_state_1 = sdfg.add_state("win_fence") + + sdfg.add_edge(unlock_state, fence_state_1, dace.InterstateEdge()) + + fence_name_1 = sdfg.add_rma_ops(window_name, "fence") + win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name_1, window_name) + + # pseudo access for ordering + unlock_node = fence_state_1.add_access(unlock_name) + unlock_desc = sdfg.arrays[unlock_name] + + fence_state_1.add_edge(unlock_node, + None, + win_fence_node, + None, + Memlet.from_array(unlock_name, unlock_desc)) + + assertion_node = fence_state_1.add_access("assertion") + + fence_state_1.add_edge(assertion_node, + None, + win_fence_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(fence_name_1, dace.int32, transient=True) + wnode = fence_state_1.add_write(fence_name_1) + fence_state_1.add_edge(win_fence_node, + "_out", + wnode, + None, + Memlet.from_array(fence_name_1, scal)) + 
+############################################################################### + + fence_state_2 = sdfg.add_state("win_fence") + + sdfg.add_edge(fence_state_1, fence_state_2, dace.InterstateEdge()) + + fence_name_2 = sdfg.add_rma_ops(window_name, "fence") + win_fence_node = mpi.nodes.win_fence.Win_fence(fence_name_2, window_name) + + # pseudo access for ordering + fence_node = fence_state_2.add_access(fence_name_1) + fence_desc = sdfg.arrays[fence_name_1] + + fence_state_2.add_edge(fence_node, + None, + win_fence_node, + None, + Memlet.from_array(fence_name_1, fence_desc)) + + assertion_node = fence_state_2.add_access("assertion") + + fence_state_2.add_edge(assertion_node, + None, + win_fence_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + _, scal = sdfg.add_scalar(fence_name_2, dace.int32, transient=True) + wnode = fence_state_2.add_write(fence_name_2) + fence_state_2.add_edge(win_fence_node, + "_out", + wnode, + None, + Memlet.from_array(fence_name_2, scal)) + + return sdfg + + +############################################################################### + +@pytest.mark.parametrize("implementation, dtype", [ + pytest.param("MPI", dace.float32, marks=pytest.mark.mpi), + pytest.param("MPI", dace.int32, marks=pytest.mark.mpi) +]) +def test_win_put(dtype): + from mpi4py import MPI + np_dtype = getattr(np, dtype.to_string()) + comm_world = MPI.COMM_WORLD + comm_rank = comm_world.Get_rank() + comm_size = comm_world.Get_size() + + if comm_size < 2: + raise ValueError("This test is supposed to be run with at least two processes!") + + mpi_func = None + for r in range(0, comm_size): + if r == comm_rank: + sdfg = make_sdfg(dtype) + mpi_func = sdfg.compile() + comm_world.Barrier() + + window_size = 10 + win_buffer = np.full(window_size, comm_rank, dtype=np_dtype) + send_buffer = np.full(window_size, comm_rank, dtype=np_dtype) + + target_rank = np.array([(comm_rank + 1) % comm_size], dtype=np.int32) + lock_type = np.full([1], MPI.LOCK_SHARED, dtype=np.int32) + assertion = np.full([1], 0, dtype=np.int32) + + mpi_func(lock_type=lock_type, + assertion=assertion, + win_buffer=win_buffer, + send_buffer=send_buffer, + target_rank=target_rank, + n=window_size) + + correct_data = np.full(window_size, (comm_rank - 1) % comm_size, dtype=np_dtype) + if (not np.allclose(win_buffer, correct_data)): + raise (ValueError("The received values are not what I expected on root.")) + +if __name__ == "__main__": + test_win_put(dace.int32) + test_win_put(dace.float32) From f8feae9278e99bc643a862f50ac3167466f56543 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Tue, 29 Aug 2023 15:48:19 +0800 Subject: [PATCH 16/28] Implemented replacement and their tests for passive sync. nodes --- dace/frontend/common/distr.py | 246 ++++++++++++++++++++++++++++--- tests/library/mpi/mpi4py_test.py | 121 +++++++++++++++ 2 files changed, 349 insertions(+), 18 deletions(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index 63acc4a449..e6f50bec6d 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -1022,6 +1022,200 @@ def _rma_fence(pv: ProgramVisitor, return window_name +@oprepo.replaces_method('RMA_window', 'Flush') +def _rma_flush(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + window_name: str, + rank: Union[str, sp.Expr, Number]): + """ Adds a RMA flush to the DaCe Program. + flush will completes all outdtanding RMA operations + + :param window_name: The name of the window to be sychronized. 
+ :param rank: A value or scalar to specify the target rank. + :return: Name of the flush. + """ + + from dace.libraries.mpi.nodes.win_flush import Win_flush + + # fine a new flush name + flush_name = sdfg.add_rma_ops(window_name, "flush") + + _, rank_node = _get_int_arg_node(pv, sdfg, state, rank) + + flush_node = Win_flush(flush_name, window_name) + + # check for the last RMA operation + all_rma_ops_name = list(sdfg._rma_ops.keys()) + cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name + if f"{window_name}_" in rma_op] + if len(cur_window_rma_ops) == 1: + last_rma_op_name = window_name + else: + last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(flush_name) - 1] + + last_rma_op_node = state.add_read(last_rma_op_name) + last_rma_op_desc = sdfg.arrays[last_rma_op_name] + + # for ordering + state.add_edge(last_rma_op_node, + None, + flush_node, + None, + Memlet.from_array(last_rma_op_name, last_rma_op_desc)) + + state.add_edge(rank_node, + None, + flush_node, + '_rank', + Memlet.simple(rank_node, "0:1", num_accesses=1)) + + # Pseudo-writing for newast.py #3195 check and complete Processcomm creation + _, scal = sdfg.add_scalar(flush_name, dace.int32, transient=True) + wnode = state.add_write(flush_name) + state.add_edge(flush_node, + "_out", + wnode, + None, + Memlet.from_array(flush_name, scal)) + + return window_name + + +@oprepo.replaces_method('RMA_window', 'Lock') +def _rma_lock(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + window_name: str, + rank: Union[str, sp.Expr, Number], + lock_type: Union[str, sp.Expr, Number] = 234, # MPI.LOCK_EXCLUSIVE = 234 + assertion: Union[str, sp.Expr, Number] = 0): + """ Adds a RMA lock to the DaCe Program. + + :param window_name: The name of the window to be sychronized. + :param assertion: A value or scalar for lock assertion. + :return: Name of the lock. 
+ """ + + from dace.libraries.mpi.nodes.win_lock import Win_lock + + # fine a new lock name + lock_name = sdfg.add_rma_ops(window_name, "lock") + lock_node = Win_lock(lock_name, window_name) + + _, rank_node = _get_int_arg_node(pv, sdfg, state, rank) + _, lock_type_node = _get_int_arg_node(pv, sdfg, state, lock_type) + _, assertion_node = _get_int_arg_node(pv, sdfg, state, assertion) + + # check for the last RMA operation + all_rma_ops_name = list(sdfg._rma_ops.keys()) + cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name + if f"{window_name}_" in rma_op] + if len(cur_window_rma_ops) == 1: + last_rma_op_name = window_name + else: + last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(lock_name) - 1] + + last_rma_op_node = state.add_read(last_rma_op_name) + last_rma_op_desc = sdfg.arrays[last_rma_op_name] + + # for window lock ordering + state.add_edge(last_rma_op_node, + None, + lock_node, + None, + Memlet.from_array(last_rma_op_name, last_rma_op_desc)) + + state.add_edge(rank_node, + None, + lock_node, + '_rank', + Memlet.simple(rank_node, "0:1", num_accesses=1)) + + state.add_edge(lock_type_node, + None, + lock_node, + '_lock_type', + Memlet.simple(lock_type_node, "0:1", num_accesses=1)) + + state.add_edge(assertion_node, + None, + lock_node, + '_assertion', + Memlet.simple(assertion_node, "0:1", num_accesses=1)) + + # Pseudo-writing for newast.py #3195 check and complete Processcomm creation + _, scal = sdfg.add_scalar(lock_name, dace.int32, transient=True) + wnode = state.add_write(lock_name) + state.add_edge(lock_node, + "_out", + wnode, + None, + Memlet.from_array(lock_name, scal)) + + return window_name + + +@oprepo.replaces_method('RMA_window', 'Unlock') +def _rma_unlock(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + window_name: str, + rank: Union[str, sp.Expr, Number]): + """ Adds a RMA unlock to the DaCe Program. + Completes an RMA access epoch at the target process + + :param window_name: The name of the window to be sychronized. + :param rank: A value or scalar to specify the target rank. + :return: Name of the Unlock. 
+ """ + + from dace.libraries.mpi.nodes.win_unlock import Win_unlock + + # fine a new unlock name + unlock_name = sdfg.add_rma_ops(window_name, "unlock") + + _, rank_node = _get_int_arg_node(pv, sdfg, state, rank) + + unlock_node = Win_unlock(unlock_name, window_name) + + # check for the last RMA operation + all_rma_ops_name = list(sdfg._rma_ops.keys()) + cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name + if f"{window_name}_" in rma_op] + if len(cur_window_rma_ops) == 1: + last_rma_op_name = window_name + else: + last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(unlock_name) - 1] + + last_rma_op_node = state.add_read(last_rma_op_name) + last_rma_op_desc = sdfg.arrays[last_rma_op_name] + + # for ordering + state.add_edge(last_rma_op_node, + None, + unlock_node, + None, + Memlet.from_array(last_rma_op_name, last_rma_op_desc)) + + state.add_edge(rank_node, + None, + unlock_node, + '_rank', + Memlet.simple(rank_node, "0:1", num_accesses=1)) + + # Pseudo-writing for newast.py #3195 check and complete Processcomm creation + _, scal = sdfg.add_scalar(unlock_name, dace.int32, transient=True) + wnode = state.add_write(unlock_name) + state.add_edge(unlock_node, + "_out", + wnode, + None, + Memlet.from_array(unlock_name, scal)) + + return window_name + + @oprepo.replaces_method('RMA_window', 'Put') def _rma_put(pv: ProgramVisitor, sdfg: SDFG, @@ -1048,14 +1242,20 @@ def _rma_put(pv: ProgramVisitor, cur_window_fences = [rma_op for rma_op in cur_window_rma_ops if f"{window_name}_fence" in rma_op] - if len(cur_window_fences) % 2: - # if only odd number of fences, - # that means we're in a ongoing epoch - last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(put_name) - 1] - else: + last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(put_name) - 1] + + # if only odd number of fences, + # that means we're in a ongoing epoch + if len(cur_window_fences) % 2 == 0: # if even number of fences, # that means this operation is either a passive sync. one or a corrupted one - raise ValueError("Wrong synchronization of RMA calls!") + # same logic applies to passive sync. + cur_window_passive_syncs = [rma_op for rma_op in cur_window_rma_ops + if "lock" in rma_op] + # if we don't have even number of syncs, give user a warning + if len(cur_window_passive_syncs) % 2 == 0: + print("You might have a bad synchronization of RMA calls!") + put_node = Win_put(put_name, window_name) @@ -1120,14 +1320,19 @@ def _rma_get(pv: ProgramVisitor, cur_window_fences = [rma_op for rma_op in cur_window_rma_ops if f"{window_name}_fence" in rma_op] - if len(cur_window_fences) % 2: - # if only odd number of fences, - # that means we're in a ongoing epoch - last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(get_name) - 1] - else: + last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(get_name) - 1] + + # if only odd number of fences, + # that means we're in a ongoing epoch + if len(cur_window_fences) % 2 == 0: # if even number of fences, # that means this operation is either a passive sync. one or a corrupted one - raise ValueError("Wrong synchronization of RMA calls!") + # same logic applies to passive sync. 
+ cur_window_passive_syncs = [rma_op for rma_op in cur_window_rma_ops + if "lock" in rma_op] + # if we don't have even number of syncs, give user a warning + if len(cur_window_passive_syncs) % 2 == 0: + print("You might have a bad synchronization of RMA calls!") get_node = Win_get(get_name, window_name) @@ -1197,14 +1402,19 @@ def _rma_accumulate(pv: ProgramVisitor, cur_window_fences = [rma_op for rma_op in cur_window_rma_ops if f"{window_name}_fence" in rma_op] - if len(cur_window_fences) % 2: - # if only odd number of fences, - # that means we're in a ongoing epoch - last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(accumulate_name) - 1] - else: + last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(accumulate_name) - 1] + + # if only odd number of fences, + # that means we're in a ongoing epoch + if len(cur_window_fences) % 2 == 0: # if even number of fences, # that means this operation is either a passive sync. one or a corrupted one - raise ValueError("Wrong synchronization of RMA calls!") + # same logic applies to passive sync. + cur_window_passive_syncs = [rma_op for rma_op in cur_window_rma_ops + if "lock" in rma_op] + # if we don't have even number of syncs, give user a warning + if len(cur_window_passive_syncs) % 2 == 0: + print("You might have a bad synchronization of RMA calls!") accumulate_node = Win_accumulate(accumulate_name, window_name, op) diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index df34bb6e65..55cbff160b 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -138,6 +138,124 @@ def mpi4py_rma_accumulate(win_buf: dace.int32[10], send_buf: dace.int32[10], ran assert (np.array_equal(win_buffer, win_buffer_ref)) + +@pytest.mark.mpi +def test_passive_RMA_put(): + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + @dace.program + def mpi4py_passive_rma_put(win_buf: dace.int32[10], send_buf: dace.int32[10], rank: dace.int32): + win = MPI.Win.Create(win_buf, comm=commworld) + win.Lock(rank) + win.Put(send_buf, target_rank=rank) + win.Flush(rank) + win.Unlock(rank) + + # as MPI barrier + win.Fence(0) + win.Fence(0) + + if size < 2: + raise ValueError("Please run this test with at least two processes.") + + sdfg = None + if rank == 0: + sdfg = mpi4py_passive_rma_put.to_sdfg() + func = utils.distributed_compile(sdfg, commworld) + + window_size = 10 + win_buffer = np.full(window_size, rank, dtype=np.int32) + win_buffer_ref = np.full(window_size, rank, dtype=np.int32) + send_buffer = np.full(window_size, rank, dtype=np.int32) + + + func(win_buf=win_buffer, send_buf=send_buffer, rank=((rank + 1) % size)) + mpi4py_passive_rma_put.f(win_buf=win_buffer_ref, send_buf=send_buffer, rank=((rank + 1) % size)) + + assert (np.array_equal(win_buffer, win_buffer_ref)) + + +@pytest.mark.mpi +def test_passive_RMA_get(): + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + @dace.program + def mpi4py_passive_rma_get(win_buf: dace.int32[10], recv_buf: dace.int32[10], rank: dace.int32): + win = MPI.Win.Create(win_buf, comm=commworld) + win.Lock(rank) + win.Get(recv_buf, target_rank=rank) + win.Flush(rank) + win.Unlock(rank) + + # as MPI barrier + win.Fence(0) + win.Fence(0) + + if size < 2: + raise ValueError("Please run this test with at least two processes.") + + sdfg = None + if rank == 0: + sdfg = mpi4py_passive_rma_get.to_sdfg() + func = utils.distributed_compile(sdfg, 
commworld) + + window_size = 10 + win_buffer = np.full(window_size, rank, dtype=np.int32) + recv_buf = np.full(window_size, rank, dtype=np.int32) + recv_buf_ref = np.full(window_size, rank, dtype=np.int32) + + func(win_buf=win_buffer, recv_buf=recv_buf, rank=((rank + 1) % size)) + mpi4py_passive_rma_get.f(win_buf=win_buffer, recv_buf=recv_buf_ref, rank=((rank + 1) % size)) + + assert (np.array_equal(recv_buf, recv_buf_ref)) + + +@pytest.mark.mpi +def test_RMA_passive_accumulate(): + from mpi4py import MPI + commworld = MPI.COMM_WORLD + rank = commworld.Get_rank() + size = commworld.Get_size() + + # sum all rank at rank 0 + @dace.program + def mpi4py_passive_rma_accumulate(win_buf: dace.int32[10], send_buf: dace.int32[10], rank: dace.int32): + win = MPI.Win.Create(win_buf, comm=commworld) + win.Lock(rank) + win.Accumulate(send_buf, target_rank=rank, op=MPI.SUM) + win.Flush(rank) + win.Unlock(rank) + + # as MPI barrier + win.Fence(0) + win.Fence(0) + + if size < 2: + raise ValueError("Please run this test with at least two processes.") + + sdfg = None + if rank == 0: + sdfg = mpi4py_passive_rma_accumulate.to_sdfg() + func = utils.distributed_compile(sdfg, commworld) + + window_size = 10 + win_buffer = np.full(window_size, rank, dtype=np.int32) + win_buffer_ref = np.full(window_size, rank, dtype=np.int32) + send_buffer = np.full(window_size, rank, dtype=np.int32) + + func(win_buf=win_buffer, send_buf=send_buffer, rank=0) + mpi4py_passive_rma_accumulate.f(win_buf=win_buffer_ref, send_buf=send_buffer, rank=0) + + if rank == 0: + assert (np.array_equal(win_buffer, win_buffer_ref)) + + @pytest.mark.mpi def test_external_comm_bcast(): @@ -450,3 +568,6 @@ def mpi4py_alltoall(rank: dace.int32, size: dace.compiletime): test_RMA_put() test_RMA_get() test_RMA_accumulate() + test_passive_RMA_put() + test_passive_RMA_get() + test_RMA_passive_accumulate() From ada74b333a1544856335b67ebd70d1fec9873a32 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Thu, 31 Aug 2023 22:29:09 +0800 Subject: [PATCH 17/28] Added lock type validator for better MPI compatibility --- dace/frontend/common/distr.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index e6f50bec6d..85d0753f3a 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -1088,7 +1088,7 @@ def _rma_lock(pv: ProgramVisitor, state: SDFGState, window_name: str, rank: Union[str, sp.Expr, Number], - lock_type: Union[str, sp.Expr, Number] = 234, # MPI.LOCK_EXCLUSIVE = 234 + lock_type: Union[str, sp.Expr, Number] = 234, # in intel MPI MPI.LOCK_EXCLUSIVE = 234 assertion: Union[str, sp.Expr, Number] = 0): """ Adds a RMA lock to the DaCe Program. 
@@ -1103,6 +1103,11 @@ def _rma_lock(pv: ProgramVisitor, lock_name = sdfg.add_rma_ops(window_name, "lock") lock_node = Win_lock(lock_name, window_name) + # different MPI might get other value + if lock_type == 234: + from mpi4py import MPI + lock_type = MPI.LOCK_EXCLUSIVE + _, rank_node = _get_int_arg_node(pv, sdfg, state, rank) _, lock_type_node = _get_int_arg_node(pv, sdfg, state, lock_type) _, assertion_node = _get_int_arg_node(pv, sdfg, state, assertion) From 892dda54c35a100ae1398ae593f6dc3078ecc91b Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Thu, 7 Sep 2023 00:17:48 +0800 Subject: [PATCH 18/28] Added a GEMM sample for RMA --- samples/mpi/mat_mul.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 samples/mpi/mat_mul.py diff --git a/samples/mpi/mat_mul.py b/samples/mpi/mat_mul.py new file mode 100644 index 0000000000..8e3396c793 --- /dev/null +++ b/samples/mpi/mat_mul.py @@ -0,0 +1,29 @@ +import numpy as np + +dim_1 = 200 +dim_2 = 300 + +a = np.arange(dim_1 * dim_2).reshape(dim_1, dim_2) +b = np.arange(dim_1 * dim_2).reshape(dim_2, dim_1) + +def matrix_mul(a, b): + a_mat = np.array(a) + b_mat = np.array(b) + c_mat = np.zeros((a_mat.shape[0], b_mat.shape[1])) + + if a_mat.shape[1] != b_mat.shape[0]: + raise ValueError("A, B matrix dimension mismatched!") + + # more or less like C stationary + for i in range(a_mat.shape[0]): + for j in range(b_mat.shape[1]): + for k in range(a_mat.shape[1]): + c_mat[i][j] += a_mat[i][k] * b_mat[k][j] + + return c_mat + + +# print(matrix_mul(a,b)) +# print(np.matmul(a,b)) + +print("Result correctness:", np.allclose(matrix_mul(a,b), np.matmul(a,b))) \ No newline at end of file From 8149219ad88a42cde819b0bdc63a6b99142e6b5f Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Thu, 7 Sep 2023 23:57:22 +0800 Subject: [PATCH 19/28] Updated mat_mul to tiled version --- samples/mpi/mat_mul.py | 63 ++++++++++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 17 deletions(-) diff --git a/samples/mpi/mat_mul.py b/samples/mpi/mat_mul.py index 8e3396c793..47d42e7f06 100644 --- a/samples/mpi/mat_mul.py +++ b/samples/mpi/mat_mul.py @@ -1,29 +1,58 @@ import numpy as np +import dace +from dace.sdfg import utils +import dace.dtypes as dtypes +import time -dim_1 = 200 -dim_2 = 300 +dim_1 = 1024 +dim_2 = 1024 + +tile = 128 a = np.arange(dim_1 * dim_2).reshape(dim_1, dim_2) b = np.arange(dim_1 * dim_2).reshape(dim_2, dim_1) def matrix_mul(a, b): - a_mat = np.array(a) - b_mat = np.array(b) - c_mat = np.zeros((a_mat.shape[0], b_mat.shape[1])) + a_mat = np.array(a, dtype=np.int64) + b_mat = np.array(b, dtype=np.int64) + c_mat = np.zeros((a_mat.shape[0], b_mat.shape[1]), dtype=np.int64) if a_mat.shape[1] != b_mat.shape[0]: raise ValueError("A, B matrix dimension mismatched!") # more or less like C stationary - for i in range(a_mat.shape[0]): - for j in range(b_mat.shape[1]): - for k in range(a_mat.shape[1]): - c_mat[i][j] += a_mat[i][k] * b_mat[k][j] - - return c_mat - - -# print(matrix_mul(a,b)) -# print(np.matmul(a,b)) - -print("Result correctness:", np.allclose(matrix_mul(a,b), np.matmul(a,b))) \ No newline at end of file + # for i in range(a_mat.shape[0]): + # for j in range(b_mat.shape[1]): + # for k in range(a_mat.shape[1]): + # c_mat[i][j] += a_mat[i][k] * b_mat[k][j] + + @dace.program + def mpi4py_passive_rma_put(a_mat: dace.int64[dim_1,dim_2], b_mat: dace.int64[dim_1,dim_2], c_mat: dace.int64[dim_1,dim_2], tile: dace.int64): + for i_tile in range(a_mat.shape[0] // tile): + for j_tile in 
range(b_mat.shape[1] // tile): + for k_tile in range(a_mat.shape[1] // tile): + for i in range(i_tile * tile, min((i_tile + 1) * tile, a_mat.shape[0])): + for j in range(j_tile * tile, min((j_tile + 1) * tile, b_mat.shape[1])): + for k in range(k_tile * tile, min((k_tile + 1) * tile, a_mat.shape[1])): + c_mat[i][j] += a_mat[i][k] * b_mat[k][j] + + sdfg = None + sdfg = mpi4py_passive_rma_put.to_sdfg() + sdfg.openmp_sections = False + func = sdfg.compile() + + start = time.time() + func(a_mat, b_mat, c_mat, tile) + time_con = time.time() - start + + return c_mat, time_con + +c_mat, time_con = matrix_mul(a,b) +print(c_mat, time_con) + +start = time.time() +c_np = np.matmul(a,b) +time_con = time.time() - start +print(c_np, time_con) + +print("Result correctness:", np.allclose(c_mat, c_np)) From 380b5e462e0a5d0a450aba1f003303c7673c5fac Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 8 Sep 2023 14:51:24 +0800 Subject: [PATCH 20/28] Implemented distributed version mat_mul.py --- samples/mpi/mat_mul.py | 134 +++++++++++++++++++++++++++++------------ 1 file changed, 95 insertions(+), 39 deletions(-) diff --git a/samples/mpi/mat_mul.py b/samples/mpi/mat_mul.py index 47d42e7f06..dc6bc6f80c 100644 --- a/samples/mpi/mat_mul.py +++ b/samples/mpi/mat_mul.py @@ -2,57 +2,113 @@ import dace from dace.sdfg import utils import dace.dtypes as dtypes +from mpi4py import MPI import time -dim_1 = 1024 -dim_2 = 1024 -tile = 128 +# to check if this process owns this chunk of data +# compare given i and j with grid_i and grid_j +def owner(i, j, grid_i, grid_j): + if i == grid_i and j == grid_j: + return True + else: + return False -a = np.arange(dim_1 * dim_2).reshape(dim_1, dim_2) -b = np.arange(dim_1 * dim_2).reshape(dim_2, dim_1) -def matrix_mul(a, b): - a_mat = np.array(a, dtype=np.int64) - b_mat = np.array(b, dtype=np.int64) - c_mat = np.zeros((a_mat.shape[0], b_mat.shape[1]), dtype=np.int64) +# get matrix form remote rank +def get_mat(win, buffer, dim_0, dim_1, grid_dim): + rank = dim_0 * grid_dim + dim_1 + win.Lock(rank) + win.Get(buffer, target_rank=rank) + win.Flush(rank) + win.Unlock(rank) - if a_mat.shape[1] != b_mat.shape[0]: + +def matrix_mul(comm_world, a, b): + # check if matrix multiplication is valid + if a.shape[1] != b.shape[0]: raise ValueError("A, B matrix dimension mismatched!") - # more or less like C stationary - # for i in range(a_mat.shape[0]): - # for j in range(b_mat.shape[1]): - # for k in range(a_mat.shape[1]): - # c_mat[i][j] += a_mat[i][k] * b_mat[k][j] - - @dace.program - def mpi4py_passive_rma_put(a_mat: dace.int64[dim_1,dim_2], b_mat: dace.int64[dim_1,dim_2], c_mat: dace.int64[dim_1,dim_2], tile: dace.int64): - for i_tile in range(a_mat.shape[0] // tile): - for j_tile in range(b_mat.shape[1] // tile): - for k_tile in range(a_mat.shape[1] // tile): - for i in range(i_tile * tile, min((i_tile + 1) * tile, a_mat.shape[0])): - for j in range(j_tile * tile, min((j_tile + 1) * tile, b_mat.shape[1])): - for k in range(k_tile * tile, min((k_tile + 1) * tile, a_mat.shape[1])): - c_mat[i][j] += a_mat[i][k] * b_mat[k][j] - - sdfg = None - sdfg = mpi4py_passive_rma_put.to_sdfg() - sdfg.openmp_sections = False - func = sdfg.compile() - + # comm init + comm_rank = comm_world.Get_rank() + comm_size = comm_world.Get_size() + + grid_dim = int(np.floor(np.sqrt(comm_size))) + grid_i = comm_rank // grid_dim + grid_j = comm_rank % grid_dim + + local_i_dim = a.shape[0] + local_j_dim = b.shape[1] + local_k_dim = a.shape[1] + + whole_i_dim = grid_dim * a.shape[0] + whole_j_dim = grid_dim * 
b.shape[1] + whole_k_dim = grid_dim * a.shape[1] + + a_mat = np.array(a + comm_rank, dtype=np.int32) + b_mat = np.array(b + comm_rank, dtype=np.int32) + c_mat = np.zeros((a_mat.shape[0], b_mat.shape[1]), dtype=np.int32) + + # local buffers for remote fetching + foreign_a_mat = np.zeros(a.shape, dtype=np.int32) + foreign_b_mat = np.zeros(b.shape, dtype=np.int32) + + # RMA windows + a_win = MPI.Win.Create(a_mat, comm=comm_world) + b_win = MPI.Win.Create(b_mat, comm=comm_world) + start = time.time() - func(a_mat, b_mat, c_mat, tile) + for i in range(whole_i_dim // local_i_dim): + for j in range(whole_j_dim // local_j_dim): + for k in range(whole_k_dim // local_k_dim): + if owner(i, j, grid_i, grid_j): + get_mat(a_win, foreign_a_mat, i, k, grid_dim) + get_mat(b_win, foreign_b_mat, k, j, grid_dim) + + c_mat += np.matmul(foreign_a_mat, foreign_b_mat) time_con = time.time() - start + # to ensure every process completed the calculation + comm_world.Barrier() + return c_mat, time_con -c_mat, time_con = matrix_mul(a,b) -print(c_mat, time_con) -start = time.time() -c_np = np.matmul(a,b) -time_con = time.time() - start -print(c_np, time_con) +if __name__ == "__main__": + comm_world = MPI.COMM_WORLD + comm_rank = comm_world.Get_rank() + comm_size = comm_world.Get_size() + + grid_dim = int(np.floor(np.sqrt(comm_size))) + grid_i = comm_rank // grid_dim + grid_j = comm_rank % grid_dim + + dim_1 = 256 + dim_2 = 256 + + a = np.ones((dim_1, dim_2), dtype=np.int32) + b = np.ones((dim_2, dim_1), dtype=np.int32) + + c_mat, time_con = matrix_mul(comm_world, a, b) + # print(comm_rank, c_mat) + # print(comm_rank, "matrix_mul time:", time_con) + + whole_a = np.ones((dim_1 * grid_dim, dim_2 * grid_dim), dtype=np.int32) + for i in range(grid_dim): + for j in range(grid_dim): + whole_a[i * dim_1:(i+1) * dim_1, j * dim_2:(j+1) * dim_2] += i * grid_dim + j + + whole_b = np.ones((dim_2 * grid_dim, dim_1 * grid_dim), dtype=np.int32) + for i in range(grid_dim): + for j in range(grid_dim): + whole_b[i * dim_2:(i+1) * dim_2, j * dim_1:(j+1) * dim_1] += i * grid_dim + j + + start = time.time() + c_np = np.matmul(whole_a, whole_b) + time_con = time.time() - start + + # print(comm_rank, c_np[grid_i * dim_1:(grid_i+1) * dim_1, grid_j* dim_2:(grid_j+1) * dim_2]) + # print(comm_rank, "np.matmul time:", time_con) -print("Result correctness:", np.allclose(c_mat, c_np)) + # print("Result correctness:", np.allclose(c_mat, c_np[grid_i * dim_1:(grid_i+1) * dim_1, grid_j* dim_2:(grid_j+1) * dim_2])) + assert(np.allclose(c_mat, c_np[grid_i * dim_1:(grid_i+1) * dim_1, grid_j* dim_2:(grid_j+1) * dim_2])) From c8477179ff14a9fe1b956b8045498b6a5705fb04 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 8 Sep 2023 15:25:18 +0800 Subject: [PATCH 21/28] Functionized the distibuted computation in mat_mul.py --- samples/mpi/mat_mul.py | 80 +++++++++++++------------- samples/mpi/ping_pong.py | 117 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 155 insertions(+), 42 deletions(-) create mode 100644 samples/mpi/ping_pong.py diff --git a/samples/mpi/mat_mul.py b/samples/mpi/mat_mul.py index dc6bc6f80c..c5df0c6c32 100644 --- a/samples/mpi/mat_mul.py +++ b/samples/mpi/mat_mul.py @@ -5,23 +5,46 @@ from mpi4py import MPI import time +def dist_mat_mult(a_mat, b_mat, c_mat, comm_rank, comm_size): + grid_dim = int(np.floor(np.sqrt(comm_size))) + grid_i = comm_rank // grid_dim + grid_j = comm_rank % grid_dim -# to check if this process owns this chunk of data -# compare given i and j with grid_i and grid_j -def owner(i, j, grid_i, grid_j): - 
if i == grid_i and j == grid_j: - return True - else: - return False + local_i_dim = a_mat.shape[0] + local_j_dim = b_mat.shape[1] + local_k_dim = a_mat.shape[1] + whole_i_dim = grid_dim * a_mat.shape[0] + whole_j_dim = grid_dim * b_mat.shape[1] + whole_k_dim = grid_dim * a_mat.shape[1] -# get matrix form remote rank -def get_mat(win, buffer, dim_0, dim_1, grid_dim): - rank = dim_0 * grid_dim + dim_1 - win.Lock(rank) - win.Get(buffer, target_rank=rank) - win.Flush(rank) - win.Unlock(rank) + # local buffers for remote fetching + foreign_a_mat = np.zeros(a_mat.shape, dtype=np.int32) + foreign_b_mat = np.zeros(b_mat.shape, dtype=np.int32) + + # RMA windows + a_win = MPI.Win.Create(a_mat, comm=comm_world) + b_win = MPI.Win.Create(b_mat, comm=comm_world) + for i in range(whole_i_dim // local_i_dim): + for j in range(whole_j_dim // local_j_dim): + for k in range(whole_k_dim // local_k_dim): + # check if this process owns this chunk of data + if i == grid_i and j == grid_j: + target_rank_a = i * grid_dim + k + target_rank_b = k * grid_dim + j + a_win.Lock(target_rank_a) + b_win.Lock(target_rank_b) + + a_win.Get(foreign_a_mat, target_rank=target_rank_a) + b_win.Get(foreign_b_mat, target_rank=target_rank_b) + + a_win.Flush(target_rank_a) + b_win.Flush(target_rank_b) + + a_win.Unlock(target_rank_a) + b_win.Unlock(target_rank_b) + + c_mat += np.matmul(foreign_a_mat, foreign_b_mat) def matrix_mul(comm_world, a, b): @@ -33,39 +56,12 @@ def matrix_mul(comm_world, a, b): comm_rank = comm_world.Get_rank() comm_size = comm_world.Get_size() - grid_dim = int(np.floor(np.sqrt(comm_size))) - grid_i = comm_rank // grid_dim - grid_j = comm_rank % grid_dim - - local_i_dim = a.shape[0] - local_j_dim = b.shape[1] - local_k_dim = a.shape[1] - - whole_i_dim = grid_dim * a.shape[0] - whole_j_dim = grid_dim * b.shape[1] - whole_k_dim = grid_dim * a.shape[1] - a_mat = np.array(a + comm_rank, dtype=np.int32) b_mat = np.array(b + comm_rank, dtype=np.int32) c_mat = np.zeros((a_mat.shape[0], b_mat.shape[1]), dtype=np.int32) - # local buffers for remote fetching - foreign_a_mat = np.zeros(a.shape, dtype=np.int32) - foreign_b_mat = np.zeros(b.shape, dtype=np.int32) - - # RMA windows - a_win = MPI.Win.Create(a_mat, comm=comm_world) - b_win = MPI.Win.Create(b_mat, comm=comm_world) - start = time.time() - for i in range(whole_i_dim // local_i_dim): - for j in range(whole_j_dim // local_j_dim): - for k in range(whole_k_dim // local_k_dim): - if owner(i, j, grid_i, grid_j): - get_mat(a_win, foreign_a_mat, i, k, grid_dim) - get_mat(b_win, foreign_b_mat, k, j, grid_dim) - - c_mat += np.matmul(foreign_a_mat, foreign_b_mat) + dist_mat_mult(a_mat, b_mat, c_mat, comm_rank, comm_size) time_con = time.time() - start # to ensure every process completed the calculation diff --git a/samples/mpi/ping_pong.py b/samples/mpi/ping_pong.py new file mode 100644 index 0000000000..d8cf490f62 --- /dev/null +++ b/samples/mpi/ping_pong.py @@ -0,0 +1,117 @@ +import numpy as np +import dace +from dace.sdfg import utils +import dace.dtypes as dtypes +from mpi4py import MPI +import time + +dim_1 = 128 +dim_2 = 128 + +a = np.arange(dim_1 * dim_2).reshape(dim_1, dim_2) +b = np.arange(dim_1 * dim_2).reshape(dim_2, dim_1) + +# to check if this process owns this chunk of data +# compare given i and j with grid_i and grid_j +@dace.program +def owner(i, j, grid_i, grid_j): + if i == grid_i and j == grid_j: + return True + else: + return False + +# get matrix form remote rank +@dace.program +def get_mat(win: dace.RMA_window, buffer: dace.int32[dim_1,dim_2], dim_0: 
dace.int32, dim_1: dace.int32, grid_dim: dace.int32): + rank = dim_0 * grid_dim + dim_1 + win.Lock(rank) + win.Get(buffer, target_rank=rank) + win.Flush(rank) + win.Unlock(rank) + +def matrix_mul(a, b): + # check if matrix multiplication is valid + if a.shape[1] != b.shape[0]: + raise ValueError("A, B matrix dimension mismatched!") + + # comm init + comm_world = MPI.COMM_WORLD + comm_rank = comm_world.Get_rank() + comm_size = comm_world.Get_size() + + grid_dim = 2 + grid_i = comm_rank // grid_dim + grid_j = comm_rank % grid_dim + + if comm_size != 2: + raise ValueError("Please run this test with two processes.") + + a_mat = np.array(a + comm_rank, dtype=np.int64) + b_mat = np.array(b + comm_rank, dtype=np.int64) + foreign_a_mat = np.zeros(a.shape, dtype=np.int64) + foreign_b_mat = np.zeros(b.shape, dtype=np.int64) + c_mat = np.zeros((a_mat.shape[0], b_mat.shape[1]), dtype=np.int64) + + + # more or less like C stationary + # for i in range(a_mat.shape[0]): + # for j in range(b_mat.shape[1]): + # for k in range(a_mat.shape[1]): + # c_mat[i][j] += a_mat[i][k] * b_mat[k][j] + + + @dace.program + def mpi4py_send_recv(comm_rank: dace.int32, a_mat: dace.int32[dim_1,dim_2], foreign_a_mat: dace.int32[dim_1,dim_2], grid_dim: dace.int32): + a_win = MPI.Win.Create(a_mat, comm=comm_world) + if comm_rank == 0: + get_mat(a_win, foreign_a_mat, 0, 1, grid_dim) + else: + get_mat(a_win, foreign_a_mat, 0, 0, grid_dim) + return foreign_a_mat + + sdfg = None + if comm_rank == 0: + sdfg = mpi4py_send_recv.to_sdfg(simplify=True) + func = utils.distributed_compile(sdfg, comm_world) + + + start = time.time() + + foreign_a_mat = func(comm_rank=comm_rank, a_mat=a_mat, foreign_a_mat=foreign_a_mat, grid_dim=grid_dim) + if comm_rank == 0: + if(np.allclose(a_mat+1, foreign_a_mat)): + print("Good") + else: + if(np.allclose(a_mat-1, foreign_a_mat)): + print("Good") + + time_con = time.time() - start + + + # to ensure every process completed the calculation + comm_world.Barrier() + +matrix_mul(a,b) + + # more or less like C stationary + # for i in range(a_mat.shape[0]): + # for j in range(b_mat.shape[1]): + # for k in range(a_mat.shape[1]): + # c_mat[i][j] += a_mat[i][k] * b_mat[k][j] + + # @dace.program + # def mpi4py_passive_rma_put(a_mat: dace.int32[dim_1,dim_2], b_mat: dace.int32[dim_1,dim_2], c_mat: dace.int32[dim_1,dim_2], tile: dace.int32): + # for i_tile in range(a_mat.shape[0] // tile): + # for j_tile in range(b_mat.shape[1] // tile): + # for k_tile in range(a_mat.shape[1] // tile): + # for i in range(i_tile * tile, min((i_tile + 1) * tile, a_mat.shape[0])): + # for j in range(j_tile * tile, min((j_tile + 1) * tile, b_mat.shape[1])): + # for k in range(k_tile * tile, min((k_tile + 1) * tile, a_mat.shape[1])): + # c_mat[i][j] += a_mat[i][k] * b_mat[k][j] + + # sdfg = None + # sdfg = mpi4py_passive_rma_put.to_sdfg() + # sdfg.openmp_sections = False + # func = sdfg.compile() + + # func(a_mat, b_mat, c_mat, tile) \ No newline at end of file From ca65d87d84868aa739f0cdb841af6705b9ee9c56 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Mon, 11 Sep 2023 14:46:48 +0800 Subject: [PATCH 22/28] Enabled dace acceleration for mat_mul.py --- samples/mpi/mat_mul.py | 103 +++++++++++++++++++++++------------------ 1 file changed, 58 insertions(+), 45 deletions(-) diff --git a/samples/mpi/mat_mul.py b/samples/mpi/mat_mul.py index c5df0c6c32..dd8fcbee2c 100644 --- a/samples/mpi/mat_mul.py +++ b/samples/mpi/mat_mul.py @@ -5,47 +5,6 @@ from mpi4py import MPI import time -def dist_mat_mult(a_mat, b_mat, c_mat, comm_rank, 
comm_size): - grid_dim = int(np.floor(np.sqrt(comm_size))) - grid_i = comm_rank // grid_dim - grid_j = comm_rank % grid_dim - - local_i_dim = a_mat.shape[0] - local_j_dim = b_mat.shape[1] - local_k_dim = a_mat.shape[1] - - whole_i_dim = grid_dim * a_mat.shape[0] - whole_j_dim = grid_dim * b_mat.shape[1] - whole_k_dim = grid_dim * a_mat.shape[1] - - # local buffers for remote fetching - foreign_a_mat = np.zeros(a_mat.shape, dtype=np.int32) - foreign_b_mat = np.zeros(b_mat.shape, dtype=np.int32) - - # RMA windows - a_win = MPI.Win.Create(a_mat, comm=comm_world) - b_win = MPI.Win.Create(b_mat, comm=comm_world) - for i in range(whole_i_dim // local_i_dim): - for j in range(whole_j_dim // local_j_dim): - for k in range(whole_k_dim // local_k_dim): - # check if this process owns this chunk of data - if i == grid_i and j == grid_j: - target_rank_a = i * grid_dim + k - target_rank_b = k * grid_dim + j - a_win.Lock(target_rank_a) - b_win.Lock(target_rank_b) - - a_win.Get(foreign_a_mat, target_rank=target_rank_a) - b_win.Get(foreign_b_mat, target_rank=target_rank_b) - - a_win.Flush(target_rank_a) - b_win.Flush(target_rank_b) - - a_win.Unlock(target_rank_a) - b_win.Unlock(target_rank_b) - - c_mat += np.matmul(foreign_a_mat, foreign_b_mat) - def matrix_mul(comm_world, a, b): # check if matrix multiplication is valid @@ -60,12 +19,66 @@ def matrix_mul(comm_world, a, b): b_mat = np.array(b + comm_rank, dtype=np.int32) c_mat = np.zeros((a_mat.shape[0], b_mat.shape[1]), dtype=np.int32) + @dace.program + def dist_mat_mult(a_mat: dace.int32[a_mat.shape[0], a_mat.shape[1]], + b_mat: dace.int32[b_mat.shape[0], b_mat.shape[1]], + c_mat: dace.int32[a_mat.shape[0], b_mat.shape[1]], + comm_rank: dace.int32, + comm_size: dace.int32): + grid_dim = int(np.floor(np.sqrt(comm_size))) + grid_i = comm_rank // grid_dim + grid_j = comm_rank % grid_dim + + local_i_dim = a_mat.shape[0] + local_j_dim = b_mat.shape[1] + local_k_dim = a_mat.shape[1] + + whole_i_dim = grid_dim * a_mat.shape[0] + whole_j_dim = grid_dim * b_mat.shape[1] + whole_k_dim = grid_dim * a_mat.shape[1] + + # local buffers for remote fetching + foreign_a_mat = np.zeros(a_mat.shape, dtype=np.int32) + foreign_b_mat = np.zeros(b_mat.shape, dtype=np.int32) + + # RMA windows + a_win = MPI.Win.Create(a_mat, comm=comm_world) + b_win = MPI.Win.Create(b_mat, comm=comm_world) + for i in range(whole_i_dim // local_i_dim): + for j in range(whole_j_dim // local_j_dim): + for k in range(whole_k_dim // local_k_dim): + # check if this process owns this chunk of data + if i == grid_i and j == grid_j: + target_rank_a = i * grid_dim + k + target_rank_b = k * grid_dim + j + a_win.Lock(target_rank_a) + a_win.Get(foreign_a_mat, target_rank=target_rank_a) + a_win.Flush(target_rank_a) + a_win.Unlock(target_rank_a) + + b_win.Lock(target_rank_b) + b_win.Get(foreign_b_mat, target_rank=target_rank_b) + b_win.Flush(target_rank_b) + b_win.Unlock(target_rank_b) + + c_mat += foreign_a_mat @ foreign_b_mat + + # as MPI barrier + # to ensure every process completed the calculation + a_win.Fence(0) + a_win.Fence(0) + + sdfg = None + if comm_rank == 0: + # ValueError: Node type "Win_lock" not supported for promotion + sdfg = dist_mat_mult.to_sdfg(simplify=False) + func = utils.distributed_compile(sdfg, comm_world) + start = time.time() - dist_mat_mult(a_mat, b_mat, c_mat, comm_rank, comm_size) - time_con = time.time() - start - # to ensure every process completed the calculation - comm_world.Barrier() + func(a_mat=a_mat, b_mat=b_mat, c_mat=c_mat, comm_rank=comm_rank, comm_size=comm_size) + + 
time_con = time.time() - start return c_mat, time_con From f7231a08572b3aa71313cb3d346df0827e7de618 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Mon, 11 Sep 2023 16:06:44 +0800 Subject: [PATCH 23/28] Added MPI RMA free library node, replacement, and tests for both frontend and backend --- dace/frontend/common/distr.py | 52 ++++++++++++++ dace/libraries/mpi/nodes/__init__.py | 1 + dace/libraries/mpi/nodes/win_free.py | 43 +++++++++++ tests/library/mpi/mpi4py_test.py | 3 + tests/library/mpi/mpi_free_test.py | 102 +++++++++++++++++++++++++++ 5 files changed, 201 insertions(+) create mode 100644 dace/libraries/mpi/nodes/win_free.py create mode 100644 tests/library/mpi/mpi_free_test.py diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py index 85d0753f3a..fb40a22d4c 100644 --- a/dace/frontend/common/distr.py +++ b/dace/frontend/common/distr.py @@ -1082,6 +1082,58 @@ def _rma_flush(pv: ProgramVisitor, return window_name +@oprepo.replaces_method('RMA_window', 'Free') +def _rma_free(pv: ProgramVisitor, + sdfg: SDFG, + state: SDFGState, + window_name: str, + assertion: Union[str, sp.Expr, Number] = 0): + """ Adds a RMA free to the DaCe Program. + + :param window_name: The name of the window to be freed. + :return: Name of the free. + """ + + from dace.libraries.mpi.nodes.win_free import Win_free + + # fine a new free name + free_name = sdfg.add_rma_ops(window_name, "free") + + _, assertion_node = _get_int_arg_node(pv, sdfg, state, assertion) + + free_node = Win_free(free_name, window_name) + + # check for the last RMA operation + all_rma_ops_name = list(sdfg._rma_ops.keys()) + cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name + if f"{window_name}_" in rma_op] + if len(cur_window_rma_ops) == 1: + last_rma_op_name = window_name + else: + last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(free_name) - 1] + + last_rma_op_node = state.add_read(last_rma_op_name) + last_rma_op_desc = sdfg.arrays[last_rma_op_name] + + # for window free ordering + state.add_edge(last_rma_op_node, + None, + free_node, + "_in", + Memlet.from_array(last_rma_op_name, last_rma_op_desc)) + + # Pseudo-writing for newast.py #3195 check and complete Processcomm creation + _, scal = sdfg.add_scalar(free_name, dace.int32, transient=True) + wnode = state.add_write(free_name) + state.add_edge(free_node, + "_out", + wnode, + None, + Memlet.from_array(free_name, scal)) + + return window_name + + @oprepo.replaces_method('RMA_window', 'Lock') def _rma_lock(pv: ProgramVisitor, sdfg: SDFG, diff --git a/dace/libraries/mpi/nodes/__init__.py b/dace/libraries/mpi/nodes/__init__.py index 3d3e0ac8f9..91d97091ac 100644 --- a/dace/libraries/mpi/nodes/__init__.py +++ b/dace/libraries/mpi/nodes/__init__.py @@ -21,3 +21,4 @@ from .win_lock import Win_lock from .win_unlock import Win_unlock from .win_flush import Win_flush +from .win_free import Win_free diff --git a/dace/libraries/mpi/nodes/win_free.py b/dace/libraries/mpi/nodes/win_free.py new file mode 100644 index 0000000000..81009093fc --- /dev/null +++ b/dace/libraries/mpi/nodes/win_free.py @@ -0,0 +1,43 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. +import dace.library +import dace.properties +import dace.sdfg.nodes +from dace.transformation.transformation import ExpandTransformation +from .. 
import environments +from dace.libraries.mpi.nodes.node import MPINode + + +@dace.library.expansion +class ExpandWinFreeMPI(ExpandTransformation): + + environments = [environments.mpi.MPI] + + @staticmethod + def expansion(node, parent_state, parent_sdfg, **kwargs): + window_name = node.window_name + code = f""" + MPI_Win_free(&__state->{window_name}_window); + """ + tasklet = dace.sdfg.nodes.Tasklet(node.name, + node.in_connectors, + node.out_connectors, + code, + language=dace.dtypes.Language.CPP, + side_effects=True) + return tasklet + + +@dace.library.node +class Win_free(MPINode): + + # Global properties + implementations = { + "MPI": ExpandWinFreeMPI, + } + default_implementation = "MPI" + + window_name = dace.properties.Property(dtype=str, default=None) + + def __init__(self, name, window_name, *args, **kwargs): + super().__init__(name, *args, inputs={"_in"}, outputs={"_out"}, **kwargs) + self.window_name = window_name diff --git a/tests/library/mpi/mpi4py_test.py b/tests/library/mpi/mpi4py_test.py index 55cbff160b..124f4299dd 100644 --- a/tests/library/mpi/mpi4py_test.py +++ b/tests/library/mpi/mpi4py_test.py @@ -52,6 +52,7 @@ def mpi4py_rma_put(win_buf: dace.int32[10], send_buf: dace.int32[10], rank: dace win.Fence(0) win.Put(send_buf, target_rank=rank) win.Fence(0) + win.Free() if size < 2: raise ValueError("Please run this test with at least two processes.") @@ -158,6 +159,8 @@ def mpi4py_passive_rma_put(win_buf: dace.int32[10], send_buf: dace.int32[10], ra win.Fence(0) win.Fence(0) + win.Free() + if size < 2: raise ValueError("Please run this test with at least two processes.") diff --git a/tests/library/mpi/mpi_free_test.py b/tests/library/mpi/mpi_free_test.py new file mode 100644 index 0000000000..f87220e4bc --- /dev/null +++ b/tests/library/mpi/mpi_free_test.py @@ -0,0 +1,102 @@ +# Copyright 2019-2023 ETH Zurich and the DaCe authors. All rights reserved. 
+import dace
+from dace.sdfg import utils
+import dace.dtypes as dtypes
+from dace.memlet import Memlet
+import dace.libraries.mpi as mpi
+import dace.frontend.common.distr as comm
+import numpy as np
+import pytest
+
+
+###############################################################################
+
+
+def make_sdfg(dtype):
+    n = dace.symbol("n")
+
+    sdfg = dace.SDFG("mpi_win_free")
+    window_state = sdfg.add_state("create_window")
+
+    sdfg.add_array("win_buffer", [n], dtype=dtype, transient=False)
+    win_buffer = window_state.add_access("win_buffer")
+
+    window_name = sdfg.add_window()
+    win_create_node = mpi.nodes.win_create.Win_create(window_name)
+
+    window_state.add_edge(win_buffer,
+                          None,
+                          win_create_node,
+                          '_win_buffer',
+                          Memlet.simple(win_buffer, "0:n", num_accesses=n))
+
+    # so that other nodes that depend on this window can connect to it
+    _, scal = sdfg.add_scalar(window_name, dace.int32, transient=True)
+    wnode = window_state.add_write(window_name)
+    window_state.add_edge(win_create_node,
+                          "_out",
+                          wnode,
+                          None,
+                          Memlet.from_array(window_name, scal))
+
+###############################################################################
+
+    free_state = sdfg.add_state("win_free")
+
+    sdfg.add_edge(window_state, free_state, dace.InterstateEdge())
+
+    free_name = sdfg.add_rma_ops(window_name, "free")
+    win_free_node = mpi.nodes.win_free.Win_free(free_name, window_name)
+
+    # pseudo access for ordering
+    window_node = free_state.add_access(window_name)
+    window_desc = sdfg.arrays[window_name]
+
+    free_state.add_edge(window_node,
+                        None,
+                        win_free_node,
+                        "_in",
+                        Memlet.from_array(window_name, window_desc))
+
+    _, scal = sdfg.add_scalar(free_name, dace.int32, transient=True)
+    wnode = free_state.add_write(free_name)
+    free_state.add_edge(win_free_node,
+                        "_out",
+                        wnode,
+                        None,
+                        Memlet.from_array(free_name, scal))
+
+    return sdfg
+
+
+###############################################################################
+
+@pytest.mark.parametrize("implementation, dtype", [
+    pytest.param("MPI", dace.float32, marks=pytest.mark.mpi),
+    pytest.param("MPI", dace.int32, marks=pytest.mark.mpi)
+])
+def test_win_free(implementation, dtype):
+    from mpi4py import MPI
+    np_dtype = getattr(np, dtype.to_string())
+    comm_world = MPI.COMM_WORLD
+    comm_rank = comm_world.Get_rank()
+    comm_size = comm_world.Get_size()
+
+    if comm_size < 2:
+        raise ValueError("This test is supposed to be run with at least two processes!")
+
+    mpi_func = None
+    for r in range(0, comm_size):
+        if r == comm_rank:
+            sdfg = make_sdfg(dtype)
+            mpi_func = sdfg.compile()
+        comm_world.Barrier()
+
+    window_size = 10
+    win_buffer = np.arange(0, window_size, dtype=np_dtype)
+
+    mpi_func(win_buffer=win_buffer, n=window_size)
+
+if __name__ == "__main__":
+    test_win_free("MPI", dace.int32)
+    test_win_free("MPI", dace.float32)

From da62e8e201cd0184104c2eb6d54fb68b9915b37d Mon Sep 17 00:00:00 2001
From: "Fu-Chiang, Chang" 
Date: Mon, 11 Sep 2023 17:01:05 +0800
Subject: [PATCH 24/28] Refactored RMA last op checker to a function

---
 dace/frontend/common/distr.py | 54 ++++++++++++-----------------------
 1 file changed, 19 insertions(+), 35 deletions(-)

diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py
index fb40a22d4c..a64647d453 100644
--- a/dace/frontend/common/distr.py
+++ b/dace/frontend/common/distr.py
@@ -915,6 +915,20 @@ def _wait(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, request: str):
     return None
 
 
+def get_last_rma_op(sdfg: SDFG,
+                    cur_op_name: str,
+                    window_name: str):
+    all_rma_ops_name = list(sdfg._rma_ops.keys())
+    cur_window_rma_ops = 
[rma_op for rma_op in all_rma_ops_name + if f"{window_name}_" in rma_op] + if len(cur_window_rma_ops) == 1: + last_rma_op_name = window_name + else: + last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(cur_op_name) - 1] + + return last_rma_op_name + + @oprepo.replaces('mpi4py.MPI.Win.Create') @oprepo.replaces('dace.Win.Create') def _rma_window_create(pv: ProgramVisitor, @@ -986,13 +1000,7 @@ def _rma_fence(pv: ProgramVisitor, fence_node = Win_fence(fence_name, window_name) # check for the last RMA operation - all_rma_ops_name = list(sdfg._rma_ops.keys()) - cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name - if f"{window_name}_" in rma_op] - if len(cur_window_rma_ops) == 1: - last_rma_op_name = window_name - else: - last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(fence_name) - 1] + last_rma_op_name = get_last_rma_op(sdfg, fence_name, window_name) last_rma_op_node = state.add_read(last_rma_op_name) last_rma_op_desc = sdfg.arrays[last_rma_op_name] @@ -1046,13 +1054,7 @@ def _rma_flush(pv: ProgramVisitor, flush_node = Win_flush(flush_name, window_name) # check for the last RMA operation - all_rma_ops_name = list(sdfg._rma_ops.keys()) - cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name - if f"{window_name}_" in rma_op] - if len(cur_window_rma_ops) == 1: - last_rma_op_name = window_name - else: - last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(flush_name) - 1] + last_rma_op_name = get_last_rma_op(sdfg, flush_name, window_name) last_rma_op_node = state.add_read(last_rma_op_name) last_rma_op_desc = sdfg.arrays[last_rma_op_name] @@ -1104,13 +1106,7 @@ def _rma_free(pv: ProgramVisitor, free_node = Win_free(free_name, window_name) # check for the last RMA operation - all_rma_ops_name = list(sdfg._rma_ops.keys()) - cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name - if f"{window_name}_" in rma_op] - if len(cur_window_rma_ops) == 1: - last_rma_op_name = window_name - else: - last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(free_name) - 1] + last_rma_op_name = get_last_rma_op(sdfg, free_name, window_name) last_rma_op_node = state.add_read(last_rma_op_name) last_rma_op_desc = sdfg.arrays[last_rma_op_name] @@ -1165,13 +1161,7 @@ def _rma_lock(pv: ProgramVisitor, _, assertion_node = _get_int_arg_node(pv, sdfg, state, assertion) # check for the last RMA operation - all_rma_ops_name = list(sdfg._rma_ops.keys()) - cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name - if f"{window_name}_" in rma_op] - if len(cur_window_rma_ops) == 1: - last_rma_op_name = window_name - else: - last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(lock_name) - 1] + last_rma_op_name = get_last_rma_op(sdfg, lock_name, window_name) last_rma_op_node = state.add_read(last_rma_op_name) last_rma_op_desc = sdfg.arrays[last_rma_op_name] @@ -1237,13 +1227,7 @@ def _rma_unlock(pv: ProgramVisitor, unlock_node = Win_unlock(unlock_name, window_name) # check for the last RMA operation - all_rma_ops_name = list(sdfg._rma_ops.keys()) - cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name - if f"{window_name}_" in rma_op] - if len(cur_window_rma_ops) == 1: - last_rma_op_name = window_name - else: - last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(unlock_name) - 1] + last_rma_op_name = get_last_rma_op(sdfg, unlock_name, window_name) last_rma_op_node = state.add_read(last_rma_op_name) last_rma_op_desc = sdfg.arrays[last_rma_op_name] From 54fab2e8a0dee31e87e4203638f8dab6360e54e3 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" 
Date: Tue, 12 Sep 2023 23:55:20 +0800
Subject: [PATCH 25/28] Added RMA sync logic checking to last op checker

---
 dace/frontend/common/distr.py | 101 ++++++++++++----------------------
 1 file changed, 35 insertions(+), 66 deletions(-)

diff --git a/dace/frontend/common/distr.py b/dace/frontend/common/distr.py
index a64647d453..26130e0560 100644
--- a/dace/frontend/common/distr.py
+++ b/dace/frontend/common/distr.py
@@ -915,9 +915,20 @@ def _wait(pv: ProgramVisitor, sdfg: SDFG, state: SDFGState, request: str):
     return None
 
 
-def get_last_rma_op(sdfg: SDFG,
-                    cur_op_name: str,
-                    window_name: str):
+def _get_last_rma_op(sdfg: SDFG,
+                     cur_op_name: str,
+                     window_name: str,
+                     is_trans: bool = False):
+    """ Get the name of the last RMA operation on a window from the SDFG.
+        If is_trans is True, also check the synchronization of the RMA calls.
+
+        :param sdfg: The SDFG to search.
+        :param cur_op_name: The current operation on the window.
+        :param window_name: The name of the RMA window to search.
+        :param is_trans: If True, check that a synchronization epoch is open before this operation.
+        :return: Name of the last RMA operation.
+    """
+
     all_rma_ops_name = list(sdfg._rma_ops.keys())
     cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name
                           if f"{window_name}_" in rma_op]
@@ -926,6 +937,19 @@ def get_last_rma_op(sdfg: SDFG,
     else:
         last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(cur_op_name) - 1]
 
+    if is_trans:
+        # an odd number of fences or of lock/unlock calls means
+        # that we are inside an ongoing epoch;
+        # an even number of both means this operation is not enclosed
+        # by any synchronization and might be corrupted
+        cur_window_fences = [rma_op for rma_op in cur_window_rma_ops
+                             if f"{window_name}_fence" in rma_op]
+        cur_window_passive_syncs = [rma_op for rma_op in cur_window_rma_ops
+                                    if "lock" in rma_op]
+        if len(cur_window_fences) % 2 == 0 and len(cur_window_passive_syncs) % 2 == 0:
+            # neither a fence epoch nor a lock epoch is open, so warn the user
+            print("You might have a bad synchronization of RMA calls!")
+
     return last_rma_op_name
 
 
@@ -1000,7 +1024,7 @@ def _rma_fence(pv: ProgramVisitor,
     fence_node = Win_fence(fence_name, window_name)
 
     # check for the last RMA operation
-    last_rma_op_name = get_last_rma_op(sdfg, fence_name, window_name)
+    last_rma_op_name = _get_last_rma_op(sdfg, fence_name, window_name)
 
     last_rma_op_node = state.add_read(last_rma_op_name)
     last_rma_op_desc = sdfg.arrays[last_rma_op_name]
@@ -1054,7 +1078,7 @@ def _rma_flush(pv: ProgramVisitor,
     flush_node = Win_flush(flush_name, window_name)
 
     # check for the last RMA operation
-    last_rma_op_name = get_last_rma_op(sdfg, flush_name, window_name)
+    last_rma_op_name = _get_last_rma_op(sdfg, flush_name, window_name)
 
     last_rma_op_node = state.add_read(last_rma_op_name)
     last_rma_op_desc = sdfg.arrays[last_rma_op_name]
@@ -1106,7 +1130,7 @@ def _rma_free(pv: ProgramVisitor,
     free_node = Win_free(free_name, window_name)
 
     # check for the last RMA operation
-    last_rma_op_name = get_last_rma_op(sdfg, free_name, window_name)
+    last_rma_op_name = _get_last_rma_op(sdfg, free_name, window_name)
 
     last_rma_op_node = state.add_read(last_rma_op_name)
     last_rma_op_desc = sdfg.arrays[last_rma_op_name]
@@ -1161,7 +1185,7 @@ def _rma_lock(pv: ProgramVisitor,
     _, assertion_node = _get_int_arg_node(pv, sdfg, state, assertion)
 
     # check for the last RMA operation
-    last_rma_op_name = get_last_rma_op(sdfg, lock_name, window_name)
+    last_rma_op_name = _get_last_rma_op(sdfg, lock_name, window_name)
 
     last_rma_op_node = state.add_read(last_rma_op_name)
     last_rma_op_desc = sdfg.arrays[last_rma_op_name]
@@ -1227,7 +1251,7 @@ def _rma_unlock(pv: ProgramVisitor,
unlock_node = Win_unlock(unlock_name, window_name) # check for the last RMA operation - last_rma_op_name = get_last_rma_op(sdfg, unlock_name, window_name) + last_rma_op_name = _get_last_rma_op(sdfg, unlock_name, window_name) last_rma_op_node = state.add_read(last_rma_op_name) last_rma_op_desc = sdfg.arrays[last_rma_op_name] @@ -1277,26 +1301,7 @@ def _rma_put(pv: ProgramVisitor, put_name = sdfg.add_rma_ops(window_name, "put") # check for the last RMA operation - all_rma_ops_name = list(sdfg._rma_ops.keys()) - cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name - if f"{window_name}_" in rma_op] - cur_window_fences = [rma_op for rma_op in cur_window_rma_ops - if f"{window_name}_fence" in rma_op] - - last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(put_name) - 1] - - # if only odd number of fences, - # that means we're in a ongoing epoch - if len(cur_window_fences) % 2 == 0: - # if even number of fences, - # that means this operation is either a passive sync. one or a corrupted one - # same logic applies to passive sync. - cur_window_passive_syncs = [rma_op for rma_op in cur_window_rma_ops - if "lock" in rma_op] - # if we don't have even number of syncs, give user a warning - if len(cur_window_passive_syncs) % 2 == 0: - print("You might have a bad synchronization of RMA calls!") - + last_rma_op_name = _get_last_rma_op(sdfg, put_name, window_name, is_trans=True) put_node = Win_put(put_name, window_name) @@ -1355,25 +1360,7 @@ def _rma_get(pv: ProgramVisitor, get_name = sdfg.add_rma_ops(window_name, "get") # check for the last RMA operation - all_rma_ops_name = list(sdfg._rma_ops.keys()) - cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name - if f"{window_name}_" in rma_op] - cur_window_fences = [rma_op for rma_op in cur_window_rma_ops - if f"{window_name}_fence" in rma_op] - - last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(get_name) - 1] - - # if only odd number of fences, - # that means we're in a ongoing epoch - if len(cur_window_fences) % 2 == 0: - # if even number of fences, - # that means this operation is either a passive sync. one or a corrupted one - # same logic applies to passive sync. - cur_window_passive_syncs = [rma_op for rma_op in cur_window_rma_ops - if "lock" in rma_op] - # if we don't have even number of syncs, give user a warning - if len(cur_window_passive_syncs) % 2 == 0: - print("You might have a bad synchronization of RMA calls!") + last_rma_op_name = _get_last_rma_op(sdfg, get_name, window_name, is_trans=True) get_node = Win_get(get_name, window_name) @@ -1437,25 +1424,7 @@ def _rma_accumulate(pv: ProgramVisitor, op = _mpi4py_to_MPI(MPI, op) # check for the last RMA operation - all_rma_ops_name = list(sdfg._rma_ops.keys()) - cur_window_rma_ops = [rma_op for rma_op in all_rma_ops_name - if f"{window_name}_" in rma_op] - cur_window_fences = [rma_op for rma_op in cur_window_rma_ops - if f"{window_name}_fence" in rma_op] - - last_rma_op_name = cur_window_rma_ops[cur_window_rma_ops.index(accumulate_name) - 1] - - # if only odd number of fences, - # that means we're in a ongoing epoch - if len(cur_window_fences) % 2 == 0: - # if even number of fences, - # that means this operation is either a passive sync. one or a corrupted one - # same logic applies to passive sync. 
- cur_window_passive_syncs = [rma_op for rma_op in cur_window_rma_ops - if "lock" in rma_op] - # if we don't have even number of syncs, give user a warning - if len(cur_window_passive_syncs) % 2 == 0: - print("You might have a bad synchronization of RMA calls!") + last_rma_op_name = _get_last_rma_op(sdfg, accumulate_name, window_name, is_trans=True) accumulate_node = Win_accumulate(accumulate_name, window_name, op) From 59f8e72f7a1a0159ee8b9115456d4ca982b1ca4c Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Thu, 14 Sep 2023 22:13:19 +0800 Subject: [PATCH 26/28] changed the data type of matrix from int32 to float32 --- samples/mpi/mat_mul.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/samples/mpi/mat_mul.py b/samples/mpi/mat_mul.py index dd8fcbee2c..d2399720b7 100644 --- a/samples/mpi/mat_mul.py +++ b/samples/mpi/mat_mul.py @@ -15,14 +15,14 @@ def matrix_mul(comm_world, a, b): comm_rank = comm_world.Get_rank() comm_size = comm_world.Get_size() - a_mat = np.array(a + comm_rank, dtype=np.int32) - b_mat = np.array(b + comm_rank, dtype=np.int32) - c_mat = np.zeros((a_mat.shape[0], b_mat.shape[1]), dtype=np.int32) + a_mat = np.array(a + comm_rank, dtype=np.float32) + b_mat = np.array(b + comm_rank, dtype=np.float32) + c_mat = np.zeros((a_mat.shape[0], b_mat.shape[1]), dtype=np.float32) @dace.program - def dist_mat_mult(a_mat: dace.int32[a_mat.shape[0], a_mat.shape[1]], - b_mat: dace.int32[b_mat.shape[0], b_mat.shape[1]], - c_mat: dace.int32[a_mat.shape[0], b_mat.shape[1]], + def dist_mat_mult(a_mat: dace.float32[a_mat.shape[0], a_mat.shape[1]], + b_mat: dace.float32[b_mat.shape[0], b_mat.shape[1]], + c_mat: dace.float32[a_mat.shape[0], b_mat.shape[1]], comm_rank: dace.int32, comm_size: dace.int32): grid_dim = int(np.floor(np.sqrt(comm_size))) @@ -38,8 +38,8 @@ def dist_mat_mult(a_mat: dace.int32[a_mat.shape[0], a_mat.shape[1]], whole_k_dim = grid_dim * a_mat.shape[1] # local buffers for remote fetching - foreign_a_mat = np.zeros(a_mat.shape, dtype=np.int32) - foreign_b_mat = np.zeros(b_mat.shape, dtype=np.int32) + foreign_a_mat = np.zeros(a_mat.shape, dtype=np.float32) + foreign_b_mat = np.zeros(b_mat.shape, dtype=np.float32) # RMA windows a_win = MPI.Win.Create(a_mat, comm=comm_world) @@ -92,22 +92,22 @@ def dist_mat_mult(a_mat: dace.int32[a_mat.shape[0], a_mat.shape[1]], grid_i = comm_rank // grid_dim grid_j = comm_rank % grid_dim - dim_1 = 256 - dim_2 = 256 + dim_1 = 1024 + dim_2 = 1024 - a = np.ones((dim_1, dim_2), dtype=np.int32) - b = np.ones((dim_2, dim_1), dtype=np.int32) + a = np.ones((dim_1, dim_2), dtype=np.float32) + b = np.ones((dim_2, dim_1), dtype=np.float32) c_mat, time_con = matrix_mul(comm_world, a, b) # print(comm_rank, c_mat) # print(comm_rank, "matrix_mul time:", time_con) - whole_a = np.ones((dim_1 * grid_dim, dim_2 * grid_dim), dtype=np.int32) + whole_a = np.ones((dim_1 * grid_dim, dim_2 * grid_dim), dtype=np.float32) for i in range(grid_dim): for j in range(grid_dim): whole_a[i * dim_1:(i+1) * dim_1, j * dim_2:(j+1) * dim_2] += i * grid_dim + j - whole_b = np.ones((dim_2 * grid_dim, dim_1 * grid_dim), dtype=np.int32) + whole_b = np.ones((dim_2 * grid_dim, dim_1 * grid_dim), dtype=np.float32) for i in range(grid_dim): for j in range(grid_dim): whole_b[i * dim_2:(i+1) * dim_2, j * dim_1:(j+1) * dim_1] += i * grid_dim + j From 1241422fb4f227ed9a4e32bef7a106ab5fb84123 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 15 Sep 2023 00:17:50 +0800 Subject: [PATCH 27/28] Implemented strong scaling benchmark 
for mat_mul.py --- samples/mpi/mat_mul.py | 76 ++++++++++++++++++++++++++++++++---------- 1 file changed, 59 insertions(+), 17 deletions(-) diff --git a/samples/mpi/mat_mul.py b/samples/mpi/mat_mul.py index d2399720b7..94a67139df 100644 --- a/samples/mpi/mat_mul.py +++ b/samples/mpi/mat_mul.py @@ -6,18 +6,14 @@ import time -def matrix_mul(comm_world, a, b): - # check if matrix multiplication is valid - if a.shape[1] != b.shape[0]: - raise ValueError("A, B matrix dimension mismatched!") - +def matrix_mul(comm_world, dim_1, dim_2): # comm init comm_rank = comm_world.Get_rank() comm_size = comm_world.Get_size() - a_mat = np.array(a + comm_rank, dtype=np.float32) - b_mat = np.array(b + comm_rank, dtype=np.float32) - c_mat = np.zeros((a_mat.shape[0], b_mat.shape[1]), dtype=np.float32) + a_mat = np.full((dim_1, dim_2), 1 + comm_rank, dtype=np.float32) + b_mat = np.full((dim_2, dim_1), 1 + comm_rank, dtype=np.float32) + c_mat = np.zeros((dim_1, dim_1), dtype=np.float32) @dace.program def dist_mat_mult(a_mat: dace.float32[a_mat.shape[0], a_mat.shape[1]], @@ -83,11 +79,7 @@ def dist_mat_mult(a_mat: dace.float32[a_mat.shape[0], a_mat.shape[1]], return c_mat, time_con -if __name__ == "__main__": - comm_world = MPI.COMM_WORLD - comm_rank = comm_world.Get_rank() - comm_size = comm_world.Get_size() - +def weak_scaling(comm_world, comm_rank, comm_size): grid_dim = int(np.floor(np.sqrt(comm_size))) grid_i = comm_rank // grid_dim grid_j = comm_rank % grid_dim @@ -95,10 +87,7 @@ def dist_mat_mult(a_mat: dace.float32[a_mat.shape[0], a_mat.shape[1]], dim_1 = 1024 dim_2 = 1024 - a = np.ones((dim_1, dim_2), dtype=np.float32) - b = np.ones((dim_2, dim_1), dtype=np.float32) - - c_mat, time_con = matrix_mul(comm_world, a, b) + c_mat, time_con = matrix_mul(comm_world, dim_1, dim_2) # print(comm_rank, c_mat) # print(comm_rank, "matrix_mul time:", time_con) @@ -121,3 +110,56 @@ def dist_mat_mult(a_mat: dace.float32[a_mat.shape[0], a_mat.shape[1]], # print("Result correctness:", np.allclose(c_mat, c_np[grid_i * dim_1:(grid_i+1) * dim_1, grid_j* dim_2:(grid_j+1) * dim_2])) assert(np.allclose(c_mat, c_np[grid_i * dim_1:(grid_i+1) * dim_1, grid_j* dim_2:(grid_j+1) * dim_2])) + + +def strong_scaling(comm_world, comm_rank, comm_size): + grid_dim = int(np.floor(np.sqrt(comm_size))) + grid_i = comm_rank // grid_dim + grid_j = comm_rank % grid_dim + + total_dim = 8192 + dim_1 = total_dim + dim_2 = total_dim + if total_dim % comm_size > 0: + dim_1 += comm_size - total_dim % comm_size + dim_2 += comm_size - total_dim % comm_size + + local_dim_1 = dim_1 // grid_dim + local_dim_2 = dim_2 // grid_dim + + a = np.ones((local_dim_1, local_dim_2), dtype=np.float32) + b = np.ones((local_dim_2, local_dim_1), dtype=np.float32) + + c_mat, time_con = matrix_mul(comm_world, local_dim_1, local_dim_2) + # print(comm_rank, c_mat) + # print(comm_rank, "matrix_mul time:", time_con) + + # validation, since it will compute the whole matrix in the edge + # whole_a = np.ones((local_dim_1 * grid_dim, local_dim_2 * grid_dim), dtype=np.float32) + # for i in range(grid_dim): + # for j in range(grid_dim): + # whole_a[i * local_dim_1:(i+1) * local_dim_1, j * local_dim_2:(j+1) * local_dim_2] += i * grid_dim + j + + # whole_b = np.ones((local_dim_2 * grid_dim, local_dim_1 * grid_dim), dtype=np.float32) + # for i in range(grid_dim): + # for j in range(grid_dim): + # whole_b[i * local_dim_2:(i+1) * local_dim_2, j * local_dim_1:(j+1) * local_dim_1] += i * grid_dim + j + + # start = time.time() + # c_np = np.matmul(whole_a, whole_b) + # time_con = time.time() - 
start + # # print("Result correctness:", np.allclose(c_mat, c_np[grid_i * local_dim_1:(grid_i+1) * local_dim_1, grid_j* local_dim_2:(grid_j+1) * local_dim_2])) + # assert(np.allclose(c_mat, c_np[grid_i * local_dim_1:(grid_i+1) * local_dim_1, grid_j* local_dim_2:(grid_j+1) * local_dim_2])) + +if __name__ == "__main__": + comm_world = MPI.COMM_WORLD + comm_rank = comm_world.Get_rank() + comm_size = comm_world.Get_size() + + grid_dim = int(np.floor(np.sqrt(comm_size))) + + if comm_size != grid_dim ** 2: + raise ValueError("Please run this test with a square number of processes.") + + # weak_scaling(comm_world, comm_rank, comm_size) + strong_scaling(comm_world, comm_rank, comm_size) From e7baaf73b47166d70e26dbe2e229bdcd078a56c4 Mon Sep 17 00:00:00 2001 From: "Fu-Chiang, Chang" Date: Fri, 15 Sep 2023 23:27:18 +0800 Subject: [PATCH 28/28] Fixed the window size configuration in window creation --- dace/libraries/mpi/nodes/win_create.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dace/libraries/mpi/nodes/win_create.py b/dace/libraries/mpi/nodes/win_create.py index e3f7ba10d0..7abfc02b96 100644 --- a/dace/libraries/mpi/nodes/win_create.py +++ b/dace/libraries/mpi/nodes/win_create.py @@ -29,7 +29,7 @@ def expansion(node, parent_state, parent_sdfg, **kwargs): code = f""" MPI_Win_create(_win_buffer, - {win_buf_count_str}, + {win_buf_count_str} * sizeof({win_buffer_dtype}), sizeof({win_buffer_dtype}), MPI_INFO_NULL, {comm},