Improve Topology construction and fix MPI/NBX bug #3512

Merged
36 commits merged on Nov 15, 2024
Commits (36)
39b0b88
Work on Topology constructor
garth-wells Nov 11, 2024
e375d9c
Fix compilation
garth-wells Nov 12, 2024
95783c2
Merge remote-tracking branch 'origin/main' into garth/topology-build
garth-wells Nov 12, 2024
55c71e9
Update
garth-wells Nov 12, 2024
17151b6
Fixes
garth-wells Nov 12, 2024
bb6a5ad
Sub-topology fix
garth-wells Nov 12, 2024
a374b80
Wrapper update
garth-wells Nov 12, 2024
8a66eaf
Doc fixes
garth-wells Nov 13, 2024
02be4ee
Continue on error
garth-wells Nov 13, 2024
71371fd
Updates
garth-wells Nov 13, 2024
a527826
Doc fix
garth-wells Nov 13, 2024
c8d9dff
Debug
garth-wells Nov 13, 2024
54a3837
Debug
garth-wells Nov 13, 2024
8d99476
Doc fix
garth-wells Nov 13, 2024
a5a426e
Update demo
garth-wells Nov 13, 2024
1cb7f31
Add barrier
garth-wells Nov 13, 2024
a6ebbb1
Use PCX
garth-wells Nov 13, 2024
3ded608
Increase problem size
garth-wells Nov 14, 2024
a61b61a
Parameterise demo
garth-wells Nov 14, 2024
7a758a1
Small edits
garth-wells Nov 14, 2024
4007fa4
Merge branch 'garth/topology-build' of https://github.com/FEniCS/dolf…
garth-wells Nov 14, 2024
315cb6b
Pass tag to nbx function
garth-wells Nov 14, 2024
cba28e7
Test update
garth-wells Nov 14, 2024
92e5084
Tidy up
garth-wells Nov 15, 2024
2b0e3aa
Test update
garth-wells Nov 15, 2024
f5bc396
Simplify
garth-wells Nov 15, 2024
7dacc4b
Simplification
garth-wells Nov 15, 2024
d568833
Tidy
garth-wells Nov 15, 2024
da11d8e
Simplifications
garth-wells Nov 15, 2024
69c1e6f
Undo changes
garth-wells Nov 15, 2024
a7914df
Simplify
garth-wells Nov 15, 2024
461dbb4
Merge branch 'main' into garth/topology-build
garth-wells Nov 15, 2024
119df45
Tidy
garth-wells Nov 15, 2024
53aee01
Wrapper simplification
garth-wells Nov 15, 2024
26de80f
Update doc
garth-wells Nov 15, 2024
37723f8
Remove stray line
garth-wells Nov 15, 2024
23 changes: 13 additions & 10 deletions cpp/dolfinx/common/IndexMap.cpp
@@ -26,8 +26,8 @@ namespace
/// @param comm MPI communicator.
/// @param owners List of ranks that own each ghost index.
/// @return (src ranks, destination ranks). Both lists are sorted.
std::array<std::vector<int>, 2> build_src_dest(MPI_Comm comm,
std::span<const int> owners)
std::array<std::vector<int>, 2>
build_src_dest(MPI_Comm comm, std::span<const int> owners, int tag)
{
if (dolfinx::MPI::size(comm) == 1)
{
@@ -40,8 +40,9 @@ std::array<std::vector<int>, 2> build_src_dest(MPI_Comm comm,
auto [unique_end, range_end] = std::ranges::unique(src);
src.erase(unique_end, range_end);
src.shrink_to_fit();
std::vector<int> dest = dolfinx::MPI::compute_graph_edges_nbx(comm, src);
std::vector<int> dest = dolfinx::MPI::compute_graph_edges_nbx(comm, src, tag);
std::ranges::sort(dest);

return {std::move(src), std::move(dest)};
}

@@ -57,9 +58,9 @@ std::array<std::vector<int>, 2> build_src_dest(MPI_Comm comm,
/// @param[in] dest Destination ranks on `comm`.
/// @param[in] ghosts Ghost indices on calling process.
/// @param[in] owners Owning rank for each entry in `ghosts`.
/// @param[in] include_ghost A list of the same length as `ghosts`, whose
/// ith entry must be non-zero (true) to include `ghost[i]`, otherwise
/// the ghost will be excluded
/// @param[in] include_ghost A list of the same length as `ghosts`,
/// whose ith entry must be non-zero (true) to include `ghost[i]`,
/// otherwise the ghost will be excluded
/// @return 1) The ghost indices packed in a buffer for communication
/// 2) The received indices (in receive buffer layout)
/// 3) A map relating the position of a ghost in the packed
@@ -879,8 +880,9 @@ IndexMap::IndexMap(MPI_Comm comm, std::int32_t local_size) : _comm(comm, true)
//-----------------------------------------------------------------------------
IndexMap::IndexMap(MPI_Comm comm, std::int32_t local_size,
std::span<const std::int64_t> ghosts,
std::span<const int> owners)
: IndexMap(comm, local_size, build_src_dest(comm, owners), ghosts, owners)
std::span<const int> owners, int tag)
: IndexMap(comm, local_size, build_src_dest(comm, owners, tag), ghosts,
owners)
{
// Do nothing
}
Expand Down Expand Up @@ -1002,7 +1004,7 @@ std::vector<std::int64_t> IndexMap::global_indices() const
//-----------------------------------------------------------------------------
MPI_Comm IndexMap::comm() const { return _comm.comm(); }
//----------------------------------------------------------------------------
graph::AdjacencyList<int> IndexMap::index_to_dest_ranks() const
graph::AdjacencyList<int> IndexMap::index_to_dest_ranks(int tag) const
{
const std::int64_t offset = _local_range[0];

@@ -1011,7 +1013,8 @@ graph::AdjacencyList<int> IndexMap::index_to_dest_ranks() const
std::ranges::sort(src);
auto [unique_end, range_end] = std::ranges::unique(src);
src.erase(unique_end, range_end);
auto dest = dolfinx::MPI::compute_graph_edges_nbx(_comm.comm(), src);
std::vector<int> dest
= dolfinx::MPI::compute_graph_edges_nbx(_comm.comm(), src, tag);
std::ranges::sort(dest);

// Array (local idx, ghosting rank) pairs for owned indices
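For reference, the src/dest pattern in build_src_dest above can be restated as a standalone sketch (illustrative only, not the dolfinx code verbatim; it assumes the headers changed in this PR are on the include path). The sorted, de-duplicated ghost-owner ranks are the ranks this process receives from (src); the NBX consensus then yields the ranks that ghost indices owned here (dest), with the caller-supplied tag threaded through to the consensus round.

// Sketch: derive (src, dest) neighbour ranks from the owners of the ghost
// indices, following the build_src_dest pattern shown in the diff above.
#include <algorithm>
#include <array>
#include <span>
#include <utility>
#include <vector>
#include <mpi.h>
#include <dolfinx/common/MPI.h>

std::array<std::vector<int>, 2> src_dest_sketch(MPI_Comm comm,
                                                std::span<const int> owners,
                                                int tag)
{
  // Ranks this process receives ghost data from: unique, sorted owner ranks
  std::vector<int> src(owners.begin(), owners.end());
  std::ranges::sort(src);
  auto [unique_end, range_end] = std::ranges::unique(src);
  src.erase(unique_end, range_end);

  // Ranks that ghost indices owned by this process: discovered via the NBX
  // consensus, using the caller-supplied tag
  std::vector<int> dest = dolfinx::MPI::compute_graph_edges_nbx(comm, src, tag);
  std::ranges::sort(dest);
  return {std::move(src), std::move(dest)};
}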
27 changes: 24 additions & 3 deletions cpp/dolfinx/common/IndexMap.h
@@ -119,8 +119,21 @@ class IndexMap
/// of owned entries
/// @param[in] ghosts The global indices of ghost entries
/// @param[in] owners Owner rank (on `comm`) of each entry in `ghosts`
/// @param[in] tag Tag used in non-blocking MPI calls in the consensus
/// algorithm.
/// @note A tag can sometimes be required when there are a series of
/// calls to this constructor, or other functions that call the
/// consensus algorithm, that are close together. In cases where this
/// constructor is called a second time on a rank and another rank has
/// not completed its first consensus algorithm call, communications
/// can be corrupted if each collective call of this constructor does
/// not have its own `tag` value. Each collective call to this
/// constructor must use the same `tag` value. An alternative to
/// passing a tag is to have an implicit or explicit MPI barrier
/// before and after the call to this constructor.
IndexMap(MPI_Comm comm, std::int32_t local_size,
std::span<const std::int64_t> ghosts, std::span<const int> owners);
std::span<const std::int64_t> ghosts, std::span<const int> owners,
int tag = static_cast<int>(dolfinx::MPI::tag::consensus_nbx));

/// @brief Create an overlapping (ghosted) index map.
///
@@ -208,8 +221,16 @@ class IndexMap
///
/// @brief Compute map from each local (owned) index to the set of
/// ranks that have the index as a ghost.
/// @return shared indices
graph::AdjacencyList<int> index_to_dest_ranks() const;
///
/// @note Collective
///
/// @param[in] tag Tag to pass to MPI calls.
/// @note See ::IndexMap(MPI_Comm,std::int32_t,std::span<const
/// std::int64_t>,std::span<const int>,int) for an explanation of when
/// `tag` is required.
/// @return Shared indices.
graph::AdjacencyList<int> index_to_dest_ranks(
int tag = static_cast<int>(dolfinx::MPI::tag::consensus_nbx)) const;

/// @brief Build a list of owned indices that are ghosted by another
/// rank.
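A minimal usage sketch of the new tag parameter (the function name build_two_maps, the ghost/owner spans, and the tag values 1210/1211 are all hypothetical): when two ghosted IndexMaps are built back-to-back with no intervening barrier, giving each collective construction its own tag keeps the two consensus rounds from matching each other's messages.

// Sketch: two back-to-back ghosted IndexMap constructions. The distinct tags
// (arbitrary values, but identical on every rank for a given call) keep the
// two NBX consensus rounds separate even if some ranks start the second
// construction before others have finished the first.
#include <cstdint>
#include <span>
#include <mpi.h>
#include <dolfinx/common/IndexMap.h>

void build_two_maps(MPI_Comm comm, std::int32_t local_size,
                    std::span<const std::int64_t> ghosts0,
                    std::span<const int> owners0,
                    std::span<const std::int64_t> ghosts1,
                    std::span<const int> owners1)
{
  // First map: its consensus round uses tag 1210 on every rank
  dolfinx::common::IndexMap map0(comm, local_size, ghosts0, owners0, 1210);

  // Second map: a different tag, so its messages cannot be mistaken for
  // stragglers from the first round
  dolfinx::common::IndexMap map1(comm, local_size, ghosts1, owners1, 1211);
}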
14 changes: 6 additions & 8 deletions cpp/dolfinx/common/MPI.cpp
@@ -159,7 +159,8 @@ dolfinx::MPI::compute_graph_edges_pcx(MPI_Comm comm, std::span<const int> edges)
}
//-----------------------------------------------------------------------------
std::vector<int>
dolfinx::MPI::compute_graph_edges_nbx(MPI_Comm comm, std::span<const int> edges)
dolfinx::MPI::compute_graph_edges_nbx(MPI_Comm comm, std::span<const int> edges,
int tag)
{
spdlog::info(
"Computing communication graph edges (using NBX algorithm). Number "
@@ -171,9 +172,8 @@ dolfinx::MPI::compute_graph_edges_nbx(MPI_Comm comm, std::span<const int> edges)
std::vector<std::byte> send_buffer(edges.size());
for (std::size_t e = 0; e < edges.size(); ++e)
{
int err = MPI_Issend(send_buffer.data() + e, 1, MPI_BYTE, edges[e],
static_cast<int>(tag::consensus_pex), comm,
&send_requests[e]);
int err = MPI_Issend(send_buffer.data() + e, 1, MPI_BYTE, edges[e], tag,
comm, &send_requests[e]);
dolfinx::MPI::check_error(comm, err);
}

@@ -189,8 +189,7 @@ dolfinx::MPI::compute_graph_edges_nbx(MPI_Comm comm, std::span<const int> edges)
// Check for message
int request_pending;
MPI_Status status;
int err = MPI_Iprobe(MPI_ANY_SOURCE, static_cast<int>(tag::consensus_pex),
comm, &request_pending, &status);
int err = MPI_Iprobe(MPI_ANY_SOURCE, tag, comm, &request_pending, &status);
dolfinx::MPI::check_error(comm, err);

// Check if message is waiting to be processed
@@ -199,8 +198,7 @@ dolfinx::MPI::compute_graph_edges_nbx(MPI_Comm comm, std::span<const int> edges)
// Receive it
int other_rank = status.MPI_SOURCE;
std::byte buffer_recv;
int err = MPI_Recv(&buffer_recv, 1, MPI_BYTE, other_rank,
static_cast<int>(tag::consensus_pex), comm,
int err = MPI_Recv(&buffer_recv, 1, MPI_BYTE, other_rank, tag, comm,
MPI_STATUS_IGNORE);
dolfinx::MPI::check_error(comm, err);
other_ranks.push_back(other_rank);
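The loop above follows the standard NBX pattern. A condensed, self-contained restatement (not the dolfinx implementation verbatim; logging and error checking are omitted) shows how the caller-supplied tag threads through every Issend, Iprobe and Recv, so a concurrent consensus round using a different tag cannot interfere:

// Condensed NBX sketch: discover which ranks send to this rank, given the
// ranks this rank sends to. All point-to-point calls use the caller's tag.
#include <cstddef>
#include <span>
#include <vector>
#include <mpi.h>

std::vector<int> nbx_sketch(MPI_Comm comm, std::span<const int> out_edges,
                            int tag)
{
  // Synchronous-mode sends: completion implies the receiver has matched them
  std::vector<MPI_Request> send_requests(out_edges.size());
  std::vector<std::byte> send_buffer(out_edges.size());
  for (std::size_t e = 0; e < out_edges.size(); ++e)
  {
    MPI_Issend(send_buffer.data() + e, 1, MPI_BYTE, out_edges[e], tag, comm,
               &send_requests[e]);
  }

  std::vector<int> in_edges;
  MPI_Request barrier_request = MPI_REQUEST_NULL;
  bool barrier_started = false;
  int done = 0;
  while (!done)
  {
    // Receive any message carrying this round's tag
    int pending = 0;
    MPI_Status status;
    MPI_Iprobe(MPI_ANY_SOURCE, tag, comm, &pending, &status);
    if (pending)
    {
      std::byte recv_buffer;
      MPI_Recv(&recv_buffer, 1, MPI_BYTE, status.MPI_SOURCE, tag, comm,
               MPI_STATUS_IGNORE);
      in_edges.push_back(status.MPI_SOURCE);
    }

    if (barrier_started)
    {
      // Every rank enters the barrier only after its own sends have been
      // matched, so barrier completion means no more messages are in flight
      MPI_Test(&barrier_request, &done, MPI_STATUS_IGNORE);
    }
    else
    {
      int sends_matched = 0;
      MPI_Testall(static_cast<int>(send_requests.size()), send_requests.data(),
                  &sends_matched, MPI_STATUSES_IGNORE);
      if (sends_matched)
      {
        MPI_Ibarrier(comm, &barrier_request);
        barrier_started = true;
      }
    }
  }
  return in_edges;
}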
40 changes: 21 additions & 19 deletions cpp/dolfinx/common/MPI.h
@@ -33,8 +33,9 @@ namespace dolfinx::MPI
/// MPI communication tags
enum class tag : int
{
consensus_pcx,
consensus_pex
consensus_pcx = 1200,
consensus_pex = 1201,
consensus_nbx = 1202,
};

/// @brief A duplicate MPI communicator and manage lifetime of the
@@ -72,22 +73,21 @@ class Comm
int rank(MPI_Comm comm);

/// Return size of the group (number of processes) associated with the
/// communicator
/// communicator.
int size(MPI_Comm comm);

/// @brief Check MPI error code. If the error code is not equal to
/// MPI_SUCCESS, then std::abort is called.
/// @param[in] comm MPI communicator
/// @param[in] code Error code returned by an MPI function call
/// @param[in] comm MPI communicator.
/// @param[in] code Error code returned by an MPI function call.
void check_error(MPI_Comm comm, int code);

/// @brief Return local range for the calling process, partitioning the
/// global [0, N - 1] range across all ranks into partitions of almost
/// equal size.
/// @param[in] rank MPI rank of the caller
/// @param[in] N The value to partition
/// @param[in] size The number of MPI ranks across which to partition
/// `N`
/// @param[in] N The value to partition.
/// @param[in] size Number of MPI ranks across which to partition `N`.
constexpr std::array<std::int64_t, 2> local_range(int rank, std::int64_t N,
int size)
{
@@ -108,10 +108,10 @@ constexpr std::array<std::int64_t, 2> local_range(int rank, std::int64_t N,

/// @brief Return which rank owns index in global range [0, N - 1]
/// (inverse of MPI::local_range).
/// @param[in] size Number of MPI ranks
/// @param[in] index The index to determine owning rank
/// @param[in] N Total number of indices
/// @return The rank of the owning process
/// @param[in] size Number of MPI ranks.
/// @param[in] index The index to determine the owning rank of.
/// @param[in] N Total number of indices.
/// @return Rank of the owning process.
constexpr int index_owner(int size, std::size_t index, std::size_t N)
{
assert(index < N);
@@ -171,19 +171,19 @@ std::vector<int> compute_graph_edges_pcx(MPI_Comm comm,
/// implements the NBX algorithm presented in
/// https://dx.doi.org/10.1145/1837853.1693476.
///
/// @note For sparse graphs, this function has \f$O(\log p)\f$ cost,
/// where \f$p\f$ is the number of MPI ranks. It is suitable for modest
/// MPI rank counts.
///
/// @note The order of the returned ranks is not deterministic.
///
/// @note Collective.
///
/// @param[in] comm MPI communicator
/// @param[in] edges Edges (ranks) from this rank (the caller).
/// @return Ranks that have defined edges from them to this rank.
std::vector<int> compute_graph_edges_nbx(MPI_Comm comm,
std::span<const int> edges);
/// @param[in] tag Tag used in non-blocking MPI calls. A tag can be
/// required when this function is called a second time on some ranks
/// before a previous call has completed on all other ranks.
/// @return Ranks that have defined edges from them to this rank.
/// @note An alternative to passing a tag is to ensure that there is an
/// implicit or explicit barrier before and after the call to this
/// function.
std::vector<int>
compute_graph_edges_nbx(MPI_Comm comm, std::span<const int> edges,
int tag = static_cast<int>(tag::consensus_nbx));

/// @brief Distribute row data to 'post office' ranks.
///
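A small usage sketch of compute_graph_edges_nbx with an explicit tag (the wrapper name discover_in_edges and the tag value 1300 are hypothetical; any value agreed on by all ranks for this particular call works): each rank passes the ranks it sends to and receives back the ranks that send to it.

// Sketch: each rank knows its out-edges (ranks it sends to) and uses the NBX
// consensus to discover its in-edges (ranks that send to it).
#include <span>
#include <vector>
#include <mpi.h>
#include <dolfinx/common/MPI.h>

std::vector<int> discover_in_edges(MPI_Comm comm,
                                   std::span<const int> out_edges)
{
  // Explicit tag (arbitrary value, identical on all ranks for this call);
  // omit the argument to fall back to the default tag::consensus_nbx
  std::vector<int> in_edges
      = dolfinx::MPI::compute_graph_edges_nbx(comm, out_edges, 1300);
  return in_edges; // order of the returned ranks is not deterministic
}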