From 3abba41a493cc1e6778b69f1cb25e1a71407d34d Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Tue, 6 Aug 2024 12:45:27 +0100 Subject: [PATCH 1/3] [MLIR][OpenMP] Remove omp.parallel from loop wrapper operations This patch removes the `LoopWrapperInterface` from `omp.parallel` and updates the semantics of the interface to make loop wrapper restrictions mandatory to operations that have it, rather than a role they might optionally take. MLIR operation verifiers are updated to expect the "hoisted omp.parallel" representation for `distribute parallel do`, to be later implemented in place of a loop wrapper `omp.parallel`. --- .../lib/Lower/OpenMP/DataSharingProcessor.cpp | 4 +- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 1 - .../Dialect/OpenMP/OpenMPOpsInterfaces.td | 26 +++--- mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 56 +++++-------- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 3 +- mlir/test/Dialect/OpenMP/invalid.mlir | 79 +++++-------------- mlir/test/Dialect/OpenMP/ops.mlir | 9 ++- 7 files changed, 62 insertions(+), 116 deletions(-) diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index ddb5801f3cbc0e..0ae78dc5da07af 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -235,9 +235,7 @@ void DataSharingProcessor::insertBarrier() { void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) { mlir::omp::LoopNestOp loopOp; if (auto wrapper = mlir::dyn_cast(op)) - loopOp = wrapper.isWrapper() - ? mlir::cast(wrapper.getWrappedLoop()) - : nullptr; + loopOp = mlir::cast(wrapper.getWrappedLoop()); bool cmpCreated = false; mlir::OpBuilder::InsertionGuard guard(firOpBuilder); diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index d05543f8fbd929..9fbb122170e798 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -128,7 +128,6 @@ def PrivateClauseOp : OpenMP_Op<"private", [IsolatedFromAbove, RecipeInterface]> def ParallelOp : OpenMP_Op<"parallel", traits = [ AttrSizedOperandSegments, AutomaticAllocationScope, - DeclareOpInterfaceMethods, DeclareOpInterfaceMethods, RecursiveMemoryEffects ], clauses = [ diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td index 2d1de37239c82a..ace671e04d7876 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td @@ -71,10 +71,10 @@ def ReductionClauseInterface : OpInterface<"ReductionClauseInterface"> { def LoopWrapperInterface : OpInterface<"LoopWrapperInterface"> { let description = [{ - OpenMP operations that can wrap a single loop nest. When taking a wrapper - role, these operations must only contain a single region with a single block - in which there's a single operation and a terminator. That nested operation - must be another loop wrapper or an `omp.loop_nest`. + OpenMP operations that wrap a single loop nest. They must only contain a + single region with a single block in which there's a single operation and a + terminator. That nested operation must be another loop wrapper or an + `omp.loop_nest`. }]; let cppNamespace = "::mlir::omp"; @@ -82,13 +82,13 @@ def LoopWrapperInterface : OpInterface<"LoopWrapperInterface"> { let methods = [ InterfaceMethod< /*description=*/[{ - Tell whether the operation could be taking the role of a loop wrapper. 
- That is, it has a single region with a single block in which there are - two operations: another wrapper (also taking a loop wrapper role) or + Check whether the operation is a valid loop wrapper. That is, it has a + single region with a single block in which there are two operations: + another loop wrapper (also taking a loop wrapper role) or `omp.loop_nest` operation and a terminator. }], /*retTy=*/"bool", - /*methodName=*/"isWrapper", + /*methodName=*/"isValidWrapper", (ins ), [{}], [{ if ($_op->getNumRegions() != 1) return false; @@ -107,7 +107,7 @@ def LoopWrapperInterface : OpInterface<"LoopWrapperInterface"> { return false; if (auto wrapper = ::llvm::dyn_cast(firstOp)) - return wrapper.isWrapper(); + return wrapper.isValidWrapper(); return ::llvm::isa(firstOp); }] @@ -115,12 +115,12 @@ def LoopWrapperInterface : OpInterface<"LoopWrapperInterface"> { InterfaceMethod< /*description=*/[{ If there is another loop wrapper immediately nested inside, return that - operation. Assumes this operation is taking a loop wrapper role. + operation. Assumes this operation is a valid loop wrapper. }], /*retTy=*/"::mlir::omp::LoopWrapperInterface", /*methodName=*/"getNestedWrapper", (ins), [{}], [{ - assert($_op.isWrapper() && "Unexpected non-wrapper op"); + assert($_op.isValidWrapper() && "Unexpected non-wrapper op"); Operation *nested = &*$_op->getRegion(0).op_begin(); return ::llvm::dyn_cast(nested); }] @@ -128,12 +128,12 @@ def LoopWrapperInterface : OpInterface<"LoopWrapperInterface"> { InterfaceMethod< /*description=*/[{ Return the loop nest nested directly or indirectly inside of this loop - wrapper. Assumes this operation is taking a loop wrapper role. + wrapper. Assumes this operation is a valid loop wrapper. }], /*retTy=*/"::mlir::Operation *", /*methodName=*/"getWrappedLoop", (ins), [{}], [{ - assert($_op.isWrapper() && "Unexpected non-wrapper op"); + assert($_op.isValidWrapper() && "Unexpected non-wrapper op"); if (LoopWrapperInterface nested = $_op.getNestedWrapper()) return nested.getWrappedLoop(); return &*$_op->getRegion(0).op_begin(); diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 5f6007561967e4..82efe62525ce12 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1604,15 +1604,15 @@ bool TargetOp::isTargetSPMDLoop() { if (!isa_and_present(workshareOp)) return false; - Operation *parallelOp = workshareOp->getParentOp(); - if (!isa_and_present(parallelOp)) + Operation *distributeOp = workshareOp->getParentOp(); + if (!isa_and_present(distributeOp)) return false; - Operation *distributeOp = parallelOp->getParentOp(); - if (!isa_and_present(distributeOp)) + Operation *parallelOp = distributeOp->getParentOp(); + if (!isa_and_present(parallelOp)) return false; - Operation *teamsOp = distributeOp->getParentOp(); + Operation *teamsOp = parallelOp->getParentOp(); if (!isa_and_present(teamsOp)) return false; @@ -1690,22 +1690,6 @@ static LogicalResult verifyPrivateVarList(OpType &op) { } LogicalResult ParallelOp::verify() { - // Check that it is a valid loop wrapper if it's taking that role. - if (isa((*this)->getParentOp())) { - if (!isWrapper()) - return emitOpError() << "must take a loop wrapper role if nested inside " - "of 'omp.distribute'"; - - if (LoopWrapperInterface nested = getNestedWrapper()) { - // Check for the allowed leaf constructs that may appear in a composite - // construct directly after PARALLEL. 
- if (!isa(nested)) - return emitError() << "only supported nested wrapper is 'omp.wsloop'"; - } else { - return emitOpError() << "must not wrap an 'omp.loop_nest' directly"; - } - } - if (getAllocateVars().size() != getAllocatorVars().size()) return emitError( "expected equal sizes for allocate and allocator variables"); @@ -1894,8 +1878,8 @@ void WsloopOp::build(OpBuilder &builder, OperationState &state, } LogicalResult WsloopOp::verify() { - if (!isWrapper()) - return emitOpError() << "must be a loop wrapper"; + if (!isValidWrapper()) + return emitOpError() << "must be a valid loop wrapper"; if (LoopWrapperInterface nested = getNestedWrapper()) { // Check for the allowed leaf constructs that may appear in a composite @@ -1939,8 +1923,8 @@ LogicalResult SimdOp::verify() { if (verifyNontemporalClause(*this, getNontemporalVars()).failed()) return failure(); - if (!isWrapper()) - return emitOpError() << "must be a loop wrapper"; + if (!isValidWrapper()) + return emitOpError() << "must be a valid loop wrapper"; if (getNestedWrapper()) return emitOpError() << "must wrap an 'omp.loop_nest' directly"; @@ -1970,15 +1954,19 @@ LogicalResult DistributeOp::verify() { return emitError( "expected equal sizes for allocate and allocator variables"); - if (!isWrapper()) - return emitOpError() << "must be a loop wrapper"; + if (!isValidWrapper()) + return emitOpError() << "must be a valid loop wrapper"; if (LoopWrapperInterface nested = getNestedWrapper()) { // Check for the allowed leaf constructs that may appear in a composite // construct directly after DISTRIBUTE. - if (!isa(nested)) - return emitError() << "only supported nested wrappers are 'omp.parallel' " - "and 'omp.simd'"; + if (isa(nested)) { + if (!llvm::dyn_cast_if_present((*this)->getParentOp())) + return emitError() << "an 'omp.wsloop' nested wrapper is only allowed " + "when 'omp.parallel' is the direct parent"; + } else if (!isa(nested)) + return emitError() << "only supported nested wrappers are 'omp.simd' and " + "'omp.wsloop'"; } return success(); @@ -2176,8 +2164,8 @@ LogicalResult TaskloopOp::verify() { "may not appear on the same taskloop directive"); } - if (!isWrapper()) - return emitOpError() << "must be a loop wrapper"; + if (!isValidWrapper()) + return emitOpError() << "must be a valid loop wrapper"; if (LoopWrapperInterface nested = getNestedWrapper()) { // Check for the allowed leaf constructs that may appear in a composite @@ -2269,7 +2257,7 @@ LogicalResult LoopNestOp::verify() { auto wrapper = llvm::dyn_cast_if_present((*this)->getParentOp()); - if (!wrapper || !wrapper.isWrapper()) + if (!wrapper || !wrapper.isValidWrapper()) return emitOpError() << "expects parent op to be a valid loop wrapper"; return success(); @@ -2280,8 +2268,6 @@ void LoopNestOp::gatherWrappers( Operation *parent = (*this)->getParentOp(); while (auto wrapper = llvm::dyn_cast_if_present(parent)) { - if (!wrapper.isWrapper()) - break; wrappers.push_back(wrapper); parent = parent->getParentOp(); } diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 8c72411b7b2e17..a5b289bee84b19 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -3190,8 +3190,7 @@ static LogicalResult convertOmpDistribute( llvm::OpenMPIRBuilder::InsertPointTy *redAllocaIP, SmallVector &reductionInfos) { llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); 
- // FIXME: This ignores any other nested wrappers (e.g. omp.parallel + - // omp.wsloop, omp.simd). + // FIXME: This ignores any other nested wrappers (e.g. omp.wsloop, omp.simd). auto distributeOp = cast(opInst); auto loopOp = cast(distributeOp.getWrappedLoop()); diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index 3bed809bb6dc0d..856415655490f8 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -10,58 +10,6 @@ func.func @unknown_clause() { // ----- -func.func @not_wrapper() { - // expected-error@+1 {{op must be a loop wrapper}} - omp.distribute { - omp.parallel { - %0 = arith.constant 0 : i32 - omp.terminator - } - omp.terminator - } - - return -} - -// ----- - -func.func @invalid_nested_wrapper(%lb : index, %ub : index, %step : index) { - omp.distribute { - // expected-error@+1 {{only supported nested wrapper is 'omp.wsloop'}} - omp.parallel { - omp.simd { - omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { - omp.yield - } - omp.terminator - } - omp.terminator - } - omp.terminator - } - - return -} - -// ----- - -func.func @no_nested_wrapper(%lb : index, %ub : index, %step : index) { - omp.distribute { - // expected-error@+1 {{op must not wrap an 'omp.loop_nest' directly}} - omp.parallel { - omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { - omp.yield - } - omp.terminator - } - omp.terminator - } - - return -} - -// ----- - func.func @if_once(%n : i1) { // expected-error@+1 {{`if` clause can appear at most once in the expansion of the oilist directive}} omp.parallel if(%n) if(%n) { @@ -188,7 +136,7 @@ func.func @iv_number_mismatch(%lb : index, %ub : index, %step : index) { // ----- func.func @no_wrapper(%lb : index, %ub : index, %step : index) { - // expected-error @below {{op must be a loop wrapper}} + // expected-error @below {{op must be a valid loop wrapper}} omp.wsloop { %0 = arith.constant 0 : i32 omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { @@ -374,7 +322,7 @@ llvm.func @test_omp_wsloop_dynamic_wrong_modifier3(%lb : i64, %ub : i64, %step : // ----- func.func @omp_simd() -> () { - // expected-error @below {{op must be a loop wrapper}} + // expected-error @below {{op must be a valid loop wrapper}} omp.simd { omp.terminator } @@ -1939,7 +1887,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { // ----- func.func @taskloop(%lb: i32, %ub: i32, %step: i32) { - // expected-error @below {{op must be a loop wrapper}} + // expected-error @below {{op must be a valid loop wrapper}} omp.taskloop { %0 = arith.constant 0 : i32 omp.terminator @@ -2148,7 +2096,7 @@ func.func @omp_distribute_allocate(%data_var : memref) -> () { // ----- func.func @omp_distribute_wrapper() -> () { - // expected-error @below {{op must be a loop wrapper}} + // expected-error @below {{op must be a valid loop wrapper}} omp.distribute { %0 = arith.constant 0 : i32 "omp.terminator"() : () -> () @@ -2157,8 +2105,8 @@ func.func @omp_distribute_wrapper() -> () { // ----- -func.func @omp_distribute_nested_wrapper(%lb: index, %ub: index, %step: index) -> () { - // expected-error @below {{only supported nested wrappers are 'omp.parallel' and 'omp.simd'}} +func.func @omp_distribute_nested_wrapper1(%lb: index, %ub: index, %step: index) -> () { + // expected-error @below {{an 'omp.wsloop' nested wrapper is only allowed when 'omp.parallel' is the direct parent}} omp.distribute { "omp.wsloop"() ({ omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { @@ -2172,6 +2120,21 @@ func.func 
@omp_distribute_nested_wrapper(%lb: index, %ub: index, %step: index) - // ----- +func.func @omp_distribute_nested_wrapper2(%lb: index, %ub: index, %step: index) -> () { + // expected-error @below {{only supported nested wrappers are 'omp.simd' and 'omp.wsloop'}} + omp.distribute { + "omp.taskloop"() ({ + omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) { + "omp.yield"() : () -> () + } + "omp.terminator"() : () -> () + }) : () -> () + "omp.terminator"() : () -> () + } +} + +// ----- + func.func @omp_distribute_order() -> () { // expected-error @below {{invalid clause value: 'default'}} omp.distribute order(default) { diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index e49731c0d68301..92e42d31e90333 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -99,10 +99,11 @@ func.func @omp_parallel(%data_var : memref, %if_cond : i1, %num_threads : i omp.terminator }) {operandSegmentSizes = array} : (memref, memref) -> () - // CHECK: omp.distribute - omp.distribute { - // CHECK-NEXT: omp.parallel - omp.parallel { + // CHECK: omp.parallel + omp.parallel { + // CHECK-NOT: omp.terminator + // CHECK: omp.distribute + omp.distribute { // CHECK-NEXT: omp.wsloop omp.wsloop { // CHECK-NEXT: omp.loop_nest From 01c368d9189d9e3c49ae9ad7c19b7c91289e4a43 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Tue, 6 Aug 2024 16:15:31 +0100 Subject: [PATCH 2/3] [Flang][OpenMP] Re-implement lowering of 'DISTRIBUTE PARALLEL DO' This patch updates the Flang lowering process for `distribute parallel do` to follow the "hoisted `omp.parallel`" representation. Now temporary allocations produced while lowering the loop body reside inside of that operation instead of the loop wrappers' parent region. Special handling of `omp.parallel` with regards to alloca creation is removed, as it's no longer necessary to make this distinction. Impacted Lit tests are updated according to the new representation. --- flang/lib/Lower/OpenMP/OpenMP.cpp | 135 +++++++++------ flang/lib/Optimizer/Builder/FIRBuilder.cpp | 24 +-- flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp | 13 +- .../lib/Optimizer/Transforms/StackArrays.cpp | 30 +--- .../test/Lower/OpenMP/eval-outside-target.f90 | 44 ++--- .../Lower/OpenMP/hlfir-to-fir-conv-omp.mlir | 11 +- flang/test/Lower/OpenMP/if-clause.f90 | 160 +++++++++--------- flang/test/Lower/OpenMP/loop-compound.f90 | 12 +- flang/test/Lower/OpenMP/target_private.f90 | 10 +- 9 files changed, 211 insertions(+), 228 deletions(-) diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 662d0b56665f32..245a5e63ea1b7b 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1905,18 +1905,23 @@ genParallelOp(lower::AbstractConverter &converter, lower::SymMap &symTable, return parallelOp; } -// TODO: Replace with genWrapperOp calls. 
-static mlir::omp::ParallelOp genParallelWrapperOp( +static mlir::omp::ParallelOp genParallelCompositeOp( lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx, - lower::pft::Evaluation &eval, mlir::Location loc, - const mlir::omp::ParallelOperands &clauseOps, + const List &clauses, lower::pft::Evaluation &eval, + mlir::Location loc, mlir::omp::ParallelOperands &clauseOps, mlir::omp::NumThreadsClauseOps &numThreadsClauseOps, llvm::ArrayRef reductionSyms, llvm::ArrayRef reductionTypes, mlir::omp::TargetOp parentTarget, DataSharingProcessor &dsp) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - // Create omp.parallel wrapper. + if (enableDelayedPrivatization) { + const auto &privateClauseOps = dsp.getPrivateClauseOps(); + clauseOps.privateVars = privateClauseOps.privateVars; + clauseOps.privateSyms = privateClauseOps.privateSyms; + } + + // Create omp.parallel operation. auto parallelOp = firOpBuilder.create(loc, clauseOps); if (numThreadsClauseOps.numThreads) { @@ -1928,22 +1933,60 @@ static mlir::omp::ParallelOp genParallelWrapperOp( } // Populate entry block arguments with reduction and private variables. - mlir::OperandRange privateVars = parallelOp.getPrivateVars(); - llvm::SmallVector blockArgTypes(reductionTypes.begin(), reductionTypes.end()); - blockArgTypes.reserve(blockArgTypes.size() + privateVars.size()); - llvm::transform(privateVars, std::back_inserter(blockArgTypes), - [](mlir::Value v) { return v.getType(); }); - llvm::SmallVector blockArgLocs(reductionTypes.size(), loc); - blockArgLocs.reserve(blockArgLocs.size() + privateVars.size()); - llvm::transform(privateVars, std::back_inserter(blockArgLocs), - [](mlir::Value v) { return v.getLoc(); }); + llvm::SmallVector blockSyms(reductionSyms); + + if (enableDelayedPrivatization) { + mlir::OperandRange privateVars = parallelOp.getPrivateVars(); + + blockArgTypes.reserve(blockArgTypes.size() + privateVars.size()); + llvm::transform(privateVars, std::back_inserter(blockArgTypes), + [](mlir::Value v) { return v.getType(); }); - firOpBuilder.createBlock(¶llelOp.getRegion(), {}, blockArgTypes, + blockArgLocs.reserve(blockArgLocs.size() + privateVars.size()); + llvm::transform(privateVars, std::back_inserter(blockArgLocs), + [](mlir::Value v) { return v.getLoc(); }); + + llvm::append_range(blockSyms, dsp.getDelayedPrivSyms()); + } + + mlir::Region ®ion = parallelOp.getRegion(); + firOpBuilder.createBlock(®ion, /*insertPt=*/{}, blockArgTypes, blockArgLocs); + // Bind syms to block args. + unsigned argIdx = 0; + for (const semantics::Symbol *arg : blockSyms) { + auto bind = [&](const semantics::Symbol *sym) { + mlir::BlockArgument blockArg = region.getArgument(argIdx++); + converter.bindSymbol(*sym, hlfir::translateToExtendedValue( + loc, firOpBuilder, hlfir::Entity{blockArg}, + /*contiguousHint=*/ + evaluate::IsSimplyContiguous( + *sym, converter.getFoldingContext())) + .first); + }; + + if (const auto *commonDet = + arg->detailsIf()) { + for (const auto &mem : commonDet->objects()) + bind(&*mem); + } else + bind(arg); + } + + // Handle threadprivate and copyin, which would normally be done as part of + // `createBodyOfOp()`. However, when generating `omp.parallel` as part of a + // composite construct, we can't recursively lower its contents. This prevents + // us from being able to rely on the existing `genOpWithBody()` flow. 
+ { + mlir::OpBuilder::InsertionGuard guard(firOpBuilder); + threadPrivatizeVars(converter, eval); + } + ClauseProcessor(converter, semaCtx, clauses).processCopyin(); + firOpBuilder.setInsertionPoint( lower::genOpenMPTerminator(firOpBuilder, parallelOp, loc)); @@ -2505,11 +2548,7 @@ static void genCompositeDistributeParallelDo( findParentTargetOp(converter.getFirOpBuilder()); bool evalOutsideTarget = mustEvalTeamsThreadsOutsideTarget(eval, targetOp); - // Clause processing. - mlir::omp::DistributeOperands distributeClauseOps; - genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc, - distributeClauseOps); - + // Create parent omp.parallel first. mlir::omp::ParallelOperands parallelClauseOps; mlir::omp::NumThreadsClauseOps numThreadsClauseOps; llvm::SmallVector parallelReductionSyms; @@ -2518,9 +2557,15 @@ static void genCompositeDistributeParallelDo( evalOutsideTarget, parallelClauseOps, numThreadsClauseOps, parallelReductionTypes, parallelReductionSyms); - const auto &privateClauseOps = dsp.getPrivateClauseOps(); - parallelClauseOps.privateVars = privateClauseOps.privateVars; - parallelClauseOps.privateSyms = privateClauseOps.privateSyms; + genParallelCompositeOp(converter, semaCtx, item->clauses, eval, loc, + parallelClauseOps, numThreadsClauseOps, + parallelReductionSyms, parallelReductionTypes, + evalOutsideTarget ? targetOp : nullptr, dsp); + + // Clause processing. + mlir::omp::DistributeOperands distributeClauseOps; + genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc, + distributeClauseOps); mlir::omp::WsloopOperands wsloopClauseOps; llvm::SmallVector wsloopReductionSyms; @@ -2538,11 +2583,6 @@ static void genCompositeDistributeParallelDo( auto distributeOp = genWrapperOp( converter, loc, distributeClauseOps, /*blockArgTypes=*/{}); - auto parallelOp = genParallelWrapperOp( - converter, semaCtx, eval, loc, parallelClauseOps, numThreadsClauseOps, - parallelReductionSyms, parallelReductionTypes, - evalOutsideTarget ? targetOp : nullptr, dsp); - // TODO: Add private variables to entry block arguments. auto wsloopOp = genWrapperOp( converter, loc, wsloopClauseOps, wsloopReductionTypes); @@ -2550,14 +2590,10 @@ static void genCompositeDistributeParallelDo( // Construct wrapper entry block list and associated symbols. It is important // that the symbol order and the block argument order match, so that the // symbol-value bindings created are correct. - auto wrapperSyms = - llvm::to_vector(llvm::concat( - parallelReductionSyms, dsp.getDelayedPrivSyms(), - wsloopReductionSyms)); + auto &wrapperSyms = wsloopReductionSyms; auto wrapperArgs = llvm::to_vector( llvm::concat(distributeOp.getRegion().getArguments(), - parallelOp.getRegion().getArguments(), wsloopOp.getRegion().getArguments())); genLoopNestOp(converter, symTable, semaCtx, eval, loc, queue, item, @@ -2576,11 +2612,7 @@ static void genCompositeDistributeParallelDoSimd( findParentTargetOp(converter.getFirOpBuilder()); bool evalOutsideTarget = mustEvalTeamsThreadsOutsideTarget(eval, targetOp); - // Clause processing. - mlir::omp::DistributeOperands distributeClauseOps; - genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc, - distributeClauseOps); - + // Create parent omp.parallel first. 
mlir::omp::ParallelOperands parallelClauseOps; mlir::omp::NumThreadsClauseOps numThreadsClauseOps; llvm::SmallVector parallelReductionSyms; @@ -2589,9 +2621,15 @@ static void genCompositeDistributeParallelDoSimd( evalOutsideTarget, parallelClauseOps, numThreadsClauseOps, parallelReductionTypes, parallelReductionSyms); - const auto &privateClauseOps = dsp.getPrivateClauseOps(); - parallelClauseOps.privateVars = privateClauseOps.privateVars; - parallelClauseOps.privateSyms = privateClauseOps.privateSyms; + genParallelCompositeOp(converter, semaCtx, item->clauses, eval, loc, + parallelClauseOps, numThreadsClauseOps, + parallelReductionSyms, parallelReductionTypes, + evalOutsideTarget ? targetOp : nullptr, dsp); + + // Clause processing. + mlir::omp::DistributeOperands distributeClauseOps; + genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc, + distributeClauseOps); mlir::omp::WsloopOperands wsloopClauseOps; llvm::SmallVector wsloopReductionSyms; @@ -2612,11 +2650,6 @@ static void genCompositeDistributeParallelDoSimd( auto distributeOp = genWrapperOp( converter, loc, distributeClauseOps, /*blockArgTypes=*/{}); - auto parallelOp = genParallelWrapperOp( - converter, semaCtx, eval, loc, parallelClauseOps, numThreadsClauseOps, - parallelReductionSyms, parallelReductionTypes, - evalOutsideTarget ? targetOp : nullptr, dsp); - // TODO: Add private variables to entry block arguments. auto wsloopOp = genWrapperOp( converter, loc, wsloopClauseOps, wsloopReductionTypes); @@ -2628,14 +2661,10 @@ static void genCompositeDistributeParallelDoSimd( // Construct wrapper entry block list and associated symbols. It is important // that the symbol order and the block argument order match, so that the // symbol-value bindings created are correct. - auto wrapperSyms = - llvm::to_vector(llvm::concat( - parallelReductionSyms, dsp.getDelayedPrivSyms(), - wsloopReductionSyms)); + auto &wrapperSyms = wsloopReductionSyms; auto wrapperArgs = llvm::to_vector(llvm::concat( distributeOp.getRegion().getArguments(), - parallelOp.getRegion().getArguments(), wsloopOp.getRegion().getArguments(), simdOp.getRegion().getArguments())); genLoopNestOp(converter, symTable, semaCtx, eval, loc, queue, item, @@ -2756,10 +2785,12 @@ static void genOMPDispatch(lower::AbstractConverter &converter, bool loopLeaf = llvm::omp::getDirectiveAssociation(item->id) == llvm::omp::Association::Loop; if (loopLeaf) { + // Used delayed privatization for 'distribute parallel do [simd]'. + bool useDelayedPrivatization = llvm::omp::allParallelSet.test(item->id); symTable.pushScope(); loopDsp.emplace(converter, semaCtx, item->clauses, eval, /*shouldCollectPreDeterminedSymbols=*/true, - /*useDelayedPrivatization=*/false, &symTable); + useDelayedPrivatization, &symTable); loopDsp->processStep1(); loopDsp->processStep2(); } diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp index c45d3b9d7a2f3d..d54715d3fa3f56 100644 --- a/flang/lib/Optimizer/Builder/FIRBuilder.cpp +++ b/flang/lib/Optimizer/Builder/FIRBuilder.cpp @@ -256,19 +256,7 @@ mlir::Block *fir::FirOpBuilder::getAllocaBlock() { if (auto ompOutlineableIface = getRegion() .getParentOfType()) { - // omp.parallel can work as a block construct but it can also be a loop - // wrapper when part of a composite construct. Make sure it's only treated - // as a block if it's not a wrapper. 
- auto parallelOp = - llvm::dyn_cast(*ompOutlineableIface); - if (!parallelOp || !llvm::isa_and_present( - parallelOp->getParentOp())) - return ompOutlineableIface.getAllocaBlock(); - - if (auto parentOutlineable = - parallelOp - ->getParentOfType()) - return parentOutlineable.getAllocaBlock(); + return ompOutlineableIface.getAllocaBlock(); } if (auto recipeIface = @@ -285,15 +273,9 @@ mlir::Value fir::FirOpBuilder::createTemporaryAlloc( llvm::ArrayRef attrs) { assert(!mlir::isa(type) && "cannot be a reference"); // If the alloca is inside an OpenMP Op which will be outlined then pin - // the alloca here. Make sure that an omp.parallel operation that is taking - // a loop wrapper role is not detected as outlineable here. - auto iface = - getRegion().getParentOfType(); - auto parallelOp = - iface ? llvm::dyn_cast(*iface) : nullptr; + // the alloca here. const bool pinned = - iface && (!parallelOp || !llvm::isa_and_present( - parallelOp->getParentOp())); + getRegion().getParentOfType(); mlir::Value temp = create(loc, type, /*unique_name=*/llvm::StringRef{}, name, pinned, lenParams, shape, attrs); diff --git a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp index 22026f1d258716..0e114f069be66d 100644 --- a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp +++ b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp @@ -285,16 +285,9 @@ mlir::Value ConvertFIRToLLVMPattern::computeBoxSize( // 4. The first ancestor that is one of the above. mlir::Block *ConvertFIRToLLVMPattern::getBlockForAllocaInsert( mlir::Operation *op, mlir::Region *parentRegion) const { - if (auto iface = - mlir::dyn_cast(op)) { - // omp.parallel can work as a block construct but it can also be a loop - // wrapper when it's part of a composite construct. Make sure it's only - // treated as a block if it's not a wrapper. - auto parallelOp = llvm::dyn_cast(*iface); - if (!parallelOp || !llvm::isa_and_present( - parallelOp->getParentOp())) - return iface.getAllocaBlock(); - } + if (auto outlineableIface = + mlir::dyn_cast(op)) + return outlineableIface.getAllocaBlock(); if (auto recipeIface = mlir::dyn_cast(op)) return recipeIface.getAllocaBlock(*parentRegion); if (auto llvmFuncOp = mlir::dyn_cast(op)) diff --git a/flang/lib/Optimizer/Transforms/StackArrays.cpp b/flang/lib/Optimizer/Transforms/StackArrays.cpp index d6eae5ec5fdde0..bdc2d9cd9c6c43 100644 --- a/flang/lib/Optimizer/Transforms/StackArrays.cpp +++ b/flang/lib/Optimizer/Transforms/StackArrays.cpp @@ -589,31 +589,8 @@ AllocMemConversion::findAllocaInsertionPoint(fir::AllocMemOp &oldAlloc) { return {point}; }; - // Find the first OpenMP outlineable parent region while taking into account - // the possibility of finding an omp.parallel region that is taking a loop - // wrapper role. These operations must be skipped, as they cannot hold - // allocations. 
- const auto findOmpRegion = [](mlir::Operation *op) { - auto findOmpRegionImpl = - [](mlir::Operation *op, - auto &findOmpRegion) -> mlir::omp::OutlineableOpenMPOpInterface { - auto ompRegion = - op->getParentOfType(); - if (!ompRegion) - return nullptr; - - if (auto parallelOp = - mlir::dyn_cast_if_present(*ompRegion)) { - mlir::Operation *parentOp = parallelOp->getParentOp(); - if (mlir::isa_and_present(parentOp)) - return findOmpRegion(parentOp, findOmpRegion); - } - return ompRegion; - }; - return findOmpRegionImpl(op, findOmpRegionImpl); - }; - - auto oldOmpRegion = findOmpRegion(oldAlloc); + auto oldOmpRegion = + oldAlloc->getParentOfType(); // Find when the last operand value becomes available mlir::Block *operandsBlock = nullptr; @@ -641,7 +618,8 @@ AllocMemConversion::findAllocaInsertionPoint(fir::AllocMemOp &oldAlloc) { LLVM_DEBUG(llvm::dbgs() << "--Placing after last operand: " << *lastOperand << "\n"); // check we aren't moving out of an omp region - auto lastOpOmpRegion = findOmpRegion(lastOperand); + auto lastOpOmpRegion = + lastOperand->getParentOfType(); if (lastOpOmpRegion == oldOmpRegion) return checkReturn(lastOperand); // Presumably this happened because the operands became ready before the diff --git a/flang/test/Lower/OpenMP/eval-outside-target.f90 b/flang/test/Lower/OpenMP/eval-outside-target.f90 index 5d4a8a104c8952..ef578610e8e908 100644 --- a/flang/test/Lower/OpenMP/eval-outside-target.f90 +++ b/flang/test/Lower/OpenMP/eval-outside-target.f90 @@ -2,12 +2,11 @@ ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 %s -o - | FileCheck %s --check-prefixes=BOTH,HOST ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=51 -fopenmp-is-target-device %s -o - | FileCheck %s --check-prefixes=BOTH,DEVICE -! CHECK-LABEL: func.func @_QPteams +! BOTH-LABEL: func.func @_QPteams subroutine teams() ! BOTH: omp.target - ! HOST-SAME: num_teams({{.*}}) - ! HOST-SAME: teams_thread_limit({{.*}}) + ! HOST-SAME: num_teams({{.*}}) teams_thread_limit({{.*}}) ! DEVICE-NOT: num_teams({{.*}}) ! DEVICE-NOT: teams_thread_limit({{.*}}) @@ -20,8 +19,7 @@ subroutine teams() ! HOST-NOT: thread_limit({{.*}}) ! HOST-SAME: { - ! DEVICE-SAME: num_teams({{.*}}) - ! DEVICE-SAME: thread_limit({{.*}}) + ! DEVICE-SAME: num_teams({{.*}}) thread_limit({{.*}}) !$omp teams num_teams(1) thread_limit(2) call foo() !$omp end teams @@ -29,13 +27,13 @@ subroutine teams() !$omp end target ! BOTH: omp.teams - ! BOTH-SAME: num_teams({{.*}}) - ! BOTH-SAME: thread_limit({{.*}}) + ! BOTH-SAME: num_teams({{.*}}) thread_limit({{.*}}) !$omp teams num_teams(1) thread_limit(2) call foo() !$omp end teams end subroutine teams +! BOTH-LABEL: func.func @_QPparallel subroutine parallel() ! BOTH: omp.target @@ -76,6 +74,7 @@ subroutine parallel() !$omp end parallel end subroutine parallel +! BOTH-LABEL: func.func @_QPdistribute_parallel_do subroutine distribute_parallel_do() ! BOTH: omp.target @@ -87,14 +86,14 @@ subroutine distribute_parallel_do() ! BOTH: omp.teams !$omp target teams - ! BOTH: omp.distribute - ! BOTH-NEXT: omp.parallel + ! BOTH: omp.parallel ! HOST-NOT: num_threads({{.*}}) ! HOST-SAME: { - + ! DEVICE-SAME: num_threads({{.*}}) - + + ! BOTH: omp.distribute ! BOTH-NEXT: omp.wsloop !$omp distribute parallel do num_threads(1) do i=1,10 @@ -110,9 +109,9 @@ subroutine distribute_parallel_do() !$omp target teams call foo() - ! BOTH: omp.distribute - ! BOTH-NEXT: omp.parallel + ! BOTH: omp.parallel ! BOTH-SAME: num_threads({{.*}}) + ! BOTH: omp.distribute ! 
BOTH-NEXT: omp.wsloop !$omp distribute parallel do num_threads(1) do i=1,10 @@ -124,9 +123,9 @@ subroutine distribute_parallel_do() ! BOTH: omp.teams !$omp teams - ! BOTH: omp.distribute - ! BOTH-NEXT: omp.parallel + ! BOTH: omp.parallel ! BOTH-SAME: num_threads({{.*}}) + ! BOTH: omp.distribute ! BOTH-NEXT: omp.wsloop !$omp distribute parallel do num_threads(1) do i=1,10 @@ -136,6 +135,7 @@ subroutine distribute_parallel_do() !$omp end teams end subroutine distribute_parallel_do +! BOTH-LABEL: func.func @_QPdistribute_parallel_do_simd subroutine distribute_parallel_do_simd() ! BOTH: omp.target @@ -147,14 +147,14 @@ subroutine distribute_parallel_do_simd() ! BOTH: omp.teams !$omp target teams - ! BOTH: omp.distribute - ! BOTH-NEXT: omp.parallel + ! BOTH: omp.parallel ! HOST-NOT: num_threads({{.*}}) ! HOST-SAME: { ! DEVICE-SAME: num_threads({{.*}}) - + + ! BOTH: omp.distribute ! BOTH-NEXT: omp.wsloop ! BOTH-NEXT: omp.simd !$omp distribute parallel do simd num_threads(1) @@ -171,9 +171,9 @@ subroutine distribute_parallel_do_simd() !$omp target teams call foo() - ! BOTH: omp.distribute - ! BOTH-NEXT: omp.parallel + ! BOTH: omp.parallel ! BOTH-SAME: num_threads({{.*}}) + ! BOTH: omp.distribute ! BOTH-NEXT: omp.wsloop ! BOTH-NEXT: omp.simd !$omp distribute parallel do simd num_threads(1) @@ -186,9 +186,9 @@ subroutine distribute_parallel_do_simd() ! BOTH: omp.teams !$omp teams - ! BOTH: omp.distribute - ! BOTH-NEXT: omp.parallel + ! BOTH: omp.parallel ! BOTH-SAME: num_threads({{.*}}) + ! BOTH: omp.distribute ! BOTH-NEXT: omp.wsloop ! BOTH-NEXT: omp.simd !$omp distribute parallel do simd num_threads(1) diff --git a/flang/test/Lower/OpenMP/hlfir-to-fir-conv-omp.mlir b/flang/test/Lower/OpenMP/hlfir-to-fir-conv-omp.mlir index 62f93efde9c643..61bc8cc60ecd22 100644 --- a/flang/test/Lower/OpenMP/hlfir-to-fir-conv-omp.mlir +++ b/flang/test/Lower/OpenMP/hlfir-to-fir-conv-omp.mlir @@ -17,8 +17,6 @@ func.func @_QPfoo() { // CHECK: omp.target omp.target map_entries(%map_info -> %arg1 : !fir.ref>) { ^bb0(%arg1: !fir.ref>): - - // CHECK: %[[TO_BOX_ALLOC:.*]] = fir.alloca !fir.box> {pinned} %c1_2 = arith.constant 1 : index %21 = fir.shape %c1_2 : (index) -> !fir.shape<1> @@ -30,10 +28,11 @@ func.func @_QPfoo() { %c1_3 = arith.constant 1 : i32 %c10 = arith.constant 10 : i32 - // CHECK: omp.distribute - omp.distribute { - // CHECK: omp.parallel - omp.parallel { + // CHECK: omp.parallel + omp.parallel { + // CHECK: %[[TO_BOX_ALLOC:.*]] = fir.alloca !fir.box> {pinned} + // CHECK: omp.distribute + omp.distribute { // CHECK: omp.wsloop omp.wsloop { // CHECK: omp.loop_nest diff --git a/flang/test/Lower/OpenMP/if-clause.f90 b/flang/test/Lower/OpenMP/if-clause.f90 index 402fbe62df58ad..402fd56b2fd816 100644 --- a/flang/test/Lower/OpenMP/if-clause.f90 +++ b/flang/test/Lower/OpenMP/if-clause.f90 @@ -19,10 +19,10 @@ program main ! ---------------------------------------------------------------------------- !$omp teams - ! CHECK: omp.distribute + ! CHECK: omp.parallel ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.parallel + ! CHECK: omp.distribute ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { ! CHECK: omp.wsloop @@ -36,12 +36,12 @@ program main end do !$omp end distribute parallel do simd - ! CHECK: omp.distribute - ! CHECK-NOT: if({{.*}}) - ! CHECK-SAME: { ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { + ! CHECK: omp.distribute + ! CHECK-NOT: if({{.*}}) + ! CHECK-SAME: { ! CHECK: omp.wsloop ! CHECK-NOT: if({{.*}}) ! 
CHECK-SAME: { @@ -53,12 +53,12 @@ program main end do !$omp end distribute parallel do simd - ! CHECK: omp.distribute - ! CHECK-NOT: if({{.*}}) - ! CHECK-SAME: { ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { + ! CHECK: omp.distribute + ! CHECK-NOT: if({{.*}}) + ! CHECK-SAME: { ! CHECK: omp.wsloop ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -70,12 +70,12 @@ program main end do !$omp end distribute parallel do simd - ! CHECK: omp.distribute - ! CHECK-NOT: if({{.*}}) - ! CHECK-SAME: { ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { + ! CHECK: omp.distribute + ! CHECK-NOT: if({{.*}}) + ! CHECK-SAME: { ! CHECK: omp.wsloop ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -87,10 +87,10 @@ program main end do !$omp end distribute parallel do simd - ! CHECK: omp.distribute + ! CHECK: omp.parallel ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.parallel + ! CHECK: omp.distribute ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { ! CHECK: omp.wsloop @@ -111,10 +111,10 @@ program main ! ---------------------------------------------------------------------------- !$omp teams - ! CHECK: omp.distribute + ! CHECK: omp.parallel ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.parallel + ! CHECK: omp.distribute ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { ! CHECK: omp.wsloop @@ -125,12 +125,12 @@ program main end do !$omp end distribute parallel do - ! CHECK: omp.distribute - ! CHECK-NOT: if({{.*}}) - ! CHECK-SAME: { ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { + ! CHECK: omp.distribute + ! CHECK-NOT: if({{.*}}) + ! CHECK-SAME: { ! CHECK: omp.wsloop ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -139,12 +139,12 @@ program main end do !$omp end distribute parallel do - ! CHECK: omp.distribute - ! CHECK-NOT: if({{.*}}) - ! CHECK-SAME: { ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { + ! CHECK: omp.distribute + ! CHECK-NOT: if({{.*}}) + ! CHECK-SAME: { ! CHECK: omp.wsloop ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -823,10 +823,10 @@ program main ! CHECK: omp.teams ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.distribute + ! CHECK: omp.parallel ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.parallel + ! CHECK: omp.distribute ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { ! CHECK: omp.wsloop @@ -843,12 +843,12 @@ program main ! CHECK: omp.teams ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.distribute - ! CHECK-NOT: if({{.*}}) - ! CHECK-SAME: { ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { + ! CHECK: omp.distribute + ! CHECK-NOT: if({{.*}}) + ! CHECK-SAME: { ! CHECK: omp.wsloop ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -863,12 +863,12 @@ program main ! CHECK: omp.teams ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.distribute - ! CHECK-NOT: if({{.*}}) - ! CHECK-SAME: { ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { + ! CHECK: omp.distribute + ! CHECK-NOT: if({{.*}}) + ! CHECK-SAME: { ! CHECK: omp.wsloop ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -883,10 +883,10 @@ program main ! CHECK: omp.teams ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.distribute + ! CHECK: omp.parallel ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.parallel + ! CHECK: omp.distribute ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { ! CHECK: omp.wsloop @@ -903,10 +903,10 @@ program main ! CHECK: omp.teams ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.distribute + ! CHECK: omp.parallel ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.parallel + ! CHECK: omp.distribute ! CHECK-NOT: if({{.*}}) ! 
CHECK-SAME: { ! CHECK: omp.wsloop @@ -923,12 +923,12 @@ program main ! CHECK: omp.teams ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.distribute - ! CHECK-NOT: if({{.*}}) - ! CHECK-SAME: { ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { + ! CHECK: omp.distribute + ! CHECK-NOT: if({{.*}}) + ! CHECK-SAME: { ! CHECK: omp.wsloop ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -946,10 +946,10 @@ program main ! CHECK: omp.teams ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.distribute + ! CHECK: omp.parallel ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.parallel + ! CHECK: omp.distribute ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { ! CHECK: omp.wsloop @@ -966,12 +966,12 @@ program main ! CHECK: omp.teams ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.distribute - ! CHECK-NOT: if({{.*}}) - ! CHECK-SAME: { ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { + ! CHECK: omp.distribute + ! CHECK-NOT: if({{.*}}) + ! CHECK-SAME: { ! CHECK: omp.wsloop ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -986,12 +986,12 @@ program main ! CHECK: omp.teams ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.distribute - ! CHECK-NOT: if({{.*}}) - ! CHECK-SAME: { ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { + ! CHECK: omp.distribute + ! CHECK-NOT: if({{.*}}) + ! CHECK-SAME: { ! CHECK: omp.wsloop ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -1006,10 +1006,10 @@ program main ! CHECK: omp.teams ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.distribute + ! CHECK: omp.parallel ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.parallel + ! CHECK: omp.distribute ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { ! CHECK: omp.wsloop @@ -1026,10 +1026,10 @@ program main ! CHECK: omp.teams ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.distribute + ! CHECK: omp.parallel ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.parallel + ! CHECK: omp.distribute ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { ! CHECK: omp.wsloop @@ -1046,12 +1046,12 @@ program main ! CHECK: omp.teams ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.distribute - ! CHECK-NOT: if({{.*}}) - ! CHECK-SAME: { ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { + ! CHECK: omp.distribute + ! CHECK-NOT: if({{.*}}) + ! CHECK-SAME: { ! CHECK: omp.wsloop ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -1066,10 +1066,10 @@ program main ! CHECK: omp.teams ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.distribute + ! CHECK: omp.parallel ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.parallel + ! CHECK: omp.distribute ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { ! CHECK: omp.wsloop @@ -1314,10 +1314,10 @@ program main ! CHECK: omp.teams ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.distribute + ! CHECK: omp.parallel ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.parallel + ! CHECK: omp.distribute ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { ! CHECK: omp.wsloop @@ -1331,12 +1331,12 @@ program main ! CHECK: omp.teams ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.distribute - ! CHECK-NOT: if({{.*}}) - ! CHECK-SAME: { ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { + ! CHECK: omp.distribute + ! CHECK-NOT: if({{.*}}) + ! CHECK-SAME: { ! CHECK: omp.wsloop ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -1348,12 +1348,12 @@ program main ! CHECK: omp.teams ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.distribute - ! CHECK-NOT: if({{.*}}) - ! CHECK-SAME: { ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { + ! 
CHECK: omp.distribute + ! CHECK-NOT: if({{.*}}) + ! CHECK-SAME: { ! CHECK: omp.wsloop ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -1365,10 +1365,10 @@ program main ! CHECK: omp.teams ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.distribute + ! CHECK: omp.parallel ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.parallel + ! CHECK: omp.distribute ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { ! CHECK: omp.wsloop @@ -1382,12 +1382,12 @@ program main ! CHECK: omp.teams ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.distribute - ! CHECK-NOT: if({{.*}}) - ! CHECK-SAME: { ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { + ! CHECK: omp.distribute + ! CHECK-NOT: if({{.*}}) + ! CHECK-SAME: { ! CHECK: omp.wsloop ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -1402,10 +1402,10 @@ program main ! CHECK: omp.teams ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.distribute + ! CHECK: omp.parallel ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.parallel + ! CHECK: omp.distribute ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { ! CHECK: omp.wsloop @@ -1422,12 +1422,12 @@ program main ! CHECK: omp.teams ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.distribute - ! CHECK-NOT: if({{.*}}) - ! CHECK-SAME: { ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { + ! CHECK: omp.distribute + ! CHECK-NOT: if({{.*}}) + ! CHECK-SAME: { ! CHECK: omp.wsloop ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -1442,12 +1442,12 @@ program main ! CHECK: omp.teams ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.distribute - ! CHECK-NOT: if({{.*}}) - ! CHECK-SAME: { ! CHECK: omp.parallel ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { + ! CHECK: omp.distribute + ! CHECK-NOT: if({{.*}}) + ! CHECK-SAME: { ! CHECK: omp.wsloop ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -1462,10 +1462,10 @@ program main ! CHECK: omp.teams ! CHECK-SAME: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.distribute + ! CHECK: omp.parallel ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.parallel + ! CHECK: omp.distribute ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { ! CHECK: omp.wsloop @@ -1482,11 +1482,11 @@ program main ! CHECK: omp.teams ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { + ! CHECK: omp.parallel + ! CHECK-SAME: if({{.*}}) ! CHECK: omp.distribute ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.parallel - ! CHECK-SAME: if({{.*}}) ! CHECK: omp.wsloop ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { @@ -1501,10 +1501,10 @@ program main ! CHECK: omp.teams ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.distribute + ! CHECK: omp.parallel ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { - ! CHECK: omp.parallel + ! CHECK: omp.distribute ! CHECK-NOT: if({{.*}}) ! CHECK-SAME: { ! CHECK: omp.wsloop diff --git a/flang/test/Lower/OpenMP/loop-compound.f90 b/flang/test/Lower/OpenMP/loop-compound.f90 index 05d373d3dcdacb..cb614a0debdaf5 100644 --- a/flang/test/Lower/OpenMP/loop-compound.f90 +++ b/flang/test/Lower/OpenMP/loop-compound.f90 @@ -14,8 +14,8 @@ program main ! ---------------------------------------------------------------------------- !$omp teams + ! CHECK: omp.parallel ! CHECK: omp.distribute - ! CHECK-NEXT: omp.parallel ! CHECK-NEXT: omp.wsloop ! CHECK-NEXT: omp.simd ! CHECK-NEXT: omp.loop_nest @@ -31,8 +31,8 @@ program main ! ---------------------------------------------------------------------------- !$omp teams + ! CHECK: omp.parallel ! CHECK: omp.distribute - ! CHECK-NEXT: omp.parallel ! CHECK-NEXT: omp.wsloop ! CHECK-NEXT: omp.loop_nest !$omp distribute parallel do @@ -144,8 +144,8 @@ program main ! 
---------------------------------------------------------------------------- ! CHECK: omp.target ! CHECK: omp.teams + ! CHECK: omp.parallel ! CHECK: omp.distribute - ! CHECK-NEXT: omp.parallel ! CHECK-NEXT: omp.wsloop ! CHECK-NEXT: omp.simd ! CHECK-NEXT: omp.loop_nest @@ -159,8 +159,8 @@ program main ! ---------------------------------------------------------------------------- ! CHECK: omp.target ! CHECK: omp.teams + ! CHECK: omp.parallel ! CHECK: omp.distribute - ! CHECK-NEXT: omp.parallel ! CHECK-NEXT: omp.wsloop ! CHECK-NEXT: omp.loop_nest !$omp target teams distribute parallel do @@ -196,8 +196,8 @@ program main ! TEAMS DISTRIBUTE PARALLEL DO SIMD ! ---------------------------------------------------------------------------- ! CHECK: omp.teams + ! CHECK: omp.parallel ! CHECK: omp.distribute - ! CHECK-NEXT: omp.parallel ! CHECK-NEXT: omp.wsloop ! CHECK-NEXT: omp.simd ! CHECK-NEXT: omp.loop_nest @@ -210,8 +210,8 @@ program main ! TEAMS DISTRIBUTE PARALLEL DO ! ---------------------------------------------------------------------------- ! CHECK: omp.teams + ! CHECK: omp.parallel ! CHECK: omp.distribute - ! CHECK-NEXT: omp.parallel ! CHECK-NEXT: omp.wsloop ! CHECK-NEXT: omp.loop_nest !$omp teams distribute parallel do diff --git a/flang/test/Lower/OpenMP/target_private.f90 b/flang/test/Lower/OpenMP/target_private.f90 index 52471206113ff6..e45d128f41db36 100644 --- a/flang/test/Lower/OpenMP/target_private.f90 +++ b/flang/test/Lower/OpenMP/target_private.f90 @@ -45,14 +45,14 @@ subroutine omp_target_target_do_simd() !CHECK: %[[IV:.*]] = omp.map.info{{.*}}map_clauses(implicit{{.*}}{name = "iv"} !CHECK: %[[VAR:.*]] = omp.map.info{{.*}}map_clauses(implicit{{.*}}{name = "var"} !CHECK: omp.target -!CHECK-SAME: map_entries(%[[IV]] -> %{{.*}}, %[[VAR]] -> %{{.*}} +!CHECK-SAME: map_entries(%[[IV]] -> %[[MAP_IV:.*]], %[[VAR]] -> %[[MAP_VAR:.*]] : !fir.ref, !fir.ref) +!CHECK: %[[MAP_IV_DECL:.*]]:2 = hlfir.declare %[[MAP_IV]] +!CHECK: %[[MAP_VAR_DECL:.*]]:2 = hlfir.declare %[[MAP_VAR]] !CHECK: omp.teams { -!CHECK: %[[IV_PRIV:.*]] = fir.alloca i64 {bindc_name = "iv" +!CHECK: omp.parallel private(@{{.*}} %[[MAP_IV_DECL]]#0 -> %[[IV_PRIV:.*]] : !fir.ref, @{{.*}} %[[MAP_VAR_DECL]]#0 -> %[[VAR_PRIV:.*]] : !fir.ref) { !CHECK: %[[IV_DECL:.*]]:2 = hlfir.declare %[[IV_PRIV]] -!CHECK: %[[VAR_PRIV:.*]] = fir.alloca f64 {bindc_name = "var" !CHECK: %[[VAR_DECL:.*]]:2 = hlfir.declare %[[VAR_PRIV]] -!CHECK: omp.distribute { -!CHECK-NEXT: omp.parallel { +!CHECK: omp.distribute { !CHECK-NEXT: omp.wsloop { !CHECK-NEXT: omp.simd { !CHECK-NEXT: omp.loop_nest From 9a95922fbdb98440390a342f95b44bb619394c08 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Wed, 7 Aug 2024 11:47:42 +0100 Subject: [PATCH 3/3] [Flang][OpenMP] Update DO CONCURRENT conversion for the device This patch makes changes to the `DoConcurrentConversion` pass to follow the "hoisted omp.parallel" representation when converting `do concurrent` constructs into `target teams distribute parallel do`. 
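
For reference, the op nesting this conversion targets for a simple one-level
`do concurrent` loop mapped to the device looks roughly as follows. This is a
minimal sketch following the updated basic_device.f90 expectations; clauses,
types and SSA values are elided or illustrative:

  omp.target {
    omp.teams {
      omp.parallel {
        // Loop-local allocations (e.g. the induction variable "i") and the
        // loop-bound computations are emitted here, inside the hoisted
        // omp.parallel region, rather than outside the loop wrappers.
        %iv = fir.alloca i32 {bindc_name = "i"}
        omp.distribute {
          omp.wsloop {
            omp.loop_nest (%arg0) : index = (%lb) to (%ub) inclusive step (%step) {
              // ... store %arg0 to %iv and lower the loop body ...
              omp.yield
            }
            omp.terminator
          }
          omp.terminator
        }
        omp.terminator
      }
      omp.terminator
    }
    omp.terminator
  }

The difference from the previous form is that `omp.parallel` is no longer a
loop wrapper nested under `omp.distribute`; it sits above it as a regular
block-associated construct, which is what allows loop-local allocations to be
placed directly in its region.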
--- .../Transforms/DoConcurrentConversion.cpp | 23 +++++++------------ .../Transforms/DoConcurrent/basic_device.f90 | 3 +-- .../multiple_iteration_ranges.f90 | 5 ++-- .../DoConcurrent/not_perfectly_nested.f90 | 4 +++- .../DoConcurrent/skip_all_nested_loops.f90 | 4 +++- 5 files changed, 17 insertions(+), 22 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp index a5de824c6cc0f7..912d33e0e38e9d 100644 --- a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp @@ -599,9 +599,7 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { targetOp = genTargetOp(doLoop.getLoc(), rewriter, mapper, outermostLoopLives, targetClauseOps); - genTeamsOp(doLoop.getLoc(), rewriter, loopNest, mapper, - loopNestClauseOps); - genDistributeOp(doLoop.getLoc(), rewriter); + genTeamsOp(doLoop.getLoc(), rewriter); } mlir::omp::ParallelOp parallelOp = genParallelOp( @@ -611,6 +609,9 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { looputils::localizeLoopLocalValue(local, parallelOp.getRegion(), rewriter); + if (mapToDevice) + genDistributeOp(doLoop.getLoc(), rewriter); + mlir::omp::LoopNestOp ompLoopNest = genWsLoopOp(rewriter, loopNest.back().first, mapper, loopNestClauseOps); @@ -800,18 +801,14 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { } mlir::omp::TeamsOp - genTeamsOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter, - looputils::LoopNestToIndVarMap &loopNest, mlir::IRMapping &mapper, - mlir::omp::LoopNestOperands &loopNestClauseOps) const { + genTeamsOp(mlir::Location loc, + mlir::ConversionPatternRewriter &rewriter) const { auto teamsOp = rewriter.create( loc, /*clauses=*/mlir::omp::TeamsOperands{}); rewriter.createBlock(&teamsOp.getRegion()); rewriter.setInsertionPoint(rewriter.create(loc)); - genLoopNestIndVarAllocs(rewriter, loopNest, mapper); - genLoopNestClauseOps(loc, rewriter, loopNest, mapper, loopNestClauseOps); - return teamsOp; } @@ -905,12 +902,8 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { rewriter.createBlock(¶llelOp.getRegion()); rewriter.setInsertionPoint(rewriter.create(loc)); - // If mapping to host, the local induction variable and loop bounds need to - // be emitted as part of the `omp.parallel` op. - if (!mapToDevice) { - genLoopNestIndVarAllocs(rewriter, loopNest, mapper); - genLoopNestClauseOps(loc, rewriter, loopNest, mapper, loopNestClauseOps); - } + genLoopNestIndVarAllocs(rewriter, loopNest, mapper); + genLoopNestClauseOps(loc, rewriter, loopNest, mapper, loopNestClauseOps); return parallelOp; } diff --git a/flang/test/Transforms/DoConcurrent/basic_device.f90 b/flang/test/Transforms/DoConcurrent/basic_device.f90 index b3d0f91ddd3e18..7873fa4f88db6d 100644 --- a/flang/test/Transforms/DoConcurrent/basic_device.f90 +++ b/flang/test/Transforms/DoConcurrent/basic_device.f90 @@ -43,6 +43,7 @@ program do_concurrent_basic ! CHECK: %[[A_DEV_DECL:.*]]:2 = hlfir.declare %[[A_ARG]] ! CHECK: omp.teams { + ! CHECK-NEXT: omp.parallel { ! CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"} ! CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) @@ -54,8 +55,6 @@ program do_concurrent_basic ! CHECK: %[[STEP:.*]] = arith.constant 1 : index ! CHECK-NEXT: omp.distribute { - ! CHECK-NEXT: omp.parallel { - ! CHECK-NEXT: omp.wsloop { ! 
CHECK-NEXT: omp.loop_nest (%[[ARG0:.*]]) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { diff --git a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 index a0364612976bcb..17cf27a9b70b27 100644 --- a/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 +++ b/flang/test/Transforms/DoConcurrent/multiple_iteration_ranges.f90 @@ -65,7 +65,7 @@ program main ! DEVICE: omp.target ! DEVICE: omp.teams -! HOST: omp.parallel { +! COMMON: omp.parallel { ! COMMON-NEXT: %[[ITER_VAR_I:.*]] = fir.alloca i32 {bindc_name = "i"} ! COMMON-NEXT: %[[BINDING_I:.*]]:2 = hlfir.declare %[[ITER_VAR_I]] {uniq_name = "_QFEi"} @@ -94,8 +94,7 @@ program main ! COMMON: %[[UB_K:.*]] = fir.convert %[[C30]] : (i32) -> index ! COMMON: %[[STEP_K:.*]] = arith.constant 1 : index -! DEVICE: omp.distribute -! DEVICE-NEXT: omp.parallel +! DEVICE: omp.distribute ! COMMON: omp.wsloop { ! COMMON-NEXT: omp.loop_nest diff --git a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 index 559d26c39cba55..f3f2e78f5b3183 100644 --- a/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 +++ b/flang/test/Transforms/DoConcurrent/not_perfectly_nested.f90 @@ -39,9 +39,11 @@ program main ! DEVICE: %[[TARGET_K_DECL:.*]]:2 = hlfir.declare %[[K_ARG]] {uniq_name = "_QFEk"} ! DEVICE: omp.teams -! DEVICE: omp.distribute ! COMMON: omp.parallel { + +! DEVICE: omp.distribute + ! COMMON: omp.wsloop { ! COMMON: omp.loop_nest ({{[^[:space:]]+}}) {{.*}} { ! COMMON: fir.do_loop %[[J_IV:.*]] = {{.*}} { diff --git a/flang/test/Transforms/DoConcurrent/skip_all_nested_loops.f90 b/flang/test/Transforms/DoConcurrent/skip_all_nested_loops.f90 index 362b8685cfd15b..429500cead1073 100644 --- a/flang/test/Transforms/DoConcurrent/skip_all_nested_loops.f90 +++ b/flang/test/Transforms/DoConcurrent/skip_all_nested_loops.f90 @@ -39,9 +39,11 @@ program main ! DEVICE: %[[TARGET_K_DECL:.*]]:2 = hlfir.declare %[[K_ARG]] {uniq_name = "_QFEk"} ! DEVICE: omp.teams -! DEVICE: omp.distribute ! COMMON: omp.parallel { + +! DEVICE: omp.distribute + ! COMMON: omp.wsloop { ! COMMON: omp.loop_nest ({{[^[:space:]]+}}) {{.*}} { ! COMMON: fir.do_loop {{.*}} iter_args(%[[J_IV:.*]] = {{.*}}) -> {{.*}} {