ROCm · skatrak · Aug 13, 2024 · Aug 6, 2024 · Aug 6, 2024 · Aug 7, 2024
diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
@@ -235,9 +235,7 @@ void DataSharingProcessor::insertBarrier() {
 void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) {
   mlir::omp::LoopNestOp loopOp;
   if (auto wrapper = mlir::dyn_cast<mlir::omp::LoopWrapperInterface>(op))
-    loopOp = wrapper.isWrapper()
-                 ? mlir::cast<mlir::omp::LoopNestOp>(wrapper.getWrappedLoop())
-                 : nullptr;
+    loopOp = mlir::cast<mlir::omp::LoopNestOp>(wrapper.getWrappedLoop());
 
   bool cmpCreated = false;
   mlir::OpBuilder::InsertionGuard guard(firOpBuilder);

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1905,18 +1905,23 @@ genParallelOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
   return parallelOp;
 }
 
-// TODO: Replace with genWrapperOp calls.
-static mlir::omp::ParallelOp genParallelWrapperOp(
+static mlir::omp::ParallelOp genParallelCompositeOp(
     lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
-    lower::pft::Evaluation &eval, mlir::Location loc,
-    const mlir::omp::ParallelOperands &clauseOps,
+    const List<Clause> &clauses, lower::pft::Evaluation &eval,
+    mlir::Location loc, mlir::omp::ParallelOperands &clauseOps,
     mlir::omp::NumThreadsClauseOps &numThreadsClauseOps,
     llvm::ArrayRef<const semantics::Symbol *> reductionSyms,
     llvm::ArrayRef<mlir::Type> reductionTypes, mlir::omp::TargetOp parentTarget,
     DataSharingProcessor &dsp) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
 
-  // Create omp.parallel wrapper.
+  if (enableDelayedPrivatization) {
+    const auto &privateClauseOps = dsp.getPrivateClauseOps();
+    clauseOps.privateVars = privateClauseOps.privateVars;
+    clauseOps.privateSyms = privateClauseOps.privateSyms;
+  }
+
+  // Create omp.parallel operation.
   auto parallelOp = firOpBuilder.create<mlir::omp::ParallelOp>(loc, clauseOps);
 
   if (numThreadsClauseOps.numThreads) {
@@ -1928,22 +1933,60 @@ static mlir::omp::ParallelOp genParallelWrapperOp(
   }
 
   // Populate entry block arguments with reduction and private variables.
-  mlir::OperandRange privateVars = parallelOp.getPrivateVars();
-
   llvm::SmallVector<mlir::Type> blockArgTypes(reductionTypes.begin(),
                                               reductionTypes.end());
-  blockArgTypes.reserve(blockArgTypes.size() + privateVars.size());
-  llvm::transform(privateVars, std::back_inserter(blockArgTypes),
-                  [](mlir::Value v) { return v.getType(); });
-
   llvm::SmallVector<mlir::Location> blockArgLocs(reductionTypes.size(), loc);
-  blockArgLocs.reserve(blockArgLocs.size() + privateVars.size());
-  llvm::transform(privateVars, std::back_inserter(blockArgLocs),
-                  [](mlir::Value v) { return v.getLoc(); });
+  llvm::SmallVector<const semantics::Symbol *> blockSyms(reductionSyms);
+
+  if (enableDelayedPrivatization) {
+    mlir::OperandRange privateVars = parallelOp.getPrivateVars();
+
+    blockArgTypes.reserve(blockArgTypes.size() + privateVars.size());
+    llvm::transform(privateVars, std::back_inserter(blockArgTypes),
+                    [](mlir::Value v) { return v.getType(); });
 
-  firOpBuilder.createBlock(&parallelOp.getRegion(), {}, blockArgTypes,
+    blockArgLocs.reserve(blockArgLocs.size() + privateVars.size());
+    llvm::transform(privateVars, std::back_inserter(blockArgLocs),
+                    [](mlir::Value v) { return v.getLoc(); });
+
+    llvm::append_range(blockSyms, dsp.getDelayedPrivSyms());
+  }
+
+  mlir::Region &region = parallelOp.getRegion();
+  firOpBuilder.createBlock(&region, /*insertPt=*/{}, blockArgTypes,
                            blockArgLocs);
 
+  // Bind syms to block args.
+  unsigned argIdx = 0;
+  for (const semantics::Symbol *arg : blockSyms) {
+    auto bind = [&](const semantics::Symbol *sym) {
+      mlir::BlockArgument blockArg = region.getArgument(argIdx++);
+      converter.bindSymbol(*sym, hlfir::translateToExtendedValue(
+                                     loc, firOpBuilder, hlfir::Entity{blockArg},
+                                     /*contiguousHint=*/
+                                     evaluate::IsSimplyContiguous(
+                                         *sym, converter.getFoldingContext()))
+                                     .first);
+    };
+
+    if (const auto *commonDet =
+            arg->detailsIf<semantics::CommonBlockDetails>()) {
+      for (const auto &mem : commonDet->objects())
+        bind(&*mem);
+    } else
+      bind(arg);
+  }
+
+  // Handle threadprivate and copyin, which would normally be done as part of
+  // `createBodyOfOp()`. However, when generating `omp.parallel` as part of a
+  // composite construct, we can't recursively lower its contents. This prevents
+  // us from being able to rely on the existing `genOpWithBody()` flow.
+  {
+    mlir::OpBuilder::InsertionGuard guard(firOpBuilder);
+    threadPrivatizeVars(converter, eval);
+  }
+  ClauseProcessor(converter, semaCtx, clauses).processCopyin();
+
   firOpBuilder.setInsertionPoint(
       lower::genOpenMPTerminator(firOpBuilder, parallelOp, loc));
 
@@ -2505,11 +2548,7 @@ static void genCompositeDistributeParallelDo(
       findParentTargetOp(converter.getFirOpBuilder());
   bool evalOutsideTarget = mustEvalTeamsThreadsOutsideTarget(eval, targetOp);
 
-  // Clause processing.
-  mlir::omp::DistributeOperands distributeClauseOps;
-  genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
-                       distributeClauseOps);
-
+  // Create parent omp.parallel first.
   mlir::omp::ParallelOperands parallelClauseOps;
   mlir::omp::NumThreadsClauseOps numThreadsClauseOps;
   llvm::SmallVector<const semantics::Symbol *> parallelReductionSyms;
@@ -2518,9 +2557,15 @@ static void genCompositeDistributeParallelDo(
                      evalOutsideTarget, parallelClauseOps, numThreadsClauseOps,
                      parallelReductionTypes, parallelReductionSyms);
 
-  const auto &privateClauseOps = dsp.getPrivateClauseOps();
-  parallelClauseOps.privateVars = privateClauseOps.privateVars;
-  parallelClauseOps.privateSyms = privateClauseOps.privateSyms;
+  genParallelCompositeOp(converter, semaCtx, item->clauses, eval, loc,
+                         parallelClauseOps, numThreadsClauseOps,
+                         parallelReductionSyms, parallelReductionTypes,
+                         evalOutsideTarget ? targetOp : nullptr, dsp);
+
+  // Clause processing.
+  mlir::omp::DistributeOperands distributeClauseOps;
+  genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
+                       distributeClauseOps);
 
   mlir::omp::WsloopOperands wsloopClauseOps;
   llvm::SmallVector<const semantics::Symbol *> wsloopReductionSyms;
@@ -2538,26 +2583,17 @@ static void genCompositeDistributeParallelDo(
   auto distributeOp = genWrapperOp<mlir::omp::DistributeOp>(
       converter, loc, distributeClauseOps, /*blockArgTypes=*/{});
 
-  auto parallelOp = genParallelWrapperOp(
-      converter, semaCtx, eval, loc, parallelClauseOps, numThreadsClauseOps,
-      parallelReductionSyms, parallelReductionTypes,
-      evalOutsideTarget ? targetOp : nullptr, dsp);
-
   // TODO: Add private variables to entry block arguments.
   auto wsloopOp = genWrapperOp<mlir::omp::WsloopOp>(
       converter, loc, wsloopClauseOps, wsloopReductionTypes);
 
   // Construct wrapper entry block list and associated symbols. It is important
   // that the symbol order and the block argument order match, so that the
   // symbol-value bindings created are correct.
-  auto wrapperSyms =
-      llvm::to_vector(llvm::concat<const semantics::Symbol *const>(
-          parallelReductionSyms, dsp.getDelayedPrivSyms(),
-          wsloopReductionSyms));
+  auto &wrapperSyms = wsloopReductionSyms;
 
   auto wrapperArgs = llvm::to_vector(
       llvm::concat<mlir::BlockArgument>(distributeOp.getRegion().getArguments(),
-                                        parallelOp.getRegion().getArguments(),
                                         wsloopOp.getRegion().getArguments()));
 
   genLoopNestOp(converter, symTable, semaCtx, eval, loc, queue, item,
@@ -2576,11 +2612,7 @@ static void genCompositeDistributeParallelDoSimd(
       findParentTargetOp(converter.getFirOpBuilder());
   bool evalOutsideTarget = mustEvalTeamsThreadsOutsideTarget(eval, targetOp);
 
-  // Clause processing.
-  mlir::omp::DistributeOperands distributeClauseOps;
-  genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
-                       distributeClauseOps);
-
+  // Create parent omp.parallel first.
   mlir::omp::ParallelOperands parallelClauseOps;
   mlir::omp::NumThreadsClauseOps numThreadsClauseOps;
   llvm::SmallVector<const semantics::Symbol *> parallelReductionSyms;
@@ -2589,9 +2621,15 @@ static void genCompositeDistributeParallelDoSimd(
                      evalOutsideTarget, parallelClauseOps, numThreadsClauseOps,
                      parallelReductionTypes, parallelReductionSyms);
 
-  const auto &privateClauseOps = dsp.getPrivateClauseOps();
-  parallelClauseOps.privateVars = privateClauseOps.privateVars;
-  parallelClauseOps.privateSyms = privateClauseOps.privateSyms;
+  genParallelCompositeOp(converter, semaCtx, item->clauses, eval, loc,
+                         parallelClauseOps, numThreadsClauseOps,
+                         parallelReductionSyms, parallelReductionTypes,
+                         evalOutsideTarget ? targetOp : nullptr, dsp);
+
+  // Clause processing.
+  mlir::omp::DistributeOperands distributeClauseOps;
+  genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
+                       distributeClauseOps);
 
   mlir::omp::WsloopOperands wsloopClauseOps;
   llvm::SmallVector<const semantics::Symbol *> wsloopReductionSyms;
@@ -2612,11 +2650,6 @@ static void genCompositeDistributeParallelDoSimd(
   auto distributeOp = genWrapperOp<mlir::omp::DistributeOp>(
       converter, loc, distributeClauseOps, /*blockArgTypes=*/{});
 
-  auto parallelOp = genParallelWrapperOp(
-      converter, semaCtx, eval, loc, parallelClauseOps, numThreadsClauseOps,
-      parallelReductionSyms, parallelReductionTypes,
-      evalOutsideTarget ? targetOp : nullptr, dsp);
-
   // TODO: Add private variables to entry block arguments.
   auto wsloopOp = genWrapperOp<mlir::omp::WsloopOp>(
       converter, loc, wsloopClauseOps, wsloopReductionTypes);
@@ -2628,14 +2661,10 @@ static void genCompositeDistributeParallelDoSimd(
   // Construct wrapper entry block list and associated symbols. It is important
   // that the symbol order and the block argument order match, so that the
   // symbol-value bindings created are correct.
-  auto wrapperSyms =
-      llvm::to_vector(llvm::concat<const semantics::Symbol *const>(
-          parallelReductionSyms, dsp.getDelayedPrivSyms(),
-          wsloopReductionSyms));
+  auto &wrapperSyms = wsloopReductionSyms;
 
   auto wrapperArgs = llvm::to_vector(llvm::concat<mlir::BlockArgument>(
       distributeOp.getRegion().getArguments(),
-      parallelOp.getRegion().getArguments(),
       wsloopOp.getRegion().getArguments(), simdOp.getRegion().getArguments()));
 
   genLoopNestOp(converter, symTable, semaCtx, eval, loc, queue, item,
@@ -2756,10 +2785,12 @@ static void genOMPDispatch(lower::AbstractConverter &converter,
   bool loopLeaf = llvm::omp::getDirectiveAssociation(item->id) ==
                   llvm::omp::Association::Loop;
   if (loopLeaf) {
+    // Use delayed privatization for 'distribute parallel do [simd]'.
+    bool useDelayedPrivatization = llvm::omp::allParallelSet.test(item->id);
     symTable.pushScope();
     loopDsp.emplace(converter, semaCtx, item->clauses, eval,
                     /*shouldCollectPreDeterminedSymbols=*/true,
-                    /*useDelayedPrivatization=*/false, &symTable);
+                    useDelayedPrivatization, &symTable);
     loopDsp->processStep1();
     loopDsp->processStep2();
   }

diff --git a/flang/lib/Optimizer/Builder/FIRBuilder.cpp b/flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -256,19 +256,7 @@ mlir::Block *fir::FirOpBuilder::getAllocaBlock() {
   if (auto ompOutlineableIface =
           getRegion()
               .getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>()) {
-    // omp.parallel can work as a block construct but it can also be a loop
-    // wrapper when part of a composite construct. Make sure it's only treated
-    // as a block if it's not a wrapper.
-    auto parallelOp =
-        llvm::dyn_cast<mlir::omp::ParallelOp>(*ompOutlineableIface);
-    if (!parallelOp || !llvm::isa_and_present<mlir::omp::DistributeOp>(
-                           parallelOp->getParentOp()))
-      return ompOutlineableIface.getAllocaBlock();
-
-    if (auto parentOutlineable =
-            parallelOp
-                ->getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>())
-      return parentOutlineable.getAllocaBlock();
+    return ompOutlineableIface.getAllocaBlock();
   }
 
   if (auto recipeIface =
@@ -285,15 +273,9 @@ mlir::Value fir::FirOpBuilder::createTemporaryAlloc(
     llvm::ArrayRef<mlir::NamedAttribute> attrs) {
   assert(!mlir::isa<fir::ReferenceType>(type) && "cannot be a reference");
   // If the alloca is inside an OpenMP Op which will be outlined then pin
-  // the alloca here. Make sure that an omp.parallel operation that is taking
-  // a loop wrapper role is not detected as outlineable here.
-  auto iface =
-      getRegion().getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>();
-  auto parallelOp =
-      iface ? llvm::dyn_cast<mlir::omp::ParallelOp>(*iface) : nullptr;
+  // the alloca here.
   const bool pinned =
-      iface && (!parallelOp || !llvm::isa_and_present<mlir::omp::DistributeOp>(
-                                   parallelOp->getParentOp()));
+      getRegion().getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>();
   mlir::Value temp =
       create<fir::AllocaOp>(loc, type, /*unique_name=*/llvm::StringRef{}, name,
                             pinned, lenParams, shape, attrs);

diff --git a/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp b/flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp
@@ -285,16 +285,9 @@ mlir::Value ConvertFIRToLLVMPattern::computeBoxSize(
 // 4. The first ancestor that is one of the above.
 mlir::Block *ConvertFIRToLLVMPattern::getBlockForAllocaInsert(
     mlir::Operation *op, mlir::Region *parentRegion) const {
-  if (auto iface =
-          mlir::dyn_cast<mlir::omp::OutlineableOpenMPOpInterface>(op)) {
-    // omp.parallel can work as a block construct but it can also be a loop
-    // wrapper when it's part of a composite construct. Make sure it's only
-    // treated as a block if it's not a wrapper.
-    auto parallelOp = llvm::dyn_cast<mlir::omp::ParallelOp>(*iface);
-    if (!parallelOp || !llvm::isa_and_present<mlir::omp::DistributeOp>(
-                           parallelOp->getParentOp()))
-      return iface.getAllocaBlock();
-  }
+  if (auto outlineableIface =
+          mlir::dyn_cast<mlir::omp::OutlineableOpenMPOpInterface>(op))
+    return outlineableIface.getAllocaBlock();
   if (auto recipeIface = mlir::dyn_cast<mlir::accomp::RecipeInterface>(op))
     return recipeIface.getAllocaBlock(*parentRegion);
   if (auto llvmFuncOp = mlir::dyn_cast<mlir::LLVM::LLVMFuncOp>(op))

diff --git a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp
@@ -599,9 +599,7 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
 
       targetOp = genTargetOp(doLoop.getLoc(), rewriter, mapper,
                              outermostLoopLives, targetClauseOps);
-      genTeamsOp(doLoop.getLoc(), rewriter, loopNest, mapper,
-                 loopNestClauseOps);
-      genDistributeOp(doLoop.getLoc(), rewriter);
+      genTeamsOp(doLoop.getLoc(), rewriter);
     }
 
     mlir::omp::ParallelOp parallelOp = genParallelOp(
@@ -611,6 +609,9 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
       looputils::localizeLoopLocalValue(local, parallelOp.getRegion(),
                                         rewriter);
 
+    if (mapToDevice)
+      genDistributeOp(doLoop.getLoc(), rewriter);
+
     mlir::omp::LoopNestOp ompLoopNest =
         genWsLoopOp(rewriter, loopNest.back().first, mapper, loopNestClauseOps);
 
@@ -800,18 +801,14 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
   }
 
   mlir::omp::TeamsOp
-  genTeamsOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
-             looputils::LoopNestToIndVarMap &loopNest, mlir::IRMapping &mapper,
-             mlir::omp::LoopNestOperands &loopNestClauseOps) const {
+  genTeamsOp(mlir::Location loc,
+             mlir::ConversionPatternRewriter &rewriter) const {
     auto teamsOp = rewriter.create<mlir::omp::TeamsOp>(
         loc, /*clauses=*/mlir::omp::TeamsOperands{});
 
     rewriter.createBlock(&teamsOp.getRegion());
     rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc));
 
-    genLoopNestIndVarAllocs(rewriter, loopNest, mapper);
-    genLoopNestClauseOps(loc, rewriter, loopNest, mapper, loopNestClauseOps);
-
     return teamsOp;
   }
 
@@ -905,12 +902,8 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
     rewriter.createBlock(&parallelOp.getRegion());
     rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc));
 
-    // If mapping to host, the local induction variable and loop bounds need to
-    // be emitted as part of the `omp.parallel` op.
-    if (!mapToDevice) {
-      genLoopNestIndVarAllocs(rewriter, loopNest, mapper);
-      genLoopNestClauseOps(loc, rewriter, loopNest, mapper, loopNestClauseOps);
-    }
+    genLoopNestIndVarAllocs(rewriter, loopNest, mapper);
+    genLoopNestClauseOps(loc, rewriter, loopNest, mapper, loopNestClauseOps);
 
     return parallelOp;
   }