[Flang][OpenMP] Re-implement lowering of DISTRIBUTE PARALLEL DO (#135)
This patch updates the Flang lowering process for `distribute parallel do` to
follow the "hoisted `omp.parallel`" representation: temporary allocations
produced while lowering the loop body now reside inside that operation rather
than in the loop wrappers' parent region.

Special handling of `omp.parallel` with regard to alloca creation is removed,
as the distinction is no longer necessary. Impacted Lit tests are updated to
match the new representation.
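
For context, a rough sketch of the IR shape before and after this change
(hand-written for illustration, not taken from the updated Lit tests; clause
operands, block arguments, and types are elided):

    // Before: omp.parallel acted as one of the loop wrappers, nested
    // between omp.distribute and omp.wsloop, and could hold no allocations.
    omp.distribute {
      omp.parallel {
        omp.wsloop {
          omp.loop_nest (%i) : index = (%lb) to (%ub) inclusive step (%step) {
            ...
            omp.yield
          }
        }
      }
    }

    // After: omp.parallel is hoisted above the loop wrappers, so temporary
    // allocations produced while lowering the loop body land in its entry
    // block rather than in the wrappers' parent region.
    omp.parallel {
      %tmp = fir.alloca i32
      omp.distribute {
        omp.wsloop {
          omp.loop_nest (%i) : index = (%lb) to (%ub) inclusive step (%step) {
            ...
            omp.yield
          }
        }
      }
      omp.terminator
    }
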
skatrak committed Aug 13, 2024
1 parent 4d857ec commit 13ae6e0
Showing 9 changed files with 211 additions and 228 deletions.
135 changes: 83 additions & 52 deletions flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1905,18 +1905,23 @@ genParallelOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
   return parallelOp;
 }
 
-// TODO: Replace with genWrapperOp calls.
-static mlir::omp::ParallelOp genParallelWrapperOp(
+static mlir::omp::ParallelOp genParallelCompositeOp(
     lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
-    lower::pft::Evaluation &eval, mlir::Location loc,
-    const mlir::omp::ParallelOperands &clauseOps,
+    const List<Clause> &clauses, lower::pft::Evaluation &eval,
+    mlir::Location loc, mlir::omp::ParallelOperands &clauseOps,
     mlir::omp::NumThreadsClauseOps &numThreadsClauseOps,
     llvm::ArrayRef<const semantics::Symbol *> reductionSyms,
     llvm::ArrayRef<mlir::Type> reductionTypes, mlir::omp::TargetOp parentTarget,
     DataSharingProcessor &dsp) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
 
-  // Create omp.parallel wrapper.
+  if (enableDelayedPrivatization) {
+    const auto &privateClauseOps = dsp.getPrivateClauseOps();
+    clauseOps.privateVars = privateClauseOps.privateVars;
+    clauseOps.privateSyms = privateClauseOps.privateSyms;
+  }
+
+  // Create omp.parallel operation.
   auto parallelOp = firOpBuilder.create<mlir::omp::ParallelOp>(loc, clauseOps);
 
   if (numThreadsClauseOps.numThreads) {
@@ -1928,22 +1933,60 @@ static mlir::omp::ParallelOp genParallelWrapperOp(
   }
 
   // Populate entry block arguments with reduction and private variables.
-  mlir::OperandRange privateVars = parallelOp.getPrivateVars();
-
   llvm::SmallVector<mlir::Type> blockArgTypes(reductionTypes.begin(),
                                               reductionTypes.end());
-  blockArgTypes.reserve(blockArgTypes.size() + privateVars.size());
-  llvm::transform(privateVars, std::back_inserter(blockArgTypes),
-                  [](mlir::Value v) { return v.getType(); });
-
   llvm::SmallVector<mlir::Location> blockArgLocs(reductionTypes.size(), loc);
-  blockArgLocs.reserve(blockArgLocs.size() + privateVars.size());
-  llvm::transform(privateVars, std::back_inserter(blockArgLocs),
-                  [](mlir::Value v) { return v.getLoc(); });
+  llvm::SmallVector<const semantics::Symbol *> blockSyms(reductionSyms);
 
-  firOpBuilder.createBlock(&parallelOp.getRegion(), {}, blockArgTypes,
+  if (enableDelayedPrivatization) {
+    mlir::OperandRange privateVars = parallelOp.getPrivateVars();
+
+    blockArgTypes.reserve(blockArgTypes.size() + privateVars.size());
+    llvm::transform(privateVars, std::back_inserter(blockArgTypes),
+                    [](mlir::Value v) { return v.getType(); });
+
+    blockArgLocs.reserve(blockArgLocs.size() + privateVars.size());
+    llvm::transform(privateVars, std::back_inserter(blockArgLocs),
+                    [](mlir::Value v) { return v.getLoc(); });
+
+    llvm::append_range(blockSyms, dsp.getDelayedPrivSyms());
+  }
+
+  mlir::Region &region = parallelOp.getRegion();
+  firOpBuilder.createBlock(&region, /*insertPt=*/{}, blockArgTypes,
                            blockArgLocs);
 
+  // Bind syms to block args.
+  unsigned argIdx = 0;
+  for (const semantics::Symbol *arg : blockSyms) {
+    auto bind = [&](const semantics::Symbol *sym) {
+      mlir::BlockArgument blockArg = region.getArgument(argIdx++);
+      converter.bindSymbol(*sym, hlfir::translateToExtendedValue(
+                                     loc, firOpBuilder, hlfir::Entity{blockArg},
+                                     /*contiguousHint=*/
+                                     evaluate::IsSimplyContiguous(
+                                         *sym, converter.getFoldingContext()))
+                                     .first);
+    };
+
+    if (const auto *commonDet =
+            arg->detailsIf<semantics::CommonBlockDetails>()) {
+      for (const auto &mem : commonDet->objects())
+        bind(&*mem);
+    } else
+      bind(arg);
+  }
+
+  // Handle threadprivate and copyin, which would normally be done as part of
+  // `createBodyOfOp()`. However, when generating `omp.parallel` as part of a
+  // composite construct, we can't recursively lower its contents. This prevents
+  // us from being able to rely on the existing `genOpWithBody()` flow.
+  {
+    mlir::OpBuilder::InsertionGuard guard(firOpBuilder);
+    threadPrivatizeVars(converter, eval);
+  }
+  ClauseProcessor(converter, semaCtx, clauses).processCopyin();
+
   firOpBuilder.setInsertionPoint(
       lower::genOpenMPTerminator(firOpBuilder, parallelOp, loc));
 
@@ -2505,11 +2548,7 @@ static void genCompositeDistributeParallelDo(
       findParentTargetOp(converter.getFirOpBuilder());
   bool evalOutsideTarget = mustEvalTeamsThreadsOutsideTarget(eval, targetOp);
 
-  // Clause processing.
-  mlir::omp::DistributeOperands distributeClauseOps;
-  genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
-                       distributeClauseOps);
-
+  // Create parent omp.parallel first.
   mlir::omp::ParallelOperands parallelClauseOps;
   mlir::omp::NumThreadsClauseOps numThreadsClauseOps;
   llvm::SmallVector<const semantics::Symbol *> parallelReductionSyms;
@@ -2518,9 +2557,15 @@ static void genCompositeDistributeParallelDo(
       evalOutsideTarget, parallelClauseOps, numThreadsClauseOps,
       parallelReductionTypes, parallelReductionSyms);
 
-  const auto &privateClauseOps = dsp.getPrivateClauseOps();
-  parallelClauseOps.privateVars = privateClauseOps.privateVars;
-  parallelClauseOps.privateSyms = privateClauseOps.privateSyms;
+  genParallelCompositeOp(converter, semaCtx, item->clauses, eval, loc,
+                         parallelClauseOps, numThreadsClauseOps,
+                         parallelReductionSyms, parallelReductionTypes,
+                         evalOutsideTarget ? targetOp : nullptr, dsp);
+
+  // Clause processing.
+  mlir::omp::DistributeOperands distributeClauseOps;
+  genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
+                       distributeClauseOps);
 
   mlir::omp::WsloopOperands wsloopClauseOps;
   llvm::SmallVector<const semantics::Symbol *> wsloopReductionSyms;
@@ -2538,26 +2583,17 @@ static void genCompositeDistributeParallelDo(
   auto distributeOp = genWrapperOp<mlir::omp::DistributeOp>(
       converter, loc, distributeClauseOps, /*blockArgTypes=*/{});
 
-  auto parallelOp = genParallelWrapperOp(
-      converter, semaCtx, eval, loc, parallelClauseOps, numThreadsClauseOps,
-      parallelReductionSyms, parallelReductionTypes,
-      evalOutsideTarget ? targetOp : nullptr, dsp);
-
   // TODO: Add private variables to entry block arguments.
   auto wsloopOp = genWrapperOp<mlir::omp::WsloopOp>(
       converter, loc, wsloopClauseOps, wsloopReductionTypes);
 
   // Construct wrapper entry block list and associated symbols. It is important
   // that the symbol order and the block argument order match, so that the
   // symbol-value bindings created are correct.
-  auto wrapperSyms =
-      llvm::to_vector(llvm::concat<const semantics::Symbol *const>(
-          parallelReductionSyms, dsp.getDelayedPrivSyms(),
-          wsloopReductionSyms));
+  auto &wrapperSyms = wsloopReductionSyms;
 
   auto wrapperArgs = llvm::to_vector(
       llvm::concat<mlir::BlockArgument>(distributeOp.getRegion().getArguments(),
-                                        parallelOp.getRegion().getArguments(),
                                         wsloopOp.getRegion().getArguments()));
 
   genLoopNestOp(converter, symTable, semaCtx, eval, loc, queue, item,
Expand All @@ -2576,11 +2612,7 @@ static void genCompositeDistributeParallelDoSimd(
findParentTargetOp(converter.getFirOpBuilder());
bool evalOutsideTarget = mustEvalTeamsThreadsOutsideTarget(eval, targetOp);

// Clause processing.
mlir::omp::DistributeOperands distributeClauseOps;
genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
distributeClauseOps);

// Create parent omp.parallel first.
mlir::omp::ParallelOperands parallelClauseOps;
mlir::omp::NumThreadsClauseOps numThreadsClauseOps;
llvm::SmallVector<const semantics::Symbol *> parallelReductionSyms;
@@ -2589,9 +2621,15 @@ static void genCompositeDistributeParallelDoSimd(
       evalOutsideTarget, parallelClauseOps, numThreadsClauseOps,
       parallelReductionTypes, parallelReductionSyms);
 
-  const auto &privateClauseOps = dsp.getPrivateClauseOps();
-  parallelClauseOps.privateVars = privateClauseOps.privateVars;
-  parallelClauseOps.privateSyms = privateClauseOps.privateSyms;
+  genParallelCompositeOp(converter, semaCtx, item->clauses, eval, loc,
+                         parallelClauseOps, numThreadsClauseOps,
+                         parallelReductionSyms, parallelReductionTypes,
+                         evalOutsideTarget ? targetOp : nullptr, dsp);
+
+  // Clause processing.
+  mlir::omp::DistributeOperands distributeClauseOps;
+  genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
+                       distributeClauseOps);
 
   mlir::omp::WsloopOperands wsloopClauseOps;
   llvm::SmallVector<const semantics::Symbol *> wsloopReductionSyms;
@@ -2612,11 +2650,6 @@ static void genCompositeDistributeParallelDoSimd(
   auto distributeOp = genWrapperOp<mlir::omp::DistributeOp>(
       converter, loc, distributeClauseOps, /*blockArgTypes=*/{});
 
-  auto parallelOp = genParallelWrapperOp(
-      converter, semaCtx, eval, loc, parallelClauseOps, numThreadsClauseOps,
-      parallelReductionSyms, parallelReductionTypes,
-      evalOutsideTarget ? targetOp : nullptr, dsp);
-
   // TODO: Add private variables to entry block arguments.
   auto wsloopOp = genWrapperOp<mlir::omp::WsloopOp>(
       converter, loc, wsloopClauseOps, wsloopReductionTypes);
@@ -2628,14 +2661,10 @@ static void genCompositeDistributeParallelDoSimd(
   // Construct wrapper entry block list and associated symbols. It is important
   // that the symbol order and the block argument order match, so that the
   // symbol-value bindings created are correct.
-  auto wrapperSyms =
-      llvm::to_vector(llvm::concat<const semantics::Symbol *const>(
-          parallelReductionSyms, dsp.getDelayedPrivSyms(),
-          wsloopReductionSyms));
+  auto &wrapperSyms = wsloopReductionSyms;
 
   auto wrapperArgs = llvm::to_vector(llvm::concat<mlir::BlockArgument>(
       distributeOp.getRegion().getArguments(),
-      parallelOp.getRegion().getArguments(),
       wsloopOp.getRegion().getArguments(), simdOp.getRegion().getArguments()));
 
   genLoopNestOp(converter, symTable, semaCtx, eval, loc, queue, item,
@@ -2756,10 +2785,12 @@ static void genOMPDispatch(lower::AbstractConverter &converter,
   bool loopLeaf = llvm::omp::getDirectiveAssociation(item->id) ==
                   llvm::omp::Association::Loop;
   if (loopLeaf) {
+    // Use delayed privatization for 'distribute parallel do [simd]'.
+    bool useDelayedPrivatization = llvm::omp::allParallelSet.test(item->id);
     symTable.pushScope();
     loopDsp.emplace(converter, semaCtx, item->clauses, eval,
                     /*shouldCollectPreDeterminedSymbols=*/true,
-                    /*useDelayedPrivatization=*/false, &symTable);
+                    useDelayedPrivatization, &symTable);
     loopDsp->processStep1();
     loopDsp->processStep2();
   }
24 changes: 3 additions & 21 deletions flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -256,19 +256,7 @@ mlir::Block *fir::FirOpBuilder::getAllocaBlock() {
   if (auto ompOutlineableIface =
           getRegion()
               .getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>()) {
-    // omp.parallel can work as a block construct but it can also be a loop
-    // wrapper when part of a composite construct. Make sure it's only treated
-    // as a block if it's not a wrapper.
-    auto parallelOp =
-        llvm::dyn_cast<mlir::omp::ParallelOp>(*ompOutlineableIface);
-    if (!parallelOp || !llvm::isa_and_present<mlir::omp::DistributeOp>(
-                           parallelOp->getParentOp()))
-      return ompOutlineableIface.getAllocaBlock();
-
-    if (auto parentOutlineable =
-            parallelOp
-                ->getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>())
-      return parentOutlineable.getAllocaBlock();
+    return ompOutlineableIface.getAllocaBlock();
   }
 
   if (auto recipeIface =
@@ -285,15 +273,9 @@ mlir::Value fir::FirOpBuilder::createTemporaryAlloc(
     llvm::ArrayRef<mlir::NamedAttribute> attrs) {
   assert(!mlir::isa<fir::ReferenceType>(type) && "cannot be a reference");
   // If the alloca is inside an OpenMP Op which will be outlined then pin
-  // the alloca here. Make sure that an omp.parallel operation that is taking
-  // a loop wrapper role is not detected as outlineable here.
-  auto iface =
-      getRegion().getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>();
-  auto parallelOp =
-      iface ? llvm::dyn_cast<mlir::omp::ParallelOp>(*iface) : nullptr;
+  // the alloca here.
   const bool pinned =
-      iface && (!parallelOp || !llvm::isa_and_present<mlir::omp::DistributeOp>(
-                    parallelOp->getParentOp()));
+      getRegion().getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>();
   mlir::Value temp =
       create<fir::AllocaOp>(loc, type, /*unique_name=*/llvm::StringRef{}, name,
                             pinned, lenParams, shape, attrs);
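
With the wrapper special-casing gone, getAllocaBlock() simply returns the
entry block of the closest OpenMP outlineable construct, which for these
composite constructs is now the hoisted omp.parallel itself. A minimal sketch
of the resulting FIR (hypothetical values, types simplified):

    omp.parallel {
      // createTemporaryAlloc() now places pinned temporaries here, in the
      // entry block of the outlineable omp.parallel region.
      %tmp = fir.alloca !fir.array<10xi32> {pinned}
      omp.distribute {
        ...
      }
      omp.terminator
    }
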
13 changes: 3 additions & 10 deletions flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp
@@ -285,16 +285,9 @@ mlir::Value ConvertFIRToLLVMPattern::computeBoxSize(
 // 4. The first ancestor that is one of the above.
 mlir::Block *ConvertFIRToLLVMPattern::getBlockForAllocaInsert(
     mlir::Operation *op, mlir::Region *parentRegion) const {
-  if (auto iface =
-          mlir::dyn_cast<mlir::omp::OutlineableOpenMPOpInterface>(op)) {
-    // omp.parallel can work as a block construct but it can also be a loop
-    // wrapper when it's part of a composite construct. Make sure it's only
-    // treated as a block if it's not a wrapper.
-    auto parallelOp = llvm::dyn_cast<mlir::omp::ParallelOp>(*iface);
-    if (!parallelOp || !llvm::isa_and_present<mlir::omp::DistributeOp>(
-                           parallelOp->getParentOp()))
-      return iface.getAllocaBlock();
-  }
+  if (auto outlineableIface =
+          mlir::dyn_cast<mlir::omp::OutlineableOpenMPOpInterface>(op))
+    return outlineableIface.getAllocaBlock();
   if (auto recipeIface = mlir::dyn_cast<mlir::accomp::RecipeInterface>(op))
     return recipeIface.getAllocaBlock(*parentRegion);
   if (auto llvmFuncOp = mlir::dyn_cast<mlir::LLVM::LLVMFuncOp>(op))
30 changes: 4 additions & 26 deletions flang/lib/Optimizer/Transforms/StackArrays.cpp
@@ -589,31 +589,8 @@ AllocMemConversion::findAllocaInsertionPoint(fir::AllocMemOp &oldAlloc) {
     return {point};
   };
 
-  // Find the first OpenMP outlineable parent region while taking into account
-  // the possibility of finding an omp.parallel region that is taking a loop
-  // wrapper role. These operations must be skipped, as they cannot hold
-  // allocations.
-  const auto findOmpRegion = [](mlir::Operation *op) {
-    auto findOmpRegionImpl =
-        [](mlir::Operation *op,
-           auto &findOmpRegion) -> mlir::omp::OutlineableOpenMPOpInterface {
-      auto ompRegion =
-          op->getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>();
-      if (!ompRegion)
-        return nullptr;
-
-      if (auto parallelOp =
-              mlir::dyn_cast_if_present<mlir::omp::ParallelOp>(*ompRegion)) {
-        mlir::Operation *parentOp = parallelOp->getParentOp();
-        if (mlir::isa_and_present<mlir::omp::DistributeOp>(parentOp))
-          return findOmpRegion(parentOp, findOmpRegion);
-      }
-      return ompRegion;
-    };
-    return findOmpRegionImpl(op, findOmpRegionImpl);
-  };
-
-  auto oldOmpRegion = findOmpRegion(oldAlloc);
+  auto oldOmpRegion =
+      oldAlloc->getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>();
 
   // Find when the last operand value becomes available
   mlir::Block *operandsBlock = nullptr;
@@ -641,7 +618,8 @@ AllocMemConversion::findAllocaInsertionPoint(fir::AllocMemOp &oldAlloc) {
     LLVM_DEBUG(llvm::dbgs()
                << "--Placing after last operand: " << *lastOperand << "\n");
     // check we aren't moving out of an omp region
-    auto lastOpOmpRegion = findOmpRegion(lastOperand);
+    auto lastOpOmpRegion =
+        lastOperand->getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>();
     if (lastOpOmpRegion == oldOmpRegion)
       return checkReturn(lastOperand);
     // Presumably this happened because the operands became ready before the