[Flang][OpenMP] Re-implement lowering of DISTRIBUTE PARALLEL DO (#135)
This patch updates the Flang lowering process for `distribute parallel do` to
follow the "hoisted `omp.parallel`" representation: temporary allocations
produced while lowering the loop body now reside inside that operation rather
than in the loop wrappers' parent region.

Special handling of `omp.parallel` with regard to alloca creation is removed,
as the distinction is no longer necessary. Impacted Lit tests are updated to
match the new representation.
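
For context, a rough sketch of the IR shape before and after this change
(hand-written for illustration, not taken from the updated Lit tests; clause
operands, block arguments, and types are elided):

    // Before: omp.parallel acted as one of the loop wrappers, nested
    // between omp.distribute and omp.wsloop, and could hold no allocations.
    omp.distribute {
      omp.parallel {
        omp.wsloop {
          omp.loop_nest (%i) : index = (%lb) to (%ub) inclusive step (%step) {
            ...
            omp.yield
          }
        }
      }
    }

    // After: omp.parallel is hoisted above the loop wrappers, so temporary
    // allocations produced while lowering the loop body land in its entry
    // block rather than in the wrappers' parent region.
    omp.parallel {
      %tmp = fir.alloca i32
      omp.distribute {
        omp.wsloop {
          omp.loop_nest (%i) : index = (%lb) to (%ub) inclusive step (%step) {
            ...
            omp.yield
          }
        }
      }
      omp.terminator
    }
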
skatrak committed Aug 13, 2024
1 parent 4d857ec commit 13ae6e0
Showing 9 changed files with 211 additions and 228 deletions.
135 changes: 83 additions & 52 deletions flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -1905,18 +1905,23 @@ genParallelOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
   return parallelOp;
 }
 
-// TODO: Replace with genWrapperOp calls.
-static mlir::omp::ParallelOp genParallelWrapperOp(
+static mlir::omp::ParallelOp genParallelCompositeOp(
     lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
-    lower::pft::Evaluation &eval, mlir::Location loc,
-    const mlir::omp::ParallelOperands &clauseOps,
+    const List<Clause> &clauses, lower::pft::Evaluation &eval,
+    mlir::Location loc, mlir::omp::ParallelOperands &clauseOps,
     mlir::omp::NumThreadsClauseOps &numThreadsClauseOps,
     llvm::ArrayRef<const semantics::Symbol *> reductionSyms,
     llvm::ArrayRef<mlir::Type> reductionTypes, mlir::omp::TargetOp parentTarget,
     DataSharingProcessor &dsp) {
   fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();
 
-  // Create omp.parallel wrapper.
+  if (enableDelayedPrivatization) {
+    const auto &privateClauseOps = dsp.getPrivateClauseOps();
+    clauseOps.privateVars = privateClauseOps.privateVars;
+    clauseOps.privateSyms = privateClauseOps.privateSyms;
+  }
+
+  // Create omp.parallel operation.
   auto parallelOp = firOpBuilder.create<mlir::omp::ParallelOp>(loc, clauseOps);
 
   if (numThreadsClauseOps.numThreads) {
@@ -1928,22 +1933,60 @@ static mlir::omp::ParallelOp genParallelWrapperOp(
   }
 
   // Populate entry block arguments with reduction and private variables.
-  mlir::OperandRange privateVars = parallelOp.getPrivateVars();
-
   llvm::SmallVector<mlir::Type> blockArgTypes(reductionTypes.begin(),
                                               reductionTypes.end());
-  blockArgTypes.reserve(blockArgTypes.size() + privateVars.size());
-  llvm::transform(privateVars, std::back_inserter(blockArgTypes),
-                  [](mlir::Value v) { return v.getType(); });
-
   llvm::SmallVector<mlir::Location> blockArgLocs(reductionTypes.size(), loc);
-  blockArgLocs.reserve(blockArgLocs.size() + privateVars.size());
-  llvm::transform(privateVars, std::back_inserter(blockArgLocs),
-                  [](mlir::Value v) { return v.getLoc(); });
+  llvm::SmallVector<const semantics::Symbol *> blockSyms(reductionSyms);
 
-  firOpBuilder.createBlock(&parallelOp.getRegion(), {}, blockArgTypes,
+  if (enableDelayedPrivatization) {
+    mlir::OperandRange privateVars = parallelOp.getPrivateVars();
+
+    blockArgTypes.reserve(blockArgTypes.size() + privateVars.size());
+    llvm::transform(privateVars, std::back_inserter(blockArgTypes),
+                    [](mlir::Value v) { return v.getType(); });
+
+    blockArgLocs.reserve(blockArgLocs.size() + privateVars.size());
+    llvm::transform(privateVars, std::back_inserter(blockArgLocs),
+                    [](mlir::Value v) { return v.getLoc(); });
+
+    llvm::append_range(blockSyms, dsp.getDelayedPrivSyms());
+  }
+
+  mlir::Region &region = parallelOp.getRegion();
+  firOpBuilder.createBlock(&region, /*insertPt=*/{}, blockArgTypes,
                            blockArgLocs);
 
+  // Bind syms to block args.
+  unsigned argIdx = 0;
+  for (const semantics::Symbol *arg : blockSyms) {
+    auto bind = [&](const semantics::Symbol *sym) {
+      mlir::BlockArgument blockArg = region.getArgument(argIdx++);
+      converter.bindSymbol(*sym, hlfir::translateToExtendedValue(
+                                     loc, firOpBuilder, hlfir::Entity{blockArg},
+                                     /*contiguousHint=*/
+                                     evaluate::IsSimplyContiguous(
+                                         *sym, converter.getFoldingContext()))
+                                     .first);
+    };
+
+    if (const auto *commonDet =
+            arg->detailsIf<semantics::CommonBlockDetails>()) {
+      for (const auto &mem : commonDet->objects())
+        bind(&*mem);
+    } else
+      bind(arg);
+  }
+
+  // Handle threadprivate and copyin, which would normally be done as part of
+  // `createBodyOfOp()`. However, when generating `omp.parallel` as part of a
+  // composite construct, we can't recursively lower its contents. This prevents
+  // us from being able to rely on the existing `genOpWithBody()` flow.
+  {
+    mlir::OpBuilder::InsertionGuard guard(firOpBuilder);
+    threadPrivatizeVars(converter, eval);
+  }
+  ClauseProcessor(converter, semaCtx, clauses).processCopyin();
+
   firOpBuilder.setInsertionPoint(
       lower::genOpenMPTerminator(firOpBuilder, parallelOp, loc));
 
@@ -2505,11 +2548,7 @@ static void genCompositeDistributeParallelDo(
       findParentTargetOp(converter.getFirOpBuilder());
   bool evalOutsideTarget = mustEvalTeamsThreadsOutsideTarget(eval, targetOp);
 
-  // Clause processing.
-  mlir::omp::DistributeOperands distributeClauseOps;
-  genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
-                       distributeClauseOps);
-
+  // Create parent omp.parallel first.
   mlir::omp::ParallelOperands parallelClauseOps;
   mlir::omp::NumThreadsClauseOps numThreadsClauseOps;
   llvm::SmallVector<const semantics::Symbol *> parallelReductionSyms;
@@ -2518,9 +2557,15 @@ static void genCompositeDistributeParallelDo(
       evalOutsideTarget, parallelClauseOps, numThreadsClauseOps,
       parallelReductionTypes, parallelReductionSyms);
 
-  const auto &privateClauseOps = dsp.getPrivateClauseOps();
-  parallelClauseOps.privateVars = privateClauseOps.privateVars;
-  parallelClauseOps.privateSyms = privateClauseOps.privateSyms;
+  genParallelCompositeOp(converter, semaCtx, item->clauses, eval, loc,
+                         parallelClauseOps, numThreadsClauseOps,
+                         parallelReductionSyms, parallelReductionTypes,
+                         evalOutsideTarget ? targetOp : nullptr, dsp);
+
+  // Clause processing.
+  mlir::omp::DistributeOperands distributeClauseOps;
+  genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
+                       distributeClauseOps);
 
   mlir::omp::WsloopOperands wsloopClauseOps;
   llvm::SmallVector<const semantics::Symbol *> wsloopReductionSyms;
@@ -2538,26 +2583,17 @@ static void genCompositeDistributeParallelDo(
   auto distributeOp = genWrapperOp<mlir::omp::DistributeOp>(
       converter, loc, distributeClauseOps, /*blockArgTypes=*/{});
 
-  auto parallelOp = genParallelWrapperOp(
-      converter, semaCtx, eval, loc, parallelClauseOps, numThreadsClauseOps,
-      parallelReductionSyms, parallelReductionTypes,
-      evalOutsideTarget ? targetOp : nullptr, dsp);
-
   // TODO: Add private variables to entry block arguments.
   auto wsloopOp = genWrapperOp<mlir::omp::WsloopOp>(
       converter, loc, wsloopClauseOps, wsloopReductionTypes);
 
   // Construct wrapper entry block list and associated symbols. It is important
   // that the symbol order and the block argument order match, so that the
   // symbol-value bindings created are correct.
-  auto wrapperSyms =
-      llvm::to_vector(llvm::concat<const semantics::Symbol *const>(
-          parallelReductionSyms, dsp.getDelayedPrivSyms(),
-          wsloopReductionSyms));
+  auto &wrapperSyms = wsloopReductionSyms;
 
   auto wrapperArgs = llvm::to_vector(
       llvm::concat<mlir::BlockArgument>(distributeOp.getRegion().getArguments(),
-                                        parallelOp.getRegion().getArguments(),
                                         wsloopOp.getRegion().getArguments()));
 
   genLoopNestOp(converter, symTable, semaCtx, eval, loc, queue, item,
Expand All @@ -2576,11 +2612,7 @@ static void genCompositeDistributeParallelDoSimd(
findParentTargetOp(converter.getFirOpBuilder());
bool evalOutsideTarget = mustEvalTeamsThreadsOutsideTarget(eval, targetOp);

// Clause processing.
mlir::omp::DistributeOperands distributeClauseOps;
genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
distributeClauseOps);

// Create parent omp.parallel first.
mlir::omp::ParallelOperands parallelClauseOps;
mlir::omp::NumThreadsClauseOps numThreadsClauseOps;
llvm::SmallVector<const semantics::Symbol *> parallelReductionSyms;
@@ -2589,9 +2621,15 @@ static void genCompositeDistributeParallelDoSimd(
       evalOutsideTarget, parallelClauseOps, numThreadsClauseOps,
       parallelReductionTypes, parallelReductionSyms);
 
-  const auto &privateClauseOps = dsp.getPrivateClauseOps();
-  parallelClauseOps.privateVars = privateClauseOps.privateVars;
-  parallelClauseOps.privateSyms = privateClauseOps.privateSyms;
+  genParallelCompositeOp(converter, semaCtx, item->clauses, eval, loc,
+                         parallelClauseOps, numThreadsClauseOps,
+                         parallelReductionSyms, parallelReductionTypes,
+                         evalOutsideTarget ? targetOp : nullptr, dsp);
+
+  // Clause processing.
+  mlir::omp::DistributeOperands distributeClauseOps;
+  genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
+                       distributeClauseOps);
 
   mlir::omp::WsloopOperands wsloopClauseOps;
   llvm::SmallVector<const semantics::Symbol *> wsloopReductionSyms;
@@ -2612,11 +2650,6 @@ static void genCompositeDistributeParallelDoSimd(
   auto distributeOp = genWrapperOp<mlir::omp::DistributeOp>(
       converter, loc, distributeClauseOps, /*blockArgTypes=*/{});
 
-  auto parallelOp = genParallelWrapperOp(
-      converter, semaCtx, eval, loc, parallelClauseOps, numThreadsClauseOps,
-      parallelReductionSyms, parallelReductionTypes,
-      evalOutsideTarget ? targetOp : nullptr, dsp);
-
   // TODO: Add private variables to entry block arguments.
   auto wsloopOp = genWrapperOp<mlir::omp::WsloopOp>(
       converter, loc, wsloopClauseOps, wsloopReductionTypes);
@@ -2628,14 +2661,10 @@ static void genCompositeDistributeParallelDoSimd(
   // Construct wrapper entry block list and associated symbols. It is important
   // that the symbol order and the block argument order match, so that the
   // symbol-value bindings created are correct.
-  auto wrapperSyms =
-      llvm::to_vector(llvm::concat<const semantics::Symbol *const>(
-          parallelReductionSyms, dsp.getDelayedPrivSyms(),
-          wsloopReductionSyms));
+  auto &wrapperSyms = wsloopReductionSyms;
 
   auto wrapperArgs = llvm::to_vector(llvm::concat<mlir::BlockArgument>(
       distributeOp.getRegion().getArguments(),
-      parallelOp.getRegion().getArguments(),
       wsloopOp.getRegion().getArguments(), simdOp.getRegion().getArguments()));
 
   genLoopNestOp(converter, symTable, semaCtx, eval, loc, queue, item,
@@ -2756,10 +2785,12 @@ static void genOMPDispatch(lower::AbstractConverter &converter,
   bool loopLeaf = llvm::omp::getDirectiveAssociation(item->id) ==
                   llvm::omp::Association::Loop;
   if (loopLeaf) {
+    // Use delayed privatization for 'distribute parallel do [simd]'.
+    bool useDelayedPrivatization = llvm::omp::allParallelSet.test(item->id);
     symTable.pushScope();
     loopDsp.emplace(converter, semaCtx, item->clauses, eval,
                     /*shouldCollectPreDeterminedSymbols=*/true,
-                    /*useDelayedPrivatization=*/false, &symTable);
+                    useDelayedPrivatization, &symTable);
     loopDsp->processStep1();
     loopDsp->processStep2();
   }
24 changes: 3 additions & 21 deletions flang/lib/Optimizer/Builder/FIRBuilder.cpp
@@ -256,19 +256,7 @@ mlir::Block *fir::FirOpBuilder::getAllocaBlock() {
   if (auto ompOutlineableIface =
           getRegion()
               .getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>()) {
-    // omp.parallel can work as a block construct but it can also be a loop
-    // wrapper when part of a composite construct. Make sure it's only treated
-    // as a block if it's not a wrapper.
-    auto parallelOp =
-        llvm::dyn_cast<mlir::omp::ParallelOp>(*ompOutlineableIface);
-    if (!parallelOp || !llvm::isa_and_present<mlir::omp::DistributeOp>(
-                           parallelOp->getParentOp()))
-      return ompOutlineableIface.getAllocaBlock();
-
-    if (auto parentOutlineable =
-            parallelOp
-                ->getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>())
-      return parentOutlineable.getAllocaBlock();
+    return ompOutlineableIface.getAllocaBlock();
   }
 
   if (auto recipeIface =
@@ -285,15 +273,9 @@ mlir::Value fir::FirOpBuilder::createTemporaryAlloc(
     llvm::ArrayRef<mlir::NamedAttribute> attrs) {
   assert(!mlir::isa<fir::ReferenceType>(type) && "cannot be a reference");
   // If the alloca is inside an OpenMP Op which will be outlined then pin
-  // the alloca here. Make sure that an omp.parallel operation that is taking
-  // a loop wrapper role is not detected as outlineable here.
-  auto iface =
-      getRegion().getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>();
-  auto parallelOp =
-      iface ? llvm::dyn_cast<mlir::omp::ParallelOp>(*iface) : nullptr;
+  // the alloca here.
   const bool pinned =
-      iface && (!parallelOp || !llvm::isa_and_present<mlir::omp::DistributeOp>(
-                    parallelOp->getParentOp()));
+      getRegion().getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>();
   mlir::Value temp =
       create<fir::AllocaOp>(loc, type, /*unique_name=*/llvm::StringRef{}, name,
                             pinned, lenParams, shape, attrs);
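
With the wrapper special-casing gone, getAllocaBlock() simply returns the
entry block of the closest OpenMP outlineable construct, which for these
composite constructs is now the hoisted omp.parallel itself. A minimal sketch
of the resulting FIR (hypothetical values, types simplified):

    omp.parallel {
      // createTemporaryAlloc() now places pinned temporaries here, in the
      // entry block of the outlineable omp.parallel region.
      %tmp = fir.alloca !fir.array<10xi32> {pinned}
      omp.distribute {
        ...
      }
      omp.terminator
    }
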
13 changes: 3 additions & 10 deletions flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp
@@ -285,16 +285,9 @@ mlir::Value ConvertFIRToLLVMPattern::computeBoxSize(
 // 4. The first ancestor that is one of the above.
 mlir::Block *ConvertFIRToLLVMPattern::getBlockForAllocaInsert(
     mlir::Operation *op, mlir::Region *parentRegion) const {
-  if (auto iface =
-          mlir::dyn_cast<mlir::omp::OutlineableOpenMPOpInterface>(op)) {
-    // omp.parallel can work as a block construct but it can also be a loop
-    // wrapper when it's part of a composite construct. Make sure it's only
-    // treated as a block if it's not a wrapper.
-    auto parallelOp = llvm::dyn_cast<mlir::omp::ParallelOp>(*iface);
-    if (!parallelOp || !llvm::isa_and_present<mlir::omp::DistributeOp>(
-                           parallelOp->getParentOp()))
-      return iface.getAllocaBlock();
-  }
+  if (auto outlineableIface =
+          mlir::dyn_cast<mlir::omp::OutlineableOpenMPOpInterface>(op))
+    return outlineableIface.getAllocaBlock();
   if (auto recipeIface = mlir::dyn_cast<mlir::accomp::RecipeInterface>(op))
     return recipeIface.getAllocaBlock(*parentRegion);
   if (auto llvmFuncOp = mlir::dyn_cast<mlir::LLVM::LLVMFuncOp>(op))
30 changes: 4 additions & 26 deletions flang/lib/Optimizer/Transforms/StackArrays.cpp
@@ -589,31 +589,8 @@ AllocMemConversion::findAllocaInsertionPoint(fir::AllocMemOp &oldAlloc) {
     return {point};
   };
 
-  // Find the first OpenMP outlineable parent region while taking into account
-  // the possibility of finding an omp.parallel region that is taking a loop
-  // wrapper role. These operations must be skipped, as they cannot hold
-  // allocations.
-  const auto findOmpRegion = [](mlir::Operation *op) {
-    auto findOmpRegionImpl =
-        [](mlir::Operation *op,
-           auto &findOmpRegion) -> mlir::omp::OutlineableOpenMPOpInterface {
-      auto ompRegion =
-          op->getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>();
-      if (!ompRegion)
-        return nullptr;
-
-      if (auto parallelOp =
-              mlir::dyn_cast_if_present<mlir::omp::ParallelOp>(*ompRegion)) {
-        mlir::Operation *parentOp = parallelOp->getParentOp();
-        if (mlir::isa_and_present<mlir::omp::DistributeOp>(parentOp))
-          return findOmpRegion(parentOp, findOmpRegion);
-      }
-      return ompRegion;
-    };
-    return findOmpRegionImpl(op, findOmpRegionImpl);
-  };
-
-  auto oldOmpRegion = findOmpRegion(oldAlloc);
+  auto oldOmpRegion =
+      oldAlloc->getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>();
 
   // Find when the last operand value becomes available
   mlir::Block *operandsBlock = nullptr;
@@ -641,7 +618,8 @@ AllocMemConversion::findAllocaInsertionPoint(fir::AllocMemOp &oldAlloc) {
     LLVM_DEBUG(llvm::dbgs()
                << "--Placing after last operand: " << *lastOperand << "\n");
     // check we aren't moving out of an omp region
-    auto lastOpOmpRegion = findOmpRegion(lastOperand);
+    auto lastOpOmpRegion =
+        lastOperand->getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>();
     if (lastOpOmpRegion == oldOmpRegion)
       return checkReturn(lastOperand);
     // Presumably this happened because the operands became ready before the