Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Flang][OpenMP] Re-implement lowering of DISTRIBUTE PARALLEL DO #135

Merged
merged 2 commits into from
Aug 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -235,9 +235,7 @@ void DataSharingProcessor::insertBarrier() {
void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) {
mlir::omp::LoopNestOp loopOp;
if (auto wrapper = mlir::dyn_cast<mlir::omp::LoopWrapperInterface>(op))
loopOp = wrapper.isWrapper()
? mlir::cast<mlir::omp::LoopNestOp>(wrapper.getWrappedLoop())
: nullptr;
loopOp = mlir::cast<mlir::omp::LoopNestOp>(wrapper.getWrappedLoop());

bool cmpCreated = false;
mlir::OpBuilder::InsertionGuard guard(firOpBuilder);
Expand Down
135 changes: 83 additions & 52 deletions flang/lib/Lower/OpenMP/OpenMP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1905,18 +1905,23 @@ genParallelOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
return parallelOp;
}

// TODO: Replace with genWrapperOp calls.
static mlir::omp::ParallelOp genParallelWrapperOp(
static mlir::omp::ParallelOp genParallelCompositeOp(
lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
lower::pft::Evaluation &eval, mlir::Location loc,
const mlir::omp::ParallelOperands &clauseOps,
const List<Clause> &clauses, lower::pft::Evaluation &eval,
mlir::Location loc, mlir::omp::ParallelOperands &clauseOps,
mlir::omp::NumThreadsClauseOps &numThreadsClauseOps,
llvm::ArrayRef<const semantics::Symbol *> reductionSyms,
llvm::ArrayRef<mlir::Type> reductionTypes, mlir::omp::TargetOp parentTarget,
DataSharingProcessor &dsp) {
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();

// Create omp.parallel wrapper.
if (enableDelayedPrivatization) {
const auto &privateClauseOps = dsp.getPrivateClauseOps();
clauseOps.privateVars = privateClauseOps.privateVars;
clauseOps.privateSyms = privateClauseOps.privateSyms;
}

// Create omp.parallel operation.
auto parallelOp = firOpBuilder.create<mlir::omp::ParallelOp>(loc, clauseOps);

if (numThreadsClauseOps.numThreads) {
Expand All @@ -1928,22 +1933,60 @@ static mlir::omp::ParallelOp genParallelWrapperOp(
}

// Populate entry block arguments with reduction and private variables.
mlir::OperandRange privateVars = parallelOp.getPrivateVars();

llvm::SmallVector<mlir::Type> blockArgTypes(reductionTypes.begin(),
reductionTypes.end());
blockArgTypes.reserve(blockArgTypes.size() + privateVars.size());
llvm::transform(privateVars, std::back_inserter(blockArgTypes),
[](mlir::Value v) { return v.getType(); });

llvm::SmallVector<mlir::Location> blockArgLocs(reductionTypes.size(), loc);
blockArgLocs.reserve(blockArgLocs.size() + privateVars.size());
llvm::transform(privateVars, std::back_inserter(blockArgLocs),
[](mlir::Value v) { return v.getLoc(); });
llvm::SmallVector<const semantics::Symbol *> blockSyms(reductionSyms);

if (enableDelayedPrivatization) {
mlir::OperandRange privateVars = parallelOp.getPrivateVars();

blockArgTypes.reserve(blockArgTypes.size() + privateVars.size());
llvm::transform(privateVars, std::back_inserter(blockArgTypes),
[](mlir::Value v) { return v.getType(); });

firOpBuilder.createBlock(&parallelOp.getRegion(), {}, blockArgTypes,
blockArgLocs.reserve(blockArgLocs.size() + privateVars.size());
llvm::transform(privateVars, std::back_inserter(blockArgLocs),
[](mlir::Value v) { return v.getLoc(); });

llvm::append_range(blockSyms, dsp.getDelayedPrivSyms());
}

mlir::Region &region = parallelOp.getRegion();
firOpBuilder.createBlock(&region, /*insertPt=*/{}, blockArgTypes,
blockArgLocs);

// Bind syms to block args.
unsigned argIdx = 0;
for (const semantics::Symbol *arg : blockSyms) {
auto bind = [&](const semantics::Symbol *sym) {
mlir::BlockArgument blockArg = region.getArgument(argIdx++);
converter.bindSymbol(*sym, hlfir::translateToExtendedValue(
loc, firOpBuilder, hlfir::Entity{blockArg},
/*contiguousHint=*/
evaluate::IsSimplyContiguous(
*sym, converter.getFoldingContext()))
.first);
};

if (const auto *commonDet =
arg->detailsIf<semantics::CommonBlockDetails>()) {
for (const auto &mem : commonDet->objects())
bind(&*mem);
} else
bind(arg);
}

// Handle threadprivate and copyin, which would normally be done as part of
// `createBodyOfOp()`. However, when generating `omp.parallel` as part of a
// composite construct, we can't recursively lower its contents. This prevents
// us from being able to rely on the existing `genOpWithBody()` flow.
{
mlir::OpBuilder::InsertionGuard guard(firOpBuilder);
threadPrivatizeVars(converter, eval);
}
ClauseProcessor(converter, semaCtx, clauses).processCopyin();

firOpBuilder.setInsertionPoint(
lower::genOpenMPTerminator(firOpBuilder, parallelOp, loc));

Expand Down Expand Up @@ -2505,11 +2548,7 @@ static void genCompositeDistributeParallelDo(
findParentTargetOp(converter.getFirOpBuilder());
bool evalOutsideTarget = mustEvalTeamsThreadsOutsideTarget(eval, targetOp);

// Clause processing.
mlir::omp::DistributeOperands distributeClauseOps;
genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
distributeClauseOps);

// Create parent omp.parallel first.
mlir::omp::ParallelOperands parallelClauseOps;
mlir::omp::NumThreadsClauseOps numThreadsClauseOps;
llvm::SmallVector<const semantics::Symbol *> parallelReductionSyms;
Expand All @@ -2518,9 +2557,15 @@ static void genCompositeDistributeParallelDo(
evalOutsideTarget, parallelClauseOps, numThreadsClauseOps,
parallelReductionTypes, parallelReductionSyms);

const auto &privateClauseOps = dsp.getPrivateClauseOps();
parallelClauseOps.privateVars = privateClauseOps.privateVars;
parallelClauseOps.privateSyms = privateClauseOps.privateSyms;
genParallelCompositeOp(converter, semaCtx, item->clauses, eval, loc,
parallelClauseOps, numThreadsClauseOps,
parallelReductionSyms, parallelReductionTypes,
evalOutsideTarget ? targetOp : nullptr, dsp);

// Clause processing.
mlir::omp::DistributeOperands distributeClauseOps;
genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
distributeClauseOps);

mlir::omp::WsloopOperands wsloopClauseOps;
llvm::SmallVector<const semantics::Symbol *> wsloopReductionSyms;
Expand All @@ -2538,26 +2583,17 @@ static void genCompositeDistributeParallelDo(
auto distributeOp = genWrapperOp<mlir::omp::DistributeOp>(
converter, loc, distributeClauseOps, /*blockArgTypes=*/{});

auto parallelOp = genParallelWrapperOp(
converter, semaCtx, eval, loc, parallelClauseOps, numThreadsClauseOps,
parallelReductionSyms, parallelReductionTypes,
evalOutsideTarget ? targetOp : nullptr, dsp);

// TODO: Add private variables to entry block arguments.
auto wsloopOp = genWrapperOp<mlir::omp::WsloopOp>(
converter, loc, wsloopClauseOps, wsloopReductionTypes);

// Construct wrapper entry block list and associated symbols. It is important
// that the symbol order and the block argument order match, so that the
// symbol-value bindings created are correct.
auto wrapperSyms =
llvm::to_vector(llvm::concat<const semantics::Symbol *const>(
parallelReductionSyms, dsp.getDelayedPrivSyms(),
wsloopReductionSyms));
auto &wrapperSyms = wsloopReductionSyms;

auto wrapperArgs = llvm::to_vector(
llvm::concat<mlir::BlockArgument>(distributeOp.getRegion().getArguments(),
parallelOp.getRegion().getArguments(),
wsloopOp.getRegion().getArguments()));

genLoopNestOp(converter, symTable, semaCtx, eval, loc, queue, item,
Expand All @@ -2576,11 +2612,7 @@ static void genCompositeDistributeParallelDoSimd(
findParentTargetOp(converter.getFirOpBuilder());
bool evalOutsideTarget = mustEvalTeamsThreadsOutsideTarget(eval, targetOp);

// Clause processing.
mlir::omp::DistributeOperands distributeClauseOps;
genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
distributeClauseOps);

// Create parent omp.parallel first.
mlir::omp::ParallelOperands parallelClauseOps;
mlir::omp::NumThreadsClauseOps numThreadsClauseOps;
llvm::SmallVector<const semantics::Symbol *> parallelReductionSyms;
Expand All @@ -2589,9 +2621,15 @@ static void genCompositeDistributeParallelDoSimd(
evalOutsideTarget, parallelClauseOps, numThreadsClauseOps,
parallelReductionTypes, parallelReductionSyms);

const auto &privateClauseOps = dsp.getPrivateClauseOps();
parallelClauseOps.privateVars = privateClauseOps.privateVars;
parallelClauseOps.privateSyms = privateClauseOps.privateSyms;
genParallelCompositeOp(converter, semaCtx, item->clauses, eval, loc,
parallelClauseOps, numThreadsClauseOps,
parallelReductionSyms, parallelReductionTypes,
evalOutsideTarget ? targetOp : nullptr, dsp);

// Clause processing.
mlir::omp::DistributeOperands distributeClauseOps;
genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
distributeClauseOps);

mlir::omp::WsloopOperands wsloopClauseOps;
llvm::SmallVector<const semantics::Symbol *> wsloopReductionSyms;
Expand All @@ -2612,11 +2650,6 @@ static void genCompositeDistributeParallelDoSimd(
auto distributeOp = genWrapperOp<mlir::omp::DistributeOp>(
converter, loc, distributeClauseOps, /*blockArgTypes=*/{});

auto parallelOp = genParallelWrapperOp(
converter, semaCtx, eval, loc, parallelClauseOps, numThreadsClauseOps,
parallelReductionSyms, parallelReductionTypes,
evalOutsideTarget ? targetOp : nullptr, dsp);

// TODO: Add private variables to entry block arguments.
auto wsloopOp = genWrapperOp<mlir::omp::WsloopOp>(
converter, loc, wsloopClauseOps, wsloopReductionTypes);
Expand All @@ -2628,14 +2661,10 @@ static void genCompositeDistributeParallelDoSimd(
// Construct wrapper entry block list and associated symbols. It is important
// that the symbol order and the block argument order match, so that the
// symbol-value bindings created are correct.
auto wrapperSyms =
llvm::to_vector(llvm::concat<const semantics::Symbol *const>(
parallelReductionSyms, dsp.getDelayedPrivSyms(),
wsloopReductionSyms));
auto &wrapperSyms = wsloopReductionSyms;

auto wrapperArgs = llvm::to_vector(llvm::concat<mlir::BlockArgument>(
distributeOp.getRegion().getArguments(),
parallelOp.getRegion().getArguments(),
wsloopOp.getRegion().getArguments(), simdOp.getRegion().getArguments()));

genLoopNestOp(converter, symTable, semaCtx, eval, loc, queue, item,
Expand Down Expand Up @@ -2756,10 +2785,12 @@ static void genOMPDispatch(lower::AbstractConverter &converter,
bool loopLeaf = llvm::omp::getDirectiveAssociation(item->id) ==
llvm::omp::Association::Loop;
if (loopLeaf) {
// Use delayed privatization for 'distribute parallel do [simd]'.
bool useDelayedPrivatization = llvm::omp::allParallelSet.test(item->id);
symTable.pushScope();
loopDsp.emplace(converter, semaCtx, item->clauses, eval,
/*shouldCollectPreDeterminedSymbols=*/true,
/*useDelayedPrivatization=*/false, &symTable);
useDelayedPrivatization, &symTable);
loopDsp->processStep1();
loopDsp->processStep2();
}
Expand Down
24 changes: 3 additions & 21 deletions flang/lib/Optimizer/Builder/FIRBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,19 +256,7 @@ mlir::Block *fir::FirOpBuilder::getAllocaBlock() {
if (auto ompOutlineableIface =
getRegion()
.getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>()) {
// omp.parallel can work as a block construct but it can also be a loop
// wrapper when part of a composite construct. Make sure it's only treated
// as a block if it's not a wrapper.
auto parallelOp =
llvm::dyn_cast<mlir::omp::ParallelOp>(*ompOutlineableIface);
if (!parallelOp || !llvm::isa_and_present<mlir::omp::DistributeOp>(
parallelOp->getParentOp()))
return ompOutlineableIface.getAllocaBlock();

if (auto parentOutlineable =
parallelOp
->getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>())
return parentOutlineable.getAllocaBlock();
return ompOutlineableIface.getAllocaBlock();
}

if (auto recipeIface =
Expand All @@ -285,15 +273,9 @@ mlir::Value fir::FirOpBuilder::createTemporaryAlloc(
llvm::ArrayRef<mlir::NamedAttribute> attrs) {
assert(!mlir::isa<fir::ReferenceType>(type) && "cannot be a reference");
// If the alloca is inside an OpenMP Op which will be outlined then pin
// the alloca here. Make sure that an omp.parallel operation that is taking
// a loop wrapper role is not detected as outlineable here.
auto iface =
getRegion().getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>();
auto parallelOp =
iface ? llvm::dyn_cast<mlir::omp::ParallelOp>(*iface) : nullptr;
// the alloca here.
const bool pinned =
iface && (!parallelOp || !llvm::isa_and_present<mlir::omp::DistributeOp>(
parallelOp->getParentOp()));
getRegion().getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>();
mlir::Value temp =
create<fir::AllocaOp>(loc, type, /*unique_name=*/llvm::StringRef{}, name,
pinned, lenParams, shape, attrs);
Expand Down
13 changes: 3 additions & 10 deletions flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -285,16 +285,9 @@ mlir::Value ConvertFIRToLLVMPattern::computeBoxSize(
// 4. The first ancestor that is one of the above.
mlir::Block *ConvertFIRToLLVMPattern::getBlockForAllocaInsert(
mlir::Operation *op, mlir::Region *parentRegion) const {
if (auto iface =
mlir::dyn_cast<mlir::omp::OutlineableOpenMPOpInterface>(op)) {
// omp.parallel can work as a block construct but it can also be a loop
// wrapper when it's part of a composite construct. Make sure it's only
// treated as a block if it's not a wrapper.
auto parallelOp = llvm::dyn_cast<mlir::omp::ParallelOp>(*iface);
if (!parallelOp || !llvm::isa_and_present<mlir::omp::DistributeOp>(
parallelOp->getParentOp()))
return iface.getAllocaBlock();
}
if (auto outlineableIface =
mlir::dyn_cast<mlir::omp::OutlineableOpenMPOpInterface>(op))
return outlineableIface.getAllocaBlock();
if (auto recipeIface = mlir::dyn_cast<mlir::accomp::RecipeInterface>(op))
return recipeIface.getAllocaBlock(*parentRegion);
if (auto llvmFuncOp = mlir::dyn_cast<mlir::LLVM::LLVMFuncOp>(op))
Expand Down
30 changes: 4 additions & 26 deletions flang/lib/Optimizer/Transforms/StackArrays.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -589,31 +589,8 @@ AllocMemConversion::findAllocaInsertionPoint(fir::AllocMemOp &oldAlloc) {
return {point};
};

// Find the first OpenMP outlineable parent region while taking into account
// the possibility of finding an omp.parallel region that is taking a loop
// wrapper role. These operations must be skipped, as they cannot hold
// allocations.
const auto findOmpRegion = [](mlir::Operation *op) {
auto findOmpRegionImpl =
[](mlir::Operation *op,
auto &findOmpRegion) -> mlir::omp::OutlineableOpenMPOpInterface {
auto ompRegion =
op->getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>();
if (!ompRegion)
return nullptr;

if (auto parallelOp =
mlir::dyn_cast_if_present<mlir::omp::ParallelOp>(*ompRegion)) {
mlir::Operation *parentOp = parallelOp->getParentOp();
if (mlir::isa_and_present<mlir::omp::DistributeOp>(parentOp))
return findOmpRegion(parentOp, findOmpRegion);
}
return ompRegion;
};
return findOmpRegionImpl(op, findOmpRegionImpl);
};

auto oldOmpRegion = findOmpRegion(oldAlloc);
auto oldOmpRegion =
oldAlloc->getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>();

// Find when the last operand value becomes available
mlir::Block *operandsBlock = nullptr;
Expand Down Expand Up @@ -641,7 +618,8 @@ AllocMemConversion::findAllocaInsertionPoint(fir::AllocMemOp &oldAlloc) {
LLVM_DEBUG(llvm::dbgs()
<< "--Placing after last operand: " << *lastOperand << "\n");
// check we aren't moving out of an omp region
auto lastOpOmpRegion = findOmpRegion(lastOperand);
auto lastOpOmpRegion =
lastOperand->getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>();
if (lastOpOmpRegion == oldOmpRegion)
return checkReturn(lastOperand);
// Presumably this happened because the operands became ready before the
Expand Down
Loading