Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Flang][OpenMP] Update DO CONCURRENT conversion for the device #136

Merged
merged 3 commits into from
Aug 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions flang/lib/Lower/OpenMP/DataSharingProcessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -235,9 +235,7 @@ void DataSharingProcessor::insertBarrier() {
void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) {
mlir::omp::LoopNestOp loopOp;
if (auto wrapper = mlir::dyn_cast<mlir::omp::LoopWrapperInterface>(op))
loopOp = wrapper.isWrapper()
? mlir::cast<mlir::omp::LoopNestOp>(wrapper.getWrappedLoop())
: nullptr;
loopOp = mlir::cast<mlir::omp::LoopNestOp>(wrapper.getWrappedLoop());

bool cmpCreated = false;
mlir::OpBuilder::InsertionGuard guard(firOpBuilder);
Expand Down
135 changes: 83 additions & 52 deletions flang/lib/Lower/OpenMP/OpenMP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1905,18 +1905,23 @@ genParallelOp(lower::AbstractConverter &converter, lower::SymMap &symTable,
return parallelOp;
}

// TODO: Replace with genWrapperOp calls.
static mlir::omp::ParallelOp genParallelWrapperOp(
static mlir::omp::ParallelOp genParallelCompositeOp(
lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
lower::pft::Evaluation &eval, mlir::Location loc,
const mlir::omp::ParallelOperands &clauseOps,
const List<Clause> &clauses, lower::pft::Evaluation &eval,
mlir::Location loc, mlir::omp::ParallelOperands &clauseOps,
mlir::omp::NumThreadsClauseOps &numThreadsClauseOps,
llvm::ArrayRef<const semantics::Symbol *> reductionSyms,
llvm::ArrayRef<mlir::Type> reductionTypes, mlir::omp::TargetOp parentTarget,
DataSharingProcessor &dsp) {
fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder();

// Create omp.parallel wrapper.
if (enableDelayedPrivatization) {
const auto &privateClauseOps = dsp.getPrivateClauseOps();
clauseOps.privateVars = privateClauseOps.privateVars;
clauseOps.privateSyms = privateClauseOps.privateSyms;
}

// Create omp.parallel operation.
auto parallelOp = firOpBuilder.create<mlir::omp::ParallelOp>(loc, clauseOps);

if (numThreadsClauseOps.numThreads) {
Expand All @@ -1928,22 +1933,60 @@ static mlir::omp::ParallelOp genParallelWrapperOp(
}

// Populate entry block arguments with reduction and private variables.
mlir::OperandRange privateVars = parallelOp.getPrivateVars();

llvm::SmallVector<mlir::Type> blockArgTypes(reductionTypes.begin(),
reductionTypes.end());
blockArgTypes.reserve(blockArgTypes.size() + privateVars.size());
llvm::transform(privateVars, std::back_inserter(blockArgTypes),
[](mlir::Value v) { return v.getType(); });

llvm::SmallVector<mlir::Location> blockArgLocs(reductionTypes.size(), loc);
blockArgLocs.reserve(blockArgLocs.size() + privateVars.size());
llvm::transform(privateVars, std::back_inserter(blockArgLocs),
[](mlir::Value v) { return v.getLoc(); });
llvm::SmallVector<const semantics::Symbol *> blockSyms(reductionSyms);

if (enableDelayedPrivatization) {
mlir::OperandRange privateVars = parallelOp.getPrivateVars();

blockArgTypes.reserve(blockArgTypes.size() + privateVars.size());
llvm::transform(privateVars, std::back_inserter(blockArgTypes),
[](mlir::Value v) { return v.getType(); });

firOpBuilder.createBlock(&parallelOp.getRegion(), {}, blockArgTypes,
blockArgLocs.reserve(blockArgLocs.size() + privateVars.size());
llvm::transform(privateVars, std::back_inserter(blockArgLocs),
[](mlir::Value v) { return v.getLoc(); });

llvm::append_range(blockSyms, dsp.getDelayedPrivSyms());
}

mlir::Region &region = parallelOp.getRegion();
firOpBuilder.createBlock(&region, /*insertPt=*/{}, blockArgTypes,
blockArgLocs);

// Bind syms to block args.
unsigned argIdx = 0;
for (const semantics::Symbol *arg : blockSyms) {
auto bind = [&](const semantics::Symbol *sym) {
mlir::BlockArgument blockArg = region.getArgument(argIdx++);
converter.bindSymbol(*sym, hlfir::translateToExtendedValue(
loc, firOpBuilder, hlfir::Entity{blockArg},
/*contiguousHint=*/
evaluate::IsSimplyContiguous(
*sym, converter.getFoldingContext()))
.first);
};

if (const auto *commonDet =
arg->detailsIf<semantics::CommonBlockDetails>()) {
for (const auto &mem : commonDet->objects())
bind(&*mem);
} else
bind(arg);
}

// Handle threadprivate and copyin, which would normally be done as part of
// `createBodyOfOp()`. However, when generating `omp.parallel` as part of a
// composite construct, we can't recursively lower its contents. This prevents
// us from being able to rely on the existing `genOpWithBody()` flow.
{
mlir::OpBuilder::InsertionGuard guard(firOpBuilder);
threadPrivatizeVars(converter, eval);
}
ClauseProcessor(converter, semaCtx, clauses).processCopyin();

firOpBuilder.setInsertionPoint(
lower::genOpenMPTerminator(firOpBuilder, parallelOp, loc));

Expand Down Expand Up @@ -2505,11 +2548,7 @@ static void genCompositeDistributeParallelDo(
findParentTargetOp(converter.getFirOpBuilder());
bool evalOutsideTarget = mustEvalTeamsThreadsOutsideTarget(eval, targetOp);

// Clause processing.
mlir::omp::DistributeOperands distributeClauseOps;
genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
distributeClauseOps);

// Create parent omp.parallel first.
mlir::omp::ParallelOperands parallelClauseOps;
mlir::omp::NumThreadsClauseOps numThreadsClauseOps;
llvm::SmallVector<const semantics::Symbol *> parallelReductionSyms;
Expand All @@ -2518,9 +2557,15 @@ static void genCompositeDistributeParallelDo(
evalOutsideTarget, parallelClauseOps, numThreadsClauseOps,
parallelReductionTypes, parallelReductionSyms);

const auto &privateClauseOps = dsp.getPrivateClauseOps();
parallelClauseOps.privateVars = privateClauseOps.privateVars;
parallelClauseOps.privateSyms = privateClauseOps.privateSyms;
genParallelCompositeOp(converter, semaCtx, item->clauses, eval, loc,
parallelClauseOps, numThreadsClauseOps,
parallelReductionSyms, parallelReductionTypes,
evalOutsideTarget ? targetOp : nullptr, dsp);

// Clause processing.
mlir::omp::DistributeOperands distributeClauseOps;
genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
distributeClauseOps);

mlir::omp::WsloopOperands wsloopClauseOps;
llvm::SmallVector<const semantics::Symbol *> wsloopReductionSyms;
Expand All @@ -2538,26 +2583,17 @@ static void genCompositeDistributeParallelDo(
auto distributeOp = genWrapperOp<mlir::omp::DistributeOp>(
converter, loc, distributeClauseOps, /*blockArgTypes=*/{});

auto parallelOp = genParallelWrapperOp(
converter, semaCtx, eval, loc, parallelClauseOps, numThreadsClauseOps,
parallelReductionSyms, parallelReductionTypes,
evalOutsideTarget ? targetOp : nullptr, dsp);

// TODO: Add private variables to entry block arguments.
auto wsloopOp = genWrapperOp<mlir::omp::WsloopOp>(
converter, loc, wsloopClauseOps, wsloopReductionTypes);

// Construct wrapper entry block list and associated symbols. It is important
// that the symbol order and the block argument order match, so that the
// symbol-value bindings created are correct.
auto wrapperSyms =
llvm::to_vector(llvm::concat<const semantics::Symbol *const>(
parallelReductionSyms, dsp.getDelayedPrivSyms(),
wsloopReductionSyms));
auto &wrapperSyms = wsloopReductionSyms;

auto wrapperArgs = llvm::to_vector(
llvm::concat<mlir::BlockArgument>(distributeOp.getRegion().getArguments(),
parallelOp.getRegion().getArguments(),
wsloopOp.getRegion().getArguments()));

genLoopNestOp(converter, symTable, semaCtx, eval, loc, queue, item,
Expand All @@ -2576,11 +2612,7 @@ static void genCompositeDistributeParallelDoSimd(
findParentTargetOp(converter.getFirOpBuilder());
bool evalOutsideTarget = mustEvalTeamsThreadsOutsideTarget(eval, targetOp);

// Clause processing.
mlir::omp::DistributeOperands distributeClauseOps;
genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
distributeClauseOps);

// Create parent omp.parallel first.
mlir::omp::ParallelOperands parallelClauseOps;
mlir::omp::NumThreadsClauseOps numThreadsClauseOps;
llvm::SmallVector<const semantics::Symbol *> parallelReductionSyms;
Expand All @@ -2589,9 +2621,15 @@ static void genCompositeDistributeParallelDoSimd(
evalOutsideTarget, parallelClauseOps, numThreadsClauseOps,
parallelReductionTypes, parallelReductionSyms);

const auto &privateClauseOps = dsp.getPrivateClauseOps();
parallelClauseOps.privateVars = privateClauseOps.privateVars;
parallelClauseOps.privateSyms = privateClauseOps.privateSyms;
genParallelCompositeOp(converter, semaCtx, item->clauses, eval, loc,
parallelClauseOps, numThreadsClauseOps,
parallelReductionSyms, parallelReductionTypes,
evalOutsideTarget ? targetOp : nullptr, dsp);

// Clause processing.
mlir::omp::DistributeOperands distributeClauseOps;
genDistributeClauses(converter, semaCtx, stmtCtx, item->clauses, loc,
distributeClauseOps);

mlir::omp::WsloopOperands wsloopClauseOps;
llvm::SmallVector<const semantics::Symbol *> wsloopReductionSyms;
Expand All @@ -2612,11 +2650,6 @@ static void genCompositeDistributeParallelDoSimd(
auto distributeOp = genWrapperOp<mlir::omp::DistributeOp>(
converter, loc, distributeClauseOps, /*blockArgTypes=*/{});

auto parallelOp = genParallelWrapperOp(
converter, semaCtx, eval, loc, parallelClauseOps, numThreadsClauseOps,
parallelReductionSyms, parallelReductionTypes,
evalOutsideTarget ? targetOp : nullptr, dsp);

// TODO: Add private variables to entry block arguments.
auto wsloopOp = genWrapperOp<mlir::omp::WsloopOp>(
converter, loc, wsloopClauseOps, wsloopReductionTypes);
Expand All @@ -2628,14 +2661,10 @@ static void genCompositeDistributeParallelDoSimd(
// Construct wrapper entry block list and associated symbols. It is important
// that the symbol order and the block argument order match, so that the
// symbol-value bindings created are correct.
auto wrapperSyms =
llvm::to_vector(llvm::concat<const semantics::Symbol *const>(
parallelReductionSyms, dsp.getDelayedPrivSyms(),
wsloopReductionSyms));
auto &wrapperSyms = wsloopReductionSyms;

auto wrapperArgs = llvm::to_vector(llvm::concat<mlir::BlockArgument>(
distributeOp.getRegion().getArguments(),
parallelOp.getRegion().getArguments(),
wsloopOp.getRegion().getArguments(), simdOp.getRegion().getArguments()));

genLoopNestOp(converter, symTable, semaCtx, eval, loc, queue, item,
Expand Down Expand Up @@ -2756,10 +2785,12 @@ static void genOMPDispatch(lower::AbstractConverter &converter,
bool loopLeaf = llvm::omp::getDirectiveAssociation(item->id) ==
llvm::omp::Association::Loop;
if (loopLeaf) {
// Use delayed privatization for 'distribute parallel do [simd]'.
bool useDelayedPrivatization = llvm::omp::allParallelSet.test(item->id);
symTable.pushScope();
loopDsp.emplace(converter, semaCtx, item->clauses, eval,
/*shouldCollectPreDeterminedSymbols=*/true,
/*useDelayedPrivatization=*/false, &symTable);
useDelayedPrivatization, &symTable);
loopDsp->processStep1();
loopDsp->processStep2();
}
Expand Down
24 changes: 3 additions & 21 deletions flang/lib/Optimizer/Builder/FIRBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -256,19 +256,7 @@ mlir::Block *fir::FirOpBuilder::getAllocaBlock() {
if (auto ompOutlineableIface =
getRegion()
.getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>()) {
// omp.parallel can work as a block construct but it can also be a loop
// wrapper when part of a composite construct. Make sure it's only treated
// as a block if it's not a wrapper.
auto parallelOp =
llvm::dyn_cast<mlir::omp::ParallelOp>(*ompOutlineableIface);
if (!parallelOp || !llvm::isa_and_present<mlir::omp::DistributeOp>(
parallelOp->getParentOp()))
return ompOutlineableIface.getAllocaBlock();

if (auto parentOutlineable =
parallelOp
->getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>())
return parentOutlineable.getAllocaBlock();
return ompOutlineableIface.getAllocaBlock();
}

if (auto recipeIface =
Expand All @@ -285,15 +273,9 @@ mlir::Value fir::FirOpBuilder::createTemporaryAlloc(
llvm::ArrayRef<mlir::NamedAttribute> attrs) {
assert(!mlir::isa<fir::ReferenceType>(type) && "cannot be a reference");
// If the alloca is inside an OpenMP Op which will be outlined then pin
// the alloca here. Make sure that an omp.parallel operation that is taking
// a loop wrapper role is not detected as outlineable here.
auto iface =
getRegion().getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>();
auto parallelOp =
iface ? llvm::dyn_cast<mlir::omp::ParallelOp>(*iface) : nullptr;
// the alloca here.
const bool pinned =
iface && (!parallelOp || !llvm::isa_and_present<mlir::omp::DistributeOp>(
parallelOp->getParentOp()));
getRegion().getParentOfType<mlir::omp::OutlineableOpenMPOpInterface>();
mlir::Value temp =
create<fir::AllocaOp>(loc, type, /*unique_name=*/llvm::StringRef{}, name,
pinned, lenParams, shape, attrs);
Expand Down
13 changes: 3 additions & 10 deletions flang/lib/Optimizer/CodeGen/FIROpPatterns.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -285,16 +285,9 @@ mlir::Value ConvertFIRToLLVMPattern::computeBoxSize(
// 4. The first ancestor that is one of the above.
mlir::Block *ConvertFIRToLLVMPattern::getBlockForAllocaInsert(
mlir::Operation *op, mlir::Region *parentRegion) const {
if (auto iface =
mlir::dyn_cast<mlir::omp::OutlineableOpenMPOpInterface>(op)) {
// omp.parallel can work as a block construct but it can also be a loop
// wrapper when it's part of a composite construct. Make sure it's only
// treated as a block if it's not a wrapper.
auto parallelOp = llvm::dyn_cast<mlir::omp::ParallelOp>(*iface);
if (!parallelOp || !llvm::isa_and_present<mlir::omp::DistributeOp>(
parallelOp->getParentOp()))
return iface.getAllocaBlock();
}
if (auto outlineableIface =
mlir::dyn_cast<mlir::omp::OutlineableOpenMPOpInterface>(op))
return outlineableIface.getAllocaBlock();
if (auto recipeIface = mlir::dyn_cast<mlir::accomp::RecipeInterface>(op))
return recipeIface.getAllocaBlock(*parentRegion);
if (auto llvmFuncOp = mlir::dyn_cast<mlir::LLVM::LLVMFuncOp>(op))
Expand Down
23 changes: 8 additions & 15 deletions flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -599,9 +599,7 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {

targetOp = genTargetOp(doLoop.getLoc(), rewriter, mapper,
outermostLoopLives, targetClauseOps);
genTeamsOp(doLoop.getLoc(), rewriter, loopNest, mapper,
loopNestClauseOps);
genDistributeOp(doLoop.getLoc(), rewriter);
genTeamsOp(doLoop.getLoc(), rewriter);
}

mlir::omp::ParallelOp parallelOp = genParallelOp(
Expand All @@ -611,6 +609,9 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
looputils::localizeLoopLocalValue(local, parallelOp.getRegion(),
rewriter);

if (mapToDevice)
genDistributeOp(doLoop.getLoc(), rewriter);

mlir::omp::LoopNestOp ompLoopNest =
genWsLoopOp(rewriter, loopNest.back().first, mapper, loopNestClauseOps);

Expand Down Expand Up @@ -800,18 +801,14 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
}

mlir::omp::TeamsOp
genTeamsOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter,
looputils::LoopNestToIndVarMap &loopNest, mlir::IRMapping &mapper,
mlir::omp::LoopNestOperands &loopNestClauseOps) const {
genTeamsOp(mlir::Location loc,
mlir::ConversionPatternRewriter &rewriter) const {
auto teamsOp = rewriter.create<mlir::omp::TeamsOp>(
loc, /*clauses=*/mlir::omp::TeamsOperands{});

rewriter.createBlock(&teamsOp.getRegion());
rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc));

genLoopNestIndVarAllocs(rewriter, loopNest, mapper);
genLoopNestClauseOps(loc, rewriter, loopNest, mapper, loopNestClauseOps);

return teamsOp;
}

Expand Down Expand Up @@ -905,12 +902,8 @@ class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
rewriter.createBlock(&parallelOp.getRegion());
rewriter.setInsertionPoint(rewriter.create<mlir::omp::TerminatorOp>(loc));

// If mapping to host, the local induction variable and loop bounds need to
// be emitted as part of the `omp.parallel` op.
if (!mapToDevice) {
genLoopNestIndVarAllocs(rewriter, loopNest, mapper);
genLoopNestClauseOps(loc, rewriter, loopNest, mapper, loopNestClauseOps);
}
genLoopNestIndVarAllocs(rewriter, loopNest, mapper);
genLoopNestClauseOps(loc, rewriter, loopNest, mapper, loopNestClauseOps);

return parallelOp;
}
Expand Down
Loading