Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[flang][OpenMP] Add initial pass to map do concurrent to OMP #48

Merged
merged 2 commits into from
Mar 11, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions clang/include/clang/Driver/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -6473,6 +6473,10 @@ defm stack_arrays : BoolOptionWithoutMarshalling<"f", "stack-arrays",
defm loop_versioning : BoolOptionWithoutMarshalling<"f", "version-loops-for-stride",
PosFlag<SetTrue, [], [ClangOption], "Create unit-strided versions of loops">,
NegFlag<SetFalse, [], [ClangOption], "Do not create unit-strided loops (default)">>;

// Experimental flag: request that `do concurrent` loops be rewritten as OpenMP
// constructs. Accepted values: none (default), host, device. Note: parsed by
// parseDoConcurrentMapping() in flang's CompilerInvocation; only meaningful
// when OpenMP is enabled — TODO confirm diagnostic behavior otherwise.
def do_concurrent_parallel_EQ : Joined<["-"], "fdo-concurrent-parallel=">,
HelpText<"Try to map `do concurrent` loops to OpenMP (on host or device)">,
Values<"none,host,device">;
} // let Visibility = [FC1Option, FlangOption]

def J : JoinedOrSeparate<["-"], "J">,
Expand Down
3 changes: 2 additions & 1 deletion clang/lib/Driver/ToolChains/Flang.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,8 @@ void Flang::addCodegenOptions(const ArgList &Args,
options::OPT_flang_deprecated_no_hlfir,
options::OPT_flang_experimental_polymorphism,
options::OPT_fno_ppc_native_vec_elem_order,
options::OPT_fppc_native_vec_elem_order});
options::OPT_fppc_native_vec_elem_order,
options::OPT_do_concurrent_parallel_EQ});
}

void Flang::addPicOptions(const ArgList &Args, ArgStringList &CmdArgs) const {
Expand Down
2 changes: 2 additions & 0 deletions flang/include/flang/Frontend/CodeGenOptions.def
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,7 @@ ENUM_CODEGENOPT(DebugInfo, llvm::codegenoptions::DebugInfoKind, 4, llvm::codeg
ENUM_CODEGENOPT(VecLib, llvm::driver::VectorLibrary, 3, llvm::driver::VectorLibrary::NoLibrary) ///< Vector functions library to use
ENUM_CODEGENOPT(FramePointer, llvm::FramePointerKind, 2, llvm::FramePointerKind::None) ///< Enable the usage of frame pointers

ENUM_CODEGENOPT(DoConcurrentMapping, DoConcurrentMappingKind, 2, DoConcurrentMappingKind::DCMK_None) ///< Map `do concurrent` to OpenMP

#undef CODEGENOPT
#undef ENUM_CODEGENOPT
8 changes: 8 additions & 0 deletions flang/include/flang/Frontend/CodeGenOptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,14 @@ class CodeGenOptions : public CodeGenOptionsBase {
/// transformation.
OptRemark OptimizationRemarkAnalysis;

/// Optionally map `do concurrent` loops to OpenMP. This is only valid if
/// OpenMP is enabled.
enum class DoConcurrentMappingKind {
DCMK_None, // Do not lower `do concurrent` to OpenMP.
DCMK_Host, // Lower to run in parallel on the CPU.
DCMK_Device // Lower to run in parallel on the GPU.
};

// Define accessors/mutators for code generation options of enumeration type.
#define CODEGENOPT(Name, Bits, Default)
#define ENUM_CODEGENOPT(Name, Type, Bits, Default) \
Expand Down
2 changes: 2 additions & 0 deletions flang/include/flang/Optimizer/Transforms/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ createFunctionAttrPass(FunctionAttrTypes &functionAttr, bool noInfsFPMath,
bool noNaNsFPMath, bool approxFuncFPMath,
bool noSignedZerosFPMath, bool unsafeFPMath);

std::unique_ptr<mlir::Pass> createDoConcurrentConversionPass();

// declarative passes
#define GEN_PASS_REGISTRATION
#include "flang/Optimizer/Transforms/Passes.h.inc"
Expand Down
20 changes: 20 additions & 0 deletions flang/include/flang/Optimizer/Transforms/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -397,4 +397,24 @@ def FunctionAttr : Pass<"function-attr", "mlir::func::FuncOp"> {
let constructor = "::fir::createFunctionAttrPass()";
}

def DoConcurrentConversionPass : Pass<"fopenmp-do-concurrent-conversion", "mlir::func::FuncOp"> {
  let summary = "Map `DO CONCURRENT` loops to OpenMP worksharing loops.";

  let description = [{ This is an experimental pass to map `DO CONCURRENT` loops
     to their corresponding equivalent OpenMP worksharing constructs.

     For now the following is supported:
       - Mapping simple loops to `parallel do`.

     Still TODO:
       - More extensive testing.
       - Mapping to `target teams distribute parallel do`.
       - Allowing the user to control mapping behavior: either to the host or
         target.
  }];

  let constructor = "::fir::createDoConcurrentConversionPass()";
  let dependentDialects = ["mlir::omp::OpenMPDialect"];
}

#endif // FLANG_OPTIMIZER_TRANSFORMS_PASSES
28 changes: 28 additions & 0 deletions flang/lib/Frontend/CompilerInvocation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,32 @@ static bool parseDebugArgs(Fortran::frontend::CodeGenOptions &opts,
return true;
}

/// Parse the `-fdo-concurrent-parallel=` option, if present, and record the
/// requested `do concurrent` mapping kind in the codegen options.
///
/// Returns false (after reporting a driver diagnostic) when the option carries
/// an unrecognized value; returns true otherwise (including when the option is
/// absent).
static bool parseDoConcurrentMapping(Fortran::frontend::CodeGenOptions &opts,
                                     llvm::opt::ArgList &args,
                                     clang::DiagnosticsEngine &diags) {
  using DoConcurrentMappingKind =
      Fortran::frontend::CodeGenOptions::DoConcurrentMappingKind;

  llvm::opt::Arg *mappingArg =
      args.getLastArg(clang::driver::options::OPT_do_concurrent_parallel_EQ);
  // Nothing to do when the user did not pass the flag.
  if (mappingArg == nullptr)
    return true;

  llvm::StringRef value = mappingArg->getValue();
  std::optional<DoConcurrentMappingKind> kind =
      llvm::StringSwitch<std::optional<DoConcurrentMappingKind>>(value)
          .Case("none", DoConcurrentMappingKind::DCMK_None)
          .Case("host", DoConcurrentMappingKind::DCMK_Host)
          .Case("device", DoConcurrentMappingKind::DCMK_Device)
          .Default(std::nullopt);

  if (!kind) {
    diags.Report(clang::diag::err_drv_invalid_value)
        << mappingArg->getAsString(args) << value;
    return false;
  }

  opts.setDoConcurrentMapping(*kind);
  return true;
}

static bool parseVectorLibArg(Fortran::frontend::CodeGenOptions &opts,
llvm::opt::ArgList &args,
clang::DiagnosticsEngine &diags) {
Expand Down Expand Up @@ -385,6 +411,8 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts,
clang::driver::options::OPT_funderscoring, false)) {
opts.Underscoring = 0;
}

parseDoConcurrentMapping(opts, args, diags);
}

/// Parses all target input arguments and populates the target
Expand Down
29 changes: 27 additions & 2 deletions flang/lib/Frontend/FrontendActions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -320,8 +320,9 @@ bool CodeGenAction::beginSourceFileAction() {
// Add OpenMP-related passes
// WARNING: These passes must be run immediately after the lowering to ensure
// that the FIR is correct with respect to OpenMP operations/attributes.
if (ci.getInvocation().getFrontendOpts().features.IsEnabled(
Fortran::common::LanguageFeature::OpenMP)) {
bool isOpenMPEnabled = ci.getInvocation().getFrontendOpts().features.IsEnabled(
Fortran::common::LanguageFeature::OpenMP);
if (isOpenMPEnabled) {
bool isDevice = false;
if (auto offloadMod = llvm::dyn_cast<mlir::omp::OffloadModuleInterface>(
mlirModule->getOperation()))
Expand All @@ -332,6 +333,30 @@ bool CodeGenAction::beginSourceFileAction() {
fir::createOpenMPFIRPassPipeline(pm, isDevice);
}

using DoConcurrentMappingKind =
Fortran::frontend::CodeGenOptions::DoConcurrentMappingKind;
DoConcurrentMappingKind selectedKind = ci.getInvocation().getCodeGenOpts().getDoConcurrentMapping();
if (selectedKind != DoConcurrentMappingKind::DCMK_None) {
if (!isOpenMPEnabled) {
unsigned diagID = ci.getDiagnostics().getCustomDiagID(
clang::DiagnosticsEngine::Warning,
"lowering `do concurrent` loops to OpenMP is only supported if "
"OpenMP is enabled");
ci.getDiagnostics().Report(diagID);
} else {
bool mapToDevice = selectedKind == DoConcurrentMappingKind::DCMK_Device;

if (mapToDevice) {
unsigned diagID = ci.getDiagnostics().getCustomDiagID(
clang::DiagnosticsEngine::Warning,
"TODO: lowering `do concurrent` loops to OpenMP device is not "
"supported yet");
ci.getDiagnostics().Report(diagID);
} else
pm.addPass(fir::createDoConcurrentConversionPass());
}
}

pm.enableVerifier(/*verifyPasses=*/true);
pm.addPass(std::make_unique<Fortran::lower::VerifierPass>());

Expand Down
1 change: 1 addition & 0 deletions flang/lib/Optimizer/Transforms/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ add_flang_library(FIRTransforms
OMPMarkDeclareTarget.cpp
VScaleAttr.cpp
FunctionAttr.cpp
DoConcurrentConversion.cpp

DEPENDS
FIRDialect
Expand Down
206 changes: 206 additions & 0 deletions flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
//===- DoConcurrentConversion.cpp -- map `DO CONCURRENT` to OpenMP loops --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIRType.h"
#include "flang/Optimizer/Dialect/Support/FIRContext.h"
#include "flang/Optimizer/HLFIR/HLFIRDialect.h"
#include "flang/Optimizer/Transforms/Passes.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/OpenMP/OpenMPDialect.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/IRMapping.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"

#include <memory>

namespace fir {
#define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS
#include "flang/Optimizer/Transforms/Passes.h.inc"
} // namespace fir

#define DEBUG_TYPE "fopenmp-do-concurrent-conversion"

namespace {
/// Conversion pattern that rewrites an unordered `fir.do_loop` (the FIR form
/// of a `do concurrent` loop) into an `omp.parallel` region containing an
/// `omp.wsloop`. Only simple loops — bounds/step with visible defining ops,
/// scalar induction variable — are handled; anything else fails to match.
class DoConcurrentConversion : public mlir::OpConversionPattern<fir::DoLoopOp> {
public:
  using mlir::OpConversionPattern<fir::DoLoopOp>::OpConversionPattern;

  mlir::LogicalResult
  matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor,
                  mlir::ConversionPatternRewriter &rewriter) const override {
    // The pass currently only supports bounds/step values that are produced
    // by an op we can clone into the parallel region (in practice: constants
    // or casts of constants). Block arguments (e.g. function dummy arguments)
    // have no defining op; bail out with a match failure instead of
    // dereferencing a null pointer below.
    mlir::Operation *lbDefOp = doLoop.getLowerBound().getDefiningOp();
    mlir::Operation *ubDefOp = doLoop.getUpperBound().getDefiningOp();
    mlir::Operation *stepDefOp = doLoop.getStep().getDefiningOp();
    if (!lbDefOp || !ubDefOp || !stepDefOp)
      return rewriter.notifyMatchFailure(
          doLoop, "loop bounds or step without a visible defining op are not "
                  "supported yet");

    mlir::omp::ParallelOp parallelOp =
        rewriter.create<mlir::omp::ParallelOp>(doLoop.getLoc());

    mlir::Block *block = rewriter.createBlock(&parallelOp.getRegion());

    // Terminate the parallel region first, then insert everything else in
    // front of the terminator.
    rewriter.setInsertionPointToEnd(block);
    rewriter.create<mlir::omp::TerminatorOp>(doLoop.getLoc());

    rewriter.setInsertionPointToStart(block);

    // ==== TODO (1) Start ====
    //
    // The goal of the few lines below is to collect and clone the list of
    // operations that define the loop's lower and upper bounds as well as the
    // step. Should we, instead of doing this here, split it into 2 stages?
    //
    // 1. **Stage 1**: add an analysis that extracts all the relevant
    //                 operations defining the lower-bound, upper-bound, and
    //                 step.
    // 2. **Stage 2**: clone the collected operations in the parallel region.
    //
    // So far, the pass has been tested with very simple loops (where the
    // bounds and step are constants) so the goal of **Stage 1** is to have a
    // well-defined component that has the sole responsibility of collecting
    // all the ops relevant to the loop header. This way we can test it in
    // isolation for more complex loops and better organize the code. **Stage
    // 2** would then be responsible for the actual cloning of the collected
    // loop header preparation/allocation operations.

    // Clone the LB, UB, step defining ops inside the parallel region.
    llvm::SmallVector<mlir::Value> lowerBound, upperBound, step;
    lowerBound.push_back(rewriter.clone(*lbDefOp)->getResult(0));
    upperBound.push_back(rewriter.clone(*ubDefOp)->getResult(0));
    step.push_back(rewriter.clone(*stepDefOp)->getResult(0));
    // ==== TODO (1) End ====

    auto wsLoopOp = rewriter.create<mlir::omp::WsLoopOp>(
        doLoop.getLoc(), lowerBound, upperBound, step);
    // `do concurrent` iterates over a closed range, so the upper bound is
    // included in the worksharing loop's iteration space.
    wsLoopOp.setInclusive(true);

    auto outlineableOp =
        mlir::dyn_cast<mlir::omp::OutlineableOpenMPOpInterface>(*parallelOp);
    rewriter.setInsertionPointToStart(outlineableOp.getAllocaBlock());

    // ==== TODO (2) Start ====
    //
    // The goal of the following simple work-list algorithm and the following
    // `for` loop is to collect all the operations related to the allocation
    // of the induction variable for the `do concurrent` loop. The operations
    // collected by this algorithm are very similar to what is usually emitted
    // for privatized variables, e.g. for omp.parallel loops. Therefore, I
    // think we can:
    //
    // 1. **Stage 1**: Add an analysis that collects all these operations. The
    //                 goal is similar to **Stage 1** of TODO (1): isolate the
    //                 algorithm as an individually-testable component so that
    //                 we properly implement and test it for more complicated
    //                 `do concurrent` loops.
    // 1. **Stage 2**: Using the collected operations, create and populate an
    //                 `omp.private {type=private}` op to serve as the
    //                 delayed privatizer for the new work-sharing loop.

    // For the induction variable, we need to privatize its allocation and
    // binding inside the parallel region.
    llvm::SmallSetVector<mlir::Operation *, 2> workList;
    // Therefore, we first discover the induction variable by discovering
    // `fir.store`s where the source is the loop's block argument.
    workList.insert(doLoop.getInductionVar().getUsers().begin(),
                    doLoop.getInductionVar().getUsers().end());
    llvm::SmallSetVector<fir::StoreOp, 2> inductionVarTargetStores;

    // Walk the use-chain of the loop's block argument until we hit
    // `fir.store`.
    while (!workList.empty()) {
      mlir::Operation *item = workList.front();

      if (auto storeOp = mlir::dyn_cast<fir::StoreOp>(item))
        inductionVarTargetStores.insert(storeOp);
      else
        workList.insert(item->getUsers().begin(), item->getUsers().end());

      workList.remove(item);
    }

    // For each collected `fir.store`, find the target memref's alloca and
    // declare ops.
    llvm::SmallSetVector<mlir::Operation *, 4> declareAndAllocasToClone;
    for (auto storeOp : inductionVarTargetStores) {
      mlir::Operation *storeTarget = storeOp.getMemref().getDefiningOp();
      // A store target without a defining op (i.e. a block argument) has
      // nothing for us to clone; skip it rather than dereferencing null.
      if (!storeTarget)
        continue;

      for (auto operand : storeTarget->getOperands())
        if (mlir::Operation *operandDefOp = operand.getDefiningOp())
          declareAndAllocasToClone.insert(operandDefOp);
      declareAndAllocasToClone.insert(storeTarget);
    }
    // ==== TODO (2) End ====
    //
    // TODO (1 & 2): By isolating the analyses proposed in both TODOs, I think
    // we can more easily generalize the pass to work for targets other than
    // OpenMP, e.g. OpenACC: we can reuse the results of the analyses and only
    // change the code-gen/rewriting.

    mlir::IRMapping mapper;

    // Clone the memref defining ops into the parallel region.
    for (mlir::Operation *opToClone : declareAndAllocasToClone)
      rewriter.clone(*opToClone, mapper);

    // Clone the loop's body inside the worksharing construct using the mapped
    // memref values.
    rewriter.cloneRegionBefore(doLoop.getRegion(), wsLoopOp.getRegion(),
                               wsLoopOp.getRegion().begin(), mapper);

    // Swap the cloned loop terminator for the `omp.yield` the wsloop region
    // requires.
    mlir::Operation *terminator = wsLoopOp.getRegion().back().getTerminator();
    rewriter.setInsertionPointToEnd(&wsLoopOp.getRegion().back());
    rewriter.create<mlir::omp::YieldOp>(terminator->getLoc());
    rewriter.eraseOp(terminator);

    rewriter.eraseOp(doLoop);

    return mlir::success();
  }
};

/// Pass wrapper that drives DoConcurrentConversion over every function,
/// rewriting all unordered `fir.do_loop`s into OpenMP constructs.
class DoConcurrentConversionPass
    : public fir::impl::DoConcurrentConversionPassBase<
          DoConcurrentConversionPass> {
public:
  void runOnOperation() override {
    mlir::func::FuncOp func = getOperation();

    // Bare declarations contain no loops; nothing to do.
    if (func.isDeclaration())
      return;

    mlir::MLIRContext *ctx = &getContext();

    mlir::RewritePatternSet patterns(ctx);
    patterns.insert<DoConcurrentConversion>(ctx);

    // Everything is legal except unordered `fir.do_loop`s: those correspond
    // to `do concurrent` loops and must be rewritten by the pattern above.
    mlir::ConversionTarget target(*ctx);
    target.addLegalDialect<fir::FIROpsDialect, hlfir::hlfirDialect,
                           mlir::arith::ArithDialect, mlir::func::FuncDialect,
                           mlir::omp::OpenMPDialect>();
    target.addDynamicallyLegalOp<fir::DoLoopOp>(
        [](fir::DoLoopOp op) { return !op.getUnordered(); });

    if (mlir::failed(
            mlir::applyFullConversion(func, target, std::move(patterns)))) {
      mlir::emitError(mlir::UnknownLoc::get(ctx),
                      "error in converting do-concurrent op");
      signalPassFailure();
    }
  }
};
} // namespace

// Factory for the `do concurrent` -> OpenMP conversion pass, declared in
// flang/include/flang/Optimizer/Transforms/Passes.h and referenced by the
// pass's `constructor` field in Passes.td.
std::unique_ptr<mlir::Pass> fir::createDoConcurrentConversionPass() {
return std::make_unique<DoConcurrentConversionPass>();
}

2 changes: 2 additions & 0 deletions flang/test/Driver/driver-help-hidden.f90
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
! CHECK-NEXT: -fdefault-double-8 Set the default double precision kind to an 8 byte wide type
! CHECK-NEXT: -fdefault-integer-8 Set the default integer and logical kind to an 8 byte wide type
! CHECK-NEXT: -fdefault-real-8 Set the default real kind to an 8 byte wide type
! CHECK-NEXT: -fdo-concurrent-parallel=<value>
! CHECK-NEXT: Try to map `do concurrent` loops to OpenMP (on host or device)
! CHECK-NEXT: -ffast-math Allow aggressive, lossy floating-point optimizations
! CHECK-NEXT: -ffixed-form Process source files in fixed form
! CHECK-NEXT: -ffixed-line-length=<value>
Expand Down
Loading