From e4d6564ed594fab79e9e42021a486b2896d241e1 Mon Sep 17 00:00:00 2001 From: ergawy Date: Mon, 4 Mar 2024 22:22:41 -0600 Subject: [PATCH 1/2] [flang][OpenMP] Add initial pass to map `do concurrent` to OMP Adds a new pass to map `do concurrent` loops to OpenMP. So far the pass only maps to the host, i.e. `parallel do`. Later, it will be extended to map to the device/target as well. This diff also adds the pass the code-gen pipeline flang. To invoke the new pass from flang, use `-fdo-concurrent-parallel=`. The option is only active when OpenMP is enabled throught `-fopenmp`. --- clang/include/clang/Driver/Options.td | 4 + clang/lib/Driver/ToolChains/Flang.cpp | 3 +- .../include/flang/Frontend/CodeGenOptions.def | 2 + flang/include/flang/Frontend/CodeGenOptions.h | 8 + .../flang/Optimizer/Transforms/Passes.h | 2 + .../flang/Optimizer/Transforms/Passes.td | 20 ++ flang/lib/Frontend/CompilerInvocation.cpp | 28 +++ flang/lib/Frontend/FrontendActions.cpp | 29 ++- flang/lib/Optimizer/Transforms/CMakeLists.txt | 1 + .../Transforms/DoConcurrentConversion.cpp | 206 ++++++++++++++++++ flang/test/Driver/driver-help-hidden.f90 | 2 + flang/test/Driver/driver-help.f90 | 4 + flang/test/Transforms/DoConcurrent/basic.f90 | 44 ++++ flang/test/Transforms/DoConcurrent/basic.mlir | 60 +++++ 14 files changed, 410 insertions(+), 3 deletions(-) create mode 100644 flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp create mode 100644 flang/test/Transforms/DoConcurrent/basic.f90 create mode 100644 flang/test/Transforms/DoConcurrent/basic.mlir diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 3a028fadb25b18..682c0d2c671b94 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6473,6 +6473,10 @@ defm stack_arrays : BoolOptionWithoutMarshalling<"f", "stack-arrays", defm loop_versioning : BoolOptionWithoutMarshalling<"f", "version-loops-for-stride", PosFlag, NegFlag>; + +def do_concurrent_parallel_EQ 
: Joined<["-"], "fdo-concurrent-parallel=">, + HelpText<"Try to map `do concurrent` loops to OpenMP (on host or device)">, + Values<"none,host,device">; } // let Visibility = [FC1Option, FlangOption] def J : JoinedOrSeparate<["-"], "J">, diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 6168b42dc78292..bb66ae75be2bd6 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -150,7 +150,8 @@ void Flang::addCodegenOptions(const ArgList &Args, options::OPT_flang_deprecated_no_hlfir, options::OPT_flang_experimental_polymorphism, options::OPT_fno_ppc_native_vec_elem_order, - options::OPT_fppc_native_vec_elem_order}); + options::OPT_fppc_native_vec_elem_order, + options::OPT_do_concurrent_parallel_EQ}); } void Flang::addPicOptions(const ArgList &Args, ArgStringList &CmdArgs) const { diff --git a/flang/include/flang/Frontend/CodeGenOptions.def b/flang/include/flang/Frontend/CodeGenOptions.def index 9d03ec88a56b8a..a6128130baadc3 100644 --- a/flang/include/flang/Frontend/CodeGenOptions.def +++ b/flang/include/flang/Frontend/CodeGenOptions.def @@ -40,5 +40,7 @@ ENUM_CODEGENOPT(DebugInfo, llvm::codegenoptions::DebugInfoKind, 4, llvm::codeg ENUM_CODEGENOPT(VecLib, llvm::driver::VectorLibrary, 3, llvm::driver::VectorLibrary::NoLibrary) ///< Vector functions library to use ENUM_CODEGENOPT(FramePointer, llvm::FramePointerKind, 2, llvm::FramePointerKind::None) ///< Enable the usage of frame pointers +ENUM_CODEGENOPT(DoConcurrentMapping, DoConcurrentMappingKind, 2, DoConcurrentMappingKind::DCMK_None) ///< Map `do concurrent` to OpenMP + #undef CODEGENOPT #undef ENUM_CODEGENOPT diff --git a/flang/include/flang/Frontend/CodeGenOptions.h b/flang/include/flang/Frontend/CodeGenOptions.h index 0c318e4023af43..cd5c5e592d8926 100644 --- a/flang/include/flang/Frontend/CodeGenOptions.h +++ b/flang/include/flang/Frontend/CodeGenOptions.h @@ -129,6 +129,14 @@ class CodeGenOptions : public CodeGenOptionsBase 
{ /// transformation. OptRemark OptimizationRemarkAnalysis; + /// Optionally map `do concurrent` loops to OpenMP. This is only valid of + /// OpenMP is enabled. + enum class DoConcurrentMappingKind { + DCMK_None, // Do not lower `do concurrent` to OpenMP. + DCMK_Host, // Lower to run in parallel on the CPU. + DCMK_Device // Lower to run in parallel on the GPU. + }; + // Define accessors/mutators for code generation options of enumeration type. #define CODEGENOPT(Name, Bits, Default) #define ENUM_CODEGENOPT(Name, Type, Bits, Default) \ diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h index e1d22c8c986da7..25a526ab0cbfcb 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.h +++ b/flang/include/flang/Optimizer/Transforms/Passes.h @@ -96,6 +96,8 @@ createFunctionAttrPass(FunctionAttrTypes &functionAttr, bool noInfsFPMath, bool noNaNsFPMath, bool approxFuncFPMath, bool noSignedZerosFPMath, bool unsafeFPMath); +std::unique_ptr createDoConcurrentConversionPass(); + // declarative passes #define GEN_PASS_REGISTRATION #include "flang/Optimizer/Transforms/Passes.h.inc" diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td index 5fb576fd876254..06de4a1d28a929 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -397,4 +397,24 @@ def FunctionAttr : Pass<"function-attr", "mlir::func::FuncOp"> { let constructor = "::fir::createFunctionAttrPass()"; } +def DoConcurrentConversionPass : Pass<"fopenmp-do-concurrent-conversion", "mlir::func::FuncOp"> { + let summary = "Map `DO CONCURRENT` loops to OpenMP worksharing loops."; + + let description = [{ This is an experimental pass to map `DO CONCURRENR` loops + to their correspnding equivalent OpenMP worksharing constructs. + + For now the following is supported: + - Mapping simple loops to `parallel do`. 
+ + Still to TODO: + - More extensive testing. + - Mapping to `target teams distribute parallel do`. + - Allowing the user to control mapping behavior: either to the host or + target. + }]; + + let constructor = "::fir::createDoConcurrentConversionPass()"; + let dependentDialects = ["mlir::omp::OpenMPDialect"]; +} + #endif // FLANG_OPTIMIZER_TRANSFORMS_PASSES diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 4707de0e976ca7..79663b89942c0f 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -154,6 +154,32 @@ static bool parseDebugArgs(Fortran::frontend::CodeGenOptions &opts, return true; } +static bool parseDoConcurrentMapping(Fortran::frontend::CodeGenOptions &opts, + llvm::opt::ArgList &args, + clang::DiagnosticsEngine &diags) { + llvm::opt::Arg *arg = + args.getLastArg(clang::driver::options::OPT_do_concurrent_parallel_EQ); + if (!arg) + return true; + + using DoConcurrentMappingKind = Fortran::frontend::CodeGenOptions::DoConcurrentMappingKind; + std::optional val = + llvm::StringSwitch>( + arg->getValue()) + .Case("none", DoConcurrentMappingKind::DCMK_None) + .Case("host", DoConcurrentMappingKind::DCMK_Host) + .Case("device", DoConcurrentMappingKind::DCMK_Device) + .Default(std::nullopt); + + if (!val.has_value()) { + diags.Report(clang::diag::err_drv_invalid_value) + << arg->getAsString(args) << arg->getValue(); + return false; + } + opts.setDoConcurrentMapping(val.value()); + return true; +} + static bool parseVectorLibArg(Fortran::frontend::CodeGenOptions &opts, llvm::opt::ArgList &args, clang::DiagnosticsEngine &diags) { @@ -385,6 +411,8 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, clang::driver::options::OPT_funderscoring, false)) { opts.Underscoring = 0; } + + parseDoConcurrentMapping(opts, args, diags); } /// Parses all target input arguments and populates the target diff --git a/flang/lib/Frontend/FrontendActions.cpp 
b/flang/lib/Frontend/FrontendActions.cpp index 849b3c8e4dc027..b2f7a7abf64ac9 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -320,8 +320,9 @@ bool CodeGenAction::beginSourceFileAction() { // Add OpenMP-related passes // WARNING: These passes must be run immediately after the lowering to ensure // that the FIR is correct with respect to OpenMP operations/attributes. - if (ci.getInvocation().getFrontendOpts().features.IsEnabled( - Fortran::common::LanguageFeature::OpenMP)) { + bool isOpenMPEnabled = ci.getInvocation().getFrontendOpts().features.IsEnabled( + Fortran::common::LanguageFeature::OpenMP); + if (isOpenMPEnabled) { bool isDevice = false; if (auto offloadMod = llvm::dyn_cast( mlirModule->getOperation())) @@ -332,6 +333,30 @@ bool CodeGenAction::beginSourceFileAction() { fir::createOpenMPFIRPassPipeline(pm, isDevice); } + using DoConcurrentMappingKind = + Fortran::frontend::CodeGenOptions::DoConcurrentMappingKind; + DoConcurrentMappingKind selectedKind = ci.getInvocation().getCodeGenOpts().getDoConcurrentMapping(); + if (selectedKind != DoConcurrentMappingKind::DCMK_None) { + if (!isOpenMPEnabled) { + unsigned diagID = ci.getDiagnostics().getCustomDiagID( + clang::DiagnosticsEngine::Warning, + "lowering `do concurrent` loops to OpenMP is only supported if " + "OpenMP is enabled"); + ci.getDiagnostics().Report(diagID); + } else { + bool mapToDevice = selectedKind == DoConcurrentMappingKind::DCMK_Device; + + if (mapToDevice) { + unsigned diagID = ci.getDiagnostics().getCustomDiagID( + clang::DiagnosticsEngine::Warning, + "TODO: lowering `do concurrent` loops to OpenMP device is not " + "supported yet"); + ci.getDiagnostics().Report(diagID); + } else + pm.addPass(fir::createDoConcurrentConversionPass()); + } + } + pm.enableVerifier(/*verifyPasses=*/true); pm.addPass(std::make_unique()); diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt index 
ba2e267996150e..cf83bb496bb5e8 100644 --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -22,6 +22,7 @@ add_flang_library(FIRTransforms OMPMarkDeclareTarget.cpp VScaleAttr.cpp FunctionAttr.cpp + DoConcurrentConversion.cpp DEPENDS FIRDialect diff --git a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp new file mode 100644 index 00000000000000..4534d514b86d76 --- /dev/null +++ b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp @@ -0,0 +1,206 @@ +//===- DoConcurrentConversion.cpp -- map `DO CONCURRENT` to OpenMP loops --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "flang/Optimizer/Dialect/FIRDialect.h" +#include "flang/Optimizer/Dialect/FIROps.h" +#include "flang/Optimizer/Dialect/FIRType.h" +#include "flang/Optimizer/Dialect/Support/FIRContext.h" +#include "flang/Optimizer/HLFIR/HLFIRDialect.h" +#include "flang/Optimizer/Transforms/Passes.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/OpenMP/OpenMPDialect.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/DialectConversion.h" + +#include + +namespace fir { +#define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS +#include "flang/Optimizer/Transforms/Passes.h.inc" +} // namespace fir + +#define DEBUG_TYPE "fopenmp-do-concurrent-conversion" + +namespace { +class DoConcurrentConversion : public mlir::OpConversionPattern { +public: + using mlir::OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult + matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + 
mlir::OpPrintingFlags flags; + flags.printGenericOpForm(); + + mlir::omp::ParallelOp parallelOp = + rewriter.create(doLoop.getLoc()); + + mlir::Block *block = rewriter.createBlock(&parallelOp.getRegion()); + + rewriter.setInsertionPointToEnd(block); + rewriter.create(doLoop.getLoc()); + + rewriter.setInsertionPointToStart(block); + + // ==== TODO (1) Start ==== + // + // The goal of the few lines below is to collect and clone + // the list of operations that define the loop's lower and upper bounds as + // well as the step. Should we, instead of doing this here, split it into 2 + // stages? + // + // 1. **Stage 1**: add an analysis that extracts all the relevant + // operations defining the lower-bound, upper-bound, and + // step. + // 2. **Stage 2**: clone the collected operations in the parallel region. + // + // So far, the pass has been tested with very simple loops (where the bounds + // and step are constants) so the goal of **Stage 1** is to have a + // well-defined component that has the sole responsibility of collecting all + // the ops relevant to the loop header. This way we can test this + // in isolation for more complex loops and better organize the code. **Stage + // 2** would then be responsible for the actual cloning of the collected + // loop header preparation/allocation operations. + + // Clone the LB, UB, step defining ops inside the parallel region. 
+ llvm::SmallVector lowerBound, upperBound, step; + lowerBound.push_back( + rewriter.clone(*doLoop.getLowerBound().getDefiningOp())->getResult(0)); + upperBound.push_back( + rewriter.clone(*doLoop.getUpperBound().getDefiningOp())->getResult(0)); + step.push_back( + rewriter.clone(*doLoop.getStep().getDefiningOp())->getResult(0)); + // ==== TODO (1) End ==== + + auto wsLoopOp = rewriter.create( + doLoop.getLoc(), lowerBound, upperBound, step); + wsLoopOp.setInclusive(true); + + auto outlineableOp = + mlir::dyn_cast(*parallelOp); + rewriter.setInsertionPointToStart(outlineableOp.getAllocaBlock()); + + // ==== TODO (2) Start ==== + // + // The goal of the following simple work-list algorithm and + // the following `for` loop is to collect all the operations related to the + // allocation of the induction variable for the `do concurrent` loop. The + // operations collected by this algorithm are very similar to what is + // usually emitted for privatized variables, e.g. for omp.parallel loops. + // Therefore, I think we can: + // + // 1. **Stage 1**: Add an analysis that collects all these operations. The + // goal is similar to **Stage 1** of TODO (1): isolate the + // algorithm as an individually-testable component so that + // we properly implement and test it for more complicated + // `do concurrent` loops. + // 2. **Stage 2**: Using the collected operations, create and populate an + // `omp.private {type=private}` op to serve as the + // delayed privatizer for the new work-sharing loop. + + // For the induction variable, we need to privatize its allocation and + // binding inside the parallel region. + llvm::SmallSetVector workList; + // Therefore, we first discover the induction variable by discovering + // `fir.store`s where the source is the loop's block argument. 
+ workList.insert(doLoop.getInductionVar().getUsers().begin(), + doLoop.getInductionVar().getUsers().end()); + llvm::SmallSetVector inductionVarTargetStores; + + // Walk the def-chain of the loop's block argument until we hit `fir.store`. + while (!workList.empty()) { + mlir::Operation *item = workList.front(); + + if (auto storeOp = mlir::dyn_cast(item)) { + inductionVarTargetStores.insert(storeOp); + } else { + workList.insert(item->getUsers().begin(), item->getUsers().end()); + } + + workList.remove(item); + } + + // For each collected `fir.sotre`, find the target memref's alloca's and + // declare ops. + llvm::SmallSetVector declareAndAllocasToClone; + for (auto storeOp : inductionVarTargetStores) { + mlir::Operation *storeTarget = storeOp.getMemref().getDefiningOp(); + + for (auto operand : storeTarget->getOperands()) { + declareAndAllocasToClone.insert(operand.getDefiningOp()); + } + declareAndAllocasToClone.insert(storeTarget); + } + // ==== TODO (2) End ==== + // + // TODO (1 & 2): Isolating analyses proposed in both TODOs, I think we can + // more easily generalize the pass to work for targets other than OpenMP, + // e.g. OpenACC, I think can, can reuse the results of the analyses and only + // change the code-gen/rewriting. + + mlir::IRMapping mapper; + + // Collect the memref defining ops in the parallel region. + for (mlir::Operation *opToClone : declareAndAllocasToClone) { + rewriter.clone(*opToClone, mapper); + } + + // Clone the loop's body inside the worksharing construct using the mapped + // memref values. 
+ rewriter.cloneRegionBefore(doLoop.getRegion(), wsLoopOp.getRegion(), + wsLoopOp.getRegion().begin(), mapper); + + mlir::Operation *terminator = wsLoopOp.getRegion().back().getTerminator(); + rewriter.setInsertionPointToEnd(&wsLoopOp.getRegion().back()); + rewriter.create(terminator->getLoc()); + rewriter.eraseOp(terminator); + + rewriter.eraseOp(doLoop); + + return mlir::success(); + } +}; + +class DoConcurrentConversionPass + : public fir::impl::DoConcurrentConversionPassBase< + DoConcurrentConversionPass> { +public: + void runOnOperation() override { + mlir::func::FuncOp func = getOperation(); + + if (func.isDeclaration()) { + return; + } + + auto *context = &getContext(); + mlir::RewritePatternSet patterns(context); + patterns.insert(context); + mlir::ConversionTarget target(*context); + target.addLegalDialect(); + + target.addDynamicallyLegalOp( + [](fir::DoLoopOp op) { return !op.getUnordered(); }); + + if (mlir::failed(mlir::applyFullConversion(getOperation(), target, + std::move(patterns)))) { + mlir::emitError(mlir::UnknownLoc::get(context), + "error in converting do-concurrent op"); + signalPassFailure(); + } + } +}; +} // namespace + +std::unique_ptr fir::createDoConcurrentConversionPass() { + return std::make_unique(); +} + diff --git a/flang/test/Driver/driver-help-hidden.f90 b/flang/test/Driver/driver-help-hidden.f90 index 44dbac44772b29..b379138db13913 100644 --- a/flang/test/Driver/driver-help-hidden.f90 +++ b/flang/test/Driver/driver-help-hidden.f90 @@ -35,6 +35,8 @@ ! CHECK-NEXT: -fdefault-double-8 Set the default double precision kind to an 8 byte wide type ! CHECK-NEXT: -fdefault-integer-8 Set the default integer and logical kind to an 8 byte wide type ! CHECK-NEXT: -fdefault-real-8 Set the default real kind to an 8 byte wide type +! CHECK-NEXT: -fdo-concurrent-parallel= +! CHECK-NEXT: Try to map `do concurrent` loops to OpenMP (on host or device) ! CHECK-NEXT: -ffast-math Allow aggressive, lossy floating-point optimizations ! 
CHECK-NEXT: -ffixed-form Process source files in fixed form ! CHECK-NEXT: -ffixed-line-length= diff --git a/flang/test/Driver/driver-help.f90 b/flang/test/Driver/driver-help.f90 index b4280a454e3128..cf645e8c3d3847 100644 --- a/flang/test/Driver/driver-help.f90 +++ b/flang/test/Driver/driver-help.f90 @@ -31,6 +31,8 @@ ! HELP-NEXT: -fdefault-double-8 Set the default double precision kind to an 8 byte wide type ! HELP-NEXT: -fdefault-integer-8 Set the default integer and logical kind to an 8 byte wide type ! HELP-NEXT: -fdefault-real-8 Set the default real kind to an 8 byte wide type +! HELP-NEXT: -fdo-concurrent-parallel= +! HELP-NEXT: Try to map `do concurrent` loops to OpenMP (on host or device) ! HELP-NEXT: -ffast-math Allow aggressive, lossy floating-point optimizations ! HELP-NEXT: -ffixed-form Process source files in fixed form ! HELP-NEXT: -ffixed-line-length= @@ -186,6 +188,8 @@ ! HELP-FC1-NEXT: -fdefault-double-8 Set the default double precision kind to an 8 byte wide type ! HELP-FC1-NEXT: -fdefault-integer-8 Set the default integer and logical kind to an 8 byte wide type ! HELP-FC1-NEXT: -fdefault-real-8 Set the default real kind to an 8 byte wide type +! HELP-FC1-NEXT: -fdo-concurrent-parallel= +! HELP-FC1-NEXT: Try to map `do concurrent` loops to OpenMP (on host or device) ! HELP-FC1-NEXT: -fembed-offload-object= ! HELP-FC1-NEXT: Embed Offloading device-side binary into host object file as a section. ! HELP-FC1-NEXT: -ffast-math Allow aggressive, lossy floating-point optimizations diff --git a/flang/test/Transforms/DoConcurrent/basic.f90 b/flang/test/Transforms/DoConcurrent/basic.f90 new file mode 100644 index 00000000000000..a555a25c9bad5d --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/basic.f90 @@ -0,0 +1,44 @@ +! Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`. + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=host %s -o - \ +! RUN: | FileCheck %s + +! 
CHECK-LABEL: do_concurrent_basic +program do_concurrent_basic + ! CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + ! CHECK: %[[C1:.*]] = arith.constant 1 : i32 + ! CHECK: %[[C10:.*]] = arith.constant 10 : i32 + implicit none + integer :: a(10) + integer :: i + + ! CHECK-NOT: fir.do_loop + + ! CHECK: omp.parallel { + + ! CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"} + ! CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + + ! CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index + ! CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index + ! CHECK: %[[STEP:.*]] = arith.constant 1 : index + + ! CHECK: omp.wsloop for (%[[ARG0:.*]]) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK-NEXT: %[[IV_IDX:.*]] = fir.convert %[[ARG0]] : (index) -> i32 + ! CHECK-NEXT: fir.store %[[IV_IDX]] to %[[BINDING]]#1 : !fir.ref + ! CHECK-NEXT: %[[IV_VAL1:.*]] = fir.load %[[BINDING]]#0 : !fir.ref + ! CHECK-NEXT: %[[IV_VAL2:.*]] = fir.load %[[BINDING]]#0 : !fir.ref + ! CHECK-NEXT: %[[IV_VAL_I64:.*]] = fir.convert %[[IV_VAL2]] : (i32) -> i64 + ! CHECK-NEXT: %[[ARR_ACCESS:.*]] = hlfir.designate %[[ARR]]#0 (%[[IV_VAL_I64]]) : (!fir.ref>, i64) -> !fir.ref + ! CHECK-NEXT: hlfir.assign %[[IV_VAL1]] to %[[ARR_ACCESS]] : i32, !fir.ref + ! CHECK-NEXT: omp.yield + ! CHECK-NEXT: } + + ! CHECK-NEXT: omp.terminator + ! CHECK-NEXT: } + do concurrent (integer :: i=1:10) + a(i) = i + end do + + ! CHECK-NOT: fir.do_loop +end program do_concurrent_basic diff --git a/flang/test/Transforms/DoConcurrent/basic.mlir b/flang/test/Transforms/DoConcurrent/basic.mlir new file mode 100644 index 00000000000000..7d62463f36d422 --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/basic.mlir @@ -0,0 +1,60 @@ +// Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`. 
+ +// RUN: fir-opt --fopenmp-do-concurrent-conversion %s | FileCheck %s + +// CHECK-LABEL: func.func @do_concurrent_basic +func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_basic"} { + // CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + // CHECK: %[[C1:.*]] = arith.constant 1 : i32 + // CHECK: %[[C10:.*]] = arith.constant 10 : i32 + + %0 = fir.alloca i32 {bindc_name = "i"} + %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %2 = fir.address_of(@_QFEa) : !fir.ref> + %c10 = arith.constant 10 : index + %3 = fir.shape %c10 : (index) -> !fir.shape<1> + %4:2 = hlfir.declare %2(%3) {uniq_name = "_QFEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) + %c1_i32 = arith.constant 1 : i32 + %7 = fir.convert %c1_i32 : (i32) -> index + %c10_i32 = arith.constant 10 : i32 + %8 = fir.convert %c10_i32 : (i32) -> index + %c1 = arith.constant 1 : index + + // CHECK-NOT: fir.do_loop + + // CHECK: omp.parallel { + + // CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"} + // CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + + // CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index + // CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index + // CHECK: %[[STEP:.*]] = arith.constant 1 : index + + // CHECK: omp.wsloop for (%[[ARG0:.*]]) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + // CHECK-NEXT: %[[IV_IDX:.*]] = fir.convert %[[ARG0]] : (index) -> i32 + // CHECK-NEXT: fir.store %[[IV_IDX]] to %[[BINDING]]#1 : !fir.ref + // CHECK-NEXT: %[[IV_VAL1:.*]] = fir.load %[[BINDING]]#0 : !fir.ref + // CHECK-NEXT: %[[IV_VAL2:.*]] = fir.load %[[BINDING]]#0 : !fir.ref + // CHECK-NEXT: %[[IV_VAL_I64:.*]] = fir.convert %[[IV_VAL2]] : (i32) -> i64 + // CHECK-NEXT: %[[ARR_ACCESS:.*]] = hlfir.designate %[[ARR]]#0 (%[[IV_VAL_I64]]) : (!fir.ref>, i64) 
-> !fir.ref + // CHECK-NEXT: hlfir.assign %[[IV_VAL1]] to %[[ARR_ACCESS]] : i32, !fir.ref + // CHECK-NEXT: omp.yield + // CHECK-NEXT: } + + // CHECK-NEXT: omp.terminator + // CHECK-NEXT: } + fir.do_loop %arg0 = %7 to %8 step %c1 unordered { + %13 = fir.convert %arg0 : (index) -> i32 + fir.store %13 to %1#1 : !fir.ref + %14 = fir.load %1#0 : !fir.ref + %15 = fir.load %1#0 : !fir.ref + %16 = fir.convert %15 : (i32) -> i64 + %17 = hlfir.designate %4#0 (%16) : (!fir.ref>, i64) -> !fir.ref + hlfir.assign %14 to %17 : i32, !fir.ref + } + + // CHECK-NOT: fir.do_loop + + return + } From 385768aff2412c1b8e1e0e10f6911c5cd15b50e3 Mon Sep 17 00:00:00 2001 From: ergawy Date: Mon, 11 Mar 2024 06:41:50 -0500 Subject: [PATCH 2/2] Handle review comments. --- .../flang/Optimizer/Transforms/Passes.td | 2 +- .../Transforms/DoConcurrentConversion.cpp | 40 +++++++++++++++---- 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td index 06de4a1d28a929..95ffd199bc0fe4 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -400,7 +400,7 @@ def FunctionAttr : Pass<"function-attr", "mlir::func::FuncOp"> { def DoConcurrentConversionPass : Pass<"fopenmp-do-concurrent-conversion", "mlir::func::FuncOp"> { let summary = "Map `DO CONCURRENT` loops to OpenMP worksharing loops."; - let description = [{ This is an experimental pass to map `DO CONCURRENR` loops + let description = [{ This is an experimental pass to map `DO CONCURRENT` loops to their correspnding equivalent OpenMP worksharing constructs. 
For now the following is supported: diff --git a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp index 4534d514b86d76..f5a3d925ab5d9a 100644 --- a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp @@ -70,13 +70,39 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { // loop header preparation/allocation operations. // Clone the LB, UB, step defining ops inside the parallel region. + mlir::Operation* lbOp = doLoop.getLowerBound().getDefiningOp(); + mlir::Operation* ubOp = doLoop.getUpperBound().getDefiningOp(); + mlir::Operation* stepOp = doLoop.getStep().getDefiningOp(); + + if (lbOp == nullptr || ubOp == nullptr || stepOp == nullptr) { + return rewriter.notifyMatchFailure( + doLoop, "At least one of the loop's LB, UB, or step doesn't have a " + "defining operation."); + } + + std::function isOpUltimatelyConstant = + [&](mlir::Operation *operation) { + if (mlir::isa_and_present(operation)) + return true; + + if (fir::ConvertOp convertOp = + mlir::dyn_cast_if_present(operation)) + return isOpUltimatelyConstant(convertOp.getValue().getDefiningOp()); + + return false; + }; + + if (!isOpUltimatelyConstant(lbOp) || !isOpUltimatelyConstant(ubOp) || + !isOpUltimatelyConstant(stepOp)) { + return rewriter.notifyMatchFailure( + doLoop, "`do concurrent` conversion is currently only supported for " + "constant LB, UB, and step values."); + } + llvm::SmallVector lowerBound, upperBound, step; - lowerBound.push_back( - rewriter.clone(*doLoop.getLowerBound().getDefiningOp())->getResult(0)); - upperBound.push_back( - rewriter.clone(*doLoop.getUpperBound().getDefiningOp())->getResult(0)); - step.push_back( - rewriter.clone(*doLoop.getStep().getDefiningOp())->getResult(0)); + lowerBound.push_back(rewriter.clone(*lbOp)->getResult(0)); + upperBound.push_back(rewriter.clone(*ubOp)->getResult(0)); + 
step.push_back(rewriter.clone(*stepOp)->getResult(0)); // ==== TODO (1) End ==== auto wsLoopOp = rewriter.create( @@ -127,7 +153,7 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { workList.remove(item); } - // For each collected `fir.sotre`, find the target memref's alloca's and + // For each collected `fir.store`, find the target memref's alloca's and // declare ops. llvm::SmallSetVector declareAndAllocasToClone; for (auto storeOp : inductionVarTargetStores) {