diff --git a/flang/lib/Lower/OpenMP/Clauses.h b/flang/include/flang/Lower/OpenMP/Clauses.h similarity index 100% rename from flang/lib/Lower/OpenMP/Clauses.h rename to flang/include/flang/Lower/OpenMP/Clauses.h diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/include/flang/Lower/OpenMP/Utils.h similarity index 89% rename from flang/lib/Lower/OpenMP/Utils.h rename to flang/include/flang/Lower/OpenMP/Utils.h index 8fbb18fa8656ff..520349bb4ba53b 100644 --- a/flang/lib/Lower/OpenMP/Utils.h +++ b/flang/include/flang/Lower/OpenMP/Utils.h @@ -59,7 +59,7 @@ struct OmpMapMemberIndicesData { }; mlir::omp::MapInfoOp -createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc, +createMapInfoOp(mlir::OpBuilder &builder, mlir::Location loc, mlir::Value baseAddr, mlir::Value varPtrPtr, std::string name, mlir::ArrayRef bounds, mlir::ArrayRef members, @@ -102,6 +102,15 @@ void genObjectList(const ObjectList &objects, Fortran::lower::AbstractConverter &converter, llvm::SmallVectorImpl &operands); +// TODO: consider moving this to the `omp.loop_nest` op. 
Would be something like +// this: +// +// ``` +// mlir::Value LoopNestOp::calculateTripCount(mlir::OpBuilder &builder, +// mlir::OpBuilder::InsertPoint ip) +// ``` +mlir::Value calculateTripCount(fir::FirOpBuilder &builder, mlir::Location loc, + const mlir::omp::CollapseClauseOps &ops); } // namespace omp } // namespace lower } // namespace Fortran diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h index 4b825c426f0299..7405796cb565cd 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.h +++ b/flang/include/flang/Optimizer/Transforms/Passes.h @@ -38,6 +38,7 @@ namespace fir { #define GEN_PASS_DECL_ARRAYVALUECOPY #define GEN_PASS_DECL_CHARACTERCONVERSION #define GEN_PASS_DECL_CFGCONVERSION +#define GEN_PASS_DECL_DOCONCURRENTCONVERSIONPASS #define GEN_PASS_DECL_EXTERNALNAMECONVERSION #define GEN_PASS_DECL_MEMREFDATAFLOWOPT #define GEN_PASS_DECL_SIMPLIFYINTRINSICS @@ -88,7 +89,7 @@ createFunctionAttrPass(FunctionAttrTypes &functionAttr, bool noInfsFPMath, bool noNaNsFPMath, bool approxFuncFPMath, bool noSignedZerosFPMath, bool unsafeFPMath); -std::unique_ptr createDoConcurrentConversionPass(); +std::unique_ptr createDoConcurrentConversionPass(bool mapToDevice); void populateCfgConversionRewrites(mlir::RewritePatternSet &patterns, bool forceLoopToExecuteOnce = false); diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td index 7cecba9e998469..540ff6fa960f34 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -416,8 +416,12 @@ def DoConcurrentConversionPass : Pass<"fopenmp-do-concurrent-conversion", "mlir: target. 
}]; - let constructor = "::fir::createDoConcurrentConversionPass()"; let dependentDialects = ["mlir::omp::OpenMPDialect"]; + + let options = [ + Option<"mapTo", "map-to", "std::string", "", + "Try to map `do concurrent` loops to OpenMP (on host or device)">, + ]; } #endif // FLANG_OPTIMIZER_TRANSFORMS_PASSES diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc index cc3431d5b71d29..876ccb2d4a2a4a 100644 --- a/flang/include/flang/Tools/CLOptions.inc +++ b/flang/include/flang/Tools/CLOptions.inc @@ -324,6 +324,9 @@ inline void createHLFIRToFIRPassPipeline( pm.addPass(hlfir::createConvertHLFIRtoFIRPass()); } +using DoConcurrentMappingKind = + Fortran::frontend::CodeGenOptions::DoConcurrentMappingKind; + /// Create a pass pipeline for handling certain OpenMP transformations needed /// prior to FIR lowering. /// @@ -333,8 +336,12 @@ inline void createHLFIRToFIRPassPipeline( /// \param pm - MLIR pass manager that will hold the pipeline definition. /// \param isTargetDevice - Whether code is being generated for a target device /// rather than the host device. 
-inline void createOpenMPFIRPassPipeline( - mlir::PassManager &pm, bool isTargetDevice) { +inline void createOpenMPFIRPassPipeline(mlir::PassManager &pm, + bool isTargetDevice, DoConcurrentMappingKind doConcurrentMappingKind) { + if (doConcurrentMappingKind != DoConcurrentMappingKind::DCMK_None) + pm.addPass(fir::createDoConcurrentConversionPass( + doConcurrentMappingKind == DoConcurrentMappingKind::DCMK_Device)); + pm.addPass(fir::createOMPMapInfoFinalizationPass()); pm.addPass(fir::createOMPMarkDeclareTargetPass()); if (isTargetDevice) diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index ab66c6f921ae55..79a6f1101b1685 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -320,41 +320,34 @@ bool CodeGenAction::beginSourceFileAction() { // Add OpenMP-related passes // WARNING: These passes must be run immediately after the lowering to ensure // that the FIR is correct with respect to OpenMP operations/attributes. 
- bool isOpenMPEnabled = ci.getInvocation().getFrontendOpts().features.IsEnabled( + bool isOpenMPEnabled = + ci.getInvocation().getFrontendOpts().features.IsEnabled( Fortran::common::LanguageFeature::OpenMP); + + using DoConcurrentMappingKind = + Fortran::frontend::CodeGenOptions::DoConcurrentMappingKind; + DoConcurrentMappingKind doConcurrentMappingKind = + ci.getInvocation().getCodeGenOpts().getDoConcurrentMapping(); + + if (doConcurrentMappingKind != DoConcurrentMappingKind::DCMK_None && + !isOpenMPEnabled) { + unsigned diagID = ci.getDiagnostics().getCustomDiagID( + clang::DiagnosticsEngine::Warning, + "lowering `do concurrent` loops to OpenMP is only supported if " + "OpenMP is enabled"); + ci.getDiagnostics().Report(diagID); + } + if (isOpenMPEnabled) { bool isDevice = false; if (auto offloadMod = llvm::dyn_cast( mlirModule->getOperation())) isDevice = offloadMod.getIsTargetDevice(); + // WARNING: This pipeline must be run immediately after the lowering to // ensure that the FIR is correct with respect to OpenMP operations/ // attributes. 
- fir::createOpenMPFIRPassPipeline(pm, isDevice); - } - - using DoConcurrentMappingKind = - Fortran::frontend::CodeGenOptions::DoConcurrentMappingKind; - DoConcurrentMappingKind selectedKind = ci.getInvocation().getCodeGenOpts().getDoConcurrentMapping(); - if (selectedKind != DoConcurrentMappingKind::DCMK_None) { - if (!isOpenMPEnabled) { - unsigned diagID = ci.getDiagnostics().getCustomDiagID( - clang::DiagnosticsEngine::Warning, - "lowering `do concurrent` loops to OpenMP is only supported if " - "OpenMP is enabled"); - ci.getDiagnostics().Report(diagID); - } else { - bool mapToDevice = selectedKind == DoConcurrentMappingKind::DCMK_Device; - - if (mapToDevice) { - unsigned diagID = ci.getDiagnostics().getCustomDiagID( - clang::DiagnosticsEngine::Warning, - "TODO: lowering `do concurrent` loops to OpenMP device is not " - "supported yet"); - ci.getDiagnostics().Report(diagID); - } else - pm.addPass(fir::createDoConcurrentConversionPass()); - } + fir::createOpenMPFIRPassPipeline(pm, isDevice, doConcurrentMappingKind); } pm.enableVerifier(/*verifyPasses=*/true); diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index b6526b1f713bfb..816ae6f405ca17 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -11,8 +11,8 @@ //===----------------------------------------------------------------------===// #include "ClauseProcessor.h" -#include "Clauses.h" +#include "flang/Lower/OpenMP/Clauses.h" #include "flang/Lower/PFTBuilder.h" #include "flang/Parser/tools.h" #include "flang/Semantics/tools.h" diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index fe2015cb239dc8..eb5d1a9c09852a 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -12,12 +12,12 @@ #ifndef FORTRAN_LOWER_CLAUASEPROCESSOR_H #define FORTRAN_LOWER_CLAUASEPROCESSOR_H -#include "Clauses.h" #include 
"DirectivesCommon.h" #include "ReductionProcessor.h" -#include "Utils.h" #include "flang/Lower/AbstractConverter.h" #include "flang/Lower/Bridge.h" +#include "flang/Lower/OpenMP/Clauses.h" +#include "flang/Lower/OpenMP/Utils.h" #include "flang/Optimizer/Builder/Todo.h" #include "flang/Parser/dump-parse-tree.h" #include "flang/Parser/parse-tree.h" diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp index 97337cfc08c72a..b3f8c73d8753d4 100644 --- a/flang/lib/Lower/OpenMP/Clauses.cpp +++ b/flang/lib/Lower/OpenMP/Clauses.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "Clauses.h" +#include "flang/Lower/OpenMP/Clauses.h" #include "flang/Common/idioms.h" #include "flang/Evaluate/expression.h" diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index eba7a130378d71..ddde5f2fe4e641 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -12,7 +12,7 @@ #include "DataSharingProcessor.h" -#include "Utils.h" +#include "flang/Lower/OpenMP/Utils.h" #include "flang/Lower/PFTBuilder.h" #include "flang/Lower/SymbolMap.h" #include "flang/Optimizer/Builder/HLFIRTools.h" diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.h b/flang/lib/Lower/OpenMP/DataSharingProcessor.h index 4d7ce256caf842..8fa9f2718be3d4 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.h +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.h @@ -12,9 +12,9 @@ #ifndef FORTRAN_LOWER_DATASHARINGPROCESSOR_H #define FORTRAN_LOWER_DATASHARINGPROCESSOR_H -#include "Clauses.h" #include "flang/Lower/AbstractConverter.h" #include "flang/Lower/OpenMP.h" +#include "flang/Lower/OpenMP/Clauses.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Parser/parse-tree.h" #include "flang/Semantics/symbol.h" diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 
672ccf94afafae..b3bd54a639222c 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -13,15 +13,15 @@ #include "flang/Lower/OpenMP.h" #include "ClauseProcessor.h" -#include "Clauses.h" #include "DataSharingProcessor.h" #include "DirectivesCommon.h" #include "ReductionProcessor.h" -#include "Utils.h" #include "flang/Common/idioms.h" #include "flang/Lower/Bridge.h" #include "flang/Lower/ConvertExpr.h" #include "flang/Lower/ConvertVariable.h" +#include "flang/Lower/OpenMP/Clauses.h" +#include "flang/Lower/OpenMP/Utils.h" #include "flang/Lower/StatementContext.h" #include "flang/Lower/SymbolMap.h" #include "flang/Optimizer/Builder/BoxValue.h" @@ -280,84 +280,6 @@ static void threadPrivatizeVars(Fortran::lower::AbstractConverter &converter, } } -static mlir::Value -calculateTripCount(Fortran::lower::AbstractConverter &converter, - mlir::Location loc, - const mlir::omp::CollapseClauseOps &ops) { - using namespace mlir::arith; - assert(ops.loopLBVar.size() == ops.loopUBVar.size() && - ops.loopLBVar.size() == ops.loopStepVar.size() && - !ops.loopLBVar.empty() && "Invalid bounds or step"); - - fir::FirOpBuilder &b = converter.getFirOpBuilder(); - - // Get the bit width of an integer-like type. - auto widthOf = [](mlir::Type ty) -> unsigned { - if (mlir::isa(ty)) { - return mlir::IndexType::kInternalStorageBitWidth; - } - if (auto tyInt = mlir::dyn_cast(ty)) { - return tyInt.getWidth(); - } - llvm_unreachable("Unexpected type"); - }; - - // For a type that is either IntegerType or IndexType, return the - // equivalent IntegerType. In the former case this is a no-op. 
- auto asIntTy = [&](mlir::Type ty) -> mlir::IntegerType { - if (ty.isIndex()) { - return mlir::IntegerType::get(ty.getContext(), widthOf(ty)); - } - assert(ty.isIntOrIndex() && "Unexpected type"); - return mlir::cast(ty); - }; - - // For two given values, establish a common signless IntegerType - // that can represent any value of type of x and of type of y, - // and return the pair of x, y converted to the new type. - auto unifyToSignless = - [&](fir::FirOpBuilder &b, mlir::Value x, - mlir::Value y) -> std::pair { - auto tyX = asIntTy(x.getType()), tyY = asIntTy(y.getType()); - unsigned width = std::max(widthOf(tyX), widthOf(tyY)); - auto wideTy = mlir::IntegerType::get(b.getContext(), width, - mlir::IntegerType::Signless); - return std::make_pair(b.createConvert(loc, wideTy, x), - b.createConvert(loc, wideTy, y)); - }; - - // Start with signless i32 by default. - auto tripCount = b.createIntegerConstant(loc, b.getI32Type(), 1); - - for (auto [origLb, origUb, origStep] : - llvm::zip(ops.loopLBVar, ops.loopUBVar, ops.loopStepVar)) { - auto tmpS0 = b.createIntegerConstant(loc, origStep.getType(), 0); - auto [step, step0] = unifyToSignless(b, origStep, tmpS0); - auto reverseCond = b.create(loc, CmpIPredicate::slt, step, step0); - auto negStep = b.create(loc, step0, step); - mlir::Value absStep = b.create(loc, reverseCond, negStep, step); - - auto [lb, ub] = unifyToSignless(b, origLb, origUb); - auto start = b.create(loc, reverseCond, ub, lb); - auto end = b.create(loc, reverseCond, lb, ub); - - mlir::Value range = b.create(loc, end, start); - auto rangeCond = b.create(loc, CmpIPredicate::slt, end, start); - std::tie(range, absStep) = unifyToSignless(b, range, absStep); - // numSteps = (range /u absStep) + 1 - auto numSteps = - b.create(loc, b.create(loc, range, absStep), - b.createIntegerConstant(loc, range.getType(), 1)); - - auto trip0 = b.createIntegerConstant(loc, numSteps.getType(), 0); - auto loopTripCount = b.create(loc, rangeCond, trip0, numSteps); - auto 
[totalTC, thisTC] = unifyToSignless(b, tripCount, loopTripCount); - tripCount = b.create(loc, totalTC, thisTC); - } - - return tripCount; -} - static mlir::Operation * createAndSetPrivatizedLoopVar(Fortran::lower::AbstractConverter &converter, mlir::Location loc, mlir::Value indexVal, @@ -1574,8 +1496,8 @@ genLoopNestOp(Fortran::lower::AbstractConverter &converter, llvm::SmallVector iv; ClauseProcessor cp(converter, semaCtx, clauses); cp.processCollapse(loc, eval, collapseClauseOps, iv); - targetOp.getTripCountMutable().assign( - calculateTripCount(converter, loc, collapseClauseOps)); + targetOp.getTripCountMutable().assign(calculateTripCount( + converter.getFirOpBuilder(), loc, collapseClauseOps)); } return loopNestOp; } diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.h b/flang/lib/Lower/OpenMP/ReductionProcessor.h index 17c8ae7b69214a..3f118b936c909a 100644 --- a/flang/lib/Lower/OpenMP/ReductionProcessor.h +++ b/flang/lib/Lower/OpenMP/ReductionProcessor.h @@ -13,7 +13,7 @@ #ifndef FORTRAN_LOWER_REDUCTIONPROCESSOR_H #define FORTRAN_LOWER_REDUCTIONPROCESSOR_H -#include "Clauses.h" +#include "flang/Lower/OpenMP/Clauses.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Semantics/symbol.h" diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp index eed63b226133af..457bd25833d105 100644 --- a/flang/lib/Lower/OpenMP/Utils.cpp +++ b/flang/lib/Lower/OpenMP/Utils.cpp @@ -10,17 +10,18 @@ // //===----------------------------------------------------------------------===// -#include "Utils.h" +#include -#include "Clauses.h" #include #include +#include #include #include #include #include #include #include +#include #include #include @@ -357,6 +358,108 @@ getOmpObjectSymbol(const Fortran::parser::OmpObject &ompObject) { return sym; } +mlir::omp::MapInfoOp +createMapInfoOp(mlir::OpBuilder &builder, mlir::Location loc, + mlir::Value baseAddr, mlir::Value varPtrPtr, std::string name, + 
llvm::ArrayRef bounds, + llvm::ArrayRef members, + mlir::DenseIntElementsAttr membersIndex, uint64_t mapType, + mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy, + bool partialMap) { + if (auto boxTy = llvm::dyn_cast(baseAddr.getType())) { + baseAddr = builder.create(loc, baseAddr); + retTy = baseAddr.getType(); + } + + mlir::TypeAttr varType = mlir::TypeAttr::get( + llvm::cast(retTy).getElementType()); + + mlir::omp::MapInfoOp op = builder.create( + loc, retTy, baseAddr, varType, varPtrPtr, members, membersIndex, bounds, + builder.getIntegerAttr(builder.getIntegerType(64, false), mapType), + builder.getAttr(mapCaptureType), + builder.getStringAttr(name), builder.getBoolAttr(partialMap)); + + return op; +} + +mlir::Value calculateTripCount(fir::FirOpBuilder &builder, mlir::Location loc, + const mlir::omp::CollapseClauseOps &ops) { + using namespace mlir::arith; + assert(ops.loopLBVar.size() == ops.loopUBVar.size() && + ops.loopLBVar.size() == ops.loopStepVar.size() && + !ops.loopLBVar.empty() && "Invalid bounds or step"); + + // Get the bit width of an integer-like type. + auto widthOf = [](mlir::Type ty) -> unsigned { + if (mlir::isa(ty)) { + return mlir::IndexType::kInternalStorageBitWidth; + } + if (auto tyInt = mlir::dyn_cast(ty)) { + return tyInt.getWidth(); + } + llvm_unreachable("Unexpected type"); + }; + + // For a type that is either IntegerType or IndexType, return the + // equivalent IntegerType. In the former case this is a no-op. + auto asIntTy = [&](mlir::Type ty) -> mlir::IntegerType { + if (ty.isIndex()) { + return mlir::IntegerType::get(ty.getContext(), widthOf(ty)); + } + assert(ty.isIntOrIndex() && "Unexpected type"); + return mlir::cast(ty); + }; + + // For two given values, establish a common signless IntegerType + // that can represent any value of type of x and of type of y, + // and return the pair of x, y converted to the new type. 
+ auto unifyToSignless = + [&](fir::FirOpBuilder &b, mlir::Value x, + mlir::Value y) -> std::pair { + auto tyX = asIntTy(x.getType()), tyY = asIntTy(y.getType()); + unsigned width = std::max(widthOf(tyX), widthOf(tyY)); + auto wideTy = mlir::IntegerType::get(b.getContext(), width, + mlir::IntegerType::Signless); + return std::make_pair(b.createConvert(loc, wideTy, x), + b.createConvert(loc, wideTy, y)); + }; + + // Start with signless i32 by default. + auto tripCount = builder.createIntegerConstant(loc, builder.getI32Type(), 1); + + for (auto [origLb, origUb, origStep] : + llvm::zip(ops.loopLBVar, ops.loopUBVar, ops.loopStepVar)) { + auto tmpS0 = builder.createIntegerConstant(loc, origStep.getType(), 0); + auto [step, step0] = unifyToSignless(builder, origStep, tmpS0); + auto reverseCond = + builder.create(loc, CmpIPredicate::slt, step, step0); + auto negStep = builder.create(loc, step0, step); + mlir::Value absStep = + builder.create(loc, reverseCond, negStep, step); + + auto [lb, ub] = unifyToSignless(builder, origLb, origUb); + auto start = builder.create(loc, reverseCond, ub, lb); + auto end = builder.create(loc, reverseCond, lb, ub); + + mlir::Value range = builder.create(loc, end, start); + auto rangeCond = + builder.create(loc, CmpIPredicate::slt, end, start); + std::tie(range, absStep) = unifyToSignless(builder, range, absStep); + // numSteps = (range /u absStep) + 1 + auto numSteps = builder.create( + loc, builder.create(loc, range, absStep), + builder.createIntegerConstant(loc, range.getType(), 1)); + + auto trip0 = builder.createIntegerConstant(loc, numSteps.getType(), 0); + auto loopTripCount = + builder.create(loc, rangeCond, trip0, numSteps); + auto [totalTC, thisTC] = unifyToSignless(builder, tripCount, loopTripCount); + tripCount = builder.create(loc, totalTC, thisTC); + } + + return tripCount; +} } // namespace omp } // namespace lower } // namespace Fortran diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt 
b/flang/lib/Optimizer/Transforms/CMakeLists.txt index 4b38567ec44081..a54775564cf049 100644 --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -24,6 +24,10 @@ add_flang_library(FIRTransforms FunctionAttr.cpp DebugTypeGenerator.cpp DoConcurrentConversion.cpp + # TODO Find a cleaner solution for this. This is a workaround to expose + # `Utils.cpp` so that it can be used by the `DoConcurrentConversion` pass. We should + # probably split this into a shared lib. + ../../Lower/OpenMP/Utils.cpp DEPENDS FIRDialect diff --git a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp index 44e9177a316d85..ecb79568afe8b2 100644 --- a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp @@ -6,11 +6,15 @@ // //===----------------------------------------------------------------------===// +#include "flang/Lower/OpenMP/Utils.h" +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Builder/Todo.h" #include "flang/Optimizer/Dialect/FIRDialect.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/Dialect/Support/FIRContext.h" #include "flang/Optimizer/HLFIR/HLFIRDialect.h" +#include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Optimizer/Transforms/Passes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" @@ -18,8 +22,11 @@ #include "mlir/IR/IRMapping.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/RegionUtils.h" +#include "llvm/Frontend/OpenMP/OMPConstants.h" #include +#include namespace fir { #define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS @@ -33,46 +40,15 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { public: using mlir::OpConversionPattern::OpConversionPattern; + 
DoConcurrentConversion(mlir::MLIRContext *context, bool mapToDevice) + : OpConversionPattern(context), mapToDevice(mapToDevice) {} + mlir::LogicalResult matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const override { - mlir::OpPrintingFlags flags; - flags.printGenericOpForm(); - - mlir::omp::ParallelOp parallelOp = - rewriter.create(doLoop.getLoc()); - - mlir::Block *block = rewriter.createBlock(¶llelOp.getRegion()); - - rewriter.setInsertionPointToEnd(block); - rewriter.create(doLoop.getLoc()); - - rewriter.setInsertionPointToStart(block); - - // ==== TODO (1) Start ==== - // - // The goal of the few lines below is to collect and clone - // the list of operations that define the loop's lower and upper bounds as - // well as the step. Should we, instead of doing this here, split it into 2 - // stages? - // - // 1. **Stage 1**: add an analysis that extracts all the relevant - // operations defining the lower-bound, upper-bound, and - // step. - // 2. **Stage 2**: clone the collected operations in the parallel region. - // - // So far, the pass has been tested with very simple loops (where the bounds - // and step are constants) so the goal of **Stage 1** is to have a - // well-defined component that has the sole responsibility of collecting all - // the relevant ops relevant to the loop header. This was we can test this - // in isolation for more complex loops and better organize the code. **Stage - // 2** would then be responsible for the actual cloning of the collected - // loop header preparation/allocation operations. - - // Clone the LB, UB, step defining ops inside the parallel region. 
- mlir::Operation* lbOp = doLoop.getLowerBound().getDefiningOp(); - mlir::Operation* ubOp = doLoop.getUpperBound().getDefiningOp(); - mlir::Operation* stepOp = doLoop.getStep().getDefiningOp(); + mlir::Operation *lbOp = doLoop.getLowerBound().getDefiningOp(); + mlir::Operation *ubOp = doLoop.getUpperBound().getDefiningOp(); + mlir::Operation *stepOp = doLoop.getStep().getDefiningOp(); if (lbOp == nullptr || ubOp == nullptr || stepOp == nullptr) { return rewriter.notifyMatchFailure( @@ -85,7 +61,7 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { if (mlir::isa_and_present(operation)) return true; - if (fir::ConvertOp convertOp = + if (auto convertOp = mlir::dyn_cast_if_present(operation)) return isOpUltimatelyConstant(convertOp.getValue().getDefiningOp()); @@ -99,91 +75,368 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { "constant LB, UB, and step values."); } - mlir::omp::LoopNestClauseOps clauseOps; - clauseOps.loopLBVar.push_back(rewriter.clone(*lbOp)->getResult(0)); - clauseOps.loopUBVar.push_back(rewriter.clone(*ubOp)->getResult(0)); - clauseOps.loopStepVar.push_back(rewriter.clone(*stepOp)->getResult(0)); - clauseOps.loopInclusiveAttr = rewriter.getUnitAttr(); - // ==== TODO (1) End ==== - auto wsloopOp = rewriter.create(doLoop.getLoc()); - rewriter.createBlock(&wsloopOp.getRegion()); - rewriter.setInsertionPoint( - rewriter.create(wsloopOp.getLoc())); + llvm::SmallVector liveIns; + collectLoopLiveIns(doLoop, liveIns); + assert(!liveIns.empty()); - auto loopNestOp = - rewriter.create(doLoop.getLoc(), clauseOps); + mlir::IRMapping mapper; + mlir::omp::TargetOp targetOp; + mlir::omp::LoopNestClauseOps loopNestClauseOps; + + if (mapToDevice) { + mlir::omp::TargetClauseOps clauseOps; + for (mlir::Value liveIn : liveIns) + clauseOps.mapVars.push_back(genMapInfoOpForLiveIn(rewriter, liveIn)); + targetOp = + genTargetOp(doLoop.getLoc(), rewriter, mapper, liveIns, clauseOps); + genTeamsOp(doLoop.getLoc(), rewriter, doLoop, 
liveIns, mapper, + loopNestClauseOps); + genDistributeOp(doLoop.getLoc(), rewriter); + } - auto outlineableOp = - mlir::dyn_cast(*parallelOp); - rewriter.setInsertionPointToStart(outlineableOp.getAllocaBlock()); - - // ==== TODO (2) Start ==== - // - // The goal of the following simple work-list algorithm and - // the following `for` loop is to collect all the operations related to the - // allocation of the induction variable for the `do concurrent` loop. The - // operations collected by this algorithm are very similar to what is - // usually emitted for privatized variables, e.g. for omp.parallel loops. - // Therefore, I think we can: - // - // 1. **Stage 1**: Add an analysis that colects all these operations. The - // goal is similar to **Stage 1** of TODO (1): isolate the - // algorithm is an individually-testable component so that - // we properly implement and test it for more complicated - // `do concurrent` loops. - // 1. **Stage 2**: Using the collected operations, create and populate an - // `omp.private {type=private}` op to server as the - // delayed privatizer for the new work-sharing loop. - - // For the induction variable, we need to privative its allocation and - // binding inside the parallel region. - llvm::SmallSetVector workList; - // Therefore, we first discover the induction variable by discovering - // `fir.store`s where the source is the loop's block argument. - workList.insert(doLoop.getInductionVar().getUsers().begin(), - doLoop.getInductionVar().getUsers().end()); - llvm::SmallSetVector inductionVarTargetStores; - - // Walk the def-chain of the loop's block argument until we hit `fir.store`. 
- while (!workList.empty()) { - mlir::Operation *item = workList.front(); - - if (auto storeOp = mlir::dyn_cast(item)) { - inductionVarTargetStores.insert(storeOp); - } else { - workList.insert(item->getUsers().begin(), item->getUsers().end()); - } + genParallelOp(doLoop.getLoc(), rewriter, doLoop, liveIns, mapper, + loopNestClauseOps); + genWsLoopOp(rewriter, doLoop, mapper, loopNestClauseOps); + + // Now that we created the nested `ws.loop` op, we can set the `target` op's + // trip count. + if (mapToDevice) { + rewriter.setInsertionPoint(targetOp); + auto parentModule = doLoop->getParentOfType(); + fir::FirOpBuilder firBuilder(rewriter, fir::getKindMapping(parentModule)); + + mlir::omp::CollapseClauseOps collapseClauseOps; + collapseClauseOps.loopLBVar.push_back(lbOp->getResult(0)); + collapseClauseOps.loopUBVar.push_back(ubOp->getResult(0)); + collapseClauseOps.loopStepVar.push_back(stepOp->getResult(0)); + + mlir::cast(targetOp).getTripCountMutable().assign( + Fortran::lower::omp::calculateTripCount(firBuilder, doLoop.getLoc(), + collapseClauseOps)); + } + + rewriter.eraseOp(doLoop); + return mlir::success(); + } + +private: + /// Collect the list of values used inside the loop but defined outside of it. + /// The first item in the returned list is always the loop's induction + /// variable. + void collectLoopLiveIns(fir::DoLoopOp doLoop, + llvm::SmallVectorImpl &liveIns) const { + // Given an operation `op`, this lambda returns true if `op`'s operand is + // ultimately the loop's induction variable. Detecting this helps find + // the live-in value corresponding to the induction variable in case the + // induction variable is indirectly used in the loop (e.g. through a cast + // op). 
+ std::function isIndVarUltimateOperand = + [&](mlir::Operation *op) { + if (auto storeOp = mlir::dyn_cast_if_present(op)) { + return (storeOp.getValue() == doLoop.getInductionVar()) || + isIndVarUltimateOperand(storeOp.getValue().getDefiningOp()); + } + + if (auto convertOp = mlir::dyn_cast_if_present(op)) { + return convertOp.getOperand() == doLoop.getInductionVar() || + isIndVarUltimateOperand( + convertOp.getValue().getDefiningOp()); + } + + return false; + }; + + llvm::SmallDenseSet seenValues; + llvm::SmallDenseSet seenOps; - workList.remove(item); + mlir::visitUsedValuesDefinedAbove( + doLoop.getRegion(), [&](mlir::OpOperand *operand) { + if (!seenValues.insert(operand->get()).second) + return; + + mlir::Operation *definingOp = operand->get().getDefiningOp(); + // We want to collect ops corresponding to live-ins only once. + if (definingOp && !seenOps.insert(definingOp).second) + return; + + liveIns.push_back(operand->get()); + + if (isIndVarUltimateOperand(operand->getOwner())) + std::swap(*liveIns.begin(), *liveIns.rbegin()); + }); + } + + void genBoundsOps(mlir::ConversionPatternRewriter &rewriter, + mlir::Location loc, hlfir::DeclareOp declareOp, + llvm::SmallVectorImpl &boundsOps) const { + if (declareOp.getShape() == nullptr) { + return; } - // For each collected `fir.store`, find the target memref's alloca's and - // declare ops. 
- llvm::SmallSetVector declareAndAllocasToClone; - for (auto storeOp : inductionVarTargetStores) { - mlir::Operation *storeTarget = storeOp.getMemref().getDefiningOp(); + auto shapeOp = mlir::dyn_cast_if_present( + declareOp.getShape().getDefiningOp()); - for (auto operand : storeTarget->getOperands()) { - declareAndAllocasToClone.insert(operand.getDefiningOp()); - } - declareAndAllocasToClone.insert(storeTarget); + if (shapeOp == nullptr) + TODO(loc, "Shapes not defined by shape op's are not supported yet."); + + auto extents = shapeOp.getExtents(); + + auto genBoundsOp = [&](mlir::Value extent) { + mlir::Type extentType = extent.getType(); + auto lb = rewriter.create( + loc, extentType, rewriter.getIntegerAttr(extentType, 0)); + // TODO I think this calculation might not be correct. But this is how + // it is done in PFT->OpenMP lowering. So keeping it like this until we + // double check. + mlir::Value ub = rewriter.create(loc, extent, lb); + + return rewriter.create( + loc, rewriter.getType(), lb, ub, extent, + mlir::Value{}, false, mlir::Value{}); + }; + + for (auto extent : extents) + boundsOps.push_back(genBoundsOp(extent)); + } + + mlir::omp::MapInfoOp + genMapInfoOpForLiveIn(mlir::ConversionPatternRewriter &rewriter, + mlir::Value liveIn) const { + auto declareOp = + mlir::dyn_cast_if_present(liveIn.getDefiningOp()); + + if (declareOp == nullptr) + TODO(liveIn.getLoc(), + "Values not defined by declare op's are not supported yet."); + + mlir::Type liveInType = liveIn.getType(); + mlir::Type eleType = liveInType; + if (auto refType = mlir::dyn_cast(liveInType)) + eleType = refType.getElementType(); + + llvm::omp::OpenMPOffloadMappingFlags mapFlag = + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT; + mlir::omp::VariableCaptureKind captureKind = + mlir::omp::VariableCaptureKind::ByRef; + + if (fir::isa_trivial(eleType) || fir::isa_char(eleType)) { + captureKind = mlir::omp::VariableCaptureKind::ByCopy; + } else if 
(!fir::isa_builtin_cptr_type(eleType)) { + mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; + mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; } - // ==== TODO (2) End ==== - // - // TODO (1 & 2): Isolating analyses proposed in both TODOs, I think we can - // more easily generalize the pass to work for targets other than OpenMP, - // e.g. OpenACC, I think can, can reuse the results of the analyses and only - // change the code-gen/rewriting. - mlir::IRMapping mapper; + llvm::SmallVector boundsOps; + genBoundsOps(rewriter, liveIn.getLoc(), declareOp, boundsOps); + + return Fortran::lower::omp::createMapInfoOp( + rewriter, liveIn.getLoc(), declareOp.getBase(), /*varPtrPtr=*/{}, + declareOp.getUniqName().str(), boundsOps, /*members=*/{}, + /*membersIndex=*/mlir::DenseIntElementsAttr{}, + static_cast< + std::underlying_type_t>( + mapFlag), + captureKind, liveInType); + } + + mlir::omp::TargetOp genTargetOp(mlir::Location loc, + mlir::ConversionPatternRewriter &rewriter, + mlir::IRMapping &mapper, + llvm::ArrayRef liveIns, + mlir::omp::TargetClauseOps &clauseOps) const { + auto targetOp = rewriter.create(loc, clauseOps); + + mlir::Region ®ion = targetOp.getRegion(); + + llvm::SmallVector liveInTypes; + llvm::SmallVector liveInLocs; + + for (mlir::Value liveIn : liveIns) { + liveInTypes.push_back(liveIn.getType()); + liveInLocs.push_back(liveIn.getLoc()); + } + + rewriter.createBlock(®ion, {}, liveInTypes, liveInLocs); + + for (auto [arg, mapInfoOp] : + llvm::zip_equal(region.getArguments(), clauseOps.mapVars)) { + auto miOp = mlir::cast(mapInfoOp.getDefiningOp()); + hlfir::DeclareOp liveInDeclare = genLiveInDeclare(rewriter, arg, miOp); + mapper.map(miOp.getVariableOperand(0), liveInDeclare.getBase()); + } + + rewriter.setInsertionPoint( + rewriter.create(targetOp.getLoc())); + + return targetOp; + } + + hlfir::DeclareOp + genLiveInDeclare(mlir::ConversionPatternRewriter &rewriter, + mlir::Value liveInArg, + mlir::omp::MapInfoOp 
liveInMapInfoOp) const { + mlir::Type liveInType = liveInArg.getType(); + + if (fir::isa_ref_type(liveInType)) + liveInType = fir::unwrapRefType(liveInType); + + mlir::Value shape = [&]() -> mlir::Value { + if (hlfir::isFortranScalarNumericalType(liveInType)) + return {}; + + if (hlfir::isFortranArrayObject(liveInType)) { + llvm::SmallVector shapeOpOperands; - // Collect the memref defining ops in the parallel region. - for (mlir::Operation *opToClone : declareAndAllocasToClone) { + for (auto boundsOperand : liveInMapInfoOp.getBounds()) { + auto boundsOp = + mlir::cast(boundsOperand.getDefiningOp()); + mlir::Operation *localExtentDef = + boundsOp.getExtent().getDefiningOp()->clone(); + rewriter.getInsertionBlock()->push_back(localExtentDef); + assert(localExtentDef->getNumResults() == 1); + + shapeOpOperands.push_back(localExtentDef->getResult(0)); + } + + return rewriter.create(liveInArg.getLoc(), + shapeOpOperands); + } + + std::string opStr; + llvm::raw_string_ostream opOs(opStr); + opOs << "Unsupported type: " << liveInType; + llvm_unreachable(opOs.str().c_str()); + }(); + + return rewriter.create(liveInArg.getLoc(), liveInArg, + liveInMapInfoOp.getName().value(), + shape); + } + + mlir::omp::TeamsOp + genTeamsOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter, + fir::DoLoopOp doLoop, llvm::ArrayRef liveIns, + mlir::IRMapping &mapper, + mlir::omp::LoopNestClauseOps &loopNestClauseOps) const { + auto teamsOp = rewriter.create( + loc, /*clauses=*/mlir::omp::TeamsClauseOps{}); + + rewriter.createBlock(&teamsOp.getRegion()); + rewriter.setInsertionPoint(rewriter.create(loc)); + + genInductionVariableAlloc(rewriter, liveIns, mapper); + genLoopNestClauseOps(loc, rewriter, doLoop, mapper, loopNestClauseOps); + + return teamsOp; + } + + void + genLoopNestClauseOps(mlir::Location loc, + mlir::ConversionPatternRewriter &rewriter, + fir::DoLoopOp doLoop, mlir::IRMapping &mapper, + mlir::omp::LoopNestClauseOps &loopNestClauseOps) const { + 
assert(loopNestClauseOps.loopLBVar.empty() && + "Loop nest bounds were already emitted!"); + + // Clones the chain of ops defining a certain loop bound or its step into + // the parallel region. For example, if the value of a bound is defined by a + // `fir.convert`op, this lambda clones the `fir.convert` as well as the + // value it converts from. We do this since `omp.target` regions are + // isolated from above. + std::function + cloneBoundOrStepDefChain = [&](mlir::Operation *operation) { + if (mlir::isa_and_present(operation)) + return rewriter.clone(*operation, mapper); + + if (auto convertOp = + mlir::dyn_cast_if_present(operation)) { + cloneBoundOrStepDefChain(convertOp.getValue().getDefiningOp()); + return rewriter.clone(*operation, mapper); + } + + std::string opStr; + llvm::raw_string_ostream opOs(opStr); + opOs << "Unexpected operation: " << *operation; + llvm_unreachable(opOs.str().c_str()); + }; + + mlir::Operation *lbOp = doLoop.getLowerBound().getDefiningOp(); + mlir::Operation *ubOp = doLoop.getUpperBound().getDefiningOp(); + mlir::Operation *stepOp = doLoop.getStep().getDefiningOp(); + + loopNestClauseOps.loopLBVar.push_back( + cloneBoundOrStepDefChain(lbOp)->getResult(0)); + loopNestClauseOps.loopLBVar.push_back( + cloneBoundOrStepDefChain(ubOp)->getResult(0)); + loopNestClauseOps.loopLBVar.push_back( + cloneBoundOrStepDefChain(stepOp)->getResult(0)); + loopNestClauseOps.loopInclusiveAttr = rewriter.getUnitAttr(); + } + + mlir::omp::DistributeOp + genDistributeOp(mlir::Location loc, + mlir::ConversionPatternRewriter &rewriter) const { + auto distOp = rewriter.create( + loc, /*clauses=*/mlir::omp::DistributeClauseOps{}); + + rewriter.createBlock(&distOp.getRegion()); + rewriter.setInsertionPoint(rewriter.create(loc)); + + return distOp; + } + + void genInductionVariableAlloc(mlir::ConversionPatternRewriter &rewriter, + llvm::ArrayRef liveIns, + mlir::IRMapping &mapper) const { + mlir::Operation *indVarMemDef = liveIns.front().getDefiningOp(); + + 
assert( + indVarMemDef != nullptr && + "Induction variable memdef is expected to have a defining operation."); + + llvm::SmallSetVector indVarDeclareAndAlloc; + for (auto operand : indVarMemDef->getOperands()) + indVarDeclareAndAlloc.insert(operand.getDefiningOp()); + indVarDeclareAndAlloc.insert(indVarMemDef); + + for (mlir::Operation *opToClone : indVarDeclareAndAlloc) rewriter.clone(*opToClone, mapper); + } + + mlir::omp::ParallelOp + genParallelOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter, + fir::DoLoopOp doLoop, llvm::ArrayRef liveIns, + mlir::IRMapping &mapper, + mlir::omp::LoopNestClauseOps &loopNestClauseOps) const { + auto parallelOp = rewriter.create(loc); + rewriter.createBlock(¶llelOp.getRegion()); + rewriter.setInsertionPoint(rewriter.create(loc)); + + // If mapping to host, the local induction variable and loop bounds need to + // be emitted as part of the `omp.parallel` op. + if (!mapToDevice) { + genInductionVariableAlloc(rewriter, liveIns, mapper); + genLoopNestClauseOps(loc, rewriter, doLoop, mapper, loopNestClauseOps); } - // Clone the loop's body inside the worksharing construct using the mapped - // memref values. + return parallelOp; + } + + mlir::omp::LoopNestOp + genWsLoopOp(mlir::ConversionPatternRewriter &rewriter, fir::DoLoopOp doLoop, + mlir::IRMapping &mapper, + const mlir::omp::LoopNestClauseOps &clauseOps) const { + + auto wsloopOp = rewriter.create(doLoop.getLoc()); + rewriter.createBlock(&wsloopOp.getRegion()); + rewriter.setInsertionPoint( + rewriter.create(wsloopOp.getLoc())); + + auto loopNestOp = + rewriter.create(doLoop.getLoc(), clauseOps); + + // Clone the loop's body inside the loop nest construct using the + // mapped values. 
rewriter.cloneRegionBefore(doLoop.getRegion(), loopNestOp.getRegion(), loopNestOp.getRegion().begin(), mapper); @@ -192,16 +445,25 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { rewriter.create(terminator->getLoc()); rewriter.eraseOp(terminator); - rewriter.eraseOp(doLoop); - - return mlir::success(); + return loopNestOp; } + + bool mapToDevice; }; class DoConcurrentConversionPass : public fir::impl::DoConcurrentConversionPassBase< DoConcurrentConversionPass> { public: + using fir::impl::DoConcurrentConversionPassBase< + DoConcurrentConversionPass>::DoConcurrentConversionPassBase; + + DoConcurrentConversionPass() = default; + + DoConcurrentConversionPass( + const fir::DoConcurrentConversionPassOptions &options) + : DoConcurrentConversionPassBase(options) {} + void runOnOperation() override { mlir::func::FuncOp func = getOperation(); @@ -210,8 +472,16 @@ class DoConcurrentConversionPass } auto *context = &getContext(); + + if (mapTo != "host" && mapTo != "device") { + mlir::emitWarning(mlir::UnknownLoc::get(context), + "DoConcurrentConversionPass: invalid `map-to` value. " + "Valid values are: `host` or `device`"); + return; + } + mlir::RewritePatternSet patterns(context); - patterns.insert(context); + patterns.insert(context, mapTo == "device"); mlir::ConversionTarget target(*context); target.addLegalDialect fir::createDoConcurrentConversionPass() { - return std::make_unique(); -} +std::unique_ptr +fir::createDoConcurrentConversionPass(bool mapToDevice) { + DoConcurrentConversionPassOptions options; + options.mapTo = mapToDevice ? "device" : "host"; + return std::make_unique(options); +} diff --git a/flang/test/Transforms/DoConcurrent/basic_device.f90 b/flang/test/Transforms/DoConcurrent/basic_device.f90 new file mode 100644 index 00000000000000..f059a7fb1c0aae --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/basic_device.f90 @@ -0,0 +1,86 @@ +! Tests mapping of a basic `do concurrent` loop to +! 
`!$omp target teams distribute parallel do`. + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=device %s -o - \ +! RUN: | FileCheck %s +! RUN: bbc -emit-hlfir -fopenmp -fdo-concurrent-parallel=device %s -o - \ +! RUN: | FileCheck %s + +! CHECK-LABEL: do_concurrent_basic +program do_concurrent_basic + implicit none + integer :: a(10) + integer :: i + + ! CHECK-DAG: %[[I_ORIG_ALLOC:.*]] = fir.alloca i32 {bindc_name = "i"} + ! CHECK: %[[I_ORIG_DECL:.*]]:2 = hlfir.declare %[[I_ORIG_ALLOC]] + + ! CHECK-DAG: %[[A_ADDR:.*]] = fir.address_of(@_QFEa) + ! CHECK: %[[A_SHAPE:.*]] = fir.shape %[[A_EXTENT:.*]] : (index) -> !fir.shape<1> + ! CHECK: %[[A_ORIG_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]](%[[A_SHAPE]]) + + ! CHECK-NOT: fir.do_loop + + ! CHECK-DAG: %[[I_MAP_INFO:.*]] = omp.map.info var_ptr(%[[I_ORIG_DECL]]#0 + ! CHECK: %[[C0:.*]] = arith.constant 0 : index + ! CHECK: %[[UPPER_BOUND:.*]] = arith.subi %[[A_EXTENT]], %[[C0]] : index + + ! CHECK: %[[A_BOUNDS:.*]] = omp.map.bounds lower_bound(%[[C0]] : index) + ! CHECK-SAME: upper_bound(%[[UPPER_BOUND]] : index) + ! CHECK-SAME: extent(%[[A_EXTENT]] : index) + + ! CHECK-DAG: %[[A_MAP_INFO:.*]] = omp.map.info var_ptr(%[[A_ORIG_DECL]]#0 : {{[^(]+}}) + ! CHECK-SAME: map_clauses(implicit, tofrom) capture(ByRef) bounds(%[[A_BOUNDS]]) + + ! CHECK: %[[TRIP_COUNT:.*]] = arith.muli %{{.*}}, %{{.*}} : i64 + + ! CHECK: omp.target trip_count(%[[TRIP_COUNT]] : i64) + ! CHECK-SAME: map_entries(%[[I_MAP_INFO]] -> %[[I_ARG:[[:alnum:]]+]], + ! CHECK-SAME: %[[A_MAP_INFO]] -> %[[A_ARG:.[[:alnum:]]+]] + + ! CHECK-NEXT: ^{{.*}}(%[[I_ARG]]: !fir.ref, %[[A_ARG]]: !fir.ref>): + + ! CHECK: %[[A_DEV_DECL:.*]]:2 = hlfir.declare %[[A_ARG]] + ! CHECK: omp.teams { + + ! CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"} + ! CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + + ! CHECK: %[[C1:.*]] = arith.constant 1 : i32 + ! 
CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index + ! CHECK: %[[C10:.*]] = arith.constant 10 : i32 + ! CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index + ! CHECK: %[[STEP:.*]] = arith.constant 1 : index + + ! CHECK-NEXT: omp.distribute { + ! CHECK-NEXT: omp.parallel { + + ! CHECK-NEXT: omp.wsloop { + + ! CHECK-NEXT: omp.loop_nest (%[[ARG0:.*]]) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK-NEXT: %[[IV_IDX:.*]] = fir.convert %[[ARG0]] : (index) -> i32 + ! CHECK-NEXT: fir.store %[[IV_IDX]] to %[[BINDING]]#1 : !fir.ref + ! CHECK-NEXT: %[[IV_VAL1:.*]] = fir.load %[[BINDING]]#0 : !fir.ref + ! CHECK-NEXT: %[[IV_VAL2:.*]] = fir.load %[[BINDING]]#0 : !fir.ref + ! CHECK-NEXT: %[[IV_VAL_I64:.*]] = fir.convert %[[IV_VAL2]] : (i32) -> i64 + ! CHECK-NEXT: %[[ARR_ACCESS:.*]] = hlfir.designate %[[A_DEV_DECL]]#0 (%[[IV_VAL_I64]]) : (!fir.ref>, i64) -> !fir.ref + ! CHECK-NEXT: hlfir.assign %[[IV_VAL1]] to %[[ARR_ACCESS]] : i32, !fir.ref + ! CHECK-NEXT: omp.yield + ! CHECK-NEXT: } + + ! CHECK-NEXT: omp.terminator + ! CHECK-NEXT: } + ! CHECK-NEXT: omp.terminator + ! CHECK-NEXT: } + ! CHECK-NEXT: omp.terminator + ! CHECK-NEXT: } + ! CHECK-NEXT: omp.terminator + ! CHECK-NEXT: } + ! CHECK-NEXT: omp.terminator + ! CHECK-NEXT: } + do concurrent (i=1:10) + a(i) = i + end do + + ! CHECK-NOT: fir.do_loop +end program do_concurrent_basic diff --git a/flang/test/Transforms/DoConcurrent/basic.f90 b/flang/test/Transforms/DoConcurrent/basic_host.f90 similarity index 94% rename from flang/test/Transforms/DoConcurrent/basic.f90 rename to flang/test/Transforms/DoConcurrent/basic_host.f90 index 15faddb4f17fe1..62a9306b21cccd 100644 --- a/flang/test/Transforms/DoConcurrent/basic.f90 +++ b/flang/test/Transforms/DoConcurrent/basic_host.f90 @@ -2,12 +2,13 @@ ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=host %s -o - \ ! RUN: | FileCheck %s - +! RUN: bbc -emit-hlfir -fopenmp -fdo-concurrent-parallel=host %s -o - \ +! RUN: | FileCheck %s + ! 
CHECK-LABEL: do_concurrent_basic program do_concurrent_basic ! CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) - ! CHECK: %[[C1:.*]] = arith.constant 1 : i32 - ! CHECK: %[[C10:.*]] = arith.constant 10 : i32 + implicit none integer :: a(10) integer :: i @@ -19,7 +20,9 @@ program do_concurrent_basic ! CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"} ! CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK: %[[C1:.*]] = arith.constant 1 : i32 ! CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index + ! CHECK: %[[C10:.*]] = arith.constant 10 : i32 ! CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index ! CHECK: %[[STEP:.*]] = arith.constant 1 : index @@ -39,7 +42,7 @@ program do_concurrent_basic ! CHECK-NEXT: omp.terminator ! CHECK-NEXT: } - do concurrent (integer :: i=1:10) + do concurrent (i=1:10) a(i) = i end do diff --git a/flang/test/Transforms/DoConcurrent/basic.mlir b/flang/test/Transforms/DoConcurrent/basic_host.mlir similarity index 97% rename from flang/test/Transforms/DoConcurrent/basic.mlir rename to flang/test/Transforms/DoConcurrent/basic_host.mlir index 764e62b647f913..7eb9d2d7da39fa 100644 --- a/flang/test/Transforms/DoConcurrent/basic.mlir +++ b/flang/test/Transforms/DoConcurrent/basic_host.mlir @@ -1,12 +1,10 @@ // Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`. 
-// RUN: fir-opt --fopenmp-do-concurrent-conversion %s | FileCheck %s +// RUN: fir-opt --fopenmp-do-concurrent-conversion="map-to=host" %s | FileCheck %s // CHECK-LABEL: func.func @do_concurrent_basic func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_basic"} { // CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) - // CHECK: %[[C1:.*]] = arith.constant 1 : i32 - // CHECK: %[[C10:.*]] = arith.constant 10 : i32 %0 = fir.alloca i32 {bindc_name = "i"} %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) @@ -27,7 +25,9 @@ func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_bas // CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"} // CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + // CHECK: %[[C1:.*]] = arith.constant 1 : i32 // CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index + // CHECK: %[[C10:.*]] = arith.constant 10 : i32 // CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index // CHECK: %[[STEP:.*]] = arith.constant 1 : index diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp index ee2ff8562e9ff2..2b8b00570910d6 100644 --- a/flang/tools/bbc/bbc.cpp +++ b/flang/tools/bbc/bbc.cpp @@ -139,6 +139,12 @@ static llvm::cl::opt llvm::cl::desc("enable openmp device compilation"), llvm::cl::init(false)); +static llvm::cl::opt enableDoConcurrentToOpenMPConversion( + "fdo-concurrent-parallel", + llvm::cl::desc( + "Try to map `do concurrent` loops to OpenMP [none|host|device]"), + llvm::cl::init("none")); + static llvm::cl::opt enableOpenMPGPU("fopenmp-is-gpu", llvm::cl::desc("enable openmp GPU target codegen"), @@ -258,7 +264,18 @@ createTargetMachine(llvm::StringRef targetTriple, std::string &error) { static mlir::LogicalResult runOpenMPPasses(mlir::ModuleOp mlirModule) { mlir::PassManager 
pm(mlirModule->getName(), mlir::OpPassManager::Nesting::Implicit); - fir::createOpenMPFIRPassPipeline(pm, enableOpenMPDevice); + using DoConcurrentMappingKind = + Fortran::frontend::CodeGenOptions::DoConcurrentMappingKind; + + auto doConcurrentMappingKind = + llvm::StringSwitch( + enableDoConcurrentToOpenMPConversion) + .Case("host", DoConcurrentMappingKind::DCMK_Host) + .Case("device", DoConcurrentMappingKind::DCMK_Device) + .Default(DoConcurrentMappingKind::DCMK_None); + + fir::createOpenMPFIRPassPipeline(pm, enableOpenMPDevice, + doConcurrentMappingKind); (void)mlir::applyPassManagerCLOptions(pm); if (mlir::failed(pm.run(mlirModule))) { llvm::errs() << "FATAL: failed to correctly apply OpenMP pass pipeline";