From 62423e114a380771c0225bf43cd39bf4c7ca0122 Mon Sep 17 00:00:00 2001 From: ergawy Date: Wed, 8 May 2024 06:40:46 -0500 Subject: [PATCH 1/8] [flang][OpenMP] Extend `do concurrent` mapping to device. For simple loops, we can now choose to map `do concurrent` to either the host (i.e. `omp parallel do`) or the device (i.e. `omp target teams distribute parallel do`). In order to use this from `flang-new`, you can pass: `-fopenmp -fdo-concurrent-parallel=[host|device|none]`; where `none` will disable the `do concurrent` mapping altogether. --- .../flang}/Lower/OpenMP/Clauses.h | 0 .../flang}/Lower/OpenMP/Utils.h | 4 +- .../flang/Optimizer/Transforms/Passes.h | 2 + .../flang/Optimizer/Transforms/Passes.td | 6 +- flang/lib/Frontend/FrontendActions.cpp | 37 +- flang/lib/Lower/OpenMP/ClauseProcessor.cpp | 26 +- flang/lib/Lower/OpenMP/ClauseProcessor.h | 4 +- flang/lib/Lower/OpenMP/Clauses.cpp | 2 +- .../lib/Lower/OpenMP/DataSharingProcessor.cpp | 2 +- flang/lib/Lower/OpenMP/DataSharingProcessor.h | 2 +- flang/lib/Lower/OpenMP/OpenMP.cpp | 86 +-- flang/lib/Lower/OpenMP/ReductionProcessor.h | 2 +- flang/lib/Lower/OpenMP/Utils.cpp | 106 +++- flang/lib/Optimizer/Transforms/CMakeLists.txt | 1 + .../Transforms/DoConcurrentConversion.cpp | 519 ++++++++++++++---- .../Transforms/DoConcurrent/basic_device.f90 | 86 +++ .../{basic.f90 => basic_host.f90} | 11 +- .../{basic.mlir => basic_host.mlir} | 6 +- 18 files changed, 639 insertions(+), 263 deletions(-) rename flang/{lib => include/flang}/Lower/OpenMP/Clauses.h (100%) rename flang/{lib => include/flang}/Lower/OpenMP/Utils.h (91%) create mode 100644 flang/test/Transforms/DoConcurrent/basic_device.f90 rename flang/test/Transforms/DoConcurrent/{basic.f90 => basic_host.f90} (93%) rename flang/test/Transforms/DoConcurrent/{basic.mlir => basic_host.mlir} (97%) diff --git a/flang/lib/Lower/OpenMP/Clauses.h b/flang/include/flang/Lower/OpenMP/Clauses.h similarity index 100% rename from flang/lib/Lower/OpenMP/Clauses.h rename to flang/include/flang/Lower/OpenMP/Clauses.h diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/include/flang/Lower/OpenMP/Utils.h similarity index 91% rename from flang/lib/Lower/OpenMP/Utils.h rename to flang/include/flang/Lower/OpenMP/Utils.h index f322580cdab8fc..7152a2c978c2d7 100644 --- a/flang/lib/Lower/OpenMP/Utils.h +++ b/flang/include/flang/Lower/OpenMP/Utils.h @@ -50,7 +50,7 @@ using DeclareTargetCapturePair = const Fortran::semantics::Symbol &>; mlir::omp::MapInfoOp -createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc, +createMapInfoOp(mlir::OpBuilder &builder, mlir::Location loc, mlir::Value baseAddr, mlir::Value varPtrPtr, std::string name, mlir::ArrayRef bounds, mlir::ArrayRef members, uint64_t mapType, @@ -73,6 +73,8 @@ void genObjectList(const ObjectList &objects, Fortran::lower::AbstractConverter &converter, llvm::SmallVectorImpl &operands); +mlir::Value calculateTripCount(fir::FirOpBuilder &builder, mlir::Location loc, + const mlir::omp::CollapseClauseOps &ops); } // namespace omp } // namespace lower } // namespace Fortran diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h index 305e85c1d6f81f..ae32e90b85cf63 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.h +++ b/flang/include/flang/Optimizer/Transforms/Passes.h @@ -38,6 +38,7 @@ namespace fir { #define GEN_PASS_DECL_ARRAYVALUECOPY #define GEN_PASS_DECL_CHARACTERCONVERSION #define GEN_PASS_DECL_CFGCONVERSION +#define GEN_PASS_DECL_DOCONCURRENTCONVERSIONPASS #define 
GEN_PASS_DECL_EXTERNALNAMECONVERSION #define GEN_PASS_DECL_MEMREFDATAFLOWOPT #define GEN_PASS_DECL_SIMPLIFYINTRINSICS @@ -96,6 +97,7 @@ createFunctionAttrPass(FunctionAttrTypes &functionAttr, bool noInfsFPMath, bool noSignedZerosFPMath, bool unsafeFPMath); std::unique_ptr createDoConcurrentConversionPass(); +std::unique_ptr createDoConcurrentConversionPass(bool mapToDevice); void populateCfgConversionRewrites(mlir::RewritePatternSet &patterns, bool forceLoopToExecuteOnce = false); diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td index 7d24f65f605cdc..97d550a2bf75e1 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -423,8 +423,12 @@ def DoConcurrentConversionPass : Pass<"fopenmp-do-concurrent-conversion", "mlir: target. }]; - let constructor = "::fir::createDoConcurrentConversionPass()"; let dependentDialects = ["mlir::omp::OpenMPDialect"]; + + let options = [ + Option<"mapTo", "map-to", "std::string", "", + "Try to map `do concurrent` loops to OpenMP (on host or device)">, + ]; } #endif // FLANG_OPTIMIZER_TRANSFORMS_PASSES diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index 840599b04b95fc..479c22d02f9f3f 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -320,22 +320,14 @@ bool CodeGenAction::beginSourceFileAction() { // Add OpenMP-related passes // WARNING: These passes must be run immediately after the lowering to ensure // that the FIR is correct with respect to OpenMP operations/attributes. - bool isOpenMPEnabled = ci.getInvocation().getFrontendOpts().features.IsEnabled( + bool isOpenMPEnabled = + ci.getInvocation().getFrontendOpts().features.IsEnabled( Fortran::common::LanguageFeature::OpenMP); - if (isOpenMPEnabled) { - bool isDevice = false; - if (auto offloadMod = llvm::dyn_cast( - mlirModule->getOperation())) - isDevice = offloadMod.getIsTargetDevice(); - // WARNING: This pipeline must be run immediately after the lowering to - // ensure that the FIR is correct with respect to OpenMP operations/ - // attributes. 
- fir::createOpenMPFIRPassPipeline(pm, isDevice); - } using DoConcurrentMappingKind = Fortran::frontend::CodeGenOptions::DoConcurrentMappingKind; - DoConcurrentMappingKind selectedKind = ci.getInvocation().getCodeGenOpts().getDoConcurrentMapping(); + DoConcurrentMappingKind selectedKind = + ci.getInvocation().getCodeGenOpts().getDoConcurrentMapping(); if (selectedKind != DoConcurrentMappingKind::DCMK_None) { if (!isOpenMPEnabled) { unsigned diagID = ci.getDiagnostics().getCustomDiagID( @@ -345,18 +337,21 @@ bool CodeGenAction::beginSourceFileAction() { ci.getDiagnostics().Report(diagID); } else { bool mapToDevice = selectedKind == DoConcurrentMappingKind::DCMK_Device; - - if (mapToDevice) { - unsigned diagID = ci.getDiagnostics().getCustomDiagID( - clang::DiagnosticsEngine::Warning, - "TODO: lowering `do concurrent` loops to OpenMP device is not " - "supported yet"); - ci.getDiagnostics().Report(diagID); - } else - pm.addPass(fir::createDoConcurrentConversionPass()); + pm.addPass(fir::createDoConcurrentConversionPass(mapToDevice)); } } + if (isOpenMPEnabled) { + bool isDevice = false; + if (auto offloadMod = llvm::dyn_cast( + mlirModule->getOperation())) + isDevice = offloadMod.getIsTargetDevice(); + // WARNING: This pipeline must be run immediately after the lowering to + // ensure that the FIR is correct with respect to OpenMP operations/ + // attributes. + fir::createOpenMPFIRPassPipeline(pm, isDevice); + } + pm.enableVerifier(/*verifyPasses=*/true); pm.addPass(std::make_unique()); diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index a2661ffb437832..22b65b6cdb68fa 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -11,8 +11,8 @@ //===----------------------------------------------------------------------===// #include "ClauseProcessor.h" -#include "Clauses.h" +#include "flang/Lower/OpenMP/Clauses.h" #include "flang/Lower/PFTBuilder.h" #include "flang/Parser/tools.h" #include "flang/Semantics/tools.h" @@ -807,30 +807,6 @@ bool ClauseProcessor::processLink( }); } -mlir::omp::MapInfoOp -createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc, - mlir::Value baseAddr, mlir::Value varPtrPtr, std::string name, - llvm::ArrayRef bounds, - llvm::ArrayRef members, uint64_t mapType, - mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy, - bool isVal) { - if (auto boxTy = baseAddr.getType().dyn_cast()) { - baseAddr = builder.create(loc, baseAddr); - retTy = baseAddr.getType(); - } - - mlir::TypeAttr varType = mlir::TypeAttr::get( - llvm::cast(retTy).getElementType()); - - mlir::omp::MapInfoOp op = builder.create( - loc, retTy, baseAddr, varType, varPtrPtr, members, bounds, - builder.getIntegerAttr(builder.getIntegerType(64, false), mapType), - builder.getAttr(mapCaptureType), - builder.getStringAttr(name)); - - return op; -} - bool ClauseProcessor::processMap( mlir::Location currentLocation, Fortran::lower::StatementContext &stmtCtx, mlir::omp::MapClauseOps &result, diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index 050f2fef1332c0..2d5caed50e9800 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -12,12 +12,12 @@ #ifndef FORTRAN_LOWER_CLAUASEPROCESSOR_H #define FORTRAN_LOWER_CLAUASEPROCESSOR_H -#include "Clauses.h" #include "DirectivesCommon.h" #include "ReductionProcessor.h" -#include "Utils.h" #include "flang/Lower/AbstractConverter.h" #include "flang/Lower/Bridge.h" 
+#include "flang/Lower/OpenMP/Clauses.h" +#include "flang/Lower/OpenMP/Utils.h" #include "flang/Optimizer/Builder/Todo.h" #include "flang/Parser/dump-parse-tree.h" #include "flang/Parser/parse-tree.h" diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp index 97337cfc08c72a..b3f8c73d8753d4 100644 --- a/flang/lib/Lower/OpenMP/Clauses.cpp +++ b/flang/lib/Lower/OpenMP/Clauses.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "Clauses.h" +#include "flang/Lower/OpenMP/Clauses.h" #include "flang/Common/idioms.h" #include "flang/Evaluate/expression.h" diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index befadea2a98aba..fd67714be0225f 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -12,7 +12,7 @@ #include "DataSharingProcessor.h" -#include "Utils.h" +#include "flang/Lower/OpenMP/Utils.h" #include "flang/Lower/PFTBuilder.h" #include "flang/Lower/SymbolMap.h" #include "flang/Optimizer/Builder/Todo.h" diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.h b/flang/lib/Lower/OpenMP/DataSharingProcessor.h index 46f0d6d57d90b0..7ba3b85c9155c2 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.h +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.h @@ -12,9 +12,9 @@ #ifndef FORTRAN_LOWER_DATASHARINGPROCESSOR_H #define FORTRAN_LOWER_DATASHARINGPROCESSOR_H -#include "Clauses.h" #include "flang/Lower/AbstractConverter.h" #include "flang/Lower/OpenMP.h" +#include "flang/Lower/OpenMP/Clauses.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Parser/parse-tree.h" #include "flang/Semantics/symbol.h" diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index a80b992c0c7465..8cad6154c7311a 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -13,15 +13,15 @@ #include "flang/Lower/OpenMP.h" #include "ClauseProcessor.h" -#include "Clauses.h" #include "DataSharingProcessor.h" #include "DirectivesCommon.h" #include "ReductionProcessor.h" -#include "Utils.h" #include "flang/Common/idioms.h" #include "flang/Lower/Bridge.h" #include "flang/Lower/ConvertExpr.h" #include "flang/Lower/ConvertVariable.h" +#include "flang/Lower/OpenMP/Clauses.h" +#include "flang/Lower/OpenMP/Utils.h" #include "flang/Lower/StatementContext.h" #include "flang/Lower/SymbolMap.h" #include "flang/Optimizer/Builder/BoxValue.h" @@ -280,84 +280,6 @@ static void threadPrivatizeVars(Fortran::lower::AbstractConverter &converter, } } -static mlir::Value -calculateTripCount(Fortran::lower::AbstractConverter &converter, - mlir::Location loc, - const mlir::omp::CollapseClauseOps &ops) { - using namespace mlir::arith; - assert(ops.loopLBVar.size() == ops.loopUBVar.size() && - ops.loopLBVar.size() == ops.loopStepVar.size() && - !ops.loopLBVar.empty() && "Invalid bounds or step"); - - fir::FirOpBuilder &b = converter.getFirOpBuilder(); - - // Get the bit width of an integer-like type. - auto widthOf = [](mlir::Type ty) -> unsigned { - if (mlir::isa(ty)) { - return mlir::IndexType::kInternalStorageBitWidth; - } - if (auto tyInt = mlir::dyn_cast(ty)) { - return tyInt.getWidth(); - } - llvm_unreachable("Unexpected type"); - }; - - // For a type that is either IntegerType or IndexType, return the - // equivalent IntegerType. In the former case this is a no-op. 
- auto asIntTy = [&](mlir::Type ty) -> mlir::IntegerType { - if (ty.isIndex()) { - return mlir::IntegerType::get(ty.getContext(), widthOf(ty)); - } - assert(ty.isIntOrIndex() && "Unexpected type"); - return mlir::cast(ty); - }; - - // For two given values, establish a common signless IntegerType - // that can represent any value of type of x and of type of y, - // and return the pair of x, y converted to the new type. - auto unifyToSignless = - [&](fir::FirOpBuilder &b, mlir::Value x, - mlir::Value y) -> std::pair { - auto tyX = asIntTy(x.getType()), tyY = asIntTy(y.getType()); - unsigned width = std::max(widthOf(tyX), widthOf(tyY)); - auto wideTy = mlir::IntegerType::get(b.getContext(), width, - mlir::IntegerType::Signless); - return std::make_pair(b.createConvert(loc, wideTy, x), - b.createConvert(loc, wideTy, y)); - }; - - // Start with signless i32 by default. - auto tripCount = b.createIntegerConstant(loc, b.getI32Type(), 1); - - for (auto [origLb, origUb, origStep] : - llvm::zip(ops.loopLBVar, ops.loopUBVar, ops.loopStepVar)) { - auto tmpS0 = b.createIntegerConstant(loc, origStep.getType(), 0); - auto [step, step0] = unifyToSignless(b, origStep, tmpS0); - auto reverseCond = b.create(loc, CmpIPredicate::slt, step, step0); - auto negStep = b.create(loc, step0, step); - mlir::Value absStep = b.create(loc, reverseCond, negStep, step); - - auto [lb, ub] = unifyToSignless(b, origLb, origUb); - auto start = b.create(loc, reverseCond, ub, lb); - auto end = b.create(loc, reverseCond, lb, ub); - - mlir::Value range = b.create(loc, end, start); - auto rangeCond = b.create(loc, CmpIPredicate::slt, end, start); - std::tie(range, absStep) = unifyToSignless(b, range, absStep); - // numSteps = (range /u absStep) + 1 - auto numSteps = - b.create(loc, b.create(loc, range, absStep), - b.createIntegerConstant(loc, range.getType(), 1)); - - auto trip0 = b.createIntegerConstant(loc, numSteps.getType(), 0); - auto loopTripCount = b.create(loc, rangeCond, trip0, numSteps); - auto [totalTC, thisTC] = unifyToSignless(b, tripCount, loopTripCount); - tripCount = b.create(loc, totalTC, thisTC); - } - - return tripCount; -} - static mlir::Operation * createAndSetPrivatizedLoopVar(Fortran::lower::AbstractConverter &converter, mlir::Location loc, mlir::Value indexVal, @@ -1572,8 +1494,8 @@ genLoopNestOp(Fortran::lower::AbstractConverter &converter, llvm::SmallVector iv; ClauseProcessor cp(converter, semaCtx, clauses); cp.processCollapse(loc, eval, collapseClauseOps, iv); - targetOp.getTripCountMutable().assign( - calculateTripCount(converter, loc, collapseClauseOps)); + targetOp.getTripCountMutable().assign(calculateTripCount( + converter.getFirOpBuilder(), loc, collapseClauseOps)); } return loopNestOp; } diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.h b/flang/lib/Lower/OpenMP/ReductionProcessor.h index 17c8ae7b69214a..3f118b936c909a 100644 --- a/flang/lib/Lower/OpenMP/ReductionProcessor.h +++ b/flang/lib/Lower/OpenMP/ReductionProcessor.h @@ -13,7 +13,7 @@ #ifndef FORTRAN_LOWER_REDUCTIONPROCESSOR_H #define FORTRAN_LOWER_REDUCTIONPROCESSOR_H -#include "Clauses.h" +#include "flang/Lower/OpenMP/Clauses.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Semantics/symbol.h" diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp index da3f2be73e5095..c2a03f6c656a0a 100644 --- a/flang/lib/Lower/OpenMP/Utils.cpp +++ b/flang/lib/Lower/OpenMP/Utils.cpp @@ -10,16 +10,17 @@ // 
//===----------------------------------------------------------------------===// -#include "Utils.h" -#include "Clauses.h" +#include #include #include +#include #include #include #include #include #include +#include llvm::cl::opt treatIndexAsSection( "openmp-treat-index-as-section", @@ -112,6 +113,107 @@ getOmpObjectSymbol(const Fortran::parser::OmpObject &ompObject) { return sym; } +mlir::omp::MapInfoOp +createMapInfoOp(mlir::OpBuilder &builder, mlir::Location loc, + mlir::Value baseAddr, mlir::Value varPtrPtr, std::string name, + llvm::ArrayRef bounds, + llvm::ArrayRef members, uint64_t mapType, + mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy, + bool isVal) { + if (auto boxTy = baseAddr.getType().dyn_cast()) { + baseAddr = builder.create(loc, baseAddr); + retTy = baseAddr.getType(); + } + + mlir::TypeAttr varType = mlir::TypeAttr::get( + llvm::cast(retTy).getElementType()); + + mlir::omp::MapInfoOp op = builder.create( + loc, retTy, baseAddr, varType, varPtrPtr, members, bounds, + builder.getIntegerAttr(builder.getIntegerType(64, false), mapType), + builder.getAttr(mapCaptureType), + builder.getStringAttr(name)); + + return op; +} + +mlir::Value calculateTripCount(fir::FirOpBuilder &builder, mlir::Location loc, + const mlir::omp::CollapseClauseOps &ops) { + using namespace mlir::arith; + assert(ops.loopLBVar.size() == ops.loopUBVar.size() && + ops.loopLBVar.size() == ops.loopStepVar.size() && + !ops.loopLBVar.empty() && "Invalid bounds or step"); + + // Get the bit width of an integer-like type. + auto widthOf = [](mlir::Type ty) -> unsigned { + if (mlir::isa(ty)) { + return mlir::IndexType::kInternalStorageBitWidth; + } + if (auto tyInt = mlir::dyn_cast(ty)) { + return tyInt.getWidth(); + } + llvm_unreachable("Unexpected type"); + }; + + // For a type that is either IntegerType or IndexType, return the + // equivalent IntegerType. In the former case this is a no-op. + auto asIntTy = [&](mlir::Type ty) -> mlir::IntegerType { + if (ty.isIndex()) { + return mlir::IntegerType::get(ty.getContext(), widthOf(ty)); + } + assert(ty.isIntOrIndex() && "Unexpected type"); + return mlir::cast(ty); + }; + + // For two given values, establish a common signless IntegerType + // that can represent any value of type of x and of type of y, + // and return the pair of x, y converted to the new type. + auto unifyToSignless = + [&](fir::FirOpBuilder &b, mlir::Value x, + mlir::Value y) -> std::pair { + auto tyX = asIntTy(x.getType()), tyY = asIntTy(y.getType()); + unsigned width = std::max(widthOf(tyX), widthOf(tyY)); + auto wideTy = mlir::IntegerType::get(b.getContext(), width, + mlir::IntegerType::Signless); + return std::make_pair(b.createConvert(loc, wideTy, x), + b.createConvert(loc, wideTy, y)); + }; + + // Start with signless i32 by default. 
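+  // What the loop below computes for each member of the collapsed loop nest
+  // is, roughly (bounds are inclusive, as they are for `do concurrent`):
+  //   tripCount *= isEmpty ? 0 : (|ub - lb| /u |step|) + 1
+  // where `/u` is unsigned division; multiplying the per-loop counts yields
+  // the iteration count of the whole nest.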
+ auto tripCount = builder.createIntegerConstant(loc, builder.getI32Type(), 1); + + for (auto [origLb, origUb, origStep] : + llvm::zip(ops.loopLBVar, ops.loopUBVar, ops.loopStepVar)) { + auto tmpS0 = builder.createIntegerConstant(loc, origStep.getType(), 0); + auto [step, step0] = unifyToSignless(builder, origStep, tmpS0); + auto reverseCond = + builder.create(loc, CmpIPredicate::slt, step, step0); + auto negStep = builder.create(loc, step0, step); + mlir::Value absStep = + builder.create(loc, reverseCond, negStep, step); + + auto [lb, ub] = unifyToSignless(builder, origLb, origUb); + auto start = builder.create(loc, reverseCond, ub, lb); + auto end = builder.create(loc, reverseCond, lb, ub); + + mlir::Value range = builder.create(loc, end, start); + auto rangeCond = + builder.create(loc, CmpIPredicate::slt, end, start); + std::tie(range, absStep) = unifyToSignless(builder, range, absStep); + // numSteps = (range /u absStep) + 1 + auto numSteps = builder.create( + loc, builder.create(loc, range, absStep), + builder.createIntegerConstant(loc, range.getType(), 1)); + + auto trip0 = builder.createIntegerConstant(loc, numSteps.getType(), 0); + auto loopTripCount = + builder.create(loc, rangeCond, trip0, numSteps); + auto [totalTC, thisTC] = unifyToSignless(builder, tripCount, loopTripCount); + tripCount = builder.create(loc, totalTC, thisTC); + } + + return tripCount; +} } // namespace omp } // namespace lower } // namespace Fortran diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt index 0e72b01b5f3826..fbf32690d4cfaf 100644 --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -23,6 +23,7 @@ add_flang_library(FIRTransforms VScaleAttr.cpp FunctionAttr.cpp DoConcurrentConversion.cpp + ../../Lower/OpenMP/Utils.cpp DEPENDS FIRDialect diff --git a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp index 44e9177a316d85..6ef77dae7c4b7b 100644 --- a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp @@ -6,11 +6,15 @@ // //===----------------------------------------------------------------------===// +#include "flang/Lower/OpenMP/Utils.h" +#include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Builder/Todo.h" #include "flang/Optimizer/Dialect/FIRDialect.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/Dialect/FIRType.h" #include "flang/Optimizer/Dialect/Support/FIRContext.h" #include "flang/Optimizer/HLFIR/HLFIRDialect.h" +#include "flang/Optimizer/HLFIR/HLFIROps.h" #include "flang/Optimizer/Transforms/Passes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" @@ -18,8 +22,11 @@ #include "mlir/IR/IRMapping.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/RegionUtils.h" +#include "llvm/Frontend/OpenMP/OMPConstants.h" #include +#include namespace fir { #define GEN_PASS_DEF_DOCONCURRENTCONVERSIONPASS @@ -33,46 +40,15 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { public: using mlir::OpConversionPattern::OpConversionPattern; + DoConcurrentConversion(mlir::MLIRContext *context, bool mapToDevice) + : OpConversionPattern(context), mapToDevice(mapToDevice) {} + mlir::LogicalResult matchAndRewrite(fir::DoLoopOp doLoop, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const override { - 
mlir::OpPrintingFlags flags; - flags.printGenericOpForm(); - - mlir::omp::ParallelOp parallelOp = - rewriter.create(doLoop.getLoc()); - - mlir::Block *block = rewriter.createBlock(¶llelOp.getRegion()); - - rewriter.setInsertionPointToEnd(block); - rewriter.create(doLoop.getLoc()); - - rewriter.setInsertionPointToStart(block); - - // ==== TODO (1) Start ==== - // - // The goal of the few lines below is to collect and clone - // the list of operations that define the loop's lower and upper bounds as - // well as the step. Should we, instead of doing this here, split it into 2 - // stages? - // - // 1. **Stage 1**: add an analysis that extracts all the relevant - // operations defining the lower-bound, upper-bound, and - // step. - // 2. **Stage 2**: clone the collected operations in the parallel region. - // - // So far, the pass has been tested with very simple loops (where the bounds - // and step are constants) so the goal of **Stage 1** is to have a - // well-defined component that has the sole responsibility of collecting all - // the relevant ops relevant to the loop header. This was we can test this - // in isolation for more complex loops and better organize the code. **Stage - // 2** would then be responsible for the actual cloning of the collected - // loop header preparation/allocation operations. - - // Clone the LB, UB, step defining ops inside the parallel region. - mlir::Operation* lbOp = doLoop.getLowerBound().getDefiningOp(); - mlir::Operation* ubOp = doLoop.getUpperBound().getDefiningOp(); - mlir::Operation* stepOp = doLoop.getStep().getDefiningOp(); + mlir::Operation *lbOp = doLoop.getLowerBound().getDefiningOp(); + mlir::Operation *ubOp = doLoop.getUpperBound().getDefiningOp(); + mlir::Operation *stepOp = doLoop.getStep().getDefiningOp(); if (lbOp == nullptr || ubOp == nullptr || stepOp == nullptr) { return rewriter.notifyMatchFailure( @@ -85,7 +61,7 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { if (mlir::isa_and_present(operation)) return true; - if (fir::ConvertOp convertOp = + if (auto convertOp = mlir::dyn_cast_if_present(operation)) return isOpUltimatelyConstant(convertOp.getValue().getDefiningOp()); @@ -99,91 +75,378 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { "constant LB, UB, and step values."); } - mlir::omp::LoopNestClauseOps clauseOps; - clauseOps.loopLBVar.push_back(rewriter.clone(*lbOp)->getResult(0)); - clauseOps.loopUBVar.push_back(rewriter.clone(*ubOp)->getResult(0)); - clauseOps.loopStepVar.push_back(rewriter.clone(*stepOp)->getResult(0)); - clauseOps.loopInclusiveAttr = rewriter.getUnitAttr(); - // ==== TODO (1) End ==== - auto wsloopOp = rewriter.create(doLoop.getLoc()); - rewriter.createBlock(&wsloopOp.getRegion()); - rewriter.setInsertionPoint( - rewriter.create(wsloopOp.getLoc())); + llvm::SmallVector liveIns; + collectLoopLiveIns(doLoop, liveIns); + assert(!liveIns.empty()); - auto loopNestOp = - rewriter.create(doLoop.getLoc(), clauseOps); + mlir::IRMapping mapper; + mlir::omp::TargetOp targetOp = nullptr; + mlir::omp::LoopNestClauseOps loopNestClauseOps; + + if (mapToDevice) { + mlir::omp::TargetClauseOps clauseOps; + for (mlir::Value liveIn : liveIns) + clauseOps.mapVars.push_back(genMapInfoOpForLiveIn(rewriter, liveIn)); + targetOp = + genTargetOp(doLoop.getLoc(), rewriter, mapper, liveIns, clauseOps); + genTeamsOp(doLoop.getLoc(), rewriter, doLoop, liveIns, mapper, + loopNestClauseOps); + genDistributeOp(doLoop.getLoc(), rewriter); + } - auto outlineableOp = - mlir::dyn_cast(*parallelOp); - 
rewriter.setInsertionPointToStart(outlineableOp.getAllocaBlock()); - - // ==== TODO (2) Start ==== - // - // The goal of the following simple work-list algorithm and - // the following `for` loop is to collect all the operations related to the - // allocation of the induction variable for the `do concurrent` loop. The - // operations collected by this algorithm are very similar to what is - // usually emitted for privatized variables, e.g. for omp.parallel loops. - // Therefore, I think we can: - // - // 1. **Stage 1**: Add an analysis that colects all these operations. The - // goal is similar to **Stage 1** of TODO (1): isolate the - // algorithm is an individually-testable component so that - // we properly implement and test it for more complicated - // `do concurrent` loops. - // 1. **Stage 2**: Using the collected operations, create and populate an - // `omp.private {type=private}` op to server as the - // delayed privatizer for the new work-sharing loop. - - // For the induction variable, we need to privative its allocation and - // binding inside the parallel region. - llvm::SmallSetVector workList; - // Therefore, we first discover the induction variable by discovering - // `fir.store`s where the source is the loop's block argument. - workList.insert(doLoop.getInductionVar().getUsers().begin(), - doLoop.getInductionVar().getUsers().end()); - llvm::SmallSetVector inductionVarTargetStores; - - // Walk the def-chain of the loop's block argument until we hit `fir.store`. - while (!workList.empty()) { - mlir::Operation *item = workList.front(); - - if (auto storeOp = mlir::dyn_cast(item)) { - inductionVarTargetStores.insert(storeOp); - } else { - workList.insert(item->getUsers().begin(), item->getUsers().end()); - } + genParallelOp(doLoop.getLoc(), rewriter, doLoop, liveIns, mapper, + loopNestClauseOps); + genWsLoopOp(rewriter, doLoop, mapper, loopNestClauseOps); + + // Now that we created the nested `ws.loop` op, we set can the `target` op's + // trip count. + if (mapToDevice) { + rewriter.setInsertionPoint(targetOp); + auto parentModule = doLoop->getParentOfType(); + fir::FirOpBuilder firBuilder(rewriter, fir::getKindMapping(parentModule)); + + mlir::omp::CollapseClauseOps collapseClauseOps; + collapseClauseOps.loopLBVar.push_back(lbOp->getResult(0)); + collapseClauseOps.loopUBVar.push_back(ubOp->getResult(0)); + collapseClauseOps.loopStepVar.push_back(stepOp->getResult(0)); + + mlir::cast(targetOp).getTripCountMutable().assign( + Fortran::lower::omp::calculateTripCount(firBuilder, doLoop.getLoc(), + collapseClauseOps)); + } + + rewriter.eraseOp(doLoop); + return mlir::success(); + } + +private: + /// Collect the list of values used inside the loop but defined outside of it. + /// The first item in the retunred list is always the loop's induction + /// variable. + void collectLoopLiveIns(fir::DoLoopOp doLoop, + llvm::SmallVectorImpl &liveIns) const { + // Given an operation `op`, this lambda returns true if `op`'s operand is + // ultimately the loop's induction variable. Detecting this helps finding + // the live-in value corresponding to the induction variable in case the + // induction variable is indirectly used in the loop (e.g. throught a cast + // op). 
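+    // For example, in a loop body that contains (names are illustrative):
+    //   %iv32 = fir.convert %iv : (index) -> i32
+    //   fir.store %iv32 to %i_addr : !fir.ref<i32>
+    // the store targets the memory holding the user-visible induction
+    // variable, so the live-in corresponding to that memory is moved to the
+    // front of the list below.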
+ std::function isIndVarUltimateOperand = + [&](mlir::Operation *op) { + if (auto storeOp = mlir::dyn_cast_if_present(op)) { + return (storeOp.getValue() == doLoop.getInductionVar()) || + isIndVarUltimateOperand(storeOp.getValue().getDefiningOp()); + } + + if (auto convertOp = mlir::dyn_cast_if_present(op)) { + return convertOp.getOperand() == doLoop.getInductionVar() || + isIndVarUltimateOperand( + convertOp.getValue().getDefiningOp()); + } + + return false; + }; + + llvm::SmallDenseSet seenValues; + llvm::SmallDenseSet seenOps; - workList.remove(item); + mlir::visitUsedValuesDefinedAbove( + doLoop.getRegion(), [&](mlir::OpOperand *operand) { + if (!seenValues.insert(operand->get()).second) + return; + + mlir::Operation *definingOp = operand->get().getDefiningOp(); + // We want to collect ops corresponding to live-ins only once. + if (definingOp && !seenOps.insert(definingOp).second) + return; + + liveIns.push_back(operand->get()); + + if (isIndVarUltimateOperand(operand->getOwner())) + std::swap(*liveIns.begin(), *liveIns.rbegin()); + }); + } + + void genBoundsOps(mlir::ConversionPatternRewriter &rewriter, + mlir::Location loc, hlfir::DeclareOp declareOp, + llvm::SmallVectorImpl &boundsOps) const { + if (declareOp.getShape() == nullptr) { + return; } - // For each collected `fir.store`, find the target memref's alloca's and - // declare ops. - llvm::SmallSetVector declareAndAllocasToClone; - for (auto storeOp : inductionVarTargetStores) { - mlir::Operation *storeTarget = storeOp.getMemref().getDefiningOp(); + auto shapeOp = mlir::dyn_cast_if_present( + declareOp.getShape().getDefiningOp()); - for (auto operand : storeTarget->getOperands()) { - declareAndAllocasToClone.insert(operand.getDefiningOp()); - } - declareAndAllocasToClone.insert(storeTarget); + if (shapeOp == nullptr) + TODO(loc, "Shapes not defined by shape op's are not supported yet."); + + auto extents = shapeOp.getExtents(); + + auto genBoundsOp = [&](mlir::Value extent) { + mlir::Type extentType = extent.getType(); + auto lb = rewriter.create( + loc, extentType, rewriter.getIntegerAttr(extentType, 0)); + // TODO I think this caluclation might not be correct. But this is how + // it is done in PFT->OpenMP lowering. So keeping it like this until we + // double check. 
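+      // (As written, the value below is `extent - 0`, i.e. the extent
+      // itself; if `omp.map.bounds` expects an inclusive zero-based upper
+      // bound, `extent - 1` might be what is intended.)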
+ mlir::Value ub = rewriter.create(loc, extent, lb); + + return rewriter.create( + loc, rewriter.getType(), lb, ub, extent, + mlir::Value{}, false, mlir::Value{}); + }; + + for (auto extent : extents) + boundsOps.push_back(genBoundsOp(extent)); + } + + mlir::omp::MapInfoOp + genMapInfoOpForLiveIn(mlir::ConversionPatternRewriter &rewriter, + mlir::Value liveIn) const { + auto declareOp = + mlir::dyn_cast_if_present(liveIn.getDefiningOp()); + + if (declareOp == nullptr) + TODO(liveIn.getLoc(), + "Values not defined by declare op's are not supported yet."); + + mlir::Type liveInType = liveIn.getType(); + mlir::Type eleType = liveInType; + if (auto refType = liveInType.dyn_cast()) + eleType = refType.getElementType(); + + llvm::omp::OpenMPOffloadMappingFlags mapFlag = + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT; + mlir::omp::VariableCaptureKind captureKind = + mlir::omp::VariableCaptureKind::ByRef; + + if (fir::isa_trivial(eleType) || fir::isa_char(eleType)) { + captureKind = mlir::omp::VariableCaptureKind::ByCopy; + } else if (!fir::isa_builtin_cptr_type(eleType)) { + mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_TO; + mapFlag |= llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_FROM; } - // ==== TODO (2) End ==== - // - // TODO (1 & 2): Isolating analyses proposed in both TODOs, I think we can - // more easily generalize the pass to work for targets other than OpenMP, - // e.g. OpenACC, I think can, can reuse the results of the analyses and only - // change the code-gen/rewriting. - mlir::IRMapping mapper; + llvm::SmallVector boundsOps; + genBoundsOps(rewriter, liveIn.getLoc(), declareOp, boundsOps); + + return Fortran::lower::omp::createMapInfoOp( + rewriter, liveIn.getLoc(), declareOp.getBase(), /*varPtrPtr=*/{}, + declareOp.getUniqName().str(), boundsOps, /*members=*/{}, + static_cast< + std::underlying_type_t>( + mapFlag), + captureKind, liveInType); + } + + mlir::omp::TargetOp genTargetOp(mlir::Location loc, + mlir::ConversionPatternRewriter &rewriter, + mlir::IRMapping &mapper, + llvm::ArrayRef liveIns, + mlir::omp::TargetClauseOps &clauseOps) const { + auto targetOp = rewriter.create(loc, clauseOps); + + genBodyOfTargetOp(rewriter, targetOp, liveIns, clauseOps.mapVars, mapper); + return targetOp; + } + + void genBodyOfTargetOp(mlir::ConversionPatternRewriter &rewriter, + mlir::omp::TargetOp targetOp, + llvm::ArrayRef liveIns, + llvm::ArrayRef liveInMapInfoOps, + mlir::IRMapping &mapper) const { + mlir::Region ®ion = targetOp.getRegion(); + + llvm::SmallVector liveInTypes; + llvm::SmallVector liveInLocs; + + for (mlir::Value liveIn : liveIns) { + liveInTypes.push_back(liveIn.getType()); + liveInLocs.push_back(liveIn.getLoc()); + } + + rewriter.createBlock(®ion, {}, liveInTypes, liveInLocs); + + for (auto [arg, mapInfoOp] : + llvm::zip_equal(region.getArguments(), liveInMapInfoOps)) { + auto miOp = mlir::cast(mapInfoOp.getDefiningOp()); + hlfir::DeclareOp liveInDeclare = genLiveInDeclare(rewriter, arg, miOp); + mapper.map(miOp.getVariableOperand(0), liveInDeclare.getBase()); + } + + auto terminator = + rewriter.create(targetOp.getLoc()); + rewriter.setInsertionPoint(terminator); + } + + hlfir::DeclareOp + genLiveInDeclare(mlir::ConversionPatternRewriter &rewriter, + mlir::Value liveInArg, + mlir::omp::MapInfoOp liveInMapInfoOp) const { + mlir::Type liveInType = liveInArg.getType(); + + if (fir::isa_ref_type(liveInType)) + liveInType = fir::unwrapRefType(liveInType); + + mlir::Value shape = [&]() -> mlir::Value { + if (hlfir::isFortranScalarNumericalType(liveInType)) 
+ return {}; - // Collect the memref defining ops in the parallel region. - for (mlir::Operation *opToClone : declareAndAllocasToClone) { + if (hlfir::isFortranArrayObject(liveInType)) { + llvm::SmallVector shapeOpOperands; + + for (auto boundsOperand : liveInMapInfoOp.getBounds()) { + auto boundsOp = + mlir::cast(boundsOperand.getDefiningOp()); + mlir::Operation *localExtentDef = + boundsOp.getExtent().getDefiningOp()->clone(); + rewriter.getInsertionBlock()->push_back(localExtentDef); + assert(localExtentDef->getNumResults() == 1); + + shapeOpOperands.push_back(localExtentDef->getResult(0)); + } + + return rewriter.create(liveInArg.getLoc(), + shapeOpOperands); + } + + std::string opStr; + llvm::raw_string_ostream opOs(opStr); + opOs << "Unsupported type: " << liveInType; + llvm_unreachable(opOs.str().c_str()); + }(); + + return rewriter.create(liveInArg.getLoc(), liveInArg, + liveInMapInfoOp.getName().value(), + shape); + } + + mlir::omp::TeamsOp + genTeamsOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter, + fir::DoLoopOp doLoop, llvm::ArrayRef liveIns, + mlir::IRMapping &mapper, + mlir::omp::LoopNestClauseOps &loopNestClauseOps) const { + auto teamsOp = rewriter.create( + loc, /*clauses=*/mlir::omp::TeamsClauseOps{}); + + mlir::Block *teamsBlock = rewriter.createBlock(&teamsOp.getRegion()); + rewriter.create(loc); + rewriter.setInsertionPointToStart(teamsBlock); + + genInductionVariableAlloc(rewriter, liveIns, mapper); + genLoopNestClauseOps(loc, rewriter, doLoop, mapper, loopNestClauseOps); + + return teamsOp; + } + + void + genLoopNestClauseOps(mlir::Location loc, + mlir::ConversionPatternRewriter &rewriter, + fir::DoLoopOp doLoop, mlir::IRMapping &mapper, + mlir::omp::LoopNestClauseOps &loopNestClauseOps) const { + assert(loopNestClauseOps.loopLBVar.empty() && + "Loop nest bounds were already emitted!"); + + // Clones the chain of ops defining a certain loop bound or its step into + // the parallel region. For example, if the value of a bound is defined by a + // `fir.convert`op, this lambda clones the `fir.convert` as well as the + // value it converts from. We do this since `omp.target` regions are + // isolated from above. 
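+    // For example, an upper bound of `10` is typically lowered as:
+    //   %c10_i32 = arith.constant 10 : i32
+    //   %ub = fir.convert %c10_i32 : (i32) -> index
+    // and both ops have to be re-created inside the region (SSA value names
+    // here are illustrative).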
+ std::function + cloneBoundOrStepDefChain = [&](mlir::Operation *operation) { + if (mlir::isa_and_present(operation)) + return rewriter.clone(*operation, mapper); + + if (auto convertOp = + mlir::dyn_cast_if_present(operation)) { + cloneBoundOrStepDefChain(convertOp.getValue().getDefiningOp()); + return rewriter.clone(*operation, mapper); + } + + std::string opStr; + llvm::raw_string_ostream opOs(opStr); + opOs << "Unexpected operation: " << *operation; + llvm_unreachable(opOs.str().c_str()); + }; + + mlir::Operation *lbOp = doLoop.getLowerBound().getDefiningOp(); + mlir::Operation *ubOp = doLoop.getUpperBound().getDefiningOp(); + mlir::Operation *stepOp = doLoop.getStep().getDefiningOp(); + + loopNestClauseOps.loopLBVar.push_back( + cloneBoundOrStepDefChain(lbOp)->getResult(0)); + loopNestClauseOps.loopLBVar.push_back( + cloneBoundOrStepDefChain(ubOp)->getResult(0)); + loopNestClauseOps.loopLBVar.push_back( + cloneBoundOrStepDefChain(stepOp)->getResult(0)); + loopNestClauseOps.loopInclusiveAttr = rewriter.getUnitAttr(); + } + + mlir::omp::DistributeOp + genDistributeOp(mlir::Location loc, + mlir::ConversionPatternRewriter &rewriter) const { + auto distOp = rewriter.create( + loc, /*clauses=*/mlir::omp::DistributeClauseOps{}); + + mlir::Block *distBlock = rewriter.createBlock(&distOp.getRegion()); + rewriter.create(loc); + rewriter.setInsertionPointToStart(distBlock); + + return distOp; + } + + void genInductionVariableAlloc(mlir::ConversionPatternRewriter &rewriter, + llvm::ArrayRef liveIns, + mlir::IRMapping &mapper) const { + mlir::Operation *indVarMemDef = liveIns.front().getDefiningOp(); + + assert( + indVarMemDef != nullptr && + "Induction variable memdef is expected to have a defining operation."); + + llvm::SmallSetVector indVarDeclareAndAlloc; + for (auto operand : indVarMemDef->getOperands()) + indVarDeclareAndAlloc.insert(operand.getDefiningOp()); + indVarDeclareAndAlloc.insert(indVarMemDef); + + for (mlir::Operation *opToClone : indVarDeclareAndAlloc) rewriter.clone(*opToClone, mapper); + } + + mlir::omp::ParallelOp + genParallelOp(mlir::Location loc, mlir::ConversionPatternRewriter &rewriter, + fir::DoLoopOp doLoop, llvm::ArrayRef liveIns, + mlir::IRMapping &mapper, + mlir::omp::LoopNestClauseOps &loopNestClauseOps) const { + auto parallelOp = rewriter.create(loc); + mlir::Block *parRegion = rewriter.createBlock(¶llelOp.getRegion()); + rewriter.create(loc); + rewriter.setInsertionPointToStart(parRegion); + + // If mapping to host, the local induction variable and loop bounds need to + // be emitted as part of the `omp.parallel` op. + if (!mapToDevice) { + genInductionVariableAlloc(rewriter, liveIns, mapper); + genLoopNestClauseOps(loc, rewriter, doLoop, mapper, loopNestClauseOps); } - // Clone the loop's body inside the worksharing construct using the mapped - // memref values. + return parallelOp; + } + + mlir::omp::LoopNestOp + genWsLoopOp(mlir::ConversionPatternRewriter &rewriter, fir::DoLoopOp doLoop, + mlir::IRMapping &mapper, + const mlir::omp::LoopNestClauseOps &clauseOps) const { + + auto wsloopOp = rewriter.create(doLoop.getLoc()); + rewriter.createBlock(&wsloopOp.getRegion()); + rewriter.setInsertionPoint( + rewriter.create(wsloopOp.getLoc())); + + auto loopNestOp = + rewriter.create(doLoop.getLoc(), clauseOps); + + // Clone the loop's body inside the worksharing construct using the + // mapped values. 
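+    // The `mapper` carries the mapping from values defined outside the loop
+    // (the induction variable's storage, the cloned bounds and step, and, on
+    // the device path, the mapped live-ins) to the copies created inside the
+    // new OpenMP region, so uses of those values in the cloned body are
+    // redirected to the region-local copies.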
rewriter.cloneRegionBefore(doLoop.getRegion(), loopNestOp.getRegion(), loopNestOp.getRegion().begin(), mapper); @@ -192,16 +455,25 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { rewriter.create(terminator->getLoc()); rewriter.eraseOp(terminator); - rewriter.eraseOp(doLoop); - - return mlir::success(); + return loopNestOp; } + + bool mapToDevice; }; class DoConcurrentConversionPass : public fir::impl::DoConcurrentConversionPassBase< DoConcurrentConversionPass> { public: + using fir::impl::DoConcurrentConversionPassBase< + DoConcurrentConversionPass>::DoConcurrentConversionPassBase; + + DoConcurrentConversionPass() = default; + + DoConcurrentConversionPass( + const fir::DoConcurrentConversionPassOptions &options) + : DoConcurrentConversionPassBase(options) {} + void runOnOperation() override { mlir::func::FuncOp func = getOperation(); @@ -210,8 +482,16 @@ class DoConcurrentConversionPass } auto *context = &getContext(); + + if (mapTo != "host" && mapTo != "device") { + mlir::emitWarning(mlir::UnknownLoc::get(context), + "DoConcurrentConversionPass: invalid `map-to` value. " + "Valid values are: `host` or `device`"); + return; + } + mlir::RewritePatternSet patterns(context); - patterns.insert(context); + patterns.insert(context, mapTo == "device"); mlir::ConversionTarget target(*context); target.addLegalDialect fir::createDoConcurrentConversionPass() { - return std::make_unique(); -} +std::unique_ptr +fir::createDoConcurrentConversionPass(bool mapToDevice) { + DoConcurrentConversionPassOptions options; + options.mapTo = mapToDevice ? "device" : "host"; + return std::make_unique(options); +} diff --git a/flang/test/Transforms/DoConcurrent/basic_device.f90 b/flang/test/Transforms/DoConcurrent/basic_device.f90 new file mode 100644 index 00000000000000..d9ce40697ff086 --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/basic_device.f90 @@ -0,0 +1,86 @@ +! Tests mapping of a basic `do concurrent` loop to +! `!$omp target teams distribute parallel do`. + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=device %s -o - \ +! RUN: | FileCheck %s +! xUN: bbc -emit-hlfir --fopenmp-do-concurrent-conversion="map-to=device" %s -o - \ +! xUN: | FileCheck %s + +! CHECK-LABEL: do_concurrent_basic +program do_concurrent_basic + implicit none + integer :: a(10) + integer :: i + + ! CHECK-DAG: %[[I_ORIG_ALLOC:.*]] = fir.alloca i32 {bindc_name = "i"} + ! CHECK: %[[I_ORIG_DECL:.*]]:2 = hlfir.declare %[[I_ORIG_ALLOC]] + + ! CHECK-DAG: %[[A_ADDR:.*]] = fir.address_of(@_QFEa) + ! CHECK: %[[A_SHAPE:.*]] = fir.shape %[[A_EXTENT:.*]] : (index) -> !fir.shape<1> + ! CHECK: %[[A_ORIG_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]](%[[A_SHAPE]]) + + ! CHECK-NOT: fir.do_loop + + ! CHECK-DAG: %[[I_MAP_INFO:.*]] = omp.map.info var_ptr(%[[I_ORIG_DECL]]#0 + ! CHECK: %[[C0:.*]] = arith.constant 0 : index + ! CHECK: %[[UPPER_BOUND:.*]] = arith.subi %[[A_EXTENT]], %[[C0]] : index + + ! CHECK: %[[A_BOUNDS:.*]] = omp.map.bounds lower_bound(%[[C0]] : index) + ! CHECK-SAME: upper_bound(%[[UPPER_BOUND]] : index) + ! CHECK-SAME: extent(%[[A_EXTENT]] : index) + + ! CHECK-DAG: %[[A_MAP_INFO:.*]] = omp.map.info var_ptr(%[[A_ORIG_DECL]]#0 : {{[^(]+}}) + ! CHECK-SAME: map_clauses(implicit, tofrom) capture(ByRef) bounds(%[[A_BOUNDS]]) + + ! CHECK: %[[TRIP_COUNT:.*]] = arith.muli %{{.*}}, %{{.*}} : i64 + + ! CHECK: omp.target trip_count(%[[TRIP_COUNT]] : i64) + ! CHECK-SAME: map_entries(%[[I_MAP_INFO]] -> %[[I_ARG:[[:alnum:]]+]], + ! CHECK-SAME: %[[A_MAP_INFO]] -> %[[A_ARG:.[[:alnum:]]+]] + + ! 
CHECK-NEXT: ^{{.*}}(%[[I_ARG]]: !fir.ref, %[[A_ARG]]: !fir.ref>): + + ! CHECK: %[[A_DEV_DECL:.*]]:2 = hlfir.declare %[[A_ARG]] + ! CHECK: omp.teams { + + ! CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"} + ! CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + + ! CHECK: %[[C1:.*]] = arith.constant 1 : i32 + ! CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index + ! CHECK: %[[C10:.*]] = arith.constant 10 : i32 + ! CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index + ! CHECK: %[[STEP:.*]] = arith.constant 1 : index + + ! CHECK-NEXT: omp.distribute { + ! CHECK-NEXT: omp.parallel { + + ! CHECK-NEXT: omp.wsloop { + + ! CHECK-NEXT: omp.loop_nest (%[[ARG0:.*]]) : index = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) { + ! CHECK-NEXT: %[[IV_IDX:.*]] = fir.convert %[[ARG0]] : (index) -> i32 + ! CHECK-NEXT: fir.store %[[IV_IDX]] to %[[BINDING]]#1 : !fir.ref + ! CHECK-NEXT: %[[IV_VAL1:.*]] = fir.load %[[BINDING]]#0 : !fir.ref + ! CHECK-NEXT: %[[IV_VAL2:.*]] = fir.load %[[BINDING]]#0 : !fir.ref + ! CHECK-NEXT: %[[IV_VAL_I64:.*]] = fir.convert %[[IV_VAL2]] : (i32) -> i64 + ! CHECK-NEXT: %[[ARR_ACCESS:.*]] = hlfir.designate %[[A_DEV_DECL]]#0 (%[[IV_VAL_I64]]) : (!fir.ref>, i64) -> !fir.ref + ! CHECK-NEXT: hlfir.assign %[[IV_VAL1]] to %[[ARR_ACCESS]] : i32, !fir.ref + ! CHECK-NEXT: omp.yield + ! CHECK-NEXT: } + + ! CHECK-NEXT: omp.terminator + ! CHECK-NEXT: } + ! CHECK-NEXT: omp.terminator + ! CHECK-NEXT: } + ! CHECK-NEXT: omp.terminator + ! CHECK-NEXT: } + ! CHECK-NEXT: omp.terminator + ! CHECK-NEXT: } + ! CHECK-NEXT: omp.terminator + ! CHECK-NEXT: } + do concurrent (i=1:10) + a(i) = i + end do + + ! CHECK-NOT: fir.do_loop +end program do_concurrent_basic diff --git a/flang/test/Transforms/DoConcurrent/basic.f90 b/flang/test/Transforms/DoConcurrent/basic_host.f90 similarity index 93% rename from flang/test/Transforms/DoConcurrent/basic.f90 rename to flang/test/Transforms/DoConcurrent/basic_host.f90 index 15faddb4f17fe1..d765149269d450 100644 --- a/flang/test/Transforms/DoConcurrent/basic.f90 +++ b/flang/test/Transforms/DoConcurrent/basic_host.f90 @@ -2,12 +2,13 @@ ! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=host %s -o - \ ! RUN: | FileCheck %s - +! RUN: bbc -emit-hlfir --fopenmp-do-concurrent-conversion="map-to=host" %s -o - \ +! RUN: | FileCheck %s + ! CHECK-LABEL: do_concurrent_basic program do_concurrent_basic ! CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) - ! CHECK: %[[C1:.*]] = arith.constant 1 : i32 - ! CHECK: %[[C10:.*]] = arith.constant 10 : i32 + implicit none integer :: a(10) integer :: i @@ -19,7 +20,9 @@ program do_concurrent_basic ! CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"} ! CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + ! CHECK: %[[C1:.*]] = arith.constant 1 : i32 ! CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index + ! CHECK: %[[C10:.*]] = arith.constant 10 : i32 ! CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index ! CHECK: %[[STEP:.*]] = arith.constant 1 : index @@ -39,7 +42,7 @@ program do_concurrent_basic ! CHECK-NEXT: omp.terminator ! 
CHECK-NEXT: } - do concurrent (integer :: i=1:10) + do concurrent (i=1:10) a(i) = i end do diff --git a/flang/test/Transforms/DoConcurrent/basic.mlir b/flang/test/Transforms/DoConcurrent/basic_host.mlir similarity index 97% rename from flang/test/Transforms/DoConcurrent/basic.mlir rename to flang/test/Transforms/DoConcurrent/basic_host.mlir index 764e62b647f913..7eb9d2d7da39fa 100644 --- a/flang/test/Transforms/DoConcurrent/basic.mlir +++ b/flang/test/Transforms/DoConcurrent/basic_host.mlir @@ -1,12 +1,10 @@ // Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`. -// RUN: fir-opt --fopenmp-do-concurrent-conversion %s | FileCheck %s +// RUN: fir-opt --fopenmp-do-concurrent-conversion="map-to=host" %s | FileCheck %s // CHECK-LABEL: func.func @do_concurrent_basic func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_basic"} { // CHECK: %[[ARR:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFEa"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) - // CHECK: %[[C1:.*]] = arith.constant 1 : i32 - // CHECK: %[[C10:.*]] = arith.constant 10 : i32 %0 = fir.alloca i32 {bindc_name = "i"} %1:2 = hlfir.declare %0 {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) @@ -27,7 +25,9 @@ func.func @do_concurrent_basic() attributes {fir.bindc_name = "do_concurrent_bas // CHECK-NEXT: %[[ITER_VAR:.*]] = fir.alloca i32 {bindc_name = "i"} // CHECK-NEXT: %[[BINDING:.*]]:2 = hlfir.declare %[[ITER_VAR]] {uniq_name = "_QFEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + // CHECK: %[[C1:.*]] = arith.constant 1 : i32 // CHECK: %[[LB:.*]] = fir.convert %[[C1]] : (i32) -> index + // CHECK: %[[C10:.*]] = arith.constant 10 : i32 // CHECK: %[[UB:.*]] = fir.convert %[[C10]] : (i32) -> index // CHECK: %[[STEP:.*]] = arith.constant 1 : index From 1c8ad61c030286d355af30a4ac9c173f6a1c3550 Mon Sep 17 00:00:00 2001 From: ergawy Date: Thu, 9 May 2024 06:16:03 -0500 Subject: [PATCH 2/8] use mlir::dyn_cast --- flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp index 6ef77dae7c4b7b..6ed24cc42c82bf 100644 --- a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp @@ -211,7 +211,7 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { mlir::Type liveInType = liveIn.getType(); mlir::Type eleType = liveInType; - if (auto refType = liveInType.dyn_cast()) + if (auto refType = mlir::dyn_cast(liveInType)) eleType = refType.getElementType(); llvm::omp::OpenMPOffloadMappingFlags mapFlag = From 1bd0d4870abc862f75794b4608a0b2b5855de376 Mon Sep 17 00:00:00 2001 From: ergawy Date: Tue, 14 May 2024 02:37:57 -0500 Subject: [PATCH 3/8] merge fixes --- flang/lib/Lower/OpenMP/Utils.cpp | 11 ++++++----- .../Optimizer/Transforms/DoConcurrentConversion.cpp | 1 + 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp index 8810bb8a126512..457bd25833d105 100644 --- a/flang/lib/Lower/OpenMP/Utils.cpp +++ b/flang/lib/Lower/OpenMP/Utils.cpp @@ -362,10 +362,11 @@ mlir::omp::MapInfoOp createMapInfoOp(mlir::OpBuilder &builder, mlir::Location loc, mlir::Value baseAddr, mlir::Value varPtrPtr, std::string name, llvm::ArrayRef bounds, - llvm::ArrayRef members, uint64_t mapType, + llvm::ArrayRef members, + mlir::DenseIntElementsAttr membersIndex, uint64_t mapType, 
mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy, - bool isVal) { - if (auto boxTy = mlir::dyn_cast(baseAddr.getType())) { + bool partialMap) { + if (auto boxTy = llvm::dyn_cast(baseAddr.getType())) { baseAddr = builder.create(loc, baseAddr); retTy = baseAddr.getType(); } @@ -374,10 +375,10 @@ createMapInfoOp(mlir::OpBuilder &builder, mlir::Location loc, llvm::cast(retTy).getElementType()); mlir::omp::MapInfoOp op = builder.create( - loc, retTy, baseAddr, varType, varPtrPtr, members, bounds, + loc, retTy, baseAddr, varType, varPtrPtr, members, membersIndex, bounds, builder.getIntegerAttr(builder.getIntegerType(64, false), mapType), builder.getAttr(mapCaptureType), - builder.getStringAttr(name)); + builder.getStringAttr(name), builder.getBoolAttr(partialMap)); return op; } diff --git a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp index 6ed24cc42c82bf..fcacf30f5c6d9d 100644 --- a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp @@ -232,6 +232,7 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { return Fortran::lower::omp::createMapInfoOp( rewriter, liveIn.getLoc(), declareOp.getBase(), /*varPtrPtr=*/{}, declareOp.getUniqName().str(), boundsOps, /*members=*/{}, + /*membersIndex=*/mlir::DenseIntElementsAttr{}, static_cast< std::underlying_type_t>( mapFlag), From b2a7d5130c9731325da368f7438d7f8f02d6f7bf Mon Sep 17 00:00:00 2001 From: ergawy Date: Tue, 14 May 2024 07:35:03 -0500 Subject: [PATCH 4/8] review comments --- flang/include/flang/Lower/OpenMP/Utils.h | 7 ++++ .../flang/Optimizer/Transforms/Passes.h | 1 - flang/lib/Optimizer/Transforms/CMakeLists.txt | 3 ++ .../Transforms/DoConcurrentConversion.cpp | 39 +++++++------------ 4 files changed, 24 insertions(+), 26 deletions(-) diff --git a/flang/include/flang/Lower/OpenMP/Utils.h b/flang/include/flang/Lower/OpenMP/Utils.h index 22b9b2083ccaba..520349bb4ba53b 100644 --- a/flang/include/flang/Lower/OpenMP/Utils.h +++ b/flang/include/flang/Lower/OpenMP/Utils.h @@ -102,6 +102,13 @@ void genObjectList(const ObjectList &objects, Fortran::lower::AbstractConverter &converter, llvm::SmallVectorImpl &operands); +// TODO: consider moving this to the `omp.loop_nest` op. 
Would be something like +// this: +// +// ``` +// mlir::Value LoopNestOp::calculateTripCount(mlir::OpBuilder &builder, +// mlir::OpBuilder::InsertPoint ip) +// ``` mlir::Value calculateTripCount(fir::FirOpBuilder &builder, mlir::Location loc, const mlir::omp::CollapseClauseOps &ops); } // namespace omp diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h index 7634acbe118ac4..7405796cb565cd 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.h +++ b/flang/include/flang/Optimizer/Transforms/Passes.h @@ -89,7 +89,6 @@ createFunctionAttrPass(FunctionAttrTypes &functionAttr, bool noInfsFPMath, bool noNaNsFPMath, bool approxFuncFPMath, bool noSignedZerosFPMath, bool unsafeFPMath); -std::unique_ptr createDoConcurrentConversionPass(); std::unique_ptr createDoConcurrentConversionPass(bool mapToDevice); void populateCfgConversionRewrites(mlir::RewritePatternSet &patterns, diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt index 7746c8f78bdb8a..a54775564cf049 100644 --- a/flang/lib/Optimizer/Transforms/CMakeLists.txt +++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt @@ -24,6 +24,9 @@ add_flang_library(FIRTransforms FunctionAttr.cpp DebugTypeGenerator.cpp DoConcurrentConversion.cpp + # TODO Find a cleaner solution for this. This is a workaround to expose + # `Utils.cpp` so that it be used the `DoConcurrentConversion` pass. We should + # probably split this is a shared lib. ../../Lower/OpenMP/Utils.cpp DEPENDS diff --git a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp index fcacf30f5c6d9d..ecb79568afe8b2 100644 --- a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp @@ -80,7 +80,7 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { assert(!liveIns.empty()); mlir::IRMapping mapper; - mlir::omp::TargetOp targetOp = nullptr; + mlir::omp::TargetOp targetOp; mlir::omp::LoopNestClauseOps loopNestClauseOps; if (mapToDevice) { @@ -121,7 +121,7 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { private: /// Collect the list of values used inside the loop but defined outside of it. - /// The first item in the retunred list is always the loop's induction + /// The first item in the returned list is always the loop's induction /// variable. 
void collectLoopLiveIns(fir::DoLoopOp doLoop, llvm::SmallVectorImpl &liveIns) const { @@ -246,15 +246,6 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { mlir::omp::TargetClauseOps &clauseOps) const { auto targetOp = rewriter.create(loc, clauseOps); - genBodyOfTargetOp(rewriter, targetOp, liveIns, clauseOps.mapVars, mapper); - return targetOp; - } - - void genBodyOfTargetOp(mlir::ConversionPatternRewriter &rewriter, - mlir::omp::TargetOp targetOp, - llvm::ArrayRef liveIns, - llvm::ArrayRef liveInMapInfoOps, - mlir::IRMapping &mapper) const { mlir::Region ®ion = targetOp.getRegion(); llvm::SmallVector liveInTypes; @@ -268,15 +259,16 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { rewriter.createBlock(®ion, {}, liveInTypes, liveInLocs); for (auto [arg, mapInfoOp] : - llvm::zip_equal(region.getArguments(), liveInMapInfoOps)) { + llvm::zip_equal(region.getArguments(), clauseOps.mapVars)) { auto miOp = mlir::cast(mapInfoOp.getDefiningOp()); hlfir::DeclareOp liveInDeclare = genLiveInDeclare(rewriter, arg, miOp); mapper.map(miOp.getVariableOperand(0), liveInDeclare.getBase()); } - auto terminator = - rewriter.create(targetOp.getLoc()); - rewriter.setInsertionPoint(terminator); + rewriter.setInsertionPoint( + rewriter.create(targetOp.getLoc())); + + return targetOp; } hlfir::DeclareOp @@ -329,9 +321,8 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { auto teamsOp = rewriter.create( loc, /*clauses=*/mlir::omp::TeamsClauseOps{}); - mlir::Block *teamsBlock = rewriter.createBlock(&teamsOp.getRegion()); - rewriter.create(loc); - rewriter.setInsertionPointToStart(teamsBlock); + rewriter.createBlock(&teamsOp.getRegion()); + rewriter.setInsertionPoint(rewriter.create(loc)); genInductionVariableAlloc(rewriter, liveIns, mapper); genLoopNestClauseOps(loc, rewriter, doLoop, mapper, loopNestClauseOps); @@ -388,9 +379,8 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { auto distOp = rewriter.create( loc, /*clauses=*/mlir::omp::DistributeClauseOps{}); - mlir::Block *distBlock = rewriter.createBlock(&distOp.getRegion()); - rewriter.create(loc); - rewriter.setInsertionPointToStart(distBlock); + rewriter.createBlock(&distOp.getRegion()); + rewriter.setInsertionPoint(rewriter.create(loc)); return distOp; } @@ -419,9 +409,8 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { mlir::IRMapping &mapper, mlir::omp::LoopNestClauseOps &loopNestClauseOps) const { auto parallelOp = rewriter.create(loc); - mlir::Block *parRegion = rewriter.createBlock(¶llelOp.getRegion()); - rewriter.create(loc); - rewriter.setInsertionPointToStart(parRegion); + rewriter.createBlock(¶llelOp.getRegion()); + rewriter.setInsertionPoint(rewriter.create(loc)); // If mapping to host, the local induction variable and loop bounds need to // be emitted as part of the `omp.parallel` op. @@ -446,7 +435,7 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { auto loopNestOp = rewriter.create(doLoop.getLoc(), clauseOps); - // Clone the loop's body inside the worksharing construct using the + // Clone the loop's body inside the loop nest construct using the // mapped values. 
rewriter.cloneRegionBefore(doLoop.getRegion(), loopNestOp.getRegion(), loopNestOp.getRegion().begin(), mapper); From 71a1fa041b3f8ce12ec49d4473f0de71d825ce5d Mon Sep 17 00:00:00 2001 From: ergawy Date: Tue, 14 May 2024 08:23:32 -0500 Subject: [PATCH 5/8] update cli options --- .../include/flang/Frontend/CodeGenOptions.def | 2 +- flang/include/flang/Frontend/CodeGenOptions.h | 5 ++-- flang/include/flang/Tools/CLOptions.inc | 7 +++-- flang/lib/Frontend/CompilerInvocation.cpp | 5 ++-- flang/lib/Frontend/FrontendActions.cpp | 26 +++++++++---------- .../Transforms/DoConcurrent/basic_device.f90 | 9 ++++--- .../Transforms/DoConcurrent/basic_host.f90 | 4 +-- flang/tools/bbc/bbc.cpp | 8 +++++- 8 files changed, 38 insertions(+), 28 deletions(-) diff --git a/flang/include/flang/Frontend/CodeGenOptions.def b/flang/include/flang/Frontend/CodeGenOptions.def index a6128130baadc3..95d6929a9bc721 100644 --- a/flang/include/flang/Frontend/CodeGenOptions.def +++ b/flang/include/flang/Frontend/CodeGenOptions.def @@ -40,7 +40,7 @@ ENUM_CODEGENOPT(DebugInfo, llvm::codegenoptions::DebugInfoKind, 4, llvm::codeg ENUM_CODEGENOPT(VecLib, llvm::driver::VectorLibrary, 3, llvm::driver::VectorLibrary::NoLibrary) ///< Vector functions library to use ENUM_CODEGENOPT(FramePointer, llvm::FramePointerKind, 2, llvm::FramePointerKind::None) ///< Enable the usage of frame pointers -ENUM_CODEGENOPT(DoConcurrentMapping, DoConcurrentMappingKind, 2, DoConcurrentMappingKind::DCMK_None) ///< Map `do concurrent` to OpenMP +ENUM_CODEGENOPT(DoConcurrentMapping, DoConcurrentMappingKind, 2, DoConcurrentMappingKind::DCMK_Disable) ///< Map `do concurrent` to OpenMP #undef CODEGENOPT #undef ENUM_CODEGENOPT diff --git a/flang/include/flang/Frontend/CodeGenOptions.h b/flang/include/flang/Frontend/CodeGenOptions.h index f6cc9af04d98b9..f70f17f7775c33 100644 --- a/flang/include/flang/Frontend/CodeGenOptions.h +++ b/flang/include/flang/Frontend/CodeGenOptions.h @@ -132,9 +132,8 @@ class CodeGenOptions : public CodeGenOptionsBase { /// Optionally map `do concurrent` loops to OpenMP. This is only valid of /// OpenMP is enabled. enum class DoConcurrentMappingKind { - DCMK_None, // Do not lower `do concurrent` to OpenMP. - DCMK_Host, // Lower to run in parallel on the CPU. - DCMK_Device // Lower to run in parallel on the GPU. + DCMK_Enable, // Do not lower `do concurrent` to OpenMP. + DCMK_Disable, // Lower to run in parallel on the CPU or the GPU. }; // Define accessors/mutators for code generation options of enumeration type. diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc index cc3431d5b71d29..9a5eb4cda3c023 100644 --- a/flang/include/flang/Tools/CLOptions.inc +++ b/flang/include/flang/Tools/CLOptions.inc @@ -333,8 +333,11 @@ inline void createHLFIRToFIRPassPipeline( /// \param pm - MLIR pass manager that will hold the pipeline definition. /// \param isTargetDevice - Whether code is being generated for a target device /// rather than the host device. 
-inline void createOpenMPFIRPassPipeline( - mlir::PassManager &pm, bool isTargetDevice) { +inline void createOpenMPFIRPassPipeline(mlir::PassManager &pm, + bool isTargetDevice, bool enableDoConcurrentConversion) { + if (enableDoConcurrentConversion) + pm.addPass(fir::createDoConcurrentConversionPass(isTargetDevice)); + pm.addPass(fir::createOMPMapInfoFinalizationPass()); pm.addPass(fir::createOMPMarkDeclareTargetPass()); if (isTargetDevice) diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 2af48e97eef515..bba3e8f426c0ba 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -167,9 +167,8 @@ static bool parseDoConcurrentMapping(Fortran::frontend::CodeGenOptions &opts, std::optional val = llvm::StringSwitch>( arg->getValue()) - .Case("none", DoConcurrentMappingKind::DCMK_None) - .Case("host", DoConcurrentMappingKind::DCMK_Host) - .Case("device", DoConcurrentMappingKind::DCMK_Device) + .Case("disable", DoConcurrentMappingKind::DCMK_Disable) + .Case("enable", DoConcurrentMappingKind::DCMK_Enable) .Default(std::nullopt); if (!val.has_value()) { diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index c38c3606d21f9b..f164f126bd0f2e 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -326,19 +326,16 @@ bool CodeGenAction::beginSourceFileAction() { using DoConcurrentMappingKind = Fortran::frontend::CodeGenOptions::DoConcurrentMappingKind; - DoConcurrentMappingKind selectedKind = + DoConcurrentMappingKind doConcurrentMappingKind = ci.getInvocation().getCodeGenOpts().getDoConcurrentMapping(); - if (selectedKind != DoConcurrentMappingKind::DCMK_None) { - if (!isOpenMPEnabled) { - unsigned diagID = ci.getDiagnostics().getCustomDiagID( - clang::DiagnosticsEngine::Warning, - "lowering `do concurrent` loops to OpenMP is only supported if " - "OpenMP is enabled"); - ci.getDiagnostics().Report(diagID); - } else { - bool mapToDevice = selectedKind == DoConcurrentMappingKind::DCMK_Device; - pm.addPass(fir::createDoConcurrentConversionPass(mapToDevice)); - } + + if (doConcurrentMappingKind != DoConcurrentMappingKind::DCMK_Disable && + !isOpenMPEnabled) { + unsigned diagID = ci.getDiagnostics().getCustomDiagID( + clang::DiagnosticsEngine::Warning, + "lowering `do concurrent` loops to OpenMP is only supported if " + "OpenMP is enabled"); + ci.getDiagnostics().Report(diagID); } if (isOpenMPEnabled) { @@ -346,10 +343,13 @@ bool CodeGenAction::beginSourceFileAction() { if (auto offloadMod = llvm::dyn_cast( mlirModule->getOperation())) isDevice = offloadMod.getIsTargetDevice(); + // WARNING: This pipeline must be run immediately after the lowering to // ensure that the FIR is correct with respect to OpenMP operations/ // attributes. - fir::createOpenMPFIRPassPipeline(pm, isDevice); + fir::createOpenMPFIRPassPipeline(pm, isDevice, + doConcurrentMappingKind == + DoConcurrentMappingKind::DCMK_Enable); } pm.enableVerifier(/*verifyPasses=*/true); diff --git a/flang/test/Transforms/DoConcurrent/basic_device.f90 b/flang/test/Transforms/DoConcurrent/basic_device.f90 index d9ce40697ff086..3bcf0f705f6f2a 100644 --- a/flang/test/Transforms/DoConcurrent/basic_device.f90 +++ b/flang/test/Transforms/DoConcurrent/basic_device.f90 @@ -1,10 +1,13 @@ ! Tests mapping of a basic `do concurrent` loop to ! `!$omp target teams distribute parallel do`. -! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=device %s -o - \ +! 
RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-is-target-device \ +! RUN: -fdo-concurrent-parallel=enable %s -o - \ +! RUN: | FileCheck %s + +! RUN: bbc -emit-hlfir -fopenmp -fopenmp-is-target-device \ +! RUN: -fopenmp-do-concurrent-parallel %s -o - \ ! RUN: | FileCheck %s -! xUN: bbc -emit-hlfir --fopenmp-do-concurrent-conversion="map-to=device" %s -o - \ -! xUN: | FileCheck %s ! CHECK-LABEL: do_concurrent_basic program do_concurrent_basic diff --git a/flang/test/Transforms/DoConcurrent/basic_host.f90 b/flang/test/Transforms/DoConcurrent/basic_host.f90 index d765149269d450..772f4c106edafa 100644 --- a/flang/test/Transforms/DoConcurrent/basic_host.f90 +++ b/flang/test/Transforms/DoConcurrent/basic_host.f90 @@ -1,8 +1,8 @@ ! Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`. -! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=host %s -o - \ +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=enable %s -o - \ ! RUN: | FileCheck %s -! RUN: bbc -emit-hlfir --fopenmp-do-concurrent-conversion="map-to=host" %s -o - \ +! RUN: bbc -emit-hlfir -fopenmp -fopenmp-do-concurrent-parallel %s -o - \ ! RUN: | FileCheck %s ! CHECK-LABEL: do_concurrent_basic diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp index ee2ff8562e9ff2..e570ebbcc3cfa5 100644 --- a/flang/tools/bbc/bbc.cpp +++ b/flang/tools/bbc/bbc.cpp @@ -139,6 +139,11 @@ static llvm::cl::opt llvm::cl::desc("enable openmp device compilation"), llvm::cl::init(false)); +static llvm::cl::opt + enableDoConcurrentToOpenMPConversion("fopenmp-do-concurrent-parallel", + llvm::cl::desc("xxxx"), + llvm::cl::init(false)); + static llvm::cl::opt enableOpenMPGPU("fopenmp-is-gpu", llvm::cl::desc("enable openmp GPU target codegen"), @@ -258,7 +263,8 @@ createTargetMachine(llvm::StringRef targetTriple, std::string &error) { static mlir::LogicalResult runOpenMPPasses(mlir::ModuleOp mlirModule) { mlir::PassManager pm(mlirModule->getName(), mlir::OpPassManager::Nesting::Implicit); - fir::createOpenMPFIRPassPipeline(pm, enableOpenMPDevice); + fir::createOpenMPFIRPassPipeline(pm, enableOpenMPDevice, + enableDoConcurrentToOpenMPConversion); (void)mlir::applyPassManagerCLOptions(pm); if (mlir::failed(pm.run(mlirModule))) { llvm::errs() << "FATAL: failed to correctly apply OpenMP pass pipeline"; From 2ed604009c70aa718c250294553c2e1d326dad36 Mon Sep 17 00:00:00 2001 From: ergawy Date: Tue, 14 May 2024 08:26:58 -0500 Subject: [PATCH 6/8] update desc --- flang/tools/bbc/bbc.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp index e570ebbcc3cfa5..c451799ccc7b09 100644 --- a/flang/tools/bbc/bbc.cpp +++ b/flang/tools/bbc/bbc.cpp @@ -139,10 +139,11 @@ static llvm::cl::opt llvm::cl::desc("enable openmp device compilation"), llvm::cl::init(false)); -static llvm::cl::opt - enableDoConcurrentToOpenMPConversion("fopenmp-do-concurrent-parallel", - llvm::cl::desc("xxxx"), - llvm::cl::init(false)); +static llvm::cl::opt enableDoConcurrentToOpenMPConversion( + "fopenmp-do-concurrent-parallel", + llvm::cl::desc( + "Try to map `do concurrent` loops to OpenMP (on host or device)"), + llvm::cl::init(false)); static llvm::cl::opt enableOpenMPGPU("fopenmp-is-gpu", From 476fffe4d372845acf7c5f77aff1a571d4d333fe Mon Sep 17 00:00:00 2001 From: ergawy Date: Wed, 15 May 2024 07:35:04 -0500 Subject: [PATCH 7/8] more cli changes --- .../include/flang/Frontend/CodeGenOptions.def | 2 +- flang/include/flang/Frontend/CodeGenOptions.h | 5 +++-- 
flang/include/flang/Tools/CLOptions.inc | 10 +++++++--- flang/lib/Frontend/CompilerInvocation.cpp | 5 +++-- flang/lib/Frontend/FrontendActions.cpp | 6 ++---- .../Transforms/DoConcurrent/basic_device.f90 | 7 ++----- .../Transforms/DoConcurrent/basic_host.f90 | 4 ++-- flang/tools/bbc/bbc.cpp | 20 ++++++++++++++----- 8 files changed, 35 insertions(+), 24 deletions(-) diff --git a/flang/include/flang/Frontend/CodeGenOptions.def b/flang/include/flang/Frontend/CodeGenOptions.def index 95d6929a9bc721..a6128130baadc3 100644 --- a/flang/include/flang/Frontend/CodeGenOptions.def +++ b/flang/include/flang/Frontend/CodeGenOptions.def @@ -40,7 +40,7 @@ ENUM_CODEGENOPT(DebugInfo, llvm::codegenoptions::DebugInfoKind, 4, llvm::codeg ENUM_CODEGENOPT(VecLib, llvm::driver::VectorLibrary, 3, llvm::driver::VectorLibrary::NoLibrary) ///< Vector functions library to use ENUM_CODEGENOPT(FramePointer, llvm::FramePointerKind, 2, llvm::FramePointerKind::None) ///< Enable the usage of frame pointers -ENUM_CODEGENOPT(DoConcurrentMapping, DoConcurrentMappingKind, 2, DoConcurrentMappingKind::DCMK_Disable) ///< Map `do concurrent` to OpenMP +ENUM_CODEGENOPT(DoConcurrentMapping, DoConcurrentMappingKind, 2, DoConcurrentMappingKind::DCMK_None) ///< Map `do concurrent` to OpenMP #undef CODEGENOPT #undef ENUM_CODEGENOPT diff --git a/flang/include/flang/Frontend/CodeGenOptions.h b/flang/include/flang/Frontend/CodeGenOptions.h index f70f17f7775c33..f6cc9af04d98b9 100644 --- a/flang/include/flang/Frontend/CodeGenOptions.h +++ b/flang/include/flang/Frontend/CodeGenOptions.h @@ -132,8 +132,9 @@ class CodeGenOptions : public CodeGenOptionsBase { /// Optionally map `do concurrent` loops to OpenMP. This is only valid of /// OpenMP is enabled. enum class DoConcurrentMappingKind { - DCMK_Enable, // Do not lower `do concurrent` to OpenMP. - DCMK_Disable, // Lower to run in parallel on the CPU or the GPU. + DCMK_None, // Do not lower `do concurrent` to OpenMP. + DCMK_Host, // Lower to run in parallel on the CPU. + DCMK_Device // Lower to run in parallel on the GPU. }; // Define accessors/mutators for code generation options of enumeration type. diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc index 9a5eb4cda3c023..876ccb2d4a2a4a 100644 --- a/flang/include/flang/Tools/CLOptions.inc +++ b/flang/include/flang/Tools/CLOptions.inc @@ -324,6 +324,9 @@ inline void createHLFIRToFIRPassPipeline( pm.addPass(hlfir::createConvertHLFIRtoFIRPass()); } +using DoConcurrentMappingKind = + Fortran::frontend::CodeGenOptions::DoConcurrentMappingKind; + /// Create a pass pipeline for handling certain OpenMP transformations needed /// prior to FIR lowering. /// @@ -334,9 +337,10 @@ inline void createHLFIRToFIRPassPipeline( /// \param isTargetDevice - Whether code is being generated for a target device /// rather than the host device. 
inline void createOpenMPFIRPassPipeline(mlir::PassManager &pm, - bool isTargetDevice, bool enableDoConcurrentConversion) { - if (enableDoConcurrentConversion) - pm.addPass(fir::createDoConcurrentConversionPass(isTargetDevice)); + bool isTargetDevice, DoConcurrentMappingKind doConcurrentMappingKind) { + if (doConcurrentMappingKind != DoConcurrentMappingKind::DCMK_None) + pm.addPass(fir::createDoConcurrentConversionPass( + doConcurrentMappingKind == DoConcurrentMappingKind::DCMK_Device)); pm.addPass(fir::createOMPMapInfoFinalizationPass()); pm.addPass(fir::createOMPMarkDeclareTargetPass()); diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index bba3e8f426c0ba..2af48e97eef515 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -167,8 +167,9 @@ static bool parseDoConcurrentMapping(Fortran::frontend::CodeGenOptions &opts, std::optional val = llvm::StringSwitch>( arg->getValue()) - .Case("disable", DoConcurrentMappingKind::DCMK_Disable) - .Case("enable", DoConcurrentMappingKind::DCMK_Enable) + .Case("none", DoConcurrentMappingKind::DCMK_None) + .Case("host", DoConcurrentMappingKind::DCMK_Host) + .Case("device", DoConcurrentMappingKind::DCMK_Device) .Default(std::nullopt); if (!val.has_value()) { diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index f164f126bd0f2e..79a6f1101b1685 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -329,7 +329,7 @@ bool CodeGenAction::beginSourceFileAction() { DoConcurrentMappingKind doConcurrentMappingKind = ci.getInvocation().getCodeGenOpts().getDoConcurrentMapping(); - if (doConcurrentMappingKind != DoConcurrentMappingKind::DCMK_Disable && + if (doConcurrentMappingKind != DoConcurrentMappingKind::DCMK_None && !isOpenMPEnabled) { unsigned diagID = ci.getDiagnostics().getCustomDiagID( clang::DiagnosticsEngine::Warning, @@ -347,9 +347,7 @@ bool CodeGenAction::beginSourceFileAction() { // WARNING: This pipeline must be run immediately after the lowering to // ensure that the FIR is correct with respect to OpenMP operations/ // attributes. - fir::createOpenMPFIRPassPipeline(pm, isDevice, - doConcurrentMappingKind == - DoConcurrentMappingKind::DCMK_Enable); + fir::createOpenMPFIRPassPipeline(pm, isDevice, doConcurrentMappingKind); } pm.enableVerifier(/*verifyPasses=*/true); diff --git a/flang/test/Transforms/DoConcurrent/basic_device.f90 b/flang/test/Transforms/DoConcurrent/basic_device.f90 index 3bcf0f705f6f2a..f059a7fb1c0aae 100644 --- a/flang/test/Transforms/DoConcurrent/basic_device.f90 +++ b/flang/test/Transforms/DoConcurrent/basic_device.f90 @@ -1,12 +1,9 @@ ! Tests mapping of a basic `do concurrent` loop to ! `!$omp target teams distribute parallel do`. -! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-is-target-device \ -! RUN: -fdo-concurrent-parallel=enable %s -o - \ +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=device %s -o - \ ! RUN: | FileCheck %s - -! RUN: bbc -emit-hlfir -fopenmp -fopenmp-is-target-device \ -! RUN: -fopenmp-do-concurrent-parallel %s -o - \ +! RUN: bbc -emit-hlfir -fopenmp -fdo-concurrent-parallel=device %s -o - \ ! RUN: | FileCheck %s ! 
CHECK-LABEL: do_concurrent_basic diff --git a/flang/test/Transforms/DoConcurrent/basic_host.f90 b/flang/test/Transforms/DoConcurrent/basic_host.f90 index 772f4c106edafa..62a9306b21cccd 100644 --- a/flang/test/Transforms/DoConcurrent/basic_host.f90 +++ b/flang/test/Transforms/DoConcurrent/basic_host.f90 @@ -1,8 +1,8 @@ ! Tests mapping of a basic `do concurrent` loop to `!$omp parallel do`. -! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=enable %s -o - \ +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=host %s -o - \ ! RUN: | FileCheck %s -! RUN: bbc -emit-hlfir -fopenmp -fopenmp-do-concurrent-parallel %s -o - \ +! RUN: bbc -emit-hlfir -fopenmp -fdo-concurrent-parallel=host %s -o - \ ! RUN: | FileCheck %s ! CHECK-LABEL: do_concurrent_basic diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp index c451799ccc7b09..c254cd6a15b605 100644 --- a/flang/tools/bbc/bbc.cpp +++ b/flang/tools/bbc/bbc.cpp @@ -139,11 +139,11 @@ static llvm::cl::opt llvm::cl::desc("enable openmp device compilation"), llvm::cl::init(false)); -static llvm::cl::opt enableDoConcurrentToOpenMPConversion( - "fopenmp-do-concurrent-parallel", +static llvm::cl::opt enableDoConcurrentToOpenMPConversion( + "fdo-concurrent-parallel", llvm::cl::desc( - "Try to map `do concurrent` loops to OpenMP (on host or device)"), - llvm::cl::init(false)); + "Try to map `do concurrent` loops to OpenMP [none|host|device]"), + llvm::cl::init("none")); static llvm::cl::opt enableOpenMPGPU("fopenmp-is-gpu", @@ -264,8 +264,18 @@ createTargetMachine(llvm::StringRef targetTriple, std::string &error) { static mlir::LogicalResult runOpenMPPasses(mlir::ModuleOp mlirModule) { mlir::PassManager pm(mlirModule->getName(), mlir::OpPassManager::Nesting::Implicit); +using DoConcurrentMappingKind = + Fortran::frontend::CodeGenOptions::DoConcurrentMappingKind; + + auto doConcurrentMappingKind = + llvm::StringSwitch( + enableDoConcurrentToOpenMPConversion) + .Case("host", DoConcurrentMappingKind::DCMK_Host) + .Case("device", DoConcurrentMappingKind::DCMK_Device) + .Default(DoConcurrentMappingKind::DCMK_None); + fir::createOpenMPFIRPassPipeline(pm, enableOpenMPDevice, - enableDoConcurrentToOpenMPConversion); + doConcurrentMappingKind); (void)mlir::applyPassManagerCLOptions(pm); if (mlir::failed(pm.run(mlirModule))) { llvm::errs() << "FATAL: failed to correctly apply OpenMP pass pipeline"; From 3bb11521ba4a64a981ff441f8093e3cb38db8bd0 Mon Sep 17 00:00:00 2001 From: ergawy Date: Wed, 15 May 2024 07:38:26 -0500 Subject: [PATCH 8/8] format --- flang/tools/bbc/bbc.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp index c254cd6a15b605..2b8b00570910d6 100644 --- a/flang/tools/bbc/bbc.cpp +++ b/flang/tools/bbc/bbc.cpp @@ -264,8 +264,8 @@ createTargetMachine(llvm::StringRef targetTriple, std::string &error) { static mlir::LogicalResult runOpenMPPasses(mlir::ModuleOp mlirModule) { mlir::PassManager pm(mlirModule->getName(), mlir::OpPassManager::Nesting::Implicit); -using DoConcurrentMappingKind = - Fortran::frontend::CodeGenOptions::DoConcurrentMappingKind; + using DoConcurrentMappingKind = + Fortran::frontend::CodeGenOptions::DoConcurrentMappingKind; auto doConcurrentMappingKind = llvm::StringSwitch(