From 93c1ec3deec7ccf67bb89c1fa8a3fd85d446ded6 Mon Sep 17 00:00:00 2001 From: ergawy Date: Tue, 20 Aug 2024 02:56:12 -0500 Subject: [PATCH] [OpenMP][flang][do-conc] Map values depending on values defined outside target region. Adds support for `do concurrent` mapping when mapped value(s) depend on values defined outside the target region; e.g. the size of the array is dynamic. This needs to be handled by localizing these region outsiders by either cloning them in the region or in case we cannot do that, map them and use the mapped values. --- flang/lib/Lower/OpenMP/OpenMP.cpp | 2 + .../Transforms/DoConcurrentConversion.cpp | 89 +++++++++++++++++-- .../Transforms/DoConcurrent/basic_device.f90 | 4 +- .../DoConcurrent/runtime_sized_array.f90 | 42 +++++++++ 4 files changed, 129 insertions(+), 8 deletions(-) create mode 100644 flang/test/Transforms/DoConcurrent/runtime_sized_array.f90 diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index b6a63f802647f5..66e2508bcca500 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1320,9 +1320,11 @@ static void genBodyOfTargetOp( while (!valuesDefinedAbove.empty()) { for (mlir::Value val : valuesDefinedAbove) { mlir::Operation *valOp = val.getDefiningOp(); + assert(valOp != nullptr); if (mlir::isMemoryEffectFree(valOp)) { mlir::Operation *clonedOp = valOp->clone(); regionBlock->push_front(clonedOp); + assert(clonedOp->getNumResults() == 1); val.replaceUsesWithIf( clonedOp->getResult(0), [regionBlock](mlir::OpOperand &use) { return use.getOwner()->getBlock() == regionBlock; diff --git a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp index 7fe27c947e549d..b244173c8aaeee 100644 --- a/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/Transforms/DoConcurrentConversion.cpp @@ -147,6 +147,68 @@ mlir::Value calculateTripCount(fir::FirOpBuilder &builder, mlir::Location loc, return tripCount; } + +/// Check if cloning the bounds introduced any dependency on the outer region. +/// If so, then either clone them as well if they are MemoryEffectFree, or else +/// copy them to a new temporary and add them to the map and block_argument +/// lists and replace their uses with the new temporary. +/// +/// TODO: similar to the above functions, this is copied from OpenMP lowering +/// (in this case, from `genBodyOfTargetOp`). Once we move to a common lib for +/// these utils this will move as well. +void cloneOrMapRegionOutsiders(fir::FirOpBuilder &builder, + mlir::omp::TargetOp targetOp) { + mlir::Region &targetRegion = targetOp.getRegion(); + mlir::Block *targetEntryBlock = &targetRegion.getBlocks().front(); + llvm::SetVector valuesDefinedAbove; + mlir::getUsedValuesDefinedAbove(targetRegion, valuesDefinedAbove); + + while (!valuesDefinedAbove.empty()) { + for (mlir::Value val : valuesDefinedAbove) { + mlir::Operation *valOp = val.getDefiningOp(); + assert(valOp != nullptr); + if (mlir::isMemoryEffectFree(valOp)) { + mlir::Operation *clonedOp = valOp->clone(); + targetEntryBlock->push_front(clonedOp); + assert(clonedOp->getNumResults() == 1); + val.replaceUsesWithIf( + clonedOp->getResult(0), [targetEntryBlock](mlir::OpOperand &use) { + return use.getOwner()->getBlock() == targetEntryBlock; + }); + } else { + mlir::OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointAfter(valOp); + auto copyVal = builder.createTemporary(val.getLoc(), val.getType()); + builder.createStoreWithConvert(copyVal.getLoc(), val, copyVal); + + llvm::SmallVector bounds; + std::stringstream name; + builder.setInsertionPoint(targetOp); + mlir::Value mapOp = createMapInfoOp( + builder, copyVal.getLoc(), copyVal, + /*varPtrPtr=*/mlir::Value{}, name.str(), bounds, + /*members=*/llvm::SmallVector{}, + /*membersIndex=*/mlir::DenseIntElementsAttr{}, + static_cast< + std::underlying_type_t>( + llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT), + mlir::omp::VariableCaptureKind::ByCopy, copyVal.getType()); + targetOp.getMapVarsMutable().append(mapOp); + mlir::Value clonedValArg = + targetRegion.addArgument(copyVal.getType(), copyVal.getLoc()); + builder.setInsertionPointToStart(targetEntryBlock); + auto loadOp = + builder.create(clonedValArg.getLoc(), clonedValArg); + val.replaceUsesWithIf( + loadOp->getResult(0), [targetEntryBlock](mlir::OpOperand &use) { + return use.getOwner()->getBlock() == targetEntryBlock; + }); + } + } + valuesDefinedAbove.clear(); + mlir::getUsedValuesDefinedAbove(targetRegion, valuesDefinedAbove); + } +} } // namespace internal } // namespace omp } // namespace lower @@ -720,14 +782,19 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { llvm::SmallVector boundsOps; genBoundsOps(rewriter, liveIn.getLoc(), declareOp, boundsOps); + // Use the raw address to avoid unboxing `fir.box` values whenever possible. + // Put differently, if we have access to the direct value memory + // reference/address, we use it. + mlir::Value rawAddr = declareOp.getOriginalBase(); return Fortran::lower::omp ::internal::createMapInfoOp( - rewriter, liveIn.getLoc(), declareOp.getBase(), /*varPtrPtr=*/{}, - declareOp.getUniqName().str(), boundsOps, /*members=*/{}, + rewriter, liveIn.getLoc(), rawAddr, + /*varPtrPtr=*/{}, declareOp.getUniqName().str(), boundsOps, + /*members=*/{}, /*membersIndex=*/mlir::DenseIntElementsAttr{}, static_cast< std::underlying_type_t>( mapFlag), - captureKind, liveInType); + captureKind, rawAddr.getType()); } mlir::omp::TargetOp genTargetOp(mlir::Location loc, @@ -754,14 +821,24 @@ class DoConcurrentConversion : public mlir::OpConversionPattern { auto miOp = mlir::cast(mapInfoOp.getDefiningOp()); hlfir::DeclareOp liveInDeclare = genLiveInDeclare(rewriter, arg, miOp); mlir::Value miOperand = miOp.getVariableOperand(0); - mapper.map(miOperand, liveInDeclare.getBase()); + + // TODO If `miOperand.getDefiningOp()` is a `fir::BoxAddrOp`, we probably + // need to "unpack" the box by getting the defining op of it's value. + // However, we did not hit this case in reality yet so leaving it as a + // todo for now. + + mapper.map(miOperand, liveInDeclare.getOriginalBase()); if (auto origDeclareOp = mlir::dyn_cast_if_present( miOperand.getDefiningOp())) - mapper.map(origDeclareOp.getOriginalBase(), - liveInDeclare.getOriginalBase()); + mapper.map(origDeclareOp.getBase(), liveInDeclare.getBase()); } + fir::FirOpBuilder firBuilder( + rewriter, + fir::getKindMapping(targetOp->getParentOfType())); + Fortran::lower::omp::internal::cloneOrMapRegionOutsiders(firBuilder, + targetOp); rewriter.setInsertionPoint( rewriter.create(targetOp.getLoc())); diff --git a/flang/test/Transforms/DoConcurrent/basic_device.f90 b/flang/test/Transforms/DoConcurrent/basic_device.f90 index 7873fa4f88db6d..2f762c003ddf16 100644 --- a/flang/test/Transforms/DoConcurrent/basic_device.f90 +++ b/flang/test/Transforms/DoConcurrent/basic_device.f90 @@ -21,7 +21,7 @@ program do_concurrent_basic ! CHECK-NOT: fir.do_loop - ! CHECK-DAG: %[[I_MAP_INFO:.*]] = omp.map.info var_ptr(%[[I_ORIG_DECL]]#0 + ! CHECK-DAG: %[[I_MAP_INFO:.*]] = omp.map.info var_ptr(%[[I_ORIG_DECL]]#1 ! CHECK: %[[C0:.*]] = arith.constant 0 : index ! CHECK: %[[UPPER_BOUND:.*]] = arith.subi %[[A_EXTENT]], %[[C0]] : index @@ -29,7 +29,7 @@ program do_concurrent_basic ! CHECK-SAME: upper_bound(%[[UPPER_BOUND]] : index) ! CHECK-SAME: extent(%[[A_EXTENT]] : index) - ! CHECK-DAG: %[[A_MAP_INFO:.*]] = omp.map.info var_ptr(%[[A_ORIG_DECL]]#0 : {{[^(]+}}) + ! CHECK-DAG: %[[A_MAP_INFO:.*]] = omp.map.info var_ptr(%[[A_ORIG_DECL]]#1 : {{[^(]+}}) ! CHECK-SAME: map_clauses(implicit, tofrom) capture(ByRef) bounds(%[[A_BOUNDS]]) ! CHECK: %[[TRIP_COUNT:.*]] = arith.muli %{{.*}}, %{{.*}} : i64 diff --git a/flang/test/Transforms/DoConcurrent/runtime_sized_array.f90 b/flang/test/Transforms/DoConcurrent/runtime_sized_array.f90 new file mode 100644 index 00000000000000..5420ff4586be60 --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/runtime_sized_array.f90 @@ -0,0 +1,42 @@ +! Tests `do concurrent` mapping when mapped value(s) depend on values defined +! outside the target region; e.g. the size of the array is dynamic. This needs +! to be handled by localizing these region outsiders by either cloning them in +! the region or in case we cannot do that, map them and use the mapped values. + +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fdo-concurrent-parallel=device %s -o - \ +! RUN: | FileCheck %s + +subroutine foo(n) + implicit none + integer :: n + integer :: i + integer, dimension(n) :: a + + do concurrent(i=1:10) + a(i) = i + end do +end subroutine + +! CHECK-DAG: %[[I_DECL:.*]]:2 = hlfir.declare %{{.*}} {uniq_name = "_QFfooEi"} +! CHECK-DAG: %[[A_DECL:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFfooEa"} +! CHECK-DAG: %[[N_ALLOC:.*]] = fir.alloca i32 + +! CHECK-DAG: %[[I_MAP:.*]] = omp.map.info var_ptr(%[[I_DECL]]#1 : {{.*}}) +! CHECK-DAG: %[[A_MAP:.*]] = omp.map.info var_ptr(%[[A_DECL]]#1 : {{.*}}) +! CHECK-DAG: %[[N_MAP:.*]] = omp.map.info var_ptr(%[[N_ALLOC]] : {{.*}}) + +! CHECK: omp.target +! CHECK-SAME: map_entries(%[[I_MAP]] -> %[[I_ARG:arg[0-9]*]], +! CHECK-SAME: %[[A_MAP]] -> %[[A_ARG:arg[0-9]*]], +! CHECK-SAME: %[[N_MAP]] -> %[[N_ARG:arg[0-9]*]] : {{.*}}) +! CHECK-SAME: {{.*}} { + +! CHECK-DAG: %{{.*}} = hlfir.declare %[[I_ARG]] +! CHECK-DAG: %{{.*}} = hlfir.declare %[[A_ARG]] +! CHECK-DAG: %{{.*}} = fir.load %[[N_ARG]] + +! CHECK: omp.terminator +! CHECK: } + + +