[MLIR][OpenMP] Skip host omp ops when compiling for the target device

This patch separates the lowering dispatch for host and target devices. For the target device, if the current operation is not a top-level operation (omp.target) or is inside a target device code region it will be ignored, since it belongs to the host code. This is a copy of PR85239 upstream.
ROCm · Mar 19, 2024 · e11c806 · e11c806
1 parent 6f86d2c
commit e11c806
Show file tree

Hide file tree

Showing 7 changed files with 245 additions and 121 deletions.
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -3193,6 +3193,164 @@ convertDeclareTargetAttr(Operation *op, mlir::omp::DeclareTargetAttr attribute,
  return success();
 }
 
+static bool isInternalTargetDeviceOp(Operation *op) {
+ // Assumes no reverse offloading
+ if (op->getParentOfType<omp::TargetOp>())
+ return true;
+
+ auto parentFn = op->getParentOfType<LLVM::LLVMFuncOp>();
+ if (auto declareTargetIface =
+ llvm::dyn_cast<mlir::omp::DeclareTargetInterface>(
+ parentFn.getOperation()))
+ if (declareTargetIface.isDeclareTarget() &&
+ declareTargetIface.getDeclareTargetDeviceType() !=
+ mlir::omp::DeclareTargetDeviceType::host)
+ return true;
+
+ return false;
+}
+
+/// Given an OpenMP MLIR operation, create the corresponding LLVM IR
+/// (including OpenMP runtime calls).
+static LogicalResult
+convertCommonOperation(Operation *op, llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation) {
+
+ llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
+ return llvm::TypeSwitch<Operation *, LogicalResult>(op)
+ .Case([&](omp::BarrierOp) {
+ ompBuilder->createBarrier(builder.saveIP(), llvm::omp::OMPD_barrier);
+ return success();
+ })
+ .Case([&](omp::TaskwaitOp) {
+ ompBuilder->createTaskwait(builder.saveIP());
+ return success();
+ })
+ .Case([&](omp::TaskyieldOp) {
+ ompBuilder->createTaskyield(builder.saveIP());
+ return success();
+ })
+ .Case([&](omp::FlushOp) {
+ // No support in Openmp runtime function (__kmpc_flush) to accept
+ // the argument list.
+ // OpenMP standard states the following:
+ // "An implementation may implement a flush with a list by ignoring
+ // the list, and treating it the same as a flush without a list."
+ //
+ // The argument list is discarded so that, flush with a list is treated
+ // same as a flush without a list.
+ ompBuilder->createFlush(builder.saveIP());
+ return success();
+ })
+ .Case([&](omp::ParallelOp op) {
+ return convertOmpParallel(*op, builder, moduleTranslation);
+ })
+ .Case([&](omp::ReductionOp reductionOp) {
+ return convertOmpReductionOp(reductionOp, builder, moduleTranslation);
+ })
+ .Case([&](omp::MasterOp) {
+ return convertOmpMaster(*op, builder, moduleTranslation);
+ })
+ .Case([&](omp::CriticalOp) {
+ return convertOmpCritical(*op, builder, moduleTranslation);
+ })
+ .Case([&](omp::OrderedRegionOp) {
+ return convertOmpOrderedRegion(*op, builder, moduleTranslation);
+ })
+ .Case([&](omp::OrderedOp) {
+ return convertOmpOrdered(*op, builder, moduleTranslation);
+ })
+ .Case([&](omp::WsLoopOp) {
+ return convertOmpWsLoop(*op, builder, moduleTranslation);
+ })
+ .Case([&](omp::SimdLoopOp) {
+ return convertOmpSimdLoop(*op, builder, moduleTranslation);
+ })
+ .Case([&](omp::AtomicReadOp) {
+ return convertOmpAtomicRead(*op, builder, moduleTranslation);
+ })
+ .Case([&](omp::AtomicWriteOp) {
+ return convertOmpAtomicWrite(*op, builder, moduleTranslation);
+ })
+ .Case([&](omp::AtomicUpdateOp op) {
+ return convertOmpAtomicUpdate(op, builder, moduleTranslation);
+ })
+ .Case([&](omp::AtomicCaptureOp op) {
+ return convertOmpAtomicCapture(op, builder, moduleTranslation);
+ })
+ .Case([&](omp::SectionsOp) {
+ return convertOmpSections(*op, builder, moduleTranslation);
+ })
+ .Case([&](omp::SingleOp op) {
+ return convertOmpSingle(op, builder, moduleTranslation);
+ })
+ .Case([&](omp::TeamsOp op) {
+ return convertOmpTeams(op, builder, moduleTranslation);
+ })
+ .Case([&](omp::TaskOp op) {
+ return convertOmpTaskOp(op, builder, moduleTranslation);
+ })
+ .Case([&](omp::TaskGroupOp op) {
+ return convertOmpTaskgroupOp(op, builder, moduleTranslation);
+ })
+ .Case<omp::YieldOp, omp::TerminatorOp, omp::ReductionDeclareOp,
+ omp::CriticalDeclareOp>([](auto op) {
+ // `yield` and `terminator` can be just omitted. The block structure
+ // was created in the region that handles their parent operation.
+ // `reduction.declare` will be used by reductions and is not
+ // converted directly, skip it.
+ // `critical.declare` is only used to declare names of critical
+ // sections which will be used by `critical` ops and hence can be
+ // ignored for lowering. The OpenMP IRBuilder will create unique
+ // name for critical section names.
+ return success();
+ })
+ .Case([&](omp::ThreadprivateOp) {
+ return convertOmpThreadprivate(*op, builder, moduleTranslation);
+ })
+ .Case<omp::DataOp, omp::EnterDataOp, omp::ExitDataOp, omp::UpdateDataOp>(
+ [&](auto op) {
+ return convertOmpTargetData(op, builder, moduleTranslation);
+ })
+ .Case([&](omp::TargetOp) {
+ return convertOmpTarget(*op, builder, moduleTranslation);
+ })
+ .Case([&](omp::DistributeOp) {
+ return convertOmpDistribute(*op, builder, moduleTranslation);
+ })
+ .Case<omp::MapInfoOp, omp::DataBoundsOp, omp::PrivateClauseOp>(
+ [&](auto op) {
+ // No-op, should be handled by relevant owning operations e.g.
+ // TargetOp, EnterDataOp, ExitDataOp, DataOp etc. and then
+ // discarded
+ return success();
+ })
+ .Default([&](Operation *inst) {
+ return inst->emitError("unsupported OpenMP operation: ")
+ << inst->getName();
+ });
+}
+
+static LogicalResult
+convertInternalTargetOp(Operation *op, llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation) {
+ return convertCommonOperation(op, builder, moduleTranslation);
+}
+
+static LogicalResult
+convertTopLevelTargetOp(Operation *op, llvm::IRBuilderBase &builder,
+ LLVM::ModuleTranslation &moduleTranslation) {
+ if (isa<omp::TargetOp>(op))
+ return convertOmpTarget(*op, builder, moduleTranslation);
+ bool interrupted =
+ op->walk<WalkOrder::PreOrder>([&](omp::TargetOp targetOp) {
+ if (failed(convertOmpTarget(*targetOp, builder, moduleTranslation)))
+ return WalkResult::interrupt();
+ return WalkResult::skip();
+ }).wasInterrupted();
+ return failure(interrupted);
+}
+
 namespace {
 
 /// Implementation of the dialect interface that converts operations belonging
@@ -3208,8 +3366,8 @@ class OpenMPDialectLLVMIRTranslationInterface
  convertOperation(Operation *op, llvm::IRBuilderBase &builder,
  LLVM::ModuleTranslation &moduleTranslation) const final;
 
- /// Given an OpenMP MLIR attribute, create the corresponding LLVM-IR, runtime
- /// calls, or operation amendments
+ /// Given an OpenMP MLIR attribute, create the corresponding LLVM-IR,
+ /// runtime calls, or operation amendments
  LogicalResult
  amendOperation(Operation *op, ArrayRef<llvm::Instruction *> instructions,
  NamedAttribute attribute,
@@ -3315,118 +3473,15 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::convertOperation(
 
  llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
 
- return llvm::TypeSwitch<Operation *, LogicalResult>(op)
- .Case([&](omp::BarrierOp) {
- ompBuilder->createBarrier(builder.saveIP(), llvm::omp::OMPD_barrier);
- return success();
- })
- .Case([&](omp::TaskwaitOp) {
- ompBuilder->createTaskwait(builder.saveIP());
- return success();
- })
- .Case([&](omp::TaskyieldOp) {
- ompBuilder->createTaskyield(builder.saveIP());
- return success();
- })
- .Case([&](omp::FlushOp) {
- // No support in Openmp runtime function (__kmpc_flush) to accept
- // the argument list.
- // OpenMP standard states the following:
- // "An implementation may implement a flush with a list by ignoring
- // the list, and treating it the same as a flush without a list."
- //
- // The argument list is discarded so that, flush with a list is treated
- // same as a flush without a list.
- ompBuilder->createFlush(builder.saveIP());
- return success();
- })
- .Case([&](omp::ParallelOp op) {
- return convertOmpParallel(*op, builder, moduleTranslation);
- })
- .Case([&](omp::ReductionOp reductionOp) {
- return convertOmpReductionOp(reductionOp, builder, moduleTranslation);
- })
- .Case([&](omp::MasterOp) {
- return convertOmpMaster(*op, builder, moduleTranslation);
- })
- .Case([&](omp::CriticalOp) {
- return convertOmpCritical(*op, builder, moduleTranslation);
- })
- .Case([&](omp::OrderedRegionOp) {
- return convertOmpOrderedRegion(*op, builder, moduleTranslation);
- })
- .Case([&](omp::OrderedOp) {
- return convertOmpOrdered(*op, builder, moduleTranslation);
- })
- .Case([&](omp::WsLoopOp) {
- return convertOmpWsLoop(*op, builder, moduleTranslation);
- })
- .Case([&](omp::SimdLoopOp) {
- return convertOmpSimdLoop(*op, builder, moduleTranslation);
- })
- .Case([&](omp::AtomicReadOp) {
- return convertOmpAtomicRead(*op, builder, moduleTranslation);
- })
- .Case([&](omp::AtomicWriteOp) {
- return convertOmpAtomicWrite(*op, builder, moduleTranslation);
- })
- .Case([&](omp::AtomicUpdateOp op) {
- return convertOmpAtomicUpdate(op, builder, moduleTranslation);
- })
- .Case([&](omp::AtomicCaptureOp op) {
- return convertOmpAtomicCapture(op, builder, moduleTranslation);
- })
- .Case([&](omp::SectionsOp) {
- return convertOmpSections(*op, builder, moduleTranslation);
- })
- .Case([&](omp::SingleOp op) {
- return convertOmpSingle(op, builder, moduleTranslation);
- })
- .Case([&](omp::TeamsOp op) {
- return convertOmpTeams(op, builder, moduleTranslation);
- })
- .Case([&](omp::TaskOp op) {
- return convertOmpTaskOp(op, builder, moduleTranslation);
- })
- .Case([&](omp::TaskGroupOp op) {
- return convertOmpTaskgroupOp(op, builder, moduleTranslation);
- })
- .Case<omp::YieldOp, omp::TerminatorOp, omp::ReductionDeclareOp,
- omp::CriticalDeclareOp>([](auto op) {
- // `yield` and `terminator` can be just omitted. The block structure
- // was created in the region that handles their parent operation.
- // `reduction.declare` will be used by reductions and is not
- // converted directly, skip it.
- // `critical.declare` is only used to declare names of critical
- // sections which will be used by `critical` ops and hence can be
- // ignored for lowering. The OpenMP IRBuilder will create unique
- // name for critical section names.
- return success();
- })
- .Case([&](omp::ThreadprivateOp) {
- return convertOmpThreadprivate(*op, builder, moduleTranslation);
- })
- .Case<omp::DataOp, omp::EnterDataOp, omp::ExitDataOp, omp::UpdateDataOp>(
- [&](auto op) {
- return convertOmpTargetData(op, builder, moduleTranslation);
- })
- .Case([&](omp::TargetOp) {
- return convertOmpTarget(*op, builder, moduleTranslation);
- })
- .Case([&](omp::DistributeOp) {
- return convertOmpDistribute(*op, builder, moduleTranslation);
- })
- .Case<omp::MapInfoOp, omp::DataBoundsOp, omp::PrivateClauseOp>(
- [&](auto op) {
- // No-op, should be handled by relevant owning operations e.g.
- // TargetOp, EnterDataOp, ExitDataOp, DataOp etc. and then
- // discarded
- return success();
- })
- .Default([&](Operation *inst) {
- return inst->emitError("unsupported OpenMP operation: ")
- << inst->getName();
- });
+ if (ompBuilder->Config.isTargetDevice()) {
+ if (isInternalTargetDeviceOp(op)) {
+ return convertInternalTargetOp(op, builder, moduleTranslation);
+ } else {
+ return convertTopLevelTargetOp(op, builder, moduleTranslation);
+ }
+ }
+
+ return convertCommonOperation(op, builder, moduleTranslation);
 }
 
 void mlir::registerOpenMPDialectTranslation(DialectRegistry &registry) {

diff --git a/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir
@@ -4,10 +4,10 @@
 // for nested omp do loop inside omp target region
 
 module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true } {
- llvm.func @target_parallel_wsloop(%arg0: !llvm.ptr) attributes {
+ llvm.func @target_parallel_wsloop(%arg0: !llvm.ptr) attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>,
  target_cpu = "gfx90a",
- target_features = #llvm.target_features<["+gfx9-insts", "+wavefrontsize64"]>
- } {
+ target_features = #llvm.target_features<["+gfx9-insts", "+wavefrontsize64"]>}
+ {
  omp.parallel {
  %loop_ub = llvm.mlir.constant(9 : i32) : i32
  %loop_lb = llvm.mlir.constant(0 : i32) : i32

diff --git a/mlir/test/Target/LLVMIR/omptarget-target-inside-task.mlir b/mlir/test/Target/LLVMIR/omptarget-target-inside-task.mlir
@@ -0,0 +1,41 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true} {
+ llvm.func @omp_target_region_() {
+ %0 = llvm.mlir.constant(20 : i32) : i32
+ %1 = llvm.mlir.constant(10 : i32) : i32
+ %2 = llvm.mlir.constant(1 : i64) : i64
+ %3 = llvm.alloca %2 x i32 {bindc_name = "a", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEa"} : (i64) -> !llvm.ptr
+ %4 = llvm.mlir.constant(1 : i64) : i64
+ %5 = llvm.alloca %4 x i32 {bindc_name = "b", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEb"} : (i64) -> !llvm.ptr
+ %6 = llvm.mlir.constant(1 : i64) : i64
+ %7 = llvm.alloca %6 x i32 {bindc_name = "c", in_type = i32, operandSegmentSizes = array<i32: 0, 0>, uniq_name = "_QFomp_target_regionEc"} : (i64) -> !llvm.ptr
+ llvm.store %1, %3 : i32, !llvm.ptr
+ llvm.store %0, %5 : i32, !llvm.ptr
+ %map1 = omp.map_info var_ptr(%3 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+ %map2 = omp.map_info var_ptr(%5 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+ %map3 = omp.map_info var_ptr(%7 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""}
+ omp.task {
+ omp.target map_entries(%map1 -> %arg0, %map2 -> %arg1, %map3 -> %arg2 : !llvm.ptr, !llvm.ptr, !llvm.ptr) {
+ ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: !llvm.ptr):
+ %8 = llvm.load %arg0 : !llvm.ptr -> i32
+ %9 = llvm.load %arg1 : !llvm.ptr -> i32
+ %10 = llvm.add %8, %9 : i32
+ llvm.store %10, %arg2 : i32, !llvm.ptr
+ omp.terminator
+ }
+ omp.terminator
+ }
+ llvm.return
+ }
+
+ llvm.func @omp_target_no_map() {
+ omp.target {
+ omp.terminator
+ }
+ llvm.return
+ }
+}
+
+// CHECK: define weak_odr protected amdgpu_kernel void @__omp_offloading_{{.*}}_{{.*}}_omp_target_region__l19
+// CHECK: ret void
diff --git a/mlir/test/Target/LLVMIR/omptarget-teams-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-teams-llvm.mlir
@@ -5,7 +5,7 @@
 
 module attributes {omp.is_target_device = true} {
  llvm.func @foo(i32)
- llvm.func @omp_target_teams_shared_simple(%arg0 : i32) {
+ llvm.func @omp_target_teams_shared_simple(%arg0 : i32) attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} {
  omp.teams {
  llvm.call @foo(%arg0) : (i32) -> ()
  omp.terminator

diff --git a/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir b/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir
@@ -4,7 +4,7 @@
 // for nested omp do loop with collapse clause inside omp target region
 
 module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true } {
- llvm.func @target_collapsed_wsloop(%arg0: !llvm.ptr) {
+ llvm.func @target_collapsed_wsloop(%arg0: !llvm.ptr) attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} {
  %loop_ub = llvm.mlir.constant(99 : i32) : i32
  %loop_lb = llvm.mlir.constant(0 : i32) : i32
  %loop_step = llvm.mlir.constant(1 : index) : i32

diff --git a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
@@ -4,7 +4,7 @@
 // for nested omp do loop inside omp target region
 
 module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true } {
- llvm.func @target_wsloop(%arg0: !llvm.ptr ){
+ llvm.func @target_wsloop(%arg0: !llvm.ptr ) attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} {
  %loop_ub = llvm.mlir.constant(9 : i32) : i32
  %loop_lb = llvm.mlir.constant(0 : i32) : i32
  %loop_step = llvm.mlir.constant(1 : i32) : i32
@@ -16,7 +16,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
  llvm.return
  }
 
- llvm.func @target_empty_wsloop(){
+ llvm.func @target_empty_wsloop() attributes {omp.declare_target = #omp.declaretarget<device_type = (any), capture_clause = (to)>} {
  %loop_ub = llvm.mlir.constant(9 : i32) : i32
  %loop_lb = llvm.mlir.constant(0 : i32) : i32
  %loop_step = llvm.mlir.constant(1 : i32) : i32