From 73466c5de83beac4109c65c99b25654db8bb67fe Mon Sep 17 00:00:00 2001
From: Dominik Adamski <dominik.adamski@amd.com>
Date: Thu, 22 Feb 2024 03:53:40 -0600
Subject: [PATCH] [MLIR][OpenMP] Lower only target related pragmas for GPU MLIR

Flang generates two MLIR files for target-related pragmas:
the host MLIR and the GPU MLIR. If a given source function
contains both host and target-related pragmas, we must ensure
that we do not lower the host-related pragmas for the GPU MLIR.
---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 43 +++++++++++++++++++
 .../LLVMIR/omptarget-parallel-wsloop.mlir     |  3 +-
 .../LLVMIR/omptarget-wsloop-collapsed.mlir    |  2 +-
 mlir/test/Target/LLVMIR/omptarget-wsloop.mlir |  6 ++-
 4 files changed, 50 insertions(+), 4 deletions(-)
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 3500ddc93a1753..d4ec80d700c071 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -127,6 +127,46 @@ findAllocaInsertPoint(llvm::IRBuilderBase &builder,
       &funcEntryBlock, funcEntryBlock.getFirstInsertionPt());
 }
 
+/// Tells the caller whether \p opInst should be lowered or skipped. omp.target
+/// ops and ops nested in one are always accepted. Other ops are rejected when
+/// \p opInst is null, when no llvm.func encloses them, or when \p ompBuilder is
+/// configured for a GPU and that function is not declare target for non-host.
+static bool isOpAllowedToBeLowered(Operation *opInst,
+                                   llvm::OpenMPIRBuilder *ompBuilder) {
+  if (!opInst)
+    return false;
+  // Target regions and their contents are lowered for both host and device.
+  if (isa<omp::TargetOp>(opInst) || opInst->getParentOfType<omp::TargetOp>())
+    return true;
+
+  // TODO: Add support for a target region nested in a host construct, e.g.:
+  //   omp.parallel { // host pragma
+  //     omp.target { }
+  //   }
+  // Until then it is only diagnosed; the remaining checks still run.
+  WalkResult nestedTargetSearch =
+      opInst->walk([](omp::TargetOp) { return WalkResult::interrupt(); });
+  if (nestedTargetSearch.wasInterrupted())
+    opInst->emitError("Target region inside other pragma is not yet supported");
+
+  // Operations that are not nested in an llvm.func are never lowered.
+  auto enclosingFunc = opInst->getParentOfType<LLVM::LLVMFuncOp>();
+  if (!enclosingFunc)
+    return false;
+
+  // The function is treated as host-only unless it is marked declare target
+  // with a device type other than host.
+  auto declTarget =
+      dyn_cast<omp::DeclareTargetInterface>(enclosingFunc.getOperation());
+  const bool isHostOnlyFunc =
+      !declTarget || !declTarget.isDeclareTarget() ||
+      declTarget.getDeclareTargetDeviceType() ==
+          omp::DeclareTargetDeviceType::host;
+
+  // Operations from host-only functions are not lowered for the GPU.
+  return !(isHostOnlyFunc && ompBuilder->Config.isGPU());
+}
+
 /// Converts the given region that appears within an OpenMP dialect operation to
 /// LLVM IR, creating a branch from the `sourceBlock` to the entry block of the
 /// region, and a branch from any block with an successor-less OpenMP terminator
@@ -3182,6 +3222,9 @@ LogicalResult OpenMPDialectLLVMIRTranslationInterface::convertOperation(
 
   llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
 
+  // Skip lowering of an OpenMP operation if its context is not appropriate
+  if (!isOpAllowedToBeLowered(op, ompBuilder))
+    return success();
   return llvm::TypeSwitch<Operation *, LogicalResult>(op)
       .Case([&](omp::BarrierOp) {
         ompBuilder->createBarrier(builder.saveIP(), llvm::omp::OMPD_barrier);
diff --git a/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir
index afbf5f22246309..1561b535250142 100644
--- a/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-parallel-wsloop.mlir
@@ -6,7 +6,8 @@
 module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true } {
   llvm.func @target_parallel_wsloop(%arg0: !llvm.ptr) attributes {
     target_cpu = "gfx90a",
-    target_features = #llvm.target_features<["+gfx9-insts", "+wavefrontsize64"]>
+    target_features = #llvm.target_features<["+gfx9-insts", "+wavefrontsize64"]>,
+    omp.declare_target = #omp.declaretarget<device_type = (any)>
   } {
     omp.parallel {
       %loop_ub = llvm.mlir.constant(9 : i32) : i32
diff --git a/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir b/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir
index 435aca32450c2f..4690cf43e84df2 100644
--- a/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir
@@ -4,7 +4,7 @@
 // for nested omp do loop with collapse clause inside omp target region
 
 module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true } {
-  llvm.func @target_collapsed_wsloop(%arg0: !llvm.ptr) {
+  llvm.func @target_collapsed_wsloop(%arg0: !llvm.ptr) attributes {omp.declare_target = #omp.declaretarget<device_type = (any)> } {
     %loop_ub = llvm.mlir.constant(99 : i32) : i32
     %loop_lb = llvm.mlir.constant(0 : i32) : i32
     %loop_step = llvm.mlir.constant(1 : index) : i32
diff --git a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
index 4cfb7d4f695143..a7b3f3542a4acd 100644
--- a/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
+++ b/mlir/test/Target/LLVMIR/omptarget-wsloop.mlir
@@ -4,7 +4,8 @@
 // for nested omp do loop inside omp target region
 
 module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true } {
-  llvm.func @target_wsloop(%arg0: !llvm.ptr ){
+  llvm.func @target_wsloop(%arg0: !llvm.ptr ) attributes {
+    omp.declare_target = #omp.declaretarget<device_type = (any)> } {
       %loop_ub = llvm.mlir.constant(9 : i32) : i32
       %loop_lb = llvm.mlir.constant(0 : i32) : i32
       %loop_step = llvm.mlir.constant(1 : i32) : i32
@@ -16,7 +17,8 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memo
     llvm.return
   }
 
-  llvm.func @target_empty_wsloop(){
+  llvm.func @target_empty_wsloop() attributes {
+    omp.declare_target = #omp.declaretarget<device_type = (any)> } {
       %loop_ub = llvm.mlir.constant(9 : i32) : i32
       %loop_lb = llvm.mlir.constant(0 : i32) : i32
       %loop_step = llvm.mlir.constant(1 : i32) : i32