[SYCL][NVPTX] Split max_work_group_size into 3 NVVM annotations (#14420)

NVVM IR supports separate maxntidx, maxntidy, and maxntidz annotations. The backend will print them individually as three dimensions. This better preserves programmer intent than prematurely flattening them together. Note that the semantics are in fact identical: the CUDA implementation internally multiplies all dimensions together and only guarantees that the total is never exceeded, not that any individual dimension is not exceeded. Thus 64,1,1 is identical to 4,4,4. We try to preserve a logical mapping of dimensions by index-flipping between SYCL (z,y,x) and NVVM (x,y,z) in CUDA terminology even though, as mentioned above, it is largely irrelevant. This patch also simplifies the attribute's getter functions, as all dimensions are mandatory and the getters appear to have been copied from the reqd_work_group_size attribute, where some dimensions are optional. We could probably improve the code further by making the operands "unsigned" rather than "Expr", and renaming them from X,Y,Z to Dim{0,1,2} as per the SYCL spec. This has been left for future work, however, as there is a non-trivial amount of code that expects to be able to treat the max_work_group_size and reqd_work_group_size attributes identically through templates and identical helper methods.
intel · Jul 4, 2024 · ef62cad · ef62cad
1 parent 7cb3107
commit ef62cad
Show file tree

Hide file tree

Showing 4 changed files with 55 additions and 41 deletions.
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
@@ -1705,20 +1705,14 @@ def SYCLIntelMaxWorkGroupSize : InheritableAttr {
   let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost];
   let Subjects = SubjectList<[Function], ErrorDiag>;
   let AdditionalMembers = [{
-    std::optional<llvm::APSInt> getXDimVal() const {
-      if (const auto *CE = dyn_cast<ConstantExpr>(getXDim()))
-        return CE->getResultAsAPSInt();
-      return std::nullopt;
+    unsigned getXDimVal() const {
+      return cast<ConstantExpr>(getXDim())->getResultAsAPSInt().getExtValue();
     }
-    std::optional<llvm::APSInt> getYDimVal() const {
-      if (const auto *CE = dyn_cast<ConstantExpr>(getYDim()))
-        return CE->getResultAsAPSInt();
-      return std::nullopt;
+    unsigned getYDimVal() const {
+      return cast<ConstantExpr>(getYDim())->getResultAsAPSInt().getExtValue();
     }
-    std::optional<llvm::APSInt> getZDimVal() const {
-      if (const auto *CE = dyn_cast<ConstantExpr>(getZDim()))
-        return CE->getResultAsAPSInt();
-      return std::nullopt;
+    unsigned getZDimVal() const {
+      return cast<ConstantExpr>(getZDim())->getResultAsAPSInt().getExtValue();
     }
   }];
   let Documentation = [SYCLIntelMaxWorkGroupSizeAttrDocs];

diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -826,9 +826,9 @@ void CodeGenFunction::EmitKernelMetadata(const FunctionDecl *FD,
     // Attributes arguments (first and third) are reversed on SYCLDevice.
     if (getLangOpts().SYCLIsDevice) {
       llvm::Metadata *AttrMDArgs[] = {
-          llvm::ConstantAsMetadata::get(Builder.getInt(*A->getZDimVal())),
-          llvm::ConstantAsMetadata::get(Builder.getInt(*A->getYDimVal())),
-          llvm::ConstantAsMetadata::get(Builder.getInt(*A->getXDimVal()))};
+          llvm::ConstantAsMetadata::get(Builder.getInt32(A->getZDimVal())),
+          llvm::ConstantAsMetadata::get(Builder.getInt32(A->getYDimVal())),
+          llvm::ConstantAsMetadata::get(Builder.getInt32(A->getXDimVal()))};
       Fn->setMetadata("max_work_group_size",
                       llvm::MDNode::get(Context, AttrMDArgs));
     }

diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -252,13 +252,13 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
     bool HasMaxWorkGroupSize = false;
     bool HasMinWorkGroupPerCU = false;
     if (const auto *MWGS = FD->getAttr<SYCLIntelMaxWorkGroupSizeAttr>()) {
-      auto MaxThreads = (*MWGS->getZDimVal()).getExtValue() *
-                        (*MWGS->getYDimVal()).getExtValue() *
-                        (*MWGS->getXDimVal()).getExtValue();
-      if (MaxThreads > 0) {
-        addNVVMMetadata(F, "maxntidx", MaxThreads);
-        HasMaxWorkGroupSize = true;
-      }
+      HasMaxWorkGroupSize = true;
+      // We must index-flip between SYCL's notation, X,Y,Z (aka dim0,dim1,dim2)
+      // with the fastest-moving dimension rightmost, and CUDA's, where X is the
+      // fastest-moving dimension.
+      addNVVMMetadata(F, "maxntidx", MWGS->getZDimVal());
+      addNVVMMetadata(F, "maxntidy", MWGS->getYDimVal());
+      addNVVMMetadata(F, "maxntidz", MWGS->getXDimVal());
     }
 
     auto attrValue = [&](Expr *E) {

diff --git a/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp b/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp
@@ -4,7 +4,7 @@
 // compute unit and maximum work groups per multi-processor attributes, that
 // correspond to CUDA's launch bounds. Expect max_work_group_size,
 // min_work_groups_per_cu and max_work_groups_per_mp that are mapped to
-// maxntidx, minctasm, and maxclusterrank NVVM annotations respectively.
+// maxntid[xyz], minctasm, and maxclusterrank NVVM annotations respectively.
 
 #include "sycl.hpp"
 
@@ -13,24 +13,24 @@ queue q;
 
 class Foo {
 public:
-  [[intel::max_work_group_size(8, 8, 8), intel::min_work_groups_per_cu(2),
+  [[intel::max_work_group_size(2, 4, 8), intel::min_work_groups_per_cu(2),
     intel::max_work_groups_per_mp(4)]] void
   operator()() const {}
 };
 
 template <int N> class Functor {
 public:
-  [[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N),
+  [[intel::max_work_group_size(N, 4, 8), intel::min_work_groups_per_cu(N),
     intel::max_work_groups_per_mp(N)]] void
   operator()() const {}
 };
 
 template <int N>
-[[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N),
+[[intel::max_work_group_size(N, 4, 8), intel::min_work_groups_per_cu(N),
   intel::max_work_groups_per_mp(N)]] void
 zoo() {}
 
-[[intel::max_work_group_size(8, 8, 8), intel::min_work_groups_per_cu(2),
+[[intel::max_work_group_size(2, 4, 8), intel::min_work_groups_per_cu(2),
   intel::max_work_groups_per_mp(4)]] void
 bar() {}
 
@@ -42,7 +42,7 @@ int main() {
 
     // Test attribute is applied on lambda.
     h.single_task<class kernel_name2>(
-        [] [[intel::max_work_group_size(8, 8, 8),
+        [] [[intel::max_work_group_size(2, 4, 8),
              intel::min_work_groups_per_cu(2),
              intel::max_work_groups_per_mp(4)]] () {});
 
@@ -65,41 +65,61 @@ int main() {
 // CHECK: define dso_local void @{{.*}}kernel_name4() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC:[0-9]+]] !max_work_groups_per_mp ![[MWGPM:[0-9]+]] !max_work_group_size ![[MWGS:[0-9]+]]
 // CHECK: define dso_local void @{{.*}}kernel_name5() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC_MWGPM_2:[0-9]+]] !max_work_groups_per_mp ![[MWGPC_MWGPM_2]] !max_work_group_size ![[MWGS_3:[0-9]+]]
 
-// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name1, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name1, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name2, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name2, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}main{{.*}}, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidx", i32 384}
+// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidz", i32 6}
 // CHECK: {{.*}}@{{.*}}kernel_name3, !"minctasm", i32 6}
 // CHECK: {{.*}}@{{.*}}kernel_name3, !"maxclusterrank", i32 6}
-// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidx", i32 384}
+// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidz", i32 6}
 // CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"minctasm", i32 6}
 // CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxclusterrank", i32 6}
-// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name4, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name4, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}bar{{.*}}, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidx", i32 1024}
+// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidz", i32 16}
 // CHECK: {{.*}}@{{.*}}kernel_name5, !"minctasm", i32 16}
 // CHECK: {{.*}}@{{.*}}kernel_name5, !"maxclusterrank", i32 16}
-// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidx", i32 1024}
+// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidz", i32 16}
 // CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"minctasm", i32 16}
 // CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxclusterrank", i32 16}
 
 // CHECK: ![[MWGPC]] = !{i32 2}
 // CHECK: ![[MWGPM]] = !{i32 4}
-// CHECK: ![[MWGS]] = !{i32 8, i32 8, i32 8}
+// CHECK: ![[MWGS]] = !{i32 8, i32 4, i32 2}
 // CHECK: ![[MWGPC_MWGPM]] = !{i32 6}
-// CHECK: ![[MWGS_2]] = !{i32 8, i32 8, i32 6}
+// CHECK: ![[MWGS_2]] = !{i32 8, i32 4, i32 6}
 // CHECK: ![[MWGPC_MWGPM_2]] = !{i32 16}
-// CHECK: ![[MWGS_3]] = !{i32 8, i32 8, i32 16}
+// CHECK: ![[MWGS_3]] = !{i32 8, i32 4, i32 16}