Merge branch 'sycl' of https://github.com/intel/llvm into benchmarking-workflow
intel · Jul 4, 2024 · 6d14a32 · 6d14a32
2 parents 6ea0110 + ef62cad
commit 6d14a32
Show file tree

Hide file tree

Showing 4 changed files with 55 additions and 41 deletions.
diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
@@ -1705,20 +1705,14 @@ def SYCLIntelMaxWorkGroupSize : InheritableAttr {
  let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost];
  let Subjects = SubjectList<[Function], ErrorDiag>;
  let AdditionalMembers = [{
- std::optional<llvm::APSInt> getXDimVal() const {
- if (const auto *CE = dyn_cast<ConstantExpr>(getXDim()))
- return CE->getResultAsAPSInt();
- return std::nullopt;
+ unsigned getXDimVal() const {
+ return cast<ConstantExpr>(getXDim())->getResultAsAPSInt().getExtValue();
  }
- std::optional<llvm::APSInt> getYDimVal() const {
- if (const auto *CE = dyn_cast<ConstantExpr>(getYDim()))
- return CE->getResultAsAPSInt();
- return std::nullopt;
+ unsigned getYDimVal() const {
+ return cast<ConstantExpr>(getYDim())->getResultAsAPSInt().getExtValue();
  }
- std::optional<llvm::APSInt> getZDimVal() const {
- if (const auto *CE = dyn_cast<ConstantExpr>(getZDim()))
- return CE->getResultAsAPSInt();
- return std::nullopt;
+ unsigned getZDimVal() const {
+ return cast<ConstantExpr>(getZDim())->getResultAsAPSInt().getExtValue();
  }
  }];
  let Documentation = [SYCLIntelMaxWorkGroupSizeAttrDocs];

diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -826,9 +826,9 @@ void CodeGenFunction::EmitKernelMetadata(const FunctionDecl *FD,
  // Attributes arguments (first and third) are reversed on SYCLDevice.
  if (getLangOpts().SYCLIsDevice) {
  llvm::Metadata *AttrMDArgs[] = {
- llvm::ConstantAsMetadata::get(Builder.getInt(*A->getZDimVal())),
- llvm::ConstantAsMetadata::get(Builder.getInt(*A->getYDimVal())),
- llvm::ConstantAsMetadata::get(Builder.getInt(*A->getXDimVal()))};
+ llvm::ConstantAsMetadata::get(Builder.getInt32(A->getZDimVal())),
+ llvm::ConstantAsMetadata::get(Builder.getInt32(A->getYDimVal())),
+ llvm::ConstantAsMetadata::get(Builder.getInt32(A->getXDimVal()))};
  Fn->setMetadata("max_work_group_size",
  llvm::MDNode::get(Context, AttrMDArgs));
  }

diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -252,13 +252,13 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
  bool HasMaxWorkGroupSize = false;
  bool HasMinWorkGroupPerCU = false;
  if (const auto *MWGS = FD->getAttr<SYCLIntelMaxWorkGroupSizeAttr>()) {
- auto MaxThreads = (*MWGS->getZDimVal()).getExtValue() *
-  (*MWGS->getYDimVal()).getExtValue() *
-  (*MWGS->getXDimVal()).getExtValue();
- if (MaxThreads > 0) {
-  addNVVMMetadata(F, "maxntidx", MaxThreads);
-  HasMaxWorkGroupSize = true;
- }
+ HasMaxWorkGroupSize = true;
+ // We must index-flip between SYCL's notation, X,Y,Z (aka dim0,dim1,dim2)
+ // with the fastest-moving dimension rightmost, to CUDA's, where X is the
+ // fastest-moving dimension.
+ addNVVMMetadata(F, "maxntidx", MWGS->getZDimVal());
+ addNVVMMetadata(F, "maxntidy", MWGS->getYDimVal());
+ addNVVMMetadata(F, "maxntidz", MWGS->getXDimVal());
  }
 
  auto attrValue = [&](Expr *E) {

diff --git a/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp b/clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp
@@ -4,7 +4,7 @@
 // compute unit and maximum work groups per multi-processor attributes, that
 // correspond to CUDA's launch bounds. Expect max_work_group_size,
 // min_work_groups_per_cu and max_work_groups_per_mp that are mapped to
-// maxntidx, minctasm, and maxclusterrank NVVM annotations respectively.
+// maxntid[xyz], minctasm, and maxclusterrank NVVM annotations respectively.
 
 #include "sycl.hpp"
 
@@ -13,24 +13,24 @@ queue q;
 
 class Foo {
 public:
- [[intel::max_work_group_size(8, 8, 8), intel::min_work_groups_per_cu(2),
+ [[intel::max_work_group_size(2, 4, 8), intel::min_work_groups_per_cu(2),
  intel::max_work_groups_per_mp(4)]] void
  operator()() const {}
 };
 
 template <int N> class Functor {
 public:
- [[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N),
+ [[intel::max_work_group_size(N, 4, 8), intel::min_work_groups_per_cu(N),
  intel::max_work_groups_per_mp(N)]] void
  operator()() const {}
 };
 
 template <int N>
-[[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N),
+[[intel::max_work_group_size(N, 4, 8), intel::min_work_groups_per_cu(N),
  intel::max_work_groups_per_mp(N)]] void
 zoo() {}
 
-[[intel::max_work_group_size(8, 8, 8), intel::min_work_groups_per_cu(2),
+[[intel::max_work_group_size(2, 4, 8), intel::min_work_groups_per_cu(2),
  intel::max_work_groups_per_mp(4)]] void
 bar() {}
 
@@ -42,7 +42,7 @@ int main() {
 
  // Test attribute is applied on lambda.
  h.single_task<class kernel_name2>(
- [] [[intel::max_work_group_size(8, 8, 8),
+ [] [[intel::max_work_group_size(2, 4, 8),
  intel::min_work_groups_per_cu(2),
  intel::max_work_groups_per_mp(4)]] () {});
 
@@ -65,41 +65,61 @@ int main() {
 // CHECK: define dso_local void @{{.*}}kernel_name4() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC:[0-9]+]] !max_work_groups_per_mp ![[MWGPM:[0-9]+]] !max_work_group_size ![[MWGS:[0-9]+]]
 // CHECK: define dso_local void @{{.*}}kernel_name5() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC_MWGPM_2:[0-9]+]] !max_work_groups_per_mp ![[MWGPC_MWGPM_2]] !max_work_group_size ![[MWGS_3:[0-9]+]]
 
-// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name1, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name1, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name2, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name2, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}main{{.*}}, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidx", i32 384}
+// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidz", i32 6}
 // CHECK: {{.*}}@{{.*}}kernel_name3, !"minctasm", i32 6}
 // CHECK: {{.*}}@{{.*}}kernel_name3, !"maxclusterrank", i32 6}
-// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidx", i32 384}
+// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidz", i32 6}
 // CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"minctasm", i32 6}
 // CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxclusterrank", i32 6}
-// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name4, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}kernel_name4, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidx", i32 512}
+// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidz", i32 2}
 // CHECK: {{.*}}@{{.*}}bar{{.*}}, !"minctasm", i32 2}
 // CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxclusterrank", i32 4}
-// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidx", i32 1024}
+// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidz", i32 16}
 // CHECK: {{.*}}@{{.*}}kernel_name5, !"minctasm", i32 16}
 // CHECK: {{.*}}@{{.*}}kernel_name5, !"maxclusterrank", i32 16}
-// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidx", i32 1024}
+// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidx", i32 8}
+// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidy", i32 4}
+// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidz", i32 16}
 // CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"minctasm", i32 16}
 // CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxclusterrank", i32 16}
 
 // CHECK: ![[MWGPC]] = !{i32 2}
 // CHECK: ![[MWGPM]] = !{i32 4}
-// CHECK: ![[MWGS]] = !{i32 8, i32 8, i32 8}
+// CHECK: ![[MWGS]] = !{i32 8, i32 4, i32 2}
 // CHECK: ![[MWGPC_MWGPM]] = !{i32 6}
-// CHECK: ![[MWGS_2]] = !{i32 8, i32 8, i32 6}
+// CHECK: ![[MWGS_2]] = !{i32 8, i32 4, i32 6}
 // CHECK: ![[MWGPC_MWGPM_2]] = !{i32 16}
-// CHECK: ![[MWGS_3]] = !{i32 8, i32 8, i32 16}
+// CHECK: ![[MWGS_3]] = !{i32 8, i32 4, i32 16}