Skip to content

Commit

Permalink
[SYCL][NVPTX] Split max_work_group_size into 3 NVVM annotations (#14420)
Browse files Browse the repository at this point in the history
NVVM IR supports separate maxntidx, maxntidy, and maxntidz annotations.
The backend will print them individually as three dimensions. This
better preserves programmer intent than prematurely flattening them
together.

Note that the semantics are in fact identical; the CUDA implementation
internally multiplies all dimensions together and only guarantees that
the total is never exceeded, not that each individual dimension stays
within its bound. Thus 64,1,1 is equivalent to 4,4,4.

We try to preserve a logical mapping of dimensions by flipping the
indices between SYCL (z,y,x) and NVVM (x,y,z), in CUDA terminology, even
though, as mentioned above, this is largely irrelevant.

This patch also simplifies the attribute's getter functions, as all
dimensions are mandatory; the getters appear to have been copied from
the reqd_work_group_size attribute, where some dimensions are optional.

We could probably improve the code further by making the operands
"unsigned" and not "Expr", and renaming them from X,Y,Z to Dim{0,1,2} as
per the SYCL spec. This has been left for future work, however, as
there's a non-trivial amount of code that expects to be able to treat
the max_work_group_size and reqd_work_group_size attributes identically
through templates and identical helper methods.
  • Loading branch information
frasercrmck authored Jul 4, 2024
1 parent 7cb3107 commit ef62cad
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 41 deletions.
18 changes: 6 additions & 12 deletions clang/include/clang/Basic/Attr.td
Original file line number Diff line number Diff line change
Expand Up @@ -1705,20 +1705,14 @@ def SYCLIntelMaxWorkGroupSize : InheritableAttr {
let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost];
let Subjects = SubjectList<[Function], ErrorDiag>;
let AdditionalMembers = [{
std::optional<llvm::APSInt> getXDimVal() const {
if (const auto *CE = dyn_cast<ConstantExpr>(getXDim()))
return CE->getResultAsAPSInt();
return std::nullopt;
unsigned getXDimVal() const {
return cast<ConstantExpr>(getXDim())->getResultAsAPSInt().getExtValue();
}
std::optional<llvm::APSInt> getYDimVal() const {
if (const auto *CE = dyn_cast<ConstantExpr>(getYDim()))
return CE->getResultAsAPSInt();
return std::nullopt;
unsigned getYDimVal() const {
return cast<ConstantExpr>(getYDim())->getResultAsAPSInt().getExtValue();
}
std::optional<llvm::APSInt> getZDimVal() const {
if (const auto *CE = dyn_cast<ConstantExpr>(getZDim()))
return CE->getResultAsAPSInt();
return std::nullopt;
unsigned getZDimVal() const {
return cast<ConstantExpr>(getZDim())->getResultAsAPSInt().getExtValue();
}
}];
let Documentation = [SYCLIntelMaxWorkGroupSizeAttrDocs];
Expand Down
6 changes: 3 additions & 3 deletions clang/lib/CodeGen/CodeGenFunction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -826,9 +826,9 @@ void CodeGenFunction::EmitKernelMetadata(const FunctionDecl *FD,
// Attributes arguments (first and third) are reversed on SYCLDevice.
if (getLangOpts().SYCLIsDevice) {
llvm::Metadata *AttrMDArgs[] = {
llvm::ConstantAsMetadata::get(Builder.getInt(*A->getZDimVal())),
llvm::ConstantAsMetadata::get(Builder.getInt(*A->getYDimVal())),
llvm::ConstantAsMetadata::get(Builder.getInt(*A->getXDimVal()))};
llvm::ConstantAsMetadata::get(Builder.getInt32(A->getZDimVal())),
llvm::ConstantAsMetadata::get(Builder.getInt32(A->getYDimVal())),
llvm::ConstantAsMetadata::get(Builder.getInt32(A->getXDimVal()))};
Fn->setMetadata("max_work_group_size",
llvm::MDNode::get(Context, AttrMDArgs));
}
Expand Down
14 changes: 7 additions & 7 deletions clang/lib/CodeGen/Targets/NVPTX.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -252,13 +252,13 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
bool HasMaxWorkGroupSize = false;
bool HasMinWorkGroupPerCU = false;
if (const auto *MWGS = FD->getAttr<SYCLIntelMaxWorkGroupSizeAttr>()) {
auto MaxThreads = (*MWGS->getZDimVal()).getExtValue() *
(*MWGS->getYDimVal()).getExtValue() *
(*MWGS->getXDimVal()).getExtValue();
if (MaxThreads > 0) {
addNVVMMetadata(F, "maxntidx", MaxThreads);
HasMaxWorkGroupSize = true;
}
HasMaxWorkGroupSize = true;
// We must index-flip from SYCL's notation, X,Y,Z (aka dim0,dim1,dim2)
// with the fastest-moving dimension rightmost, to CUDA's, where X is the
// fastest-moving dimension.
addNVVMMetadata(F, "maxntidx", MWGS->getZDimVal());
addNVVMMetadata(F, "maxntidy", MWGS->getYDimVal());
addNVVMMetadata(F, "maxntidz", MWGS->getXDimVal());
}

auto attrValue = [&](Expr *E) {
Expand Down
58 changes: 39 additions & 19 deletions clang/test/CodeGenSYCL/launch_bounds_nvptx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
// compute unit and maximum work groups per multi-processor attributes, that
// correspond to CUDA's launch bounds. Expect max_work_group_size,
// min_work_groups_per_cu and max_work_groups_per_mp that are mapped to
// maxntidx, minctasm, and maxclusterrank NVVM annotations respectively.
// maxntid[xyz], minctasm, and maxclusterrank NVVM annotations respectively.

#include "sycl.hpp"

Expand All @@ -13,24 +13,24 @@ queue q;

class Foo {
public:
[[intel::max_work_group_size(8, 8, 8), intel::min_work_groups_per_cu(2),
[[intel::max_work_group_size(2, 4, 8), intel::min_work_groups_per_cu(2),
intel::max_work_groups_per_mp(4)]] void
operator()() const {}
};

template <int N> class Functor {
public:
[[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N),
[[intel::max_work_group_size(N, 4, 8), intel::min_work_groups_per_cu(N),
intel::max_work_groups_per_mp(N)]] void
operator()() const {}
};

template <int N>
[[intel::max_work_group_size(N, 8, 8), intel::min_work_groups_per_cu(N),
[[intel::max_work_group_size(N, 4, 8), intel::min_work_groups_per_cu(N),
intel::max_work_groups_per_mp(N)]] void
zoo() {}

[[intel::max_work_group_size(8, 8, 8), intel::min_work_groups_per_cu(2),
[[intel::max_work_group_size(2, 4, 8), intel::min_work_groups_per_cu(2),
intel::max_work_groups_per_mp(4)]] void
bar() {}

Expand All @@ -42,7 +42,7 @@ int main() {

// Test attribute is applied on lambda.
h.single_task<class kernel_name2>(
[] [[intel::max_work_group_size(8, 8, 8),
[] [[intel::max_work_group_size(2, 4, 8),
intel::min_work_groups_per_cu(2),
intel::max_work_groups_per_mp(4)]] () {});

Expand All @@ -65,41 +65,61 @@ int main() {
// CHECK: define dso_local void @{{.*}}kernel_name4() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC:[0-9]+]] !max_work_groups_per_mp ![[MWGPM:[0-9]+]] !max_work_group_size ![[MWGS:[0-9]+]]
// CHECK: define dso_local void @{{.*}}kernel_name5() #0 {{.*}} !min_work_groups_per_cu ![[MWGPC_MWGPM_2:[0-9]+]] !max_work_groups_per_mp ![[MWGPC_MWGPM_2]] !max_work_group_size ![[MWGS_3:[0-9]+]]

// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidx", i32 512}
// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidx", i32 8}
// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidy", i32 4}
// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxntidz", i32 2}
// CHECK: {{.*}}@{{.*}}kernel_name1, !"minctasm", i32 2}
// CHECK: {{.*}}@{{.*}}kernel_name1, !"maxclusterrank", i32 4}
// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidx", i32 512}
// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidx", i32 8}
// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidy", i32 4}
// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxntidz", i32 2}
// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"minctasm", i32 2}
// CHECK: {{.*}}@{{.*}}Foo{{.*}}, !"maxclusterrank", i32 4}
// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidx", i32 512}
// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidx", i32 8}
// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidy", i32 4}
// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxntidz", i32 2}
// CHECK: {{.*}}@{{.*}}kernel_name2, !"minctasm", i32 2}
// CHECK: {{.*}}@{{.*}}kernel_name2, !"maxclusterrank", i32 4}
// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidx", i32 512}
// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidx", i32 8}
// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidy", i32 4}
// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxntidz", i32 2}
// CHECK: {{.*}}@{{.*}}main{{.*}}, !"minctasm", i32 2}
// CHECK: {{.*}}@{{.*}}main{{.*}}, !"maxclusterrank", i32 4}
// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidx", i32 384}
// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidx", i32 8}
// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidy", i32 4}
// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxntidz", i32 6}
// CHECK: {{.*}}@{{.*}}kernel_name3, !"minctasm", i32 6}
// CHECK: {{.*}}@{{.*}}kernel_name3, !"maxclusterrank", i32 6}
// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidx", i32 384}
// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidx", i32 8}
// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidy", i32 4}
// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxntidz", i32 6}
// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"minctasm", i32 6}
// CHECK: {{.*}}@{{.*}}Functor{{.*}}, !"maxclusterrank", i32 6}
// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidx", i32 512}
// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidx", i32 8}
// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidy", i32 4}
// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxntidz", i32 2}
// CHECK: {{.*}}@{{.*}}kernel_name4, !"minctasm", i32 2}
// CHECK: {{.*}}@{{.*}}kernel_name4, !"maxclusterrank", i32 4}
// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidx", i32 512}
// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidx", i32 8}
// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidy", i32 4}
// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxntidz", i32 2}
// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"minctasm", i32 2}
// CHECK: {{.*}}@{{.*}}bar{{.*}}, !"maxclusterrank", i32 4}
// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidx", i32 1024}
// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidx", i32 8}
// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidy", i32 4}
// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxntidz", i32 16}
// CHECK: {{.*}}@{{.*}}kernel_name5, !"minctasm", i32 16}
// CHECK: {{.*}}@{{.*}}kernel_name5, !"maxclusterrank", i32 16}
// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidx", i32 1024}
// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidx", i32 8}
// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidy", i32 4}
// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxntidz", i32 16}
// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"minctasm", i32 16}
// CHECK: {{.*}}@{{.*}}zoo{{.*}}, !"maxclusterrank", i32 16}

// CHECK: ![[MWGPC]] = !{i32 2}
// CHECK: ![[MWGPM]] = !{i32 4}
// CHECK: ![[MWGS]] = !{i32 8, i32 8, i32 8}
// CHECK: ![[MWGS]] = !{i32 8, i32 4, i32 2}
// CHECK: ![[MWGPC_MWGPM]] = !{i32 6}
// CHECK: ![[MWGS_2]] = !{i32 8, i32 8, i32 6}
// CHECK: ![[MWGS_2]] = !{i32 8, i32 4, i32 6}
// CHECK: ![[MWGPC_MWGPM_2]] = !{i32 16}
// CHECK: ![[MWGS_3]] = !{i32 8, i32 8, i32 16}
// CHECK: ![[MWGS_3]] = !{i32 8, i32 4, i32 16}

0 comments on commit ef62cad

Please sign in to comment.