Skip to content

Commit

Permalink
[SYCL] Fix ballot_group when the sub-group is not full size (#12737)
Browse files Browse the repository at this point in the history
Not all sub-groups are necessarily the max size of sub-groups in the
kernel invocation. As such, non-uniform groups should handle these
sub-groups properly. However, due to how the mask for the false-group in
ballot_group creates its mask, it thinks it has full 32-element size no
matter how big the actual sub-group is. This commit fixes this issue.

---------

Signed-off-by: Larsen, Steffen <steffen.larsen@intel.com>
  • Loading branch information
steffenlarsen authored Feb 22, 2024
1 parent 358843a commit 4bc9745
Show file tree
Hide file tree
Showing 6 changed files with 186 additions and 147 deletions.
8 changes: 7 additions & 1 deletion sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,13 @@ get_ballot_group(Group group, bool predicate) {
if (predicate) {
return ballot_group<sycl::sub_group>(mask, predicate);
} else {
return ballot_group<sycl::sub_group>(~mask, predicate);
// To negate the mask for the false-predicate group, we also need to exclude
// all parts of the mask that is not part of the group.
sub_group_mask::BitsType participant_filter =
(~sub_group_mask::BitsType{0}) >>
(sub_group_mask::max_bits - group.get_local_linear_range());
return ballot_group<sycl::sub_group>((~mask) & participant_filter,
predicate);
}
#endif
#else
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ template <size_t PartitionSize, typename ParentGroup> class fixed_size_group {

range_type get_group_range() const {
#ifdef __SYCL_DEVICE_ONLY__
return __spirv_SubgroupMaxSize() / PartitionSize;
return __spirv_SubgroupSize() / PartitionSize;
#else
throw runtime_error("Non-uniform groups are not supported on host device.",
PI_ERROR_INVALID_DEVICE);
Expand Down
82 changes: 45 additions & 37 deletions sycl/test-e2e/NonUniformGroups/ballot_group.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,43 +20,51 @@ int main() {
return 0;
}

sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};

const auto NDR = sycl::nd_range<1>{32, 32};
Q.submit([&](sycl::handler &CGH) {
sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
const auto KernelFunc =
[=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
auto WI = item.get_global_id();
auto SG = item.get_sub_group();

// Split into odd and even work-items.
bool Predicate = WI % 2 == 0;
auto BallotGroup = syclex::get_ballot_group(SG, Predicate);

// Check function return values match Predicate.
// NB: Test currently uses exactly one sub-group, but we use SG
// below in case this changes in future.
bool Match = true;
auto GroupID = (Predicate) ? 1 : 0;
auto LocalID = SG.get_local_id() / 2;
Match &= (BallotGroup.get_group_id() == GroupID);
Match &= (BallotGroup.get_local_id() == LocalID);
Match &= (BallotGroup.get_group_range() == 2);
Match &= (BallotGroup.get_local_range() == 16);
MatchAcc[WI] = Match;
LeaderAcc[WI] = BallotGroup.leader();
};
CGH.parallel_for<TestKernel>(NDR, KernelFunc);
});

sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
for (int WI = 0; WI < 32; ++WI) {
assert(MatchAcc[WI] == true);
assert(LeaderAcc[WI] == (WI < 2));
// Test for both the full sub-group size and a case with less work than a full
// sub-group.
for (size_t WGS : std::array<size_t, 2>{32, 16}) {
std::cout << "Testing for work size " << WGS << std::endl;

sycl::buffer<bool, 1> MatchBuf{sycl::range{WGS}};
sycl::buffer<bool, 1> LeaderBuf{sycl::range{WGS}};

const auto NDR = sycl::nd_range<1>{WGS, WGS};
Q.submit([&](sycl::handler &CGH) {
sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
const auto KernelFunc =
[=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
auto WI = item.get_global_id();
auto SG = item.get_sub_group();

// Split into odd and even work-items.
bool Predicate = WI % 2 == 0;
auto BallotGroup = syclex::get_ballot_group(SG, Predicate);

// Check function return values match Predicate.
// NB: Test currently uses exactly one sub-group, but we use SG
// below in case this changes in future.
bool Match = true;
auto GroupID = (Predicate) ? 1 : 0;
auto LocalID = SG.get_local_id() / 2;
Match &= (BallotGroup.get_group_id() == GroupID);
Match &= (BallotGroup.get_local_id() == LocalID);
Match &= (BallotGroup.get_group_range() == 2);
Match &= (BallotGroup.get_local_range() ==
SG.get_local_linear_range() / 2);
MatchAcc[WI] = Match;
LeaderAcc[WI] = BallotGroup.leader();
};
CGH.parallel_for<TestKernel>(NDR, KernelFunc);
});

sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
for (int WI = 0; WI < WGS; ++WI) {
assert(MatchAcc[WI] == true);
assert(LeaderAcc[WI] == (WI < 2));
}
}

return 0;
}
71 changes: 41 additions & 30 deletions sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,36 +14,47 @@ template <size_t PartitionSize> class TestKernel;
template <size_t PartitionSize> void test() {
sycl::queue Q;

sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};

const auto NDR = sycl::nd_range<1>{32, 32};
Q.submit([&](sycl::handler &CGH) {
sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
const auto KernelFunc =
[=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
auto WI = item.get_global_id();
auto SG = item.get_sub_group();

auto Partition = syclex::get_fixed_size_group<PartitionSize>(SG);

bool Match = true;
Match &= (Partition.get_group_id() == (WI / PartitionSize));
Match &= (Partition.get_local_id() == (WI % PartitionSize));
Match &= (Partition.get_group_range() == (32 / PartitionSize));
Match &= (Partition.get_local_range() == PartitionSize);
MatchAcc[WI] = Match;
LeaderAcc[WI] = Partition.leader();
};
CGH.parallel_for<TestKernel<PartitionSize>>(NDR, KernelFunc);
});

sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
for (int WI = 0; WI < 32; ++WI) {
assert(MatchAcc[WI] == true);
assert(LeaderAcc[WI] == ((WI % PartitionSize) == 0));
// Test for both the full sub-group size and a case with less work than a full
// sub-group.
for (size_t WGS : std::array<size_t, 2>{32, 16}) {
if (WGS < PartitionSize)
continue;

std::cout << "Testing for work size " << WGS << " and partition size "
<< PartitionSize << std::endl;

sycl::buffer<bool, 1> MatchBuf{sycl::range{WGS}};
sycl::buffer<bool, 1> LeaderBuf{sycl::range{WGS}};

const auto NDR = sycl::nd_range<1>{WGS, WGS};
Q.submit([&](sycl::handler &CGH) {
sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
const auto KernelFunc =
[=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
auto WI = item.get_global_id();
auto SG = item.get_sub_group();
auto SGS = SG.get_local_linear_range();

auto Partition = syclex::get_fixed_size_group<PartitionSize>(SG);

bool Match = true;
Match &= (Partition.get_group_id() == (WI / PartitionSize));
Match &= (Partition.get_local_id() == (WI % PartitionSize));
Match &= (Partition.get_group_range() == (SGS / PartitionSize));
Match &= (Partition.get_local_range() == PartitionSize);
MatchAcc[WI] = Match;
LeaderAcc[WI] = Partition.leader();
};
CGH.parallel_for<TestKernel<PartitionSize>>(NDR, KernelFunc);
});

sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
for (int WI = 0; WI < WGS; ++WI) {
assert(MatchAcc[WI] == true);
assert(LeaderAcc[WI] == ((WI % PartitionSize) == 0));
}
}
}

Expand Down
82 changes: 44 additions & 38 deletions sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,50 +20,56 @@ int main() {
return 0;
}

sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};
// Test for both the full sub-group size and a case with less work than a full
// sub-group.
for (size_t WGS : std::array<size_t, 2>{32, 16}) {
std::cout << "Testing for work size " << WGS << std::endl;

const auto NDR = sycl::nd_range<1>{32, 32};
Q.submit([&](sycl::handler &CGH) {
sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
const auto KernelFunc =
[=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
auto WI = item.get_global_id();
auto SG = item.get_sub_group();
sycl::buffer<bool, 1> MatchBuf{sycl::range{WGS}};
sycl::buffer<bool, 1> LeaderBuf{sycl::range{WGS}};

// Due to the unpredictable runtime behavior of opportunistic groups,
// some values may change from run to run. Check they're in expected
// ranges and consistent with other groups.
if (item.get_global_id() % 2 == 0) {
auto OpportunisticGroup =
syclex::this_kernel::get_opportunistic_group();
const auto NDR = sycl::nd_range<1>{WGS, WGS};
Q.submit([&](sycl::handler &CGH) {
sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
const auto KernelFunc =
[=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
auto WI = item.get_global_id();
auto SG = item.get_sub_group();

bool Match = true;
Match &= (OpportunisticGroup.get_group_id() == 0);
Match &= (OpportunisticGroup.get_local_id() <
OpportunisticGroup.get_local_range());
Match &= (OpportunisticGroup.get_group_range() == 1);
Match &= (OpportunisticGroup.get_local_linear_range() <=
SG.get_local_linear_range());
MatchAcc[WI] = Match;
LeaderAcc[WI] = OpportunisticGroup.leader();
}
};
CGH.parallel_for<TestKernel>(NDR, KernelFunc);
});
// Due to the unpredictable runtime behavior of opportunistic
// groups, some values may change from run to run. Check they're in
// expected ranges and consistent with other groups.
if (item.get_global_id() % 2 == 0) {
auto OpportunisticGroup =
syclex::this_kernel::get_opportunistic_group();

sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
uint32_t NumLeaders = 0;
for (int WI = 0; WI < 32; ++WI) {
if (WI % 2 == 0) {
assert(MatchAcc[WI] == true);
if (LeaderAcc[WI]) {
NumLeaders++;
bool Match = true;
Match &= (OpportunisticGroup.get_group_id() == 0);
Match &= (OpportunisticGroup.get_local_id() <
OpportunisticGroup.get_local_range());
Match &= (OpportunisticGroup.get_group_range() == 1);
Match &= (OpportunisticGroup.get_local_linear_range() <=
SG.get_local_linear_range());
MatchAcc[WI] = Match;
LeaderAcc[WI] = OpportunisticGroup.leader();
}
};
CGH.parallel_for<TestKernel>(NDR, KernelFunc);
});

sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
uint32_t NumLeaders = 0;
for (int WI = 0; WI < WGS; ++WI) {
if (WI % 2 == 0) {
assert(MatchAcc[WI] == true);
if (LeaderAcc[WI]) {
NumLeaders++;
}
}
}
assert(NumLeaders > 0);
}
assert(NumLeaders > 0);
return 0;
}
88 changes: 48 additions & 40 deletions sycl/test-e2e/NonUniformGroups/tangle_group.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,51 +20,59 @@ int main() {
return 0;
}

sycl::buffer<bool, 1> MatchBuf{sycl::range{32}};
sycl::buffer<bool, 1> LeaderBuf{sycl::range{32}};
// Test for both the full sub-group size and a case with less work than a full
// sub-group.
for (size_t WGS : std::array<size_t, 2>{32, 16}) {
std::cout << "Testing for work size " << WGS << std::endl;

const auto NDR = sycl::nd_range<1>{32, 32};
Q.submit([&](sycl::handler &CGH) {
sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
const auto KernelFunc =
[=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
auto WI = item.get_global_id();
auto SG = item.get_sub_group();
sycl::buffer<bool, 1> MatchBuf{sycl::range{WGS}};
sycl::buffer<bool, 1> LeaderBuf{sycl::range{WGS}};

// Split into odd and even work-items via control flow.
// Branches deliberately duplicated to test impact of optimizations.
// This only reliably works with optimizations disabled right now.
if (item.get_global_id() % 2 == 0) {
auto TangleGroup = syclex::get_tangle_group(SG);
const auto NDR = sycl::nd_range<1>{WGS, WGS};
Q.submit([&](sycl::handler &CGH) {
sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only};
sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only};
const auto KernelFunc =
[=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] {
auto WI = item.get_global_id();
auto SG = item.get_sub_group();

bool Match = true;
Match &= (TangleGroup.get_group_id() == 0);
Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2);
Match &= (TangleGroup.get_group_range() == 1);
Match &= (TangleGroup.get_local_range() == 16);
MatchAcc[WI] = Match;
LeaderAcc[WI] = TangleGroup.leader();
} else {
auto TangleGroup = syclex::get_tangle_group(SG);
// Split into odd and even work-items via control flow.
// Branches deliberately duplicated to test impact of optimizations.
// This only reliably works with optimizations disabled right now.
if (item.get_global_id() % 2 == 0) {
auto TangleGroup = syclex::get_tangle_group(SG);

bool Match = true;
Match &= (TangleGroup.get_group_id() == 0);
Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2);
Match &= (TangleGroup.get_group_range() == 1);
Match &= (TangleGroup.get_local_range() == 16);
MatchAcc[WI] = Match;
LeaderAcc[WI] = TangleGroup.leader();
}
};
CGH.parallel_for<TestKernel>(NDR, KernelFunc);
});
bool Match = true;
Match &= (TangleGroup.get_group_id() == 0);
Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2);
Match &= (TangleGroup.get_group_range() == 1);
Match &= (TangleGroup.get_local_range() ==
SG.get_local_linear_range() / 2);
MatchAcc[WI] = Match;
LeaderAcc[WI] = TangleGroup.leader();
} else {
auto TangleGroup = syclex::get_tangle_group(SG);

sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
for (int WI = 0; WI < 32; ++WI) {
assert(MatchAcc[WI] == true);
assert(LeaderAcc[WI] == (WI < 2));
bool Match = true;
Match &= (TangleGroup.get_group_id() == 0);
Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2);
Match &= (TangleGroup.get_group_range() == 1);
Match &= (TangleGroup.get_local_range() ==
SG.get_local_linear_range() / 2);
MatchAcc[WI] = Match;
LeaderAcc[WI] = TangleGroup.leader();
}
};
CGH.parallel_for<TestKernel>(NDR, KernelFunc);
});

sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only};
sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only};
for (int WI = 0; WI < WGS; ++WI) {
assert(MatchAcc[WI] == true);
assert(LeaderAcc[WI] == (WI < 2));
}
}
return 0;
}

0 comments on commit 4bc9745

Please sign in to comment.