diff --git a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp index 985620152e4ae..079a637580b93 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/ballot_group.hpp @@ -153,7 +153,13 @@ get_ballot_group(Group group, bool predicate) { if (predicate) { return ballot_group(mask, predicate); } else { - return ballot_group(~mask, predicate); + // To negate the mask for the false-predicate group, we also need to exclude + // all parts of the mask that is not part of the group. + sub_group_mask::BitsType participant_filter = + (~sub_group_mask::BitsType{0}) >> + (sub_group_mask::max_bits - group.get_local_linear_range()); + return ballot_group((~mask) & participant_filter, + predicate); } #endif #else diff --git a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp index c5543989998a2..3c7ef6b257d0a 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/fixed_size_group.hpp @@ -64,7 +64,7 @@ template class fixed_size_group { range_type get_group_range() const { #ifdef __SYCL_DEVICE_ONLY__ - return __spirv_SubgroupMaxSize() / PartitionSize; + return __spirv_SubgroupSize() / PartitionSize; #else throw runtime_error("Non-uniform groups are not supported on host device.", PI_ERROR_INVALID_DEVICE); diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp index 2a4eba90a68e8..7f21d55bcc3a4 100644 --- a/sycl/test-e2e/NonUniformGroups/ballot_group.cpp +++ b/sycl/test-e2e/NonUniformGroups/ballot_group.cpp @@ -20,43 +20,51 @@ int main() { return 0; } - sycl::buffer MatchBuf{sycl::range{32}}; - sycl::buffer LeaderBuf{sycl::range{32}}; - - const auto NDR = sycl::nd_range<1>{32, 32}; - Q.submit([&](sycl::handler &CGH) { - sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; - sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; - const auto KernelFunc = - [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { - auto WI = item.get_global_id(); - auto SG = item.get_sub_group(); - - // Split into odd and even work-items. - bool Predicate = WI % 2 == 0; - auto BallotGroup = syclex::get_ballot_group(SG, Predicate); - - // Check function return values match Predicate. - // NB: Test currently uses exactly one sub-group, but we use SG - // below in case this changes in future. - bool Match = true; - auto GroupID = (Predicate) ? 1 : 0; - auto LocalID = SG.get_local_id() / 2; - Match &= (BallotGroup.get_group_id() == GroupID); - Match &= (BallotGroup.get_local_id() == LocalID); - Match &= (BallotGroup.get_group_range() == 2); - Match &= (BallotGroup.get_local_range() == 16); - MatchAcc[WI] = Match; - LeaderAcc[WI] = BallotGroup.leader(); - }; - CGH.parallel_for(NDR, KernelFunc); - }); - - sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; - sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; - for (int WI = 0; WI < 32; ++WI) { - assert(MatchAcc[WI] == true); - assert(LeaderAcc[WI] == (WI < 2)); + // Test for both the full sub-group size and a case with less work than a full + // sub-group. + for (size_t WGS : std::array{32, 16}) { + std::cout << "Testing for work size " << WGS << std::endl; + + sycl::buffer MatchBuf{sycl::range{WGS}}; + sycl::buffer LeaderBuf{sycl::range{WGS}}; + + const auto NDR = sycl::nd_range<1>{WGS, WGS}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + + // Split into odd and even work-items. + bool Predicate = WI % 2 == 0; + auto BallotGroup = syclex::get_ballot_group(SG, Predicate); + + // Check function return values match Predicate. + // NB: Test currently uses exactly one sub-group, but we use SG + // below in case this changes in future. + bool Match = true; + auto GroupID = (Predicate) ? 1 : 0; + auto LocalID = SG.get_local_id() / 2; + Match &= (BallotGroup.get_group_id() == GroupID); + Match &= (BallotGroup.get_local_id() == LocalID); + Match &= (BallotGroup.get_group_range() == 2); + Match &= (BallotGroup.get_local_range() == + SG.get_local_linear_range() / 2); + MatchAcc[WI] = Match; + LeaderAcc[WI] = BallotGroup.leader(); + }; + CGH.parallel_for(NDR, KernelFunc); + }); + + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; + for (int WI = 0; WI < WGS; ++WI) { + assert(MatchAcc[WI] == true); + assert(LeaderAcc[WI] == (WI < 2)); + } } + return 0; } diff --git a/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp b/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp index 2f043c5bed711..29567639ff442 100644 --- a/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp +++ b/sycl/test-e2e/NonUniformGroups/fixed_size_group.cpp @@ -14,36 +14,47 @@ template class TestKernel; template void test() { sycl::queue Q; - sycl::buffer MatchBuf{sycl::range{32}}; - sycl::buffer LeaderBuf{sycl::range{32}}; - - const auto NDR = sycl::nd_range<1>{32, 32}; - Q.submit([&](sycl::handler &CGH) { - sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; - sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; - const auto KernelFunc = - [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { - auto WI = item.get_global_id(); - auto SG = item.get_sub_group(); - - auto Partition = syclex::get_fixed_size_group(SG); - - bool Match = true; - Match &= (Partition.get_group_id() == (WI / PartitionSize)); - Match &= (Partition.get_local_id() == (WI % PartitionSize)); - Match &= (Partition.get_group_range() == (32 / PartitionSize)); - Match &= (Partition.get_local_range() == PartitionSize); - MatchAcc[WI] = Match; - LeaderAcc[WI] = Partition.leader(); - }; - CGH.parallel_for>(NDR, KernelFunc); - }); - - sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; - sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; - for (int WI = 0; WI < 32; ++WI) { - assert(MatchAcc[WI] == true); - assert(LeaderAcc[WI] == ((WI % PartitionSize) == 0)); + // Test for both the full sub-group size and a case with less work than a full + // sub-group. + for (size_t WGS : std::array{32, 16}) { + if (WGS < PartitionSize) + continue; + + std::cout << "Testing for work size " << WGS << " and partition size " + << PartitionSize << std::endl; + + sycl::buffer MatchBuf{sycl::range{WGS}}; + sycl::buffer LeaderBuf{sycl::range{WGS}}; + + const auto NDR = sycl::nd_range<1>{WGS, WGS}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); + auto SGS = SG.get_local_linear_range(); + + auto Partition = syclex::get_fixed_size_group(SG); + + bool Match = true; + Match &= (Partition.get_group_id() == (WI / PartitionSize)); + Match &= (Partition.get_local_id() == (WI % PartitionSize)); + Match &= (Partition.get_group_range() == (SGS / PartitionSize)); + Match &= (Partition.get_local_range() == PartitionSize); + MatchAcc[WI] = Match; + LeaderAcc[WI] = Partition.leader(); + }; + CGH.parallel_for>(NDR, KernelFunc); + }); + + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; + for (int WI = 0; WI < WGS; ++WI) { + assert(MatchAcc[WI] == true); + assert(LeaderAcc[WI] == ((WI % PartitionSize) == 0)); + } } } diff --git a/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp b/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp index 292d26859cbef..c926a8643f31d 100644 --- a/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp +++ b/sycl/test-e2e/NonUniformGroups/opportunistic_group.cpp @@ -20,50 +20,56 @@ int main() { return 0; } - sycl::buffer MatchBuf{sycl::range{32}}; - sycl::buffer LeaderBuf{sycl::range{32}}; + // Test for both the full sub-group size and a case with less work than a full + // sub-group. + for (size_t WGS : std::array{32, 16}) { + std::cout << "Testing for work size " << WGS << std::endl; - const auto NDR = sycl::nd_range<1>{32, 32}; - Q.submit([&](sycl::handler &CGH) { - sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; - sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; - const auto KernelFunc = - [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { - auto WI = item.get_global_id(); - auto SG = item.get_sub_group(); + sycl::buffer MatchBuf{sycl::range{WGS}}; + sycl::buffer LeaderBuf{sycl::range{WGS}}; - // Due to the unpredictable runtime behavior of opportunistic groups, - // some values may change from run to run. Check they're in expected - // ranges and consistent with other groups. - if (item.get_global_id() % 2 == 0) { - auto OpportunisticGroup = - syclex::this_kernel::get_opportunistic_group(); + const auto NDR = sycl::nd_range<1>{WGS, WGS}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); - bool Match = true; - Match &= (OpportunisticGroup.get_group_id() == 0); - Match &= (OpportunisticGroup.get_local_id() < - OpportunisticGroup.get_local_range()); - Match &= (OpportunisticGroup.get_group_range() == 1); - Match &= (OpportunisticGroup.get_local_linear_range() <= - SG.get_local_linear_range()); - MatchAcc[WI] = Match; - LeaderAcc[WI] = OpportunisticGroup.leader(); - } - }; - CGH.parallel_for(NDR, KernelFunc); - }); + // Due to the unpredictable runtime behavior of opportunistic + // groups, some values may change from run to run. Check they're in + // expected ranges and consistent with other groups. + if (item.get_global_id() % 2 == 0) { + auto OpportunisticGroup = + syclex::this_kernel::get_opportunistic_group(); - sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; - sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; - uint32_t NumLeaders = 0; - for (int WI = 0; WI < 32; ++WI) { - if (WI % 2 == 0) { - assert(MatchAcc[WI] == true); - if (LeaderAcc[WI]) { - NumLeaders++; + bool Match = true; + Match &= (OpportunisticGroup.get_group_id() == 0); + Match &= (OpportunisticGroup.get_local_id() < + OpportunisticGroup.get_local_range()); + Match &= (OpportunisticGroup.get_group_range() == 1); + Match &= (OpportunisticGroup.get_local_linear_range() <= + SG.get_local_linear_range()); + MatchAcc[WI] = Match; + LeaderAcc[WI] = OpportunisticGroup.leader(); + } + }; + CGH.parallel_for(NDR, KernelFunc); + }); + + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; + uint32_t NumLeaders = 0; + for (int WI = 0; WI < WGS; ++WI) { + if (WI % 2 == 0) { + assert(MatchAcc[WI] == true); + if (LeaderAcc[WI]) { + NumLeaders++; + } } } + assert(NumLeaders > 0); } - assert(NumLeaders > 0); return 0; } diff --git a/sycl/test-e2e/NonUniformGroups/tangle_group.cpp b/sycl/test-e2e/NonUniformGroups/tangle_group.cpp index 80132d6aa3e30..a5fb9a0d31dc6 100644 --- a/sycl/test-e2e/NonUniformGroups/tangle_group.cpp +++ b/sycl/test-e2e/NonUniformGroups/tangle_group.cpp @@ -20,51 +20,59 @@ int main() { return 0; } - sycl::buffer MatchBuf{sycl::range{32}}; - sycl::buffer LeaderBuf{sycl::range{32}}; + // Test for both the full sub-group size and a case with less work than a full + // sub-group. + for (size_t WGS : std::array{32, 16}) { + std::cout << "Testing for work size " << WGS << std::endl; - const auto NDR = sycl::nd_range<1>{32, 32}; - Q.submit([&](sycl::handler &CGH) { - sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; - sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; - const auto KernelFunc = - [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { - auto WI = item.get_global_id(); - auto SG = item.get_sub_group(); + sycl::buffer MatchBuf{sycl::range{WGS}}; + sycl::buffer LeaderBuf{sycl::range{WGS}}; - // Split into odd and even work-items via control flow. - // Branches deliberately duplicated to test impact of optimizations. - // This only reliably works with optimizations disabled right now. - if (item.get_global_id() % 2 == 0) { - auto TangleGroup = syclex::get_tangle_group(SG); + const auto NDR = sycl::nd_range<1>{WGS, WGS}; + Q.submit([&](sycl::handler &CGH) { + sycl::accessor MatchAcc{MatchBuf, CGH, sycl::write_only}; + sycl::accessor LeaderAcc{LeaderBuf, CGH, sycl::write_only}; + const auto KernelFunc = + [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(32)]] { + auto WI = item.get_global_id(); + auto SG = item.get_sub_group(); - bool Match = true; - Match &= (TangleGroup.get_group_id() == 0); - Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2); - Match &= (TangleGroup.get_group_range() == 1); - Match &= (TangleGroup.get_local_range() == 16); - MatchAcc[WI] = Match; - LeaderAcc[WI] = TangleGroup.leader(); - } else { - auto TangleGroup = syclex::get_tangle_group(SG); + // Split into odd and even work-items via control flow. + // Branches deliberately duplicated to test impact of optimizations. + // This only reliably works with optimizations disabled right now. + if (item.get_global_id() % 2 == 0) { + auto TangleGroup = syclex::get_tangle_group(SG); - bool Match = true; - Match &= (TangleGroup.get_group_id() == 0); - Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2); - Match &= (TangleGroup.get_group_range() == 1); - Match &= (TangleGroup.get_local_range() == 16); - MatchAcc[WI] = Match; - LeaderAcc[WI] = TangleGroup.leader(); - } - }; - CGH.parallel_for(NDR, KernelFunc); - }); + bool Match = true; + Match &= (TangleGroup.get_group_id() == 0); + Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2); + Match &= (TangleGroup.get_group_range() == 1); + Match &= (TangleGroup.get_local_range() == + SG.get_local_linear_range() / 2); + MatchAcc[WI] = Match; + LeaderAcc[WI] = TangleGroup.leader(); + } else { + auto TangleGroup = syclex::get_tangle_group(SG); - sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; - sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; - for (int WI = 0; WI < 32; ++WI) { - assert(MatchAcc[WI] == true); - assert(LeaderAcc[WI] == (WI < 2)); + bool Match = true; + Match &= (TangleGroup.get_group_id() == 0); + Match &= (TangleGroup.get_local_id() == SG.get_local_id() / 2); + Match &= (TangleGroup.get_group_range() == 1); + Match &= (TangleGroup.get_local_range() == + SG.get_local_linear_range() / 2); + MatchAcc[WI] = Match; + LeaderAcc[WI] = TangleGroup.leader(); + } + }; + CGH.parallel_for(NDR, KernelFunc); + }); + + sycl::host_accessor MatchAcc{MatchBuf, sycl::read_only}; + sycl::host_accessor LeaderAcc{LeaderBuf, sycl::read_only}; + for (int WI = 0; WI < WGS; ++WI) { + assert(MatchAcc[WI] == true); + assert(LeaderAcc[WI] == (WI < 2)); + } } return 0; }