Skip to content

Commit

Permalink
ockl: Don't use wave32 ballot builtin
Browse files Browse the repository at this point in the history
Wave32 and wave64 paths cannot really co-exist in the same function
or callgraph. They need to be treated as a hard ABI incompatibility.
We cannot handle the wave32 operation on wave64, but we can and do
handle the wave64 operation on wave32. Given the current linking scheme,
the most expedient fix for the resulting breakage is to pretend wave32
does not exist and always use the wave64 ballot. The optimizer will fold
the 64-bit ballot intrinsic to the 32-bit one when it sees a wave32
target.

This was reported broken in
llvm#89332

Change-Id: If897f631066e257e18d8adf574cd17d3f9278ca7
  • Loading branch information
arsenm committed Apr 26, 2024
1 parent 13ea524 commit 066a0b2
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 80 deletions.
50 changes: 9 additions & 41 deletions amd/device-libs/ockl/src/dm.cl
Original file line number Diff line number Diff line change
Expand Up @@ -294,9 +294,14 @@ first(__global void * v)
return __builtin_astype(w2, __global void *);
}

REQUIRES_WAVE64
// Read val from one active lane whose predicate is one.
// If no lanes have the predicate set, return none
// This is like first, except that first may not have its predicate set
static uint
elect_uint_wave64(int pred, uint val, uint none) {
elect_uint(int pred, uint val, uint none)
{
// Pretend wave32 doesn't exist. The wave64 ballot works, and the high half
// will fold out as 0.
uint ret = none;

ulong mask = __builtin_amdgcn_ballot_w64(pred != 0);
Expand All @@ -308,49 +313,12 @@ elect_uint_wave64(int pred, uint val, uint none) {
return ret;
}

// Wave32 helper for elect_uint: returns none when the 32-bit ballot of
// (pred != 0) is empty, otherwise the result of a ds_bpermute of val
// addressed by the mask's __ockl_ctz_u32 value.
REQUIRES_WAVE32
static uint
elect_uint_wave32(int pred, uint val, uint none) {
// Default result, kept when no lane has its predicate set.
uint ret = none;
// One bit per lane whose predicate is nonzero.
uint mask = __builtin_amdgcn_ballot_w32(pred != 0);
if (mask != 0U) {
// Presumably the index of the lowest voting lane (count trailing zeros) - confirm.
uint l = __ockl_ctz_u32(mask);
// NOTE(review): l << 2 looks like a lane-index to byte-offset conversion
// for ds_bpermute - confirm against the intrinsic's documentation.
ret = __builtin_amdgcn_ds_bpermute(l << 2, val);
}

return ret;
}

// Read val from one active lane whose predicate is one.
// If no lanes have the predicate set, return none
// This is like first, except that first may not have its predicate set
//
// Dispatches on __oclc_wavefrontsize64: a nonzero value selects
// elect_uint_wave64, zero selects elect_uint_wave32. All three arguments
// are forwarded unchanged to the chosen helper.
static uint
elect_uint(int pred, uint val, uint none)
{
return __oclc_wavefrontsize64 ? elect_uint_wave64(pred, val, none) : elect_uint_wave32(pred, val, none);
}

// Wave64 helper for votes: returns the population count of the 64-bit
// ballot mask built from b.
REQUIRES_WAVE64
static uint
votes_wave64(bool b)
{
ulong mask = __builtin_amdgcn_ballot_w64(b);
return __builtin_popcountl(mask);
}

// Wave32 helper for votes: returns the population count of the 32-bit
// ballot mask built from b.
REQUIRES_WAVE32
static uint
votes_wave32(bool b)
{
uint mask = __builtin_amdgcn_ballot_w32(b);
return __builtin_popcount(mask);
}

// Count the number of nonzero arguments across the wave
//
// NOTE(review): this listing is a rendered diff without +/- markers. The
// first return below looks like the removed wave-size dispatch, and the two
// lines after it like its replacement (64-bit ballot followed by a
// popcount) - confirm against the commit before treating this as one body.
static uint
votes(bool b)
{
return __oclc_wavefrontsize64 ? votes_wave64(b) : votes_wave32(b);
ulong mask = __builtin_amdgcn_ballot_w64(b);
return __builtin_popcountl(mask);
}

// The kind of the smallest block that can hold sz bytes
Expand Down
43 changes: 4 additions & 39 deletions amd/device-libs/ockl/src/wfaas.cl
Original file line number Diff line number Diff line change
Expand Up @@ -22,60 +22,25 @@ static int optimizationBarrierHack(int in_val)
return out_val;
}

// Wave32 helper for wfany: true when the 32-bit ballot of e is nonzero.
REQUIRES_WAVE32
static bool wfany_impl_w32(int e) {
return __builtin_amdgcn_ballot_w32(e) != 0;
}

// Wave64 helper for wfany: true when the 64-bit ballot of e is nonzero.
REQUIRES_WAVE64
static bool wfany_impl_w64(int e) {
return __builtin_amdgcn_ballot_w64(e) != 0;
}

// Returns the result of the wave-size-specific wfany helper for e.
// e is first passed through optimizationBarrierHack (defined earlier in the
// file); __oclc_wavefrontsize64 then selects the w64 or w32 helper.
ATTR bool
OCKL_MANGLE_I32(wfany)(int e)
{
e = optimizationBarrierHack(e);
return __oclc_wavefrontsize64 ?
wfany_impl_w64(e) : wfany_impl_w32(e);
}

// Wave32 helper for wfall: true when the 32-bit ballot of e equals the value
// returned by __builtin_amdgcn_read_exec_lo() (presumably the low half of
// the exec mask, i.e. every active lane voted - confirm).
REQUIRES_WAVE32
static bool wfall_impl_w32(int e) {
return __builtin_amdgcn_ballot_w32(e) == __builtin_amdgcn_read_exec_lo();
}

// Wave64 helper for wfall: true when the 64-bit ballot of e equals the value
// returned by __builtin_amdgcn_read_exec().
//
// NOTE(review): this listing is a rendered diff without +/- markers. The
// second return is unreachable as written and is presumably an added line
// belonging to a different function in the new file - confirm against the
// commit.
REQUIRES_WAVE64
static bool wfall_impl_w64(int e) {
return __builtin_amdgcn_ballot_w64(e) == __builtin_amdgcn_read_exec();
return __builtin_amdgcn_ballot_w64(e) != 0;
}

// Returns the result of the wave-size-specific wfall helper for e.
// e is first passed through optimizationBarrierHack (defined earlier in the
// file); __oclc_wavefrontsize64 then selects the w64 or w32 helper.
ATTR bool
OCKL_MANGLE_I32(wfall)(int e)
{
e = optimizationBarrierHack(e);
return __oclc_wavefrontsize64 ?
wfall_impl_w64(e) : wfall_impl_w32(e);
}


// Wave32 helper for wfsame: true when the 32-bit ballot of e is either zero
// or equal to __builtin_amdgcn_read_exec_lo().
REQUIRES_WAVE32
static bool wfsame_impl_w32(int e) {
uint u = __builtin_amdgcn_ballot_w32(e);
// Bitwise | on the two comparison results: both sides are always evaluated.
return (u == 0) | (u == __builtin_amdgcn_read_exec_lo());
}

// Wave64 helper for wfsame: true when the 64-bit ballot of e is either zero
// or equal to __builtin_amdgcn_read_exec().
//
// NOTE(review): this listing is a rendered diff without +/- markers. The
// second return is unreachable as written and is presumably an added line
// belonging to a different function in the new file - confirm against the
// commit.
REQUIRES_WAVE64
static bool wfsame_impl_w64(int e) {
ulong u = __builtin_amdgcn_ballot_w64(e);
return (u == 0UL) | (u == __builtin_amdgcn_read_exec());
return __builtin_amdgcn_ballot_w64(e) == __builtin_amdgcn_read_exec();
}

// Returns true when the ballot of e is either empty or equal to the exec
// value, after passing e through optimizationBarrierHack.
//
// NOTE(review): this listing is a rendered diff without +/- markers. The
// first return (dispatch on __oclc_wavefrontsize64) looks like the removed
// code, and the two lines after it like the replacement that always uses
// the 64-bit ballot - confirm against the commit.
ATTR bool
OCKL_MANGLE_I32(wfsame)(int e)
{
e = optimizationBarrierHack(e);
return __oclc_wavefrontsize64 ?
wfsame_impl_w64(e) : wfsame_impl_w32(e);
ulong u = __builtin_amdgcn_ballot_w64(e);
return (u == 0UL) | (u == __builtin_amdgcn_read_exec());
}

0 comments on commit 066a0b2

Please sign in to comment.