Skip to content

Commit

Permalink
Fix GenISA_WaveShuffleIndex intrinsic if src and dst are the s…
Browse files Browse the repository at this point in the history
…ame variables

To handle `GenISA_WaveShuffleIndex` intrisc with non-uniform `simdChannel`,
IGC needs to generate two SIMD16 indirectly addressed mov instructions,
because address register has only 16 subregisters. If that happens when
`GenISA_WaveShuffleIndex` intrinsic uses the same variable as a source,
and as a destination, then the first SIMD16 instruction may overwrite
values used as a source by the second SIMD16 instruction.

Here is the example of an OpenCL C code that reproduces the issue:
```c
__attribute__((intel_reqd_sub_group_size(32)))
kernel void k(global int* in, global int* ids, uint num_iterations, global int* out) {
    size_t gid = get_global_id(0);
    int x = in[gid];
    uint which_sub_group_local_id = ids[gid];

    for (uint i = 0; i < num_iterations; ++i)
    {
        x = intel_sub_group_shuffle(x, which_sub_group_local_id);
    }

    out[gid] = x;
}
```

This change fixes the issue by writing the result for the first 16
channels into a temporary variable, before executing shuffle index for
the last 16 channels
  • Loading branch information
aratajew authored and igcbot committed Sep 2, 2024
1 parent a7ed19e commit e98892d
Show file tree
Hide file tree
Showing 2 changed files with 126 additions and 8 deletions.
55 changes: 47 additions & 8 deletions IGC/Compiler/CISACodeGen/EmitVISAPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5509,18 +5509,47 @@ void EmitPass::emitSimdShuffle(llvm::Instruction* inst)
}
else if(GII && GII->getIntrinsicID() == GenISAIntrinsic::GenISA_WaveShuffleIndex)
{
m_encoder->SetSimdSize(SIMDMode::SIMD16);
if (channelUniform)
{
m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
m_encoder->Push();

m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
m_encoder->Push();
m_encoder->Copy(m_destination, pDstArrElm);
m_encoder->Push();
}
else // !channelUniform
{
m_encoder->SetSimdSize(SIMDMode::SIMD16);

m_encoder->SetSimdSize(SIMDMode::SIMD16);
m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
m_encoder->Push();

m_encoder->Copy(m_destination, pDstArrElm);
m_encoder->Push();
m_encoder->SetSimdSize(SIMDMode::SIMD16);

if (!channelUniform)
{
// If `src` variable is the same as `m_destination` variable, we cannot write the result
// for the first 16 channels to `m_destionation` right away, because it may overwrite `src`
// values used by the last 16 channels. Instead, the result for the first 16 channels
// gets written into a temporary variable, then shuffle index for the last 16 channels
// is generated and, at the end, the result for first 16 channels is rewritten from
// the temporary variable into the `m_destination`.
const bool isSrcSameAsDst = src == m_destination;
CVariable* first16LanesResult = nullptr;
if (isSrcSameAsDst)
{
first16LanesResult = m_currShader->GetNewVariable(
16,
m_destination->GetType(),
m_destination->GetAlign(),
false, // isUniform
"first16LanesResult");

m_encoder->Copy(first16LanesResult, pDstArrElm);
}
else
{
m_encoder->Copy(m_destination, pDstArrElm);
}
m_encoder->Push();

m_encoder->SetSimdSize(SIMDMode::SIMD16);
m_encoder->SetMask(EMASK_H2);
Expand All @@ -5536,6 +5565,16 @@ void EmitPass::emitSimdShuffle(llvm::Instruction* inst)
m_encoder->Copy(m_destination, pDstArrElm);
m_encoder->Push();
m_encoder->SetSecondHalf(false);

m_encoder->Push();

if (isSrcSameAsDst)
{
IGC_ASSERT(first16LanesResult);
m_encoder->SetSimdSize(SIMDMode::SIMD16);
m_encoder->Copy(m_destination, first16LanesResult);
m_encoder->Push();
}
}
}
if (disableHelperLanes)
Expand Down
79 changes: 79 additions & 0 deletions IGC/ocloc_tests/Builtins/intel_sub_group_shuffle.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/*========================== begin_copyright_notice ============================
Copyright (C) 2024 Intel Corporation
SPDX-License-Identifier: MIT
============================= end_copyright_notice ===========================*/

// UNSUPPORTED: sys32
// REQUIRES: regkeys, pvc-supported

// RUN: ocloc compile -file %s -device pvc -options "-igc_opts 'DumpVISAASMToConsole=1'" | FileCheck %s

// CHECK-LABEL: .kernel "test_intel_sub_group_shuffle_immediate_index_simd32"
__attribute__((intel_reqd_sub_group_size(32)))
kernel void test_intel_sub_group_shuffle_immediate_index_simd32(global int* in, global int* ids, global int* out) {
size_t gid = get_global_id(0);
int x = in[gid];

// CHECK: mov (M5_NM, 1) simdShuffle(0,0)<1> V0039(1,15)<0;1,0>

// CHECK: mov (M1, 32) simdShuffleBroadcast(0,0)<1> simdShuffle(0,0)<0;1,0>
// CHECK: lsc_store.ugm (M1, 32) flat[V0041]:a64 simdShuffleBroadcast:d32
out[gid] = intel_sub_group_shuffle(x, 31);
}

// CHECK-LABEL: .kernel "test_intel_sub_group_shuffle_uniform_non_immediate_index_simd32"
__attribute__((intel_reqd_sub_group_size(32)))
kernel void test_intel_sub_group_shuffle_uniform_non_immediate_index_simd32(global int* in, global int* ids, uint which_sub_group_local_id, global int* out) {
size_t gid = get_global_id(0);
int x = in[gid];

// CHECK: shl (M1_NM, 1) ShuffleTmp(0,0)<1> which_sub_group_local_id_0(0,0)<0;1,0> 0x2:uw
// CHECK-NEXT: addr_add (M1_NM, 1) A0(0)<1> &{{V[0-9]+}} ShuffleTmp(0,0)<0;1,0>
// CHECK-NEXT: mov (M1_NM, 1) simdShuffle(0,0)<1> r[A0(0),0]<0;1,0>:d

// CHECK: mov (M1, 32) simdShuffleBroadcast(0,0)<1> simdShuffle(0,0)<0;1,0>
// CHECK: lsc_store.ugm (M1, 32) flat[{{.+}}]:a64 simdShuffleBroadcast:d32
out[gid] = intel_sub_group_shuffle(x, which_sub_group_local_id);
}

// CHECK-LABEL: .kernel "test_intel_sub_group_shuffle_non_uniform_non_immediate_index_simd32"
__attribute__((intel_reqd_sub_group_size(32)))
kernel void test_intel_sub_group_shuffle_non_uniform_non_immediate_index_simd32(global int* in, global int* ids, global int* out) {
size_t gid = get_global_id(0);
int x = in[gid];
uint which_sub_group_local_id = ids[gid];

// CHECK: shl (M1, 32) ShuffleTmp(0,0)<1> {{V[0-9]+}}(0,0)<16;8,2> 0x2:uw
// CHECK-NEXT: addr_add (M1, 16) A0(0)<1> &[[X:V[0-9]+]] ShuffleTmp(0,0)<1;1,0>
// CHECK-NEXT: mov (M1, 16) simdShuffle(0,0)<1> r[A0(0),0]<1,0>:d
// CHECK-NEXT: addr_add (M5, 16) A0(0)<1> &[[X]] ShuffleTmp(0,16)<1;1,0>
// CHECK-NEXT: mov (M5, 16) simdShuffle(1,0)<1> r[A0(0),0]<1,0>:d

// CHECK: lsc_store.ugm (M1, 32) flat[{{.+}}]:a64 simdShuffle:d32
out[gid] = intel_sub_group_shuffle(x, which_sub_group_local_id);
}

// CHECK-LABEL: .kernel "test_intel_sub_group_shuffle_non_uniform_non_immediate_index_src_the_same_as_dst_simd32"
__attribute__((intel_reqd_sub_group_size(32)))
kernel void test_intel_sub_group_shuffle_non_uniform_non_immediate_index_src_the_same_as_dst_simd32(global int* in, global int* ids, uint num_iterations, global int* out) {
size_t gid = get_global_id(0);
int x = in[gid];
uint which_sub_group_local_id = ids[gid];

for (uint i = 0; i < num_iterations; ++i)
{
// CHECK: shl (M1, 32) ShuffleTmp(0,0)<1> {{V[0-9]+}}(0,0)<16;8,2> 0x2:uw
// CHECK-NEXT: addr_add (M1, 16) A0(0)<1> &[[X:V[0-9]+]] ShuffleTmp(0,0)<1;1,0>
// CHECK-NEXT: mov (M1, 16) first16LanesResult(0,0)<1> r[A0(0),0]<1,0>:d
// CHECK-NEXT: addr_add (M5, 16) A0(0)<1> &[[X]] ShuffleTmp(0,16)<1;1,0>
// CHECK-NEXT: mov (M5, 16) [[X]](1,0)<1> r[A0(0),0]<1,0>:d
// CHECK-NEXT: mov (M1, 16) [[X]](0,0)<1> first16LanesResult(0,0)<1;1,0>
x = intel_sub_group_shuffle(x, which_sub_group_local_id);
}

// CHECK: lsc_store.ugm (M1, 32) flat[{{.+}}]:a64 [[X]]:d32
out[gid] = x;
}

0 comments on commit e98892d

Please sign in to comment.