Fix GenISA_WaveShuffleIndex intrinsic if src and dst are the s…

…ame variables To handle `GenISA_WaveShuffleIndex` intrisc with non-uniform `simdChannel`, IGC needs to generate two SIMD16 indirectly addressed mov instructions, because address register has only 16 subregisters. If that happens when `GenISA_WaveShuffleIndex` intrinsic uses the same variable as a source, and as a destination, then the first SIMD16 instruction may overwrite values used as a source by the second SIMD16 instruction. Here is the example of an OpenCL C code that reproduces the issue: ```c __attribute__((intel_reqd_sub_group_size(32))) kernel void k(global int* in, global int* ids, uint num_iterations, global int* out) { size_t gid = get_global_id(0); int x = in[gid]; uint which_sub_group_local_id = ids[gid]; for (uint i = 0; i < num_iterations; ++i) { x = intel_sub_group_shuffle(x, which_sub_group_local_id); } out[gid] = x; } ``` This change fixes the issue by writing the result for the first 16 channels into a temporary variable, before executing shuffle index for the last 16 channels
intel · Sep 2, 2024 · e98892d · e98892d
1 parent a7ed19e
commit e98892d
Show file tree

Hide file tree

Showing 2 changed files with 126 additions and 8 deletions.
diff --git a/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp b/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp
@@ -5509,18 +5509,47 @@ void EmitPass::emitSimdShuffle(llvm::Instruction* inst)
  }
  else if(GII && GII->getIntrinsicID() == GenISAIntrinsic::GenISA_WaveShuffleIndex)
  {
- m_encoder->SetSimdSize(SIMDMode::SIMD16);
+ if (channelUniform)
+ {
+ m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
+ m_encoder->Push();
 
- m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
- m_encoder->Push();
+ m_encoder->Copy(m_destination, pDstArrElm);
+ m_encoder->Push();
+ }
+ else // !channelUniform
+ {
+ m_encoder->SetSimdSize(SIMDMode::SIMD16);
 
- m_encoder->SetSimdSize(SIMDMode::SIMD16);
+ m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
+ m_encoder->Push();
 
- m_encoder->Copy(m_destination, pDstArrElm);
- m_encoder->Push();
+ m_encoder->SetSimdSize(SIMDMode::SIMD16);
 
- if (!channelUniform)
- {
+ // If `src` variable is the same as `m_destination` variable, we cannot write the result
+ // for the first 16 channels to `m_destionation` right away, because it may overwrite `src`
+ // values used by the last 16 channels. Instead, the result for the first 16 channels
+ // gets written into a temporary variable, then shuffle index for the last 16 channels
+ // is generated and, at the end, the result for first 16 channels is rewritten from
+ // the temporary variable into the `m_destination`.
+ const bool isSrcSameAsDst = src == m_destination;
+ CVariable* first16LanesResult = nullptr;
+ if (isSrcSameAsDst)
+ {
+ first16LanesResult = m_currShader->GetNewVariable(
+ 16,
+ m_destination->GetType(),
+ m_destination->GetAlign(),
+ false, // isUniform
+ "first16LanesResult");
+
+ m_encoder->Copy(first16LanesResult, pDstArrElm);
+ }
+ else
+ {
+ m_encoder->Copy(m_destination, pDstArrElm);
+ }
+ m_encoder->Push();
 
  m_encoder->SetSimdSize(SIMDMode::SIMD16);
  m_encoder->SetMask(EMASK_H2);
@@ -5536,6 +5565,16 @@ void EmitPass::emitSimdShuffle(llvm::Instruction* inst)
  m_encoder->Copy(m_destination, pDstArrElm);
  m_encoder->Push();
  m_encoder->SetSecondHalf(false);
+
+ m_encoder->Push();
+
+ if (isSrcSameAsDst)
+ {
+ IGC_ASSERT(first16LanesResult);
+ m_encoder->SetSimdSize(SIMDMode::SIMD16);
+ m_encoder->Copy(m_destination, first16LanesResult);
+ m_encoder->Push();
+ }
  }
  }
  if (disableHelperLanes)

diff --git a/IGC/ocloc_tests/Builtins/intel_sub_group_shuffle.cl b/IGC/ocloc_tests/Builtins/intel_sub_group_shuffle.cl
@@ -0,0 +1,79 @@
+/*========================== begin_copyright_notice ============================
+
+Copyright (C) 2024 Intel Corporation
+
+SPDX-License-Identifier: MIT
+
+============================= end_copyright_notice ===========================*/
+
+// UNSUPPORTED: sys32
+// REQUIRES: regkeys, pvc-supported
+
+// RUN: ocloc compile -file %s -device pvc -options "-igc_opts 'DumpVISAASMToConsole=1'" | FileCheck %s
+
+// CHECK-LABEL: .kernel "test_intel_sub_group_shuffle_immediate_index_simd32"
+__attribute__((intel_reqd_sub_group_size(32)))
+kernel void test_intel_sub_group_shuffle_immediate_index_simd32(global int* in, global int* ids, global int* out) {
+ size_t gid = get_global_id(0);
+ int x = in[gid];
+
+// CHECK: mov (M5_NM, 1) simdShuffle(0,0)<1> V0039(1,15)<0;1,0>
+
+// CHECK: mov (M1, 32) simdShuffleBroadcast(0,0)<1> simdShuffle(0,0)<0;1,0>
+// CHECK: lsc_store.ugm (M1, 32) flat[V0041]:a64 simdShuffleBroadcast:d32
+ out[gid] = intel_sub_group_shuffle(x, 31);
+}
+
+// CHECK-LABEL: .kernel "test_intel_sub_group_shuffle_uniform_non_immediate_index_simd32"
+__attribute__((intel_reqd_sub_group_size(32)))
+kernel void test_intel_sub_group_shuffle_uniform_non_immediate_index_simd32(global int* in, global int* ids, uint which_sub_group_local_id, global int* out) {
+ size_t gid = get_global_id(0);
+ int x = in[gid];
+
+// CHECK: shl (M1_NM, 1) ShuffleTmp(0,0)<1> which_sub_group_local_id_0(0,0)<0;1,0> 0x2:uw
+// CHECK-NEXT: addr_add (M1_NM, 1) A0(0)<1> &{{V[0-9]+}} ShuffleTmp(0,0)<0;1,0>
+// CHECK-NEXT: mov (M1_NM, 1) simdShuffle(0,0)<1> r[A0(0),0]<0;1,0>:d
+
+// CHECK: mov (M1, 32) simdShuffleBroadcast(0,0)<1> simdShuffle(0,0)<0;1,0>
+// CHECK: lsc_store.ugm (M1, 32) flat[{{.+}}]:a64 simdShuffleBroadcast:d32
+ out[gid] = intel_sub_group_shuffle(x, which_sub_group_local_id);
+}
+
+// CHECK-LABEL: .kernel "test_intel_sub_group_shuffle_non_uniform_non_immediate_index_simd32"
+__attribute__((intel_reqd_sub_group_size(32)))
+kernel void test_intel_sub_group_shuffle_non_uniform_non_immediate_index_simd32(global int* in, global int* ids, global int* out) {
+ size_t gid = get_global_id(0);
+ int x = in[gid];
+ uint which_sub_group_local_id = ids[gid];
+
+// CHECK: shl (M1, 32) ShuffleTmp(0,0)<1> {{V[0-9]+}}(0,0)<16;8,2> 0x2:uw
+// CHECK-NEXT: addr_add (M1, 16) A0(0)<1> &[[X:V[0-9]+]] ShuffleTmp(0,0)<1;1,0>
+// CHECK-NEXT: mov (M1, 16) simdShuffle(0,0)<1> r[A0(0),0]<1,0>:d
+// CHECK-NEXT: addr_add (M5, 16) A0(0)<1> &[[X]] ShuffleTmp(0,16)<1;1,0>
+// CHECK-NEXT: mov (M5, 16) simdShuffle(1,0)<1> r[A0(0),0]<1,0>:d
+
+// CHECK: lsc_store.ugm (M1, 32) flat[{{.+}}]:a64 simdShuffle:d32
+ out[gid] = intel_sub_group_shuffle(x, which_sub_group_local_id);
+}
+
+// CHECK-LABEL: .kernel "test_intel_sub_group_shuffle_non_uniform_non_immediate_index_src_the_same_as_dst_simd32"
+__attribute__((intel_reqd_sub_group_size(32)))
+kernel void test_intel_sub_group_shuffle_non_uniform_non_immediate_index_src_the_same_as_dst_simd32(global int* in, global int* ids, uint num_iterations, global int* out) {
+ size_t gid = get_global_id(0);
+ int x = in[gid];
+ uint which_sub_group_local_id = ids[gid];
+
+ for (uint i = 0; i < num_iterations; ++i)
+ {
+// CHECK: shl (M1, 32) ShuffleTmp(0,0)<1> {{V[0-9]+}}(0,0)<16;8,2> 0x2:uw
+// CHECK-NEXT: addr_add (M1, 16) A0(0)<1> &[[X:V[0-9]+]] ShuffleTmp(0,0)<1;1,0>
+// CHECK-NEXT: mov (M1, 16) first16LanesResult(0,0)<1> r[A0(0),0]<1,0>:d
+// CHECK-NEXT: addr_add (M5, 16) A0(0)<1> &[[X]] ShuffleTmp(0,16)<1;1,0>
+// CHECK-NEXT: mov (M5, 16) [[X]](1,0)<1> r[A0(0),0]<1,0>:d
+// CHECK-NEXT: mov (M1, 16) [[X]](0,0)<1> first16LanesResult(0,0)<1;1,0>
+ x = intel_sub_group_shuffle(x, which_sub_group_local_id);
+ }
+
+// CHECK: lsc_store.ugm (M1, 32) flat[{{.+}}]:a64 [[X]]:d32
+ out[gid] = x;
+}