From e98892da604367255da940d56546f53dc6c2c841 Mon Sep 17 00:00:00 2001 From: Andrzej Ratajewski Date: Fri, 30 Aug 2024 13:00:24 +0000 Subject: [PATCH] Fix `GenISA_WaveShuffleIndex` intrinsic if `src` and `dst` are the same variables To handle `GenISA_WaveShuffleIndex` intrisc with non-uniform `simdChannel`, IGC needs to generate two SIMD16 indirectly addressed mov instructions, because address register has only 16 subregisters. If that happens when `GenISA_WaveShuffleIndex` intrinsic uses the same variable as a source, and as a destination, then the first SIMD16 instruction may overwrite values used as a source by the second SIMD16 instruction. Here is the example of an OpenCL C code that reproduces the issue: ```c __attribute__((intel_reqd_sub_group_size(32))) kernel void k(global int* in, global int* ids, uint num_iterations, global int* out) { size_t gid = get_global_id(0); int x = in[gid]; uint which_sub_group_local_id = ids[gid]; for (uint i = 0; i < num_iterations; ++i) { x = intel_sub_group_shuffle(x, which_sub_group_local_id); } out[gid] = x; } ``` This change fixes the issue by writing the result for the first 16 channels into a temporary variable, before executing shuffle index for the last 16 channels --- IGC/Compiler/CISACodeGen/EmitVISAPass.cpp | 55 +++++++++++-- .../Builtins/intel_sub_group_shuffle.cl | 79 +++++++++++++++++++ 2 files changed, 126 insertions(+), 8 deletions(-) create mode 100644 IGC/ocloc_tests/Builtins/intel_sub_group_shuffle.cl diff --git a/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp b/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp index 698f334905f6..83a0b989e14b 100644 --- a/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp +++ b/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp @@ -5509,18 +5509,47 @@ void EmitPass::emitSimdShuffle(llvm::Instruction* inst) } else if(GII && GII->getIntrinsicID() == GenISAIntrinsic::GenISA_WaveShuffleIndex) { - m_encoder->SetSimdSize(SIMDMode::SIMD16); + if (channelUniform) + { + m_encoder->AddrAdd(pDstArrElm, src, pSrcElm); + m_encoder->Push(); - m_encoder->AddrAdd(pDstArrElm, src, pSrcElm); - m_encoder->Push(); + m_encoder->Copy(m_destination, pDstArrElm); + m_encoder->Push(); + } + else // !channelUniform + { + m_encoder->SetSimdSize(SIMDMode::SIMD16); - m_encoder->SetSimdSize(SIMDMode::SIMD16); + m_encoder->AddrAdd(pDstArrElm, src, pSrcElm); + m_encoder->Push(); - m_encoder->Copy(m_destination, pDstArrElm); - m_encoder->Push(); + m_encoder->SetSimdSize(SIMDMode::SIMD16); - if (!channelUniform) - { + // If `src` variable is the same as `m_destination` variable, we cannot write the result + // for the first 16 channels to `m_destionation` right away, because it may overwrite `src` + // values used by the last 16 channels. Instead, the result for the first 16 channels + // gets written into a temporary variable, then shuffle index for the last 16 channels + // is generated and, at the end, the result for first 16 channels is rewritten from + // the temporary variable into the `m_destination`. + const bool isSrcSameAsDst = src == m_destination; + CVariable* first16LanesResult = nullptr; + if (isSrcSameAsDst) + { + first16LanesResult = m_currShader->GetNewVariable( + 16, + m_destination->GetType(), + m_destination->GetAlign(), + false, // isUniform + "first16LanesResult"); + + m_encoder->Copy(first16LanesResult, pDstArrElm); + } + else + { + m_encoder->Copy(m_destination, pDstArrElm); + } + m_encoder->Push(); m_encoder->SetSimdSize(SIMDMode::SIMD16); m_encoder->SetMask(EMASK_H2); @@ -5536,6 +5565,16 @@ void EmitPass::emitSimdShuffle(llvm::Instruction* inst) m_encoder->Copy(m_destination, pDstArrElm); m_encoder->Push(); m_encoder->SetSecondHalf(false); + + m_encoder->Push(); + + if (isSrcSameAsDst) + { + IGC_ASSERT(first16LanesResult); + m_encoder->SetSimdSize(SIMDMode::SIMD16); + m_encoder->Copy(m_destination, first16LanesResult); + m_encoder->Push(); + } } } if (disableHelperLanes) diff --git a/IGC/ocloc_tests/Builtins/intel_sub_group_shuffle.cl b/IGC/ocloc_tests/Builtins/intel_sub_group_shuffle.cl new file mode 100644 index 000000000000..1a3d3efd5c9e --- /dev/null +++ b/IGC/ocloc_tests/Builtins/intel_sub_group_shuffle.cl @@ -0,0 +1,79 @@ +/*========================== begin_copyright_notice ============================ + +Copyright (C) 2024 Intel Corporation + +SPDX-License-Identifier: MIT + +============================= end_copyright_notice ===========================*/ + +// UNSUPPORTED: sys32 +// REQUIRES: regkeys, pvc-supported + +// RUN: ocloc compile -file %s -device pvc -options "-igc_opts 'DumpVISAASMToConsole=1'" | FileCheck %s + +// CHECK-LABEL: .kernel "test_intel_sub_group_shuffle_immediate_index_simd32" +__attribute__((intel_reqd_sub_group_size(32))) +kernel void test_intel_sub_group_shuffle_immediate_index_simd32(global int* in, global int* ids, global int* out) { + size_t gid = get_global_id(0); + int x = in[gid]; + +// CHECK: mov (M5_NM, 1) simdShuffle(0,0)<1> V0039(1,15)<0;1,0> + +// CHECK: mov (M1, 32) simdShuffleBroadcast(0,0)<1> simdShuffle(0,0)<0;1,0> +// CHECK: lsc_store.ugm (M1, 32) flat[V0041]:a64 simdShuffleBroadcast:d32 + out[gid] = intel_sub_group_shuffle(x, 31); +} + +// CHECK-LABEL: .kernel "test_intel_sub_group_shuffle_uniform_non_immediate_index_simd32" +__attribute__((intel_reqd_sub_group_size(32))) +kernel void test_intel_sub_group_shuffle_uniform_non_immediate_index_simd32(global int* in, global int* ids, uint which_sub_group_local_id, global int* out) { + size_t gid = get_global_id(0); + int x = in[gid]; + +// CHECK: shl (M1_NM, 1) ShuffleTmp(0,0)<1> which_sub_group_local_id_0(0,0)<0;1,0> 0x2:uw +// CHECK-NEXT: addr_add (M1_NM, 1) A0(0)<1> &{{V[0-9]+}} ShuffleTmp(0,0)<0;1,0> +// CHECK-NEXT: mov (M1_NM, 1) simdShuffle(0,0)<1> r[A0(0),0]<0;1,0>:d + +// CHECK: mov (M1, 32) simdShuffleBroadcast(0,0)<1> simdShuffle(0,0)<0;1,0> +// CHECK: lsc_store.ugm (M1, 32) flat[{{.+}}]:a64 simdShuffleBroadcast:d32 + out[gid] = intel_sub_group_shuffle(x, which_sub_group_local_id); +} + +// CHECK-LABEL: .kernel "test_intel_sub_group_shuffle_non_uniform_non_immediate_index_simd32" +__attribute__((intel_reqd_sub_group_size(32))) +kernel void test_intel_sub_group_shuffle_non_uniform_non_immediate_index_simd32(global int* in, global int* ids, global int* out) { + size_t gid = get_global_id(0); + int x = in[gid]; + uint which_sub_group_local_id = ids[gid]; + +// CHECK: shl (M1, 32) ShuffleTmp(0,0)<1> {{V[0-9]+}}(0,0)<16;8,2> 0x2:uw +// CHECK-NEXT: addr_add (M1, 16) A0(0)<1> &[[X:V[0-9]+]] ShuffleTmp(0,0)<1;1,0> +// CHECK-NEXT: mov (M1, 16) simdShuffle(0,0)<1> r[A0(0),0]<1,0>:d +// CHECK-NEXT: addr_add (M5, 16) A0(0)<1> &[[X]] ShuffleTmp(0,16)<1;1,0> +// CHECK-NEXT: mov (M5, 16) simdShuffle(1,0)<1> r[A0(0),0]<1,0>:d + +// CHECK: lsc_store.ugm (M1, 32) flat[{{.+}}]:a64 simdShuffle:d32 + out[gid] = intel_sub_group_shuffle(x, which_sub_group_local_id); +} + +// CHECK-LABEL: .kernel "test_intel_sub_group_shuffle_non_uniform_non_immediate_index_src_the_same_as_dst_simd32" +__attribute__((intel_reqd_sub_group_size(32))) +kernel void test_intel_sub_group_shuffle_non_uniform_non_immediate_index_src_the_same_as_dst_simd32(global int* in, global int* ids, uint num_iterations, global int* out) { + size_t gid = get_global_id(0); + int x = in[gid]; + uint which_sub_group_local_id = ids[gid]; + + for (uint i = 0; i < num_iterations; ++i) + { +// CHECK: shl (M1, 32) ShuffleTmp(0,0)<1> {{V[0-9]+}}(0,0)<16;8,2> 0x2:uw +// CHECK-NEXT: addr_add (M1, 16) A0(0)<1> &[[X:V[0-9]+]] ShuffleTmp(0,0)<1;1,0> +// CHECK-NEXT: mov (M1, 16) first16LanesResult(0,0)<1> r[A0(0),0]<1,0>:d +// CHECK-NEXT: addr_add (M5, 16) A0(0)<1> &[[X]] ShuffleTmp(0,16)<1;1,0> +// CHECK-NEXT: mov (M5, 16) [[X]](1,0)<1> r[A0(0),0]<1,0>:d +// CHECK-NEXT: mov (M1, 16) [[X]](0,0)<1> first16LanesResult(0,0)<1;1,0> + x = intel_sub_group_shuffle(x, which_sub_group_local_id); + } + +// CHECK: lsc_store.ugm (M1, 32) flat[{{.+}}]:a64 [[X]]:d32 + out[gid] = x; +}