From e98892da604367255da940d56546f53dc6c2c841 Mon Sep 17 00:00:00 2001
From: Andrzej Ratajewski <andrzej.ratajewski@intel.com>
Date: Fri, 30 Aug 2024 13:00:24 +0000
Subject: [PATCH]  Fix `GenISA_WaveShuffleIndex` intrinsic if `src` and `dst`
 are the same variables

To handle `GenISA_WaveShuffleIndex` intrisc with non-uniform `simdChannel`,
IGC needs to generate two SIMD16 indirectly addressed mov instructions,
because address register has only 16 subregisters. If that happens when
`GenISA_WaveShuffleIndex` intrinsic uses the same variable as a source,
and as a destination, then the first SIMD16 instruction may overwrite
values used as a source by the second SIMD16 instruction.

Here is the example of an OpenCL C code that reproduces the issue:
```c
__attribute__((intel_reqd_sub_group_size(32)))
kernel void k(global int* in, global int* ids, uint num_iterations, global int* out) {
    size_t gid = get_global_id(0);
    int x = in[gid];
    uint which_sub_group_local_id = ids[gid];

    for (uint i = 0; i < num_iterations; ++i)
    {
        x = intel_sub_group_shuffle(x, which_sub_group_local_id);
    }

    out[gid] = x;
}
```

This change fixes the issue by writing the result for the first 16
channels into a temporary variable, before executing shuffle index for
the last 16 channels
---
 IGC/Compiler/CISACodeGen/EmitVISAPass.cpp     | 55 +++++++++++--
 .../Builtins/intel_sub_group_shuffle.cl       | 79 +++++++++++++++++++
 2 files changed, 126 insertions(+), 8 deletions(-)
 create mode 100644 IGC/ocloc_tests/Builtins/intel_sub_group_shuffle.cl

diff --git a/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp b/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp
index 698f334905f6..83a0b989e14b 100644
--- a/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp
+++ b/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp
@@ -5509,18 +5509,47 @@ void EmitPass::emitSimdShuffle(llvm::Instruction* inst)
             }
             else if(GII && GII->getIntrinsicID() == GenISAIntrinsic::GenISA_WaveShuffleIndex)
             {
-                m_encoder->SetSimdSize(SIMDMode::SIMD16);
+                if (channelUniform)
+                {
+                    m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
+                    m_encoder->Push();
 
-                m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
-                m_encoder->Push();
+                    m_encoder->Copy(m_destination, pDstArrElm);
+                    m_encoder->Push();
+                }
+                else // !channelUniform
+                {
+                    m_encoder->SetSimdSize(SIMDMode::SIMD16);
 
-                m_encoder->SetSimdSize(SIMDMode::SIMD16);
+                    m_encoder->AddrAdd(pDstArrElm, src, pSrcElm);
+                    m_encoder->Push();
 
-                m_encoder->Copy(m_destination, pDstArrElm);
-                m_encoder->Push();
+                    m_encoder->SetSimdSize(SIMDMode::SIMD16);
 
-                if (!channelUniform)
-                {
+                    // If `src` variable is the same as `m_destination` variable, we cannot write the result
+                    // for the first 16 channels to `m_destionation` right away, because it may overwrite `src`
+                    // values used by the last 16 channels. Instead, the result for the first 16 channels
+                    // gets written into a temporary variable, then shuffle index for the last 16 channels
+                    // is generated and, at the end, the result for first 16 channels is rewritten from
+                    // the temporary variable into the `m_destination`.
+                    const bool isSrcSameAsDst = src == m_destination;
+                    CVariable* first16LanesResult = nullptr;
+                    if (isSrcSameAsDst)
+                    {
+                        first16LanesResult = m_currShader->GetNewVariable(
+                            16,
+                            m_destination->GetType(),
+                            m_destination->GetAlign(),
+                            false, // isUniform
+                            "first16LanesResult");
+
+                        m_encoder->Copy(first16LanesResult, pDstArrElm);
+                    }
+                    else
+                    {
+                        m_encoder->Copy(m_destination, pDstArrElm);
+                    }
+                    m_encoder->Push();
 
                     m_encoder->SetSimdSize(SIMDMode::SIMD16);
                     m_encoder->SetMask(EMASK_H2);
@@ -5536,6 +5565,16 @@ void EmitPass::emitSimdShuffle(llvm::Instruction* inst)
                     m_encoder->Copy(m_destination, pDstArrElm);
                     m_encoder->Push();
                     m_encoder->SetSecondHalf(false);
+
+                    m_encoder->Push();
+
+                    if (isSrcSameAsDst)
+                    {
+                        IGC_ASSERT(first16LanesResult);
+                        m_encoder->SetSimdSize(SIMDMode::SIMD16);
+                        m_encoder->Copy(m_destination, first16LanesResult);
+                        m_encoder->Push();
+                    }
                 }
             }
             if (disableHelperLanes)
diff --git a/IGC/ocloc_tests/Builtins/intel_sub_group_shuffle.cl b/IGC/ocloc_tests/Builtins/intel_sub_group_shuffle.cl
new file mode 100644
index 000000000000..1a3d3efd5c9e
--- /dev/null
+++ b/IGC/ocloc_tests/Builtins/intel_sub_group_shuffle.cl
@@ -0,0 +1,79 @@
+/*========================== begin_copyright_notice ============================
+
+Copyright (C) 2024 Intel Corporation
+
+SPDX-License-Identifier: MIT
+
+============================= end_copyright_notice ===========================*/
+
+// UNSUPPORTED: sys32
+// REQUIRES: regkeys, pvc-supported
+
+// RUN: ocloc compile -file %s -device pvc -options "-igc_opts 'DumpVISAASMToConsole=1'" | FileCheck %s
+
+// CHECK-LABEL: .kernel "test_intel_sub_group_shuffle_immediate_index_simd32"
+__attribute__((intel_reqd_sub_group_size(32)))
+kernel void test_intel_sub_group_shuffle_immediate_index_simd32(global int* in, global int* ids, global int* out) {
+    size_t gid = get_global_id(0);
+    int x = in[gid];
+
+// CHECK: mov (M5_NM, 1) simdShuffle(0,0)<1> V0039(1,15)<0;1,0>
+
+// CHECK: mov (M1, 32) simdShuffleBroadcast(0,0)<1> simdShuffle(0,0)<0;1,0>
+// CHECK: lsc_store.ugm (M1, 32)  flat[V0041]:a64  simdShuffleBroadcast:d32
+    out[gid] = intel_sub_group_shuffle(x, 31);
+}
+
+// CHECK-LABEL: .kernel "test_intel_sub_group_shuffle_uniform_non_immediate_index_simd32"
+__attribute__((intel_reqd_sub_group_size(32)))
+kernel void test_intel_sub_group_shuffle_uniform_non_immediate_index_simd32(global int* in, global int* ids, uint which_sub_group_local_id, global int* out) {
+    size_t gid = get_global_id(0);
+    int x = in[gid];
+
+// CHECK: shl (M1_NM, 1) ShuffleTmp(0,0)<1> which_sub_group_local_id_0(0,0)<0;1,0> 0x2:uw
+// CHECK-NEXT: addr_add (M1_NM, 1) A0(0)<1> &{{V[0-9]+}} ShuffleTmp(0,0)<0;1,0>
+// CHECK-NEXT: mov (M1_NM, 1) simdShuffle(0,0)<1> r[A0(0),0]<0;1,0>:d
+
+// CHECK: mov (M1, 32) simdShuffleBroadcast(0,0)<1> simdShuffle(0,0)<0;1,0>
+// CHECK: lsc_store.ugm (M1, 32)  flat[{{.+}}]:a64  simdShuffleBroadcast:d32
+    out[gid] = intel_sub_group_shuffle(x, which_sub_group_local_id);
+}
+
+// CHECK-LABEL: .kernel "test_intel_sub_group_shuffle_non_uniform_non_immediate_index_simd32"
+__attribute__((intel_reqd_sub_group_size(32)))
+kernel void test_intel_sub_group_shuffle_non_uniform_non_immediate_index_simd32(global int* in, global int* ids, global int* out) {
+    size_t gid = get_global_id(0);
+    int x = in[gid];
+    uint which_sub_group_local_id = ids[gid];
+
+// CHECK: shl (M1, 32) ShuffleTmp(0,0)<1> {{V[0-9]+}}(0,0)<16;8,2> 0x2:uw
+// CHECK-NEXT: addr_add (M1, 16) A0(0)<1> &[[X:V[0-9]+]] ShuffleTmp(0,0)<1;1,0>
+// CHECK-NEXT: mov (M1, 16) simdShuffle(0,0)<1> r[A0(0),0]<1,0>:d
+// CHECK-NEXT: addr_add (M5, 16) A0(0)<1> &[[X]] ShuffleTmp(0,16)<1;1,0>
+// CHECK-NEXT: mov (M5, 16) simdShuffle(1,0)<1> r[A0(0),0]<1,0>:d
+
+// CHECK: lsc_store.ugm (M1, 32)  flat[{{.+}}]:a64  simdShuffle:d32
+    out[gid] = intel_sub_group_shuffle(x, which_sub_group_local_id);
+}
+
+// CHECK-LABEL: .kernel "test_intel_sub_group_shuffle_non_uniform_non_immediate_index_src_the_same_as_dst_simd32"
+__attribute__((intel_reqd_sub_group_size(32)))
+kernel void test_intel_sub_group_shuffle_non_uniform_non_immediate_index_src_the_same_as_dst_simd32(global int* in, global int* ids, uint num_iterations, global int* out) {
+    size_t gid = get_global_id(0);
+    int x = in[gid];
+    uint which_sub_group_local_id = ids[gid];
+
+    for (uint i = 0; i < num_iterations; ++i)
+    {
+// CHECK: shl (M1, 32) ShuffleTmp(0,0)<1> {{V[0-9]+}}(0,0)<16;8,2> 0x2:uw
+// CHECK-NEXT: addr_add (M1, 16) A0(0)<1> &[[X:V[0-9]+]] ShuffleTmp(0,0)<1;1,0>
+// CHECK-NEXT: mov (M1, 16) first16LanesResult(0,0)<1> r[A0(0),0]<1,0>:d
+// CHECK-NEXT: addr_add (M5, 16) A0(0)<1> &[[X]] ShuffleTmp(0,16)<1;1,0>
+// CHECK-NEXT: mov (M5, 16) [[X]](1,0)<1> r[A0(0),0]<1,0>:d
+// CHECK-NEXT: mov (M1, 16) [[X]](0,0)<1> first16LanesResult(0,0)<1;1,0>
+        x = intel_sub_group_shuffle(x, which_sub_group_local_id);
+    }
+
+// CHECK: lsc_store.ugm (M1, 32)  flat[{{.+}}]:a64  [[X]]:d32
+    out[gid] = x;
+}