From 4382295d96f180fd5ff2636fb71f2aacf244111a Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Sat, 9 Nov 2024 11:33:11 -0500 Subject: [PATCH 01/12] Add `test_scan_layouts` to skiplist (#2663) Fix CI failure: https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/11744998351 Signed-off-by: Whitney Tsang --- scripts/skiplist/a770/language.txt | 2 ++ scripts/skiplist/conda/language.txt | 2 ++ scripts/skiplist/default/language.txt | 2 ++ scripts/skiplist/lts/language.txt | 2 ++ scripts/skiplist/mtl/language.txt | 2 ++ scripts/skiplist/xe2/language.txt | 2 ++ 6 files changed, 12 insertions(+) diff --git a/scripts/skiplist/a770/language.txt b/scripts/skiplist/a770/language.txt index 6682e5d059..a829b75bf0 100644 --- a/scripts/skiplist/a770/language.txt +++ b/scripts/skiplist/a770/language.txt @@ -1,5 +1,7 @@ # https://github.com/intel/intel-xpu-backend-for-triton/issues/1434 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] +# https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 +test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float32-float32] diff --git a/scripts/skiplist/conda/language.txt b/scripts/skiplist/conda/language.txt index b540a2a877..f8a9e48124 100644 --- a/scripts/skiplist/conda/language.txt +++ b/scripts/skiplist/conda/language.txt @@ -115,6 +115,8 @@ test/unit/language/test_core.py::test_dot_max_num_imprecise_acc[64-float8e4b15-1 test/unit/language/test_core.py::test_dot_max_num_imprecise_acc[128-float8e5-128-256-128-128-256-256] # https://github.com/intel/intel-xpu-backend-for-triton/issues/1434 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] +# https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 +test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float32-float32] diff --git a/scripts/skiplist/default/language.txt b/scripts/skiplist/default/language.txt index fd1a7e0a48..36c6d7e69b 100644 --- a/scripts/skiplist/default/language.txt +++ b/scripts/skiplist/default/language.txt @@ -1,2 +1,4 @@ # https://github.com/intel/intel-xpu-backend-for-triton/issues/1434 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] +# https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 +test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] diff --git a/scripts/skiplist/lts/language.txt b/scripts/skiplist/lts/language.txt index 9a4e14ab13..76dd77c939 100644 --- a/scripts/skiplist/lts/language.txt +++ b/scripts/skiplist/lts/language.txt @@ -115,6 +115,8 @@ test/unit/language/test_core.py::test_dot_max_num_imprecise_acc[64-float8e4b15-1 test/unit/language/test_core.py::test_dot_max_num_imprecise_acc[128-float8e5-128-256-128-128-256-256] # https://github.com/intel/intel-xpu-backend-for-triton/issues/1434 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] +# 
https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 +test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float32-float32] diff --git a/scripts/skiplist/mtl/language.txt b/scripts/skiplist/mtl/language.txt index 62a6f2219c..a346bc76ab 100644 --- a/scripts/skiplist/mtl/language.txt +++ b/scripts/skiplist/mtl/language.txt @@ -1,5 +1,7 @@ # https://github.com/intel/intel-xpu-backend-for-triton/issues/1434 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] +# https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 +test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float32-float32] diff --git a/scripts/skiplist/xe2/language.txt b/scripts/skiplist/xe2/language.txt index cdac848de1..3d923ed1d6 100644 --- a/scripts/skiplist/xe2/language.txt +++ b/scripts/skiplist/xe2/language.txt @@ -1,5 +1,7 @@ # https://github.com/intel/intel-xpu-backend-for-triton/issues/1434 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] +# https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 +test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float32-float32] From 9e495a2cf3c512355b1476a93cbc0910ef824858 Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Sat, 9 Nov 2024 12:21:21 -0500 Subject: [PATCH 02/12] Resolve merge conflicts from `7275ff7` (#2659) Resolve merge conflicts from 7275ff72916a46ebff4bb8a17e0ee0c71feafbe3. Signed-off-by: Whitney Tsang --- README.md | 60 ------------------------------------------------------- 1 file changed, 60 deletions(-) diff --git a/README.md b/README.md index 93bf8dfad0..a8bbe2c2e1 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,6 @@ This is the development repository of Intel® XPU Backend for Triton\*, a new [Triton](https://github.com/triton-lang/triton/) backend for Intel GPUs. Intel® XPU Backend for Triton\* is a out of tree backend module for [Triton](https://github.com/triton-lang/triton/blob/main/CONTRIBUTING.md) used to provide best-in-class performance and productivity on any Intel GPUs for [PyTorch](https://github.com/triton-lang/triton/blob/main/CONTRIBUTING.md) and standalone usage. -<<<<<<< HEAD # Compatibility * Operating systems: @@ -22,25 +21,11 @@ This is the development repository of Intel® XPU Backend for Triton\*, a new [T * Latest [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) Note that Intel® XPU Backend for Triton\* is not compatible with Intel® Extension for PyTorch\* and Intel® oneAPI Base Toolkit\*. 
-======= -| **`Documentation`** | **`Nightly Wheels`** | -|-------------------- | -------------------- | -| [![Documentation](https://github.com/triton-lang/triton/actions/workflows/documentation.yml/badge.svg)](https://triton-lang.org/) | [![Wheels](https://github.com/triton-lang/triton/actions/workflows/wheels.yml/badge.svg?branch=release/2.0.x)](https://github.com/triton-lang/triton/actions/workflows/wheels.yml) | - -# Triton - -This is the development repository of Triton, a language and compiler for writing highly efficient custom Deep-Learning primitives. The aim of Triton is to provide an open-source environment to write fast code at higher productivity than CUDA, but also with higher flexibility than other existing DSLs. - -The foundations of this project are described in the following MAPL2019 publication: [Triton: An Intermediate Language and Compiler for Tiled Neural Network Computations](http://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf). Please consider citing this work if you use Triton! - -The [official documentation](https://triton-lang.org) contains installation instructions and tutorials. See also these third-party [Triton puzzles](https://github.com/srush/Triton-Puzzles), which can all be run using the Triton interpreter -- no GPU required. ->>>>>>> d6739d3c33dee481f2d4dee4f6ecd4123f671597 # Quick Installation ## Prerequisites -<<<<<<< HEAD 1. Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html) or [Long Term Support Release](https://dgpu-docs.intel.com/driver/installation.html) of GPU driver 2. Latest release of [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) 3. Latest release of [Profiling Tools Interfaces for Intel GPU (PTI for GPU)](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) @@ -55,35 +40,18 @@ Extract the archive and in the extracted directory execute: ```shell pip install torch-*.whl triton-*.whl ``` -======= -```shell -pip install triton -``` - -Binary wheels are available for CPython 3.8-3.12 and PyPy 3.8-3.9. ->>>>>>> d6739d3c33dee481f2d4dee4f6ecd4123f671597 Before using Intel® XPU Backend for Triton\* you need to initialize the toolchain. The default location is `/opt/intel/oneapi` (if installed as a `root` user) or `~/intel/oneapi` (if installed as a regular user). ```shell -<<<<<<< HEAD # replace /opt/intel/oneapi with the actual location of PyTorch Prerequisites for Intel GPUs source /opt/intel/oneapi/setvars.sh -======= -pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly ->>>>>>> d6739d3c33dee481f2d4dee4f6ecd4123f671597 ``` # Install from source -<<<<<<< HEAD ## Prerequisites -======= -```shell -git clone https://github.com/triton-lang/triton.git; -cd triton; ->>>>>>> d6739d3c33dee481f2d4dee4f6ecd4123f671597 1. Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html) or [Long Term Support Release](https://dgpu-docs.intel.com/driver/installation.html) of GPU driver 2. 
Latest release of [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) @@ -104,14 +72,9 @@ source /opt/intel/oneapi/setvars.sh Clone this repository: ```shell -<<<<<<< HEAD git clone https://github.com/intel/intel-xpu-backend-for-triton.git cd intel-xpu-backend-for-triton ``` -======= -git clone https://github.com/triton-lang/triton.git; -cd triton; ->>>>>>> d6739d3c33dee481f2d4dee4f6ecd4123f671597 To avoid potential conflicts with installed packages it is recommended to create and activate a new Python virtual environment: @@ -242,7 +205,6 @@ For detailed instructions on how to debug Triton's frontend, please refer to thi # Usage Guide -<<<<<<< HEAD ## Code Modifications Intel® XPU Backend for Triton\* requires a special version of PyTorch that can be built from sources or installed from nightly wheels. @@ -346,14 +308,6 @@ Note that the user needs to explicitly set `TRITON_XPU_PROFILE=1` when the user ```Bash export TRITON_XPU_PROFILE=1 ``` -======= -Version 2.0 is out! New features include: - -- Many, many bug fixes -- Performance improvements -- Backend rewritten to use MLIR -- Support for kernels that contain back-to-back matmuls (e.g., flash attention) ->>>>>>> d6739d3c33dee481f2d4dee4f6ecd4123f671597 # Contributing @@ -363,7 +317,6 @@ Community contributions are more than welcome, whether it be to fix bugs or to a _MIT License_. As found in [LICENSE](https://github.com/intel/intel-xpu-backend-for-triton/blob/main/LICENSE) file. -<<<<<<< HEAD ## Security @@ -371,16 +324,3 @@ See Intel's [Security Center](https://www.intel.com/content/www/us/en/security-c for information on how to report a potential security issue or vulnerability. See also: [Security Policy](security.md) -======= -# Compatibility - -Supported Platforms: - -- Linux - -Supported Hardware: - -- NVIDIA GPUs (Compute Capability 8.0+) -- AMD GPUs (ROCm 5.2+) -- Under development: CPUs ->>>>>>> d6739d3c33dee481f2d4dee4f6ecd4123f671597 From 85682e4376f766a606236a6fbd36a8529d6b9d97 Mon Sep 17 00:00:00 2001 From: glados-intel <153325143+glados-intel@users.noreply.github.com> Date: Sat, 9 Nov 2024 17:06:50 -0800 Subject: [PATCH 03/12] [github-bot] Update spirv-llvm-translator.conf (#2661) Automated PR to update translator commit id. --- lib/Target/SPIRV/spirv-llvm-translator.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Target/SPIRV/spirv-llvm-translator.conf b/lib/Target/SPIRV/spirv-llvm-translator.conf index 3156bbd12f..22368d5080 100644 --- a/lib/Target/SPIRV/spirv-llvm-translator.conf +++ b/lib/Target/SPIRV/spirv-llvm-translator.conf @@ -1 +1 @@ -cf697333b60d2000509ab7e79869ecab5eda9e9c +1a1bf17d9e8684cd826e4278e78f63aa80e2e2ca From ca95a70b226a5b92c4e84a9987d920de4cc23a69 Mon Sep 17 00:00:00 2001 From: Si Yudong Date: Mon, 11 Nov 2024 13:17:52 +0800 Subject: [PATCH 04/12] Improve GEMM performance of shape 4096x8x128x16384 (#2646) This change (`grid` order adjustment to improve cache hit) originating from https://github.com/intel/intel-xpu-backend-for-triton/pull/2600. Batched gemm only. ~99% of XeTLA for `4096x8x128x16384`. 
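For illustration, the effect of the swap can be reproduced with plain Python (no GPU needed). This is a hedged sketch: the helper and the tile/batch counts below are invented, and it assumes axis 0 of the launch grid is the fastest-varying dimension, which is the usual mapping.

```python
# Enumerate program IDs in hardware launch order (axis 0 varies fastest)
# to show which work-groups run back-to-back under each grid order.
def launch_order(dim0, dim1):
    return [(i0, i1) for i1 in range(dim1) for i0 in range(dim0)]

num_tiles, batch = 4, 2

# Old order: grid = (B, tiles); axis 0 is the batch index, so consecutive
# work-groups alternate between batch matrices and share no tiles.
old = [(bid, pid) for bid, pid in launch_order(batch, num_tiles)]

# New order: grid = (tiles, B); axis 0 is the tile index, so consecutive
# work-groups walk tiles of the same batch matrix and reuse cached blocks.
new = [(bid, pid) for pid, bid in launch_order(num_tiles, batch)]

print(old[:4])  # [(0, 0), (1, 0), (0, 1), (1, 1)]: batch alternates
print(new[:4])  # [(0, 0), (0, 1), (0, 2), (0, 3)]: same batch, adjacent tiles
```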
![image](https://github.com/user-attachments/assets/ef7e9750-b3f7-4adc-aa66-5be704383e40)

---
 benchmarks/triton_kernels_benchmark/gemm_benchmark.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/triton_kernels_benchmark/gemm_benchmark.py b/benchmarks/triton_kernels_benchmark/gemm_benchmark.py
index da41f1e447..9941b0c5f0 100644
--- a/benchmarks/triton_kernels_benchmark/gemm_benchmark.py
+++ b/benchmarks/triton_kernels_benchmark/gemm_benchmark.py
@@ -129,8 +129,8 @@ def matmul_kernel_with_block_pointers_batched(
         stride_cz: tl.constexpr, stride_cm: tl.constexpr, stride_cn: tl.constexpr,
         # Meta-parameters
         BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, GROUP_SIZE_M: tl.constexpr):
-    bid = tl.program_id(axis=0)
-    pid = tl.program_id(axis=1)
+    bid = tl.program_id(axis=1)
+    pid = tl.program_id(axis=0)
     num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
     num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
     num_pid_in_group = GROUP_SIZE_M * num_pid_n
@@ -186,8 +186,8 @@ def matmul(a, b, c, transpose_a=False, transpose_b=False):
     B = a.shape[0]
     # 1D launch kernel where each block gets its own program.
     grid = lambda META: (
-        B,
         triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']),
+        B,
     )
     matmul_kernel_with_block_pointers_batched[grid](
         a, b, c,  #

From 9e495a2cf3c512355b1476a93cbc0910ef824858 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Mon, 11 Nov 2024 15:45:01 +0100
Subject: [PATCH 05/12] Revert "Revert "Add back barrier after asserts
 (#5043)"" (#2657)

Closes #2644

The error (more details:
https://github.com/intel/intel-xpu-backend-for-triton/issues/2644#issuecomment-2464373902)
appears to be that the operation is inserted into the wrong block. My best
guess is that we need to explicitly insert the barrier at the beginning of
`thenBlock`. It is unclear why the original code works for NVIDIA (possibly
because of the different number of instructions that initially replace
`"gpu.barrier"() : () -> ()`).

```bash
python: /home/runner/work/triton/triton/llvm-project/llvm/include/llvm/ADT/ilist_iterator.h:168: llvm::ilist_iterator::reference llvm::ilist_iterator, false, false>::operator*() const [OptionsT = llvm::ilist_detail::node_options, IsReverse = false, IsConst = false]: Assertion `!NodePtr->isKnownSentinel()' failed.
Aborted (core dumped)
```

---------

Signed-off-by: Anatoly Myachev
---
 lib/Conversion/TritonGPUToLLVM/AssertOpToLLVM.cpp | 11 +++++++++--
 python/triton/language/semantic.py                |  4 ----
 test/Conversion/tritongpu_to_llvm.mlir            |  2 ++
 3 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/lib/Conversion/TritonGPUToLLVM/AssertOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/AssertOpToLLVM.cpp
index 20558c440a..09a7750cee 100644
--- a/lib/Conversion/TritonGPUToLLVM/AssertOpToLLVM.cpp
+++ b/lib/Conversion/TritonGPUToLLVM/AssertOpToLLVM.cpp
@@ -35,6 +35,14 @@ struct AssertOpConversion : public ConvertOpToLLVMPattern<triton::AssertOp> {
       }
     }
     llAssert(op, condition, adaptor.getMessage(), rewriter);
+    if (isa<RankedTensorType>(op.getCondition().getType())) {
+      // Add a barrier to avoid a race condition in case an assert is followed
+      // by an op that may trap if the assert condition is true. Since the
+      // tensor in those two operations may have different layout we need to
+      // make sure all the threads are done executing the assert before going
+      // to the next op.
+      barrier();
+    }
     rewriter.eraseOp(op);
     return success();
   }
@@ -42,8 +50,6 @@ struct AssertOpConversion : public ConvertOpToLLVMPattern<triton::AssertOp> {
   // know about the op to split the block.
   void llAssert(Operation *op, Value condition, StringRef message,
                 ConversionPatternRewriter &rewriter) const {
-    ConversionPatternRewriter::InsertionGuard guard(rewriter);
-
     auto ctx = rewriter.getContext();
     auto loc = op->getLoc();
@@ -79,6 +85,7 @@ struct AssertOpConversion : public ConvertOpToLLVMPattern<triton::AssertOp> {
     rewriter.create<LLVM::BrOp>(loc, thenBlock);
     rewriter.setInsertionPointToEnd(prevBlock);
     rewriter.create<LLVM::CondBrOp>(loc, condition, ifBlock, thenBlock);
+    rewriter.setInsertionPointToStart(thenBlock);
   }
 
 protected:
diff --git a/python/triton/language/semantic.py b/python/triton/language/semantic.py
index fc31959d27..7bb2515cff 100644
--- a/python/triton/language/semantic.py
+++ b/python/triton/language/semantic.py
@@ -1729,10 +1729,6 @@ def device_print(prefix: str, args: List[tl.tensor], hex: bool, builder: ir.builder) -> tl.tensor:
 def device_assert(cond: tl.tensor, msg: str, builder: ir.builder) -> tl.tensor:
     if not builder.options.debug:
         return
-    cond_ty = cond.type
-    if not cond_ty.is_block():
-        cond_ty = tl.block_type(cond_ty.scalar, (1, ))
-        cond = tl.tensor(builder.create_splat(cond.handle, (1, )), cond_ty)
     return tl.tensor(builder.create_assert(cond.handle, msg), tl.void)
diff --git a/test/Conversion/tritongpu_to_llvm.mlir b/test/Conversion/tritongpu_to_llvm.mlir
index 1c3308dd65..4c5d658af1 100644
--- a/test/Conversion/tritongpu_to_llvm.mlir
+++ b/test/Conversion/tritongpu_to_llvm.mlir
@@ -1842,6 +1842,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
 // CHECK-DAG: llvm.mlir.global internal constant @assertFunc_0("unknown\00") {addr_space = 0 : i32}
 // CHECK-DAG: llvm.mlir.global internal constant @assertFile_0("inner_call\00") {addr_space = 0 : i32}
 // CHECK-DAG: llvm.mlir.global internal constant @assertMessage_0("assert text\00") {addr_space = 0 : i32}
+// CHECK: llvm.call @__assertfail
+// CHECK: nvvm.barrier0
 module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:90", "triton_gpu.threads-per-warp" = 32 : i32} {
   tt.func public @add_kernel(%arg0: tensor<1xi1, #blocked>) {
     tt.assert %arg0, "assert text" : tensor<1xi1, #blocked> loc(#loc5)

From 3925bb74c26f8838ed0229c1e32916fa905b9d32 Mon Sep 17 00:00:00 2001
From: Anatoly Myachev
Date: Mon, 11 Nov 2024 16:08:07 +0100
Subject: [PATCH 06/12] Use `pytorch-stonepia` folder as a root folder for
 building proxy pytorch (#2672)

Closes #2651

Having multiple versions of PyTorch should not be a problem. One possible
scenario is building in different environments that should not conflict
with each other.

Signed-off-by: Anatoly Myachev
---
 scripts/compile-pytorch-ipex.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/compile-pytorch-ipex.sh b/scripts/compile-pytorch-ipex.sh
index 483f8e997e..fc3d1cd633 100755
--- a/scripts/compile-pytorch-ipex.sh
+++ b/scripts/compile-pytorch-ipex.sh
@@ -100,7 +100,7 @@ fi
 
 # Configure, build and install PyTorch from source.
 if [[ $BUILD_PYTORCH = true ]]; then
-    PYTORCH_PROJ=$BASE/pytorch
+    PYTORCH_PROJ=$BASE/pytorch-stonepia
     echo "**** Cleaning $PYTORCH_PROJ before build ****"
     rm -rf $PYTORCH_PROJ

From 52da140681fea074d7fe8baafc9f7ecd967676cf Mon Sep 17 00:00:00 2001
From: Ettore Tiotto
Date: Mon, 11 Nov 2024 12:19:40 -0500
Subject: [PATCH 07/12] Fix intermittently failing lit test (#2676)

Signed-off-by: Tiotto, Ettore
---
 test/Analysis/test-liveness.mlir | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/Analysis/test-liveness.mlir b/test/Analysis/test-liveness.mlir
index 2c6bac56d9..6a8ef9de6f 100644
--- a/test/Analysis/test-liveness.mlir
+++ b/test/Analysis/test-liveness.mlir
@@ -19,11 +19,11 @@ module attributes {"triton_gpu.num-warps" = 8 : i32} {
 
   // CHECK: scf.if
   // CHECK-NEXT: LiveIntervals for block: ^bb0
-  // CHECK-NEXT: [[[LOAD1:%.*]], [[LOAD1]]] for value: %arg0
-  // CHECK-NEXT: [[[LOAD1]], scf.yield] for value: [[LOAD1]]
-  // CHECK-NEXT: LiveIntervals for block: ^bb0
-  // CHECK-NEXT: [[[LOAD2:%.*]], [[LOAD2]]] for value: %arg1
-  // CHECK-NEXT: [[[LOAD2]], scf.yield] for value: [[LOAD2]]
+  // CHECK-DAG: [[[LOAD1:%.*]], [[LOAD1]]] for value: %arg0
+  // CHECK-DAG: [[[LOAD1]], scf.yield] for value: [[LOAD1]]
+  // CHECK-DAG: LiveIntervals for block: ^bb0
+  // CHECK-DAG: [[[LOAD2:%.*]], [[LOAD2]]] for value: %arg1
+  // CHECK-DAG: [[[LOAD2]], scf.yield] for value: [[LOAD2]]
 
   %c1024_i32 = arith.constant 1024 : i32
   %c64_i32 = arith.constant 64 : i32

From 98dca47227c59394c564cc0991ce44588d320fb5 Mon Sep 17 00:00:00 2001
From: Ettore Tiotto
Date: Mon, 11 Nov 2024 19:54:13 -0500
Subject: [PATCH 08/12] [NFC]: Clean up AccelerateMatmul.cpp (#2679)

Signed-off-by: Tiotto, Ettore
---
 .../AccelerateMatmul.cpp                      | 93 +++++++++----------
 1 file changed, 45 insertions(+), 48 deletions(-)

diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp
index 3e636d5bae..ae385ed960 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp
@@ -4,7 +4,6 @@
 #include "intel/include/Analysis/DPAS.h"
 #include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
 #include "intel/include/Dialect/TritonIntelGPU/Transforms/Passes.h"
-#include "intel/include/Dialect/TritonIntelGPU/Transforms/Utility.h"
 #include "triton/Dialect/Triton/IR/Utility.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
@@ -14,9 +13,8 @@
 #define PVC_2D_LOAD_MAXIMUM_BYTES_OF_COLS 64
 
 using namespace mlir;
-using namespace mlir::triton;
-using namespace mlir::triton::gpu;
-using DPASAnalysis = intel::DPASAnalysis;
+namespace tt = mlir::triton;
+namespace ttg = mlir::triton::gpu;
 
 namespace mlir::triton::gpu::intel {
 #define GEN_PASS_DEF_TRITONINTELGPUACCELERATEMATMUL
@@ -55,7 +53,7 @@ IntelDPASCapability getDPASCapability(unsigned minSGSize) {
   }
 }
 
-SmallVector<unsigned> getWarpsPerTile(DotOp dotOp,
+SmallVector<unsigned> getWarpsPerTile(tt::DotOp dotOp,
                                       struct IntelDPASCapability dpasCap,
                                       const ArrayRef<int64_t> shape,
                                       unsigned numWarps) {
@@ -66,7 +64,7 @@ SmallVector<unsigned> getWarpsPerTile(DotOp dotOp,
   SetVector<Operation *> slices = getSlice(dotOp, {filter});
   // TODO: revisit this in flash attention.
   for (Operation *op : slices)
-    if (isa<DotOp>(op) && (op != dotOp))
+    if (isa<tt::DotOp>(op) && (op != dotOp))
       return {numWarps, 1};
 
   size_t rank = shape.size();
@@ -108,41 +106,41 @@ SmallVector<unsigned> getWarpsPerTile(DotOp dotOp,
   return ret;
 }
 
-class BlockedToDPAS : public RewritePattern {
-  const DPASAnalysis &dpasAnalysis;
+class BlockedToDPAS : public OpRewritePattern<tt::DotOp> {
+  const ttg::intel::DPASAnalysis &dpasAnalysis;
 
 public:
-  BlockedToDPAS(MLIRContext *context, const DPASAnalysis &dpasAnalysis)
-      : RewritePattern(DotOp::getOperationName(), 2, context),
-        dpasAnalysis(dpasAnalysis) {}
+  BlockedToDPAS(MLIRContext *context,
+                const ttg::intel::DPASAnalysis &dpasAnalysis)
+      : OpRewritePattern<tt::DotOp>(context), dpasAnalysis(dpasAnalysis) {}
 
-  LogicalResult matchAndRewrite(Operation *op,
+  LogicalResult matchAndRewrite(tt::DotOp dotOp,
                                 PatternRewriter &rewriter) const override {
-    DotOp dotOp = cast<DotOp>(op);
-    RankedTensorType oldRetType =
-        cast<RankedTensorType>(dotOp.getResult().getType());
+    using TensorValue = TypedValue<RankedTensorType>;
+
+    RankedTensorType oldRetType = dotOp.getType();
     if (!oldRetType.getEncoding() ||
-        isa<intel::DpasEncodingAttr>(oldRetType.getEncoding()))
+        isa<ttg::intel::DpasEncodingAttr>(oldRetType.getEncoding()))
       return failure();
 
-    auto funcOp = op->getParentOfType<FunctionOpInterface>();
-    if (dpasAnalysis.canUseDPAS(funcOp) != DPASAnalysis::Result::True)
+    auto funcOp = dotOp->getParentOfType<FunctionOpInterface>();
+    if (dpasAnalysis.canUseDPAS(funcOp) !=
+        ttg::intel::DPASAnalysis::Result::True)
       return failure();
 
     // Create DPAS encoding for the given number of warps
     ArrayRef<int64_t> retShape = oldRetType.getShape();
-    size_t rank = retShape.size();
     ModuleOp mod = funcOp->getParentOfType<ModuleOp>();
-    unsigned numWarps = TritonGPUDialect::getNumWarps(mod);
+    unsigned numWarps = ttg::TritonGPUDialect::getNumWarps(mod);
 
-    Value a = dotOp.getA();
-    Value b = dotOp.getB();
-    RankedTensorType oldAType = cast<RankedTensorType>(a.getType());
-    RankedTensorType oldBType = cast<RankedTensorType>(b.getType());
+    TensorValue a = dotOp.getA();
+    TensorValue b = dotOp.getB();
+    auto oldAType = cast<RankedTensorType>(a.getType());
+    auto oldBType = cast<RankedTensorType>(b.getType());
 
     unsigned minSGSize =
         mod->getAttrOfType<IntegerAttr>(
-                intel::TritonIntelGPUDialect::getMinSGSizeAttrName())
+                ttg::intel::TritonIntelGPUDialect::getMinSGSizeAttrName())
             .getInt();
     IntelDPASCapability dpasCap = getDPASCapability(minSGSize);
     unsigned dpasElemBitWidths =
@@ -156,10 +154,11 @@ class BlockedToDPAS : public RewritePattern {
     unsigned opsPerChan = dpasCap.opsChanBitWidths / dpasElemBitWidths;
     SmallVector<unsigned> warpsPerTile =
         getWarpsPerTile(dotOp, dpasCap, retShape, numWarps);
+    size_t rank = retShape.size();
     SmallVector<unsigned> repCluster(rank, 1);
 
-    unsigned threadsPerWarp = TritonGPUDialect::getThreadsPerWarp(mod);
-    auto dpasEnc = intel::DpasEncodingAttr::get(
+    unsigned threadsPerWarp = ttg::TritonGPUDialect::getThreadsPerWarp(mod);
+    auto dpasEnc = ttg::intel::DpasEncodingAttr::get(
         oldRetType.getContext(), dpasCap.repeatCount, dpasCap.systolicDepth,
         dpasCap.executionSize, opsPerChan, warpsPerTile, repCluster,
         threadsPerWarp);
@@ -184,7 +183,7 @@ class BlockedToDPAS : public RewritePattern {
       repCluster[rank - 2] = repClusterDimM;
       repCluster[rank - 1] = repClusterDimN;
 
-      dpasEnc = intel::DpasEncodingAttr::get(
+      dpasEnc = ttg::intel::DpasEncodingAttr::get(
           oldRetType.getContext(), dpasCap.repeatCount, dpasCap.systolicDepth,
           dpasCap.executionSize, opsPerChan, warpsPerTile, repCluster,
           threadsPerWarp);
@@ -194,28 +193,28 @@ class BlockedToDPAS : public RewritePattern {
         RankedTensorType::get(retShape, oldRetType.getElementType(), dpasEnc);
 
     // convert accumulator
-    Value oldAcc = dotOp.getC();
-    ConvertLayoutOp newAcc =
-        rewriter.create<ConvertLayoutOp>(oldAcc.getLoc(), newRetType, oldAcc);
+    TensorValue oldAcc = dotOp.getC();
+    auto newAcc = rewriter.create<ttg::ConvertLayoutOp>(oldAcc.getLoc(),
+                                                        newRetType, oldAcc);
 
-    DotOperandEncodingAttr newAEncoding = DotOperandEncodingAttr::get(
+    auto newAEncoding = ttg::DotOperandEncodingAttr::get(
         oldAType.getContext(), 0, newRetType.getEncoding(), opsPerChan);
-    DotOperandEncodingAttr newBEncoding = DotOperandEncodingAttr::get(
+    auto newBEncoding = ttg::DotOperandEncodingAttr::get(
         oldBType.getContext(), 1, newRetType.getEncoding(), opsPerChan);
 
-    RankedTensorType newAType = RankedTensorType::get(
+    auto newAType = RankedTensorType::get(
         oldAType.getShape(), oldAType.getElementType(), newAEncoding);
-    RankedTensorType newBType = RankedTensorType::get(
+    auto newBType = RankedTensorType::get(
         oldBType.getShape(), oldBType.getElementType(), newBEncoding);
 
-    a = rewriter.create<ConvertLayoutOp>(a.getLoc(), newAType, a);
-    b = rewriter.create<ConvertLayoutOp>(b.getLoc(), newBType, b);
-    DotOp newDot = rewriter.create<DotOp>(dotOp.getLoc(), newRetType, a, b,
-                                          newAcc, dotOp.getInputPrecision(),
-                                          dotOp.getMaxNumImpreciseAcc());
+    a = rewriter.create<ttg::ConvertLayoutOp>(a.getLoc(), newAType, a);
+    b = rewriter.create<ttg::ConvertLayoutOp>(b.getLoc(), newBType, b);
+    auto newDot = rewriter.create<tt::DotOp>(dotOp.getLoc(), newRetType, a, b,
+                                             newAcc, dotOp.getInputPrecision(),
+                                             dotOp.getMaxNumImpreciseAcc());
 
-    rewriter.replaceOpWithNewOp<ConvertLayoutOp>(op, oldRetType,
-                                                 newDot.getResult());
+    rewriter.replaceOpWithNewOp<ttg::ConvertLayoutOp>(dotOp, oldRetType,
+                                                      newDot.getResult());
     return success();
   }
 };
@@ -230,7 +229,7 @@ static Value promoteOperand(OpBuilder &builder, Location loc, Value operand,
 
   return llvm::TypeSwitch<Type, Value>(elemType)
       .Case<FloatType>([&](auto) {
-        return builder.create<FpToFpOp>(loc, tensorPromotedType, operand);
+        return builder.create<tt::FpToFpOp>(loc, tensorPromotedType, operand);
       })
      .Case<IntegerType>([&](auto) {
        unsigned tgtBitWidth = elemType.getIntOrFloatBitWidth(),
@@ -248,12 +247,12 @@ static Value promoteOperand(OpBuilder &builder, Location loc, Value operand,
 // promote operands of dot op if the existing combination is not natively
 // supported.
 static void decomposeMixedModeDotOp(ModuleOp mod) {
-  mod.walk([](DotOp dotOp) -> void {
+  mod.walk([](tt::DotOp dotOp) -> void {
     auto D = dotOp.getD();
     OpBuilder builder(dotOp);
     Type AElType = dotOp.getA().getType().getElementType();
     auto dpasLayout =
-        dyn_cast<intel::DpasEncodingAttr>(D.getType().getEncoding());
+        dyn_cast<ttg::intel::DpasEncodingAttr>(D.getType().getEncoding());
 
     Type promoteType;
     if (dpasLayout) {
@@ -289,15 +288,13 @@ class TritonIntelGPUAccelerateMatmulPass
   void runOnOperation() override {
     MLIRContext *context = &getContext();
     ModuleOp m = getOperation();
-    DPASAnalysis &dpasAnalysis = getAnalysis<DPASAnalysis>();
+    auto &dpasAnalysis = getAnalysis<ttg::intel::DPASAnalysis>();
 
     RewritePatternSet patterns(context);
     patterns.add<BlockedToDPAS>(context, dpasAnalysis);
     if (applyPatternsAndFoldGreedily(m, std::move(patterns)).failed())
       signalPassFailure();
 
-    // now that we pick the scalar type decompose dot that are not natively
-    // supported.
     decomposeMixedModeDotOp(m);
   }
 };

From 16f57384292502e36c362b79d4c780d82e8bbb3c Mon Sep 17 00:00:00 2001
From: Victor Perez
Date: Tue, 12 Nov 2024 09:18:40 +0100
Subject: [PATCH 09/12] [XPU][Alloc] Optimize SLM allocation size for
 sub-group layout conversions (#2638)

Optimize shared memory allocation sizes for sub-group shuffle and
transpose-like conversions:

- Sub-group shuffle: do not allocate memory at all.
- Sub-group transpose: allocate as much memory as is needed to store the
  whole tensor in SLM (the sizes are worked through in the sketch below).
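As a sanity check, the scratch sizes implied by the transpose rule can be computed with a few lines of Python. This is only a sketch mirroring the arithmetic in the `Allocation.cpp` change below (the function name is invented); the asserted values match the `triton_gpu.shared` CHECKs in the new lit test.

```python
from math import prod

def transpose_scratch_bytes(shape, elem_bytes, threads_per_warp, warps_per_cta):
    # Mirrors the repShape computation: the whole tensor is staged in SLM.
    remaining = prod(shape) // (threads_per_warp * threads_per_warp * warps_per_cta)
    rep_shape = (threads_per_warp, threads_per_warp, remaining, warps_per_cta)
    return prod(rep_shape) * elem_bytes

assert transpose_scratch_bytes((16, 16), 2, 16, 1) == 512     # 16x16 f16, 1 warp
assert transpose_scratch_bytes((16, 16), 4, 16, 1) == 1024    # 16x16 f32, 1 warp
assert transpose_scratch_bytes((128, 64), 4, 16, 8) == 32768  # 128x64 f32, 8 warps
```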
---------

Signed-off-by: victor-eds
---
 .../intel/intel-allocate-shared-memory.mlir   | 65 +++++++++++++++++++
 test/Conversion/intel/sub-group-shuffle.mlir  | 12 ++--
 third_party/intel/lib/Analysis/Allocation.cpp | 22 +++++++
 3 files changed, 93 insertions(+), 6 deletions(-)
 create mode 100644 test/Conversion/intel/intel-allocate-shared-memory.mlir

diff --git a/test/Conversion/intel/intel-allocate-shared-memory.mlir b/test/Conversion/intel/intel-allocate-shared-memory.mlir
new file mode 100644
index 0000000000..0aa7990417
--- /dev/null
+++ b/test/Conversion/intel/intel-allocate-shared-memory.mlir
@@ -0,0 +1,65 @@
+// RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory | FileCheck %s
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
+
+// Check no scratch memory is allocated for sub-group shuffle-like layout conversions.
+
+// CHECK-LABEL: module attributes
+// CHECK-SAME: triton_gpu.shared = 0 : i32
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+  // CHECK: tt.func @test_sub_group_shuffle
+  // CHECK-NOT: llvm.ptr<3>
+  tt.func @test_sub_group_shuffle(%arg0: tensor<16xf16, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf16, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> {
+    %0 = triton_gpu.convert_layout %arg0 : tensor<16xf16, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf16, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
+    tt.return %0 : tensor<16xf16, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
+  }
+}
+
+// -----
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
+
+// Check scratch memory configuration for different sub-group transpose-like layout conversions.
+
+// CHECK-LABEL: module attributes
+// CHECK-SAME: triton_gpu.shared = 512 : i32
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+  tt.func @test_f16(%arg0: tensor<16x16xf16, #blocked>) -> tensor<16x16xf16, #blocked1> {
+    %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf16, #blocked> -> tensor<16x16xf16, #blocked1>
+    tt.return %0 : tensor<16x16xf16, #blocked1>
+  }
+}
+
+// -----
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
+
+// Check scratch memory configuration for different sub-group transpose-like layout conversions.
+
+// CHECK-LABEL: module attributes
+// CHECK-SAME: triton_gpu.shared = 1024 : i32
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+  tt.func @test_f32(%arg0: tensor<16x16xf32, #blocked>) -> tensor<16x16xf32, #blocked1> {
+    %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf32, #blocked> -> tensor<16x16xf32, #blocked1>
+    tt.return %0 : tensor<16x16xf32, #blocked1>
+  }
+}
+
+// -----
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [4, 2], order = [0, 1]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [4, 2], order = [0, 1]}>
+
+// Check scratch memory configuration for different sub-group transpose-like layout conversions.
+
+// CHECK-LABEL: module attributes
+// CHECK-SAME: triton_gpu.shared = 32768 : i32
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+  tt.func @test_f32(%arg0: tensor<128x64xf32, #blocked>) -> tensor<128x64xf32, #blocked1> {
+    %0 = triton_gpu.convert_layout %arg0 : tensor<128x64xf32, #blocked> -> tensor<128x64xf32, #blocked1>
+    tt.return %0 : tensor<128x64xf32, #blocked1>
+  }
+}
diff --git a/test/Conversion/intel/sub-group-shuffle.mlir b/test/Conversion/intel/sub-group-shuffle.mlir
index 1e9d32a8c7..8bcd1b57dc 100644
--- a/test/Conversion/intel/sub-group-shuffle.mlir
+++ b/test/Conversion/intel/sub-group-shuffle.mlir
@@ -9,7 +9,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
   // CHECK-LABEL: llvm.func spir_kernelcc @test_f16(
-  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(f16)>,
+  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(f16)>)
   // CHECK: %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(f16)>
   // CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
   // CHECK: llvm.call spir_funccc @_Z17sub_group_shuffleDhj(%[[VAL_2]], %[[VAL_4]])
@@ -49,7 +49,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :
   }
 
   // CHECK-LABEL: llvm.func spir_kernelcc @test_bf16(
-  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(bf16)>,
+  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(bf16)>)
   // CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(bf16)>
   // CHECK: %[[VAL_2:.*]] = llvm.bitcast %[[VAL_1]] : bf16 to i16
   // CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
@@ -91,7 +91,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :
   }
 
   // CHECK-LABEL: llvm.func spir_kernelcc @test_i1(
-  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(i1)>,
+  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(i1)>)
   // CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(i1)>
   // CHECK: %[[VAL_2:.*]] = llvm.zext %[[VAL_1]] : i1 to i8
   // CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
@@ -133,7 +133,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :
   }
 
   // CHECK-LABEL: llvm.func spir_kernelcc @test_ptr(
-  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(ptr<1>)>,
+  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(ptr<1>)>)
   // CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(ptr<1>)>
   // CHECK: %[[VAL_2:.*]] = llvm.ptrtoint %[[VAL_1]] : !llvm.ptr<1> to i64
   // CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
@@ -186,7 +186,7 @@
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
   // CHECK-LABEL: llvm.func spir_kernelcc @test_f32(
-  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(f32)>,
+  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(f32)>)
   // CHECK: %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(f32)>
   // CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
   // CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_4]])
@@ -269,7 +269,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :
 
 module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
   // CHECK-LABEL: llvm.func spir_kernelcc @test_non_sliced_multi_register(
-  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(f64, f64)>,
+  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(f64, f64)>)
   // CHECK: %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(f64, f64)>
   // CHECK: %[[VAL_3:.*]] = llvm.extractvalue %[[VAL_0]][1] : !llvm.struct<(f64, f64)>
   // CHECK: %[[VAL_5:.*]] = llvm.mlir.constant(0 : i32) : i32
diff --git a/third_party/intel/lib/Analysis/Allocation.cpp b/third_party/intel/lib/Analysis/Allocation.cpp
index 1fba62b609..881fe7b162 100644
--- a/third_party/intel/lib/Analysis/Allocation.cpp
+++ b/third_party/intel/lib/Analysis/Allocation.cpp
@@ -15,6 +15,8 @@
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "llvm/ADT/SmallVector.h"
 
+#include "intel/include/Analysis/Utility.h"
+
 using ::mlir::triton::gpu::AMDMfmaEncodingAttr;
 using ::mlir::triton::gpu::BlockedEncodingAttr;
 using ::mlir::triton::gpu::DotOperandEncodingAttr;
@@ -104,6 +106,26 @@ static SmallVector<unsigned> getRepShapeForAtomic(Value result) {
 
 ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
                                      RankedTensorType dstTy) {
+  if (gpu::intel::cvtIsSubGroupShuffle(srcTy, dstTy)) {
+    // Conversions that can be implemented as sub-group shuffles do not need
+    // scratch memory.
+    return ScratchConfig({}, {});
+  }
+
+  if (gpu::intel::cvtIsSubGroupTranspose(srcTy, dstTy)) {
+    // Conversions that can be implemented as sub-group transposes store the
+    // whole tensor in shared memory and read it afterwards.
+    auto srcEncoding = cast<DistributedEncodingTrait>(srcTy.getEncoding());
+    unsigned threadsPerWarp = product(srcEncoding.getThreadsPerWarp());
+    unsigned warpsPerCTA = product(srcEncoding.getWarpsPerCTA());
+    unsigned remaining = product(srcTy.getShape()) /
+                         (threadsPerWarp * threadsPerWarp * warpsPerCTA);
+    SmallVector<unsigned> repShape{threadsPerWarp, threadsPerWarp, remaining,
+                                   warpsPerCTA};
+    return ScratchConfig(repShape, repShape,
+                         /*inVec=*/1, /*outVec=*/threadsPerWarp);
+  }
+
   // Initialize vector sizes and stride
   auto repShape = getRepShapeForCvt(srcTy, dstTy);
   if (repShape.empty())

From 057d82cad8fee2affeeabfeafe1c7db1b772c775 Mon Sep 17 00:00:00 2001
From: Victor Perez
Date: Tue, 12 Nov 2024 10:07:36 +0100
Subject: [PATCH 10/12] [XPU][Membar] Define basic Intel-specific `Membar`
 filter (#2640)

Define Intel-specific `Membar` filter to reduce synchronization overhead
in the Intel backend.

`Membar` analysis will insert barriers to avoid race conditions between
operations accessing the same memory. The granularity of this analysis is
a bit coarse, though, and unneeded barriers may be inserted. In order to
avoid this, the analysis allows users to pass a callback function to
filter safe cases.
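To make the callback mechanism concrete, here is a toy Python model of it. This is not the real C++ API (the real hook is the `mlir::intel::membarFilter` added below); the op representation and helper names here are invented purely for illustration.

```python
def insert_barriers(ops, uses_shared_memory, filter=None):
    # Coarse rule: a barrier goes between any two shared-memory ops,
    # unless the backend-provided filter proves the pair is safe.
    result, prev = [], None
    for op in ops:
        if uses_shared_memory(op):
            if prev is not None and not (filter and filter(prev, op)):
                result.append("barrier")
            prev = op
        result.append(op)
    return result

ops = ["transpose_a", "transpose_b"]
same_transpose = lambda a, b: a.startswith("transpose") and b.startswith("transpose")

print(insert_barriers(ops, lambda op: True))                  # barrier inserted
print(insert_barriers(ops, lambda op: True, same_transpose))  # barrier elided
```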
As one of the recurring cases of barriers being inserted when not needed, detect back to back layout conversions implemented as sub-group transposes as safe so no barriers are inserted. See code for further details on why this is safe. --------- Signed-off-by: victor-eds --- .../Conversion/intel/sub-group-transpose.mlir | 17 ++++++ third_party/intel/include/Analysis/Membar.h | 17 ++++++ third_party/intel/lib/Analysis/CMakeLists.txt | 1 + third_party/intel/lib/Analysis/Membar.cpp | 58 +++++++++++++++++++ .../TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp | 8 ++- 5 files changed, 98 insertions(+), 3 deletions(-) create mode 100644 third_party/intel/include/Analysis/Membar.h create mode 100644 third_party/intel/lib/Analysis/Membar.cpp diff --git a/test/Conversion/intel/sub-group-transpose.mlir b/test/Conversion/intel/sub-group-transpose.mlir index 8b2c5bd6aa..9387c7dda9 100644 --- a/test/Conversion/intel/sub-group-transpose.mlir +++ b/test/Conversion/intel/sub-group-transpose.mlir @@ -426,3 +426,20 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : tt.return %0 : tensor<32x64xf32, #blocked1> } } + +// ----- + +// Test no barriers are inserted when back to back transpositions are performed. + +#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [2, 2], order = [0, 1]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { + // CHECK-LABEL: llvm.func spir_kernelcc @test_back_to_back + // CHECK-NOT: barrier + tt.func @test_back_to_back(%arg0: tensor<32x64xf32, #blocked>, %arg1: tensor<32x64xf32, #blocked>) -> (tensor<32x64xf32, #blocked1>, tensor<32x64xf32, #blocked1>) { + %0 = triton_gpu.convert_layout %arg0 : tensor<32x64xf32, #blocked> -> tensor<32x64xf32, #blocked1> + %1 = triton_gpu.convert_layout %arg1 : tensor<32x64xf32, #blocked> -> tensor<32x64xf32, #blocked1> + tt.return %0, %1 : tensor<32x64xf32, #blocked1>, tensor<32x64xf32, #blocked1> + } +} diff --git a/third_party/intel/include/Analysis/Membar.h b/third_party/intel/include/Analysis/Membar.h new file mode 100644 index 0000000000..f6c9aabf4e --- /dev/null +++ b/third_party/intel/include/Analysis/Membar.h @@ -0,0 +1,17 @@ +#ifndef TRITON_INTEL_ANALYSIS_MEMBAR_H +#define TRITON_INTEL_ANALYSIS_MEMBAR_H + +namespace mlir { +class Operation; +namespace intel { +/// Intel-specific callback to filter operations that need no barriers between +/// each other. +/// +/// This is useful as the granularity to check whether barriers are needed is +/// quite coarse. The filter will return true if no barrier is needed between +/// `lhsOp` and `rhsOp`. 
+bool membarFilter(Operation *lhsOp, Operation *rhsOp);
+} // namespace intel
+} // namespace mlir
+
+#endif // TRITON_INTEL_ANALYSIS_MEMBAR_H
diff --git a/third_party/intel/lib/Analysis/CMakeLists.txt b/third_party/intel/lib/Analysis/CMakeLists.txt
index cf10374a69..3da00219a6 100644
--- a/third_party/intel/lib/Analysis/CMakeLists.txt
+++ b/third_party/intel/lib/Analysis/CMakeLists.txt
@@ -3,6 +3,7 @@ add_triton_library(TritonIntelAnalysis
   AxisInfo.cpp
   DPAS.cpp
   Liveness.cpp
+  Membar.cpp
   Utility.cpp
 
   DEPENDS
diff --git a/third_party/intel/lib/Analysis/Membar.cpp b/third_party/intel/lib/Analysis/Membar.cpp
new file mode 100644
index 0000000000..b3dec3c14b
--- /dev/null
+++ b/third_party/intel/lib/Analysis/Membar.cpp
@@ -0,0 +1,58 @@
+#include "intel/include/Analysis/Membar.h"
+
+#include "intel/include/Analysis/Utility.h"
+
+namespace mlir::intel {
+namespace {
+triton::gpu::ConvertLayoutOp dynCastToSubGroupTranspose(Operation *op) {
+  auto convertLayout = dyn_cast<triton::gpu::ConvertLayoutOp>(op);
+  if (!convertLayout)
+    return nullptr;
+
+  if (!triton::gpu::intel::cvtIsSubGroupTranspose(
+          convertLayout.getSrc().getType(),
+          convertLayout.getResult().getType()))
+    return nullptr;
+
+  return convertLayout;
+}
+
+/// Check if `lhsOp` and `rhsOp` are safe to execute back-to-back sub-group
+/// transpose layout conversions.
+///
+/// Sub-group transposes are implemented as follows:
+///
+/// - Each sub-group writes all the elements it is handling in a memory block
+/// - Each sub-group reads all the elements it is handling from the same memory
+/// region.
+///
+/// As there is no need to synchronize work-items in the same sub-group and we
+/// know data won't be shared between sub-groups, executing these operations
+/// back-to-back with no barriers in between is safe.
+bool areSafeToOverlapSubGroupTransposeOps(Operation *lhsOp, Operation *rhsOp) {
+  // Check both are lowered to sub-group transpose operations.
+  auto lhsTranspose = dynCastToSubGroupTranspose(lhsOp);
+  if (!lhsTranspose)
+    return false;
+  auto rhsTranspose = dynCastToSubGroupTranspose(rhsOp);
+  if (!rhsTranspose)
+    return false;
+
+  // Check the types of source and result are the same, i.e., we are expressing
+  // the same kind of transposition.
+  if (lhsTranspose.getSrc().getType() != rhsTranspose.getSrc().getType() ||
+      lhsTranspose.getResult().getType() != rhsTranspose.getResult().getType())
+    return false;
+
+  // Check both have the same offset and thus these operations can be
+  // overlapped.
+  return lhsTranspose->getAttr("allocation.offset") ==
+         rhsTranspose->getAttr("allocation.offset");
+}
+} // namespace
+bool membarFilter(Operation *lhsOp, Operation *rhsOp) {
+  // For now, we only check these aren't layout conversions implemented as the
+  // same sub-group transposition.
+ assert(lhsOp && rhsOp && "Expecting valid operations"); + return areSafeToOverlapSubGroupTransposeOps(lhsOp, rhsOp); +} +} // namespace mlir::intel diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp index c10a2e8aff..a4c2da184e 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp @@ -15,6 +15,7 @@ #include "intel/include/TritonIntelGPUToLLVM/Passes.h" #include "intel/include/Analysis/Allocation.h" +#include "intel/include/Analysis/Membar.h" #include "triton/Analysis/AxisInfo.h" #include "triton/Analysis/Membar.h" #include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h" @@ -77,7 +78,8 @@ struct ConvertTritonGPUToLLVM MLIRContext *context = &getContext(); ModuleOp mod = getOperation(); - intel::TritonGPUToLLVMPipelineManager pipelineManager(mod, context); + mlir::triton::intel::TritonGPUToLLVMPipelineManager pipelineManager( + mod, context); mlir::LowerToLLVMOptions option(context); bool isAdvancedPathEnabled = mod->hasAttr(triton::gpu::intel::TritonIntelGPUDialect:: @@ -97,7 +99,7 @@ struct ConvertTritonGPUToLLVM if (!pipelineManager.skipSharedMemoryAllocation()) { ModuleAllocation allocation = ModuleAllocation::get(mod); - ModuleMembarAnalysis membarPass(&allocation); + ModuleMembarAnalysis membarPass(&allocation, ::mlir::intel::membarFilter); membarPass.run(); } @@ -116,7 +118,7 @@ struct ConvertTritonGPUToLLVM return signalPassFailure(); } - intel::ModuleAxisInfoAnalysis axisInfoAnalysis(mod); + mlir::triton::intel::ModuleAxisInfoAnalysis axisInfoAnalysis(mod); OpBuilder::InsertPoint indexInsertPoint; RewritePatternSet patterns(context); From 9952acf7ac33da42949fa366a50516b64d66aa88 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 12 Nov 2024 12:26:56 +0100 Subject: [PATCH 11/12] Fix coverity issues (#2680) For Triton as well: https://github.com/triton-lang/triton/pull/5118 Signed-off-by: Anatoly Myachev --- third_party/intel/lib/Analysis/Allocation.cpp | 12 +++--- third_party/intel/lib/Analysis/AxisInfo.cpp | 38 ++++++++++++------- .../SharedToDotOperandDPAS.cpp | 3 +- 3 files changed, 32 insertions(+), 21 deletions(-) diff --git a/third_party/intel/lib/Analysis/Allocation.cpp b/third_party/intel/lib/Analysis/Allocation.cpp index 881fe7b162..b868711673 100644 --- a/third_party/intel/lib/Analysis/Allocation.cpp +++ b/third_party/intel/lib/Analysis/Allocation.cpp @@ -53,10 +53,10 @@ getCvtOrder(Attribute srcLayout, Attribute dstLayout) { // mma or dot layout does not have an order, so the order depends on the // layout of the other operand. - auto inOrd = (srcMmaLayout || srcDotLayout) ? getOrder(dstLayout) - : getOrder(srcLayout); - auto outOrd = (dstMmaLayout || dstDotLayout) ? getOrder(srcLayout) - : getOrder(dstLayout); + const auto &inOrd = (srcMmaLayout || srcDotLayout) ? getOrder(dstLayout) + : getOrder(srcLayout); + const auto &outOrd = (dstMmaLayout || dstDotLayout) ? getOrder(srcLayout) + : getOrder(dstLayout); return {inOrd, outOrd}; } @@ -368,7 +368,7 @@ class AllocationAnalysis { /// arguments are involved. 
void resolveAliasBufferLiveness( function_ref(Value value)> getLiveness) { - for (auto aliasBufferIter : allocation->getAliasBuffer()) { + for (const auto &aliasBufferIter : allocation->getAliasBuffer()) { auto value = aliasBufferIter.first; auto buffers = aliasBufferIter.second; auto range = getLiveness(value); @@ -508,7 +508,7 @@ class AllocationAnalysis { std::find_if(xBuffers.begin(), xBuffers.end(), [&](auto *buffer) { auto xRange = bufferRange[buffer]; bool res = xRange.intersects(range); - for (auto val : tripleMap) + for (const auto &val : tripleMap) res = res && !val.second.intersects(xRange); // only one buffer intersect return res; diff --git a/third_party/intel/lib/Analysis/AxisInfo.cpp b/third_party/intel/lib/Analysis/AxisInfo.cpp index 7161dedf7a..463fb4522b 100644 --- a/third_party/intel/lib/Analysis/AxisInfo.cpp +++ b/third_party/intel/lib/Analysis/AxisInfo.cpp @@ -123,7 +123,8 @@ class BinaryOpVisitorImpl : public AxisInfoVisitorImpl { divisibility.push_back(getDivisibility(op, lhsInfo, rhsInfo, d)); } } - return AxisInfo(contiguity, divisibility, constancy, constantValue); + return AxisInfo(std::move(contiguity), std::move(divisibility), + std::move(constancy), constantValue); } protected: @@ -543,7 +544,8 @@ class SplatOpAxisInfoVisitor final divisibility.push_back(opInfo.getDivisibility(0)); constancy.push_back(retTy.getShape()[d]); } - return AxisInfo(contiguity, divisibility, constancy, + return AxisInfo(std::move(contiguity), std::move(divisibility), + std::move(constancy), operands[0]->getValue().getConstantValue()); } }; @@ -574,7 +576,8 @@ class LoadOpAxisInfoVisitor final : public AxisInfoVisitorImpl { maskInfo.has_value() ? maskInfo->getConstancy(d) : 0)); } - return AxisInfo(contiguity, divisibility, constancy); + return AxisInfo(std::move(contiguity), std::move(divisibility), + std::move(constancy)); } }; @@ -608,7 +611,8 @@ class ExpandDimsOpAxisInfoVisitor final contiguity.insert(contiguity.begin() + op.getAxis(), 1); divisibility.insert(divisibility.begin() + op.getAxis(), newDivisibility); constancy.insert(constancy.begin() + op.getAxis(), 1); - return AxisInfo(contiguity, divisibility, constancy, + return AxisInfo(std::move(contiguity), std::move(divisibility), + std::move(constancy), operands[0]->getValue().getConstantValue()); } }; @@ -637,7 +641,8 @@ class BroadcastOpAxisInfoVisitor final constancy.push_back(opShape[d] == 1 ? 
retShape[d] : opInfo.getConstancy(d)); } - return AxisInfo(contiguity, divisibility, constancy, + return AxisInfo(std::move(contiguity), std::move(divisibility), + std::move(constancy), operands[0]->getValue().getConstantValue()); } }; @@ -712,7 +717,8 @@ class CmpOpAxisInfoVisitor final : public AxisInfoVisitorImpl { contiguity.push_back(1); } - return AxisInfo(contiguity, divisibility, constancy, constantValue); + return AxisInfo(std::move(contiguity), std::move(divisibility), + std::move(constancy), constantValue); } private: @@ -840,7 +846,8 @@ class SelectOpAxisInfoVisitor final : public AxisInfoVisitorImpl { constantValue = lhsInfo.getConstantValue(); } - return AxisInfo(contiguity, divisibility, constancy, constantValue); + return AxisInfo(std::move(contiguity), std::move(divisibility), + std::move(constancy), constantValue); } }; @@ -993,7 +1000,8 @@ class MaxMinOpAxisInfoVisitor final : public AxisInfoVisitorImpl { contiguity.push_back( std::min(lhsInfo.getContiguity(d), rhsInfo.getContiguity(d))); } - return AxisInfo(contiguity, divisibility, constancy, std::nullopt); + return AxisInfo(std::move(contiguity), std::move(divisibility), + std::move(constancy), std::nullopt); } } }; @@ -1038,7 +1046,8 @@ class MakeTensorPtrOpAxisInfoVisitor final constancy.push_back(1); } - auto axisInfo = AxisInfo(contiguity, divisibility, constancy); + auto axisInfo = AxisInfo(std::move(contiguity), std::move(divisibility), + std::move(constancy)); LLVM_DEBUG({ std::string axisStr; @@ -1143,8 +1152,8 @@ LogicalResult AxisInfoAnalysis::visitOperation( auto vals = cast(attr).getValues(); newConstancy = AxisInfo::DimVectorT(vals.begin(), vals.end()); } - curr = AxisInfo(newContiguity, newDivisibility, newConstancy, - curr.getConstantValue()); + curr = AxisInfo(std::move(newContiguity), std::move(newDivisibility), + std::move(newConstancy), curr.getConstantValue()); // join all lattice elements for (auto *result : results) propagateIfChanged(result, result->join(curr)); @@ -1154,9 +1163,9 @@ LogicalResult AxisInfoAnalysis::visitOperation( void AxisInfoAnalysis::visitForOpInductionVar( scf::ForOp op, ArrayRef *> argLattices) { ProgramPoint programPoint(op); - const auto lb = + const auto &lb = getLatticeElementFor(&programPoint, op.getLowerBound())->getValue(); - const auto step = + const auto &step = getLatticeElementFor(&programPoint, op.getStep())->getValue(); AxisInfo::DimVectorT knownContiguity(1, 1); @@ -1164,7 +1173,8 @@ void AxisInfoAnalysis::visitForOpInductionVar( AxisInfo::DimVectorT knownConstancy(1, 1); knownDivisibility[0] = gcd(lb.getDivisibility(0), step.getDivisibility(0)); auto inductionVar = - AxisInfo(knownContiguity, knownDivisibility, knownConstancy); + AxisInfo(std::move(knownContiguity), std::move(knownDivisibility), + std::move(knownConstancy)); (void)argLattices[0]->join(inductionVar); } diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandDPAS.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandDPAS.cpp index 49d88dca1f..7e1e451ea8 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandDPAS.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandDPAS.cpp @@ -20,7 +20,8 @@ template class DpasMatmulLoader { ConversionPatternRewriter &rewriter, const LLVMTypeConverter *typeConverter, Location loc) : dpasLayout(dpasLayout), descTy(descTy), smemStrides(smemStrides), - multiDimWarpId(multiDimWarpId), rewriter(rewriter), loc(loc) 
{
+      multiDimWarpId(std::move(multiDimWarpId)), rewriter(rewriter),
+      loc(loc) {
     static_assert(opIdx == 0 || opIdx == 1);
 
     size_t rank = warpShape.size();

From ee755e8a24f2dae45d3eebd8050e687a82ac5c35 Mon Sep 17 00:00:00 2001
From: chengjunlu
Date: Tue, 12 Nov 2024 22:26:28 +0800
Subject: [PATCH 12/12] To load single DPAS B matrix instead of two per 2D
 block io instruction from the transposed memory (#2628)

Loading a single DPAS B matrix per 2D block IO instruction from a
column-major matrix in memory gives better performance for flash
attention. Unlike with a row-major matrix, a value returned by a single
2D transposed block IO that packs more than one DPAS B operand cannot be
used as DPAS operands directly: the values have to be shuffled in
registers before being passed to the DPAS instruction, and that shuffle
is not optimized by IGC for now.
---
 include/triton/Tools/Sys/GetEnv.hpp    |  1 +
 test/TritonIntelGPU/blockptr_load.mlir | 38 ++++++++++---------
 .../LoadStoreOpToLLVM.cpp              | 20 ++++++----
 3 files changed, 35 insertions(+), 24 deletions(-)

diff --git a/include/triton/Tools/Sys/GetEnv.hpp b/include/triton/Tools/Sys/GetEnv.hpp
index 1cc88123d0..e4360b762f 100644
--- a/include/triton/Tools/Sys/GetEnv.hpp
+++ b/include/triton/Tools/Sys/GetEnv.hpp
@@ -34,6 +34,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "TRITON_INTEL_ADVANCED_PATH",
     "TRITON_INTEL_AGGRESSIVE_DPAS_REUSE",
     "TRITON_INTEL_DO_NOT_SINK_INSTR_ACROSS_RGN",
+    "TRITON_INTEL_DISABLE_LARGE_BLOCK_SIZE_IO_FOR_TRANS_DOT_B",
     "TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT",
     "TRITON_INTEL_ENABLE_FIRST_LOAD_TO_SLM",
     "TRITON_INTEL_ENABLE_INSTR_SCHED",
diff --git a/test/TritonIntelGPU/blockptr_load.mlir b/test/TritonIntelGPU/blockptr_load.mlir
index 63bae0b4c1..ff99bbf77f 100644
--- a/test/TritonIntelGPU/blockptr_load.mlir
+++ b/test/TritonIntelGPU/blockptr_load.mlir
@@ -1,4 +1,5 @@
-// RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm
+// RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm --check-prefixes=CHECK,LARGE-BLOCK-SIZE-TRANS-B
+// RUN: TRITON_INTEL_DISABLE_LARGE_BLOCK_SIZE_IO_FOR_TRANS_DOT_B=1 triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm --check-prefixes=CHECK,SMALL-BLOCK-SIZE-TRANS-B
 
 // CHECK-DAG: llvm.func spir_funccc @_Z38intel_sub_group_f16_f16_matrix_mad_k16Dv8_sDv8_iDv8_f(vector<8xi16>, vector<8xi32>, vector<8xf32>) -> vector<8xf32> attributes {convergent, memory_effects = #llvm.memory_effects, no_unwind, will_return}
 // CHECK-DAG: llvm.func spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x2cPU3AS1viiiDv2_iPt(!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {no_unwind, will_return}
@@ -204,22 +205,25 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-war
   %c0_i32 = arith.constant 0 : i32
   %c32_i64 = arith.constant 32 : i64
   %21 = tt.make_tensor_ptr %arg0, [%c64_i64, %c64_i64], [%c1_i64, %col_stride], [%c0_i32, %c0_i32] {order = array} : >>
-  // CHECK: llvm.call spir_funccc @_Z51intel_sub_group_2d_block_read_transpose_32b_32r8x1cPU3AS1viiiDv2_iPj({{.*}}, [[DEST:%.*]]) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
-  // CHECK: %[[VAL_68:.*]] = llvm.load [[DEST]] : !llvm.ptr ->
---
 include/triton/Tools/Sys/GetEnv.hpp    |  1 +
 test/TritonIntelGPU/blockptr_load.mlir | 38 ++++++++++---------
 .../LoadStoreOpToLLVM.cpp              | 20 ++++++----
 3 files changed, 35 insertions(+), 24 deletions(-)

diff --git a/include/triton/Tools/Sys/GetEnv.hpp b/include/triton/Tools/Sys/GetEnv.hpp
index 1cc88123d0..e4360b762f 100644
--- a/include/triton/Tools/Sys/GetEnv.hpp
+++ b/include/triton/Tools/Sys/GetEnv.hpp
@@ -34,6 +34,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "TRITON_INTEL_ADVANCED_PATH",
     "TRITON_INTEL_AGGRESSIVE_DPAS_REUSE",
     "TRITON_INTEL_DO_NOT_SINK_INSTR_ACROSS_RGN",
+    "TRITON_INTEL_DISABLE_LARGE_BLOCK_SIZE_IO_FOR_TRANS_DOT_B",
     "TRITON_INTEL_ENABLE_ADDRESS_PAYLOAD_OPT",
     "TRITON_INTEL_ENABLE_FIRST_LOAD_TO_SLM",
     "TRITON_INTEL_ENABLE_INSTR_SCHED",
diff --git a/test/TritonIntelGPU/blockptr_load.mlir b/test/TritonIntelGPU/blockptr_load.mlir
index 63bae0b4c1..ff99bbf77f 100644
--- a/test/TritonIntelGPU/blockptr_load.mlir
+++ b/test/TritonIntelGPU/blockptr_load.mlir
@@ -1,4 +1,5 @@
-// RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm
+// RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm --check-prefixes=CHECK,LARGE-BLOCK-SIZE-TRANS-B
+// RUN: TRITON_INTEL_DISABLE_LARGE_BLOCK_SIZE_IO_FOR_TRANS_DOT_B=1 triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm --check-prefixes=CHECK,SMALL-BLOCK-SIZE-TRANS-B
 // CHECK-DAG: llvm.func spir_funccc @_Z38intel_sub_group_f16_f16_matrix_mad_k16Dv8_sDv8_iDv8_f(vector<8xi16>, vector<8xi32>, vector<8xf32>) -> vector<8xf32> attributes {convergent, memory_effects = #llvm.memory_effects, no_unwind, will_return}
 // CHECK-DAG: llvm.func spir_funccc @_Z41intel_sub_group_2d_block_read_16b_8r16x2cPU3AS1viiiDv2_iPt(!llvm.ptr<1> {llvm.nonnull, llvm.readonly}, i32, i32, i32, vector<2xi32>, !llvm.ptr {llvm.nonnull, llvm.writeonly}) attributes {no_unwind, will_return}
@@ -204,22 +205,25 @@ module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp
     %c0_i32 = arith.constant 0 : i32
     %c32_i64 = arith.constant 32 : i64
     %21 = tt.make_tensor_ptr %arg0, [%c64_i64, %c64_i64], [%c1_i64, %col_stride], [%c0_i32, %c0_i32] {order = array} : >>
-    // CHECK: llvm.call spir_funccc @_Z51intel_sub_group_2d_block_read_transpose_32b_32r8x1cPU3AS1viiiDv2_iPj({{.*}}, [[DEST:%.*]]) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
-    // CHECK: %[[VAL_68:.*]] = llvm.load [[DEST]] : !llvm.ptr -> vector<16xi32>
-    // CHECK: %[[VAL_69:.*]] = llvm.shufflevector %[[VAL_68]], %[[VAL_68]] [0, 2, 4, 6, 8, 10, 12, 14] : vector<16xi32>
-    // CHECK: %[[VAL_71:.*]] = llvm.shufflevector %[[VAL_68]], %[[VAL_68]] [1, 3, 5, 7, 9, 11, 13, 15] : vector<16xi32>
-    // CHECK: llvm.call spir_funccc @_Z51intel_sub_group_2d_block_read_transpose_32b_32r8x1cPU3AS1viiiDv2_iPj({{.*}}, [[DEST:%.*]]) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
-    // CHECK: %[[VAL_103:.*]] = llvm.load [[DEST]] : !llvm.ptr -> vector<16xi32>
-    // CHECK: %[[VAL_104:.*]] = llvm.shufflevector %[[VAL_103]], %[[VAL_103]] [0, 2, 4, 6, 8, 10, 12, 14] : vector<16xi32>
-    // CHECK: %[[VAL_106:.*]] = llvm.shufflevector %[[VAL_103]], %[[VAL_103]] [1, 3, 5, 7, 9, 11, 13, 15] : vector<16xi32>
-    // CHECK: llvm.call spir_funccc @_Z51intel_sub_group_2d_block_read_transpose_32b_32r8x1cPU3AS1viiiDv2_iPj({{.*}}, [[DEST:%.*]]) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
-    // CHECK: %[[VAL_138:.*]] = llvm.load [[DEST]] : !llvm.ptr -> vector<16xi32>
-    // CHECK: %[[VAL_139:.*]] = llvm.shufflevector %[[VAL_138]], %[[VAL_138]] [0, 2, 4, 6, 8, 10, 12, 14] : vector<16xi32>
-    // CHECK: %[[VAL_141:.*]] = llvm.shufflevector %[[VAL_138]], %[[VAL_138]] [1, 3, 5, 7, 9, 11, 13, 15] : vector<16xi32>
-    // CHECK: llvm.call spir_funccc @_Z51intel_sub_group_2d_block_read_transpose_32b_32r8x1cPU3AS1viiiDv2_iPj({{.*}}, [[DEST:%.*]]) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
-    // CHECK: %[[VAL_173:.*]] = llvm.load [[DEST]] : !llvm.ptr -> vector<16xi32>
-    // CHECK: %[[VAL_174:.*]] = llvm.shufflevector %[[VAL_173]], %[[VAL_173]] [0, 2, 4, 6, 8, 10, 12, 14] : vector<16xi32>
-    // CHECK: %[[VAL_176:.*]] = llvm.shufflevector %[[VAL_173]], %[[VAL_173]] [1, 3, 5, 7, 9, 11, 13, 15] : vector<16xi32>
+    // COM: One DPAS B operand per load instruction.
+    // SMALL-BLOCK-SIZE-TRANS-B-COUNT-8: llvm.call spir_funccc @_Z51intel_sub_group_2d_block_read_transpose_32b_16r8x1cPU3AS1viiiDv2_iPj({{.*}}, [[DEST:%.*]]) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
+    // COM: Two interleaved DPAS B operands per load instruction; the loaded value needs to be shuffled to decompose the VNNI-format DPAS B operands.
+    // LARGE-BLOCK-SIZE-TRANS-B: llvm.call spir_funccc @_Z51intel_sub_group_2d_block_read_transpose_32b_32r8x1cPU3AS1viiiDv2_iPj({{.*}}, [[DEST:%.*]]) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
+    // LARGE-BLOCK-SIZE-TRANS-B: %[[VAL_68:.*]] = llvm.load [[DEST]] : !llvm.ptr -> vector<16xi32>
+    // LARGE-BLOCK-SIZE-TRANS-B: %[[VAL_69:.*]] = llvm.shufflevector %[[VAL_68]], %[[VAL_68]] [0, 2, 4, 6, 8, 10, 12, 14] : vector<16xi32>
+    // LARGE-BLOCK-SIZE-TRANS-B: %[[VAL_71:.*]] = llvm.shufflevector %[[VAL_68]], %[[VAL_68]] [1, 3, 5, 7, 9, 11, 13, 15] : vector<16xi32>
+    // LARGE-BLOCK-SIZE-TRANS-B: llvm.call spir_funccc @_Z51intel_sub_group_2d_block_read_transpose_32b_32r8x1cPU3AS1viiiDv2_iPj({{.*}}, [[DEST:%.*]]) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
+    // LARGE-BLOCK-SIZE-TRANS-B: %[[VAL_103:.*]] = llvm.load [[DEST]] : !llvm.ptr -> vector<16xi32>
+    // LARGE-BLOCK-SIZE-TRANS-B: %[[VAL_104:.*]] = llvm.shufflevector %[[VAL_103]], %[[VAL_103]] [0, 2, 4, 6, 8, 10, 12, 14] : vector<16xi32>
+    // LARGE-BLOCK-SIZE-TRANS-B: %[[VAL_106:.*]] = llvm.shufflevector %[[VAL_103]], %[[VAL_103]] [1, 3, 5, 7, 9, 11, 13, 15] : vector<16xi32>
+    // LARGE-BLOCK-SIZE-TRANS-B: llvm.call spir_funccc @_Z51intel_sub_group_2d_block_read_transpose_32b_32r8x1cPU3AS1viiiDv2_iPj({{.*}}, [[DEST:%.*]]) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
+    // LARGE-BLOCK-SIZE-TRANS-B: %[[VAL_138:.*]] = llvm.load [[DEST]] : !llvm.ptr -> vector<16xi32>
+    // LARGE-BLOCK-SIZE-TRANS-B: %[[VAL_139:.*]] = llvm.shufflevector %[[VAL_138]], %[[VAL_138]] [0, 2, 4, 6, 8, 10, 12, 14] : vector<16xi32>
+    // LARGE-BLOCK-SIZE-TRANS-B: %[[VAL_141:.*]] = llvm.shufflevector %[[VAL_138]], %[[VAL_138]] [1, 3, 5, 7, 9, 11, 13, 15] : vector<16xi32>
+    // LARGE-BLOCK-SIZE-TRANS-B: llvm.call spir_funccc @_Z51intel_sub_group_2d_block_read_transpose_32b_32r8x1cPU3AS1viiiDv2_iPj({{.*}}, [[DEST:%.*]]) {{.*}} : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> ()
+    // LARGE-BLOCK-SIZE-TRANS-B: %[[VAL_173:.*]] = llvm.load [[DEST]] : !llvm.ptr -> vector<16xi32>
+    // LARGE-BLOCK-SIZE-TRANS-B: %[[VAL_174:.*]] = llvm.shufflevector %[[VAL_173]], %[[VAL_173]] [0, 2, 4, 6, 8, 10, 12, 14] : vector<16xi32>
+    // LARGE-BLOCK-SIZE-TRANS-B: %[[VAL_176:.*]] = llvm.shufflevector %[[VAL_173]], %[[VAL_173]] [1, 3, 5, 7, 9, 11, 13, 15] : vector<16xi32>
     %45 = tt.load %21 {triton_intel_gpu.block_io = "column_major"} : !tt.ptr>>
     tt.return
   }
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp
index cf2475c41b..35eb540241 100644
--- a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp
+++ b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -627,13 +627,19 @@ struct LoadOpConversion
       std::swap(tileHeight, tileWidth);
 
-      // We can decompose the matrix returned by transposed large 2d load
-      // when threads per warp < column size. Otherwise we have to load one
-      // operand per inst.
-      // Note: the tileHeight and numOperandsPer2DLoadM are the column size
-      // now.
-      numOperandsPer2DLoadM =
-          (threadsPerWarp <= tileHeight) ? repCluster[rank - 1] : 1;
+      if (triton::tools::getBoolEnv(
+              "TRITON_INTEL_DISABLE_LARGE_BLOCK_SIZE_IO_FOR_TRANS_DOT_B")) {
+        // Only load one operand per instruction on the row.
+        numOperandsPer2DLoadM = 1;
+      } else {
+        // We can decompose the matrix returned by the transposed large 2D
+        // load when threads per warp <= column size; otherwise we have to
+        // load one operand per instruction.
+        // Note: tileHeight and numOperandsPer2DLoadM now refer to the
+        // column size.
+        numOperandsPer2DLoadM =
+            (threadsPerWarp <= tileHeight) ? repCluster[rank - 1] : 1;
+      }
 
       // The transpose 2d load only support 1 operand per inst on column.
       // (vBlocks = 1)
       numOperandsPer2DloadN = 1;
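For reference, a standalone sketch of the operand-count decision introduced
in the hunk above (the function and parameter names are illustrative, not
from the patch; triton::tools::getBoolEnv is the accessor the real code
uses):

    // Sketch: number of DPAS B operands covered by one transposed 2D block
    // load along the row dimension, mirroring LoadOpConversion above.
    static int operandsPerTransposedLoadM(bool disableLargeBlockIO,
                                          int threadsPerWarp, int tileHeight,
                                          int repClusterOuter) {
      if (disableLargeBlockIO)
        return 1; // one operand per instruction when the env var is set
      // Decompose the large load only when the sub-group size does not
      // exceed the column size (tileHeight after the swap).
      return (threadsPerWarp <= tileHeight) ? repClusterOuter : 1;
    }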