diff --git a/.github/README.md b/.github/README.md
new file mode 100644
index 0000000000..0fc64b2764
--- /dev/null
+++ b/.github/README.md
@@ -0,0 +1,324 @@
+[![Build and test](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/build-test.yml/badge.svg?branch=main)](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/build-test.yml)
+[![Triton wheels](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/nightly-wheels.yml/badge.svg?branch=main)](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/nightly-wheels.yml)
+
+# Intel® XPU Backend for Triton\*
+
+This is the development repository of Intel® XPU Backend for Triton\*, a new [Triton](https://github.com/triton-lang/triton) backend for Intel GPUs.
+Intel® XPU Backend for Triton\* is an out-of-tree backend module for [Triton](https://github.com/triton-lang/triton) that provides best-in-class performance and productivity on Intel GPUs for [PyTorch](https://github.com/pytorch/pytorch) and standalone usage.
+
+# Compatibility
+
+* Operating systems:
+  * [Ubuntu 22.04](http://releases.ubuntu.com/22.04)
+* GPU Cards:
+  * [Intel® Data Center GPU Max Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html)
+  * [Intel® Data Center Flex Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/flex-series.html)
+  * [Intel Arc A770](https://www.intel.com/content/www/us/en/products/sku/229151/intel-arc-a770-graphics-16gb/specifications.html)
+* GPU Drivers:
+  * Latest [Long Term Support (LTS) Release](https://dgpu-docs.intel.com/driver/installation.html)
+  * Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html)
+* Toolchain:
+  * Latest [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html)
+
+Note that Intel® XPU Backend for Triton\* is not compatible with Intel® Extension for PyTorch\* and Intel® oneAPI Base Toolkit\*.
+
+# Quick Installation
+
+## Prerequisites
+
+1. Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html) or [Long Term Support Release](https://dgpu-docs.intel.com/driver/installation.html) of GPU driver
+2. Latest release of [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html)
+3. Latest release of [Profiling Tools Interfaces for Intel GPU (PTI for GPU)](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html)
+
+## Install PyTorch and Triton from nightly wheels
+
+Currently, Intel® XPU Backend for Triton\* requires a special version of PyTorch; both can be installed from nightly wheels.
+Navigate to the [nightly wheels workflow](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/nightly-wheels.yml),
+select the most recent successful run at the top of the page and download an artifact for the corresponding Python version.
+Extract the archive and, in the extracted directory, execute:
+
+```shell
+pip install torch-*.whl triton-*.whl
+```
+
+Before using Intel® XPU Backend for Triton\* you need to initialize the toolchain.
+The default location is `/opt/intel/oneapi` (if installed as a `root` user) or `~/intel/oneapi` (if installed as a regular user).
+
+```shell
+# replace /opt/intel/oneapi with the actual location of PyTorch Prerequisites for Intel GPUs
+source /opt/intel/oneapi/setvars.sh
+```
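+
+As a quick sanity check of the installation, you can query the XPU device from Python. This is only a minimal sketch; it assumes a PyTorch build with XPU support that exposes the `torch.xpu` module and that the toolchain above has been sourced:
+
+```shell
+# Verify that PyTorch sees an XPU device and that Triton imports cleanly
+python -c "import torch; print(torch.xpu.is_available(), torch.xpu.device_count())"
+python -c "import triton; print(triton.__version__)"
+```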
+
+# Install from source
+
+## Prerequisites
+
+1. Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html) or [Long Term Support Release](https://dgpu-docs.intel.com/driver/installation.html) of GPU driver
+2. Latest release of [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html)
+3. Latest release of [Profiling Tools Interfaces for Intel GPU (PTI for GPU)](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html)
+
+## Compile PyTorch and Triton from source
+
+Currently, Intel® XPU Backend for Triton\* requires a special version of PyTorch; both need to be compiled at the same time.
+
+Before compiling PyTorch and Intel® XPU Backend for Triton\* you need to initialize the toolchain.
+The default location is `/opt/intel/oneapi` (if installed as a `root` user) or `~/intel/oneapi` (if installed as a regular user).
+
+```shell
+# replace /opt/intel/oneapi with the actual location of PyTorch Prerequisites for Intel GPUs
+source /opt/intel/oneapi/setvars.sh
+```
+
+Clone this repository:
+
+```shell
+git clone https://github.com/intel/intel-xpu-backend-for-triton.git
+cd intel-xpu-backend-for-triton
+```
+
+To avoid potential conflicts with installed packages, it is recommended to create and activate a new Python virtual environment:
+
+```shell
+python -m venv .venv --prompt triton
+source .venv/bin/activate
+```
+
+Compile and install PyTorch:
+
+```shell
+scripts/install-pytorch.sh --source
+```
+
+Compile and install Intel® XPU Backend for Triton\*:
+
+```shell
+scripts/compile-triton.sh
+```
+
+# Building with a custom LLVM
+
+Triton uses LLVM to generate code for GPUs and CPUs. Normally, the Triton build
+downloads a prebuilt LLVM, but you can also build LLVM from source and use that.
+
+LLVM does not have a stable API, so the Triton build will not work at an
+arbitrary LLVM version.
+
+1. Find the version of LLVM that Triton builds against.
+Check `cmake/llvm-hash.txt` to see the current version.
+
+2. Check out LLVM at this revision into the directory `llvm`,
+which must be in the same parent directory as `intel-xpu-backend-for-triton`.
+
+3. In the directory `intel-xpu-backend-for-triton`, build Triton with the custom LLVM:
+
+   ```shell
+   ./scripts/compile-triton.sh --llvm --triton
+   ```
+
+# Tips for building
+
+- Set `TRITON_BUILD_WITH_CLANG_LLD=true` as an environment variable to use clang
+  and lld. lld in particular results in faster builds (a combined example follows this list).
+
+- Set `TRITON_BUILD_WITH_CCACHE=true` to build with ccache.
+
+- Set `TRITON_HOME=/some/path` to change the location of the `.triton`
+  directory where Triton's cache is located and downloads are stored
+  during the build. By default, this is the user's home directory. It
+  can be changed anytime.
+
+- Pass `--no-build-isolation` to `pip install` to make nop builds faster.
+  Without this, every invocation of `pip install` uses a different symlink to
+  cmake, and this forces ninja to rebuild most of the `.a` files.
+
+- VSCode IntelliSense has some difficulty figuring out how to build Triton's C++
+  (probably because, in our build, users don't invoke cmake directly, but
+  instead use setup.py). Teach VSCode how to compile Triton as follows.
+
+  - Do a local build, e.g. by running `pip install -e python`.
+  - Get the full path to the `compile_commands.json` file produced by the build:
+    `find python/build -name 'compile_commands.json' | xargs readlink -f`.
+    You might get a full path similar to `/Users/{username}/triton/python/build/cmake.macosx-11.1-arm64-cpython-3.12/compile_commands.json`
+  - In VSCode, install the
+    [C/C++
+    extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode.cpptools),
+    then open the command palette (`Shift + Command + P` on Mac, or `Shift +
+    Ctrl + P` on Windows/Linux) and open `C/C++: Edit Configurations (UI)`.
+  - Open "Advanced Settings" and paste the full path to
+    `compile_commands.json` into the "Compile Commands" textbox.
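+
+As a concrete illustration of the build tips above, a faster local rebuild might look like the following sketch. The environment variables are the ones documented in this list; the cache path is only an example:
+
+```shell
+# Illustrative only: combine the documented build switches for a faster edit-compile cycle
+export TRITON_BUILD_WITH_CLANG_LLD=true   # use clang and lld
+export TRITON_BUILD_WITH_CCACHE=true      # reuse previous compilation results
+export TRITON_HOME=/tmp/triton-home       # example location for the .triton cache and downloads
+pip install -e python --no-build-isolation
+```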
+
+# Running tests
+
+There currently isn't a turnkey way to run all the Triton tests, but you can
+use the following recipe.
+
+```shell
+scripts/test-triton.sh
+```
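+
+If you only need a subset of the tests, the Python unit tests can usually be run directly with `pytest`. This is a sketch that assumes the editable install described above and a working XPU device; the exact test path may differ:
+
+```shell
+# Run only the core language unit tests on the local GPU
+python -m pytest python/test/unit/language/test_core.py -v
+```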
+
+# Tips for hacking
+
+For detailed instructions on how to debug Triton's frontend, please refer to this [tutorial](https://triton-lang.org/main/programming-guide/chapter-3/debugging.html). The following includes additional tips for hacking on Triton's backend.
+
+**Helpful environment variables**
+
+- `MLIR_ENABLE_DUMP=1` dumps the IR before every MLIR pass Triton runs, for all
+  kernels. Use `MLIR_ENABLE_DUMP=kernelName` to dump for a specific kernel only (see the combined example after this list).
+  - The Triton cache can interfere with the dump. In cases where `MLIR_ENABLE_DUMP=1` does not work, try cleaning your Triton cache: `rm -r ~/.triton/cache/*`
+- `LLVM_IR_ENABLE_DUMP=1` dumps the IR before every pass run over the LLVM IR.
+- `TRITON_INTERPRET=1` uses the Triton interpreter instead of running on the
+  GPU. You can insert Python breakpoints in your kernel code!
+- `TRITON_ENABLE_LLVM_DEBUG=1` passes `-debug` to LLVM, printing a lot of
+  debugging information to stdout. If this is too noisy, run with just
+  `TRITON_LLVM_DEBUG_ONLY` instead to limit the output.
+
+  An alternative way to reduce output noisiness is to run with
+  `LLVM_IR_ENABLE_DUMP=1`, extract the IR before the LLVM pass of interest, and
+  then run LLVM's `opt` standalone, perhaps passing `-debug-only=foo` on the
+  command line.
+- `TRITON_LLVM_DEBUG_ONLY=<comma-separated>` is the equivalent of LLVM's
+  `-debug-only` command-line option. This limits the LLVM debug output to
+  specific pass or component names (which are specified using `#define
+  DEBUG_TYPE` throughout LLVM and Triton) in order to allow the debug output to
+  be less noisy. `TRITON_LLVM_DEBUG_ONLY` allows for one or more comma-separated
+  values to be specified (e.g.
+  `TRITON_LLVM_DEBUG_ONLY="tritongpu-remove-layout-conversions"` or
+  `TRITON_LLVM_DEBUG_ONLY="tritongpu-remove-layout-conversions,regalloc"`).
+- `USE_IR_LOC={ttir,ttgir}` reparses the IR such that the location information
+  will be the line number of the IR file with that particular extension,
+  instead of the line number of the Python file. This can provide a direct mapping
+  from the IR to llir/ptx. When used with performance tools, it can provide a
+  breakdown by IR instruction.
+- `TRITON_PRINT_AUTOTUNING=1` prints out the best autotuning config and total time
+  spent for each kernel after autotuning is complete.
+- `DISABLE_LLVM_OPT` disables LLVM optimizations for make_llir and make_ptx
+  if its value is true when parsed as a Bool. Otherwise, it is parsed as a list
+  of flags for disabling individual LLVM optimizations. One use case is
+  `DISABLE_LLVM_OPT="disable-lsr"`:
+  loop strength reduction is known to cause up to 10% performance changes for
+  certain kernels with register pressure.
+- `TRITON_ALWAYS_COMPILE=1` forces kernels to be compiled regardless of cache hits.
+- `MLIR_ENABLE_TIMING` dumps the timing information for each MLIR pass.
+- `LLVM_ENABLE_TIMING` dumps the timing information for each LLVM pass.
+- `TRITON_DEFAULT_FP_FUSION` overrides the default behavior of allowing fp fusion (mul+add->fma).
+- `MLIR_ENABLE_REMARK` enables the performance warnings that are emitted as remarks.
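+
+For example, a debugging session that combines several of the variables above might look like the following sketch. The environment variables are the documented ones; the kernel name and script are placeholders:
+
+```shell
+# Clear the cache so the dump is not skipped, then dump the IR for one kernel only
+rm -rf ~/.triton/cache
+MLIR_ENABLE_DUMP=add_kernel TRITON_ALWAYS_COMPILE=1 MLIR_ENABLE_TIMING=1 python my_script.py
+```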
+
+# Usage Guide
+
+## Code Modifications
+Intel® XPU Backend for Triton\* requires a special version of PyTorch that can be built from sources or installed from nightly wheels.
+
+1. Add `import torch` to enable XPU support.
+2. Move tensors and models to the XPU by calling `to('xpu')`.
+
+This repository contains modified [tutorials](https://github.com/intel/intel-xpu-backend-for-triton/tree/main/python/tutorials) that must be used with Intel® XPU Backend for Triton\*.
+
+The following examples show the required modifications to user code.
+
+### Example 1 : Triton Kernel
+
+This example is a modified version of the [Vector Add](https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html#vector-addition) Triton kernel. Please refer to [Vector Add](https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html#vector-addition) for detailed comments and an explanation of the code semantics.
+
+Compared to the original code, the following modifications are made:
+
+```Python
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def add_kernel(
+    x_ptr,
+    y_ptr,
+    output_ptr,
+    n_elements,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    block_start = pid * BLOCK_SIZE
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+    x = tl.load(x_ptr + offsets, mask=mask)
+    y = tl.load(y_ptr + offsets, mask=mask)
+    output = x + y
+    tl.store(output_ptr + offsets, output, mask=mask)
+
+def add(x: torch.Tensor, y: torch.Tensor):
+    # Put the output tensor on the XPU
+    output = torch.empty_like(x).xpu()
+    assert x.is_xpu and y.is_xpu and output.is_xpu
+    n_elements = output.numel()
+    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
+    add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
+
+    return output
+
+# For manual_seed, use the XPU-specific API
+torch.xpu.manual_seed(0)
+size = 512
+# Tensors need to be placed on the XPU
+x = torch.rand(size, device='xpu')
+y = torch.rand(size, device='xpu')
+output_torch = x + y
+output_triton = add(x, y)
+print(output_torch)
+print(output_triton)
+print(
+    f'The maximum difference between torch and triton is '
+    f'{torch.max(torch.abs(output_torch - output_triton))}'
+)
+```
+
+### Example 2 : End-to-End Model
+Triton is transparent to end-to-end models. One can simply use `torch.compile` with the default `inductor` backend, which automatically generates Triton kernels and benefits from them.
+
+```Python
+import torch
+from torch._dynamo.testing import rand_strided
+
+from torch.nn import *
+class simpleModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        # tensors inside the model should be on the XPU
+        self.y = rand_strided((32, 8), (8, 1), device='xpu:0', dtype=torch.float32)
+
+    def forward(self, x):
+        z = x + self.y
+        return z
+
+# tensors passed to the model should be on the XPU
+x = rand_strided((32, 8), (8, 1), device='xpu:0', dtype=torch.float32)
+xpu_model = simpleModel()
+# Call torch.compile for optimization
+optimized_mod = torch.compile(xpu_model)
+
+graph_result = optimized_mod(x)
+```
+
+## Performance Analysis Guide
+
+There are several ways of doing performance analysis.
+We recommend using `torch.profiler` for end-to-end performance analysis and using Intel® VTune™ Profiler for more detailed kernel analysis.
+Note that `TRITON_XPU_PROFILE=1` must be set explicitly to enable kernel profiling.
+
+```Bash
+export TRITON_XPU_PROFILE=1
+```
+
+# Contributing
+
+Community contributions are more than welcome, whether to fix bugs or to add new features, at [GitHub](https://github.com/intel/intel-xpu-backend-for-triton). For more detailed instructions, please visit our [contributor's guide](https://github.com/intel/intel-xpu-backend-for-triton/blob/main/CONTRIBUTING.md).
+
+## License
+
+_MIT License_. As found in [LICENSE](https://github.com/intel/intel-xpu-backend-for-triton/blob/main/LICENSE) file.
+
+
+## Security
+
+See Intel's [Security Center](https://www.intel.com/content/www/us/en/security-center/default.html)
+for information on how to report a potential security issue or vulnerability.
+
+See also: [Security Policy](https://github.com/intel/intel-xpu-backend-for-triton/blob/main/SECURITY.md).
diff --git a/.github/pins/pytorch-upstream.txt b/.github/pins/pytorch-upstream.txt index c2ce8b1a5f..0d9e3cab75 100644 --- a/.github/pins/pytorch-upstream.txt +++ b/.github/pins/pytorch-upstream.txt @@ -1 +1 @@ -33dce10ece5b38aa0ab76739b658cd980a6e3d8f +51e8a13d007b3032af45facb50dfa4ee6012f22a diff --git a/.github/workflows/auto-update-translator-cid.yml b/.github/workflows/auto-update-translator-cid.yml index 0b854da307..7c2aad266c 100644 --- a/.github/workflows/auto-update-translator-cid.yml +++ b/.github/workflows/auto-update-translator-cid.yml @@ -15,7 +15,7 @@ jobs: runs-on: - max1100 - rolling - - runner-0.0.19 + - runner-0.0.20 defaults: run: shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}" diff --git a/.github/workflows/bandit-check.yml b/.github/workflows/bandit-check.yml index 9edaad048e..9d20af8055 100644 --- a/.github/workflows/bandit-check.yml +++ b/.github/workflows/bandit-check.yml @@ -11,7 +11,7 @@ jobs: runs-on: - max1100 - rolling - - runner-0.0.19 + - runner-0.0.20 defaults: run: shell: bash diff --git a/.github/workflows/build-test-reusable.yml b/.github/workflows/build-test-reusable.yml index aca94112b9..0c3598c679 100644 --- a/.github/workflows/build-test-reusable.yml +++ b/.github/workflows/build-test-reusable.yml @@ -55,7 +55,7 @@ on: runner_version: description: Runner label for version type: string - default: runner-0.0.19 + default: runner-0.0.20 permissions: read-all diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index d3d9b29b56..84d09afa5d 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -76,12 +76,6 @@ jobs: run: | set -x pip install --upgrade pre-commit - - # TODO: ignore the first yapf failure until https://github.com/google/yapf/issues/1164 is fixed - python3 -m pre_commit run --all-files --verbose yapf &> /dev/null || true - # If first run of yapf worked and made changes reset the tree to the original state - git reset --hard - python3 -m pre_commit run --show-diff-on-failure --color=always --all-files --verbose - name: Save pip cache diff --git a/.github/workflows/conda-test-reusable.yml b/.github/workflows/conda-test-reusable.yml index 11989e8587..d81f365489 100644 --- a/.github/workflows/conda-test-reusable.yml +++ b/.github/workflows/conda-test-reusable.yml @@ -55,7 +55,7 @@ on: runner_version: description: Runner label for version type: string - default: runner-0.0.19 + default: runner-0.0.20 env_manager: description: Environment manager default: conda @@ -82,7 +82,7 @@ jobs: id: conda-cache uses: ./.github/actions/load env: - CACHE_NUMBER: 6 + CACHE_NUMBER: 7 with: path: $HOME/miniforge3/envs/triton key: conda-${{ inputs.env_manager }}-py${{ matrix.python }}-${{ hashFiles('scripts/triton.yml', 'python/pyproject.toml', 'python/setup.py') }}-${{ env.CACHE_NUMBER }} diff --git a/.github/workflows/nightly-wheels.yml b/.github/workflows/nightly-wheels.yml index d4b236e43d..832fafce79 100644 --- a/.github/workflows/nightly-wheels.yml +++ b/.github/workflows/nightly-wheels.yml @@ -16,7 +16,7 @@ jobs: runs-on: - max1100 - rolling - - runner-0.0.19 + - runner-0.0.20 strategy: matrix: python: diff --git a/.github/workflows/no-basekit-build-test.yml b/.github/workflows/no-basekit-build-test.yml index dfa0313cbb..3c6e5a42df 100644 --- a/.github/workflows/no-basekit-build-test.yml +++ b/.github/workflows/no-basekit-build-test.yml @@ -17,7 +17,7 @@ jobs: runs-on: - max1100 - rolling - - runner-0.0.19 + - runner-0.0.20 strategy: 
matrix: python: ${{ github.ref_name == 'main' && fromJson('["3.9", "3.10", "3.11"]') || fromJson('["3.9"]') }} diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index 94e419646b..2a5f9937f6 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -24,6 +24,10 @@ on: description: Run name type: string default: "Triton benchmarks" + skip_benchmarks: + description: JSON list of benchmarks to skip + type: string + default: "[]" schedule: - cron: "5 23 * * *" pull_request: @@ -112,7 +116,7 @@ jobs: python setup.py install - name: Run Triton Softmax kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }} run: | cd benchmarks/triton_kernels_benchmark python fused_softmax.py --reports $REPORTS @@ -121,7 +125,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-xetla-report.csv --benchmark softmax --compiler xetla --param_cols "N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG - name: Run Triton GEMM kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }} run: | cd benchmarks/triton_kernels_benchmark python gemm_benchmark.py --reports $REPORTS @@ -132,7 +136,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-xetla-report.csv --benchmark gemm --compiler xetla --param_cols "B,M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG - name: Run Triton GEMM kernel benchmark - default path - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_default') }} run: | cd benchmarks/triton_kernels_benchmark # Default path: @@ -148,7 +152,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-default-path.csv $REPORTS/gemm-triton-default-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton GEMM kernel benchmark - advanced path - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_advanced') }} run: | cd benchmarks/triton_kernels_benchmark # Advanced path: @@ -164,7 +168,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-adv-path.csv $REPORTS/gemm-triton-advanced-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton GEMM (A@B^t) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }} run: | cd benchmarks/triton_kernels_benchmark TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS @@ -175,7 +179,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-onednn-report.csv --benchmark gemm-bt --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops 
--hbm_col "onednn-GB/s" --tag $TAG - name: Run Triton GEMM (A^t@B) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }} run: | cd benchmarks/triton_kernels_benchmark TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS @@ -186,7 +190,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-onednn-report.csv --benchmark gemm-at --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG - name: Run Triton GEMM (stream-k) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }} run: | cd benchmarks/triton_kernels_benchmark python gemm_streamk_benchmark.py --reports $REPORTS @@ -194,7 +198,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-streamk-performance.csv $REPORTS/gemm-streamk-triton-report.csv --benchmark gemm-streamk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton GEMM (split-k) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }} run: | cd benchmarks/triton_kernels_benchmark python gemm_splitk_benchmark.py --reports $REPORTS @@ -202,7 +206,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-splitk-performance.csv $REPORTS/gemm-splitk-triton-report.csv --benchmark gemm-splitk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton GEMM + PreOp (exp) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }} run: | cd benchmarks/triton_kernels_benchmark python gemm_preop_exp_benchmark.py --reports $REPORTS @@ -210,7 +214,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-preop-exp.csv $REPORTS/gemm-preop-exp-triton-report.csv --benchmark gemm-preop-exp --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton GEMM + PostOp (Gelu) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }} run: | cd benchmarks/triton_kernels_benchmark python gemm_postop_gelu_benchmark.py --reports $REPORTS @@ -218,7 +222,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-gelu.csv $REPORTS/gemm-postop-gelu-triton-report.csv --benchmark gemm-postop-gelu --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark.py') }} run: | cd benchmarks/triton_kernels_benchmark python 
gemm_postop_addmatrix_benchmark.py --reports $REPORTS @@ -226,7 +230,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-addmatrix.csv $REPORTS/gemm-postop-addmatrix-triton-report.csv --benchmark gemm-postop-addmatrix --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton FA kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_fwd_benchmark.py') }} run: | cd benchmarks/triton_kernels_benchmark python flash_attention_fwd_benchmark.py --reports $REPORTS @@ -236,7 +240,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-report.csv --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG - name: Run Triton FA kernel benchmark - default path - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmark || '[]'), 'flash_attention_fwd_benchmark.py_default') }} run: | cd benchmarks/triton_kernels_benchmark TRITON_INTEL_ADVANCED_PATH=0 \ @@ -249,7 +253,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-default-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton FA kernel benchmark - advanced path - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_fwd_benchmark.py_advanced') }} run: | cd benchmarks/triton_kernels_benchmark TRITON_INTEL_ADVANCED_PATH=1 \ @@ -262,7 +266,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-advanced-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Prefix Sums kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }} run: | cd benchmarks/triton_kernels_benchmark python prefix_sums.py --reports $REPORTS @@ -270,7 +274,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/prefix-sums.csv $REPORTS/prefix_sums-triton-report.csv --benchmark prefix_sums --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run micro benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }} run: | cd benchmarks/micro_benchmarks python run_benchmarks.py --reports $REPORTS diff --git a/README.md b/README.md index a8bbe2c2e1..18e46403e7 100644 --- a/README.md +++ b/README.md @@ -1,98 +1,60 @@ -[![Build and test](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/build-test.yml/badge.svg?branch=main)](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/build-test.yml) -[![Triton 
wheels](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/nightly-wheels.yml/badge.svg?branch=main)](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/nightly-wheels.yml) -[![Conda test](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/conda-build-test.yml/badge.svg?branch=main)](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/conda-build-test.yml) +
+ Triton logo +
-# Intel® XPU Backend for Triton\* +The Triton Conference is happening again on September 17th, 2024 in Fremont (CA)! -This is the development repository of Intel® XPU Backend for Triton\*, a new [Triton](https://github.com/triton-lang/triton/) backend for Intel GPUs. Intel® XPU Backend for Triton\* is a out of tree backend module for [Triton](https://github.com/triton-lang/triton/blob/main/CONTRIBUTING.md) used to provide best-in-class performance and productivity on any Intel GPUs for [PyTorch](https://github.com/triton-lang/triton/blob/main/CONTRIBUTING.md) and standalone usage. +If you are interested in attending, please fill up [this form](https://docs.google.com/forms/d/e/1FAIpQLSecHC1lkalcm0h3JDUbspekDX5bmBvMxgVTLaK3e-61bzDDbg/viewform). -# Compatibility - -* Operating systems: - * [Ubuntu 22.04](http://releases.ubuntu.com/22.04) -* GPU Cards: - * [Intel® Data Center GPU Max Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html) - * [Intel® Data Center Flex Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/flex-series.html) - * [Intel Arc A770](https://www.intel.com/content/www/us/en/products/sku/229151/intel-arc-a770-graphics-16gb/specifications.html) -* GPU Drivers: - * Latest [Long Term Support (LTS) Release](https://dgpu-docs.intel.com/driver/installation.html) - * Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html) -* Toolchain: - * Latest [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) +| **`Documentation`** | **`Nightly Wheels`** | +|-------------------- | -------------------- | +| [![Documentation](https://github.com/triton-lang/triton/actions/workflows/documentation.yml/badge.svg)](https://triton-lang.org/) | [![Wheels](https://github.com/triton-lang/triton/actions/workflows/wheels.yml/badge.svg?branch=release/2.0.x)](https://github.com/triton-lang/triton/actions/workflows/wheels.yml) | -Note that Intel® XPU Backend for Triton\* is not compatible with Intel® Extension for PyTorch\* and Intel® oneAPI Base Toolkit\*. - -# Quick Installation +# Triton -## Prerequisites +This is the development repository of Triton, a language and compiler for writing highly efficient custom Deep-Learning primitives. The aim of Triton is to provide an open-source environment to write fast code at higher productivity than CUDA, but also with higher flexibility than other existing DSLs. -1. Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html) or [Long Term Support Release](https://dgpu-docs.intel.com/driver/installation.html) of GPU driver -2. Latest release of [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) -3. Latest release of [Profiling Tools Interfaces for Intel GPU (PTI for GPU)](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) +The foundations of this project are described in the following MAPL2019 publication: [Triton: An Intermediate Language and Compiler for Tiled Neural Network Computations](http://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf). Please consider citing this work if you use Triton! 
-## Install PyTorch and Triton from nightly wheels +The [official documentation](https://triton-lang.org) contains installation instructions and tutorials. See also these third-party [Triton puzzles](https://github.com/srush/Triton-Puzzles), which can all be run using the Triton interpreter -- no GPU required. -Currently, Intel® XPU Backend for Triton\* requires a special version of PyTorch and both can be installed from nightly wheels. -Navigate to the [nightly wheels workflow](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/nightly-wheels.yml), -select the most recent successful run on the top of the page and download an artifact for the corresponding Python version. -Extract the archive and in the extracted directory execute: - -```shell -pip install torch-*.whl triton-*.whl -``` +# Quick Installation -Before using Intel® XPU Backend for Triton\* you need to initialize the toolchain. -The default location is `/opt/intel/oneapi` (if installed as a `root` user) or `~/intel/oneapi` (if installed as a regular user). +You can install the latest stable release of Triton from pip: ```shell -# replace /opt/intel/oneapi with the actual location of PyTorch Prerequisites for Intel GPUs -source /opt/intel/oneapi/setvars.sh +pip install triton ``` -# Install from source - -## Prerequisites - -1. Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html) or [Long Term Support Release](https://dgpu-docs.intel.com/driver/installation.html) of GPU driver -2. Latest release of [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) -3. Latest release of [Profiling Tools Interfaces for Intel GPU (PTI for GPU)](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) - -## Compile PyTorch and Triton from source - -Currently, Intel® XPU Backend for Triton\* requires a special version of PyTorch and both need to be compiled at the same time. +Binary wheels are available for CPython 3.8-3.12 and PyPy 3.8-3.9. -Before compiling PyTorch and Intel® XPU Backend for Triton\* you need to initialize the toolchain. -The default location is `/opt/intel/oneapi` (if installed as a `root` user) or `~/intel/oneapi` (if installed as a regular user). 
+And the latest nightly release: ```shell -# replace /opt/intel/oneapi with the actual location of PyTorch Prerequisites for Intel GPUs -source /opt/intel/oneapi/setvars.sh +pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly ``` -Clone this repository: +# Install from source ```shell -git clone https://github.com/intel/intel-xpu-backend-for-triton.git -cd intel-xpu-backend-for-triton -``` - -To avoid potential conflicts with installed packages it is recommended to create and activate a new Python virtual environment: +git clone https://github.com/triton-lang/triton.git; +cd triton; -```shell -python -m venv .venv --prompt triton -source .venv/bin/activate +pip install ninja cmake wheel pybind11; # build-time dependencies +pip install -e python ``` -Compile and install PyTorch: +Or with a virtualenv: ```shell -scripts/install-pytorch.sh --source -``` +git clone https://github.com/triton-lang/triton.git; +cd triton; -Compile and install Intel® XPU Backend for Triton\*: +python -m venv .venv --prompt triton; +source .venv/bin/activate; -```shell -scripts/compile-triton.sh +pip install ninja cmake wheel pybind11; # build-time dependencies +pip install -e python ``` # Building with a custom LLVM @@ -103,17 +65,36 @@ downloads a prebuilt LLVM, but you can also build LLVM from source and use that. LLVM does not have a stable API, so the Triton build will not work at an arbitrary LLVM version. -1. Find the version of LLVM that Triton builds against. -Check `cmake/llvm-hash.txt` to see the current version. +1. Find the version of LLVM that Triton builds against. Check +`cmake/llvm-hash.txt` to see the current version. For example, if it says: + 49af6502c6dcb4a7f7520178bd14df396f78240c -2. Checkout LLVM at this revision to the directory `llvm`, -which must be in the same directory as `intel-xpu-backend-for-triton`: + This means that the version of Triton you have builds against + [LLVM](https://github.com/llvm/llvm-project) 49af6502. -3. In the directory `intel-xpu-backend-for-triton`, build Triton with custom LLVM: +2. `git checkout` LLVM at this revision. Optionally, make additional + modifications to LLVM. - ```shell - ./scripts/compile-triton.sh --llvm --triton - ``` +3. [Build LLVM](https://llvm.org/docs/CMake.html). For example, you might run + + $ cd $HOME/llvm-project # your clone of LLVM. + $ mkdir build + $ cd build + $ cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON ../llvm -DLLVM_ENABLE_PROJECTS="mlir;llvm" -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU" + $ ninja + +4. Grab a snack, this will take a while. + +5. Build Triton as above, but set the following environment variables. + + # Modify as appropriate to point to your LLVM build. + $ export LLVM_BUILD_DIR=$HOME/llvm-project/build + + $ cd + $ LLVM_INCLUDE_DIRS=$LLVM_BUILD_DIR/include \ + LLVM_LIBRARY_DIR=$LLVM_BUILD_DIR/lib \ + LLVM_SYSPATH=$LLVM_BUILD_DIR \ + pip install -e python # Tips for building @@ -131,7 +112,7 @@ which must be in the same directory as `intel-xpu-backend-for-triton`: Without this, every invocation of `pip install` uses a different symlink to cmake, and this forces ninja to rebuild most of the `.a` files. -- VSCcode IntelliSense has some difficulty figuring out how to build Triton's C++ +- vscode intellisense has some difficulty figuring out how to build Triton's C++ (probably because, in our build, users don't invoke cmake directly, but instead use setup.py). Teach vscode how to compile Triton as follows. 
@@ -153,7 +134,36 @@ There currently isn't a turnkey way to run all the Triton tests, but you can follow the following recipe. ```shell -scripts/test-triton.sh +# One-time setup. Note we have to reinstall local Triton because torch +# overwrites it with the public version. +$ pip install scipy numpy torch pytest lit pandas matplotlib && pip install -e python + +# Run Python tests using your local GPU. +$ python3 -m pytest python/test/unit + +# Move to builddir. Fill in <...> with the full path, e.g. +# `cmake.linux-x86_64-cpython-3.11`. +$ cd python/build/cmake<...> + +# Run C++ unit tests. +$ ctest -j32 + +# Run lit tests. +$ lit test +``` + +You may find it helpful to make a symlink to the builddir and tell your local +git to ignore it. + +```shell +$ ln -s python/build/cmake<...> build +$ echo build >> .git/info/exclude +``` + +Then you can e.g. rebuild and run lit with the following command. + +```shell +$ ninja -C build && ( cd build ; lit test ) ``` # Tips for hacking @@ -203,124 +213,27 @@ For detailed instructions on how to debug Triton's frontend, please refer to thi - `TRITON_DEFAULT_FP_FUSION` overrides the default behavior of allowing fp fusion (mul+add->fma). - `MLIR_ENABLE_REMARK` enables the performance warnings that are emitted as remarks. -# Usage Guide - -## Code Modifications -Intel® XPU Backend for Triton\* requires a special version of PyTorch that can be built from sources or installed from nightly wheels. - -1. Add `import torch` for xpu support. -2. Put the tensor and models to XPU by calling `to('xpu')`. - -This repository contains modified [tutorials](python/tutorials) that must be used with Intel® XPU Backend for Triton\*. - -The following examples show modifications for the user code. - -### Example 1 : Triton Kernel - -This example is a modified version of [Vector Add](https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html#vector-addition) triton kernel. Please refer to [Vector Add](https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html#vector-addition) for detailed comments and illustration about the code semantics. - -Comparing to the original code, the following code modifies: - -```Python -import torch -import triton -import triton.language as tl - - -@triton.jit -def add_kernel( - x_ptr, - y_ptr, - output_ptr, - n_elements, - BLOCK_SIZE: tl.constexpr, -): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - x = tl.load(x_ptr + offsets, mask=mask) - y = tl.load(y_ptr + offsets, mask=mask) - output = x + y - tl.store(output_ptr + offsets, output, mask=mask) - -def add(x: torch.Tensor, y: torch.Tensor): - # Put the tensor to xpu - output = torch.empty_like(x).xpu() - assert x.is_xpu and y.is_xpu and output.is_xpu - n_elements = output.numel() - grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) - add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024) - - return output - -# For manual_seed, needs to use API for XPU -torch.xpu.manual_seed(0) -size = 512 -# For tensors, needs to be put on XPU -x = torch.rand(size, device='xpu') -y = torch.rand(size, device='xpu') -output_torch = x + y -output_triton = add(x, y) -print(output_torch) -print(output_triton) -print( - f'The maximum difference between torch and triton is ' - f'{torch.max(torch.abs(output_torch - output_triton))}' -) -``` - -### Example 2 : End-to-End Model -Triton is transparent for end-to-end models. 
One could easily use `torch.compile` with `inductor` as backend by default. It will automatically generates triton kernel and gets benefit from it. - -```Python -import torch -from torch._dynamo.testing import rand_strided - -from torch.nn import * -class simpleModel(torch.nn.Module): - def __init__(self): - super().__init__() - # tensors inside model should be on xpu - self.y = rand_strided((32, 8), (8, 1), device='xpu:0', dtype=torch.float32) - - def forward(self, x): - z = x + self.y - return z - -# tensors passed to the model should be on xpu -x = rand_strided((32, 8), (8, 1), device='xpu:0', dtype=torch.float32) -xpu_model = simpleModel() -# Call torch.compile for optimization -optimized_mod = torch.compile(xpu_model) - -graph_result = optimized_mod(x) -``` - -## Performance Analysis Guide - -There are several ways of doing performance analysis. We recommend using `torch.profiler` for end-to-end performance analysis and using Intel® VTune™ Profiler for more detailed kernel analysis. We provide a comprehensive guide for those two: -1. [end_to_end_tests#profiling settings](docs/test_docs/end_to_end_tests.md#profiling-settings) section for using `torch.profiler`. -2. [VTune Profiling Guide](docs/VTune_Profiling.md) for kernel analysis. +# Changelog -Note that the user needs to explicitly set `TRITON_XPU_PROFILE=1` when the user needs to enable kernel profiling. +Version 2.0 is out! New features include: -```Bash -export TRITON_XPU_PROFILE=1 -``` +- Many, many bug fixes +- Performance improvements +- Backend rewritten to use MLIR +- Support for kernels that contain back-to-back matmuls (e.g., flash attention) # Contributing -Community contributions are more than welcome, whether it be to fix bugs or to add new features at [github](https://github.com/intel/intel-xpu-backend-for-triton). For more detailed instructions, please visit our [contributor's guide](CONTRIBUTING.md). - -## License +Community contributions are more than welcome, whether it be to fix bugs or to add new features at [github](https://github.com/triton-lang/triton/). For more detailed instructions, please visit our [contributor's guide](CONTRIBUTING.md). -_MIT License_. As found in [LICENSE](https://github.com/intel/intel-xpu-backend-for-triton/blob/main/LICENSE) file. +# Compatibility +Supported Platforms: -## Security +- Linux -See Intel's [Security Center](https://www.intel.com/content/www/us/en/security-center/default.html) -for information on how to report a potential security issue or vulnerability. 
+Supported Hardware: -See also: [Security Policy](security.md) +- NVIDIA GPUs (Compute Capability 8.0+) +- AMD GPUs (ROCm 5.2+) +- Under development: CPUs diff --git a/benchmarks/xetla_kernel/flash_attention/fmha_backward.h b/benchmarks/xetla_kernel/flash_attention/fmha_backward.h index 0b1dd7ef90..ed769dc4b3 100644 --- a/benchmarks/xetla_kernel/flash_attention/fmha_backward.h +++ b/benchmarks/xetla_kernel/flash_attention/fmha_backward.h @@ -4,6 +4,7 @@ #include "fmha_backward_policy.h" #include "fmha_utils.h" #include "xetla.hpp" +#include using T = sycl::half; diff --git a/benchmarks/xetla_kernel/flash_attention/fmha_utils.h b/benchmarks/xetla_kernel/flash_attention/fmha_utils.h index 9327b046ad..94123300f5 100644 --- a/benchmarks/xetla_kernel/flash_attention/fmha_utils.h +++ b/benchmarks/xetla_kernel/flash_attention/fmha_utils.h @@ -17,6 +17,7 @@ #pragma once #include "xetla.hpp" +#include namespace gpu::xetla { diff --git a/benchmarks/xetla_kernel/xetla-library.conf b/benchmarks/xetla_kernel/xetla-library.conf index 2cc1e9f5b3..944094ecd3 100644 --- a/benchmarks/xetla_kernel/xetla-library.conf +++ b/benchmarks/xetla_kernel/xetla-library.conf @@ -1 +1 @@ -b9e489ca6a776694a898044a3f2ae023a98db03d +bde127ffebf502d32ef8ac2748e12d7839597fab diff --git a/include/triton/Dialect/Triton/IR/TritonOps.td b/include/triton/Dialect/Triton/IR/TritonOps.td index 6b2faf336a..87dc10a71c 100644 --- a/include/triton/Dialect/Triton/IR/TritonOps.td +++ b/include/triton/Dialect/Triton/IR/TritonOps.td @@ -778,7 +778,8 @@ def TT_ScanReturnOp: TT_Op<"scan.return", def TT_ExternElementwiseOp : TT_Op<"extern_elementwise", [Elementwise, SameOperandsAndResultEncoding, SameVariadicOperandSize, - DeclareOpInterfaceMethods]> { + DeclareOpInterfaceMethods, + ConditionallySpeculatable]> { let description = [{ call an external function $symbol implemented in $libpath/$libname with $args @@ -790,6 +791,12 @@ def TT_ExternElementwiseOp : TT_Op<"extern_elementwise", [Elementwise, let results = (outs TT_Type:$result); let assemblyFormat = "operands attr-dict `:` functional-type(operands, $result)"; + + let extraClassDeclaration = [{ + // Interface method for ConditionallySpeculatable. + Speculation::Speculatability getSpeculatability(); + }]; + } // diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp index 0321cc9b00..6e2d5149be 100644 --- a/lib/Analysis/Utility.cpp +++ b/lib/Analysis/Utility.cpp @@ -71,18 +71,25 @@ unsigned ReduceOpHelper::getThreadOffsetOnReductionAxis() { } unsigned threadOffset = 1; - if (auto sliceLayout = mlir::dyn_cast(srcLayout)) { - auto parentLayout = sliceLayout.getParent(); - auto threadsPerWarp = getThreadsPerWarp(parentLayout); - threadOffset = threadsPerWarp[sliceLayout.getDim()]; - } else { - auto threadsPerWarp = getThreadsPerWarp(srcLayout); - auto order = getThreadOrder(srcLayout); - for (unsigned i = 0; i < order.size(); i++) { - if (order[i] == axis) - break; - threadOffset *= threadsPerWarp[order[i]]; - } + SmallVector dimsRemoved; + while (auto sliceLayout = mlir::dyn_cast(srcLayout)) { + dimsRemoved.push_back(sliceLayout.getDim()); + srcLayout = sliceLayout.getParent(); + } + // In case of slice layout we want to know the axis dimension relative to the + // most inner parent layout. `adjustedAxis` is the matching axis dim in the + // parent layout. 
+ int adjustedAxis = axis; + for (auto dim : dimsRemoved) { + if (dim <= adjustedAxis) + adjustedAxis++; + } + auto threadsPerWarp = getThreadsPerWarp(srcLayout); + auto order = getThreadOrder(srcLayout); + for (unsigned i = 0; i < order.size(); i++) { + if (order[i] == adjustedAxis) + break; + threadOffset *= threadsPerWarp[order[i]]; } return threadOffset; } diff --git a/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp index 4e80539233..829d4e7104 100644 --- a/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp @@ -218,6 +218,46 @@ struct ReduceOpConversion rewriter.replaceOp(op, results); } + // For slice layout some ids are duplicated on multiple lanes, so we need to + // handle the delinearization of laneId in a special way. We need to + // generalize this part of the logic to work on any kind of linear layout + // uniformely. + SmallVector + getMultiDimLaneId(ReduceOpHelper &helper, Value &laneId, Location &loc, + ConversionPatternRewriter &rewriter) const { + auto srcLayout = helper.getSrcLayout(); + auto srcShape = helper.getSrcShape(); + auto order = triton::gpu::getThreadOrder(srcLayout); + SmallVector multiDimLaneId; + + if (auto sliceLayout = mlir::dyn_cast(srcLayout)) { + auto parentLayout = sliceLayout.getParent(); + SmallVector dims = {sliceLayout.getDim()}; + while (auto parentSliceLayout = + mlir::dyn_cast(parentLayout)) { + dims.push_back(parentSliceLayout.getDim()); + parentLayout = parentSliceLayout.getParent(); + } + + auto parentThreadsPerWarps = triton::gpu::getThreadsPerWarp(parentLayout); + auto parentOrder = triton::gpu::getThreadOrder(parentLayout); + multiDimLaneId = delinearize(rewriter, loc, laneId, parentThreadsPerWarps, + parentOrder); + for (unsigned dim : llvm::reverse(dims)) { + multiDimLaneId.erase(multiDimLaneId.begin() + dim); + } + } else { + SmallVector threadsPerWarps = + triton::gpu::getThreadsPerWarp(srcLayout); + threadsPerWarps[helper.getAxis()] = + triton::gpu::getThreadsPerWarpWithUniqueData( + srcLayout, srcShape)[helper.getAxis()]; + multiDimLaneId = + delinearize(rewriter, loc, laneId, threadsPerWarps, order); + } + return multiDimLaneId; + } + SmallVector getMultiDimWarpId(ReduceOpHelper &helper, Value &warpId, Location &loc, ConversionPatternRewriter &rewriter) const { @@ -231,11 +271,20 @@ struct ReduceOpConversion // a way to properly delinearize warpId in the slice case if (auto sliceLayout = mlir::dyn_cast(srcLayout)) { auto parentLayout = sliceLayout.getParent(); + SmallVector dims = {sliceLayout.getDim()}; + while (auto parentSliceLayout = + mlir::dyn_cast(parentLayout)) { + dims.push_back(parentSliceLayout.getDim()); + parentLayout = parentSliceLayout.getParent(); + } + auto parentWarpsPerCTA = triton::gpu::getWarpsPerCTA(parentLayout); auto parentOrder = triton::gpu::getWarpOrder(parentLayout); multiDimWarpId = delinearize(rewriter, loc, warpId, parentWarpsPerCTA, parentOrder); - multiDimWarpId.erase(multiDimWarpId.begin() + sliceLayout.getDim()); + for (unsigned dim : llvm::reverse(dims)) { + multiDimWarpId.erase(multiDimWarpId.begin() + dim); + } } else { SmallVector warpsPerCTA = triton::gpu::getWarpsPerCTA(srcLayout); @@ -263,11 +312,8 @@ struct ReduceOpConversion unsigned axis = op.getAxis(); auto smemShape = helper.getScratchRepShape(); - auto threadsPerWarp = - triton::gpu::getThreadsPerWarpWithUniqueData(srcLayout, srcShape); - auto order = getThreadOrder(srcLayout); SmallVector multiDimLaneId = - 
delinearize(rewriter, loc, laneId, threadsPerWarp, order); + getMultiDimLaneId(helper, laneId, loc, rewriter); Value laneIdAxis = multiDimLaneId[axis]; Value zero = i32_val(0); Value laneZero = icmp_eq(laneIdAxis, zero); diff --git a/lib/Dialect/Triton/IR/Ops.cpp b/lib/Dialect/Triton/IR/Ops.cpp index a16f9a3cad..12f5be29c9 100644 --- a/lib/Dialect/Triton/IR/Ops.cpp +++ b/lib/Dialect/Triton/IR/Ops.cpp @@ -1039,6 +1039,12 @@ void ExternElementwiseOp::getEffects( SideEffects::DefaultResource::get()); } +Speculation::Speculatability ExternElementwiseOp::getSpeculatability() { + if (getPure()) + return Speculation::Speculatable; + return Speculation::NotSpeculatable; +} + // -- ExperimentalTensormapCreateOp -- LogicalResult ExperimentalTensormapCreateOp::verify() { auto rank = getBoxDim().size(); diff --git a/lib/Target/SPIRV/spirv-llvm-translator.conf b/lib/Target/SPIRV/spirv-llvm-translator.conf index 22368d5080..8134401a4c 100644 --- a/lib/Target/SPIRV/spirv-llvm-translator.conf +++ b/lib/Target/SPIRV/spirv-llvm-translator.conf @@ -1 +1 @@ -1a1bf17d9e8684cd826e4278e78f63aa80e2e2ca +15fd1cc50e12465c74ef34a264f11c8523247b46 diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index f7ec0bbb2b..2239f2578e 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -31,6 +31,7 @@ is_hip, is_hip_cdna, is_hip_mi200, + is_hip_mi300, is_xpu, get_arch, torch_float8_dtypes, @@ -3414,8 +3415,8 @@ def test_scaled_dot(M, N, K, col_a, col_b, type_a, type_b, num_warps, mma, kpack if is_hip(): if not is_hip_cdna(): pytest.skip("scaled_dot only implemented for HIP CDNA") - if (type_a not in ["e2m1", "e5m2"]) or (type_b not in ["e2m1", "e5m2", "bf16"]): - pytest.skip(f"scaled_dot({type_a}, {type_b}) not yet implemented for HIP") + if "e4m3" in (type_a, type_b) and not is_hip_mi300(): + pytest.skip(f"scaled_dot({type_a}, {type_b}) only implemented for MI300") if mma == 16 and K == 64: pytest.skip(f"K == {K} too small for mfma {mma} in scaled_dot") if is_xpu(): @@ -5186,8 +5187,6 @@ def return_poison(x): def test_poison_return(device): - if is_xpu(): - pytest.skip("FIXME: poison is optimized away by opt") @triton.jit def kernel(Out): @@ -5196,7 +5195,9 @@ def kernel(Out): a = torch.empty((), device=device, dtype=torch.int32) h = kernel[(1, )](a) assert "ub.poison" in h.asm["ttir"], h.asm["ttir"] - assert "poison" in h.asm["llir"], h.asm["llir"] + # xpu uses llvm.store, which in this case is removed by the optimizer + if not is_xpu(): + assert "poison" in h.asm["llir"], h.asm["llir"] # ----------------------- @@ -6072,3 +6073,33 @@ def sanitize_cumsum_kernel(Z, X, BLOCK: tl.constexpr): Z = torch.zeros_like(X) sanitize_cumsum_kernel[(1, )](Z, X, BLOCK=BLOCK) torch.testing.assert_close(Z, X.cumsum(0).to(torch.int32)) + + +# stress test slice layout usages in reductions. 
+@pytest.mark.parametrize("in_shape, perm, red_dims", [ + ((4, 32, 32, 4, 2), [2, 1, 0, 3, 4], [3, 1, 0]), + ((8, 2, 32, 4, 16), [4, 0, 1, 3, 2], [0, 2, 0]), +]) +def test_chained_reductions(in_shape, perm, red_dims, device): + + @triton.jit + def kernel(In, Out, # + dim_0: tl.constexpr, dim_1: tl.constexpr, dim_2: tl.constexpr, dim_3: tl.constexpr, dim_4: tl.constexpr, + perm_0: tl.constexpr, perm_1: tl.constexpr, perm_2: tl.constexpr, perm_3: tl.constexpr, + perm_4: tl.constexpr, red_dim_0: tl.constexpr, red_dim_1: tl.constexpr, red_dim_2: tl.constexpr): + idx = tl.arange(0, dim_0 * dim_1 * dim_2 * dim_3 * dim_4) + idx = idx.reshape(dim_0, dim_1, dim_2, dim_3, dim_4) + vals = tl.load(In + idx) + vals = tl.permute(vals, [perm_0, perm_1, perm_2, perm_3, perm_4]) + r = tl.sum(tl.sum(tl.sum(vals, red_dim_0), red_dim_1), red_dim_2) + st_idx = tl.arange(0, r.shape[0] * r.shape[1]).reshape(r.shape) + tl.store(Out + st_idx, r) + + input = torch.randint(0, 1000, in_shape, device=device, dtype=torch.int32) + temp = torch.permute(input, perm).contiguous() + ref = torch.sum(torch.sum(torch.sum(temp, dim=red_dims[0]), dim=red_dims[1]), dim=red_dims[2]) + result = torch.empty_like(ref) + kernel[(1, )](input, result, input.shape[0], input.shape[1], input.shape[2], input.shape[3], input.shape[4], + perm[0], perm[1], perm[2], perm[3], perm[4], red_dims[0], red_dims[1], red_dims[2]) + + assert torch.all(ref == result) diff --git a/python/test/unit/language/test_libdevice.py b/python/test/unit/language/test_libdevice.py index 58f063cd77..1a18e9ef78 100644 --- a/python/test/unit/language/test_libdevice.py +++ b/python/test/unit/language/test_libdevice.py @@ -4,6 +4,7 @@ import triton import triton.language as tl from triton.language.extra.intel import libdevice +from triton.language.extra.libdevice import fast_dividef as my_fast_dividef @pytest.mark.parametrize("dtype_str", ["float32", "float64"]) @@ -39,3 +40,20 @@ def kernel(in_p, out_p, fn: tl.constexpr, SIZE: tl.constexpr): kernel[(1, )](x, y_exp, fn=libdevice_fn, SIZE=SIZE, num_warps=4, num_ctas=1) torch.testing.assert_close(y_ref, y_exp, equal_nan=True) + + +def test_libdevice_rename(device): + # mark the import as used by this test + _ = my_fast_dividef + + @triton.jit + def triton_copy(in_ptr, out_ptr, BLOCK_SIZE: tl.constexpr): + offsets = tl.arange(0, BLOCK_SIZE) + data = tl.load(in_ptr + offsets) + tl.store(out_ptr + offsets, data) + + BLOCK_SIZE = 256 + inp = torch.randn(BLOCK_SIZE, device=device) + out = torch.empty_like(inp) + + triton_copy[(1, )](inp, out, BLOCK_SIZE) diff --git a/python/triton/compiler/code_generator.py b/python/triton/compiler/code_generator.py index ec0ef227fa..4ec46f32c4 100644 --- a/python/triton/compiler/code_generator.py +++ b/python/triton/compiler/code_generator.py @@ -218,7 +218,7 @@ def __init__(self, context, prototype, gscope, attributes, constants, function_n module_name = getattr(v, "__module__", "") if module_name in module_map: - self.gscope[k] = getattr(module_map[module_name], k) + self.gscope[k] = getattr(module_map[module_name], v.__name__) else: self.gscope[k] = v diff --git a/scripts/install-conda.sh b/scripts/install-conda.sh index c55cc95399..dd29d57fa7 100755 --- a/scripts/install-conda.sh +++ b/scripts/install-conda.sh @@ -2,8 +2,8 @@ link_sycl() { mkdir -p $HOME/miniforge3/envs/triton/$1 - ln -snf /opt/intel/oneapi/compiler/2024.1/include/sycl $HOME/miniforge3/envs/triton/$1/ - ln -snf /opt/intel/oneapi/compiler/2024.1/include/sycl/CL $HOME/miniforge3/envs/triton/$1/ + ln -snf 
/opt/intel/oneapi/compiler/latest/include/sycl $HOME/miniforge3/envs/triton/$1/ + ln -snf /opt/intel/oneapi/compiler/latest/include/sycl/CL $HOME/miniforge3/envs/triton/$1/ } install_env() { diff --git a/scripts/skiplist/a770/language.txt b/scripts/skiplist/a770/language.txt index a829b75bf0..e833b924bd 100644 --- a/scripts/skiplist/a770/language.txt +++ b/scripts/skiplist/a770/language.txt @@ -2,6 +2,8 @@ test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] # https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] +# https://github.com/intel/intel-xpu-backend-for-triton/issues/2703 +test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float32-float32] diff --git a/scripts/skiplist/conda/language.txt b/scripts/skiplist/conda/language.txt index f8a9e48124..41035163ff 100644 --- a/scripts/skiplist/conda/language.txt +++ b/scripts/skiplist/conda/language.txt @@ -117,6 +117,8 @@ test/unit/language/test_core.py::test_dot_max_num_imprecise_acc[128-float8e5-128 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] # https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] +# https://github.com/intel/intel-xpu-backend-for-triton/issues/2703 +test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float32-float32] diff --git a/scripts/skiplist/default/language.txt b/scripts/skiplist/default/language.txt index 36c6d7e69b..fb018c5e0f 100644 --- a/scripts/skiplist/default/language.txt +++ b/scripts/skiplist/default/language.txt @@ -2,3 +2,5 @@ test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] # https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] +# https://github.com/intel/intel-xpu-backend-for-triton/issues/2703 +test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0] diff --git a/scripts/skiplist/lts/language.txt b/scripts/skiplist/lts/language.txt index 76dd77c939..c2842cdb91 100644 --- a/scripts/skiplist/lts/language.txt +++ b/scripts/skiplist/lts/language.txt @@ -117,6 +117,8 @@ test/unit/language/test_core.py::test_dot_max_num_imprecise_acc[128-float8e5-128 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] # https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] +# https://github.com/intel/intel-xpu-backend-for-triton/issues/2703 +test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] 
test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float32-float32] diff --git a/scripts/skiplist/mtl/language.txt b/scripts/skiplist/mtl/language.txt index a346bc76ab..df2e44aae4 100644 --- a/scripts/skiplist/mtl/language.txt +++ b/scripts/skiplist/mtl/language.txt @@ -2,6 +2,8 @@ test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] # https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] +# https://github.com/intel/intel-xpu-backend-for-triton/issues/2703 +test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float32-float32] diff --git a/scripts/skiplist/xe2/language.txt b/scripts/skiplist/xe2/language.txt index 3d923ed1d6..436fe52550 100644 --- a/scripts/skiplist/xe2/language.txt +++ b/scripts/skiplist/xe2/language.txt @@ -2,6 +2,8 @@ test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] # https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] +# https://github.com/intel/intel-xpu-backend-for-triton/issues/2703 +test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float32-float32] diff --git a/test/Conversion/intel/tritongpu_to_gen.mlir b/test/Conversion/intel/tritongpu_to_gen.mlir index d83a0b4b25..f6ddb63183 100644 --- a/test/Conversion/intel/tritongpu_to_gen.mlir +++ b/test/Conversion/intel/tritongpu_to_gen.mlir @@ -1045,7 +1045,6 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // ----- module attributes {"triton_gpu.target" = "xpu", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { - // CHECK: llvm.func spir_funccc @_Z7barrierj(i32) attributes {convergent, no_unwind, will_return} // CHECK-LABEL: atomic_cas_f32_scalar_no_store tt.func @atomic_cas_f32_scalar_no_store(%ptr : !tt.ptr, %cmp : f32, %val : f32) { // CHECK: [[TRUE:%.*]] = llvm.mlir.constant(true) : i1 @@ -1054,7 +1053,10 @@ module attributes {"triton_gpu.target" = "xpu", "triton_gpu.num-ctas" = 1 : i32, // CHECK: [[CMP:%.*]] = llvm.icmp "eq" // CHECK: [[MASK:%.*]] = llvm.and [[MASK0]], [[CMP]] // CHECK: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: llvm.call spir_funccc @_Z7barrierj({{.*}}) {{.*}} : (i32) -> () + // CHECK: [[WGSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: [[WGMEMSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: [[GLOBAL:%.*]] = llvm.mlir.constant(528 : i32) : i32 + // CHECK: llvm.call spir_funccc @_Z22__spirv_ControlBarrieriii([[WGSCOPE]], [[WGMEMSCOPE]], [[GLOBAL]]) {convergent, no_unwind, will_return} : (i32, i32, i32) -> () // CHECK-NEXT: llvm.cond_br [[MASK]], ^bb1, ^bb2([[ZERO]] : i32) // CHECK-NEXT: ^bb1: // CHECK-NEXT: [[BCAST1:%.*]] = llvm.bitcast %arg1 : f32 to i32 @@ -1109,7 +1111,6 @@ module attributes 
{"triton_gpu.target" = "xpu", "triton_gpu.num-ctas" = 1 : i32, #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { - // CHECK: llvm.func spir_funccc @_Z7barrierj(i32) attributes {convergent, no_unwind, will_return} // CHECK-LABEL: atomic_add_f32 tt.func @atomic_add_f32(%arg0 : tensor<256x!tt.ptr, #blocked0>, %arg1 : tensor<256xi1, #blocked0>, %arg2 : tensor<256xf32, #blocked0>) { // CHECK: [[EV0_ARG2:%.*]] = llvm.extractvalue %arg2[0] : !llvm.struct<(f32, f32)> @@ -1132,7 +1133,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: [[IE2:%.*]] = llvm.insertelement [[EV1_ARG2]], [[UNDEF2]][{{.*}} : i64] : vector<1xf32> // CHECK-NEXT: [[PRED2:%.*]] = llvm.and [[CST_TRUE]], {{.*}} : i1 // CHECK-NEXT: [[ZERO2:%.*]] = llvm.mlir.constant(0.000000e+00 : f32) : f32 - // CHECK: llvm.call spir_funccc @_Z7barrierj({{.*}}) {{.*}} : (i32) -> () + // CHECK: [[WGSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: [[WGMEMSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: [[GLOBAL:%.*]] = llvm.mlir.constant(528 : i32) : i32 + // CHECK: llvm.call spir_funccc @_Z22__spirv_ControlBarrieriii([[WGSCOPE]], [[WGMEMSCOPE]], [[GLOBAL]]) {convergent, no_unwind, will_return} : (i32, i32, i32) -> () // CHECK-NEXT: llvm.cond_br [[PRED2]], ^bb3, ^bb4([[ZERO2]] : f32) // CHECK-NEXT: ^bb3: // CHECK-NEXT: [[BCAST2:%.*]] = llvm.bitcast [[IE2]] : vector<1xf32> to f32 @@ -1147,7 +1151,6 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // ----- module attributes {"triton_gpu.target" = "xpu", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { - // CHECK: llvm.func spir_funccc @_Z7barrierj(i32) attributes {convergent, no_unwind, will_return} // CHECK-LABEL: atomic_add_f32_scalar_no_store tt.func @atomic_add_f32_scalar_no_store(%arg0 : !tt.ptr, %arg1 : i1, %arg2 : f32) { // CHECK: [[CST_TRUE:%.*]] = llvm.mlir.constant(true) : i1 @@ -1159,7 +1162,10 @@ module attributes {"triton_gpu.target" = "xpu", "triton_gpu.num-ctas" = 1 : i32, // CHECK: [[IE1:%.*]] = llvm.insertelement %arg2, [[UNDEF1]][{{.*}} : i64] : vector<1xf32> // CHECK: [[PRED:%.*]] = llvm.and [[AND1]], %arg1 : i1 // CHECK-NEXT: [[ZERO:%.*]] = llvm.mlir.constant(0.000000e+00 : f32) : f32 - // CHECK: llvm.call spir_funccc @_Z7barrierj({{.*}}) {{.*}} : (i32) -> () + // CHECK: [[WGSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: [[WGMEMSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: [[GLOBAL:%.*]] = llvm.mlir.constant(528 : i32) : i32 + // CHECK: llvm.call spir_funccc @_Z22__spirv_ControlBarrieriii([[WGSCOPE]], [[WGMEMSCOPE]], [[GLOBAL]]) {convergent, no_unwind, will_return} : (i32, i32, i32) -> () // CHECK-NEXT: llvm.cond_br [[PRED]], ^bb1, ^bb2([[ZERO]] : f32) // CHECK-NEXT: ^bb1: // CHECK-NEXT: [[BCAST2:%.*]] = llvm.bitcast [[IE1]] : vector<1xf32> to f32 diff --git a/test/Conversion/tritongpu_to_llvm.mlir b/test/Conversion/tritongpu_to_llvm.mlir index 4c5d658af1..679a18cd9b 100644 --- a/test/Conversion/tritongpu_to_llvm.mlir +++ b/test/Conversion/tritongpu_to_llvm.mlir @@ -1855,3 +1855,21 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : #loc3 = loc("inner_call":29:28) #loc4 = loc(callsite(#loc3 at #loc1)) #loc5 = loc(callsite(#loc4 at #loc2)) + +// ----- + +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 
1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:90", "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @log1pf_scan(%39: tensor<32x16xf32, #blocked>) attributes {noinline = false} { + // CHECK: log1pf_scan + // non-speculatable ops will introduce a cond_br; extern_elementwise with pure = true should be considered speculatable. + // CHECK-NOT: llvm.cond_br + %40 = "tt.scan"(%39) <{axis = 1 : i32, reverse = false}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %43 = tt.extern_elementwise %arg5 {libname = "", libpath = "", pure = true, symbol = "__nv_log1pf"} : (f32) -> f32 + %44 = arith.addf %43, %43 : f32 + tt.scan.return %44 : f32 + }) : (tensor<32x16xf32, #blocked>) -> tensor<32x16xf32, #blocked> + tt.return + } +} diff --git a/test/TritonGEN/tritongen-to-llvm.mlir b/test/TritonGEN/tritongen-to-llvm.mlir index e4dd0bc962..e3a4cdbfac 100644 --- a/test/TritonGEN/tritongen-to-llvm.mlir +++ b/test/TritonGEN/tritongen-to-llvm.mlir @@ -1,20 +1,5 @@ // RUN: triton-opt -convert-tritongen-to-llvm -split-input-file %s | FileCheck %s -// CHECK: llvm.func spir_funccc @_Z7barrierj(i32) attributes {convergent, no_unwind, will_return} - -llvm.func @triton_gen.barrier() { - // CHECK-LABEL: triton_gen.barrier - // CHECK: [[LOCAL:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: llvm.call spir_funccc @_Z7barrierj([[LOCAL]]) {{.*}} : (i32) -> () - // CHECK: [[GLOBAL:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK: llvm.call spir_funccc @_Z7barrierj([[GLOBAL]]) {{.*}} : (i32) -> () - triton_gen.barrier {mem_fence=Local} - triton_gen.barrier {mem_fence=Global} - llvm.return -} - -// ----- - // CHECK-DAG: llvm.func spir_funccc @_Z31intel_work_group_barrier_arriveii(i32, i32) attributes {convergent, no_unwind, will_return} // CHECK-DAG: llvm.func spir_funccc @_Z29intel_work_group_barrier_waitii(i32, i32) attributes {convergent, no_unwind, will_return} diff --git a/test/TritonGEN/tritongen.mlir b/test/TritonGEN/tritongen.mlir index f388da5aa9..90e2336ded 100644 --- a/test/TritonGEN/tritongen.mlir +++ b/test/TritonGEN/tritongen.mlir @@ -1,12 +1,5 @@ // RUN: triton-opt %s -split-input-file -verify-diagnostics | FileCheck %s -llvm.func @triton_gen.barrier() { - // CHECK-LABEL: triton_gen.barrier - // CHECK: triton_gen.barrier {mem_fence = Local} - triton_gen.barrier {mem_fence=Local} - llvm.return -} - llvm.func @triton_gen.split_barrier_signal() { // CHECK-LABEL: triton_gen.split_barrier_signal // CHECK: triton_gen.split_barrier_signal {mem_fence = None, mem_scope = WorkGroup} diff --git a/test/TritonIntelGPU/optimize-elementwise.mlir b/test/TritonIntelGPU/optimize-elementwise.mlir new file mode 100644 index 0000000000..d8b64bab89 --- /dev/null +++ b/test/TritonIntelGPU/optimize-elementwise.mlir @@ -0,0 +1,65 @@ +// RUN: triton-opt %s --split-input-file -tritonintelgpu-optimize-elementwise-parallelism | FileCheck %s + +// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}> +// CHECK: #[[$ATTR_1:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}> + +#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = 
[16, 32]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +// CHECK-LABEL: tt.func @test_dpas( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>) + tt.func @test_dpas(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> { +// CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]> +// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]> +// CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<16xf32, #[[$ATTR_0]]> + %0 = arith.addf %arg0, %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> +// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> +// CHECK: tt.return %[[VAL_5]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> + tt.return %0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> + } +} + +// ----- + +// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}> +// CHECK: #[[$ATTR_1:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}> + +#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +// CHECK-LABEL: tt.func @test_blocked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>) + tt.func @test_blocked(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> { +// CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<16xf32, #[[$ATTR_1]]> +// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<16xf32, #[[$ATTR_1]]> +// CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<16xf32, #[[$ATTR_1]]> + %0 = arith.addf %arg0, %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> +// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<16xf32, #[[$ATTR_1]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> +// CHECK: tt.return %[[VAL_5]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> + tt.return %0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + } +} + +// ----- + +// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}> +// CHECK: #[[$ATTR_1:.+]] = 
#triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}> + +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +// CHECK-LABEL: tt.func @test_blocked_repeat( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>) + tt.func @test_blocked_repeat(%arg0: tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> { +// CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<64xf32, #[[$ATTR_1]]> +// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<64xf32, #[[$ATTR_1]]> +// CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<64xf32, #[[$ATTR_1]]> + %0 = arith.addf %arg0, %arg1 : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> +// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<64xf32, #[[$ATTR_1]]> -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> +// CHECK: tt.return %[[VAL_5]] : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> + tt.return %0 : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + } +} diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py index 50301edd1d..b05f856bbf 100644 --- a/third_party/intel/backend/compiler.py +++ b/third_party/intel/backend/compiler.py @@ -251,8 +251,9 @@ def make_ttgir(mod, metadata, opt, properties): passes.common.add_cse(pm) passes.ttgpuir.add_prefetch(pm) passes.ttgpuir.add_optimize_dot_operands(pm, True) - if os.getenv("TRITON_INTEL_OPTIMIZE_REDUCTION_LOCALITY", "0") == 1: + if os.getenv("TRITON_INTEL_OPTIMIZE_REDUCTION_LOCALITY", "0") == "1": intel.passes.ttgpuir.add_optimize_reduction_locality(pm) + intel.passes.ttgpuir.add_optimize_elementwise_parallelism(pm) intel.passes.ttgpuir.add_remove_layout_conversions(pm) intel.passes.ttgpuir.add_reduce_data_duplication(pm) passes.ttgpuir.add_reorder_instructions(pm) diff --git a/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENOps.td b/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENOps.td index d9d5266fba..dde9fd97e3 100644 --- a/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENOps.td +++ b/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENOps.td @@ -32,20 +32,6 @@ class TritonGEN_Op traits = []> : // Synchronization //===----------------------------------------------------------------------===// -def TritonGEN_BarrierOp : TritonGEN_Op<"barrier"> { - let summary = "Workgroup barrier"; - let description = [{ - The `triton_gen.barrier` operation performs a workgroup barrier and ensures - all outstanding memory transaction using local or global memory are complete. 
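An aside on the `third_party/intel/backend/compiler.py` hunk above: `os.getenv` returns a string, so the old comparison against the integer `1` could never succeed and the guarded reduction-locality pass was never added to the pipeline; the corrected check compares against the string `"1"`. A minimal standalone illustration (not the backend code itself):

```python
import os

# Simulate a user enabling the feature.
os.environ["TRITON_INTEL_OPTIMIZE_REDUCTION_LOCALITY"] = "1"

flag = os.getenv("TRITON_INTEL_OPTIMIZE_REDUCTION_LOCALITY", "0")
assert flag == "1"   # new check: string comparison, matches
assert flag != 1     # old check compared with the int 1 and never matched
```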
- }]; - let arguments = (ins TritonGEN_MemFence:$mem_fence); - let results = (outs); - let assemblyFormat = "attr-dict"; - let assemblyFormat = [{ - ` ` `{` `mem_fence` `=` $mem_fence `}` attr-dict - }]; -} - def TritonGEN_SplitBarrierSignalOp : TritonGEN_Op<"split_barrier_signal"> { let summary = "Split barrier signal"; let description = [{ diff --git a/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td b/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td index c551a96856..1d81bc4741 100644 --- a/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td +++ b/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td @@ -365,4 +365,52 @@ tt.func @test(%arg0: tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slic "mlir::triton::gpu::TritonGPUDialect"]; } +def TritonIntelGPUOptimizeElementwiseParallelism + : Pass<"tritonintelgpu-optimize-elementwise-parallelism", "mlir::ModuleOp"> { + let summary = + "Improve parallelism of elementwise operations better utilizing hardware resources."; + + let description = [{ + Detect elementwise operations with an encoding causing sub-par parallelism, + i.e., with data duplication across threads, and convert the operands to a + more optimal encoding if the cost of doing so is heuristically estimated to + be sufficiently low. As of now, the cost should be 0, we only support + "unbroadcasting" tensors, i.e., dropping duplicated values held in other + threads by re-distributing them. + + As an example, this pass would modify the following code: +```mlir +#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { + tt.func @test_blocked(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> { + %0 = arith.addf %arg0, %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + tt.return %0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + } +} +``` + Obtaining: +```mlir +#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { + tt.func @test_blocked(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> { + %0 = triton_gpu.convert_layout %arg0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf32, #blocked1> + %1 = triton_gpu.convert_layout %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf32, #blocked1> + %2 = arith.addf %0, %1 : tensor<16xf32, #blocked1> + %3 = triton_gpu.convert_layout %2 : tensor<16xf32, #blocked1> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + tt.return %3 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + } +} +``` + + Note how the converted tensors are not sliced and thus each element in the + tensor is held by a single thread. 
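+
+  For reference, this pass is registered for Python in
+  `third_party/intel/triton_xpu.cc` and scheduled in `make_ttgir` of the Intel
+  backend's `compiler.py` (both part of this patch). A minimal sketch of
+  driving it by hand from Python, assuming the backend's usual binding imports
+  (the import path is an assumption, not something this patch adds):
+
+```python
+# Hedged sketch: run only the new pass over an existing TTGIR module `mod`.
+from triton._C.libtriton import ir, intel  # assumed import path
+
+def run_elementwise_parallelism(mod):
+    pm = ir.pass_manager(mod.context)
+    pm.enable_debug()
+    # Binding name added in triton_xpu.cc by this patch.
+    intel.passes.ttgpuir.add_optimize_elementwise_parallelism(pm)
+    pm.run(mod)
+```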
+ }]; + + let dependentDialects = []; +} + + #endif // TRITON_INTEL_GPU_PASSES diff --git a/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp b/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp index 3f45a9d779..78df567d61 100644 --- a/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp +++ b/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp @@ -479,27 +479,6 @@ namespace { // Synchronization Ops Lowerings //===----------------------------------------------------------------------===// -struct TritonGENBarrierLowering - : public ConvertOpToLLVMPattern { - using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; - - LogicalResult - matchAndRewrite(TritonGEN::BarrierOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - MLIRContext *ctx = rewriter.getContext(); - Location loc = op->getLoc(); - Type retType = void_ty(ctx); - IntegerType argType = int_ty(32); - Value arg = i32_val(static_cast(op.getMemFence())); - - LLVM::CallOp callOp = - createDeviceFunctionCall(rewriter, "_Z7barrierj", {retType}, {argType}, - {arg}, {}, convergentNoUnwindWillReturnAttrs); - rewriter.replaceOp(op, callOp); - return success(); - } -}; - struct TritonGENSplitBarrier { protected: template @@ -1092,13 +1071,12 @@ struct TritonGENToLLVMDialectInterface : public ConvertToLLVMPatternInterface { void mlir::triton::populateTritonGENToLLVMConversionPatterns( LLVMTypeConverter &converter, RewritePatternSet &patterns) { - patterns - .add(converter); + patterns.add< + TritonGENSplitBarrierSignalLowering, TritonGENSplitBarrierWaitLowering, + TritonSubGroupReduceLowering, TritonSubGroupScanLowering, + TritonMatrixDPASLowering, TritonMatrix2DBlockLoadLowering, + TritonMatrix2DBlockStoreLowering, TritonMatrix2DBlockPrefetchLowering, + TritonSIMDBlockReadLowering, TritonSIMDBlockWriteLowering>(converter); } void registerConvertTritonTritonGENToLLVMInterface(DialectRegistry ®istry) { diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/CMakeLists.txt b/third_party/intel/lib/TritonIntelGPUToLLVM/CMakeLists.txt index f46c265fa4..4e86cbd2f2 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/CMakeLists.txt +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/CMakeLists.txt @@ -33,6 +33,7 @@ add_triton_library(TritonIntelGPUToLLVM LINK_LIBS PUBLIC GPUToTritonGEN MLIRGPUToLLVMSPV + MLIRSPIRVToLLVM TritonGENIR TritonGENToLLVM TritonIntelGPUIR diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp index 35eb540241..edd1999ea5 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -1,5 +1,6 @@ #include "Dialect/TritonIntelGPU/IR/Dialect.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" +#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/TypeUtilities.h" #include "llvm/ADT/SmallVector.h" @@ -1313,7 +1314,10 @@ struct AtomicCASOpConversion Value zero = (valueElemNBits == 32) ? 
i32_val(0) : i64_val(0); if (!atomicNeedsSharedMemory(op.getResult())) - rewriter.create(loc, TritonGEN::MemFence::GLOBAL); + rewriter.create( + loc, spirv::Scope::Workgroup, spirv::Scope::Workgroup, + spirv::MemorySemantics::SequentiallyConsistent | + spirv::MemorySemantics::CrossWorkgroupMemory); Block &endBlock = LLVM::intel::createPredicatedBlock(rewriter, loc, mask, {zero}, [&] { // casPtr = bitcast(casPtr, ptr_ty(ctx, 1)); @@ -1462,8 +1466,10 @@ struct AtomicRMWOpConversion rmwPtr, rmwVal, rmwMask, {zero}); } else { if (!atomicNeedsSharedMemory(op.getResult())) - rewriter.create(loc, - TritonGEN::MemFence::GLOBAL); + rewriter.create( + loc, spirv::Scope::Workgroup, spirv::Scope::Workgroup, + spirv::MemorySemantics::SequentiallyConsistent | + spirv::MemorySemantics::CrossWorkgroupMemory); endBlock = &LLVM::intel::createPredicatedBlock( rewriter, loc, rmwMask, {zero}, [&] { mlir::LLVM::AtomicBinOp rmwKind; diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h index b52b3a3b97..0593ca63f1 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h @@ -18,6 +18,7 @@ #include "mlir/Conversion/GPUToLLVMSPV/GPUToLLVMSPVPass.h" #include "mlir/Conversion/MathToLLVM/MathToLLVM.h" #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" +#include "mlir/Conversion/SPIRVToLLVM/SPIRVToLLVM.h" #include "mlir/Conversion/UBToLLVM/UBToLLVM.h" #include "mlir/Dialect/SPIRV/IR/TargetAndABI.h" #include "mlir/IR/PatternMatch.h" @@ -268,6 +269,8 @@ class TritonGPUToLLVMPipelineManager { triton::populateGPUToTritonGENConversionPatterns(typeConverter, patterns); cf::populateControlFlowToLLVMConversionPatterns(typeConverter, patterns); populateGpuToLLVMSPVConversionPatterns(typeConverter, patterns); + populateSPIRVToLLVMConversionPatterns(typeConverter, patterns, + spirv::ClientAPI::OpenCL); } private: diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/CMakeLists.txt b/third_party/intel/lib/TritonIntelGPUTransforms/CMakeLists.txt index dbc641e2a3..46d121a070 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/CMakeLists.txt +++ b/third_party/intel/lib/TritonIntelGPUTransforms/CMakeLists.txt @@ -4,6 +4,7 @@ add_triton_library(TritonIntelGPUTransforms DistributeToWarps.cpp MatchTargetSize.cpp MaterializeBlockPointer.cpp + OptimizeElementwiseParallelism.cpp OptimizeReductionLocality.cpp Pipeliner/MatmulLoopPipeline.cpp Pipeliner/SoftwarePipeliner.cpp diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeElementwiseParallelism.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeElementwiseParallelism.cpp new file mode 100644 index 0000000000..1bd154306d --- /dev/null +++ b/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeElementwiseParallelism.cpp @@ -0,0 +1,160 @@ +//===- OptimizeElementwiseParallelism.cpp -------------------------------*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// This file implements the `tritonintelgpu-optimize-elementwise-parallelism` +/// pass. 
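+///
+/// In short (editorial summary of the pattern implemented below): elementwise
+/// ops with a single result, no regions or successors, and a 1D tensor type
+/// whose lane dimension is fully broadcast within a single warp have their
+/// operands converted to a plain blocked layout with sizePerThread = 1; the op
+/// is re-created on that layout and its result converted back to the original
+/// encoding.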
+//===----------------------------------------------------------------------===// + +#include "intel/include/Dialect/TritonIntelGPU/Transforms/Passes.h" + +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +#include "triton/Dialect/Triton/IR/Dialect.h" +#include "triton/Dialect/Triton/IR/Utility.h" +#include "triton/Dialect/TritonGPU/IR/Dialect.h" + +#define DEBUG_TYPE "tritonintelgpu-optimize-elementwise-parallelism" + +namespace mlir::triton::gpu::intel { +#define GEN_PASS_DEF_TRITONINTELGPUOPTIMIZEELEMENTWISEPARALLELISM +#include "intel/include/Dialect/TritonIntelGPU/Transforms/Passes.h.inc" + +namespace { +/// Return whether the input linear layout can be unbroadcasted. +/// +/// A layout is valid for being "unbroadcasted" along its lanes if: +/// - The 'lane' input dimension is zero: this means the lane dimension has been +/// sliced. +/// - The size of the input 'block' dimension is 1. This is true for XPU +/// backend. +/// - The size of the input 'warp' dimension is 1. This is a limitation to keep +/// things simple for now. +/// +/// Broadcasted layouts are layouts with sliced lane, warp or block (not +/// possible for XPU backend) dimensions, i.e., the same data is owned by +/// different threads. +bool isValidLayoutForUnbroadcast(const LinearLayout &linearLayout, + PatternRewriter &rewriter) { + StringAttr kLane = rewriter.getStringAttr("lane"); + StringAttr kWarp = rewriter.getStringAttr("warp"); + StringAttr kBlock = rewriter.getStringAttr("block"); + StringAttr kDim0 = rewriter.getStringAttr("dim0"); + // 'lane' dimension must have been sliced away completely. + if (!linearLayout.sublayoutIsZero(kLane, kDim0)) + return false; + // Only single block for now. + if (linearLayout.getInDimSize(kBlock) != 1) + return false; + // Only single warp for now. + return linearLayout.getInDimSize(kWarp) == 1; +} + +/// Get optimized unbroadcasted tensor type. +/// +/// Get optimized ranked tensor type after unbroadcasting. As we only support 1D +/// tensors, this is as simple as getting an "unboradcasted" blocked-encoded 1D +/// tensor type. +RankedTensorType getOptimizedType(RankedTensorType type, + const LinearLayout &linearLayout, + PatternRewriter &rewriter) { + auto encoding = cast(type.getEncoding()); + unsigned threadsPerWarp = product(encoding.getThreadsPerWarp()); + [[maybe_unused]] unsigned warpsPerCTA = product(encoding.getWarpsPerCTA()); + assert(warpsPerCTA == 1 && "Expecting single warp"); + [[maybe_unused]] unsigned ctaSplitNum = product(encoding.getCTASplitNum()); + assert(ctaSplitNum == 1 && "Expecting single CTA"); + + RankedTensorType::Builder builder(type); + CTALayoutAttr ctaLayout = CTALayoutAttr::getDefault(rewriter.getContext(), 1); + auto newEncoding = rewriter.getAttr( + /*sizePerThread=*/1, threadsPerWarp, /*warpsPerCTA=*/1, /*order=*/0, + ctaLayout); + builder.setEncoding(newEncoding); + return builder; +} + +struct ElementwiseOptPattern final + : OpTraitRewritePattern { + using OpTraitRewritePattern::OpTraitRewritePattern; + + LogicalResult matchAndRewrite(Operation *op, + PatternRewriter &rewriter) const final { + // Rely on this for a simpler pass. + if (!op->hasTrait() || + op->getNumResults() != 1) + return failure(); + + // Skip complex operations. + if (op->hasSuccessors() || op->getNumRegions() != 0) + return failure(); + + // Layout optimizations only apply to tensors. + auto type = dyn_cast(op->getResultTypes().front()); + if (!type) + return failure(); + + // Check if the layout is actually bad and can be optimized using our + // approach. 
We only support 1D tensors for now as these are easier to + // handle. + Attribute layout = type.getEncoding(); + if (!layout || type.getRank() != 1) + return failure(); + std::optional linearLayout = + toLinearLayout(type.getShape(), layout); + if (!linearLayout || !isValidLayoutForUnbroadcast(*linearLayout, rewriter)) + return failure(); + + // Check the operands are not used by other operations. This will prevent + // register pressure increase: + if (!llvm::all_of(op->getOperands(), + [](Value val) { return val.hasOneUse(); })) + return failure(); + + // As we are dealing with 1D tensors, we can do a simple transform to obtain + // a more optimized operation. + Location loc = op->getLoc(); + RankedTensorType newType = getOptimizedType(type, *linearLayout, rewriter); + SmallVector newOperands(op->getNumOperands()); + llvm::transform(op->getOperands(), std::begin(newOperands), + [&rewriter, loc, newType](Value operand) { + return rewriter.create(loc, newType, + operand); + }); + + // Now we create the optimized operation: + StringAttr opName = op->getName().getIdentifier(); + ArrayRef attributes = op->getAttrs(); + Operation *newElementwiseOp = + rewriter.create(loc, opName, newOperands, newType, attributes); + assert(newElementwiseOp->getNumResults() == 1 && + "Expecting single result operation"); + + // Convert to unoptimized encoding for further use. + Value newValue = newElementwiseOp->getResult(0); + rewriter.replaceOpWithNewOp(op, type, newValue); + + return success(); + } +}; + +struct TritonIntelGPUOptimizeElementwiseParallelism final + : impl::TritonIntelGPUOptimizeElementwiseParallelismBase< + TritonIntelGPUOptimizeElementwiseParallelism> { + using Base::Base; + + void runOnOperation() final { + Operation *op = getOperation(); + MLIRContext *ctx = op->getContext(); + RewritePatternSet patterns(ctx); + patterns.add(ctx); + if (failed( + applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)))) + signalPassFailure(); + } +}; +} // namespace +} // namespace mlir::triton::gpu::intel diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc index 55db149919..3a3037f6c0 100644 --- a/third_party/intel/triton_xpu.cc +++ b/third_party/intel/triton_xpu.cc @@ -100,6 +100,9 @@ void init_triton_intel_passes_ttgpuir(py::module &&m) { gpu::intel::createTritonIntelGPUMaterializeBlockPointer); ADD_PASS_WRAPPER_0("add_optimize_reduction_locality", gpu::intel::createTritonIntelGPUOptimizeReductionLocality); + ADD_PASS_WRAPPER_0( + "add_optimize_elementwise_parallelism", + gpu::intel::createTritonIntelGPUOptimizeElementwiseParallelism); } void init_triton_intel(py::module &&m) { diff --git a/utils/SPIRVRunner/args_data.json b/utils/SPIRVRunner/args_data.json index 1578504cdc..db7bb68075 100644 --- a/utils/SPIRVRunner/args_data.json +++ b/utils/SPIRVRunner/args_data.json @@ -32,5 +32,6 @@ "threads_per_warp": 32, "shared_memory": 0, "kernel_name": "add_kernel", - "spv_name": "add_kernel.spv" + "spv_name": "add_kernel.spv", + "build_flags": "" }
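
A closing note on the barrier lowering changes in this patch: the updated FileCheck patterns expect `_Z22__spirv_ControlBarrieriii(2, 2, 528)` in place of `_Z7barrierj(...)`. The operands follow the SPIR-V enums used by the new `spirv.ControlBarrier` ops in `LoadStoreOpToLLVM.cpp`: `2` is `Scope::Workgroup` (execution and memory scope), and `528` is the bitwise OR of the two memory-semantics flags. A quick sanity check with the standard SPIR-V enum values:

```python
# SPIR-V MemorySemantics bits (values from the SPIR-V specification).
SEQUENTIALLY_CONSISTENT = 0x010   # 16
CROSS_WORKGROUP_MEMORY  = 0x200   # 512

# Matches the `528 : i32` constant in the updated CHECK lines.
assert SEQUENTIALLY_CONSISTENT | CROSS_WORKGROUP_MEMORY == 528

WORKGROUP_SCOPE = 2  # spirv::Scope::Workgroup, the first two i32 operands
```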