diff --git a/.github/README.md b/.github/README.md
new file mode 100644
index 0000000000..0fc64b2764
--- /dev/null
+++ b/.github/README.md
@@ -0,0 +1,324 @@
+[![Build and test](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/build-test.yml/badge.svg?branch=main)](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/build-test.yml)
+[![Triton wheels](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/nightly-wheels.yml/badge.svg?branch=main)](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/nightly-wheels.yml)
+
+# Intel® XPU Backend for Triton\*
+
+This is the development repository of Intel® XPU Backend for Triton\*, a new [Triton](https://github.com/triton-lang/triton) backend for Intel GPUs.
+Intel® XPU Backend for Triton\* is an out-of-tree backend module for [Triton](https://github.com/triton-lang/triton) that provides best-in-class performance and productivity on Intel GPUs for [PyTorch](https://github.com/pytorch/pytorch) and standalone usage.
+
+# Compatibility
+
+* Operating systems:
+  * [Ubuntu 22.04](http://releases.ubuntu.com/22.04)
+* GPU Cards:
+  * [Intel® Data Center GPU Max Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html)
+  * [Intel® Data Center Flex Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/flex-series.html)
+  * [Intel Arc A770](https://www.intel.com/content/www/us/en/products/sku/229151/intel-arc-a770-graphics-16gb/specifications.html)
+* GPU Drivers:
+  * Latest [Long Term Support (LTS) Release](https://dgpu-docs.intel.com/driver/installation.html)
+  * Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html)
+* Toolchain:
+  * Latest [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html)
+
+Note that Intel® XPU Backend for Triton\* is not compatible with Intel® Extension for PyTorch\* and Intel® oneAPI Base Toolkit\*.
+
+# Quick Installation
+
+## Prerequisites
+
+1. Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html) or [Long Term Support Release](https://dgpu-docs.intel.com/driver/installation.html) of GPU driver
+2. Latest release of [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html)
+3. Latest release of [Profiling Tools Interfaces for Intel GPU (PTI for GPU)](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html)
+
+## Install PyTorch and Triton from nightly wheels
+
+Currently, Intel® XPU Backend for Triton\* requires a special version of PyTorch; both can be installed from nightly wheels.
+Navigate to the [nightly wheels workflow](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/nightly-wheels.yml),
+select the most recent successful run at the top of the page and download an artifact for the corresponding Python version.
+Extract the archive and, in the extracted directory, execute:
+
+```shell
+pip install torch-*.whl triton-*.whl
+```
+
+Before using Intel® XPU Backend for Triton\* you need to initialize the toolchain.
+The default location is `/opt/intel/oneapi` (if installed as a `root` user) or `~/intel/oneapi` (if installed as a regular user).
+
+```shell
+# replace /opt/intel/oneapi with the actual location of PyTorch Prerequisites for Intel GPUs
+source /opt/intel/oneapi/setvars.sh
+```
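+
+As a quick sanity check of the installation, you can query the XPU device from Python. This is only a minimal sketch; it assumes a PyTorch build with XPU support that exposes the `torch.xpu` module and that the toolchain above has been sourced:
+
+```shell
+# Verify that PyTorch sees an XPU device and that Triton imports cleanly
+python -c "import torch; print(torch.xpu.is_available(), torch.xpu.device_count())"
+python -c "import triton; print(triton.__version__)"
+```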
+
+# Install from source
+
+## Prerequisites
+
+1. Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html) or [Long Term Support Release](https://dgpu-docs.intel.com/driver/installation.html) of GPU driver
+2. Latest release of [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html)
+3. Latest release of [Profiling Tools Interfaces for Intel GPU (PTI for GPU)](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html)
+
+## Compile PyTorch and Triton from source
+
+Currently, Intel® XPU Backend for Triton\* requires a special version of PyTorch; both need to be compiled at the same time.
+
+Before compiling PyTorch and Intel® XPU Backend for Triton\* you need to initialize the toolchain.
+The default location is `/opt/intel/oneapi` (if installed as a `root` user) or `~/intel/oneapi` (if installed as a regular user).
+
+```shell
+# replace /opt/intel/oneapi with the actual location of PyTorch Prerequisites for Intel GPUs
+source /opt/intel/oneapi/setvars.sh
+```
+
+Clone this repository:
+
+```shell
+git clone https://github.com/intel/intel-xpu-backend-for-triton.git
+cd intel-xpu-backend-for-triton
+```
+
+To avoid potential conflicts with installed packages, it is recommended to create and activate a new Python virtual environment:
+
+```shell
+python -m venv .venv --prompt triton
+source .venv/bin/activate
+```
+
+Compile and install PyTorch:
+
+```shell
+scripts/install-pytorch.sh --source
+```
+
+Compile and install Intel® XPU Backend for Triton\*:
+
+```shell
+scripts/compile-triton.sh
+```
+
+# Building with a custom LLVM
+
+Triton uses LLVM to generate code for GPUs and CPUs. Normally, the Triton build
+downloads a prebuilt LLVM, but you can also build LLVM from source and use that.
+
+LLVM does not have a stable API, so the Triton build will not work at an
+arbitrary LLVM version.
+
+1. Find the version of LLVM that Triton builds against.
+Check `cmake/llvm-hash.txt` to see the current version.
+
+2. Check out LLVM at this revision into the directory `llvm`,
+which must be in the same parent directory as `intel-xpu-backend-for-triton`.
+
+3. In the directory `intel-xpu-backend-for-triton`, build Triton with the custom LLVM:
+
+   ```shell
+   ./scripts/compile-triton.sh --llvm --triton
+   ```
+
+# Tips for building
+
+- Set `TRITON_BUILD_WITH_CLANG_LLD=true` as an environment variable to use clang
+  and lld. lld in particular results in faster builds (a combined example follows this list).
+
+- Set `TRITON_BUILD_WITH_CCACHE=true` to build with ccache.
+
+- Set `TRITON_HOME=/some/path` to change the location of the `.triton`
+  directory where Triton's cache is located and downloads are stored
+  during the build. By default, this is the user's home directory. It
+  can be changed anytime.
+
+- Pass `--no-build-isolation` to `pip install` to make nop builds faster.
+  Without this, every invocation of `pip install` uses a different symlink to
+  cmake, and this forces ninja to rebuild most of the `.a` files.
+
+- VSCode IntelliSense has some difficulty figuring out how to build Triton's C++
+  (probably because, in our build, users don't invoke cmake directly, but
+  instead use setup.py). Teach VSCode how to compile Triton as follows.
+
+  - Do a local build, e.g. by running `pip install -e python`.
+  - Get the full path to the `compile_commands.json` file produced by the build:
+    `find python/build -name 'compile_commands.json' | xargs readlink -f`.
+    You might get a full path similar to `/Users/{username}/triton/python/build/cmake.macosx-11.1-arm64-cpython-3.12/compile_commands.json`
+  - In VSCode, install the
+    [C/C++
+    extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode.cpptools),
+    then open the command palette (`Shift + Command + P` on Mac, or `Shift +
+    Ctrl + P` on Windows/Linux) and open `C/C++: Edit Configurations (UI)`.
+  - Open "Advanced Settings" and paste the full path to
+    `compile_commands.json` into the "Compile Commands" textbox.
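+
+As a concrete illustration of the build tips above, a faster local rebuild might look like the following sketch. The environment variables are the ones documented in this list; the cache path is only an example:
+
+```shell
+# Illustrative only: combine the documented build switches for a faster edit-compile cycle
+export TRITON_BUILD_WITH_CLANG_LLD=true   # use clang and lld
+export TRITON_BUILD_WITH_CCACHE=true      # reuse previous compilation results
+export TRITON_HOME=/tmp/triton-home       # example location for the .triton cache and downloads
+pip install -e python --no-build-isolation
+```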
+
+# Running tests
+
+There currently isn't a turnkey way to run all the Triton tests, but you can
+use the following recipe.
+
+```shell
+scripts/test-triton.sh
+```
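+
+If you only need a subset of the tests, the Python unit tests can usually be run directly with `pytest`. This is a sketch that assumes the editable install described above and a working XPU device; the exact test path may differ:
+
+```shell
+# Run only the core language unit tests on the local GPU
+python -m pytest python/test/unit/language/test_core.py -v
+```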
+
+# Tips for hacking
+
+For detailed instructions on how to debug Triton's frontend, please refer to this [tutorial](https://triton-lang.org/main/programming-guide/chapter-3/debugging.html). The following includes additional tips for hacking on Triton's backend.
+
+**Helpful environment variables**
+
+- `MLIR_ENABLE_DUMP=1` dumps the IR before every MLIR pass Triton runs, for all
+  kernels. Use `MLIR_ENABLE_DUMP=kernelName` to dump for a specific kernel only (see the combined example after this list).
+  - The Triton cache can interfere with the dump. In cases where `MLIR_ENABLE_DUMP=1` does not work, try cleaning your Triton cache: `rm -r ~/.triton/cache/*`
+- `LLVM_IR_ENABLE_DUMP=1` dumps the IR before every pass run over the LLVM IR.
+- `TRITON_INTERPRET=1` uses the Triton interpreter instead of running on the
+  GPU. You can insert Python breakpoints in your kernel code!
+- `TRITON_ENABLE_LLVM_DEBUG=1` passes `-debug` to LLVM, printing a lot of
+  debugging information to stdout. If this is too noisy, run with just
+  `TRITON_LLVM_DEBUG_ONLY` instead to limit the output.
+
+  An alternative way to reduce output noisiness is to run with
+  `LLVM_IR_ENABLE_DUMP=1`, extract the IR before the LLVM pass of interest, and
+  then run LLVM's `opt` standalone, perhaps passing `-debug-only=foo` on the
+  command line.
+- `TRITON_LLVM_DEBUG_ONLY=<comma-separated>` is the equivalent of LLVM's
+  `-debug-only` command-line option. This limits the LLVM debug output to
+  specific pass or component names (which are specified using `#define
+  DEBUG_TYPE` throughout LLVM and Triton) in order to allow the debug output to
+  be less noisy. `TRITON_LLVM_DEBUG_ONLY` allows for one or more comma-separated
+  values to be specified (e.g.
+  `TRITON_LLVM_DEBUG_ONLY="tritongpu-remove-layout-conversions"` or
+  `TRITON_LLVM_DEBUG_ONLY="tritongpu-remove-layout-conversions,regalloc"`).
+- `USE_IR_LOC={ttir,ttgir}` reparses the IR such that the location information
+  will be the line number of the IR file with that particular extension,
+  instead of the line number of the Python file. This can provide a direct mapping
+  from the IR to llir/ptx. When used with performance tools, it can provide a
+  breakdown by IR instruction.
+- `TRITON_PRINT_AUTOTUNING=1` prints out the best autotuning config and total time
+  spent for each kernel after autotuning is complete.
+- `DISABLE_LLVM_OPT` disables LLVM optimizations for make_llir and make_ptx
+  if its value is true when parsed as a Bool. Otherwise, it is parsed as a list
+  of flags for disabling individual LLVM optimizations. One use case is
+  `DISABLE_LLVM_OPT="disable-lsr"`:
+  loop strength reduction is known to cause up to 10% performance changes for
+  certain kernels with register pressure.
+- `TRITON_ALWAYS_COMPILE=1` forces kernels to be compiled regardless of cache hits.
+- `MLIR_ENABLE_TIMING` dumps the timing information for each MLIR pass.
+- `LLVM_ENABLE_TIMING` dumps the timing information for each LLVM pass.
+- `TRITON_DEFAULT_FP_FUSION` overrides the default behavior of allowing fp fusion (mul+add->fma).
+- `MLIR_ENABLE_REMARK` enables the performance warnings that are emitted as remarks.
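+
+For example, a debugging session that combines several of the variables above might look like the following sketch. The environment variables are the documented ones; the kernel name and script are placeholders:
+
+```shell
+# Clear the cache so the dump is not skipped, then dump the IR for one kernel only
+rm -rf ~/.triton/cache
+MLIR_ENABLE_DUMP=add_kernel TRITON_ALWAYS_COMPILE=1 MLIR_ENABLE_TIMING=1 python my_script.py
+```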
+
+# Usage Guide
+
+## Code Modifications
+Intel® XPU Backend for Triton\* requires a special version of PyTorch that can be built from sources or installed from nightly wheels.
+
+1. Add `import torch` to enable XPU support.
+2. Move tensors and models to the XPU by calling `to('xpu')`.
+
+This repository contains modified [tutorials](https://github.com/intel/intel-xpu-backend-for-triton/tree/main/python/tutorials) that must be used with Intel® XPU Backend for Triton\*.
+
+The following examples show the required modifications to user code.
+
+### Example 1 : Triton Kernel
+
+This example is a modified version of the [Vector Add](https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html#vector-addition) Triton kernel. Please refer to [Vector Add](https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html#vector-addition) for detailed comments and an explanation of the code semantics.
+
+Compared to the original code, the following modifications are made:
+
+```Python
+import torch
+import triton
+import triton.language as tl
+
+
+@triton.jit
+def add_kernel(
+    x_ptr,
+    y_ptr,
+    output_ptr,
+    n_elements,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+    block_start = pid * BLOCK_SIZE
+    offsets = block_start + tl.arange(0, BLOCK_SIZE)
+    mask = offsets < n_elements
+    x = tl.load(x_ptr + offsets, mask=mask)
+    y = tl.load(y_ptr + offsets, mask=mask)
+    output = x + y
+    tl.store(output_ptr + offsets, output, mask=mask)
+
+def add(x: torch.Tensor, y: torch.Tensor):
+    # Put the output tensor on the XPU
+    output = torch.empty_like(x).xpu()
+    assert x.is_xpu and y.is_xpu and output.is_xpu
+    n_elements = output.numel()
+    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
+    add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
+
+    return output
+
+# For manual_seed, use the XPU-specific API
+torch.xpu.manual_seed(0)
+size = 512
+# Tensors need to be placed on the XPU
+x = torch.rand(size, device='xpu')
+y = torch.rand(size, device='xpu')
+output_torch = x + y
+output_triton = add(x, y)
+print(output_torch)
+print(output_triton)
+print(
+    f'The maximum difference between torch and triton is '
+    f'{torch.max(torch.abs(output_torch - output_triton))}'
+)
+```
+
+### Example 2 : End-to-End Model
+Triton is transparent to end-to-end models. One can simply use `torch.compile` with the default `inductor` backend, which automatically generates Triton kernels and benefits from them.
+
+```Python
+import torch
+from torch._dynamo.testing import rand_strided
+
+from torch.nn import *
+class simpleModel(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        # tensors inside the model should be on the XPU
+        self.y = rand_strided((32, 8), (8, 1), device='xpu:0', dtype=torch.float32)
+
+    def forward(self, x):
+        z = x + self.y
+        return z
+
+# tensors passed to the model should be on the XPU
+x = rand_strided((32, 8), (8, 1), device='xpu:0', dtype=torch.float32)
+xpu_model = simpleModel()
+# Call torch.compile for optimization
+optimized_mod = torch.compile(xpu_model)
+
+graph_result = optimized_mod(x)
+```
+
+## Performance Analysis Guide
+
+There are several ways of doing performance analysis.
+We recommend using `torch.profiler` for end-to-end performance analysis and using Intel® VTune™ Profiler for more detailed kernel analysis.
+Note that `TRITON_XPU_PROFILE=1` must be set explicitly to enable kernel profiling.
+
+```Bash
+export TRITON_XPU_PROFILE=1
+```
+
+# Contributing
+
+Community contributions are more than welcome, whether to fix bugs or to add new features, at [GitHub](https://github.com/intel/intel-xpu-backend-for-triton). For more detailed instructions, please visit our [contributor's guide](https://github.com/intel/intel-xpu-backend-for-triton/blob/main/CONTRIBUTING.md).
+
+## License
+
+_MIT License_. As found in [LICENSE](https://github.com/intel/intel-xpu-backend-for-triton/blob/main/LICENSE) file.
+
+
+## Security
+
+See Intel's [Security Center](https://www.intel.com/content/www/us/en/security-center/default.html)
+for information on how to report a potential security issue or vulnerability.
+
+See also: [Security Policy](https://github.com/intel/intel-xpu-backend-for-triton/blob/main/SECURITY.md).
diff --git a/.github/pins/pytorch-upstream.txt b/.github/pins/pytorch-upstream.txt index c2ce8b1a5f..0d9e3cab75 100644 --- a/.github/pins/pytorch-upstream.txt +++ b/.github/pins/pytorch-upstream.txt @@ -1 +1 @@ -33dce10ece5b38aa0ab76739b658cd980a6e3d8f +51e8a13d007b3032af45facb50dfa4ee6012f22a diff --git a/.github/workflows/auto-update-translator-cid.yml b/.github/workflows/auto-update-translator-cid.yml index 0b854da307..7c2aad266c 100644 --- a/.github/workflows/auto-update-translator-cid.yml +++ b/.github/workflows/auto-update-translator-cid.yml @@ -15,7 +15,7 @@ jobs: runs-on: - max1100 - rolling - - runner-0.0.19 + - runner-0.0.20 defaults: run: shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}" diff --git a/.github/workflows/bandit-check.yml b/.github/workflows/bandit-check.yml index 9edaad048e..9d20af8055 100644 --- a/.github/workflows/bandit-check.yml +++ b/.github/workflows/bandit-check.yml @@ -11,7 +11,7 @@ jobs: runs-on: - max1100 - rolling - - runner-0.0.19 + - runner-0.0.20 defaults: run: shell: bash diff --git a/.github/workflows/build-test-reusable.yml b/.github/workflows/build-test-reusable.yml index aca94112b9..0c3598c679 100644 --- a/.github/workflows/build-test-reusable.yml +++ b/.github/workflows/build-test-reusable.yml @@ -55,7 +55,7 @@ on: runner_version: description: Runner label for version type: string - default: runner-0.0.19 + default: runner-0.0.20 permissions: read-all diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index d3d9b29b56..84d09afa5d 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -76,12 +76,6 @@ jobs: run: | set -x pip install --upgrade pre-commit - - # TODO: ignore the first yapf failure until https://github.com/google/yapf/issues/1164 is fixed - python3 -m pre_commit run --all-files --verbose yapf &> /dev/null || true - # If first run of yapf worked and made changes reset the tree to the original state - git reset --hard - python3 -m pre_commit run --show-diff-on-failure --color=always --all-files --verbose - name: Save pip cache diff --git a/.github/workflows/conda-test-reusable.yml b/.github/workflows/conda-test-reusable.yml index 11989e8587..d81f365489 100644 --- a/.github/workflows/conda-test-reusable.yml +++ b/.github/workflows/conda-test-reusable.yml @@ -55,7 +55,7 @@ on: runner_version: description: Runner label for version type: string - default: runner-0.0.19 + default: runner-0.0.20 env_manager: description: Environment manager default: conda @@ -82,7 +82,7 @@ jobs: id: conda-cache uses: ./.github/actions/load env: - CACHE_NUMBER: 6 + CACHE_NUMBER: 7 with: path: $HOME/miniforge3/envs/triton key: conda-${{ inputs.env_manager }}-py${{ matrix.python }}-${{ hashFiles('scripts/triton.yml', 'python/pyproject.toml', 'python/setup.py') }}-${{ env.CACHE_NUMBER }} diff --git a/.github/workflows/nightly-wheels.yml b/.github/workflows/nightly-wheels.yml index d4b236e43d..832fafce79 100644 --- a/.github/workflows/nightly-wheels.yml +++ b/.github/workflows/nightly-wheels.yml @@ -16,7 +16,7 @@ jobs: runs-on: - max1100 - rolling - - runner-0.0.19 + - runner-0.0.20 strategy: matrix: python: diff --git a/.github/workflows/no-basekit-build-test.yml b/.github/workflows/no-basekit-build-test.yml index dfa0313cbb..3c6e5a42df 100644 --- a/.github/workflows/no-basekit-build-test.yml +++ b/.github/workflows/no-basekit-build-test.yml @@ -17,7 +17,7 @@ jobs: runs-on: - max1100 - rolling - - runner-0.0.19 + - runner-0.0.20 strategy: 
matrix: python: ${{ github.ref_name == 'main' && fromJson('["3.9", "3.10", "3.11"]') || fromJson('["3.9"]') }} diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index 94e419646b..2a5f9937f6 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -24,6 +24,10 @@ on: description: Run name type: string default: "Triton benchmarks" + skip_benchmarks: + description: JSON list of benchmarks to skip + type: string + default: "[]" schedule: - cron: "5 23 * * *" pull_request: @@ -112,7 +116,7 @@ jobs: python setup.py install - name: Run Triton Softmax kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }} run: | cd benchmarks/triton_kernels_benchmark python fused_softmax.py --reports $REPORTS @@ -121,7 +125,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-xetla-report.csv --benchmark softmax --compiler xetla --param_cols "N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG - name: Run Triton GEMM kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }} run: | cd benchmarks/triton_kernels_benchmark python gemm_benchmark.py --reports $REPORTS @@ -132,7 +136,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-xetla-report.csv --benchmark gemm --compiler xetla --param_cols "B,M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG - name: Run Triton GEMM kernel benchmark - default path - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_default') }} run: | cd benchmarks/triton_kernels_benchmark # Default path: @@ -148,7 +152,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-default-path.csv $REPORTS/gemm-triton-default-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton GEMM kernel benchmark - advanced path - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_advanced') }} run: | cd benchmarks/triton_kernels_benchmark # Advanced path: @@ -164,7 +168,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-adv-path.csv $REPORTS/gemm-triton-advanced-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton GEMM (A@B^t) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }} run: | cd benchmarks/triton_kernels_benchmark TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS @@ -175,7 +179,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-onednn-report.csv --benchmark gemm-bt --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops 
--hbm_col "onednn-GB/s" --tag $TAG - name: Run Triton GEMM (A^t@B) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }} run: | cd benchmarks/triton_kernels_benchmark TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS @@ -186,7 +190,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-onednn-report.csv --benchmark gemm-at --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG - name: Run Triton GEMM (stream-k) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }} run: | cd benchmarks/triton_kernels_benchmark python gemm_streamk_benchmark.py --reports $REPORTS @@ -194,7 +198,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-streamk-performance.csv $REPORTS/gemm-streamk-triton-report.csv --benchmark gemm-streamk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton GEMM (split-k) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }} run: | cd benchmarks/triton_kernels_benchmark python gemm_splitk_benchmark.py --reports $REPORTS @@ -202,7 +206,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-splitk-performance.csv $REPORTS/gemm-splitk-triton-report.csv --benchmark gemm-splitk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton GEMM + PreOp (exp) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }} run: | cd benchmarks/triton_kernels_benchmark python gemm_preop_exp_benchmark.py --reports $REPORTS @@ -210,7 +214,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-preop-exp.csv $REPORTS/gemm-preop-exp-triton-report.csv --benchmark gemm-preop-exp --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton GEMM + PostOp (Gelu) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }} run: | cd benchmarks/triton_kernels_benchmark python gemm_postop_gelu_benchmark.py --reports $REPORTS @@ -218,7 +222,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-gelu.csv $REPORTS/gemm-postop-gelu-triton-report.csv --benchmark gemm-postop-gelu --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark.py') }} run: | cd benchmarks/triton_kernels_benchmark python 
gemm_postop_addmatrix_benchmark.py --reports $REPORTS @@ -226,7 +230,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-addmatrix.csv $REPORTS/gemm-postop-addmatrix-triton-report.csv --benchmark gemm-postop-addmatrix --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton FA kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_fwd_benchmark.py') }} run: | cd benchmarks/triton_kernels_benchmark python flash_attention_fwd_benchmark.py --reports $REPORTS @@ -236,7 +240,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-report.csv --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG - name: Run Triton FA kernel benchmark - default path - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmark || '[]'), 'flash_attention_fwd_benchmark.py_default') }} run: | cd benchmarks/triton_kernels_benchmark TRITON_INTEL_ADVANCED_PATH=0 \ @@ -249,7 +253,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-default-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton FA kernel benchmark - advanced path - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_fwd_benchmark.py_advanced') }} run: | cd benchmarks/triton_kernels_benchmark TRITON_INTEL_ADVANCED_PATH=1 \ @@ -262,7 +266,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-advanced-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Prefix Sums kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }} run: | cd benchmarks/triton_kernels_benchmark python prefix_sums.py --reports $REPORTS @@ -270,7 +274,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/prefix-sums.csv $REPORTS/prefix_sums-triton-report.csv --benchmark prefix_sums --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run micro benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }} run: | cd benchmarks/micro_benchmarks python run_benchmarks.py --reports $REPORTS diff --git a/README.md b/README.md index a8bbe2c2e1..18e46403e7 100644 --- a/README.md +++ b/README.md @@ -1,98 +1,60 @@ -[![Build and test](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/build-test.yml/badge.svg?branch=main)](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/build-test.yml) -[![Triton 
wheels](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/nightly-wheels.yml/badge.svg?branch=main)](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/nightly-wheels.yml) -[![Conda test](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/conda-build-test.yml/badge.svg?branch=main)](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/conda-build-test.yml) +
+ Triton logo +
-# Intel® XPU Backend for Triton\* +The Triton Conference is happening again on September 17th, 2024 in Fremont (CA)! -This is the development repository of Intel® XPU Backend for Triton\*, a new [Triton](https://github.com/triton-lang/triton/) backend for Intel GPUs. Intel® XPU Backend for Triton\* is a out of tree backend module for [Triton](https://github.com/triton-lang/triton/blob/main/CONTRIBUTING.md) used to provide best-in-class performance and productivity on any Intel GPUs for [PyTorch](https://github.com/triton-lang/triton/blob/main/CONTRIBUTING.md) and standalone usage. +If you are interested in attending, please fill up [this form](https://docs.google.com/forms/d/e/1FAIpQLSecHC1lkalcm0h3JDUbspekDX5bmBvMxgVTLaK3e-61bzDDbg/viewform). -# Compatibility - -* Operating systems: - * [Ubuntu 22.04](http://releases.ubuntu.com/22.04) -* GPU Cards: - * [Intel® Data Center GPU Max Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html) - * [Intel® Data Center Flex Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/flex-series.html) - * [Intel Arc A770](https://www.intel.com/content/www/us/en/products/sku/229151/intel-arc-a770-graphics-16gb/specifications.html) -* GPU Drivers: - * Latest [Long Term Support (LTS) Release](https://dgpu-docs.intel.com/driver/installation.html) - * Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html) -* Toolchain: - * Latest [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) +| **`Documentation`** | **`Nightly Wheels`** | +|-------------------- | -------------------- | +| [![Documentation](https://github.com/triton-lang/triton/actions/workflows/documentation.yml/badge.svg)](https://triton-lang.org/) | [![Wheels](https://github.com/triton-lang/triton/actions/workflows/wheels.yml/badge.svg?branch=release/2.0.x)](https://github.com/triton-lang/triton/actions/workflows/wheels.yml) | -Note that Intel® XPU Backend for Triton\* is not compatible with Intel® Extension for PyTorch\* and Intel® oneAPI Base Toolkit\*. - -# Quick Installation +# Triton -## Prerequisites +This is the development repository of Triton, a language and compiler for writing highly efficient custom Deep-Learning primitives. The aim of Triton is to provide an open-source environment to write fast code at higher productivity than CUDA, but also with higher flexibility than other existing DSLs. -1. Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html) or [Long Term Support Release](https://dgpu-docs.intel.com/driver/installation.html) of GPU driver -2. Latest release of [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) -3. Latest release of [Profiling Tools Interfaces for Intel GPU (PTI for GPU)](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) +The foundations of this project are described in the following MAPL2019 publication: [Triton: An Intermediate Language and Compiler for Tiled Neural Network Computations](http://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf). Please consider citing this work if you use Triton! 
-## Install PyTorch and Triton from nightly wheels +The [official documentation](https://triton-lang.org) contains installation instructions and tutorials. See also these third-party [Triton puzzles](https://github.com/srush/Triton-Puzzles), which can all be run using the Triton interpreter -- no GPU required. -Currently, Intel® XPU Backend for Triton\* requires a special version of PyTorch and both can be installed from nightly wheels. -Navigate to the [nightly wheels workflow](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/nightly-wheels.yml), -select the most recent successful run on the top of the page and download an artifact for the corresponding Python version. -Extract the archive and in the extracted directory execute: - -```shell -pip install torch-*.whl triton-*.whl -``` +# Quick Installation -Before using Intel® XPU Backend for Triton\* you need to initialize the toolchain. -The default location is `/opt/intel/oneapi` (if installed as a `root` user) or `~/intel/oneapi` (if installed as a regular user). +You can install the latest stable release of Triton from pip: ```shell -# replace /opt/intel/oneapi with the actual location of PyTorch Prerequisites for Intel GPUs -source /opt/intel/oneapi/setvars.sh +pip install triton ``` -# Install from source - -## Prerequisites - -1. Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html) or [Long Term Support Release](https://dgpu-docs.intel.com/driver/installation.html) of GPU driver -2. Latest release of [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) -3. Latest release of [Profiling Tools Interfaces for Intel GPU (PTI for GPU)](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) - -## Compile PyTorch and Triton from source - -Currently, Intel® XPU Backend for Triton\* requires a special version of PyTorch and both need to be compiled at the same time. +Binary wheels are available for CPython 3.8-3.12 and PyPy 3.8-3.9. -Before compiling PyTorch and Intel® XPU Backend for Triton\* you need to initialize the toolchain. -The default location is `/opt/intel/oneapi` (if installed as a `root` user) or `~/intel/oneapi` (if installed as a regular user). 
+And the latest nightly release: ```shell -# replace /opt/intel/oneapi with the actual location of PyTorch Prerequisites for Intel GPUs -source /opt/intel/oneapi/setvars.sh +pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly ``` -Clone this repository: +# Install from source ```shell -git clone https://github.com/intel/intel-xpu-backend-for-triton.git -cd intel-xpu-backend-for-triton -``` - -To avoid potential conflicts with installed packages it is recommended to create and activate a new Python virtual environment: +git clone https://github.com/triton-lang/triton.git; +cd triton; -```shell -python -m venv .venv --prompt triton -source .venv/bin/activate +pip install ninja cmake wheel pybind11; # build-time dependencies +pip install -e python ``` -Compile and install PyTorch: +Or with a virtualenv: ```shell -scripts/install-pytorch.sh --source -``` +git clone https://github.com/triton-lang/triton.git; +cd triton; -Compile and install Intel® XPU Backend for Triton\*: +python -m venv .venv --prompt triton; +source .venv/bin/activate; -```shell -scripts/compile-triton.sh +pip install ninja cmake wheel pybind11; # build-time dependencies +pip install -e python ``` # Building with a custom LLVM @@ -103,17 +65,36 @@ downloads a prebuilt LLVM, but you can also build LLVM from source and use that. LLVM does not have a stable API, so the Triton build will not work at an arbitrary LLVM version. -1. Find the version of LLVM that Triton builds against. -Check `cmake/llvm-hash.txt` to see the current version. +1. Find the version of LLVM that Triton builds against. Check +`cmake/llvm-hash.txt` to see the current version. For example, if it says: + 49af6502c6dcb4a7f7520178bd14df396f78240c -2. Checkout LLVM at this revision to the directory `llvm`, -which must be in the same directory as `intel-xpu-backend-for-triton`: + This means that the version of Triton you have builds against + [LLVM](https://github.com/llvm/llvm-project) 49af6502. -3. In the directory `intel-xpu-backend-for-triton`, build Triton with custom LLVM: +2. `git checkout` LLVM at this revision. Optionally, make additional + modifications to LLVM. - ```shell - ./scripts/compile-triton.sh --llvm --triton - ``` +3. [Build LLVM](https://llvm.org/docs/CMake.html). For example, you might run + + $ cd $HOME/llvm-project # your clone of LLVM. + $ mkdir build + $ cd build + $ cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON ../llvm -DLLVM_ENABLE_PROJECTS="mlir;llvm" -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU" + $ ninja + +4. Grab a snack, this will take a while. + +5. Build Triton as above, but set the following environment variables. + + # Modify as appropriate to point to your LLVM build. + $ export LLVM_BUILD_DIR=$HOME/llvm-project/build + + $ cd + $ LLVM_INCLUDE_DIRS=$LLVM_BUILD_DIR/include \ + LLVM_LIBRARY_DIR=$LLVM_BUILD_DIR/lib \ + LLVM_SYSPATH=$LLVM_BUILD_DIR \ + pip install -e python # Tips for building @@ -131,7 +112,7 @@ which must be in the same directory as `intel-xpu-backend-for-triton`: Without this, every invocation of `pip install` uses a different symlink to cmake, and this forces ninja to rebuild most of the `.a` files. -- VSCcode IntelliSense has some difficulty figuring out how to build Triton's C++ +- vscode intellisense has some difficulty figuring out how to build Triton's C++ (probably because, in our build, users don't invoke cmake directly, but instead use setup.py). Teach vscode how to compile Triton as follows. 
@@ -153,7 +134,36 @@ There currently isn't a turnkey way to run all the Triton tests, but you can follow the following recipe. ```shell -scripts/test-triton.sh +# One-time setup. Note we have to reinstall local Triton because torch +# overwrites it with the public version. +$ pip install scipy numpy torch pytest lit pandas matplotlib && pip install -e python + +# Run Python tests using your local GPU. +$ python3 -m pytest python/test/unit + +# Move to builddir. Fill in <...> with the full path, e.g. +# `cmake.linux-x86_64-cpython-3.11`. +$ cd python/build/cmake<...> + +# Run C++ unit tests. +$ ctest -j32 + +# Run lit tests. +$ lit test +``` + +You may find it helpful to make a symlink to the builddir and tell your local +git to ignore it. + +```shell +$ ln -s python/build/cmake<...> build +$ echo build >> .git/info/exclude +``` + +Then you can e.g. rebuild and run lit with the following command. + +```shell +$ ninja -C build && ( cd build ; lit test ) ``` # Tips for hacking @@ -203,124 +213,27 @@ For detailed instructions on how to debug Triton's frontend, please refer to thi - `TRITON_DEFAULT_FP_FUSION` overrides the default behavior of allowing fp fusion (mul+add->fma). - `MLIR_ENABLE_REMARK` enables the performance warnings that are emitted as remarks. -# Usage Guide - -## Code Modifications -Intel® XPU Backend for Triton\* requires a special version of PyTorch that can be built from sources or installed from nightly wheels. - -1. Add `import torch` for xpu support. -2. Put the tensor and models to XPU by calling `to('xpu')`. - -This repository contains modified [tutorials](python/tutorials) that must be used with Intel® XPU Backend for Triton\*. - -The following examples show modifications for the user code. - -### Example 1 : Triton Kernel - -This example is a modified version of [Vector Add](https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html#vector-addition) triton kernel. Please refer to [Vector Add](https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html#vector-addition) for detailed comments and illustration about the code semantics. - -Comparing to the original code, the following code modifies: - -```Python -import torch -import triton -import triton.language as tl - - -@triton.jit -def add_kernel( - x_ptr, - y_ptr, - output_ptr, - n_elements, - BLOCK_SIZE: tl.constexpr, -): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - x = tl.load(x_ptr + offsets, mask=mask) - y = tl.load(y_ptr + offsets, mask=mask) - output = x + y - tl.store(output_ptr + offsets, output, mask=mask) - -def add(x: torch.Tensor, y: torch.Tensor): - # Put the tensor to xpu - output = torch.empty_like(x).xpu() - assert x.is_xpu and y.is_xpu and output.is_xpu - n_elements = output.numel() - grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) - add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024) - - return output - -# For manual_seed, needs to use API for XPU -torch.xpu.manual_seed(0) -size = 512 -# For tensors, needs to be put on XPU -x = torch.rand(size, device='xpu') -y = torch.rand(size, device='xpu') -output_torch = x + y -output_triton = add(x, y) -print(output_torch) -print(output_triton) -print( - f'The maximum difference between torch and triton is ' - f'{torch.max(torch.abs(output_torch - output_triton))}' -) -``` - -### Example 2 : End-to-End Model -Triton is transparent for end-to-end models. 
One could easily use `torch.compile` with `inductor` as backend by default. It will automatically generates triton kernel and gets benefit from it. - -```Python -import torch -from torch._dynamo.testing import rand_strided - -from torch.nn import * -class simpleModel(torch.nn.Module): - def __init__(self): - super().__init__() - # tensors inside model should be on xpu - self.y = rand_strided((32, 8), (8, 1), device='xpu:0', dtype=torch.float32) - - def forward(self, x): - z = x + self.y - return z - -# tensors passed to the model should be on xpu -x = rand_strided((32, 8), (8, 1), device='xpu:0', dtype=torch.float32) -xpu_model = simpleModel() -# Call torch.compile for optimization -optimized_mod = torch.compile(xpu_model) - -graph_result = optimized_mod(x) -``` - -## Performance Analysis Guide - -There are several ways of doing performance analysis. We recommend using `torch.profiler` for end-to-end performance analysis and using Intel® VTune™ Profiler for more detailed kernel analysis. We provide a comprehensive guide for those two: -1. [end_to_end_tests#profiling settings](docs/test_docs/end_to_end_tests.md#profiling-settings) section for using `torch.profiler`. -2. [VTune Profiling Guide](docs/VTune_Profiling.md) for kernel analysis. +# Changelog -Note that the user needs to explicitly set `TRITON_XPU_PROFILE=1` when the user needs to enable kernel profiling. +Version 2.0 is out! New features include: -```Bash -export TRITON_XPU_PROFILE=1 -``` +- Many, many bug fixes +- Performance improvements +- Backend rewritten to use MLIR +- Support for kernels that contain back-to-back matmuls (e.g., flash attention) # Contributing -Community contributions are more than welcome, whether it be to fix bugs or to add new features at [github](https://github.com/intel/intel-xpu-backend-for-triton). For more detailed instructions, please visit our [contributor's guide](CONTRIBUTING.md). - -## License +Community contributions are more than welcome, whether it be to fix bugs or to add new features at [github](https://github.com/triton-lang/triton/). For more detailed instructions, please visit our [contributor's guide](CONTRIBUTING.md). -_MIT License_. As found in [LICENSE](https://github.com/intel/intel-xpu-backend-for-triton/blob/main/LICENSE) file. +# Compatibility +Supported Platforms: -## Security +- Linux -See Intel's [Security Center](https://www.intel.com/content/www/us/en/security-center/default.html) -for information on how to report a potential security issue or vulnerability. 
+Supported Hardware: -See also: [Security Policy](security.md) +- NVIDIA GPUs (Compute Capability 8.0+) +- AMD GPUs (ROCm 5.2+) +- Under development: CPUs diff --git a/benchmarks/xetla_kernel/flash_attention/fmha_backward.h b/benchmarks/xetla_kernel/flash_attention/fmha_backward.h index 0b1dd7ef90..ed769dc4b3 100644 --- a/benchmarks/xetla_kernel/flash_attention/fmha_backward.h +++ b/benchmarks/xetla_kernel/flash_attention/fmha_backward.h @@ -4,6 +4,7 @@ #include "fmha_backward_policy.h" #include "fmha_utils.h" #include "xetla.hpp" +#include using T = sycl::half; diff --git a/benchmarks/xetla_kernel/flash_attention/fmha_utils.h b/benchmarks/xetla_kernel/flash_attention/fmha_utils.h index 9327b046ad..94123300f5 100644 --- a/benchmarks/xetla_kernel/flash_attention/fmha_utils.h +++ b/benchmarks/xetla_kernel/flash_attention/fmha_utils.h @@ -17,6 +17,7 @@ #pragma once #include "xetla.hpp" +#include namespace gpu::xetla { diff --git a/benchmarks/xetla_kernel/xetla-library.conf b/benchmarks/xetla_kernel/xetla-library.conf index 2cc1e9f5b3..944094ecd3 100644 --- a/benchmarks/xetla_kernel/xetla-library.conf +++ b/benchmarks/xetla_kernel/xetla-library.conf @@ -1 +1 @@ -b9e489ca6a776694a898044a3f2ae023a98db03d +bde127ffebf502d32ef8ac2748e12d7839597fab diff --git a/include/triton/Dialect/Triton/IR/TritonOps.td b/include/triton/Dialect/Triton/IR/TritonOps.td index 6b2faf336a..87dc10a71c 100644 --- a/include/triton/Dialect/Triton/IR/TritonOps.td +++ b/include/triton/Dialect/Triton/IR/TritonOps.td @@ -778,7 +778,8 @@ def TT_ScanReturnOp: TT_Op<"scan.return", def TT_ExternElementwiseOp : TT_Op<"extern_elementwise", [Elementwise, SameOperandsAndResultEncoding, SameVariadicOperandSize, - DeclareOpInterfaceMethods]> { + DeclareOpInterfaceMethods, + ConditionallySpeculatable]> { let description = [{ call an external function $symbol implemented in $libpath/$libname with $args @@ -790,6 +791,12 @@ def TT_ExternElementwiseOp : TT_Op<"extern_elementwise", [Elementwise, let results = (outs TT_Type:$result); let assemblyFormat = "operands attr-dict `:` functional-type(operands, $result)"; + + let extraClassDeclaration = [{ + // Interface method for ConditionallySpeculatable. + Speculation::Speculatability getSpeculatability(); + }]; + } // diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp index 0321cc9b00..6e2d5149be 100644 --- a/lib/Analysis/Utility.cpp +++ b/lib/Analysis/Utility.cpp @@ -71,18 +71,25 @@ unsigned ReduceOpHelper::getThreadOffsetOnReductionAxis() { } unsigned threadOffset = 1; - if (auto sliceLayout = mlir::dyn_cast(srcLayout)) { - auto parentLayout = sliceLayout.getParent(); - auto threadsPerWarp = getThreadsPerWarp(parentLayout); - threadOffset = threadsPerWarp[sliceLayout.getDim()]; - } else { - auto threadsPerWarp = getThreadsPerWarp(srcLayout); - auto order = getThreadOrder(srcLayout); - for (unsigned i = 0; i < order.size(); i++) { - if (order[i] == axis) - break; - threadOffset *= threadsPerWarp[order[i]]; - } + SmallVector dimsRemoved; + while (auto sliceLayout = mlir::dyn_cast(srcLayout)) { + dimsRemoved.push_back(sliceLayout.getDim()); + srcLayout = sliceLayout.getParent(); + } + // In case of slice layout we want to know the axis dimension relative to the + // most inner parent layout. `adjustedAxis` is the matching axis dim in the + // parent layout. 
+ int adjustedAxis = axis; + for (auto dim : dimsRemoved) { + if (dim <= adjustedAxis) + adjustedAxis++; + } + auto threadsPerWarp = getThreadsPerWarp(srcLayout); + auto order = getThreadOrder(srcLayout); + for (unsigned i = 0; i < order.size(); i++) { + if (order[i] == adjustedAxis) + break; + threadOffset *= threadsPerWarp[order[i]]; } return threadOffset; } diff --git a/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp index 4e80539233..829d4e7104 100644 --- a/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp @@ -218,6 +218,46 @@ struct ReduceOpConversion rewriter.replaceOp(op, results); } + // For slice layout some ids are duplicated on multiple lanes, so we need to + // handle the delinearization of laneId in a special way. We need to + // generalize this part of the logic to work on any kind of linear layout + // uniformely. + SmallVector + getMultiDimLaneId(ReduceOpHelper &helper, Value &laneId, Location &loc, + ConversionPatternRewriter &rewriter) const { + auto srcLayout = helper.getSrcLayout(); + auto srcShape = helper.getSrcShape(); + auto order = triton::gpu::getThreadOrder(srcLayout); + SmallVector multiDimLaneId; + + if (auto sliceLayout = mlir::dyn_cast(srcLayout)) { + auto parentLayout = sliceLayout.getParent(); + SmallVector dims = {sliceLayout.getDim()}; + while (auto parentSliceLayout = + mlir::dyn_cast(parentLayout)) { + dims.push_back(parentSliceLayout.getDim()); + parentLayout = parentSliceLayout.getParent(); + } + + auto parentThreadsPerWarps = triton::gpu::getThreadsPerWarp(parentLayout); + auto parentOrder = triton::gpu::getThreadOrder(parentLayout); + multiDimLaneId = delinearize(rewriter, loc, laneId, parentThreadsPerWarps, + parentOrder); + for (unsigned dim : llvm::reverse(dims)) { + multiDimLaneId.erase(multiDimLaneId.begin() + dim); + } + } else { + SmallVector threadsPerWarps = + triton::gpu::getThreadsPerWarp(srcLayout); + threadsPerWarps[helper.getAxis()] = + triton::gpu::getThreadsPerWarpWithUniqueData( + srcLayout, srcShape)[helper.getAxis()]; + multiDimLaneId = + delinearize(rewriter, loc, laneId, threadsPerWarps, order); + } + return multiDimLaneId; + } + SmallVector getMultiDimWarpId(ReduceOpHelper &helper, Value &warpId, Location &loc, ConversionPatternRewriter &rewriter) const { @@ -231,11 +271,20 @@ struct ReduceOpConversion // a way to properly delinearize warpId in the slice case if (auto sliceLayout = mlir::dyn_cast(srcLayout)) { auto parentLayout = sliceLayout.getParent(); + SmallVector dims = {sliceLayout.getDim()}; + while (auto parentSliceLayout = + mlir::dyn_cast(parentLayout)) { + dims.push_back(parentSliceLayout.getDim()); + parentLayout = parentSliceLayout.getParent(); + } + auto parentWarpsPerCTA = triton::gpu::getWarpsPerCTA(parentLayout); auto parentOrder = triton::gpu::getWarpOrder(parentLayout); multiDimWarpId = delinearize(rewriter, loc, warpId, parentWarpsPerCTA, parentOrder); - multiDimWarpId.erase(multiDimWarpId.begin() + sliceLayout.getDim()); + for (unsigned dim : llvm::reverse(dims)) { + multiDimWarpId.erase(multiDimWarpId.begin() + dim); + } } else { SmallVector warpsPerCTA = triton::gpu::getWarpsPerCTA(srcLayout); @@ -263,11 +312,8 @@ struct ReduceOpConversion unsigned axis = op.getAxis(); auto smemShape = helper.getScratchRepShape(); - auto threadsPerWarp = - triton::gpu::getThreadsPerWarpWithUniqueData(srcLayout, srcShape); - auto order = getThreadOrder(srcLayout); SmallVector multiDimLaneId = - 
delinearize(rewriter, loc, laneId, threadsPerWarp, order); + getMultiDimLaneId(helper, laneId, loc, rewriter); Value laneIdAxis = multiDimLaneId[axis]; Value zero = i32_val(0); Value laneZero = icmp_eq(laneIdAxis, zero); diff --git a/lib/Dialect/Triton/IR/Ops.cpp b/lib/Dialect/Triton/IR/Ops.cpp index a16f9a3cad..12f5be29c9 100644 --- a/lib/Dialect/Triton/IR/Ops.cpp +++ b/lib/Dialect/Triton/IR/Ops.cpp @@ -1039,6 +1039,12 @@ void ExternElementwiseOp::getEffects( SideEffects::DefaultResource::get()); } +Speculation::Speculatability ExternElementwiseOp::getSpeculatability() { + if (getPure()) + return Speculation::Speculatable; + return Speculation::NotSpeculatable; +} + // -- ExperimentalTensormapCreateOp -- LogicalResult ExperimentalTensormapCreateOp::verify() { auto rank = getBoxDim().size(); diff --git a/lib/Target/SPIRV/spirv-llvm-translator.conf b/lib/Target/SPIRV/spirv-llvm-translator.conf index 22368d5080..8134401a4c 100644 --- a/lib/Target/SPIRV/spirv-llvm-translator.conf +++ b/lib/Target/SPIRV/spirv-llvm-translator.conf @@ -1 +1 @@ -1a1bf17d9e8684cd826e4278e78f63aa80e2e2ca +15fd1cc50e12465c74ef34a264f11c8523247b46 diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index f7ec0bbb2b..2239f2578e 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -31,6 +31,7 @@ is_hip, is_hip_cdna, is_hip_mi200, + is_hip_mi300, is_xpu, get_arch, torch_float8_dtypes, @@ -3414,8 +3415,8 @@ def test_scaled_dot(M, N, K, col_a, col_b, type_a, type_b, num_warps, mma, kpack if is_hip(): if not is_hip_cdna(): pytest.skip("scaled_dot only implemented for HIP CDNA") - if (type_a not in ["e2m1", "e5m2"]) or (type_b not in ["e2m1", "e5m2", "bf16"]): - pytest.skip(f"scaled_dot({type_a}, {type_b}) not yet implemented for HIP") + if "e4m3" in (type_a, type_b) and not is_hip_mi300(): + pytest.skip(f"scaled_dot({type_a}, {type_b}) only implemented for MI300") if mma == 16 and K == 64: pytest.skip(f"K == {K} too small for mfma {mma} in scaled_dot") if is_xpu(): @@ -5186,8 +5187,6 @@ def return_poison(x): def test_poison_return(device): - if is_xpu(): - pytest.skip("FIXME: poison is optimized away by opt") @triton.jit def kernel(Out): @@ -5196,7 +5195,9 @@ def kernel(Out): a = torch.empty((), device=device, dtype=torch.int32) h = kernel[(1, )](a) assert "ub.poison" in h.asm["ttir"], h.asm["ttir"] - assert "poison" in h.asm["llir"], h.asm["llir"] + # xpu uses llvm.store, which in this case is removed by the optimizer + if not is_xpu(): + assert "poison" in h.asm["llir"], h.asm["llir"] # ----------------------- @@ -6072,3 +6073,33 @@ def sanitize_cumsum_kernel(Z, X, BLOCK: tl.constexpr): Z = torch.zeros_like(X) sanitize_cumsum_kernel[(1, )](Z, X, BLOCK=BLOCK) torch.testing.assert_close(Z, X.cumsum(0).to(torch.int32)) + + +# stress test slice layout usages in reductions. 
+@pytest.mark.parametrize("in_shape, perm, red_dims", [ + ((4, 32, 32, 4, 2), [2, 1, 0, 3, 4], [3, 1, 0]), + ((8, 2, 32, 4, 16), [4, 0, 1, 3, 2], [0, 2, 0]), +]) +def test_chained_reductions(in_shape, perm, red_dims, device): + + @triton.jit + def kernel(In, Out, # + dim_0: tl.constexpr, dim_1: tl.constexpr, dim_2: tl.constexpr, dim_3: tl.constexpr, dim_4: tl.constexpr, + perm_0: tl.constexpr, perm_1: tl.constexpr, perm_2: tl.constexpr, perm_3: tl.constexpr, + perm_4: tl.constexpr, red_dim_0: tl.constexpr, red_dim_1: tl.constexpr, red_dim_2: tl.constexpr): + idx = tl.arange(0, dim_0 * dim_1 * dim_2 * dim_3 * dim_4) + idx = idx.reshape(dim_0, dim_1, dim_2, dim_3, dim_4) + vals = tl.load(In + idx) + vals = tl.permute(vals, [perm_0, perm_1, perm_2, perm_3, perm_4]) + r = tl.sum(tl.sum(tl.sum(vals, red_dim_0), red_dim_1), red_dim_2) + st_idx = tl.arange(0, r.shape[0] * r.shape[1]).reshape(r.shape) + tl.store(Out + st_idx, r) + + input = torch.randint(0, 1000, in_shape, device=device, dtype=torch.int32) + temp = torch.permute(input, perm).contiguous() + ref = torch.sum(torch.sum(torch.sum(temp, dim=red_dims[0]), dim=red_dims[1]), dim=red_dims[2]) + result = torch.empty_like(ref) + kernel[(1, )](input, result, input.shape[0], input.shape[1], input.shape[2], input.shape[3], input.shape[4], + perm[0], perm[1], perm[2], perm[3], perm[4], red_dims[0], red_dims[1], red_dims[2]) + + assert torch.all(ref == result) diff --git a/python/test/unit/language/test_libdevice.py b/python/test/unit/language/test_libdevice.py index 58f063cd77..1a18e9ef78 100644 --- a/python/test/unit/language/test_libdevice.py +++ b/python/test/unit/language/test_libdevice.py @@ -4,6 +4,7 @@ import triton import triton.language as tl from triton.language.extra.intel import libdevice +from triton.language.extra.libdevice import fast_dividef as my_fast_dividef @pytest.mark.parametrize("dtype_str", ["float32", "float64"]) @@ -39,3 +40,20 @@ def kernel(in_p, out_p, fn: tl.constexpr, SIZE: tl.constexpr): kernel[(1, )](x, y_exp, fn=libdevice_fn, SIZE=SIZE, num_warps=4, num_ctas=1) torch.testing.assert_close(y_ref, y_exp, equal_nan=True) + + +def test_libdevice_rename(device): + # mark the import as used by this test + _ = my_fast_dividef + + @triton.jit + def triton_copy(in_ptr, out_ptr, BLOCK_SIZE: tl.constexpr): + offsets = tl.arange(0, BLOCK_SIZE) + data = tl.load(in_ptr + offsets) + tl.store(out_ptr + offsets, data) + + BLOCK_SIZE = 256 + inp = torch.randn(BLOCK_SIZE, device=device) + out = torch.empty_like(inp) + + triton_copy[(1, )](inp, out, BLOCK_SIZE) diff --git a/python/triton/compiler/code_generator.py b/python/triton/compiler/code_generator.py index ec0ef227fa..4ec46f32c4 100644 --- a/python/triton/compiler/code_generator.py +++ b/python/triton/compiler/code_generator.py @@ -218,7 +218,7 @@ def __init__(self, context, prototype, gscope, attributes, constants, function_n module_name = getattr(v, "__module__", "") if module_name in module_map: - self.gscope[k] = getattr(module_map[module_name], k) + self.gscope[k] = getattr(module_map[module_name], v.__name__) else: self.gscope[k] = v diff --git a/scripts/install-conda.sh b/scripts/install-conda.sh index c55cc95399..dd29d57fa7 100755 --- a/scripts/install-conda.sh +++ b/scripts/install-conda.sh @@ -2,8 +2,8 @@ link_sycl() { mkdir -p $HOME/miniforge3/envs/triton/$1 - ln -snf /opt/intel/oneapi/compiler/2024.1/include/sycl $HOME/miniforge3/envs/triton/$1/ - ln -snf /opt/intel/oneapi/compiler/2024.1/include/sycl/CL $HOME/miniforge3/envs/triton/$1/ + ln -snf 
/opt/intel/oneapi/compiler/latest/include/sycl $HOME/miniforge3/envs/triton/$1/ + ln -snf /opt/intel/oneapi/compiler/latest/include/sycl/CL $HOME/miniforge3/envs/triton/$1/ } install_env() { diff --git a/scripts/skiplist/a770/language.txt b/scripts/skiplist/a770/language.txt index a829b75bf0..e833b924bd 100644 --- a/scripts/skiplist/a770/language.txt +++ b/scripts/skiplist/a770/language.txt @@ -2,6 +2,8 @@ test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] # https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] +# https://github.com/intel/intel-xpu-backend-for-triton/issues/2703 +test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float32-float32] diff --git a/scripts/skiplist/conda/language.txt b/scripts/skiplist/conda/language.txt index f8a9e48124..41035163ff 100644 --- a/scripts/skiplist/conda/language.txt +++ b/scripts/skiplist/conda/language.txt @@ -117,6 +117,8 @@ test/unit/language/test_core.py::test_dot_max_num_imprecise_acc[128-float8e5-128 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] # https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] +# https://github.com/intel/intel-xpu-backend-for-triton/issues/2703 +test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float32-float32] diff --git a/scripts/skiplist/default/language.txt b/scripts/skiplist/default/language.txt index 36c6d7e69b..fb018c5e0f 100644 --- a/scripts/skiplist/default/language.txt +++ b/scripts/skiplist/default/language.txt @@ -2,3 +2,5 @@ test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] # https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] +# https://github.com/intel/intel-xpu-backend-for-triton/issues/2703 +test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0] diff --git a/scripts/skiplist/lts/language.txt b/scripts/skiplist/lts/language.txt index 76dd77c939..c2842cdb91 100644 --- a/scripts/skiplist/lts/language.txt +++ b/scripts/skiplist/lts/language.txt @@ -117,6 +117,8 @@ test/unit/language/test_core.py::test_dot_max_num_imprecise_acc[128-float8e5-128 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] # https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] +# https://github.com/intel/intel-xpu-backend-for-triton/issues/2703 +test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] 
test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float32-float32] diff --git a/scripts/skiplist/mtl/language.txt b/scripts/skiplist/mtl/language.txt index a346bc76ab..df2e44aae4 100644 --- a/scripts/skiplist/mtl/language.txt +++ b/scripts/skiplist/mtl/language.txt @@ -2,6 +2,8 @@ test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] # https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] +# https://github.com/intel/intel-xpu-backend-for-triton/issues/2703 +test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float32-float32] diff --git a/scripts/skiplist/xe2/language.txt b/scripts/skiplist/xe2/language.txt index 3d923ed1d6..436fe52550 100644 --- a/scripts/skiplist/xe2/language.txt +++ b/scripts/skiplist/xe2/language.txt @@ -2,6 +2,8 @@ test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] # https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] +# https://github.com/intel/intel-xpu-backend-for-triton/issues/2703 +test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float32-float32] diff --git a/test/Conversion/intel/tritongpu_to_gen.mlir b/test/Conversion/intel/tritongpu_to_gen.mlir index d83a0b4b25..f6ddb63183 100644 --- a/test/Conversion/intel/tritongpu_to_gen.mlir +++ b/test/Conversion/intel/tritongpu_to_gen.mlir @@ -1045,7 +1045,6 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // ----- module attributes {"triton_gpu.target" = "xpu", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { - // CHECK: llvm.func spir_funccc @_Z7barrierj(i32) attributes {convergent, no_unwind, will_return} // CHECK-LABEL: atomic_cas_f32_scalar_no_store tt.func @atomic_cas_f32_scalar_no_store(%ptr : !tt.ptr, %cmp : f32, %val : f32) { // CHECK: [[TRUE:%.*]] = llvm.mlir.constant(true) : i1 @@ -1054,7 +1053,10 @@ module attributes {"triton_gpu.target" = "xpu", "triton_gpu.num-ctas" = 1 : i32, // CHECK: [[CMP:%.*]] = llvm.icmp "eq" // CHECK: [[MASK:%.*]] = llvm.and [[MASK0]], [[CMP]] // CHECK: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: llvm.call spir_funccc @_Z7barrierj({{.*}}) {{.*}} : (i32) -> () + // CHECK: [[WGSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: [[WGMEMSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: [[GLOBAL:%.*]] = llvm.mlir.constant(528 : i32) : i32 + // CHECK: llvm.call spir_funccc @_Z22__spirv_ControlBarrieriii([[WGSCOPE]], [[WGMEMSCOPE]], [[GLOBAL]]) {convergent, no_unwind, will_return} : (i32, i32, i32) -> () // CHECK-NEXT: llvm.cond_br [[MASK]], ^bb1, ^bb2([[ZERO]] : i32) // CHECK-NEXT: ^bb1: // CHECK-NEXT: [[BCAST1:%.*]] = llvm.bitcast %arg1 : f32 to i32 @@ -1109,7 +1111,6 @@ module attributes 
{"triton_gpu.target" = "xpu", "triton_gpu.num-ctas" = 1 : i32, #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { - // CHECK: llvm.func spir_funccc @_Z7barrierj(i32) attributes {convergent, no_unwind, will_return} // CHECK-LABEL: atomic_add_f32 tt.func @atomic_add_f32(%arg0 : tensor<256x!tt.ptr, #blocked0>, %arg1 : tensor<256xi1, #blocked0>, %arg2 : tensor<256xf32, #blocked0>) { // CHECK: [[EV0_ARG2:%.*]] = llvm.extractvalue %arg2[0] : !llvm.struct<(f32, f32)> @@ -1132,7 +1133,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: [[IE2:%.*]] = llvm.insertelement [[EV1_ARG2]], [[UNDEF2]][{{.*}} : i64] : vector<1xf32> // CHECK-NEXT: [[PRED2:%.*]] = llvm.and [[CST_TRUE]], {{.*}} : i1 // CHECK-NEXT: [[ZERO2:%.*]] = llvm.mlir.constant(0.000000e+00 : f32) : f32 - // CHECK: llvm.call spir_funccc @_Z7barrierj({{.*}}) {{.*}} : (i32) -> () + // CHECK: [[WGSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: [[WGMEMSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: [[GLOBAL:%.*]] = llvm.mlir.constant(528 : i32) : i32 + // CHECK: llvm.call spir_funccc @_Z22__spirv_ControlBarrieriii([[WGSCOPE]], [[WGMEMSCOPE]], [[GLOBAL]]) {convergent, no_unwind, will_return} : (i32, i32, i32) -> () // CHECK-NEXT: llvm.cond_br [[PRED2]], ^bb3, ^bb4([[ZERO2]] : f32) // CHECK-NEXT: ^bb3: // CHECK-NEXT: [[BCAST2:%.*]] = llvm.bitcast [[IE2]] : vector<1xf32> to f32 @@ -1147,7 +1151,6 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // ----- module attributes {"triton_gpu.target" = "xpu", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { - // CHECK: llvm.func spir_funccc @_Z7barrierj(i32) attributes {convergent, no_unwind, will_return} // CHECK-LABEL: atomic_add_f32_scalar_no_store tt.func @atomic_add_f32_scalar_no_store(%arg0 : !tt.ptr, %arg1 : i1, %arg2 : f32) { // CHECK: [[CST_TRUE:%.*]] = llvm.mlir.constant(true) : i1 @@ -1159,7 +1162,10 @@ module attributes {"triton_gpu.target" = "xpu", "triton_gpu.num-ctas" = 1 : i32, // CHECK: [[IE1:%.*]] = llvm.insertelement %arg2, [[UNDEF1]][{{.*}} : i64] : vector<1xf32> // CHECK: [[PRED:%.*]] = llvm.and [[AND1]], %arg1 : i1 // CHECK-NEXT: [[ZERO:%.*]] = llvm.mlir.constant(0.000000e+00 : f32) : f32 - // CHECK: llvm.call spir_funccc @_Z7barrierj({{.*}}) {{.*}} : (i32) -> () + // CHECK: [[WGSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: [[WGMEMSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: [[GLOBAL:%.*]] = llvm.mlir.constant(528 : i32) : i32 + // CHECK: llvm.call spir_funccc @_Z22__spirv_ControlBarrieriii([[WGSCOPE]], [[WGMEMSCOPE]], [[GLOBAL]]) {convergent, no_unwind, will_return} : (i32, i32, i32) -> () // CHECK-NEXT: llvm.cond_br [[PRED]], ^bb1, ^bb2([[ZERO]] : f32) // CHECK-NEXT: ^bb1: // CHECK-NEXT: [[BCAST2:%.*]] = llvm.bitcast [[IE1]] : vector<1xf32> to f32 diff --git a/test/Conversion/tritongpu_to_llvm.mlir b/test/Conversion/tritongpu_to_llvm.mlir index 4c5d658af1..679a18cd9b 100644 --- a/test/Conversion/tritongpu_to_llvm.mlir +++ b/test/Conversion/tritongpu_to_llvm.mlir @@ -1855,3 +1855,21 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : #loc3 = loc("inner_call":29:28) #loc4 = loc(callsite(#loc3 at #loc1)) #loc5 = loc(callsite(#loc4 at #loc2)) + +// ----- + +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 
1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:90", "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @log1pf_scan(%39: tensor<32x16xf32, #blocked>) attributes {noinline = false} { + // CHECK: log1pf_scan + // non-speculatable ops will introduce a cond_br; extern_elementwise with pure = true should be considered speculatable. + // CHECK-NOT: llvm.cond_br + %40 = "tt.scan"(%39) <{axis = 1 : i32, reverse = false}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %43 = tt.extern_elementwise %arg5 {libname = "", libpath = "", pure = true, symbol = "__nv_log1pf"} : (f32) -> f32 + %44 = arith.addf %43, %43 : f32 + tt.scan.return %44 : f32 + }) : (tensor<32x16xf32, #blocked>) -> tensor<32x16xf32, #blocked> + tt.return + } +} diff --git a/test/TritonGEN/tritongen-to-llvm.mlir b/test/TritonGEN/tritongen-to-llvm.mlir index e4dd0bc962..e3a4cdbfac 100644 --- a/test/TritonGEN/tritongen-to-llvm.mlir +++ b/test/TritonGEN/tritongen-to-llvm.mlir @@ -1,20 +1,5 @@ // RUN: triton-opt -convert-tritongen-to-llvm -split-input-file %s | FileCheck %s -// CHECK: llvm.func spir_funccc @_Z7barrierj(i32) attributes {convergent, no_unwind, will_return} - -llvm.func @triton_gen.barrier() { - // CHECK-LABEL: triton_gen.barrier - // CHECK: [[LOCAL:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: llvm.call spir_funccc @_Z7barrierj([[LOCAL]]) {{.*}} : (i32) -> () - // CHECK: [[GLOBAL:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK: llvm.call spir_funccc @_Z7barrierj([[GLOBAL]]) {{.*}} : (i32) -> () - triton_gen.barrier {mem_fence=Local} - triton_gen.barrier {mem_fence=Global} - llvm.return -} - -// ----- - // CHECK-DAG: llvm.func spir_funccc @_Z31intel_work_group_barrier_arriveii(i32, i32) attributes {convergent, no_unwind, will_return} // CHECK-DAG: llvm.func spir_funccc @_Z29intel_work_group_barrier_waitii(i32, i32) attributes {convergent, no_unwind, will_return} diff --git a/test/TritonGEN/tritongen.mlir b/test/TritonGEN/tritongen.mlir index f388da5aa9..90e2336ded 100644 --- a/test/TritonGEN/tritongen.mlir +++ b/test/TritonGEN/tritongen.mlir @@ -1,12 +1,5 @@ // RUN: triton-opt %s -split-input-file -verify-diagnostics | FileCheck %s -llvm.func @triton_gen.barrier() { - // CHECK-LABEL: triton_gen.barrier - // CHECK: triton_gen.barrier {mem_fence = Local} - triton_gen.barrier {mem_fence=Local} - llvm.return -} - llvm.func @triton_gen.split_barrier_signal() { // CHECK-LABEL: triton_gen.split_barrier_signal // CHECK: triton_gen.split_barrier_signal {mem_fence = None, mem_scope = WorkGroup} diff --git a/test/TritonIntelGPU/optimize-elementwise.mlir b/test/TritonIntelGPU/optimize-elementwise.mlir new file mode 100644 index 0000000000..d8b64bab89 --- /dev/null +++ b/test/TritonIntelGPU/optimize-elementwise.mlir @@ -0,0 +1,65 @@ +// RUN: triton-opt %s --split-input-file -tritonintelgpu-optimize-elementwise-parallelism | FileCheck %s + +// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}> +// CHECK: #[[$ATTR_1:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}> + +#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = 
[16, 32]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +// CHECK-LABEL: tt.func @test_dpas( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>) + tt.func @test_dpas(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> { +// CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]> +// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]> +// CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<16xf32, #[[$ATTR_0]]> + %0 = arith.addf %arg0, %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> +// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> +// CHECK: tt.return %[[VAL_5]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> + tt.return %0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> + } +} + +// ----- + +// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}> +// CHECK: #[[$ATTR_1:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}> + +#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +// CHECK-LABEL: tt.func @test_blocked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>) + tt.func @test_blocked(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> { +// CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<16xf32, #[[$ATTR_1]]> +// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<16xf32, #[[$ATTR_1]]> +// CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<16xf32, #[[$ATTR_1]]> + %0 = arith.addf %arg0, %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> +// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<16xf32, #[[$ATTR_1]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> +// CHECK: tt.return %[[VAL_5]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> + tt.return %0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + } +} + +// ----- + +// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}> +// CHECK: #[[$ATTR_1:.+]] = 
#triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}> + +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +// CHECK-LABEL: tt.func @test_blocked_repeat( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>) + tt.func @test_blocked_repeat(%arg0: tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> { +// CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<64xf32, #[[$ATTR_1]]> +// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<64xf32, #[[$ATTR_1]]> +// CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<64xf32, #[[$ATTR_1]]> + %0 = arith.addf %arg0, %arg1 : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> +// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<64xf32, #[[$ATTR_1]]> -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> +// CHECK: tt.return %[[VAL_5]] : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> + tt.return %0 : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + } +} diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py index 50301edd1d..b05f856bbf 100644 --- a/third_party/intel/backend/compiler.py +++ b/third_party/intel/backend/compiler.py @@ -251,8 +251,9 @@ def make_ttgir(mod, metadata, opt, properties): passes.common.add_cse(pm) passes.ttgpuir.add_prefetch(pm) passes.ttgpuir.add_optimize_dot_operands(pm, True) - if os.getenv("TRITON_INTEL_OPTIMIZE_REDUCTION_LOCALITY", "0") == 1: + if os.getenv("TRITON_INTEL_OPTIMIZE_REDUCTION_LOCALITY", "0") == "1": intel.passes.ttgpuir.add_optimize_reduction_locality(pm) + intel.passes.ttgpuir.add_optimize_elementwise_parallelism(pm) intel.passes.ttgpuir.add_remove_layout_conversions(pm) intel.passes.ttgpuir.add_reduce_data_duplication(pm) passes.ttgpuir.add_reorder_instructions(pm) diff --git a/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENOps.td b/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENOps.td index d9d5266fba..dde9fd97e3 100644 --- a/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENOps.td +++ b/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENOps.td @@ -32,20 +32,6 @@ class TritonGEN_Op traits = []> : // Synchronization //===----------------------------------------------------------------------===// -def TritonGEN_BarrierOp : TritonGEN_Op<"barrier"> { - let summary = "Workgroup barrier"; - let description = [{ - The `triton_gen.barrier` operation performs a workgroup barrier and ensures - all outstanding memory transaction using local or global memory are complete. 
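An aside on the `third_party/intel/backend/compiler.py` hunk above: `os.getenv` returns a string, so the old comparison against the integer `1` could never succeed and the guarded reduction-locality pass was never added to the pipeline; the corrected check compares against the string `"1"`. A minimal standalone illustration (not the backend code itself):

```python
import os

# Simulate a user enabling the feature.
os.environ["TRITON_INTEL_OPTIMIZE_REDUCTION_LOCALITY"] = "1"

flag = os.getenv("TRITON_INTEL_OPTIMIZE_REDUCTION_LOCALITY", "0")
assert flag == "1"   # new check: string comparison, matches
assert flag != 1     # old check compared with the int 1 and never matched
```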
- }]; - let arguments = (ins TritonGEN_MemFence:$mem_fence); - let results = (outs); - let assemblyFormat = "attr-dict"; - let assemblyFormat = [{ - ` ` `{` `mem_fence` `=` $mem_fence `}` attr-dict - }]; -} - def TritonGEN_SplitBarrierSignalOp : TritonGEN_Op<"split_barrier_signal"> { let summary = "Split barrier signal"; let description = [{ diff --git a/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td b/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td index c551a96856..1d81bc4741 100644 --- a/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td +++ b/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td @@ -365,4 +365,52 @@ tt.func @test(%arg0: tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slic "mlir::triton::gpu::TritonGPUDialect"]; } +def TritonIntelGPUOptimizeElementwiseParallelism + : Pass<"tritonintelgpu-optimize-elementwise-parallelism", "mlir::ModuleOp"> { + let summary = + "Improve parallelism of elementwise operations better utilizing hardware resources."; + + let description = [{ + Detect elementwise operations with an encoding causing sub-par parallelism, + i.e., with data duplication across threads, and convert the operands to a + more optimal encoding if the cost of doing so is heuristically estimated to + be sufficiently low. As of now, the cost should be 0, we only support + "unbroadcasting" tensors, i.e., dropping duplicated values held in other + threads by re-distributing them. + + As an example, this pass would modify the following code: +```mlir +#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { + tt.func @test_blocked(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> { + %0 = arith.addf %arg0, %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + tt.return %0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + } +} +``` + Obtaining: +```mlir +#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { + tt.func @test_blocked(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> { + %0 = triton_gpu.convert_layout %arg0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf32, #blocked1> + %1 = triton_gpu.convert_layout %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf32, #blocked1> + %2 = arith.addf %0, %1 : tensor<16xf32, #blocked1> + %3 = triton_gpu.convert_layout %2 : tensor<16xf32, #blocked1> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + tt.return %3 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + } +} +``` + + Note how the converted tensors are not sliced and thus each element in the + tensor is held by a single thread. 
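+
+  For reference, this pass is registered for Python in
+  `third_party/intel/triton_xpu.cc` and scheduled in `make_ttgir` of the Intel
+  backend's `compiler.py` (both part of this patch). A minimal sketch of
+  driving it by hand from Python, assuming the backend's usual binding imports
+  (the import path is an assumption, not something this patch adds):
+
+```python
+# Hedged sketch: run only the new pass over an existing TTGIR module `mod`.
+from triton._C.libtriton import ir, intel  # assumed import path
+
+def run_elementwise_parallelism(mod):
+    pm = ir.pass_manager(mod.context)
+    pm.enable_debug()
+    # Binding name added in triton_xpu.cc by this patch.
+    intel.passes.ttgpuir.add_optimize_elementwise_parallelism(pm)
+    pm.run(mod)
+```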
+ }]; + + let dependentDialects = []; +} + + #endif // TRITON_INTEL_GPU_PASSES diff --git a/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp b/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp index 3f45a9d779..78df567d61 100644 --- a/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp +++ b/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp @@ -479,27 +479,6 @@ namespace { // Synchronization Ops Lowerings //===----------------------------------------------------------------------===// -struct TritonGENBarrierLowering - : public ConvertOpToLLVMPattern { - using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; - - LogicalResult - matchAndRewrite(TritonGEN::BarrierOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - MLIRContext *ctx = rewriter.getContext(); - Location loc = op->getLoc(); - Type retType = void_ty(ctx); - IntegerType argType = int_ty(32); - Value arg = i32_val(static_cast(op.getMemFence())); - - LLVM::CallOp callOp = - createDeviceFunctionCall(rewriter, "_Z7barrierj", {retType}, {argType}, - {arg}, {}, convergentNoUnwindWillReturnAttrs); - rewriter.replaceOp(op, callOp); - return success(); - } -}; - struct TritonGENSplitBarrier { protected: template @@ -1092,13 +1071,12 @@ struct TritonGENToLLVMDialectInterface : public ConvertToLLVMPatternInterface { void mlir::triton::populateTritonGENToLLVMConversionPatterns( LLVMTypeConverter &converter, RewritePatternSet &patterns) { - patterns - .add(converter); + patterns.add< + TritonGENSplitBarrierSignalLowering, TritonGENSplitBarrierWaitLowering, + TritonSubGroupReduceLowering, TritonSubGroupScanLowering, + TritonMatrixDPASLowering, TritonMatrix2DBlockLoadLowering, + TritonMatrix2DBlockStoreLowering, TritonMatrix2DBlockPrefetchLowering, + TritonSIMDBlockReadLowering, TritonSIMDBlockWriteLowering>(converter); } void registerConvertTritonTritonGENToLLVMInterface(DialectRegistry ®istry) { diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/CMakeLists.txt b/third_party/intel/lib/TritonIntelGPUToLLVM/CMakeLists.txt index f46c265fa4..4e86cbd2f2 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/CMakeLists.txt +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/CMakeLists.txt @@ -33,6 +33,7 @@ add_triton_library(TritonIntelGPUToLLVM LINK_LIBS PUBLIC GPUToTritonGEN MLIRGPUToLLVMSPV + MLIRSPIRVToLLVM TritonGENIR TritonGENToLLVM TritonIntelGPUIR diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp index 35eb540241..edd1999ea5 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -1,5 +1,6 @@ #include "Dialect/TritonIntelGPU/IR/Dialect.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" +#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/TypeUtilities.h" #include "llvm/ADT/SmallVector.h" @@ -1313,7 +1314,10 @@ struct AtomicCASOpConversion Value zero = (valueElemNBits == 32) ? 
i32_val(0) : i64_val(0); if (!atomicNeedsSharedMemory(op.getResult())) - rewriter.create(loc, TritonGEN::MemFence::GLOBAL); + rewriter.create( + loc, spirv::Scope::Workgroup, spirv::Scope::Workgroup, + spirv::MemorySemantics::SequentiallyConsistent | + spirv::MemorySemantics::CrossWorkgroupMemory); Block &endBlock = LLVM::intel::createPredicatedBlock(rewriter, loc, mask, {zero}, [&] { // casPtr = bitcast(casPtr, ptr_ty(ctx, 1)); @@ -1462,8 +1466,10 @@ struct AtomicRMWOpConversion rmwPtr, rmwVal, rmwMask, {zero}); } else { if (!atomicNeedsSharedMemory(op.getResult())) - rewriter.create(loc, - TritonGEN::MemFence::GLOBAL); + rewriter.create( + loc, spirv::Scope::Workgroup, spirv::Scope::Workgroup, + spirv::MemorySemantics::SequentiallyConsistent | + spirv::MemorySemantics::CrossWorkgroupMemory); endBlock = &LLVM::intel::createPredicatedBlock( rewriter, loc, rmwMask, {zero}, [&] { mlir::LLVM::AtomicBinOp rmwKind; diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h index b52b3a3b97..0593ca63f1 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h @@ -18,6 +18,7 @@ #include "mlir/Conversion/GPUToLLVMSPV/GPUToLLVMSPVPass.h" #include "mlir/Conversion/MathToLLVM/MathToLLVM.h" #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" +#include "mlir/Conversion/SPIRVToLLVM/SPIRVToLLVM.h" #include "mlir/Conversion/UBToLLVM/UBToLLVM.h" #include "mlir/Dialect/SPIRV/IR/TargetAndABI.h" #include "mlir/IR/PatternMatch.h" @@ -268,6 +269,8 @@ class TritonGPUToLLVMPipelineManager { triton::populateGPUToTritonGENConversionPatterns(typeConverter, patterns); cf::populateControlFlowToLLVMConversionPatterns(typeConverter, patterns); populateGpuToLLVMSPVConversionPatterns(typeConverter, patterns); + populateSPIRVToLLVMConversionPatterns(typeConverter, patterns, + spirv::ClientAPI::OpenCL); } private: diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/CMakeLists.txt b/third_party/intel/lib/TritonIntelGPUTransforms/CMakeLists.txt index dbc641e2a3..46d121a070 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/CMakeLists.txt +++ b/third_party/intel/lib/TritonIntelGPUTransforms/CMakeLists.txt @@ -4,6 +4,7 @@ add_triton_library(TritonIntelGPUTransforms DistributeToWarps.cpp MatchTargetSize.cpp MaterializeBlockPointer.cpp + OptimizeElementwiseParallelism.cpp OptimizeReductionLocality.cpp Pipeliner/MatmulLoopPipeline.cpp Pipeliner/SoftwarePipeliner.cpp diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeElementwiseParallelism.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeElementwiseParallelism.cpp new file mode 100644 index 0000000000..1bd154306d --- /dev/null +++ b/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeElementwiseParallelism.cpp @@ -0,0 +1,160 @@ +//===- OptimizeElementwiseParallelism.cpp -------------------------------*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// This file implements the `tritonintelgpu-optimize-elementwise-parallelism` +/// pass. 
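+///
+/// In short (editorial summary of the pattern implemented below): elementwise
+/// ops with a single result, no regions or successors, and a 1D tensor type
+/// whose lane dimension is fully broadcast within a single warp have their
+/// operands converted to a plain blocked layout with sizePerThread = 1; the op
+/// is re-created on that layout and its result converted back to the original
+/// encoding.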
+//===----------------------------------------------------------------------===// + +#include "intel/include/Dialect/TritonIntelGPU/Transforms/Passes.h" + +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +#include "triton/Dialect/Triton/IR/Dialect.h" +#include "triton/Dialect/Triton/IR/Utility.h" +#include "triton/Dialect/TritonGPU/IR/Dialect.h" + +#define DEBUG_TYPE "tritonintelgpu-optimize-elementwise-parallelism" + +namespace mlir::triton::gpu::intel { +#define GEN_PASS_DEF_TRITONINTELGPUOPTIMIZEELEMENTWISEPARALLELISM +#include "intel/include/Dialect/TritonIntelGPU/Transforms/Passes.h.inc" + +namespace { +/// Return whether the input linear layout can be unbroadcasted. +/// +/// A layout is valid for being "unbroadcasted" along its lanes if: +/// - The 'lane' input dimension is zero: this means the lane dimension has been +/// sliced. +/// - The size of the input 'block' dimension is 1. This is true for XPU +/// backend. +/// - The size of the input 'warp' dimension is 1. This is a limitation to keep +/// things simple for now. +/// +/// Broadcasted layouts are layouts with sliced lane, warp or block (not +/// possible for XPU backend) dimensions, i.e., the same data is owned by +/// different threads. +bool isValidLayoutForUnbroadcast(const LinearLayout &linearLayout, + PatternRewriter &rewriter) { + StringAttr kLane = rewriter.getStringAttr("lane"); + StringAttr kWarp = rewriter.getStringAttr("warp"); + StringAttr kBlock = rewriter.getStringAttr("block"); + StringAttr kDim0 = rewriter.getStringAttr("dim0"); + // 'lane' dimension must have been sliced away completely. + if (!linearLayout.sublayoutIsZero(kLane, kDim0)) + return false; + // Only single block for now. + if (linearLayout.getInDimSize(kBlock) != 1) + return false; + // Only single warp for now. + return linearLayout.getInDimSize(kWarp) == 1; +} + +/// Get optimized unbroadcasted tensor type. +/// +/// Get optimized ranked tensor type after unbroadcasting. As we only support 1D +/// tensors, this is as simple as getting an "unboradcasted" blocked-encoded 1D +/// tensor type. +RankedTensorType getOptimizedType(RankedTensorType type, + const LinearLayout &linearLayout, + PatternRewriter &rewriter) { + auto encoding = cast(type.getEncoding()); + unsigned threadsPerWarp = product(encoding.getThreadsPerWarp()); + [[maybe_unused]] unsigned warpsPerCTA = product(encoding.getWarpsPerCTA()); + assert(warpsPerCTA == 1 && "Expecting single warp"); + [[maybe_unused]] unsigned ctaSplitNum = product(encoding.getCTASplitNum()); + assert(ctaSplitNum == 1 && "Expecting single CTA"); + + RankedTensorType::Builder builder(type); + CTALayoutAttr ctaLayout = CTALayoutAttr::getDefault(rewriter.getContext(), 1); + auto newEncoding = rewriter.getAttr( + /*sizePerThread=*/1, threadsPerWarp, /*warpsPerCTA=*/1, /*order=*/0, + ctaLayout); + builder.setEncoding(newEncoding); + return builder; +} + +struct ElementwiseOptPattern final + : OpTraitRewritePattern { + using OpTraitRewritePattern::OpTraitRewritePattern; + + LogicalResult matchAndRewrite(Operation *op, + PatternRewriter &rewriter) const final { + // Rely on this for a simpler pass. + if (!op->hasTrait() || + op->getNumResults() != 1) + return failure(); + + // Skip complex operations. + if (op->hasSuccessors() || op->getNumRegions() != 0) + return failure(); + + // Layout optimizations only apply to tensors. + auto type = dyn_cast(op->getResultTypes().front()); + if (!type) + return failure(); + + // Check if the layout is actually bad and can be optimized using our + // approach. 
We only support 1D tensors for now as these are easier to + // handle. + Attribute layout = type.getEncoding(); + if (!layout || type.getRank() != 1) + return failure(); + std::optional linearLayout = + toLinearLayout(type.getShape(), layout); + if (!linearLayout || !isValidLayoutForUnbroadcast(*linearLayout, rewriter)) + return failure(); + + // Check the operands are not used by other operations. This will prevent + // register pressure increase: + if (!llvm::all_of(op->getOperands(), + [](Value val) { return val.hasOneUse(); })) + return failure(); + + // As we are dealing with 1D tensors, we can do a simple transform to obtain + // a more optimized operation. + Location loc = op->getLoc(); + RankedTensorType newType = getOptimizedType(type, *linearLayout, rewriter); + SmallVector newOperands(op->getNumOperands()); + llvm::transform(op->getOperands(), std::begin(newOperands), + [&rewriter, loc, newType](Value operand) { + return rewriter.create(loc, newType, + operand); + }); + + // Now we create the optimized operation: + StringAttr opName = op->getName().getIdentifier(); + ArrayRef attributes = op->getAttrs(); + Operation *newElementwiseOp = + rewriter.create(loc, opName, newOperands, newType, attributes); + assert(newElementwiseOp->getNumResults() == 1 && + "Expecting single result operation"); + + // Convert to unoptimized encoding for further use. + Value newValue = newElementwiseOp->getResult(0); + rewriter.replaceOpWithNewOp(op, type, newValue); + + return success(); + } +}; + +struct TritonIntelGPUOptimizeElementwiseParallelism final + : impl::TritonIntelGPUOptimizeElementwiseParallelismBase< + TritonIntelGPUOptimizeElementwiseParallelism> { + using Base::Base; + + void runOnOperation() final { + Operation *op = getOperation(); + MLIRContext *ctx = op->getContext(); + RewritePatternSet patterns(ctx); + patterns.add(ctx); + if (failed( + applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)))) + signalPassFailure(); + } +}; +} // namespace +} // namespace mlir::triton::gpu::intel diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc index 55db149919..3a3037f6c0 100644 --- a/third_party/intel/triton_xpu.cc +++ b/third_party/intel/triton_xpu.cc @@ -100,6 +100,9 @@ void init_triton_intel_passes_ttgpuir(py::module &&m) { gpu::intel::createTritonIntelGPUMaterializeBlockPointer); ADD_PASS_WRAPPER_0("add_optimize_reduction_locality", gpu::intel::createTritonIntelGPUOptimizeReductionLocality); + ADD_PASS_WRAPPER_0( + "add_optimize_elementwise_parallelism", + gpu::intel::createTritonIntelGPUOptimizeElementwiseParallelism); } void init_triton_intel(py::module &&m) { diff --git a/utils/SPIRVRunner/args_data.json b/utils/SPIRVRunner/args_data.json index 1578504cdc..db7bb68075 100644 --- a/utils/SPIRVRunner/args_data.json +++ b/utils/SPIRVRunner/args_data.json @@ -32,5 +32,6 @@ "threads_per_warp": 32, "shared_memory": 0, "kernel_name": "add_kernel", - "spv_name": "add_kernel.spv" + "spv_name": "add_kernel.spv", + "build_flags": "" }
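
A closing note on the barrier lowering changes in this patch: the updated FileCheck patterns expect `_Z22__spirv_ControlBarrieriii(2, 2, 528)` in place of `_Z7barrierj(...)`. The operands follow the SPIR-V enums used by the new `spirv.ControlBarrier` ops in `LoadStoreOpToLLVM.cpp`: `2` is `Scope::Workgroup` (execution and memory scope), and `528` is the bitwise OR of the two memory-semantics flags. A quick sanity check with the standard SPIR-V enum values:

```python
# SPIR-V MemorySemantics bits (values from the SPIR-V specification).
SEQUENTIALLY_CONSISTENT = 0x010   # 16
CROSS_WORKGROUP_MEMORY  = 0x200   # 512

# Matches the `528 : i32` constant in the updated CHECK lines.
assert SEQUENTIALLY_CONSISTENT | CROSS_WORKGROUP_MEMORY == 528

WORKGROUP_SCOPE = 2  # spirv::Scope::Workgroup, the first two i32 operands
```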