From 03968e6043e63dc5bea62baf513b499d2fce1a18 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Tue, 5 Nov 2024 21:41:57 -0800 Subject: [PATCH 01/18] [AMD] Enable all existing scaled_dot data type tests on MI300 (#5074) https://github.com/triton-lang/triton/pull/5062 enabled upcasting fp8E4M3FN to bf16; so now we can support that variant too. --- python/test/unit/language/test_core.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index 4f8c65044..0888bf571 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -31,6 +31,7 @@ is_hip, is_hip_cdna, is_hip_mi200, + is_hip_mi300, get_arch, torch_float8_dtypes, torch_dtypes, @@ -3371,8 +3372,8 @@ def test_scaled_dot(M, N, K, col_a, col_b, type_a, type_b, num_warps, mma, kpack if is_hip(): if not is_hip_cdna(): pytest.skip("scaled_dot only implemented for HIP CDNA") - if (type_a not in ["e2m1", "e5m2"]) or (type_b not in ["e2m1", "e5m2", "bf16"]): - pytest.skip(f"scaled_dot({type_a}, {type_b}) not yet implemented for HIP") + if "e4m3" in (type_a, type_b) and not is_hip_mi300(): + pytest.skip(f"scaled_dot({type_a}, {type_b}) only implemented for MI300") if mma == 16 and K == 64: pytest.skip(f"K == {K} too small for mfma {mma} in scaled_dot") From 781774c567bd9e73147d6c499734b20f3225405c Mon Sep 17 00:00:00 2001 From: David Berard Date: Tue, 5 Nov 2024 21:56:39 -0800 Subject: [PATCH 02/18] [BACKEND] Make ExternElementwise op implement ConditionallySpeculatable (#5079) ExternElementwise ops have a `pure` attribute that marks the op as pure. If an op is pure, it should also be speculatable. In the reduction/scan ttgir->llvm passes, checks for speculatability are failing for ExternElementwise ops, causing additional conditional handling to be added. This PR makes ExternElementwise ops implement ConditionallySpeculatable, and mark the op as speculatable if the op is marked as pure. This removes the conditional branches from the generated scan/reduction code. --- include/triton/Dialect/Triton/IR/TritonOps.td | 9 ++++++++- lib/Dialect/Triton/IR/Ops.cpp | 6 ++++++ test/Conversion/tritongpu_to_llvm.mlir | 18 ++++++++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/include/triton/Dialect/Triton/IR/TritonOps.td b/include/triton/Dialect/Triton/IR/TritonOps.td index 6b2faf336..87dc10a71 100644 --- a/include/triton/Dialect/Triton/IR/TritonOps.td +++ b/include/triton/Dialect/Triton/IR/TritonOps.td @@ -778,7 +778,8 @@ def TT_ScanReturnOp: TT_Op<"scan.return", def TT_ExternElementwiseOp : TT_Op<"extern_elementwise", [Elementwise, SameOperandsAndResultEncoding, SameVariadicOperandSize, - DeclareOpInterfaceMethods]> { + DeclareOpInterfaceMethods, + ConditionallySpeculatable]> { let description = [{ call an external function $symbol implemented in $libpath/$libname with $args @@ -790,6 +791,12 @@ def TT_ExternElementwiseOp : TT_Op<"extern_elementwise", [Elementwise, let results = (outs TT_Type:$result); let assemblyFormat = "operands attr-dict `:` functional-type(operands, $result)"; + + let extraClassDeclaration = [{ + // Interface method for ConditionallySpeculatable. 
+ Speculation::Speculatability getSpeculatability(); + }]; + } // diff --git a/lib/Dialect/Triton/IR/Ops.cpp b/lib/Dialect/Triton/IR/Ops.cpp index a16f9a3ca..12f5be29c 100644 --- a/lib/Dialect/Triton/IR/Ops.cpp +++ b/lib/Dialect/Triton/IR/Ops.cpp @@ -1039,6 +1039,12 @@ void ExternElementwiseOp::getEffects( SideEffects::DefaultResource::get()); } +Speculation::Speculatability ExternElementwiseOp::getSpeculatability() { + if (getPure()) + return Speculation::Speculatable; + return Speculation::NotSpeculatable; +} + // -- ExperimentalTensormapCreateOp -- LogicalResult ExperimentalTensormapCreateOp::verify() { auto rank = getBoxDim().size(); diff --git a/test/Conversion/tritongpu_to_llvm.mlir b/test/Conversion/tritongpu_to_llvm.mlir index 4c5d658af..679a18cd9 100644 --- a/test/Conversion/tritongpu_to_llvm.mlir +++ b/test/Conversion/tritongpu_to_llvm.mlir @@ -1855,3 +1855,21 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : #loc3 = loc("inner_call":29:28) #loc4 = loc(callsite(#loc3 at #loc1)) #loc5 = loc(callsite(#loc4 at #loc2)) + +// ----- + +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 4], order = [0, 1]}> +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.target = "cuda:90", "triton_gpu.threads-per-warp" = 32 : i32} { + tt.func public @log1pf_scan(%39: tensor<32x16xf32, #blocked>) attributes {noinline = false} { + // CHECK: log1pf_scan + // non-speculatable ops will introduce a cond_br; extern_elementwise with pure = true should be considered speculatable. + // CHECK-NOT: llvm.cond_br + %40 = "tt.scan"(%39) <{axis = 1 : i32, reverse = false}> ({ + ^bb0(%arg5: f32, %arg6: f32): + %43 = tt.extern_elementwise %arg5 {libname = "", libpath = "", pure = true, symbol = "__nv_log1pf"} : (f32) -> f32 + %44 = arith.addf %43, %43 : f32 + tt.scan.return %44 : f32 + }) : (tensor<32x16xf32, #blocked>) -> tensor<32x16xf32, #blocked> + tt.return + } +} From 35f1827581071a5ac3a385f8776ab1a3a784811a Mon Sep 17 00:00:00 2001 From: Thomas Raoux Date: Tue, 5 Nov 2024 22:30:59 -0800 Subject: [PATCH 03/18] [BACKEND] Fix reduce with slice layout inputs (#5080) Couple of places where not handling slice layout inputs for reductions. Add support for recursive slice layout in those cases. --- lib/Analysis/Utility.cpp | 31 ++++++---- .../TritonGPUToLLVM/ReduceOpToLLVM.cpp | 56 +++++++++++++++++-- python/test/unit/language/test_core.py | 30 ++++++++++ 3 files changed, 100 insertions(+), 17 deletions(-) diff --git a/lib/Analysis/Utility.cpp b/lib/Analysis/Utility.cpp index 8c62e738f..501e19722 100644 --- a/lib/Analysis/Utility.cpp +++ b/lib/Analysis/Utility.cpp @@ -69,18 +69,25 @@ unsigned ReduceOpHelper::getThreadOffsetOnReductionAxis() { } unsigned threadOffset = 1; - if (auto sliceLayout = mlir::dyn_cast(srcLayout)) { - auto parentLayout = sliceLayout.getParent(); - auto threadsPerWarp = getThreadsPerWarp(parentLayout); - threadOffset = threadsPerWarp[sliceLayout.getDim()]; - } else { - auto threadsPerWarp = getThreadsPerWarp(srcLayout); - auto order = getThreadOrder(srcLayout); - for (unsigned i = 0; i < order.size(); i++) { - if (order[i] == axis) - break; - threadOffset *= threadsPerWarp[order[i]]; - } + SmallVector dimsRemoved; + while (auto sliceLayout = mlir::dyn_cast(srcLayout)) { + dimsRemoved.push_back(sliceLayout.getDim()); + srcLayout = sliceLayout.getParent(); + } + // In case of slice layout we want to know the axis dimension relative to the + // most inner parent layout. 
`adjustedAxis` is the matching axis dim in the + // parent layout. + int adjustedAxis = axis; + for (auto dim : dimsRemoved) { + if (dim <= adjustedAxis) + adjustedAxis++; + } + auto threadsPerWarp = getThreadsPerWarp(srcLayout); + auto order = getThreadOrder(srcLayout); + for (unsigned i = 0; i < order.size(); i++) { + if (order[i] == adjustedAxis) + break; + threadOffset *= threadsPerWarp[order[i]]; } return threadOffset; } diff --git a/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp index 4e8053923..829d4e710 100644 --- a/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp +++ b/lib/Conversion/TritonGPUToLLVM/ReduceOpToLLVM.cpp @@ -218,6 +218,46 @@ struct ReduceOpConversion rewriter.replaceOp(op, results); } + // For slice layout some ids are duplicated on multiple lanes, so we need to + // handle the delinearization of laneId in a special way. We need to + // generalize this part of the logic to work on any kind of linear layout + // uniformely. + SmallVector + getMultiDimLaneId(ReduceOpHelper &helper, Value &laneId, Location &loc, + ConversionPatternRewriter &rewriter) const { + auto srcLayout = helper.getSrcLayout(); + auto srcShape = helper.getSrcShape(); + auto order = triton::gpu::getThreadOrder(srcLayout); + SmallVector multiDimLaneId; + + if (auto sliceLayout = mlir::dyn_cast(srcLayout)) { + auto parentLayout = sliceLayout.getParent(); + SmallVector dims = {sliceLayout.getDim()}; + while (auto parentSliceLayout = + mlir::dyn_cast(parentLayout)) { + dims.push_back(parentSliceLayout.getDim()); + parentLayout = parentSliceLayout.getParent(); + } + + auto parentThreadsPerWarps = triton::gpu::getThreadsPerWarp(parentLayout); + auto parentOrder = triton::gpu::getThreadOrder(parentLayout); + multiDimLaneId = delinearize(rewriter, loc, laneId, parentThreadsPerWarps, + parentOrder); + for (unsigned dim : llvm::reverse(dims)) { + multiDimLaneId.erase(multiDimLaneId.begin() + dim); + } + } else { + SmallVector threadsPerWarps = + triton::gpu::getThreadsPerWarp(srcLayout); + threadsPerWarps[helper.getAxis()] = + triton::gpu::getThreadsPerWarpWithUniqueData( + srcLayout, srcShape)[helper.getAxis()]; + multiDimLaneId = + delinearize(rewriter, loc, laneId, threadsPerWarps, order); + } + return multiDimLaneId; + } + SmallVector getMultiDimWarpId(ReduceOpHelper &helper, Value &warpId, Location &loc, ConversionPatternRewriter &rewriter) const { @@ -231,11 +271,20 @@ struct ReduceOpConversion // a way to properly delinearize warpId in the slice case if (auto sliceLayout = mlir::dyn_cast(srcLayout)) { auto parentLayout = sliceLayout.getParent(); + SmallVector dims = {sliceLayout.getDim()}; + while (auto parentSliceLayout = + mlir::dyn_cast(parentLayout)) { + dims.push_back(parentSliceLayout.getDim()); + parentLayout = parentSliceLayout.getParent(); + } + auto parentWarpsPerCTA = triton::gpu::getWarpsPerCTA(parentLayout); auto parentOrder = triton::gpu::getWarpOrder(parentLayout); multiDimWarpId = delinearize(rewriter, loc, warpId, parentWarpsPerCTA, parentOrder); - multiDimWarpId.erase(multiDimWarpId.begin() + sliceLayout.getDim()); + for (unsigned dim : llvm::reverse(dims)) { + multiDimWarpId.erase(multiDimWarpId.begin() + dim); + } } else { SmallVector warpsPerCTA = triton::gpu::getWarpsPerCTA(srcLayout); @@ -263,11 +312,8 @@ struct ReduceOpConversion unsigned axis = op.getAxis(); auto smemShape = helper.getScratchRepShape(); - auto threadsPerWarp = - triton::gpu::getThreadsPerWarpWithUniqueData(srcLayout, srcShape); - auto order = 
getThreadOrder(srcLayout); SmallVector multiDimLaneId = - delinearize(rewriter, loc, laneId, threadsPerWarp, order); + getMultiDimLaneId(helper, laneId, loc, rewriter); Value laneIdAxis = multiDimLaneId[axis]; Value zero = i32_val(0); Value laneZero = icmp_eq(laneIdAxis, zero); diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index 0888bf571..3fe93ddbd 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -6013,3 +6013,33 @@ def sanitize_cumsum_kernel(Z, X, BLOCK: tl.constexpr): Z = torch.zeros_like(X) sanitize_cumsum_kernel[(1, )](Z, X, BLOCK=BLOCK) torch.testing.assert_close(Z, X.cumsum(0).to(torch.int32)) + + +# stress test slice layout usages in reductions. +@pytest.mark.parametrize("in_shape, perm, red_dims", [ + ((4, 32, 32, 4, 2), [2, 1, 0, 3, 4], [3, 1, 0]), + ((8, 2, 32, 4, 16), [4, 0, 1, 3, 2], [0, 2, 0]), +]) +def test_chained_reductions(in_shape, perm, red_dims, device): + + @triton.jit + def kernel(In, Out, # + dim_0: tl.constexpr, dim_1: tl.constexpr, dim_2: tl.constexpr, dim_3: tl.constexpr, dim_4: tl.constexpr, + perm_0: tl.constexpr, perm_1: tl.constexpr, perm_2: tl.constexpr, perm_3: tl.constexpr, + perm_4: tl.constexpr, red_dim_0: tl.constexpr, red_dim_1: tl.constexpr, red_dim_2: tl.constexpr): + idx = tl.arange(0, dim_0 * dim_1 * dim_2 * dim_3 * dim_4) + idx = idx.reshape(dim_0, dim_1, dim_2, dim_3, dim_4) + vals = tl.load(In + idx) + vals = tl.permute(vals, [perm_0, perm_1, perm_2, perm_3, perm_4]) + r = tl.sum(tl.sum(tl.sum(vals, red_dim_0), red_dim_1), red_dim_2) + st_idx = tl.arange(0, r.shape[0] * r.shape[1]).reshape(r.shape) + tl.store(Out + st_idx, r) + + input = torch.randint(0, 1000, in_shape, device=device, dtype=torch.int32) + temp = torch.permute(input, perm).contiguous() + ref = torch.sum(torch.sum(torch.sum(temp, dim=red_dims[0]), dim=red_dims[1]), dim=red_dims[2]) + result = torch.empty_like(ref) + kernel[(1, )](input, result, input.shape[0], input.shape[1], input.shape[2], input.shape[3], input.shape[4], + perm[0], perm[1], perm[2], perm[3], perm[4], red_dims[0], red_dims[1], red_dims[2]) + + assert torch.all(ref == result) From 04f87d021a3550aa536862aecee21bf0a30a2452 Mon Sep 17 00:00:00 2001 From: David Berard Date: Wed, 6 Nov 2024 03:10:16 -0800 Subject: [PATCH 04/18] [FRONTEND] Fix handling of `from m import x as y` in CodeGenerator (#5081) Context: in `CodeGenerator.__init__`, globals for a given triton function are modified to handle remapping the libdevice module to cuda or hip (from https://github.com/triton-lang/triton/pull/4539). In particular, this logic: ```python for k, v in gscope.items(): # gscope is a dict of fn.__globals__ ... self.gscope[k] = getattr(module_map[module_name], k) ``` was failing if you do this in the global scope: `from triton.language.extras.libdevice import fast_dividef as my_fast_dividef`. 
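As a minimal, standalone sketch (using `math.sqrt` as a stand-in; the `gscope`/`module_map` dictionaries below are illustrative placeholders, not Triton's real objects), this is why looking the function up by its local alias fails while `v.__name__` works:

```python
import math

from math import sqrt as my_sqrt     # the "from m import x as y" pattern

gscope = {"my_sqrt": my_sqrt}         # key is the local alias, value is the function object
module_map = {"math": math}           # pretend "math" is a module that must be remapped

for k, v in gscope.items():
    module_name = getattr(v, "__module__", "")
    if module_name in module_map:
        # getattr(module_map[module_name], k) would raise AttributeError here,
        # because the module has no attribute named after the alias "my_sqrt".
        remapped = getattr(module_map[module_name], v.__name__)  # looks up "sqrt"
        assert remapped is math.sqrt
```
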
--- python/test/unit/language/test_libdevice.py | 23 +++++++++++++++++++++ python/triton/compiler/code_generator.py | 2 +- 2 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 python/test/unit/language/test_libdevice.py diff --git a/python/test/unit/language/test_libdevice.py b/python/test/unit/language/test_libdevice.py new file mode 100644 index 000000000..da0d7d49c --- /dev/null +++ b/python/test/unit/language/test_libdevice.py @@ -0,0 +1,23 @@ +import torch + +import triton +import triton.language as tl + +from triton.language.extra.libdevice import fast_dividef as my_fast_dividef + + +def test_libdevice_rename(device): + # mark the import as used by this test + _ = my_fast_dividef + + @triton.jit + def triton_copy(in_ptr, out_ptr, BLOCK_SIZE: tl.constexpr): + offsets = tl.arange(0, BLOCK_SIZE) + data = tl.load(in_ptr + offsets) + tl.store(out_ptr + offsets, data) + + BLOCK_SIZE = 256 + inp = torch.randn(BLOCK_SIZE, device=device) + out = torch.empty_like(inp) + + triton_copy[(1, )](inp, out, BLOCK_SIZE) diff --git a/python/triton/compiler/code_generator.py b/python/triton/compiler/code_generator.py index ec0ef227f..4ec46f32c 100644 --- a/python/triton/compiler/code_generator.py +++ b/python/triton/compiler/code_generator.py @@ -218,7 +218,7 @@ def __init__(self, context, prototype, gscope, attributes, constants, function_n module_name = getattr(v, "__module__", "") if module_name in module_map: - self.gscope[k] = getattr(module_map[module_name], k) + self.gscope[k] = getattr(module_map[module_name], v.__name__) else: self.gscope[k] = v From 0d9c0d3e09857d4b59038819b05db882090a6399 Mon Sep 17 00:00:00 2001 From: Victor Perez Date: Tue, 12 Nov 2024 16:20:55 +0100 Subject: [PATCH 05/18] [XPU][OptEW] Define `-intel-triton-optimize-elementwise-parallelism` pass (#2631) Define pass improving elementwise parallelism by avoiding layout conversions leading to data duplication between threads. See pass documentation for more information. 
--------- Signed-off-by: victor-eds --- test/TritonIntelGPU/optimize-elementwise.mlir | 65 +++++++ .../TritonIntelGPU/Transforms/Passes.td | 48 ++++++ .../TritonIntelGPUTransforms/CMakeLists.txt | 1 + .../OptimizeElementwiseParallelism.cpp | 160 ++++++++++++++++++ 4 files changed, 274 insertions(+) create mode 100644 test/TritonIntelGPU/optimize-elementwise.mlir create mode 100644 third_party/intel/lib/TritonIntelGPUTransforms/OptimizeElementwiseParallelism.cpp diff --git a/test/TritonIntelGPU/optimize-elementwise.mlir b/test/TritonIntelGPU/optimize-elementwise.mlir new file mode 100644 index 000000000..d8b64bab8 --- /dev/null +++ b/test/TritonIntelGPU/optimize-elementwise.mlir @@ -0,0 +1,65 @@ +// RUN: triton-opt %s --split-input-file -tritonintelgpu-optimize-elementwise-parallelism | FileCheck %s + +// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}> +// CHECK: #[[$ATTR_1:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}> + +#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [1, 1], repCluster = [2, 2], A = [16, 16], B = [16, 32], C = [16, 32]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +// CHECK-LABEL: tt.func @test_dpas( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>>) + tt.func @test_dpas(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> { +// CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]> +// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> -> tensor<16xf32, #[[$ATTR_0]]> +// CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<16xf32, #[[$ATTR_0]]> + %0 = arith.addf %arg0, %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> +// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<16xf32, #[[$ATTR_0]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> +// CHECK: tt.return %[[VAL_5]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_1]]}>> + tt.return %0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> + } +} + +// ----- + +// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}> +// CHECK: #[[$ATTR_1:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}> + +#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +// CHECK-LABEL: tt.func @test_blocked( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>, +// CHECK-SAME: %[[VAL_1:.*]]: 
tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>) + tt.func @test_blocked(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> { +// CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<16xf32, #[[$ATTR_1]]> +// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<16xf32, #[[$ATTR_1]]> +// CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<16xf32, #[[$ATTR_1]]> + %0 = arith.addf %arg0, %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> +// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<16xf32, #[[$ATTR_1]]> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> +// CHECK: tt.return %[[VAL_5]] : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> + tt.return %0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + } +} + +// ----- + +// CHECK: #[[$ATTR_0:.+]] = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}> +// CHECK: #[[$ATTR_1:.+]] = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}> + +#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { +// CHECK-LABEL: tt.func @test_blocked_repeat( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>>) + tt.func @test_blocked_repeat(%arg0: tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> { +// CHECK: %[[VAL_2:.*]] = triton_gpu.convert_layout %[[VAL_0]] : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<64xf32, #[[$ATTR_1]]> +// CHECK: %[[VAL_3:.*]] = triton_gpu.convert_layout %[[VAL_1]] : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> -> tensor<64xf32, #[[$ATTR_1]]> +// CHECK: %[[VAL_4:.*]] = arith.addf %[[VAL_2]], %[[VAL_3]] : tensor<64xf32, #[[$ATTR_1]]> + %0 = arith.addf %arg0, %arg1 : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> +// CHECK: %[[VAL_5:.*]] = triton_gpu.convert_layout %[[VAL_4]] : tensor<64xf32, #[[$ATTR_1]]> -> tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> +// CHECK: tt.return %[[VAL_5]] : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #[[$ATTR_0]]}>> + tt.return %0 : tensor<64xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + } +} diff --git a/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td b/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td index c551a9685..1d81bc474 100644 --- a/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td +++ b/third_party/intel/include/Dialect/TritonIntelGPU/Transforms/Passes.td @@ -365,4 +365,52 @@ tt.func @test(%arg0: tensor<32x32xf32, #mma>) -> tensor<32xf32, #triton_gpu.slic 
"mlir::triton::gpu::TritonGPUDialect"]; } +def TritonIntelGPUOptimizeElementwiseParallelism + : Pass<"tritonintelgpu-optimize-elementwise-parallelism", "mlir::ModuleOp"> { + let summary = + "Improve parallelism of elementwise operations better utilizing hardware resources."; + + let description = [{ + Detect elementwise operations with an encoding causing sub-par parallelism, + i.e., with data duplication across threads, and convert the operands to a + more optimal encoding if the cost of doing so is heuristically estimated to + be sufficiently low. As of now, the cost should be 0, we only support + "unbroadcasting" tensors, i.e., dropping duplicated values held in other + threads by re-distributing them. + + As an example, this pass would modify the following code: +```mlir +#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { + tt.func @test_blocked(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> { + %0 = arith.addf %arg0, %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + tt.return %0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + } +} +``` + Obtaining: +```mlir +#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}> +#blocked1 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [16], warpsPerCTA = [1], order = [0]}> + +module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} { + tt.func @test_blocked(%arg0: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>, %arg1: tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> { + %0 = triton_gpu.convert_layout %arg0 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf32, #blocked1> + %1 = triton_gpu.convert_layout %arg1 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf32, #blocked1> + %2 = arith.addf %0, %1 : tensor<16xf32, #blocked1> + %3 = triton_gpu.convert_layout %2 : tensor<16xf32, #blocked1> -> tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + tt.return %3 : tensor<16xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> + } +} +``` + + Note how the converted tensors are not sliced and thus each element in the + tensor is held by a single thread. 
+ }]; + + let dependentDialects = []; +} + + #endif // TRITON_INTEL_GPU_PASSES diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/CMakeLists.txt b/third_party/intel/lib/TritonIntelGPUTransforms/CMakeLists.txt index dbc641e2a..46d121a07 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/CMakeLists.txt +++ b/third_party/intel/lib/TritonIntelGPUTransforms/CMakeLists.txt @@ -4,6 +4,7 @@ add_triton_library(TritonIntelGPUTransforms DistributeToWarps.cpp MatchTargetSize.cpp MaterializeBlockPointer.cpp + OptimizeElementwiseParallelism.cpp OptimizeReductionLocality.cpp Pipeliner/MatmulLoopPipeline.cpp Pipeliner/SoftwarePipeliner.cpp diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeElementwiseParallelism.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeElementwiseParallelism.cpp new file mode 100644 index 000000000..1bd154306 --- /dev/null +++ b/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeElementwiseParallelism.cpp @@ -0,0 +1,160 @@ +//===- OptimizeElementwiseParallelism.cpp -------------------------------*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// This file implements the `tritonintelgpu-optimize-elementwise-parallelism` +/// pass. +//===----------------------------------------------------------------------===// + +#include "intel/include/Dialect/TritonIntelGPU/Transforms/Passes.h" + +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +#include "triton/Dialect/Triton/IR/Dialect.h" +#include "triton/Dialect/Triton/IR/Utility.h" +#include "triton/Dialect/TritonGPU/IR/Dialect.h" + +#define DEBUG_TYPE "tritonintelgpu-optimize-elementwise-parallelism" + +namespace mlir::triton::gpu::intel { +#define GEN_PASS_DEF_TRITONINTELGPUOPTIMIZEELEMENTWISEPARALLELISM +#include "intel/include/Dialect/TritonIntelGPU/Transforms/Passes.h.inc" + +namespace { +/// Return whether the input linear layout can be unbroadcasted. +/// +/// A layout is valid for being "unbroadcasted" along its lanes if: +/// - The 'lane' input dimension is zero: this means the lane dimension has been +/// sliced. +/// - The size of the input 'block' dimension is 1. This is true for XPU +/// backend. +/// - The size of the input 'warp' dimension is 1. This is a limitation to keep +/// things simple for now. +/// +/// Broadcasted layouts are layouts with sliced lane, warp or block (not +/// possible for XPU backend) dimensions, i.e., the same data is owned by +/// different threads. +bool isValidLayoutForUnbroadcast(const LinearLayout &linearLayout, + PatternRewriter &rewriter) { + StringAttr kLane = rewriter.getStringAttr("lane"); + StringAttr kWarp = rewriter.getStringAttr("warp"); + StringAttr kBlock = rewriter.getStringAttr("block"); + StringAttr kDim0 = rewriter.getStringAttr("dim0"); + // 'lane' dimension must have been sliced away completely. + if (!linearLayout.sublayoutIsZero(kLane, kDim0)) + return false; + // Only single block for now. + if (linearLayout.getInDimSize(kBlock) != 1) + return false; + // Only single warp for now. + return linearLayout.getInDimSize(kWarp) == 1; +} + +/// Get optimized unbroadcasted tensor type. +/// +/// Get optimized ranked tensor type after unbroadcasting. As we only support 1D +/// tensors, this is as simple as getting an "unboradcasted" blocked-encoded 1D +/// tensor type. 
+RankedTensorType getOptimizedType(RankedTensorType type, + const LinearLayout &linearLayout, + PatternRewriter &rewriter) { + auto encoding = cast(type.getEncoding()); + unsigned threadsPerWarp = product(encoding.getThreadsPerWarp()); + [[maybe_unused]] unsigned warpsPerCTA = product(encoding.getWarpsPerCTA()); + assert(warpsPerCTA == 1 && "Expecting single warp"); + [[maybe_unused]] unsigned ctaSplitNum = product(encoding.getCTASplitNum()); + assert(ctaSplitNum == 1 && "Expecting single CTA"); + + RankedTensorType::Builder builder(type); + CTALayoutAttr ctaLayout = CTALayoutAttr::getDefault(rewriter.getContext(), 1); + auto newEncoding = rewriter.getAttr( + /*sizePerThread=*/1, threadsPerWarp, /*warpsPerCTA=*/1, /*order=*/0, + ctaLayout); + builder.setEncoding(newEncoding); + return builder; +} + +struct ElementwiseOptPattern final + : OpTraitRewritePattern { + using OpTraitRewritePattern::OpTraitRewritePattern; + + LogicalResult matchAndRewrite(Operation *op, + PatternRewriter &rewriter) const final { + // Rely on this for a simpler pass. + if (!op->hasTrait() || + op->getNumResults() != 1) + return failure(); + + // Skip complex operations. + if (op->hasSuccessors() || op->getNumRegions() != 0) + return failure(); + + // Layout optimizations only apply to tensors. + auto type = dyn_cast(op->getResultTypes().front()); + if (!type) + return failure(); + + // Check if the layout is actually bad and can be optimized using our + // approach. We only support 1D tensors for now as these are easier to + // handle. + Attribute layout = type.getEncoding(); + if (!layout || type.getRank() != 1) + return failure(); + std::optional linearLayout = + toLinearLayout(type.getShape(), layout); + if (!linearLayout || !isValidLayoutForUnbroadcast(*linearLayout, rewriter)) + return failure(); + + // Check the operands are not used by other operations. This will prevent + // register pressure increase: + if (!llvm::all_of(op->getOperands(), + [](Value val) { return val.hasOneUse(); })) + return failure(); + + // As we are dealing with 1D tensors, we can do a simple transform to obtain + // a more optimized operation. + Location loc = op->getLoc(); + RankedTensorType newType = getOptimizedType(type, *linearLayout, rewriter); + SmallVector newOperands(op->getNumOperands()); + llvm::transform(op->getOperands(), std::begin(newOperands), + [&rewriter, loc, newType](Value operand) { + return rewriter.create(loc, newType, + operand); + }); + + // Now we create the optimized operation: + StringAttr opName = op->getName().getIdentifier(); + ArrayRef attributes = op->getAttrs(); + Operation *newElementwiseOp = + rewriter.create(loc, opName, newOperands, newType, attributes); + assert(newElementwiseOp->getNumResults() == 1 && + "Expecting single result operation"); + + // Convert to unoptimized encoding for further use. 
+ Value newValue = newElementwiseOp->getResult(0); + rewriter.replaceOpWithNewOp(op, type, newValue); + + return success(); + } +}; + +struct TritonIntelGPUOptimizeElementwiseParallelism final + : impl::TritonIntelGPUOptimizeElementwiseParallelismBase< + TritonIntelGPUOptimizeElementwiseParallelism> { + using Base::Base; + + void runOnOperation() final { + Operation *op = getOperation(); + MLIRContext *ctx = op->getContext(); + RewritePatternSet patterns(ctx); + patterns.add(ctx); + if (failed( + applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)))) + signalPassFailure(); + } +}; +} // namespace +} // namespace mlir::triton::gpu::intel From 9e41b65ecc110c1428f522c2556f5486131931bd Mon Sep 17 00:00:00 2001 From: Kirill Suvorov Date: Tue, 12 Nov 2024 16:29:54 +0100 Subject: [PATCH 06/18] Fix UT test_core.py::test_poison_return failure (#2685) Closes #2482 Cuda converts tt.store to inline_asm command, and xpu converts it to llvm.store. Then the LLVM optimizer for the SROA function removes llvm.store with the poison value, but ignores the cuda assembly instruction. Signed-off-by: Kirill Suvorov --- python/test/unit/language/test_core.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py index f7ec0bbb2..a65e5ff4b 100644 --- a/python/test/unit/language/test_core.py +++ b/python/test/unit/language/test_core.py @@ -5186,8 +5186,6 @@ def return_poison(x): def test_poison_return(device): - if is_xpu(): - pytest.skip("FIXME: poison is optimized away by opt") @triton.jit def kernel(Out): @@ -5196,7 +5194,9 @@ def kernel(Out): a = torch.empty((), device=device, dtype=torch.int32) h = kernel[(1, )](a) assert "ub.poison" in h.asm["ttir"], h.asm["ttir"] - assert "poison" in h.asm["llir"], h.asm["llir"] + # xpu uses llvm.store, which in this case is removed by the optimizer + if not is_xpu(): + assert "poison" in h.asm["llir"], h.asm["llir"] # ----------------------- From b74f57eca51e70f3cd4bf94015fb731bf3d03012 Mon Sep 17 00:00:00 2001 From: Pavel Chekin Date: Tue, 12 Nov 2024 08:17:04 -0800 Subject: [PATCH 07/18] Remove workaround for yapf (#2664) Fixes #2688. 
--- .github/workflows/build-test.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index d3d9b29b5..84d09afa5 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -76,12 +76,6 @@ jobs: run: | set -x pip install --upgrade pre-commit - - # TODO: ignore the first yapf failure until https://github.com/google/yapf/issues/1164 is fixed - python3 -m pre_commit run --all-files --verbose yapf &> /dev/null || true - # If first run of yapf worked and made changes reset the tree to the original state - git reset --hard - python3 -m pre_commit run --show-diff-on-failure --color=always --all-files --verbose - name: Save pip cache From 0187e3617d77b0d4b5cfef5d3ba8fabf97ac2877 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 12 Nov 2024 17:19:36 +0100 Subject: [PATCH 08/18] Update PyTorch pin (#2689) Extra CI: https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/11799749929 (passed) Signed-off-by: Anatoly Myachev --- .github/pins/pytorch-upstream.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/pins/pytorch-upstream.txt b/.github/pins/pytorch-upstream.txt index c2ce8b1a5..0d9e3cab7 100644 --- a/.github/pins/pytorch-upstream.txt +++ b/.github/pins/pytorch-upstream.txt @@ -1 +1 @@ -33dce10ece5b38aa0ab76739b658cd980a6e3d8f +51e8a13d007b3032af45facb50dfa4ee6012f22a From e8b34a0afffc2cf4875faeab1d1d680e676fc43a Mon Sep 17 00:00:00 2001 From: Vadim Musin Date: Tue, 12 Nov 2024 22:45:01 +0300 Subject: [PATCH 09/18] Benchmarks subset (#2614) Related to #2522 --------- Co-authored-by: Pavel Chekin --- .github/workflows/triton-benchmarks.yml | 36 ++++++++++++++----------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/.github/workflows/triton-benchmarks.yml b/.github/workflows/triton-benchmarks.yml index 94e419646..2a5f9937f 100644 --- a/.github/workflows/triton-benchmarks.yml +++ b/.github/workflows/triton-benchmarks.yml @@ -24,6 +24,10 @@ on: description: Run name type: string default: "Triton benchmarks" + skip_benchmarks: + description: JSON list of benchmarks to skip + type: string + default: "[]" schedule: - cron: "5 23 * * *" pull_request: @@ -112,7 +116,7 @@ jobs: python setup.py install - name: Run Triton Softmax kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'fused_softmax.py') }} run: | cd benchmarks/triton_kernels_benchmark python fused_softmax.py --reports $REPORTS @@ -121,7 +125,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/softmax-performance.csv $REPORTS/softmax-xetla-report.csv --benchmark softmax --compiler xetla --param_cols "N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG - name: Run Triton GEMM kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py') }} run: | cd benchmarks/triton_kernels_benchmark python gemm_benchmark.py --reports $REPORTS @@ -132,7 +136,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-base.csv $REPORTS/gemm-xetla-report.csv --benchmark gemm --compiler xetla --param_cols "B,M,K,N" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG - name: Run Triton GEMM kernel benchmark - default path - if: ${{ steps.install.outcome == 'success' && 
!cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_default') }} run: | cd benchmarks/triton_kernels_benchmark # Default path: @@ -148,7 +152,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-default-path.csv $REPORTS/gemm-triton-default-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton GEMM kernel benchmark - advanced path - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_advanced') }} run: | cd benchmarks/triton_kernels_benchmark # Advanced path: @@ -164,7 +168,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-adv-path.csv $REPORTS/gemm-triton-advanced-report.csv --benchmark gemm --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton GEMM (A@B^t) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_abt') }} run: | cd benchmarks/triton_kernels_benchmark TRANSPOSE_B=1 python gemm_benchmark.py --reports $REPORTS @@ -175,7 +179,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-bt.csv $REPORTS/gemm-bt-onednn-report.csv --benchmark gemm-bt --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG - name: Run Triton GEMM (A^t@B) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_benchmark.py_atb') }} run: | cd benchmarks/triton_kernels_benchmark TRANSPOSE_A=1 python gemm_benchmark.py --reports $REPORTS @@ -186,7 +190,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-at.csv $REPORTS/gemm-at-onednn-report.csv --benchmark gemm-at --compiler onednn --param_cols "B,M,K,N" --tflops_col onednn-TFlops --hbm_col "onednn-GB/s" --tag $TAG - name: Run Triton GEMM (stream-k) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_streamk_benchmark.py') }} run: | cd benchmarks/triton_kernels_benchmark python gemm_streamk_benchmark.py --reports $REPORTS @@ -194,7 +198,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-streamk-performance.csv $REPORTS/gemm-streamk-triton-report.csv --benchmark gemm-streamk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton GEMM (split-k) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_splitk_benchmark.py') }} run: | cd benchmarks/triton_kernels_benchmark python gemm_splitk_benchmark.py --reports $REPORTS @@ -202,7 +206,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-splitk-performance.csv $REPORTS/gemm-splitk-triton-report.csv --benchmark gemm-splitk --compiler triton --param_cols "M,K,N" --tflops_col Triton-TFlops 
--hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton GEMM + PreOp (exp) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_preop_exp_benchmark.py') }} run: | cd benchmarks/triton_kernels_benchmark python gemm_preop_exp_benchmark.py --reports $REPORTS @@ -210,7 +214,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-preop-exp.csv $REPORTS/gemm-preop-exp-triton-report.csv --benchmark gemm-preop-exp --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton GEMM + PostOp (Gelu) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_gelu_benchmark.py') }} run: | cd benchmarks/triton_kernels_benchmark python gemm_postop_gelu_benchmark.py --reports $REPORTS @@ -218,7 +222,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-gelu.csv $REPORTS/gemm-postop-gelu-triton-report.csv --benchmark gemm-postop-gelu --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton GEMM + PostOp (add matrix) kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'gemm_postop_addmatrix_benchmark.py') }} run: | cd benchmarks/triton_kernels_benchmark python gemm_postop_addmatrix_benchmark.py --reports $REPORTS @@ -226,7 +230,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/matmul-performance-postop-addmatrix.csv $REPORTS/gemm-postop-addmatrix-triton-report.csv --benchmark gemm-postop-addmatrix --compiler triton --param_cols "B,M,K,N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton FA kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_fwd_benchmark.py') }} run: | cd benchmarks/triton_kernels_benchmark python flash_attention_fwd_benchmark.py --reports $REPORTS @@ -236,7 +240,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-xetla-report.csv --benchmark attn --compiler xetla --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col XeTLA-TFlops --hbm_col "XeTLA-GB/s" --tag $TAG - name: Run Triton FA kernel benchmark - default path - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmark || '[]'), 'flash_attention_fwd_benchmark.py_default') }} run: | cd benchmarks/triton_kernels_benchmark TRITON_INTEL_ADVANCED_PATH=0 \ @@ -249,7 +253,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-default-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Triton FA kernel benchmark - advanced path - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'flash_attention_fwd_benchmark.py_advanced') 
}} run: | cd benchmarks/triton_kernels_benchmark TRITON_INTEL_ADVANCED_PATH=1 \ @@ -262,7 +266,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/attn-performance.csv $REPORTS/attn-triton-advanced-report.csv --benchmark attn --compiler triton --param_cols "Z,H,N_CTX,D_HEAD,CAUSAL" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run Prefix Sums kernel benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'prefix_sums.py') }} run: | cd benchmarks/triton_kernels_benchmark python prefix_sums.py --reports $REPORTS @@ -270,7 +274,7 @@ jobs: python ../../scripts/build_report.py $REPORTS/prefix-sums.csv $REPORTS/prefix_sums-triton-report.csv --benchmark prefix_sums --compiler triton --param_cols "N" --tflops_col Triton-TFlops --hbm_col "Triton-GB/s" --tag $TAG - name: Run micro benchmark - if: ${{ steps.install.outcome == 'success' && !cancelled() }} + if: ${{ steps.install.outcome == 'success' && !cancelled() && !contains(fromJson(inputs.skip_benchmarks || '[]'), 'micro_benchmarks') }} run: | cd benchmarks/micro_benchmarks python run_benchmarks.py --reports $REPORTS From 60f4dadcad5f03c2b372229419c706a74f289bda Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 13 Nov 2024 11:38:01 +0100 Subject: [PATCH 10/18] Prepare XeTLA benchmarks for 2025 compiler (#2692) `math.h` header needs to be added as a direct dependency for `INFINITY` constant which is used in these files. Signed-off-by: Anatoly Myachev --- benchmarks/xetla_kernel/flash_attention/fmha_backward.h | 1 + benchmarks/xetla_kernel/flash_attention/fmha_utils.h | 1 + 2 files changed, 2 insertions(+) diff --git a/benchmarks/xetla_kernel/flash_attention/fmha_backward.h b/benchmarks/xetla_kernel/flash_attention/fmha_backward.h index 0b1dd7ef9..ed769dc4b 100644 --- a/benchmarks/xetla_kernel/flash_attention/fmha_backward.h +++ b/benchmarks/xetla_kernel/flash_attention/fmha_backward.h @@ -4,6 +4,7 @@ #include "fmha_backward_policy.h" #include "fmha_utils.h" #include "xetla.hpp" +#include using T = sycl::half; diff --git a/benchmarks/xetla_kernel/flash_attention/fmha_utils.h b/benchmarks/xetla_kernel/flash_attention/fmha_utils.h index 9327b046a..94123300f 100644 --- a/benchmarks/xetla_kernel/flash_attention/fmha_utils.h +++ b/benchmarks/xetla_kernel/flash_attention/fmha_utils.h @@ -17,6 +17,7 @@ #pragma once #include "xetla.hpp" +#include namespace gpu::xetla { From 97c3dc7212b3988fdd54a03c3d31730a629759e2 Mon Sep 17 00:00:00 2001 From: Kali Uday Balleda Date: Wed, 13 Nov 2024 18:36:35 +0530 Subject: [PATCH 11/18] spirvrunner: default add tutorial fails (#2695) This PR addresses the issue of the tutorial addition failure in SPIRVRunner. 
Currently, the default add tutorial sample, which runs as part of SPIRVRunner, fails with the following error: ``` intel-xpu-backend-for-triton/utils/SPIRVRunner$ ./build/SPIRVRunner tensor_2 Running on device: Intel(R) Data Center GPU Max 1100 terminate called after throwing an instance of 'nlohmann::json_abi_v3_11_2::detail::out_of_range' what(): [json.exception.out_of_range.403] key 'build_flags' not found Aborted (core dumped) ``` --- utils/SPIRVRunner/args_data.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/SPIRVRunner/args_data.json b/utils/SPIRVRunner/args_data.json index 1578504cd..db7bb6807 100644 --- a/utils/SPIRVRunner/args_data.json +++ b/utils/SPIRVRunner/args_data.json @@ -32,5 +32,6 @@ "threads_per_warp": 32, "shared_memory": 0, "kernel_name": "add_kernel", - "spv_name": "add_kernel.spv" + "spv_name": "add_kernel.spv", + "build_flags": "" } From 3bd49eab1d487fb4a90634af374aa7f03cc66013 Mon Sep 17 00:00:00 2001 From: Victor Perez Date: Wed, 13 Nov 2024 15:48:22 +0100 Subject: [PATCH 12/18] [XPU][GEN] Drop `barrier` operation (#2683) Drop TritonGEN `barrier` operation replacing its uses with equivalent `spirv.OpControlBarrier` operations. Note the `GLOBAL` memory semantics specified by the original operation correspond to `SequentiallyConsistent | CrossWorkgroupMemory` in SPIR-V, as `SequentiallyConsistent` is implied. Closes #2431 --------- Signed-off-by: victor-eds --- test/Conversion/intel/tritongpu_to_gen.mlir | 18 ++++++---- test/TritonGEN/tritongen-to-llvm.mlir | 15 -------- test/TritonGEN/tritongen.mlir | 7 ---- .../Dialect/TritonGEN/IR/TritonGENOps.td | 14 -------- .../TritonGENToLLVM/TritonGENToLLVMPass.cpp | 34 ++++--------------- .../lib/TritonIntelGPUToLLVM/CMakeLists.txt | 1 + .../LoadStoreOpToLLVM.cpp | 12 +++++-- .../TritonIntelGPUToLLVM/PipelineManager.h | 3 ++ 8 files changed, 31 insertions(+), 73 deletions(-) diff --git a/test/Conversion/intel/tritongpu_to_gen.mlir b/test/Conversion/intel/tritongpu_to_gen.mlir index d83a0b4b2..f6ddb6318 100644 --- a/test/Conversion/intel/tritongpu_to_gen.mlir +++ b/test/Conversion/intel/tritongpu_to_gen.mlir @@ -1045,7 +1045,6 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // ----- module attributes {"triton_gpu.target" = "xpu", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { - // CHECK: llvm.func spir_funccc @_Z7barrierj(i32) attributes {convergent, no_unwind, will_return} // CHECK-LABEL: atomic_cas_f32_scalar_no_store tt.func @atomic_cas_f32_scalar_no_store(%ptr : !tt.ptr, %cmp : f32, %val : f32) { // CHECK: [[TRUE:%.*]] = llvm.mlir.constant(true) : i1 @@ -1054,7 +1053,10 @@ module attributes {"triton_gpu.target" = "xpu", "triton_gpu.num-ctas" = 1 : i32, // CHECK: [[CMP:%.*]] = llvm.icmp "eq" // CHECK: [[MASK:%.*]] = llvm.and [[MASK0]], [[CMP]] // CHECK: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: llvm.call spir_funccc @_Z7barrierj({{.*}}) {{.*}} : (i32) -> () + // CHECK: [[WGSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: [[WGMEMSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: [[GLOBAL:%.*]] = llvm.mlir.constant(528 : i32) : i32 + // CHECK: llvm.call spir_funccc @_Z22__spirv_ControlBarrieriii([[WGSCOPE]], [[WGMEMSCOPE]], [[GLOBAL]]) {convergent, no_unwind, will_return} : (i32, i32, i32) -> () // CHECK-NEXT: llvm.cond_br [[MASK]], ^bb1, ^bb2([[ZERO]] : i32) // CHECK-NEXT: ^bb1: // CHECK-NEXT: [[BCAST1:%.*]] = llvm.bitcast %arg1 : f32 to i32 @@ -1109,7 +1111,6 @@ module attributes 
{"triton_gpu.target" = "xpu", "triton_gpu.num-ctas" = 1 : i32, #blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0], CTAsPerCGA = [1], CTASplitNum = [1], CTAOrder = [0]}> module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { - // CHECK: llvm.func spir_funccc @_Z7barrierj(i32) attributes {convergent, no_unwind, will_return} // CHECK-LABEL: atomic_add_f32 tt.func @atomic_add_f32(%arg0 : tensor<256x!tt.ptr, #blocked0>, %arg1 : tensor<256xi1, #blocked0>, %arg2 : tensor<256xf32, #blocked0>) { // CHECK: [[EV0_ARG2:%.*]] = llvm.extractvalue %arg2[0] : !llvm.struct<(f32, f32)> @@ -1132,7 +1133,10 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // CHECK: [[IE2:%.*]] = llvm.insertelement [[EV1_ARG2]], [[UNDEF2]][{{.*}} : i64] : vector<1xf32> // CHECK-NEXT: [[PRED2:%.*]] = llvm.and [[CST_TRUE]], {{.*}} : i1 // CHECK-NEXT: [[ZERO2:%.*]] = llvm.mlir.constant(0.000000e+00 : f32) : f32 - // CHECK: llvm.call spir_funccc @_Z7barrierj({{.*}}) {{.*}} : (i32) -> () + // CHECK: [[WGSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: [[WGMEMSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: [[GLOBAL:%.*]] = llvm.mlir.constant(528 : i32) : i32 + // CHECK: llvm.call spir_funccc @_Z22__spirv_ControlBarrieriii([[WGSCOPE]], [[WGMEMSCOPE]], [[GLOBAL]]) {convergent, no_unwind, will_return} : (i32, i32, i32) -> () // CHECK-NEXT: llvm.cond_br [[PRED2]], ^bb3, ^bb4([[ZERO2]] : f32) // CHECK-NEXT: ^bb3: // CHECK-NEXT: [[BCAST2:%.*]] = llvm.bitcast [[IE2]] : vector<1xf32> to f32 @@ -1147,7 +1151,6 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : // ----- module attributes {"triton_gpu.target" = "xpu", "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} { - // CHECK: llvm.func spir_funccc @_Z7barrierj(i32) attributes {convergent, no_unwind, will_return} // CHECK-LABEL: atomic_add_f32_scalar_no_store tt.func @atomic_add_f32_scalar_no_store(%arg0 : !tt.ptr, %arg1 : i1, %arg2 : f32) { // CHECK: [[CST_TRUE:%.*]] = llvm.mlir.constant(true) : i1 @@ -1159,7 +1162,10 @@ module attributes {"triton_gpu.target" = "xpu", "triton_gpu.num-ctas" = 1 : i32, // CHECK: [[IE1:%.*]] = llvm.insertelement %arg2, [[UNDEF1]][{{.*}} : i64] : vector<1xf32> // CHECK: [[PRED:%.*]] = llvm.and [[AND1]], %arg1 : i1 // CHECK-NEXT: [[ZERO:%.*]] = llvm.mlir.constant(0.000000e+00 : f32) : f32 - // CHECK: llvm.call spir_funccc @_Z7barrierj({{.*}}) {{.*}} : (i32) -> () + // CHECK: [[WGSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: [[WGMEMSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: [[GLOBAL:%.*]] = llvm.mlir.constant(528 : i32) : i32 + // CHECK: llvm.call spir_funccc @_Z22__spirv_ControlBarrieriii([[WGSCOPE]], [[WGMEMSCOPE]], [[GLOBAL]]) {convergent, no_unwind, will_return} : (i32, i32, i32) -> () // CHECK-NEXT: llvm.cond_br [[PRED]], ^bb1, ^bb2([[ZERO]] : f32) // CHECK-NEXT: ^bb1: // CHECK-NEXT: [[BCAST2:%.*]] = llvm.bitcast [[IE1]] : vector<1xf32> to f32 diff --git a/test/TritonGEN/tritongen-to-llvm.mlir b/test/TritonGEN/tritongen-to-llvm.mlir index e4dd0bc96..e3a4cdbfa 100644 --- a/test/TritonGEN/tritongen-to-llvm.mlir +++ b/test/TritonGEN/tritongen-to-llvm.mlir @@ -1,20 +1,5 @@ // RUN: triton-opt -convert-tritongen-to-llvm -split-input-file %s | FileCheck %s -// CHECK: llvm.func spir_funccc @_Z7barrierj(i32) attributes {convergent, no_unwind, will_return} - -llvm.func @triton_gen.barrier() { - // CHECK-LABEL: triton_gen.barrier - // CHECK: 
[[LOCAL:%.*]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: llvm.call spir_funccc @_Z7barrierj([[LOCAL]]) {{.*}} : (i32) -> () - // CHECK: [[GLOBAL:%.*]] = llvm.mlir.constant(2 : i32) : i32 - // CHECK: llvm.call spir_funccc @_Z7barrierj([[GLOBAL]]) {{.*}} : (i32) -> () - triton_gen.barrier {mem_fence=Local} - triton_gen.barrier {mem_fence=Global} - llvm.return -} - -// ----- - // CHECK-DAG: llvm.func spir_funccc @_Z31intel_work_group_barrier_arriveii(i32, i32) attributes {convergent, no_unwind, will_return} // CHECK-DAG: llvm.func spir_funccc @_Z29intel_work_group_barrier_waitii(i32, i32) attributes {convergent, no_unwind, will_return} diff --git a/test/TritonGEN/tritongen.mlir b/test/TritonGEN/tritongen.mlir index f388da5aa..90e2336de 100644 --- a/test/TritonGEN/tritongen.mlir +++ b/test/TritonGEN/tritongen.mlir @@ -1,12 +1,5 @@ // RUN: triton-opt %s -split-input-file -verify-diagnostics | FileCheck %s -llvm.func @triton_gen.barrier() { - // CHECK-LABEL: triton_gen.barrier - // CHECK: triton_gen.barrier {mem_fence = Local} - triton_gen.barrier {mem_fence=Local} - llvm.return -} - llvm.func @triton_gen.split_barrier_signal() { // CHECK-LABEL: triton_gen.split_barrier_signal // CHECK: triton_gen.split_barrier_signal {mem_fence = None, mem_scope = WorkGroup} diff --git a/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENOps.td b/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENOps.td index d9d5266fb..dde9fd97e 100644 --- a/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENOps.td +++ b/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENOps.td @@ -32,20 +32,6 @@ class TritonGEN_Op traits = []> : // Synchronization //===----------------------------------------------------------------------===// -def TritonGEN_BarrierOp : TritonGEN_Op<"barrier"> { - let summary = "Workgroup barrier"; - let description = [{ - The `triton_gen.barrier` operation performs a workgroup barrier and ensures - all outstanding memory transaction using local or global memory are complete. 
- }]; - let arguments = (ins TritonGEN_MemFence:$mem_fence); - let results = (outs); - let assemblyFormat = "attr-dict"; - let assemblyFormat = [{ - ` ` `{` `mem_fence` `=` $mem_fence `}` attr-dict - }]; -} - def TritonGEN_SplitBarrierSignalOp : TritonGEN_Op<"split_barrier_signal"> { let summary = "Split barrier signal"; let description = [{ diff --git a/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp b/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp index 3f45a9d77..78df567d6 100644 --- a/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp +++ b/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp @@ -479,27 +479,6 @@ namespace { // Synchronization Ops Lowerings //===----------------------------------------------------------------------===// -struct TritonGENBarrierLowering - : public ConvertOpToLLVMPattern { - using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; - - LogicalResult - matchAndRewrite(TritonGEN::BarrierOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - MLIRContext *ctx = rewriter.getContext(); - Location loc = op->getLoc(); - Type retType = void_ty(ctx); - IntegerType argType = int_ty(32); - Value arg = i32_val(static_cast(op.getMemFence())); - - LLVM::CallOp callOp = - createDeviceFunctionCall(rewriter, "_Z7barrierj", {retType}, {argType}, - {arg}, {}, convergentNoUnwindWillReturnAttrs); - rewriter.replaceOp(op, callOp); - return success(); - } -}; - struct TritonGENSplitBarrier { protected: template @@ -1092,13 +1071,12 @@ struct TritonGENToLLVMDialectInterface : public ConvertToLLVMPatternInterface { void mlir::triton::populateTritonGENToLLVMConversionPatterns( LLVMTypeConverter &converter, RewritePatternSet &patterns) { - patterns - .add(converter); + patterns.add< + TritonGENSplitBarrierSignalLowering, TritonGENSplitBarrierWaitLowering, + TritonSubGroupReduceLowering, TritonSubGroupScanLowering, + TritonMatrixDPASLowering, TritonMatrix2DBlockLoadLowering, + TritonMatrix2DBlockStoreLowering, TritonMatrix2DBlockPrefetchLowering, + TritonSIMDBlockReadLowering, TritonSIMDBlockWriteLowering>(converter); } void registerConvertTritonTritonGENToLLVMInterface(DialectRegistry ®istry) { diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/CMakeLists.txt b/third_party/intel/lib/TritonIntelGPUToLLVM/CMakeLists.txt index f46c265fa..4e86cbd2f 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/CMakeLists.txt +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/CMakeLists.txt @@ -33,6 +33,7 @@ add_triton_library(TritonIntelGPUToLLVM LINK_LIBS PUBLIC GPUToTritonGEN MLIRGPUToLLVMSPV + MLIRSPIRVToLLVM TritonGENIR TritonGENToLLVM TritonIntelGPUIR diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp index 35eb54024..edd1999ea 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -1,5 +1,6 @@ #include "Dialect/TritonIntelGPU/IR/Dialect.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" +#include "mlir/Dialect/SPIRV/IR/SPIRVOps.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/TypeUtilities.h" #include "llvm/ADT/SmallVector.h" @@ -1313,7 +1314,10 @@ struct AtomicCASOpConversion Value zero = (valueElemNBits == 32) ? 
i32_val(0) : i64_val(0); if (!atomicNeedsSharedMemory(op.getResult())) - rewriter.create(loc, TritonGEN::MemFence::GLOBAL); + rewriter.create( + loc, spirv::Scope::Workgroup, spirv::Scope::Workgroup, + spirv::MemorySemantics::SequentiallyConsistent | + spirv::MemorySemantics::CrossWorkgroupMemory); Block &endBlock = LLVM::intel::createPredicatedBlock(rewriter, loc, mask, {zero}, [&] { // casPtr = bitcast(casPtr, ptr_ty(ctx, 1)); @@ -1462,8 +1466,10 @@ struct AtomicRMWOpConversion rmwPtr, rmwVal, rmwMask, {zero}); } else { if (!atomicNeedsSharedMemory(op.getResult())) - rewriter.create(loc, - TritonGEN::MemFence::GLOBAL); + rewriter.create( + loc, spirv::Scope::Workgroup, spirv::Scope::Workgroup, + spirv::MemorySemantics::SequentiallyConsistent | + spirv::MemorySemantics::CrossWorkgroupMemory); endBlock = &LLVM::intel::createPredicatedBlock( rewriter, loc, rmwMask, {zero}, [&] { mlir::LLVM::AtomicBinOp rmwKind; diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h index b52b3a3b9..0593ca63f 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h @@ -18,6 +18,7 @@ #include "mlir/Conversion/GPUToLLVMSPV/GPUToLLVMSPVPass.h" #include "mlir/Conversion/MathToLLVM/MathToLLVM.h" #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" +#include "mlir/Conversion/SPIRVToLLVM/SPIRVToLLVM.h" #include "mlir/Conversion/UBToLLVM/UBToLLVM.h" #include "mlir/Dialect/SPIRV/IR/TargetAndABI.h" #include "mlir/IR/PatternMatch.h" @@ -268,6 +269,8 @@ class TritonGPUToLLVMPipelineManager { triton::populateGPUToTritonGENConversionPatterns(typeConverter, patterns); cf::populateControlFlowToLLVMConversionPatterns(typeConverter, patterns); populateGpuToLLVMSPVConversionPatterns(typeConverter, patterns); + populateSPIRVToLLVMConversionPatterns(typeConverter, patterns, + spirv::ClientAPI::OpenCL); } private: From b2474a39cf098e8fed1d61e080220fe9db2a3893 Mon Sep 17 00:00:00 2001 From: Victor Perez Date: Wed, 13 Nov 2024 15:50:21 +0100 Subject: [PATCH 13/18] [XPU] Conditionally add elementwise optimization pass to the pipeline (#2696) Conditionally add `-tritonintelgpu-optimize-elementwise-parallelism` to the pipeline. 
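
For reference, a minimal sketch of the intended wiring in `make_ttgir` (it mirrors the change in `third_party/intel/backend/compiler.py` below; the new pass is only scheduled when the reduction-locality optimization is enabled via its environment variable):

```python
# Sketch only -- mirrors the diff below, not a separate implementation.
# The string comparison is intentional: os.getenv returns a str, so `== 1` would never match.
if os.getenv("TRITON_INTEL_OPTIMIZE_REDUCTION_LOCALITY", "0") == "1":
    intel.passes.ttgpuir.add_optimize_reduction_locality(pm)
    intel.passes.ttgpuir.add_optimize_elementwise_parallelism(pm)
```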
Signed-off-by: victor-eds --- third_party/intel/backend/compiler.py | 3 ++- third_party/intel/triton_xpu.cc | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py index 50301edd1..b05f856bb 100644 --- a/third_party/intel/backend/compiler.py +++ b/third_party/intel/backend/compiler.py @@ -251,8 +251,9 @@ def make_ttgir(mod, metadata, opt, properties): passes.common.add_cse(pm) passes.ttgpuir.add_prefetch(pm) passes.ttgpuir.add_optimize_dot_operands(pm, True) - if os.getenv("TRITON_INTEL_OPTIMIZE_REDUCTION_LOCALITY", "0") == 1: + if os.getenv("TRITON_INTEL_OPTIMIZE_REDUCTION_LOCALITY", "0") == "1": intel.passes.ttgpuir.add_optimize_reduction_locality(pm) + intel.passes.ttgpuir.add_optimize_elementwise_parallelism(pm) intel.passes.ttgpuir.add_remove_layout_conversions(pm) intel.passes.ttgpuir.add_reduce_data_duplication(pm) passes.ttgpuir.add_reorder_instructions(pm) diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc index 55db14991..3a3037f6c 100644 --- a/third_party/intel/triton_xpu.cc +++ b/third_party/intel/triton_xpu.cc @@ -100,6 +100,9 @@ void init_triton_intel_passes_ttgpuir(py::module &&m) { gpu::intel::createTritonIntelGPUMaterializeBlockPointer); ADD_PASS_WRAPPER_0("add_optimize_reduction_locality", gpu::intel::createTritonIntelGPUOptimizeReductionLocality); + ADD_PASS_WRAPPER_0( + "add_optimize_elementwise_parallelism", + gpu::intel::createTritonIntelGPUOptimizeElementwiseParallelism); } void init_triton_intel(py::module &&m) { From e33285ba71470e4692af1cc93842f4fe2126d444 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 13 Nov 2024 16:45:04 +0100 Subject: [PATCH 14/18] Switch XeTLA to the version compatible with 2025 compiler (#2697) For ref: https://github.com/intel/xetla/commit/bde127ffebf502d32ef8ac2748e12d7839597fab Signed-off-by: Anatoly Myachev --- benchmarks/xetla_kernel/xetla-library.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/xetla_kernel/xetla-library.conf b/benchmarks/xetla_kernel/xetla-library.conf index 2cc1e9f5b..944094ecd 100644 --- a/benchmarks/xetla_kernel/xetla-library.conf +++ b/benchmarks/xetla_kernel/xetla-library.conf @@ -1 +1 @@ -b9e489ca6a776694a898044a3f2ae023a98db03d +bde127ffebf502d32ef8ac2748e12d7839597fab From a5393bff3cbe6942bee345706e6cee6bee787871 Mon Sep 17 00:00:00 2001 From: Pavel Chekin Date: Wed, 13 Nov 2024 09:44:22 -0800 Subject: [PATCH 15/18] runner-0.0.20: DLE instead of PTDB (#2691) Fixes #2592 for CI runners except performance runners. 
--- .github/workflows/auto-update-translator-cid.yml | 2 +- .github/workflows/bandit-check.yml | 2 +- .github/workflows/build-test-reusable.yml | 2 +- .github/workflows/conda-test-reusable.yml | 4 ++-- .github/workflows/nightly-wheels.yml | 2 +- .github/workflows/no-basekit-build-test.yml | 2 +- scripts/install-conda.sh | 4 ++-- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/auto-update-translator-cid.yml b/.github/workflows/auto-update-translator-cid.yml index 0b854da30..7c2aad266 100644 --- a/.github/workflows/auto-update-translator-cid.yml +++ b/.github/workflows/auto-update-translator-cid.yml @@ -15,7 +15,7 @@ jobs: runs-on: - max1100 - rolling - - runner-0.0.19 + - runner-0.0.20 defaults: run: shell: bash -noprofile --norc -eo pipefail -c "source /opt/intel/oneapi/setvars.sh > /dev/null; source {0}" diff --git a/.github/workflows/bandit-check.yml b/.github/workflows/bandit-check.yml index 9edaad048..9d20af805 100644 --- a/.github/workflows/bandit-check.yml +++ b/.github/workflows/bandit-check.yml @@ -11,7 +11,7 @@ jobs: runs-on: - max1100 - rolling - - runner-0.0.19 + - runner-0.0.20 defaults: run: shell: bash diff --git a/.github/workflows/build-test-reusable.yml b/.github/workflows/build-test-reusable.yml index aca94112b..0c3598c67 100644 --- a/.github/workflows/build-test-reusable.yml +++ b/.github/workflows/build-test-reusable.yml @@ -55,7 +55,7 @@ on: runner_version: description: Runner label for version type: string - default: runner-0.0.19 + default: runner-0.0.20 permissions: read-all diff --git a/.github/workflows/conda-test-reusable.yml b/.github/workflows/conda-test-reusable.yml index 11989e858..d81f36548 100644 --- a/.github/workflows/conda-test-reusable.yml +++ b/.github/workflows/conda-test-reusable.yml @@ -55,7 +55,7 @@ on: runner_version: description: Runner label for version type: string - default: runner-0.0.19 + default: runner-0.0.20 env_manager: description: Environment manager default: conda @@ -82,7 +82,7 @@ jobs: id: conda-cache uses: ./.github/actions/load env: - CACHE_NUMBER: 6 + CACHE_NUMBER: 7 with: path: $HOME/miniforge3/envs/triton key: conda-${{ inputs.env_manager }}-py${{ matrix.python }}-${{ hashFiles('scripts/triton.yml', 'python/pyproject.toml', 'python/setup.py') }}-${{ env.CACHE_NUMBER }} diff --git a/.github/workflows/nightly-wheels.yml b/.github/workflows/nightly-wheels.yml index d4b236e43..832fafce7 100644 --- a/.github/workflows/nightly-wheels.yml +++ b/.github/workflows/nightly-wheels.yml @@ -16,7 +16,7 @@ jobs: runs-on: - max1100 - rolling - - runner-0.0.19 + - runner-0.0.20 strategy: matrix: python: diff --git a/.github/workflows/no-basekit-build-test.yml b/.github/workflows/no-basekit-build-test.yml index dfa0313cb..3c6e5a42d 100644 --- a/.github/workflows/no-basekit-build-test.yml +++ b/.github/workflows/no-basekit-build-test.yml @@ -17,7 +17,7 @@ jobs: runs-on: - max1100 - rolling - - runner-0.0.19 + - runner-0.0.20 strategy: matrix: python: ${{ github.ref_name == 'main' && fromJson('["3.9", "3.10", "3.11"]') || fromJson('["3.9"]') }} diff --git a/scripts/install-conda.sh b/scripts/install-conda.sh index c55cc9539..dd29d57fa 100755 --- a/scripts/install-conda.sh +++ b/scripts/install-conda.sh @@ -2,8 +2,8 @@ link_sycl() { mkdir -p $HOME/miniforge3/envs/triton/$1 - ln -snf /opt/intel/oneapi/compiler/2024.1/include/sycl $HOME/miniforge3/envs/triton/$1/ - ln -snf /opt/intel/oneapi/compiler/2024.1/include/sycl/CL $HOME/miniforge3/envs/triton/$1/ + ln -snf /opt/intel/oneapi/compiler/latest/include/sycl 
$HOME/miniforge3/envs/triton/$1/ + ln -snf /opt/intel/oneapi/compiler/latest/include/sycl/CL $HOME/miniforge3/envs/triton/$1/ } install_env() { From 26c23a44be296255ce2faffc7f0fd2248cd32768 Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Thu, 14 Nov 2024 00:28:47 +0000 Subject: [PATCH 16/18] Add test_chained_reductions to skiplist Signed-off-by: Whitney Tsang --- scripts/skiplist/a770/language.txt | 2 ++ scripts/skiplist/conda/language.txt | 2 ++ scripts/skiplist/default/language.txt | 2 ++ scripts/skiplist/lts/language.txt | 2 ++ scripts/skiplist/mtl/language.txt | 2 ++ scripts/skiplist/xe2/language.txt | 2 ++ 6 files changed, 12 insertions(+) diff --git a/scripts/skiplist/a770/language.txt b/scripts/skiplist/a770/language.txt index a829b75bf..e833b924b 100644 --- a/scripts/skiplist/a770/language.txt +++ b/scripts/skiplist/a770/language.txt @@ -2,6 +2,8 @@ test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] # https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] +# https://github.com/intel/intel-xpu-backend-for-triton/issues/2703 +test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float32-float32] diff --git a/scripts/skiplist/conda/language.txt b/scripts/skiplist/conda/language.txt index f8a9e4812..41035163f 100644 --- a/scripts/skiplist/conda/language.txt +++ b/scripts/skiplist/conda/language.txt @@ -117,6 +117,8 @@ test/unit/language/test_core.py::test_dot_max_num_imprecise_acc[128-float8e5-128 test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] # https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] +# https://github.com/intel/intel-xpu-backend-for-triton/issues/2703 +test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float32-float32] diff --git a/scripts/skiplist/default/language.txt b/scripts/skiplist/default/language.txt index 36c6d7e69..fb018c5e0 100644 --- a/scripts/skiplist/default/language.txt +++ b/scripts/skiplist/default/language.txt @@ -2,3 +2,5 @@ test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] # https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] +# https://github.com/intel/intel-xpu-backend-for-triton/issues/2703 +test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0] diff --git a/scripts/skiplist/lts/language.txt b/scripts/skiplist/lts/language.txt index 76dd77c93..c2842cdb9 100644 --- a/scripts/skiplist/lts/language.txt +++ b/scripts/skiplist/lts/language.txt @@ -117,6 +117,8 @@ test/unit/language/test_core.py::test_dot_max_num_imprecise_acc[128-float8e5-128 
test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] # https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] +# https://github.com/intel/intel-xpu-backend-for-triton/issues/2703 +test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float32-float32] diff --git a/scripts/skiplist/mtl/language.txt b/scripts/skiplist/mtl/language.txt index a346bc76a..df2e44aae 100644 --- a/scripts/skiplist/mtl/language.txt +++ b/scripts/skiplist/mtl/language.txt @@ -2,6 +2,8 @@ test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] # https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] +# https://github.com/intel/intel-xpu-backend-for-triton/issues/2703 +test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float32-float32] diff --git a/scripts/skiplist/xe2/language.txt b/scripts/skiplist/xe2/language.txt index 3d923ed1d..436fe5255 100644 --- a/scripts/skiplist/xe2/language.txt +++ b/scripts/skiplist/xe2/language.txt @@ -2,6 +2,8 @@ test/unit/language/test_core.py::test_precise_math[1-tl.math.sqrt_rn(x)-tl.math.sqrt(x.to(tl.float64)).to(tl.float32)] # https://github.com/intel/intel-xpu-backend-for-triton/issues/2662 test/unit/language/test_core.py::test_scan_layouts[True-1-src_layout10-64-32] +# https://github.com/intel/intel-xpu-backend-for-triton/issues/2703 +test/unit/language/test_core.py::test_chained_reductions[in_shape0-perm0-red_dims0] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float16] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float16-float32] test/unit/language/test_core.py::test_dot3d[1-1-32-32-32-32-32-float32-float32] From 532728c06c4b58ecae32f7e12722886c8521f276 Mon Sep 17 00:00:00 2001 From: glados-intel <153325143+glados-intel@users.noreply.github.com> Date: Wed, 13 Nov 2024 18:10:21 -0800 Subject: [PATCH 17/18] [github-bot] Update spirv-llvm-translator.conf (#2702) Automated PR to update translator commit id. --- lib/Target/SPIRV/spirv-llvm-translator.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/Target/SPIRV/spirv-llvm-translator.conf b/lib/Target/SPIRV/spirv-llvm-translator.conf index 22368d508..8134401a4 100644 --- a/lib/Target/SPIRV/spirv-llvm-translator.conf +++ b/lib/Target/SPIRV/spirv-llvm-translator.conf @@ -1 +1 @@ -1a1bf17d9e8684cd826e4278e78f63aa80e2e2ca +15fd1cc50e12465c74ef34a264f11c8523247b46 From e30e00f47005eb5ca121ef529cc05d14b4fec6b7 Mon Sep 17 00:00:00 2001 From: Pavel Chekin Date: Thu, 14 Nov 2024 08:36:39 -0800 Subject: [PATCH 18/18] Custom README location (#2705) Move README specific to this repository to `.github` and keep the upstream README as is to minimize difference with the upstream. 
The content from `.github/README.md` is displayed for the repository, see https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-readmes. Fixes #2706. --- .github/README.md | 324 ++++++++++++++++++++++++++++++++++++++++++++++ README.md | 291 +++++++++++++++-------------------------- 2 files changed, 426 insertions(+), 189 deletions(-) create mode 100644 .github/README.md diff --git a/.github/README.md b/.github/README.md new file mode 100644 index 000000000..0fc64b276 --- /dev/null +++ b/.github/README.md @@ -0,0 +1,324 @@ +[![Build and test](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/build-test.yml/badge.svg?branch=main)](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/build-test.yml) +[![Triton wheels](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/nightly-wheels.yml/badge.svg?branch=main)](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/nightly-wheels.yml) + +# Intel® XPU Backend for Triton\* + +This is the development repository of Intel® XPU Backend for Triton\*, a new [Triton](https://github.com/triton-lang/triton) backend for Intel GPUs. +Intel® XPU Backend for Triton\* is a out of tree backend module for [Triton](https://github.com/triton-lang/triton) used to provide best-in-class performance and productivity on any Intel GPUs for [PyTorch](https://github.com/pytorch/pytorch) and standalone usage. + +# Compatibility + +* Operating systems: + * [Ubuntu 22.04](http://releases.ubuntu.com/22.04) +* GPU Cards: + * [Intel® Data Center GPU Max Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html) + * [Intel® Data Center Flex Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/flex-series.html) + * [Intel Arc A770](https://www.intel.com/content/www/us/en/products/sku/229151/intel-arc-a770-graphics-16gb/specifications.html) +* GPU Drivers: + * Latest [Long Term Support (LTS) Release](https://dgpu-docs.intel.com/driver/installation.html) + * Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html) +* Toolchain: + * Latest [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) + +Note that Intel® XPU Backend for Triton\* is not compatible with Intel® Extension for PyTorch\* and Intel® oneAPI Base Toolkit\*. + +# Quick Installation + +## Prerequisites + +1. Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html) or [Long Term Support Release](https://dgpu-docs.intel.com/driver/installation.html) of GPU driver +2. Latest release of [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) +3. Latest release of [Profiling Tools Interfaces for Intel GPU (PTI for GPU)](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) + +## Install PyTorch and Triton from nightly wheels + +Currently, Intel® XPU Backend for Triton\* requires a special version of PyTorch and both can be installed from nightly wheels. 
+Navigate to the [nightly wheels workflow](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/nightly-wheels.yml), +select the most recent successful run on the top of the page and download an artifact for the corresponding Python version. +Extract the archive and in the extracted directory execute: + +```shell +pip install torch-*.whl triton-*.whl +``` + +Before using Intel® XPU Backend for Triton\* you need to initialize the toolchain. +The default location is `/opt/intel/oneapi` (if installed as a `root` user) or `~/intel/oneapi` (if installed as a regular user). + +```shell +# replace /opt/intel/oneapi with the actual location of PyTorch Prerequisites for Intel GPUs +source /opt/intel/oneapi/setvars.sh +``` + +# Install from source + +## Prerequisites + +1. Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html) or [Long Term Support Release](https://dgpu-docs.intel.com/driver/installation.html) of GPU driver +2. Latest release of [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) +3. Latest release of [Profiling Tools Interfaces for Intel GPU (PTI for GPU)](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) + +## Compile PyTorch and Triton from source + +Currently, Intel® XPU Backend for Triton\* requires a special version of PyTorch and both need to be compiled at the same time. + +Before compiling PyTorch and Intel® XPU Backend for Triton\* you need to initialize the toolchain. +The default location is `/opt/intel/oneapi` (if installed as a `root` user) or `~/intel/oneapi` (if installed as a regular user). + +```shell +# replace /opt/intel/oneapi with the actual location of PyTorch Prerequisites for Intel GPUs +source /opt/intel/oneapi/setvars.sh +``` + +Clone this repository: + +```shell +git clone https://github.com/intel/intel-xpu-backend-for-triton.git +cd intel-xpu-backend-for-triton +``` + +To avoid potential conflicts with installed packages it is recommended to create and activate a new Python virtual environment: + +```shell +python -m venv .venv --prompt triton +source .venv/bin/activate +``` + +Compile and install PyTorch: + +```shell +scripts/install-pytorch.sh --source +``` + +Compile and install Intel® XPU Backend for Triton\*: + +```shell +scripts/compile-triton.sh +``` + +# Building with a custom LLVM + +Triton uses LLVM to generate code for GPUs and CPUs. Normally, the Triton build +downloads a prebuilt LLVM, but you can also build LLVM from source and use that. + +LLVM does not have a stable API, so the Triton build will not work at an +arbitrary LLVM version. + +1. Find the version of LLVM that Triton builds against. +Check `cmake/llvm-hash.txt` to see the current version. + +2. Checkout LLVM at this revision to the directory `llvm`, +which must be in the same directory as `intel-xpu-backend-for-triton`: + +3. In the directory `intel-xpu-backend-for-triton`, build Triton with custom LLVM: + + ```shell + ./scripts/compile-triton.sh --llvm --triton + ``` + +# Tips for building + +- Set `TRITON_BUILD_WITH_CLANG_LLD=true` as an environment variable to use clang + and lld. lld in particular results in faster builds. + +- Set `TRITON_BUILD_WITH_CCACHE=true` to build with ccache. + +- Set `TRITON_HOME=/some/path` to change the location of the `.triton` + directory where Triton's cache is located and downloads are stored + during the build. 
By default, this is the user's home directory. It + can be changed anytime. + +- Pass `--no-build-isolation` to `pip install` to make nop builds faster. + Without this, every invocation of `pip install` uses a different symlink to + cmake, and this forces ninja to rebuild most of the `.a` files. + +- VSCcode IntelliSense has some difficulty figuring out how to build Triton's C++ + (probably because, in our build, users don't invoke cmake directly, but + instead use setup.py). Teach vscode how to compile Triton as follows. + + - Do a local build. Run command `pip install -e python` + - Get the full path to the `compile_commands.json` file produced by the build: + `find python/build -name 'compile_commands.json' | xargs readlink -f`. + You might get a full path similar to `/Users/{username}/triton/python/build/cmake.macosx-11.1-arm64-cpython-3.12/compile_commands.json` + - In vscode, install the + [C/C++ + extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode.cpptools), + then open the command palette (`Shift + Command + P` on Mac, or `Shift + + Ctrl + P` on Windows/Linux) and open `C/C++: Edit Configurations (UI)`. + - Open "Advanced Settings" and paste the full path to + `compile_commands.json` into the "Compile Commands" textbox. + +# Running tests + +There currently isn't a turnkey way to run all the Triton tests, but you can +follow the following recipe. + +```shell +scripts/test-triton.sh +``` + +# Tips for hacking + +For detailed instructions on how to debug Triton's frontend, please refer to this [tutorial](https://triton-lang.org/main/programming-guide/chapter-3/debugging.html). The following includes additional tips for hacking on Triton's backend. + +**Helpful environment variables** + +- `MLIR_ENABLE_DUMP=1` dumps the IR before every MLIR pass Triton runs, for all + kernels. Use `MLIR_ENABLE_DUMP=kernelName` to dump for a specific kernel only. + - Triton cache can interfere with the dump. In cases where `MLIR_ENABLE_DUMP=1` does not work, try cleaning your triton cache: `rm -r ~/.triton/cache/*` +- `LLVM_IR_ENABLE_DUMP=1` dumps the IR before every pass run over the LLVM IR. +- `TRITON_INTERPRET=1` uses the Triton interpreter instead of running on the + GPU. You can insert Python breakpoints in your kernel code! +- `TRITON_ENABLE_LLVM_DEBUG=1` passes `-debug` to LLVM, printing a lot of + debugging information to stdout. If this is too noisy, run with just + `TRITON_LLVM_DEBUG_ONLY` instead to limit the output. + + An alternative way to reduce output noisiness is running with + `LLVM_IR_ENABLE_DUMP=1`, extract the IR before the LLVM pass of interest, and + then run LLVM's `opt` standalone, perhaps passing `-debug-only=foo` on the + command line. +- `TRITON_LLVM_DEBUG_ONLY=` is the equivalent of LLVM's + `-debug-only` command-line option. This limits the LLVM debug output to + specific pass or component names (which are specified using `#define + DEBUG_TYPE` throughout LLVM and Triton) in order to allow the debug output to + be less noisy. `TRITON_LLVM_DEBUG_ONLY` allows for one or more comma + separated values to be specified (eg + `TRITON_LLVM_DEBUG_ONLY="tritongpu-remove-layout-conversions` or + `TRITON_LLVM_DEBUG_ONLY="tritongpu-remove-layout-conversions,regalloc"`). +- `USE_IR_LOC={ttir,ttgir}` reparses the IR such that the location information + will be the line number of the IR file with that particular extension, + instead of line number of the python file. This can provide a direct mapping + from the IR to llir/ptx. 
When used with performance tools, it can provide a + breakdown on IR instructions. +- `TRITON_PRINT_AUTOTUNING=1` prints out the best autotuning config and total time + spent for each kernel after autotuning is complete. +- `DISABLE_LLVM_OPT` will disable llvm optimizations for make_llir and make_ptx + if its value is true when parsing as Bool. Otherwise, it will be parsed as a list + of flags to disable llvm optimizations. One usage case is + `DISABLE_LLVM_OPT="disable-lsr"` + Loop strength reduction is known to cause up to 10% performance changes for + certain kernels with register pressure. +- `TRITON_ALWAYS_COMPILE=1` forces to compile kernels regardless of cache hit. +- `MLIR_ENABLE_TIMING` dumps the timing information for each MLIR pass. +- `LLVM_ENABLE_TIMING` dumps the timing information for each LLVM pass. +- `TRITON_DEFAULT_FP_FUSION` overrides the default behavior of allowing fp fusion (mul+add->fma). +- `MLIR_ENABLE_REMARK` enables the performance warnings that are emitted as remarks. + +# Usage Guide + +## Code Modifications +Intel® XPU Backend for Triton\* requires a special version of PyTorch that can be built from sources or installed from nightly wheels. + +1. Add `import torch` for xpu support. +2. Put the tensor and models to XPU by calling `to('xpu')`. + +This repository contains modified [tutorials](https://github.com/intel/intel-xpu-backend-for-triton/tree/main/python/tutorials) that must be used with Intel® XPU Backend for Triton\*. + +The following examples show modifications for the user code. + +### Example 1 : Triton Kernel + +This example is a modified version of [Vector Add](https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html#vector-addition) triton kernel. Please refer to [Vector Add](https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html#vector-addition) for detailed comments and illustration about the code semantics. + +Comparing to the original code, the following code modifies: + +```Python +import torch +import triton +import triton.language as tl + + +@triton.jit +def add_kernel( + x_ptr, + y_ptr, + output_ptr, + n_elements, + BLOCK_SIZE: tl.constexpr, +): + pid = tl.program_id(axis=0) + block_start = pid * BLOCK_SIZE + offsets = block_start + tl.arange(0, BLOCK_SIZE) + mask = offsets < n_elements + x = tl.load(x_ptr + offsets, mask=mask) + y = tl.load(y_ptr + offsets, mask=mask) + output = x + y + tl.store(output_ptr + offsets, output, mask=mask) + +def add(x: torch.Tensor, y: torch.Tensor): + # Put the tensor to xpu + output = torch.empty_like(x).xpu() + assert x.is_xpu and y.is_xpu and output.is_xpu + n_elements = output.numel() + grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) + add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024) + + return output + +# For manual_seed, needs to use API for XPU +torch.xpu.manual_seed(0) +size = 512 +# For tensors, needs to be put on XPU +x = torch.rand(size, device='xpu') +y = torch.rand(size, device='xpu') +output_torch = x + y +output_triton = add(x, y) +print(output_torch) +print(output_triton) +print( + f'The maximum difference between torch and triton is ' + f'{torch.max(torch.abs(output_torch - output_triton))}' +) +``` + +### Example 2 : End-to-End Model +Triton is transparent for end-to-end models. One could easily use `torch.compile` with `inductor` as backend by default. It will automatically generates triton kernel and gets benefit from it. 
+ +```Python +import torch +from torch._dynamo.testing import rand_strided + +from torch.nn import * +class simpleModel(torch.nn.Module): + def __init__(self): + super().__init__() + # tensors inside model should be on xpu + self.y = rand_strided((32, 8), (8, 1), device='xpu:0', dtype=torch.float32) + + def forward(self, x): + z = x + self.y + return z + +# tensors passed to the model should be on xpu +x = rand_strided((32, 8), (8, 1), device='xpu:0', dtype=torch.float32) +xpu_model = simpleModel() +# Call torch.compile for optimization +optimized_mod = torch.compile(xpu_model) + +graph_result = optimized_mod(x) +``` + +## Performance Analysis Guide + +There are several ways of doing performance analysis. +We recommend using `torch.profiler` for end-to-end performance analysis and using Intel® VTune™ Profiler for more detailed kernel analysis. +Note that the user needs to explicitly set `TRITON_XPU_PROFILE=1` when the user needs to enable kernel profiling. + +```Bash +export TRITON_XPU_PROFILE=1 +``` + +# Contributing + +Community contributions are more than welcome, whether it be to fix bugs or to add new features at [github](https://github.com/intel/intel-xpu-backend-for-triton). For more detailed instructions, please visit our [contributor's guide](https://github.com/intel/intel-xpu-backend-for-triton/blob/main/CONTRIBUTING.md). + +## License + +_MIT License_. As found in [LICENSE](https://github.com/intel/intel-xpu-backend-for-triton/blob/main/LICENSE) file. + + +## Security + +See Intel's [Security Center](https://www.intel.com/content/www/us/en/security-center/default.html) +for information on how to report a potential security issue or vulnerability. + +See also: [Security Policy](https://github.com/intel/intel-xpu-backend-for-triton/blob/main/SECURITY.md). diff --git a/README.md b/README.md index a8bbe2c2e..18e46403e 100644 --- a/README.md +++ b/README.md @@ -1,98 +1,60 @@ -[![Build and test](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/build-test.yml/badge.svg?branch=main)](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/build-test.yml) -[![Triton wheels](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/nightly-wheels.yml/badge.svg?branch=main)](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/nightly-wheels.yml) -[![Conda test](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/conda-build-test.yml/badge.svg?branch=main)](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/conda-build-test.yml) +
+ Triton logo +
-# Intel® XPU Backend for Triton\* +The Triton Conference is happening again on September 17th, 2024 in Fremont (CA)! -This is the development repository of Intel® XPU Backend for Triton\*, a new [Triton](https://github.com/triton-lang/triton/) backend for Intel GPUs. Intel® XPU Backend for Triton\* is a out of tree backend module for [Triton](https://github.com/triton-lang/triton/blob/main/CONTRIBUTING.md) used to provide best-in-class performance and productivity on any Intel GPUs for [PyTorch](https://github.com/triton-lang/triton/blob/main/CONTRIBUTING.md) and standalone usage. +If you are interested in attending, please fill up [this form](https://docs.google.com/forms/d/e/1FAIpQLSecHC1lkalcm0h3JDUbspekDX5bmBvMxgVTLaK3e-61bzDDbg/viewform). -# Compatibility - -* Operating systems: - * [Ubuntu 22.04](http://releases.ubuntu.com/22.04) -* GPU Cards: - * [Intel® Data Center GPU Max Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/max-series.html) - * [Intel® Data Center Flex Series](https://www.intel.com/content/www/us/en/products/details/discrete-gpus/data-center-gpu/flex-series.html) - * [Intel Arc A770](https://www.intel.com/content/www/us/en/products/sku/229151/intel-arc-a770-graphics-16gb/specifications.html) -* GPU Drivers: - * Latest [Long Term Support (LTS) Release](https://dgpu-docs.intel.com/driver/installation.html) - * Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html) -* Toolchain: - * Latest [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) +| **`Documentation`** | **`Nightly Wheels`** | +|-------------------- | -------------------- | +| [![Documentation](https://github.com/triton-lang/triton/actions/workflows/documentation.yml/badge.svg)](https://triton-lang.org/) | [![Wheels](https://github.com/triton-lang/triton/actions/workflows/wheels.yml/badge.svg?branch=release/2.0.x)](https://github.com/triton-lang/triton/actions/workflows/wheels.yml) | -Note that Intel® XPU Backend for Triton\* is not compatible with Intel® Extension for PyTorch\* and Intel® oneAPI Base Toolkit\*. - -# Quick Installation +# Triton -## Prerequisites +This is the development repository of Triton, a language and compiler for writing highly efficient custom Deep-Learning primitives. The aim of Triton is to provide an open-source environment to write fast code at higher productivity than CUDA, but also with higher flexibility than other existing DSLs. -1. Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html) or [Long Term Support Release](https://dgpu-docs.intel.com/driver/installation.html) of GPU driver -2. Latest release of [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) -3. Latest release of [Profiling Tools Interfaces for Intel GPU (PTI for GPU)](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) +The foundations of this project are described in the following MAPL2019 publication: [Triton: An Intermediate Language and Compiler for Tiled Neural Network Computations](http://www.eecs.harvard.edu/~htk/publication/2019-mapl-tillet-kung-cox.pdf). Please consider citing this work if you use Triton! 
-## Install PyTorch and Triton from nightly wheels +The [official documentation](https://triton-lang.org) contains installation instructions and tutorials. See also these third-party [Triton puzzles](https://github.com/srush/Triton-Puzzles), which can all be run using the Triton interpreter -- no GPU required. -Currently, Intel® XPU Backend for Triton\* requires a special version of PyTorch and both can be installed from nightly wheels. -Navigate to the [nightly wheels workflow](https://github.com/intel/intel-xpu-backend-for-triton/actions/workflows/nightly-wheels.yml), -select the most recent successful run on the top of the page and download an artifact for the corresponding Python version. -Extract the archive and in the extracted directory execute: - -```shell -pip install torch-*.whl triton-*.whl -``` +# Quick Installation -Before using Intel® XPU Backend for Triton\* you need to initialize the toolchain. -The default location is `/opt/intel/oneapi` (if installed as a `root` user) or `~/intel/oneapi` (if installed as a regular user). +You can install the latest stable release of Triton from pip: ```shell -# replace /opt/intel/oneapi with the actual location of PyTorch Prerequisites for Intel GPUs -source /opt/intel/oneapi/setvars.sh +pip install triton ``` -# Install from source - -## Prerequisites - -1. Latest [Rolling Release](https://dgpu-docs.intel.com/driver/installation-rolling.html) or [Long Term Support Release](https://dgpu-docs.intel.com/driver/installation.html) of GPU driver -2. Latest release of [PyTorch Prerequisites for Intel GPUs](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) -3. Latest release of [Profiling Tools Interfaces for Intel GPU (PTI for GPU)](https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html) - -## Compile PyTorch and Triton from source - -Currently, Intel® XPU Backend for Triton\* requires a special version of PyTorch and both need to be compiled at the same time. +Binary wheels are available for CPython 3.8-3.12 and PyPy 3.8-3.9. -Before compiling PyTorch and Intel® XPU Backend for Triton\* you need to initialize the toolchain. -The default location is `/opt/intel/oneapi` (if installed as a `root` user) or `~/intel/oneapi` (if installed as a regular user). 
+And the latest nightly release: ```shell -# replace /opt/intel/oneapi with the actual location of PyTorch Prerequisites for Intel GPUs -source /opt/intel/oneapi/setvars.sh +pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly ``` -Clone this repository: +# Install from source ```shell -git clone https://github.com/intel/intel-xpu-backend-for-triton.git -cd intel-xpu-backend-for-triton -``` - -To avoid potential conflicts with installed packages it is recommended to create and activate a new Python virtual environment: +git clone https://github.com/triton-lang/triton.git; +cd triton; -```shell -python -m venv .venv --prompt triton -source .venv/bin/activate +pip install ninja cmake wheel pybind11; # build-time dependencies +pip install -e python ``` -Compile and install PyTorch: +Or with a virtualenv: ```shell -scripts/install-pytorch.sh --source -``` +git clone https://github.com/triton-lang/triton.git; +cd triton; -Compile and install Intel® XPU Backend for Triton\*: +python -m venv .venv --prompt triton; +source .venv/bin/activate; -```shell -scripts/compile-triton.sh +pip install ninja cmake wheel pybind11; # build-time dependencies +pip install -e python ``` # Building with a custom LLVM @@ -103,17 +65,36 @@ downloads a prebuilt LLVM, but you can also build LLVM from source and use that. LLVM does not have a stable API, so the Triton build will not work at an arbitrary LLVM version. -1. Find the version of LLVM that Triton builds against. -Check `cmake/llvm-hash.txt` to see the current version. +1. Find the version of LLVM that Triton builds against. Check +`cmake/llvm-hash.txt` to see the current version. For example, if it says: + 49af6502c6dcb4a7f7520178bd14df396f78240c -2. Checkout LLVM at this revision to the directory `llvm`, -which must be in the same directory as `intel-xpu-backend-for-triton`: + This means that the version of Triton you have builds against + [LLVM](https://github.com/llvm/llvm-project) 49af6502. -3. In the directory `intel-xpu-backend-for-triton`, build Triton with custom LLVM: +2. `git checkout` LLVM at this revision. Optionally, make additional + modifications to LLVM. - ```shell - ./scripts/compile-triton.sh --llvm --triton - ``` +3. [Build LLVM](https://llvm.org/docs/CMake.html). For example, you might run + + $ cd $HOME/llvm-project # your clone of LLVM. + $ mkdir build + $ cd build + $ cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_ASSERTIONS=ON ../llvm -DLLVM_ENABLE_PROJECTS="mlir;llvm" -DLLVM_TARGETS_TO_BUILD="host;NVPTX;AMDGPU" + $ ninja + +4. Grab a snack, this will take a while. + +5. Build Triton as above, but set the following environment variables. + + # Modify as appropriate to point to your LLVM build. + $ export LLVM_BUILD_DIR=$HOME/llvm-project/build + + $ cd + $ LLVM_INCLUDE_DIRS=$LLVM_BUILD_DIR/include \ + LLVM_LIBRARY_DIR=$LLVM_BUILD_DIR/lib \ + LLVM_SYSPATH=$LLVM_BUILD_DIR \ + pip install -e python # Tips for building @@ -131,7 +112,7 @@ which must be in the same directory as `intel-xpu-backend-for-triton`: Without this, every invocation of `pip install` uses a different symlink to cmake, and this forces ninja to rebuild most of the `.a` files. -- VSCcode IntelliSense has some difficulty figuring out how to build Triton's C++ +- vscode intellisense has some difficulty figuring out how to build Triton's C++ (probably because, in our build, users don't invoke cmake directly, but instead use setup.py). Teach vscode how to compile Triton as follows. 
@@ -153,7 +134,36 @@ There currently isn't a turnkey way to run all the Triton tests, but you can follow the following recipe. ```shell -scripts/test-triton.sh +# One-time setup. Note we have to reinstall local Triton because torch +# overwrites it with the public version. +$ pip install scipy numpy torch pytest lit pandas matplotlib && pip install -e python + +# Run Python tests using your local GPU. +$ python3 -m pytest python/test/unit + +# Move to builddir. Fill in <...> with the full path, e.g. +# `cmake.linux-x86_64-cpython-3.11`. +$ cd python/build/cmake<...> + +# Run C++ unit tests. +$ ctest -j32 + +# Run lit tests. +$ lit test +``` + +You may find it helpful to make a symlink to the builddir and tell your local +git to ignore it. + +```shell +$ ln -s python/build/cmake<...> build +$ echo build >> .git/info/exclude +``` + +Then you can e.g. rebuild and run lit with the following command. + +```shell +$ ninja -C build && ( cd build ; lit test ) ``` # Tips for hacking @@ -203,124 +213,27 @@ For detailed instructions on how to debug Triton's frontend, please refer to thi - `TRITON_DEFAULT_FP_FUSION` overrides the default behavior of allowing fp fusion (mul+add->fma). - `MLIR_ENABLE_REMARK` enables the performance warnings that are emitted as remarks. -# Usage Guide - -## Code Modifications -Intel® XPU Backend for Triton\* requires a special version of PyTorch that can be built from sources or installed from nightly wheels. - -1. Add `import torch` for xpu support. -2. Put the tensor and models to XPU by calling `to('xpu')`. - -This repository contains modified [tutorials](python/tutorials) that must be used with Intel® XPU Backend for Triton\*. - -The following examples show modifications for the user code. - -### Example 1 : Triton Kernel - -This example is a modified version of [Vector Add](https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html#vector-addition) triton kernel. Please refer to [Vector Add](https://triton-lang.org/main/getting-started/tutorials/01-vector-add.html#vector-addition) for detailed comments and illustration about the code semantics. - -Comparing to the original code, the following code modifies: - -```Python -import torch -import triton -import triton.language as tl - - -@triton.jit -def add_kernel( - x_ptr, - y_ptr, - output_ptr, - n_elements, - BLOCK_SIZE: tl.constexpr, -): - pid = tl.program_id(axis=0) - block_start = pid * BLOCK_SIZE - offsets = block_start + tl.arange(0, BLOCK_SIZE) - mask = offsets < n_elements - x = tl.load(x_ptr + offsets, mask=mask) - y = tl.load(y_ptr + offsets, mask=mask) - output = x + y - tl.store(output_ptr + offsets, output, mask=mask) - -def add(x: torch.Tensor, y: torch.Tensor): - # Put the tensor to xpu - output = torch.empty_like(x).xpu() - assert x.is_xpu and y.is_xpu and output.is_xpu - n_elements = output.numel() - grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) - add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024) - - return output - -# For manual_seed, needs to use API for XPU -torch.xpu.manual_seed(0) -size = 512 -# For tensors, needs to be put on XPU -x = torch.rand(size, device='xpu') -y = torch.rand(size, device='xpu') -output_torch = x + y -output_triton = add(x, y) -print(output_torch) -print(output_triton) -print( - f'The maximum difference between torch and triton is ' - f'{torch.max(torch.abs(output_torch - output_triton))}' -) -``` - -### Example 2 : End-to-End Model -Triton is transparent for end-to-end models. 
One could easily use `torch.compile` with `inductor` as backend by default. It will automatically generates triton kernel and gets benefit from it. - -```Python -import torch -from torch._dynamo.testing import rand_strided - -from torch.nn import * -class simpleModel(torch.nn.Module): - def __init__(self): - super().__init__() - # tensors inside model should be on xpu - self.y = rand_strided((32, 8), (8, 1), device='xpu:0', dtype=torch.float32) - - def forward(self, x): - z = x + self.y - return z - -# tensors passed to the model should be on xpu -x = rand_strided((32, 8), (8, 1), device='xpu:0', dtype=torch.float32) -xpu_model = simpleModel() -# Call torch.compile for optimization -optimized_mod = torch.compile(xpu_model) - -graph_result = optimized_mod(x) -``` - -## Performance Analysis Guide - -There are several ways of doing performance analysis. We recommend using `torch.profiler` for end-to-end performance analysis and using Intel® VTune™ Profiler for more detailed kernel analysis. We provide a comprehensive guide for those two: -1. [end_to_end_tests#profiling settings](docs/test_docs/end_to_end_tests.md#profiling-settings) section for using `torch.profiler`. -2. [VTune Profiling Guide](docs/VTune_Profiling.md) for kernel analysis. +# Changelog -Note that the user needs to explicitly set `TRITON_XPU_PROFILE=1` when the user needs to enable kernel profiling. +Version 2.0 is out! New features include: -```Bash -export TRITON_XPU_PROFILE=1 -``` +- Many, many bug fixes +- Performance improvements +- Backend rewritten to use MLIR +- Support for kernels that contain back-to-back matmuls (e.g., flash attention) # Contributing -Community contributions are more than welcome, whether it be to fix bugs or to add new features at [github](https://github.com/intel/intel-xpu-backend-for-triton). For more detailed instructions, please visit our [contributor's guide](CONTRIBUTING.md). - -## License +Community contributions are more than welcome, whether it be to fix bugs or to add new features at [github](https://github.com/triton-lang/triton/). For more detailed instructions, please visit our [contributor's guide](CONTRIBUTING.md). -_MIT License_. As found in [LICENSE](https://github.com/intel/intel-xpu-backend-for-triton/blob/main/LICENSE) file. +# Compatibility +Supported Platforms: -## Security +- Linux -See Intel's [Security Center](https://www.intel.com/content/www/us/en/security-center/default.html) -for information on how to report a potential security issue or vulnerability. +Supported Hardware: -See also: [Security Policy](security.md) +- NVIDIA GPUs (Compute Capability 8.0+) +- AMD GPUs (ROCm 5.2+) +- Under development: CPUs