From ee02496fac151c7083f4f9083c0dc91dea29bae7 Mon Sep 17 00:00:00 2001
From: Mohammed Anany
Date: Fri, 25 Oct 2024 07:15:55 -0700
Subject: [PATCH] [Triton] Allow reorderValues to handle downcast with dot_op
 layout on 16-bit -> 8-bit in the same way it handles 8-bit -> 16-bit.

We already needed to do something similar for 16/32 bits previously.

PiperOrigin-RevId: 689778145
---
 .../further_mixed_precision_fix.patch         | 36 +++++++++++++++++++
 third_party/triton/temporary/series.bzl       |  1 +
 2 files changed, 37 insertions(+)
 create mode 100644 third_party/triton/temporary/further_mixed_precision_fix.patch

diff --git a/third_party/triton/temporary/further_mixed_precision_fix.patch b/third_party/triton/temporary/further_mixed_precision_fix.patch
new file mode 100644
index 0000000000000..6152ab48194c0
--- /dev/null
+++ b/third_party/triton/temporary/further_mixed_precision_fix.patch
@@ -0,0 +1,36 @@
+This resolves b/372630230. The patch is not intended to be submitted to
+Triton upstream, because OAI has historically rejected similar work-arounds
+and the proper fixes are considerably more expensive.
+diff --git a/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp b/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp
+--- a/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp
++++ b/lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp
+@@ -55,7 +55,8 @@ SmallVector<Value> reorderValues(const S
+     }
+     return ret;
+   }
+-  if (inBitWidth == 8 && ouBitWidth == 16) {
++  if ((inBitWidth == 8 && ouBitWidth == 16) ||
++      (inBitWidth == 16 && ouBitWidth == 8)) {
+     SmallVector<Value> ret;
+     for (unsigned i = 0; i < values.size(); i += 16) {
+       ret.push_back(values[i + 0]);
+diff --git a/test/Conversion/tritongpu_to_llvm.mlir b/test/Conversion/tritongpu_to_llvm.mlir
+--- a/test/Conversion/tritongpu_to_llvm.mlir
++++ b/test/Conversion/tritongpu_to_llvm.mlir
+@@ -1693,3 +1693,16 @@ module attributes {"triton_gpu.num-ctas"
+     tt.return
+   }
+ }
++
++// -----
++
++#mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 1], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [0, 1], instrShape = [16, 8]}>
++#dot_operand = #triton_gpu.dot_op<{opIdx=0, parent=#mma, kWidth=4}>
++module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
++  tt.func @f16_to_f8_dot_operand(%f16_inp: tensor<32x32xf16, #dot_operand>) {
++    // CHECK-LABEL: @f16_to_f8_dot_operand
++
++    %f8 = tt.fp_to_fp %f16_inp, rounding = rtne : tensor<32x32xf16, #dot_operand> -> tensor<32x32xf8E5M2, #dot_operand>
++    tt.return
++  }
++}
diff --git a/third_party/triton/temporary/series.bzl b/third_party/triton/temporary/series.bzl
index 274faf600e048..d6b1a6a31f783 100644
--- a/third_party/triton/temporary/series.bzl
+++ b/third_party/triton/temporary/series.bzl
@@ -17,5 +17,6 @@ temporary_patch_list = [
     "//third_party/triton:temporary/fix_left_shift_overflow.patch",
     "//third_party/triton:temporary/prefetch.patch",
     "//third_party/triton:temporary/i4_to_bf16.patch",
+    "//third_party/triton:temporary/further_mixed_precision_fix.patch",
     # Add new patches just above this line
 ]
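
For context, a standalone C++ sketch of the kind of register reordering the
patched branch performs. This is not the Triton implementation: the exact
16-element permutation (swapping the two middle quads) is an assumption based
on the visible hunk, and the real reorderValues operates on MLIR Values, not
integers.

// Sketch only (assumed permutation, not the actual Triton code):
// reorderValues permutes groups of 16 register values so that, after an
// 8-bit <-> 16-bit element-type conversion on a dot_op operand, the value
// order still matches what the MMA layout expects. The patch reuses the
// 8->16 permutation for the 16->8 downcast direction.
#include <cstdio>
#include <vector>

std::vector<int> reorder16(const std::vector<int> &values) {
  // Hypothetical quad order 0-3, 8-11, 4-7, 12-15; the real permutation
  // lives in lib/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVM.cpp.
  static const int kOrder[16] = {0, 1, 2, 3, 8,  9,  10, 11,
                                 4, 5, 6, 7, 12, 13, 14, 15};
  std::vector<int> ret;
  ret.reserve(values.size());
  for (size_t i = 0; i < values.size(); i += 16)
    for (int o : kOrder) ret.push_back(values[i + o]);
  return ret;
}

int main() {
  std::vector<int> v(16);
  for (int i = 0; i < 16; ++i) v[i] = i;
  for (int x : reorder16(v)) std::printf("%d ", x);  // 0 1 2 3 8 9 10 11 ...
  std::printf("\n");
  return 0;
}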