Refactors triton_kernel_call_lowering to support both cuda and rocm.

This is a rollforward of #287 with fixes. PiperOrigin-RevId: 673490870
jax-ml · Sep 11, 2024 · 285dba9 · 285dba9
1 parent e82c529
commit 285dba9
Show file tree

Hide file tree

Showing 2 changed files with 192 additions and 40 deletions.
diff --git a/jax_triton/triton_lib.py b/jax_triton/triton_lib.py
@@ -27,6 +27,7 @@
 import types
 from typing import Any, Callable, Dict, Optional, Protocol, Sequence, Tuple, Union
 import zlib
+from functools import partial
 
 from absl import logging
 import jax
@@ -56,6 +57,14 @@
   CAN_USE_TRITON = True
 except ModuleNotFoundError:
   pass
+
+try:
+  import triton.backends.amd.compiler as hb
+except ImportError:
+  hb = None
+  pass
+
+
 try:
   from jax._src.lib import gpu_triton as triton_kernel_call_lib
 except ImportError:
@@ -90,7 +99,6 @@
     jnp.dtype("bool"): "B",
 }
 
-
 Grid = Union[int, Tuple[int], Tuple[int, int], Tuple[int, int, int]]
 GridOrLambda = Union[Grid, Callable[[Dict[str, Any]], Grid]]
 
@@ -157,22 +165,61 @@ def aval_size_bytes(aval):
   return np.dtype(aval.dtype).itemsize * aval.size
 
 
+def get_cuda_backend(device, compute_capability):
+  target = cb.GPUTarget('cuda', compute_capability, 32)
+  backend = cb.CUDABackend(target)
+  return backend
+
+def get_hip_backend(device, compute_capability):
+  arch = triton_kernel_call_lib.get_arch_details(device)
+  arch = arch.split(":")[0]
+  target = hb.GPUTarget('hip', arch, 64)
+  backend = hb.HIPBackend(target)
+  return backend
+
 @dataclasses.dataclass
-class PtxCompilationResult:
-  ptx: str
+class CompilationResult:
+  binary: str
   name: str
   shared_mem_bytes: int
   cluster_dims: tuple
   ttgir: Optional[str]
   llir: Optional[str]
 
+def compile_ttir_inplace(
+    ttir,
+    backend: [cb.CUDABackend | hb.HIPBackend],
+    options: [cb.CUDAOptions | hb.HIPOptions],
+    compute_capability,
+    platform
+):
+  if platform == 'cuda':
+    return compile_ttir_to_ptx_inplace(
+          ttir,
+          backend,
+          options,
+          compute_capability,
+    )
+
+  elif platform == 'rocm':
+    return compile_ttir_to_hsaco_inplace(
+          ttir,
+          backend,
+          options,
+          compute_capability,
+    )
+  else:
+    raise ValueError(
+      "Unsupported device."
+    )
+
 
 def compile_ttir_to_ptx_inplace(
     ttir,
     cuda_backend: cb.CUDABackend,
     cuda_options: cb.CUDAOptions,
     compute_capability,
-) -> PtxCompilationResult:
+) -> CompilationResult:
   if cuda_options.debug:
     print(ttir)
   if isinstance(ttir, ir.Module):
@@ -189,7 +236,7 @@ def compile_ttir_to_ptx_inplace(
       ttir = tl_ir.parse_mlir_module(f.name, context)
     ttir.context = context
   try:
-    metadata = dict()
+    metadata = {}
     opt_ttir = cuda_backend.make_ttir(ttir, metadata, cuda_options)
     ttgir = cuda_backend.make_ttgir(
         opt_ttir,
@@ -227,20 +274,95 @@ def compile_ttir_to_ptx_inplace(
   cluster_dims = metadata["cluster_dims"]
   ttgir = str(ttgir) if _JAX_TRITON_DUMP_DIR else None
   llir = str(llir) if _JAX_TRITON_DUMP_DIR else None
-  return PtxCompilationResult(
-      ptx=ptx,
+  return CompilationResult(
+      binary=ptx,
       name=name,
       shared_mem_bytes=shared_mem_bytes,
       cluster_dims=cluster_dims,
       ttgir=ttgir,
       llir=llir,
   )
 
+def compile_ttir_to_hsaco_inplace(
+    ttir,
+    hip_backend: hb.HIPBackend,
+    hip_options: hb.HIPOptions,
+    compute_capability,
+) -> CompilationResult:
+  if hip_options.debug:
+    print(ttir)
+  if isinstance(ttir, ir.Module):
+    context = _triton.ir.context()
+    _triton.ir.load_dialects(context)
+    hip_backend.load_dialects(context)
+
+    # Triton compilation APIs only accept Triton-specific MLIR wrappers.
+    # So, here we serialize an ir.Module to a file and then deserialize
+    # it as a tl_ir.module.
+    with tempfile.NamedTemporaryFile(mode="wb") as f:
+      ttir.operation.write_bytecode(f)
+      f.flush()
+      ttir = tl_ir.parse_mlir_module(f.name, context)
+    ttir.context = context
+  try:
+    metadata = {}
+    opt_ttir = hip_backend.make_ttir(ttir, metadata, hip_options)
+    ttgir = hip_backend.make_ttgir(
+        opt_ttir,
+        metadata,
+        hip_options
+    )
+  except RuntimeError as e:
+    ttir.dump()
+    raise ValueError("TTIR->TTGIR pass failed!") from e
+  if hip_options.debug:
+    print(ttgir)
+  try:
+    llir = hip_backend.make_llir(
+        ttgir,
+        metadata,
+        hip_options
+    )
+  except RuntimeError as e:
+    ttgir.dump()
+    raise ValueError("TTGIR->LLIR pass failed!") from e
+  shared_mem_bytes = metadata["shared"]
+  if hip_options.debug:
+    print(llir)
+
+  amdgcn = hip_backend.make_amdgcn(llir, metadata, hip_options)
+  hsaco = hip_backend.make_hsaco(amdgcn, metadata, hip_options)
+
+  if hip_options.debug:
+    print(x)
+  name = metadata["name"]
+  ttgir = str(ttgir) if _JAX_TRITON_DUMP_DIR else None
+  llir = str(llir) if _JAX_TRITON_DUMP_DIR else None
+  # cluster dims are NOT useful on hip backend.
+  # We just fill up with some value for API compatibility
+  cluster_dims = (0, 0, 0)
+  # Instead of passing hsaco which are "bytes", we first write
+  # to a file and then pass the "string" path. This is needed because
+  # nanobind doesn't automatically convert between bytes and string.
+  # https://github.com/wjakob/nanobind/discussions/137
+  fd, hsaco_path = tempfile.mkstemp()
+  with os.fdopen(fd, "wb") as f:
+    f.write(hsaco)
+  return CompilationResult(
+      binary=hsaco_path,
+      name=name,
+      shared_mem_bytes=shared_mem_bytes,
+      cluster_dims=cluster_dims,
+      ttgir=ttgir,
+      llir=llir,
+  )
 
 _COMPILED_KERNEL_CACHE = {}  # TODO(cjfj): Convert to LRU cache?
 
 
 def get_or_create_triton_kernel(
+    backend_init_func,
+    platform,
     fn,
     arg_dtypes,
     scalar_args,
@@ -257,11 +379,11 @@ def get_or_create_triton_kernel(
     num_warps = 4
   if num_stages is None:
     num_stages = 3
+  # TODO(sharadmv): handle multiple devices, right now we assume device 0
+  # which is fine when we have multiple of the same GPU but this won't work in
+  # general.
+  device = 0
   if compute_capability is None:
-    # TODO(sharadmv): handle multiple devices, right now we assume device 0
-    # which is fine when we have multiple of the same GPU but this won't work in
-    # general.
-    device = 0
     compute_capability = triton_kernel_call_lib.get_compute_capability(device)
   if num_ctas > 1 and compute_capability < 90:
     raise ValueError("num_ctas > 1 unsupported before Hopper.")
@@ -297,29 +419,29 @@ def get_or_create_triton_kernel(
   kernel = _COMPILED_KERNEL_CACHE.get(cache_key)
 
   if kernel is None:
-    target = cb.GPUTarget('cuda', compute_capability, 32)
-    cuda_backend = cb.CUDABackend(target)
-    cuda_options = cuda_backend.parse_options(
-        dict(
-            num_warps=num_warps,
-            num_stages=num_stages,
-            num_ctas=num_ctas,
-            optimize_epilogue=False,
-            debug=dump,
-            enable_fp_fusion=enable_fp_fusion,
-        )
-    )
+    opts = {
+        "num_warps": num_warps,
+        "num_stages": num_stages,
+        "num_ctas": num_ctas,
+        "optimize_epilogue": False,
+        "debug": dump,
+        "enable_fp_fusion": enable_fp_fusion,
+    }
+
+    backend = backend_init_func(device, compute_capability)
+    options = backend.parse_options(opts)
+
     kernel_hash = abs(hash(cache_key))
     if _JAX_TRITON_DUMP_DIR:
       os.makedirs(f"{_JAX_TRITON_DUMP_DIR}/{kernel_hash}")
       with open(f"{_JAX_TRITON_DUMP_DIR}/{kernel_hash}/config", "w") as f:
         pprint.pprint(cache_key, stream=f)
-        pprint.pprint(cuda_options, stream=f)
+        pprint.pprint(options, stream=f)
 
     context = _triton.ir.context()
     _triton.ir.load_dialects(context)
-    cuda_backend.load_dialects(context)
-    codegen_fns = cuda_backend.get_codegen_implementation()
+    backend.load_dialects(context)
+    codegen_fns = backend.get_codegen_implementation()
 
     module = (
         code_gen.ast_to_ttir(
@@ -330,10 +452,10 @@ def get_or_create_triton_kernel(
                signature=signature,
                attrs=specialization_attr,
              ),
-            options=cuda_options,
+            options=options,
             codegen_fns=codegen_fns,
             context=context,
-            module_map=cuda_backend.get_module_map(),
+            module_map=backend.get_module_map(),
         )
         if "module_map" in inspect.getfullargspec(code_gen.ast_to_ttir).args
         # Triton changes ASTSource.ast_to_ttir to include module_map. Handle
@@ -346,19 +468,21 @@ def get_or_create_triton_kernel(
                signature=signature,
                attrs=specialization_attr,
              ),
-            options=cuda_options,
+            options=options,
             codegen_fns=codegen_fns,
             context=context,
         )
     )
     ttir = str(module)
 
-    compilation_result = compile_ttir_to_ptx_inplace(
-        module,
-        cuda_backend,
-        cuda_options,
-        compute_capability,
+    compilation_result = compile_ttir_inplace(
+      module,
+      backend,
+      options,
+      compute_capability,
+      platform
     )
+
     kernel_name = compilation_result.name
     if _JAX_TRITON_DUMP_DIR:
       with open(
@@ -391,7 +515,7 @@ def get_or_create_triton_kernel(
         kernel_name,
         num_warps,
         compilation_result.shared_mem_bytes,
-        compilation_result.ptx,
+        compilation_result.binary,
         ttir,
         compute_capability,
         *compilation_result.cluster_dims,
@@ -403,6 +527,7 @@ def get_or_create_triton_kernel(
 
 
 def triton_kernel_call_lowering(
+    backend_init_func,
     ctx,
     *array_args,
     fn,
@@ -427,6 +552,7 @@ def triton_kernel_call_lowering(
         "`input_output_aliases` only supported on `jaxlib>=0.3.22"
     )
 
+
   kernel_call_name = name
   args = list(ctx.avals_in)
   arg_dtypes = list(map(get_triton_type, ctx.avals_in))
@@ -521,6 +647,8 @@ def prune_configs(configs, named_args, **kwargs):
   kernel_calls = []
   for params in config_params:
     kernel, specialization_attr = get_or_create_triton_kernel(
+        backend_init_func,
+        ctx.module_context.platforms[0],
         fn,
         arg_dtypes,
         scalar_args,
@@ -590,9 +718,13 @@ def prune_configs(configs, named_args, **kwargs):
       operand_output_aliases=dict(input_output_aliases),
   ).results
 
+mlir.register_lowering(triton_kernel_call_p,
+                       partial(triton_kernel_call_lowering, get_cuda_backend),
+                       platform='cuda')
 
-mlir.register_lowering(triton_kernel_call_p, triton_kernel_call_lowering)
-
+mlir.register_lowering(triton_kernel_call_p,
+                       partial(triton_kernel_call_lowering, get_hip_backend),
+                       platform='rocm')
 
 class ShapeDtype(Protocol):