From 5cef547eab6edb8a9970f912e637bf45d69887b6 Mon Sep 17 00:00:00 2001 From: Sergei Lebedev Date: Thu, 26 Sep 2024 08:20:15 -0700 Subject: [PATCH] Added support for `lax.cond_p` to Pallas Mosaic GPU lowering PiperOrigin-RevId: 679156819 --- jax/_src/pallas/mosaic_gpu/lowering.py | 60 ++++++++++++++++++- .../mosaic/gpu/fragmented_array.py | 4 +- jax/experimental/mosaic/gpu/utils.py | 4 +- tests/pallas/mosaic_gpu_test.py | 21 +++++++ 4 files changed, 85 insertions(+), 4 deletions(-) diff --git a/jax/_src/pallas/mosaic_gpu/lowering.py b/jax/_src/pallas/mosaic_gpu/lowering.py index 902624192dcf..cc19d96aa194 100644 --- a/jax/_src/pallas/mosaic_gpu/lowering.py +++ b/jax/_src/pallas/mosaic_gpu/lowering.py @@ -34,6 +34,7 @@ from jax._src.lib.mlir.dialects import arith as arith_dialect from jax._src.lib.mlir.dialects import gpu as gpu_dialect from jax._src.lib.mlir.dialects import memref as memref_dialect +from jax._src.lib.mlir.dialects import scf as scf_dialect from jax._src.pallas import core as pallas_core from jax._src.pallas import primitives from jax._src.pallas import utils as pallas_utils @@ -677,6 +678,22 @@ def _pjit_lowering_rule(ctx: LoweringRuleContext, *args, jaxpr, **_): ) +@register_lowering_rule(lax.select_n_p) +def _select_n_lowering_rule(ctx: LoweringRuleContext, pred, *cases): + if len(cases) != 2: + raise NotImplementedError( + "Mosaic GPU lowering only supports select_n with 2 cases, got" + f" {len(cases)}" + ) + pred_aval, *cases_avals = ctx.avals_in + [out_aval] = ctx.avals_out + pred = _ensure_fa(pred, pred_aval.dtype) + cases = _bcast(*cases, *cases_avals, out_aval) + # ``select`` expects the first case to be the true branch, but ``select_n`` + # orders the cases in reverse. + return pred.select(*reversed(cases)) + + @register_lowering_rule(lax.broadcast_in_dim_p) def _broadcast_in_dim_lowering_rule( ctx: LoweringRuleContext, @@ -712,6 +729,16 @@ def _binary_op_lowering_rule(ctx: LoweringRuleContext, x, y, *, impl): lax.sub_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x - y), lax.mul_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x * y), lax.div_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x / y), + lax.rem_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x % y), + lax.and_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x & y), + lax.or_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x | y), + lax.xor_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x ^ y), + lax.gt_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x > y), + lax.lt_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x < y), + lax.ge_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x >= y), + lax.le_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x <= y), + lax.eq_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x == y), + lax.ne_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x != y), }) @@ -909,13 +936,41 @@ def _scan_lowering_rule( return for_out +@register_lowering_rule(lax.cond_p) +def _cond_lowering_rule(ctx: LoweringRuleContext, index, *args, branches): + index_aval, *_arg_avals = ctx.avals_in + switch_op = scf_dialect.IndexSwitchOp( + map(mgpu_utils.dtype_to_ir_type, ctx.avals_out), + _as_index(_ensure_ir_value(index, index_aval.dtype)), + ir.DenseI64ArrayAttr.get(range(len(branches) - 1)), + num_caseRegions=len(branches) - 1, + ) + + # ``RegionSequence`` in MLIR does not support slicing, so the + # auto-generated Python bindings for ``caseRegions`` fail at runtime! + # We convert it to a list to work around that. + regions = list(switch_op.regions) + # Move the default region to the back. + regions = regions[1:] + regions[:1] + for branch, region in zip(branches, regions): + with ir.InsertionPoint(region.blocks.append()): + outs = lower_jaxpr_to_mosaic_gpu( + ctx.module_ctx, ctx.launch_ctx, branch.jaxpr, args + ) + scf_dialect.yield_([ + _ensure_ir_value(out, aval.dtype) + for out, aval in zip(outs, ctx.avals_out) + ]) + return list(switch_op.results) + + def _bcast( x: ir.Value, y: ir.Value, x_aval: jax_core.ShapedArray, y_aval: jax_core.ShapedArray, out_aval: jax_core.ShapedArray, -) -> ir.Value: +) -> tuple[mgpu.FragmentedArray, mgpu.FragmentedArray]: if not isinstance(x, mgpu.FragmentedArray): x_dtype = x_aval.dtype if x_aval.weak_type: @@ -935,6 +990,7 @@ def _bcast( def _ensure_fa(x: object, dtype: jnp.dtype) -> mgpu.FragmentedArray: if isinstance(x, mgpu.FragmentedArray): + assert x.mlir_dtype == mgpu_utils.dtype_to_ir_type(dtype) return x elif isinstance(x, (np.number, np.ndarray, int, float)): return mgpu.FragmentedArray.splat( @@ -944,12 +1000,14 @@ def _ensure_fa(x: object, dtype: jnp.dtype) -> mgpu.FragmentedArray: ) elif isinstance(x, ir.Value): if isinstance(x.type, (ir.IntegerType, ir.FloatType, ir.IndexType)): + assert x.type == mgpu_utils.dtype_to_ir_type(dtype) return mgpu.FragmentedArray.splat(x, (), is_signed=mgpu_utils.is_signed(dtype)) raise NotImplementedError(f"Unsupported type: {type(x)}") def _ensure_ir_value(x: object, dtype: jnp.dtype) -> ir.Value: if isinstance(x, ir.Value): + assert x.type == mgpu_utils.dtype_to_ir_type(dtype) return x elif isinstance(x, (np.number, np.ndarray, int, float)): return _ir_constant(x, mgpu_utils.dtype_to_ir_type(dtype)) diff --git a/jax/experimental/mosaic/gpu/fragmented_array.py b/jax/experimental/mosaic/gpu/fragmented_array.py index ea45a7dcb7b9..ae6c40b9416d 100644 --- a/jax/experimental/mosaic/gpu/fragmented_array.py +++ b/jax/experimental/mosaic/gpu/fragmented_array.py @@ -784,13 +784,13 @@ def broadcast_minor(self, n): _registers=new_regs, _layout=WGMMA_LAYOUT, _is_signed=self.is_signed ) - def select(self, x, y): + def select(self, on_true, on_false): if ( not ir.IntegerType.isinstance(self.mlir_dtype) or ir.IntegerType(self.mlir_dtype).width != 1 ): raise NotImplementedError - return self._pointwise(arith.select, x, y) + return self._pointwise(arith.select, on_true, on_false) def foreach(self, fn: Callable[[ir.Value, tuple[ir.Value, ...]], None]): """Call a function for each value and index.""" diff --git a/jax/experimental/mosaic/gpu/utils.py b/jax/experimental/mosaic/gpu/utils.py index 4701a72490c8..b5c22734b0d4 100644 --- a/jax/experimental/mosaic/gpu/utils.py +++ b/jax/experimental/mosaic/gpu/utils.py @@ -951,6 +951,8 @@ def dtype_to_ir_type(dtype: jax.typing.DTypeLike) -> ir.Type: def is_signed(dtype: jax.typing.DTypeLike) -> bool | None: - if jnp.issubdtype(dtype, jnp.integer): + if jnp.issubdtype(dtype, jnp.bool_): + return False + elif jnp.issubdtype(dtype, jnp.integer): return jnp.issubdtype(dtype, jnp.signedinteger) return None diff --git a/tests/pallas/mosaic_gpu_test.py b/tests/pallas/mosaic_gpu_test.py index 3018e0383188..267ddd0c97d5 100644 --- a/tests/pallas/mosaic_gpu_test.py +++ b/tests/pallas/mosaic_gpu_test.py @@ -376,6 +376,27 @@ def kernel(o_ref): kernel(), jnp.full([256], 5.0, dtype=jnp.float32) ) + def test_cond(self): + + @functools.partial( + pl.pallas_call, + out_shape=jax.ShapeDtypeStruct([256], jnp.int32), + ) + def kernel(x_ref, o_ref): + acc = x_ref[...].sum() + jax.lax.cond( + acc % 2 == 0, + lambda: pl.debug_print("acc * 2: {}", acc * 2), + lambda: pl.debug_print("acc: {}", acc), + ) + o_ref[...] = jnp.broadcast_to(acc, o_ref.shape) + + x = jnp.arange(256) + with jtu.capture_stdout() as output: + jax.block_until_ready(kernel(x)) + + self.assertIn("acc * 2:", output()) + @parameterized.parameters(jnp.float16, jnp.float32) def test_wgmma(self, dtype): # TensorCores can only fuse transposes of 16-bit values, and RHS