Added support for lax.cond_p to Pallas Mosaic GPU lowering
PiperOrigin-RevId: 676133222
superbobry authored and Google-ML-Automation committed Sep 26, 2024
1 parent 0a66e2d commit afb65ae
Showing 4 changed files with 85 additions and 4 deletions.
60 changes: 59 additions & 1 deletion jax/_src/pallas/mosaic_gpu/lowering.py
@@ -34,6 +34,7 @@
from jax._src.lib.mlir.dialects import arith as arith_dialect
from jax._src.lib.mlir.dialects import gpu as gpu_dialect
from jax._src.lib.mlir.dialects import memref as memref_dialect
from jax._src.lib.mlir.dialects import scf as scf_dialect
from jax._src.pallas import core as pallas_core
from jax._src.pallas import primitives
from jax._src.pallas import utils as pallas_utils
@@ -677,6 +678,22 @@ def _pjit_lowering_rule(ctx: LoweringRuleContext, *args, jaxpr, **_):
  )


@register_lowering_rule(lax.select_n_p)
def _select_n_lowering_rule(ctx: LoweringRuleContext, pred, *cases):
  if len(cases) != 2:
    raise NotImplementedError(
        "Mosaic GPU lowering only supports select_n with 2 cases, got"
        f" {len(cases)}"
    )
  pred_aval, *cases_avals = ctx.avals_in
  [out_aval] = ctx.avals_out
  pred = _ensure_fa(pred, pred_aval.dtype)
  cases = _bcast(*cases, *cases_avals, out_aval)
  # ``select`` expects the first case to be the true branch, but ``select_n``
  # orders the cases in reverse.
  return pred.select(*reversed(cases))
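Note (not part of this commit, for context): jnp.where(pred, a, b) lowers to select_n(pred, b, a), where case 0 is taken when pred is false, while FragmentedArray.select takes the true operand first; reversing the cases reconciles the two conventions. A hedged sketch of a kernel body that would hit this rule, with illustrative ref names:

def kernel(x_ref, o_ref):
  x = x_ref[...]
  o_ref[...] = jnp.where(x > 0, x, x * 2)  # select_n with exactly two cases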


@register_lowering_rule(lax.broadcast_in_dim_p)
def _broadcast_in_dim_lowering_rule(
    ctx: LoweringRuleContext,
@@ -712,6 +729,16 @@ def _binary_op_lowering_rule(ctx: LoweringRuleContext, x, y, *, impl):
    lax.sub_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x - y),
    lax.mul_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x * y),
    lax.div_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x / y),
    lax.rem_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x % y),
    lax.and_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x & y),
    lax.or_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x | y),
    lax.xor_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x ^ y),
    lax.gt_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x > y),
    lax.lt_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x < y),
    lax.ge_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x >= y),
    lax.le_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x <= y),
    lax.eq_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x == y),
    lax.ne_p: partial(_binary_op_lowering_rule, impl=lambda x, y: x != y),
})
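Note (not part of this commit, for context): a hedged sketch of a kernel body exercising several of the newly registered elementwise rules; ref names and shapes are illustrative.

def kernel(x_ref, o_ref):
  x = x_ref[...]
  is_even = x % 2 == 0              # rem_p and eq_p
  in_range = (x >= 0) & (x < 128)   # ge_p, lt_p and and_p
  o_ref[...] = jnp.where(is_even | in_range, x, x - 1)  # or_p and sub_p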


@@ -909,13 +936,41 @@ def _scan_lowering_rule(
  return for_out


@register_lowering_rule(lax.cond_p)
def _cond_lowering_rule(ctx: LoweringRuleContext, index, *args, branches):
  index_aval, *_arg_avals = ctx.avals_in
  switch_op = scf_dialect.IndexSwitchOp(
      map(mgpu_utils.dtype_to_ir_type, ctx.avals_out),
      _as_index(_ensure_ir_value(index, index_aval.dtype)),
      ir.DenseI64ArrayAttr.get(range(len(branches) - 1)),
      num_caseRegions=len(branches) - 1,
  )

  # ``RegionSequence`` in MLIR does not support slicing, so the
  # auto-generated Python bindings for ``caseRegions`` fail at runtime!
  # We convert it to a list to work around that.
  regions = list(switch_op.regions)
  # Move the default region to the back.
  regions = regions[1:] + regions[:1]
  for branch, region in zip(branches, regions):
    with ir.InsertionPoint(region.blocks.append()):
      outs = lower_jaxpr_to_mosaic_gpu(
          ctx.module_ctx, ctx.launch_ctx, branch.jaxpr, args
      )
      scf_dialect.yield_([
          _ensure_ir_value(out, aval.dtype)
          for out, aval in zip(outs, ctx.avals_out)
      ])
  return list(switch_op.results)
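Note (not part of this commit, for context): lax.switch also lowers through cond_p with an integer index and several branches, so a kernel like the hedged sketch below should map onto the same scf.index_switch construction, with the last branch becoming the MLIR default region. Only the kernel body is shown; the pl.pallas_call wrapper would look like the test added at the bottom of this commit.

def kernel(x_ref, o_ref):
  acc = x_ref[...].sum()
  jax.lax.switch(
      acc % 3,
      [
          lambda: pl.debug_print("case 0, acc: {}", acc),
          lambda: pl.debug_print("case 1, acc: {}", acc),
          lambda: pl.debug_print("case 2, acc: {}", acc),
      ],
  )
  o_ref[...] = jnp.broadcast_to(acc, o_ref.shape)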


def _bcast(
    x: ir.Value,
    y: ir.Value,
    x_aval: jax_core.ShapedArray,
    y_aval: jax_core.ShapedArray,
    out_aval: jax_core.ShapedArray,
) -> ir.Value:
) -> tuple[mgpu.FragmentedArray, mgpu.FragmentedArray]:
  if not isinstance(x, mgpu.FragmentedArray):
    x_dtype = x_aval.dtype
    if x_aval.weak_type:
@@ -935,6 +990,7 @@

def _ensure_fa(x: object, dtype: jnp.dtype) -> mgpu.FragmentedArray:
  if isinstance(x, mgpu.FragmentedArray):
    assert x.mlir_dtype == mgpu_utils.dtype_to_ir_type(dtype)
    return x
  elif isinstance(x, (np.number, np.ndarray, int, float)):
    return mgpu.FragmentedArray.splat(
@@ -944,12 +1000,14 @@ def _ensure_fa(x: object, dtype: jnp.dtype) -> mgpu.FragmentedArray:
    )
  elif isinstance(x, ir.Value):
    if isinstance(x.type, (ir.IntegerType, ir.FloatType, ir.IndexType)):
      assert x.type == mgpu_utils.dtype_to_ir_type(dtype)
      return mgpu.FragmentedArray.splat(x, (), is_signed=mgpu_utils.is_signed(dtype))
  raise NotImplementedError(f"Unsupported type: {type(x)}")


def _ensure_ir_value(x: object, dtype: jnp.dtype) -> ir.Value:
  if isinstance(x, ir.Value):
    assert x.type == mgpu_utils.dtype_to_ir_type(dtype)
    return x
  elif isinstance(x, (np.number, np.ndarray, int, float)):
    return _ir_constant(x, mgpu_utils.dtype_to_ir_type(dtype))
4 changes: 2 additions & 2 deletions jax/experimental/mosaic/gpu/fragmented_array.py
@@ -784,13 +784,13 @@ def broadcast_minor(self, n):
        _registers=new_regs, _layout=WGMMA_LAYOUT, _is_signed=self.is_signed
    )

  def select(self, x, y):
  def select(self, on_true, on_false):
    if (
        not ir.IntegerType.isinstance(self.mlir_dtype)
        or ir.IntegerType(self.mlir_dtype).width != 1
    ):
      raise NotImplementedError
    return self._pointwise(arith.select, x, y)
    return self._pointwise(arith.select, on_true, on_false)

  def foreach(self, fn: Callable[[ir.Value, tuple[ir.Value, ...]], None]):
    """Call a function for each value and index."""
4 changes: 3 additions & 1 deletion jax/experimental/mosaic/gpu/utils.py
@@ -951,6 +951,8 @@ def dtype_to_ir_type(dtype: jax.typing.DTypeLike) -> ir.Type:


def is_signed(dtype: jax.typing.DTypeLike) -> bool | None:
  if jnp.issubdtype(dtype, jnp.integer):
  if jnp.issubdtype(dtype, jnp.bool_):
    return False
  elif jnp.issubdtype(dtype, jnp.integer):
    return jnp.issubdtype(dtype, jnp.signedinteger)
  return None
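Note (not part of this commit): illustrative expectations for the updated helper. Previously a bool dtype fell through to None because it is not a numpy integer subtype; now it is explicitly reported as unsigned.

assert is_signed(jnp.bool_) is False
assert is_signed(jnp.int32) is True
assert is_signed(jnp.uint32) is False
assert is_signed(jnp.float32) is None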
21 changes: 21 additions & 0 deletions tests/pallas/mosaic_gpu_test.py
@@ -376,6 +376,27 @@ def kernel(o_ref):
        kernel(), jnp.full([256], 5.0, dtype=jnp.float32)
    )

  def test_cond(self):

    @functools.partial(
        pl.pallas_call,
        out_shape=jax.ShapeDtypeStruct([256], jnp.int32),
    )
    def kernel(x_ref, o_ref):
      acc = x_ref[...].sum()
      jax.lax.cond(
          acc % 2 == 0,
          lambda: pl.debug_print("acc * 2: {}", acc * 2),
          lambda: pl.debug_print("acc: {}", acc),
      )
      o_ref[...] = jnp.broadcast_to(acc, o_ref.shape)

    x = jnp.arange(256)
    with jtu.capture_stdout() as output:
      jax.block_until_ready(kernel(x))

    self.assertIn("acc * 2:", output())

  @parameterized.parameters(jnp.float16, jnp.float32)
  def test_wgmma(self, dtype):
    # TensorCores can only fuse transposes of 16-bit values, and RHS
