added XLA custom op defs for TE GEMM
Signed-off-by: Alp Dener <adener@nvidia.com>

Added XLA FFI custom op for TE GEMM

Signed-off-by: Alp Dener <adener@nvidia.com>

finished GEMM custom op primitive and serial unit test

Signed-off-by: Alp Dener <adener@nvidia.com>

fixed GEMM custom op batcher

Signed-off-by: Alp Dener <adener@nvidia.com>

fixed output dtype error and contracting dimensions options

Signed-off-by: Alp Dener <adener@nvidia.com>

AG overlap working but executes scatter to match outer LHS dim

Signed-off-by: Alp Dener <adener@nvidia.com>

both all-gather and all-reduce are now working

Signed-off-by: Alp Dener <adener@nvidia.com>

code style

Signed-off-by: Alp Dener <adener@nvidia.com>

changed kwargs in abstract to be explicit

Signed-off-by: Alp Dener <adener@nvidia.com>

added fwd/bwd implementation for non-fp8 gemm

Signed-off-by: Alp Dener <adener@nvidia.com>
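
The final message above covers a forward/backward path for the non-FP8 GEMM. For orientation, a minimal pure-JAX sketch of that fwd/bwd (custom-VJP) pattern for an 'NT'-layout GEMM is shown below; `gemm_nt`, the shapes, and the plain `jnp.matmul` kernel are illustrative assumptions, not the Transformer Engine implementation, which dispatches to the custom XLA op added in this commit.

```python
import jax
import jax.numpy as jnp


@jax.custom_vjp
def gemm_nt(x, w):
    # Forward GEMM in 'NT' layout: x is (batch, m, k), w is (n, k), and the
    # contraction runs over the trailing k dimension of both operands.
    return jnp.matmul(x, w.T)


def _gemm_nt_fwd(x, w):
    out = jnp.matmul(x, w.T)
    return out, (x, w)  # save the operands as residuals for the backward pass


def _gemm_nt_bwd(residuals, grad_out):
    x, w = residuals
    grad_x = jnp.matmul(grad_out, w)                 # (batch, m, k)
    grad_w = jnp.einsum("bmn,bmk->nk", grad_out, x)  # (n, k), summed over batch and m
    return grad_x, grad_w


gemm_nt.defvjp(_gemm_nt_fwd, _gemm_nt_bwd)

# Example: y, vjp_fn = jax.vjp(gemm_nt, x, w); dx, dw = vjp_fn(dy)
```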
denera committed Nov 14, 2024
1 parent c0a539c commit 941f5bb
Showing 12 changed files with 1,370 additions and 6 deletions.
55 changes: 55 additions & 0 deletions tests/jax/test_custom_call_compute.py
@@ -25,6 +25,7 @@
_jax_dbias_cast_transpose,
)
from transformer_engine.jax.cpp_extensions.quantization import _jax_cast_fp8
from transformer_engine.jax.gemm import fp8_gemm, gemm
from transformer_engine.jax import cpp_extensions as tex


@@ -415,6 +416,60 @@ def ref_func(x, ln_s, y, z, w, v, amax_list_1, amax_list_2, scale_list_1, scale_
)


class TestGemm:

    @staticmethod
    def _generate_inputs(b, m, n, k, dtype):
        key = jax.random.PRNGKey(0)
        subkeys = jax.random.split(key, 3)
        a = jax.random.normal(subkeys[0], (b, m, k), dtype)
        b = jax.random.normal(subkeys[1], (n, k), dtype)
        bias_dtype = dtype if dtype not in [jnp.float8_e4m3fn, jnp.float8_e5m2] else jnp.bfloat16
        bias = jax.random.normal(subkeys[2], (n,), bias_dtype)
        return a, b, bias

    @staticmethod
    def _generate_fp8_inputs(b, m, n, k, fp8_dtype):
        a, b, bias = TestGemm._generate_inputs(b, m, n, k, jnp.bfloat16)
        a_scale, b_scale = map(
            lambda x: (jnp.max(jnp.abs(x)) / 127.).astype(jnp.float32),
            [a, b],
        )
        # Quantize each operand with its own scale; map over both lists in lockstep.
        a_q, b_q = map(
            lambda x, x_scale: jnp.round(x / x_scale).astype(fp8_dtype),
            [a, b],
            [a_scale, b_scale],
        )
        return a, a_q, jnp.reciprocal(a_scale), b, b_q, jnp.reciprocal(b_scale), bias

    @pytest.mark.parametrize("m,n,k", GEMM_CASES)
    @pytest.mark.parametrize("use_bias", (False, True))
    @pytest.mark.parametrize("do_gelu", (False, True))
    def test_gemm(self, m, n, k, use_bias, do_gelu):
        # Batch size is fixed here (arbitrary choice); only the GEMM dims are parametrized.
        batch = 2
        a, b, bias = self._generate_inputs(batch, m, n, k, jnp.bfloat16)

        primitive_out = gemm(a, b, bias=bias if use_bias else None, layout='NT', do_gelu=do_gelu)
        # 'NT' layout contracts the trailing k dimension of both operands, so the
        # reference GEMM uses the transposed weight.
        ref_out = jnp.dot(a, b.T)
        if use_bias:
            ref_out += bias
        if do_gelu:
            ref_out = jax.nn.gelu(ref_out)

        assert_allclose(primitive_out, ref_out, dtype=jnp.bfloat16)

    @pytest.mark.skipif(not is_fp8_supported, reason=reason)
    @pytest.mark.parametrize("m,n,k", GEMM_CASES)
    @pytest.mark.parametrize("fp8_dtype", FP8_COMPUTE_TYPE)
    def test_fp8_gemm(self, m, n, k, fp8_dtype):
        # Same fixed batch size (arbitrary choice) as in test_gemm.
        batch = 2
        a, a_q, a_scale_inv, b, b_q, b_scale_inv, _ = self._generate_fp8_inputs(
            batch, m, n, k, fp8_dtype
        )

        primitive_out = fp8_gemm(a_q, a_scale_inv, b_q, b_scale_inv, out_dtype=jnp.bfloat16)
        ref_out = jnp.dot(a, b.T)

        assert_allclose(primitive_out, ref_out, dtype=fp8_dtype)


@pytest.fixture(name="random_inputs")
def random_inputs_fixture(shape):
    key = jax.random.PRNGKey(0)
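Condensed usage of the new `gemm` entry point as exercised by the test above; the call signature follows the test, while the shapes here are arbitrary illustrative values.

```python
import jax
import jax.numpy as jnp
from transformer_engine.jax.gemm import gemm  # exported by this commit

key = jax.random.PRNGKey(0)
x = jax.random.normal(key, (2, 128, 256), jnp.bfloat16)  # activations: (batch, m, k)
w = jax.random.normal(key, (512, 256), jnp.bfloat16)     # weights: (n, k)

# 'NT' layout contracts the trailing k dimension of both operands.
y = gemm(x, w, bias=None, layout="NT", do_gelu=False)    # expected result: (2, 128, 512)
```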
1 change: 1 addition & 0 deletions transformer_engine/jax/cpp_extensions/__init__.py
@@ -4,6 +4,7 @@
"""Python interface for c++ extensions"""
from .activation import *
from .attention import *
from .gemm import *
from .normalization import *
from .quantization import *
from .softmax import *
