Enable AMD BF16 Grouped Gemm (#3526)
Summary:
Pull Request resolved: #3526

X-link: facebookresearch/FBGEMM#608

Implementation of a CK-based BF16 grouped GEMM. Performance is currently quite poor :(
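
A minimal usage sketch (not part of this commit), assuming bf16bf16bf16_grouped accepts lists of BF16 tensors for x and w; the call's trailing arguments are truncated in the diff below and are left elided here as well:

    import torch

    # Three grouped GEMM problems with different M sizes and a shared N/K
    # (shapes are illustrative only, not taken from the benchmark).
    xs = [torch.randn(m, 256, dtype=torch.bfloat16, device="cuda") for m in (64, 128, 32)]
    ws = [torch.randn(512, 256, dtype=torch.bfloat16, device="cuda") for _ in range(3)]

    out = torch.ops.fbgemm.bf16bf16bf16_grouped(
        xs,
        ws,
        # remaining arguments elided, matching the truncated call in the diff
    )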

Reviewed By: zjing14

Differential Revision: D67261862

fbshipit-source-id: 98d38c7f238ccbc97769c6b3a36e1d1540c1a6ce
jwfromm authored and facebook-github-bot committed Dec 31, 2024
1 parent 56184fd commit 4c0d4f7
Showing 63 changed files with 5,019 additions and 12 deletions.
6 changes: 3 additions & 3 deletions fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py
@@ -598,7 +598,7 @@ def quantize_fixed_nk(self, x, w):
         return (
             x,
             w,
-            torch.tensor(m_values).to(dtype=torch.int32, device=x[0].device),
+            torch.tensor(m_values).to(dtype=torch.int64, device=x[0].device),
             output,
         )

@@ -622,7 +622,7 @@ def quantize(self, x, w):
             m_values = None
         return x, w, m_values, output
 
-    def compute(self, x, w, m_values, output):
+    def compute(self, x, w, m_values, _):
         return torch.ops.fbgemm.bf16bf16bf16_grouped(
             x,
             w,
@@ -642,7 +642,7 @@ def name(self) -> str:
 
     @property
     def hip(self) -> bool:
-        return False
+        return True
 
     @property
     def cuda(self) -> bool:
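
As an aside, a plausible (assumed here, not shown in this diff) way the benchmark harness could consume the hip/cuda flags when deciding whether to run an op on the current backend:

    import torch

    def should_run(op) -> bool:
        # torch.version.hip is a version string under ROCm builds, None otherwise.
        if torch.version.hip is not None:
            return op.hip  # True for the grouped BF16 op as of this commit
        return op.cuda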