Commit c96c95a

[Pallas] Introduce gmm_backward (#7151)
Summary: This pull request introduces a helper for gmm_backward. I'm still debating whether we need to make gmm an autograd.Function, given that we will do manual back-propagation in Mixtral.

Test Plan: python test/test_gmm.py
1 parent ce1205e commit c96c95a
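As context for the autograd.Function question raised in the summary, here is a minimal sketch of how gmm and gmm_backward could be wired together through torch.autograd.Function; the GmmFunction name, the reliance on default tiling, and the argument handling are illustrative assumptions, not part of this commit:

import torch
from torch_xla.experimental.custom_kernel import gmm, gmm_backward


class GmmFunction(torch.autograd.Function):
  # Hypothetical wrapper (not part of this commit); it assumes the
  # gmm/gmm_backward signatures shown in the diffs below.

  @staticmethod
  def forward(ctx, lhs, rhs, group_sizes):
    ctx.save_for_backward(lhs, rhs, group_sizes)
    return gmm(lhs, rhs, group_sizes)

  @staticmethod
  def backward(ctx, grad_output):
    lhs, rhs, group_sizes = ctx.saved_tensors
    grad_lhs, grad_rhs = gmm_backward(grad_output, lhs, rhs, group_sizes)
    # group_sizes is an integer tensor and receives no gradient.
    return grad_lhs, grad_rhs, None

A caller would then write out = GmmFunction.apply(lhs, rhs, group_sizes) and rely on autograd, instead of the manual back-propagation mentioned for Mixtral.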

2 files changed (+40, -5 lines)

test/test_gmm.py

Lines changed: 31 additions & 1 deletion
@@ -7,7 +7,7 @@
 import torch_xla
 import torch_xla.core.xla_model as xm
 import torch_xla.debug.metrics as met
-from torch_xla.experimental.custom_kernel import gmm, _make_group_metadata, _histogram, tgmm
+from torch_xla.experimental.custom_kernel import gmm, _make_group_metadata, _histogram, tgmm, gmm_backward
 from torch_xla import runtime as xr
 from torch_xla._internal import tpu

@@ -344,6 +344,36 @@ def test_tgmm_bf16(self):
     # Make sure tgmm doesn't fallback.
     self.assertNotIn("aten::", met.short_metrics_report())

+  @unittest.skipIf(xr.device_type() != 'TPU', "This test only works on TPU.")
+  def test_gmm_backward(self):
+    self._init_test_cases()
+    for test_case in self.tests_cases:
+      num_groups = test_case['num_groups']
+      k = test_case['k']
+      m = test_case['m']
+      n = test_case['n']
+      lhs_dtype = rhs_dtype = torch.bfloat16
+
+      lhs = torch.rand(m, k, dtype=lhs_dtype, requires_grad=True)
+      rhs = torch.rand(num_groups, k, n, dtype=rhs_dtype, requires_grad=True)
+      group_sizes = self._group_sizes_strategy(m=m, num_groups=num_groups)
+      lhs.retain_grad()
+      rhs.retain_grad()
+
+      ref_out = self._reference_gmm(lhs, rhs, group_sizes)
+      ref_out.sum().backward()
+
+      ref_out_backward = torch.ones_like(ref_out)
+      grad_lhs, grad_rhs = gmm_backward(
+          ref_out_backward.to("xla"), lhs.to("xla"), rhs.to("xla"),
+          group_sizes.to("xla"))
+
+      self.assertTrue(torch.allclose(lhs.grad, grad_lhs.cpu()))
+      self.assertTrue(torch.allclose(rhs.grad, grad_rhs.cpu()))
+
+      # Make sure gmm doesn't fallback.
+      self.assertNotIn("aten::", met.short_metrics_report())
+

 if __name__ == '__main__':
   logging.getLogger().setLevel(logging.INFO)

torch_xla/experimental/custom_kernel.py

Lines changed: 9 additions & 4 deletions
@@ -771,7 +771,7 @@ def tgmm(
     lhs: torch.Tensor,
     rhs: torch.Tensor,
     group_sizes: torch.Tensor,
-    tiling: tuple[int, int, int] = (128, 128, 128)
+    tiling: tuple[int, int, int] = (512, 512, 512)
 ) -> torch.Tensor:
   """Compute lhs[:, sizes[i-1]:sizes[i]] @ rhs[sizes[i-1]:sizes[i], :].

@@ -813,13 +813,18 @@ def tgmm(
   )
   group_offset_torch = torch.tensor([0], dtype=torch.int32).to(lhs.device)

-  lhs = lhs.swapaxes(0, 1)
   return torch_xla._XLAC._xla_tpu_custom_call([
-      num_tiles, group_offsets, group_ids, m_tile_ids, group_offset_torch, lhs,
-      rhs
+      num_tiles, group_offsets, group_ids, m_tile_ids, group_offset_torch,
+      lhs.t(), rhs
   ], payload, [torch.Size([num_groups, k, n])], [preferred_element_type])[0]


+def gmm_backward(grad, lhs, rhs, group_sizes, tiling=(512, 512, 512)):
+  grad_lhs = gmm(grad, rhs.transpose(-1, -2), group_sizes, tiling)
+  grad_rhs = tgmm(lhs.t(), grad, group_sizes, tiling)
+  return grad_lhs, grad_rhs
+
+
 def non_xla_attetion(q, k, v, attention_type):
   # This will be called when dynamo use fake tensor to construct the fake output.
   # We need to make sure output tensor's shape is correct.

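For readers checking the math: gmm_backward above composes the standard per-group matmul gradients, where for each group g the rows of grad_lhs belonging to that group are grad @ rhs[g].T and grad_rhs[g] is lhs[g].T @ grad. A small CPU reference in the same spirit as the test's _reference_gmm; the helper name below is illustrative and not code from this commit:

import torch


def reference_gmm_backward(grad, lhs, rhs, group_sizes):
  # lhs: (m, k), rhs: (num_groups, k, n), grad: (m, n), matching gmm's output.
  grad_lhs = torch.empty_like(lhs)
  grad_rhs = torch.empty_like(rhs)
  start = 0
  for g, size in enumerate(group_sizes.tolist()):
    end = start + size
    # d/d(lhs[g]) of lhs[g] @ rhs[g] is grad[g] @ rhs[g].T
    grad_lhs[start:end] = grad[start:end] @ rhs[g].transpose(-1, -2)
    # d/d(rhs[g]) of lhs[g] @ rhs[g] is lhs[g].T @ grad[g]
    grad_rhs[g] = lhs[start:end].t() @ grad[start:end]
    start = end
  return grad_lhs, grad_rhs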