Commit 145c33a

Rewrite scan-based GRU based on nn.GRU (#8914)
1 parent 980ead5 commit 145c33a

File tree (2 files changed: +69 -66 lines)

  test/test_gru.py
  torch_xla/experimental/gru.py
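
With this change the scan-based GRU subclasses nn.GRU, so it accepts the same constructor arguments and exposes the same parameter names (weight_ih_l{k}, weight_hh_l{k}, bias_ih_l{k}, bias_hh_l{k}), which lets a strict state_dict load from an upstream nn.GRU work directly. A minimal usage sketch based on the hunks below; the illustrative sizes and the assumption that forward returns (output, h_n) like nn.GRU are mine, not shown in this diff:

  import torch
  import torch.nn as nn
  import torch_xla
  from torch_xla.experimental.gru import GRU

  # Illustrative sizes, not taken from the commit.
  seq_len, batch, input_size, hidden_size, num_layers = 8, 4, 16, 32, 2

  # Upstream GRU on CPU and the scan-based GRU with matching hyperparameters.
  cpu_gru = nn.GRU(input_size, hidden_size, num_layers=num_layers, bias=True, dropout=0.0)
  scan_gru = GRU(input_size, hidden_size, num_layers=num_layers, bias=True, dropout=0.0)

  # Same parameter names as nn.GRU, so a strict state_dict load just works.
  scan_gru.load_state_dict(cpu_gru.state_dict(), strict=True)
  scan_gru = scan_gru.to('xla')

  # Input layout is (seq_len, batch, input_size); batch_first is not supported.
  x = torch.randn(seq_len, batch, input_size).to('xla')
  output, h_n = scan_gru(x)  # assumed to mirror nn.GRU's (output, h_n) return
  torch_xla.sync()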

test/test_gru.py

Lines changed: 44 additions & 12 deletions
@@ -1,6 +1,5 @@
 import torch
 import torch.nn as nn
-
 import torch_xla
 from torch_xla.experimental.gru import GRU
 
@@ -27,16 +26,9 @@ def build_models(self, input_size, hidden_size, num_layers, bias):
         input_size, hidden_size, num_layers=num_layers, bias=bias, dropout=0.0)
 
     # Copy parameters from the upstream GRU to our scan-based GRU.
-    for layer in range(num_layers):
-      scan_gru.weight_ih[layer].data.copy_(
-          getattr(gru, f'weight_ih_l{layer}').data)
-      scan_gru.weight_hh[layer].data.copy_(
-          getattr(gru, f'weight_hh_l{layer}').data)
-      if gru.bias:
-        scan_gru.bias_ih[layer].data.copy_(
-            getattr(gru, f'bias_ih_l{layer}').data)
-        scan_gru.bias_hh[layer].data.copy_(
-            getattr(gru, f'bias_hh_l{layer}').data)
+    # This ensures that the scan-based GRU has the same parameters as the
+    # upstream GRU and both models are parameterized the same way.
+    scan_gru.load_state_dict(gru.state_dict(), strict=True)
 
     return gru, scan_gru
 
@@ -78,7 +70,7 @@ def check_gradients(self,
     for layer in range(num_layers):
       for name in params_to_check:
         param1 = getattr(gru, f'{name}_l{layer}')
-        param2 = getattr(scan_gru, name)[layer]
+        param2 = getattr(scan_gru, f'{name}_l{layer}')
         torch.testing.assert_close(
             param1.grad,
             param2.grad,
@@ -88,6 +80,46 @@
             atol=atol,
             rtol=rtol)
 
+  def test_scan_gru_and_upstream_gru_parameter_independency(self):
+    """
+    Ensures that the parameters of the scan-based GRU and upstream GRU are independent even though the parameters of the scan-based GRU are initialized using the upstream GRU.
+    """
+    input_size, hidden_size, num_layers = 16, 32, 2
+    gru, scan_gru = self.build_models(input_size, hidden_size, num_layers, True)
+    gru = gru.cpu()
+    scan_gru = scan_gru.to('xla')
+    torch_xla.sync()
+
+    with torch.no_grad():
+      gru_weight_ih_l0 = gru.state_dict()['weight_ih_l0']
+      scan_gru_weight_ih_l0 = scan_gru.state_dict()['weight_ih_l0']
+
+      # Compare the parameters of the GRU and scan-based GRU before changing.
+      torch.testing.assert_close(
+          gru_weight_ih_l0,
+          scan_gru_weight_ih_l0,
+          msg=lambda msg: f"weight_ih_l0 mismatch. {msg}",
+          check_device=False)
+
+      # Change the parameters of the GRU with random numbers.
+      gru_weight_ih_l0.uniform_(-1, 1)
+
+      # Assert not close after the change.
+      try:
+        torch.testing.assert_close(
+            gru_weight_ih_l0,
+            scan_gru_weight_ih_l0,
+            msg=lambda msg: f"weight_ih_l0 mismatch. {msg}",
+            check_device=False)
+        raise AssertionError(
+            "weight_ih_l0 should not be close after changing the GRU parameters."
+        )
+      except AssertionError as e:
+        if str(e).startswith("weight_ih_l0 mismatch."):
+          pass
+        else:
+          raise e
+
   @parameterized.parameters(True, False)
   def test_scan_gru_vs_pytorch_xla_for_loop(self, bias):
     """

torch_xla/experimental/gru.py

Lines changed: 25 additions & 54 deletions
@@ -1,11 +1,12 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from typing import overload
 
 from torch_xla.experimental.scan import scan
 
 
-class GRU(nn.Module):
+class GRU(nn.GRU):
   r"""
   PyTorch/XLA GRU implemented using scan.
 
@@ -52,47 +53,24 @@ class GRU(nn.Module):
 
   """
 
+  @overload
   def __init__(self,
-               input_size,
-               hidden_size,
-               num_layers=1,
-               bias=True,
-               dropout=0.0):
-    super().__init__()
-
-    self.input_size = input_size
-    self.hidden_size = hidden_size
-    self.num_layers = num_layers
-    self.bias = bias
-    self.dropout = dropout
-
-    # Create parameters for each layer.
-    # For layer 0, the input dimension is `input_size`, otherwise it's `hidden_size`.
-    self.weight_ih = nn.ParameterList()
-    self.weight_hh = nn.ParameterList()
-    if bias:
-      self.bias_ih = nn.ParameterList()
-      self.bias_hh = nn.ParameterList()
-
-    for layer in range(num_layers):
-      layer_input_size = input_size if layer == 0 else hidden_size
-      # weight_ih: combines weights for reset, update, and new gates.
-      w_ih = nn.Parameter(torch.Tensor(3 * hidden_size, layer_input_size))
-      w_hh = nn.Parameter(torch.Tensor(3 * hidden_size, hidden_size))
-      self.weight_ih.append(w_ih)
-      self.weight_hh.append(w_hh)
-      if bias:
-        b_ih = nn.Parameter(torch.Tensor(3 * hidden_size))
-        b_hh = nn.Parameter(torch.Tensor(3 * hidden_size))
-        self.bias_ih.append(b_ih)
-        self.bias_hh.append(b_hh)
-    self.reset_parameters()
-
-  def reset_parameters(self):
-    # Initialize parameters uniformly as in the upstream PyTorch GRU.
-    stdv = 1.0 / (self.hidden_size**0.5)
-    for weight in self.parameters():
-      weight.data.uniform_(-stdv, stdv)
+               input_size: int,
+               hidden_size: int,
+               num_layers: int = 1,
+               bias: bool = True,
+               dropout: float = 0.0):
+    pass
+
+  def __init__(self, *args, **kwargs):
+    assert not kwargs.get('batch_first', False), \
+        "GRU only supports batch_first=False (seq_len, batch, input_size)."
+    assert not kwargs.get('bidirectional', False), \
+        "GRU only supports unidirectional GRU."
+    assert kwargs.get('proj_size', 0) == 0, \
+        "GRU only supports no projection."
+
+    super().__init__(*args, **kwargs)
 
   def forward(self, input, hx=None):
     """
@@ -119,12 +97,12 @@ def forward(self, input, hx=None):
     for layer in range(self.num_layers):
       init = {
           'h': hx[layer],
-          'w_ih': self.weight_ih[layer],
-          'w_hh': self.weight_hh[layer]
+          'w_ih': getattr(self, f'weight_ih_l{layer}'),
+          'w_hh': getattr(self, f'weight_hh_l{layer}')
       }
       if self.bias:
-        init['b_ih'] = self.bias_ih[layer]
-        init['b_hh'] = self.bias_hh[layer]
+        init['b_ih'] = getattr(self, f'bias_ih_l{layer}', None)
+        init['b_hh'] = getattr(self, f'bias_hh_l{layer}', None)
 
       # Define the step function for scanning over time.
       # x_t: (batch, current_input_size)
@@ -155,15 +133,8 @@ def step_fn(carry, x_t):
         # Update hidden state
        h_new = (1 - z) * n + z * h
 
-        carry_new = {
-            'h': h_new,
-            'w_ih': w_ih,
-            'w_hh': w_hh,
-        }
-        if b_ih is not None:
-          carry_new['b_ih'] = b_ih
-        if b_hh is not None:
-          carry_new['b_hh'] = b_hh
+        carry_new = {**carry, 'h': h_new}
+
         return carry_new, h_new
 
       # Use scan to iterate over the time dimension.
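
Inside the step function the only change is how the carry is rebuilt: the per-layer weights and biases ride along in the carry untouched, and dict unpacking swaps in just the new hidden state. A self-contained sketch of that carry pattern, with gate math consistent with the h_new update shown above and a plain Python loop standing in for torch_xla.experimental.scan (illustration only; gate ordering follows nn.GRU's reset/update/new convention):

  import torch
  import torch.nn.functional as F

  def gru_step(carry, x_t):
    # One GRU time step over a batch; carry holds the hidden state plus the
    # layer's weights and (optional) biases, matching the `init` dict above.
    h, w_ih, w_hh = carry['h'], carry['w_ih'], carry['w_hh']
    b_ih, b_hh = carry.get('b_ih'), carry.get('b_hh')

    gates_x = F.linear(x_t, w_ih, b_ih)  # (batch, 3 * hidden_size)
    gates_h = F.linear(h, w_hh, b_hh)    # (batch, 3 * hidden_size)
    x_r, x_z, x_n = gates_x.chunk(3, dim=1)
    h_r, h_z, h_n = gates_h.chunk(3, dim=1)

    r = torch.sigmoid(x_r + h_r)   # reset gate
    z = torch.sigmoid(x_z + h_z)   # update gate
    n = torch.tanh(x_n + r * h_n)  # candidate hidden state
    h_new = (1 - z) * n + z * h

    # Same simplification as the diff: reuse the carry, replace only 'h'.
    return {**carry, 'h': h_new}, h_new

  def run_layer(carry, xs):
    # Loop stand-in for scan over the time dimension; xs: (seq_len, batch, input).
    outs = []
    for x_t in xs.unbind(0):
      carry, h_t = gru_step(carry, x_t)
      outs.append(h_t)
    return carry, torch.stack(outs, dim=0)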
