-from copy import deepcopy
-from math import inf
-from typing import (Any, Iterator, Optional, Type, Union)
+import copy
+from typing import (Any, Iterator, Optional, Type, Union, List, Dict)
 
 import torch
 import torch.nn as nn
@@ -10,8 +9,6 @@
 import torch_xla
 import torch_xla.core.xla_model as xm
 
-from .fsdp.xla_fully_sharded_data_parallel import _calc_grad_norm
-
 
 class ZeroRedundancyOptimizer(Optimizer):
   r"""
@@ -50,51 +47,138 @@ def __init__(
       grad_clipping: bool = True,
       max_norm: Optional[float] = None,
       pin_layout: bool = True,
+      cc_op_groups: Optional[Any] = None,
+      lazy_init: bool = False,
       **defaults: Any,
   ):
-    self.params = list(params)
-    super().__init__(self.params, defaults)
-    if isinstance(self.params[0], dict):
-      self.params = [p for pg in self.params for p in pg['params']]
-
-    self.device = self.params[0].device
+    super().__init__(params, defaults)
 
-    self.rank = xm.get_ordinal()
-    self.world_size = xm.xrt_world_size()
-    self.cc_op_groups = [list(range(self.world_size))]
+    self.global_world_size = xm.xrt_world_size()
+    self.global_rank = xm.get_ordinal()
+    self._cc_op_groups = [list(range(self.global_world_size))
+                         ] if cc_op_groups is None else cc_op_groups
 
+    self.optimizer_class = optimizer_class
+    self.defaults = defaults
     self.optimizer_dtype = optimizer_dtype if optimizer_dtype is not None else torch.float32
     self.grad_clipping = grad_clipping
     self.max_norm = max_norm if max_norm is not None else 1.0
     self.pin_layout = pin_layout
 
+    self.inited = False
+    if not lazy_init:
+      self.init_zero()
+
+  def init_zero(self):
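+    # Size of one collective-op group, and this rank's shard index within it
+    # (assumes the default single group over all ranks, or an interleaved layout).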
+    self.local_world_size = len(self.cc_op_groups[0])
+    self.local_rank = self.global_rank // len(self.cc_op_groups)
     # Shard parameters for use in optimizer
-    self.sharded_params = []
-    self._shard_parameters()
+    sharded_param_groups = self._shard_parameters()
     # Optimizer initialization
-    self.base_optimizer = optimizer_class(iter(self.sharded_params), **defaults)
+    self.base_optimizer = self.optimizer_class(sharded_param_groups,
+                                               **self.defaults)
+    self._sync_param_groups(self.param_groups, self.base_optimizer.param_groups)
+    self.inited = True
+
+  @property
+  def cc_op_groups(self):
+    return self._cc_op_groups
+
+  @cc_op_groups.setter
+  def cc_op_groups(self, new_cc_op_groups):
+    assert not self.inited, "already inited, cannot change cc_op_groups"
+    self._cc_op_groups = new_cc_op_groups
+
+  @staticmethod
+  def _sync_param_groups(
+      src_param_groups: List[Dict[Any, Any]],
+      dst_param_groups: List[Dict[Any, Any]],
+  ) -> None:
+    r"""
+    Syncs the attributes from the source parameter groups to the
+    destination parameter groups, except the parameters.
+
+    Example attributes include learning rate or scheduler attributes. The
+    two parameter groups should have the same length (i.e. same number of
+    parameter groups).
+
+    Arguments:
+        src_param_groups (list[dict]): parameter groups giving the
+            attribute settings to copy.
+        dst_param_groups (list[dict]): parameter groups giving the
+            attribute settings to set.
+    """
+    assert len(src_param_groups) == len(dst_param_groups), \
+        "Mismatch between number of source and destination parameter groups"
+    for src_param_group, dst_param_group in zip(src_param_groups,
+                                                dst_param_groups):
+      # Sync all attributes except the parameters
+      for attr in filter(lambda x: x != "params", src_param_group.keys()):
+        dst_param_group[attr] = src_param_group[attr]
 
   def _shard_tensor(self, tensor: torch.Tensor):
     """
     Get the shard of the input tensor.
     """
-    assert tensor.shape[0] % self.world_size == 0, "Not support padding now."
-    tensor = tensor.chunk(self.world_size)[self.rank]
+    assert tensor.shape[
+        0] % self.local_world_size == 0, "Not support padding now."
+    tensor = tensor.chunk(self.local_world_size)[self.local_rank]
     return tensor
 
   def _shard_parameters(self):
     """
     Shard all parameters.
     """
-    xm.unlazy(self.params)
-    for param in self.params:
-      shard_data = param.data.to(device="cpu")  # move to cpu
-      shard_data = self._shard_tensor(shard_data)  # slice it
-      if shard_data.dtype != self.optimizer_dtype:
-        shard_data = shard_data.to(dtype=self.optimizer_dtype)
-      shard_data = shard_data.to(device=self.device)  # move to xla device
-      shard = nn.Parameter(shard_data, requires_grad=param.requires_grad)
-      self.sharded_params.append(shard)
+    all_params = []
+    for param_group in self.param_groups:
+      for param in param_group['params']:
+        all_params.append(param)
+
+    self.device = all_params[0].device
+    xm.unlazy(all_params)
+
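+    # Build a sharded copy of every param group: each rank keeps only its
+    # 1/local_world_size slice of each parameter, cast to optimizer_dtype and
+    # placed back on the XLA device.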
+    sharded_params_groups = []
+    for param_group in self.param_groups:
+      sharded_params = []
+      for param in param_group['params']:
+        shard_data = param.data.to(device="cpu")  # move to cpu
+        shard_data = self._shard_tensor(shard_data)  # slice it
+        if shard_data.dtype != self.optimizer_dtype:
+          shard_data = shard_data.to(dtype=self.optimizer_dtype)
+        shard_data = shard_data.to(device=self.device)  # move to xla device
+        shard = nn.Parameter(shard_data, requires_grad=param.requires_grad)
+        sharded_params.append(shard)
+      sharded_params_group = copy.copy(param_group)
+      sharded_params_group['params'] = sharded_params
+      sharded_params_groups.append(sharded_params_group)
+
+    return sharded_params_groups
+
+  @torch.no_grad()
+  def _calc_grad_norm(
+      self,
+      norm_type: Union[float, int] = 2.0,
+  ) -> torch.Tensor:
+    grads_for_norm = []
+    for param_group in self.base_optimizer.param_groups:
+      for p in param_group['params']:
+        if p.grad is not None:
+          grads_for_norm.append(p.grad.detach())
+    # Norm parameters.
+    if norm_type != 2.0:
+      raise RuntimeError(f"only norm type 2 is supported, getting {norm_type}")
+    total_norm = torch.zeros([], dtype=self.optimizer_dtype, device=self.device)
+    for grad in grads_for_norm:
+      grad_norm = (grad * grad).sum()
+      total_norm += grad_norm
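+    # Each rank holds a disjoint shard of every gradient, so summing the
+    # per-rank squared norms over all ranks yields the global squared 2-norm.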
+    # across all ranks as no pipeline parallel
+    total_norm = xm.all_reduce(
+        xm.REDUCE_SUM,
+        total_norm,
+        groups=[list(range(self.global_world_size))],
+        pin_layout=self.pin_layout)
+    total_norm = torch.pow(total_norm, 1.0 / norm_type)
+    return total_norm
 
   @torch.no_grad()
   def _clip_grad_norm(
@@ -109,55 +193,53 @@ def _clip_grad_norm(
     """
     max_norm = float(max_norm)
     norm_type = float(norm_type)
-    params_with_grad = [p for p in self.sharded_params if p.grad is not None]
-    # Computes the max norm for this shard's gradients and sync's across workers
-    local_norm = _calc_grad_norm(params_with_grad, norm_type)
-    if norm_type == inf:
-      total_norm = xm.all_reduce(
-          xm.REDUCE_MAX,
-          local_norm,
-          groups=self.cc_op_groups,
-          pin_layout=self.pin_layout)
-    else:
-      total_norm = xm.all_reduce(
-          xm.REDUCE_SUM,
-          local_norm**norm_type,
-          groups=self.cc_op_groups,
-          pin_layout=self.pin_layout)
-      total_norm = total_norm**(1.0 / norm_type)
-
-    # Now multiply each grad by (max_norm/total_norm), same as torch 1.7 https://tinyurl.com/3wtxhhqq)
-    clip_coef = torch.clip(max_norm / (total_norm + 1e-6), 0.0, 1.0)
-    for p in params_with_grad:
-      p.grad.detach().mul_(clip_coef)
+    total_norm = self._calc_grad_norm(norm_type)
+
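+    # Scale every gradient shard by min(1, max_norm / (total_norm + 1e-6)),
+    # mirroring torch.nn.utils.clip_grad_norm_.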
+    clip_coeff = torch.tensor(
+        max_norm, device=self.device) / (
+            total_norm + 1e-6)
+    clip_value = torch.where(clip_coeff < 1, clip_coeff,
+                             torch.tensor(1., device=self.device))
+    for param_group in self.base_optimizer.param_groups:
+      for p in param_group['params']:
+        if p.grad is not None:
+          p.grad.detach().mul_(clip_value)
 
   @torch.no_grad()
   def step(self, closure=None, **kwargs):
     """
     Performs a single optimizer step and syncs parameters across all ranks.
     """
+    assert self.inited, "must call init_zero() first"
+
     loss = None
     if closure is not None:
       with torch.enable_grad():
        loss = closure()
 
+    # sync to base optimizer
+    self._sync_param_groups(self.param_groups, self.base_optimizer.param_groups)
+
     # Reduce full gradients across ranks
     # Assign gradient shards to the respective parameter shards
-    for param, shard in zip(self.params, self.sharded_params):
-      if param.grad is not None:
-        grad_shard = xm.reduce_scatter(
-            xm.REDUCE_SUM,
-            param.grad,
-            scale=1.0 / self.world_size,
-            scatter_dim=0,
-            shard_count=self.world_size,
-            pin_layout=self.pin_layout,
-            groups=self.cc_op_groups,
-        )
-
-        if grad_shard.dtype != self.optimizer_dtype:
-          grad_shard = grad_shard.to(dtype=self.optimizer_dtype)
-        shard.grad = grad_shard
+    for param_group, sharded_param_group in zip(
+        self.param_groups, self.base_optimizer.param_groups):
+      for param, shard in zip(param_group['params'],
+                              sharded_param_group['params']):
+        if param.grad is not None:
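+          # reduce_scatter sums the full gradient over the group, scales it by
+          # 1/local_world_size (an average), and returns only this rank's shard.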
+          grad_shard = xm.reduce_scatter(
+              xm.REDUCE_SUM,
+              param.grad,
+              scale=1.0 / self.local_world_size,
+              scatter_dim=0,
+              shard_count=self.local_world_size,
+              pin_layout=self.pin_layout,
+              groups=self.cc_op_groups,
+          )
+
+          if grad_shard.dtype != self.optimizer_dtype:
+            grad_shard = grad_shard.to(dtype=self.optimizer_dtype)
+          shard.grad = grad_shard
 
     if self.grad_clipping:
       # Update unscale/clip with sub partitions
@@ -169,18 +251,24 @@ def step(self, closure=None, **kwargs):
     self.base_optimizer.zero_grad(set_to_none=True)
 
     # All gather the new weights across the ranks and assign them to the full parameters
-    for param, shard in zip(self.params, self.sharded_params):
-      if param.grad is not None:
-        shard_data = shard.data
-        if param.dtype != self.optimizer_dtype:
-          shard_data = shard_data.to(dtype=param.dtype)
-        xm.all_gather(
-            shard_data,
-            dim=0,
-            output=param.data,
-            pin_layout=self.pin_layout,
-            groups=self.cc_op_groups,
-        )
+    for param_group, sharded_param_group in zip(
+        self.param_groups, self.base_optimizer.param_groups):
+      for param, shard in zip(param_group['params'],
+                              sharded_param_group['params']):
+        if param.grad is not None:
+          shard_data = shard.data
+          if param.dtype != self.optimizer_dtype:
+            shard_data = shard_data.to(dtype=param.dtype)
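+          # all_gather concatenates the updated shards from every rank along
+          # dim 0, writing the result directly into the full parameter.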
+          xm.all_gather(
+              shard_data,
+              dim=0,
+              output=param.data,
+              pin_layout=self.pin_layout,
+              groups=self.cc_op_groups,
+          )
+
+    # sync back
+    self._sync_param_groups(self.base_optimizer.param_groups, self.param_groups)
 
     return loss
 
@@ -190,7 +278,7 @@ def state_dict(self):
     return state_dict
 
   def load_state_dict(self, state_dict):
-    state_dict = deepcopy(state_dict)
+    state_dict = copy.deepcopy(state_dict)
    base = state_dict.pop('base')
     super().load_state_dict(state_dict)
     self.base_optimizer.load_state_dict(base)
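
Note: a minimal usage sketch of the new lazy-init path introduced above. The model, base optimizer class, learning rate, and group layout below are illustrative only, not taken from this change: construct with lazy_init=True, optionally set cc_op_groups before any sharding has happened, then call init_zero() to shard the parameters and build the wrapped optimizer.

    import torch
    import torch_xla.core.xla_model as xm

    device = xm.xla_device()
    model = MyModel().to(device)         # hypothetical model
    opt = ZeroRedundancyOptimizer(
        model.parameters(),
        torch.optim.SGD,                 # any torch optimizer class
        lr=1e-2,                         # forwarded via **defaults
        lazy_init=True)                  # defer sharding
    opt.cc_op_groups = [[0, 2], [1, 3]]  # example 4-device interleaved layout
    opt.init_zero()                      # shard params, build the base optimizer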