Add _sharded_cpu_state_dict for distributed checkpointing (#5288)

shahyash10 · will-cromar · qihqi · JackCaoG · commit 67ab9750b0ac · 2023-07-21T22:15:46.000Z
* initial commit * Add test workflow for `xrt` branch (#5241) * Add test workflow for `xrt` branch * Only run for PRs targeting XRT branch * Add function to generate stablehlo based callable from pytorch model (#5216) * Add function to generate stablehlo based callable from pytorch model Added function `torch_xla.experimental.stablehlo_saved_model.export_pytorch_model`. This function will take a pytorch Module and convert it into stablehlo bytecode. * Only run the main CI workflow on PRs targeting master and release branches (#5244) * Only run main CI for master and release branches. * Disabling XRT tests on main CI * AMP for TPUs v3 (#5161) * remove duplicate autocast_test (#5246) * Remove `test_experimental_pjrt_tpu.py` from TPU CI (#5247) * Install `expecttest` in xla_test_job.yaml (#5252) * Add IAM roles for cloudbuild_editors (#5251) * [Functionalization] Remove view in view_symint (#5231) * [Functionalization] Remove view in view_symint Summary: This pull request removes views in tensor_method::view_symint. Test Plan: XLA_DISABLE_FUNCTIONALIZATION=1 PJRT_DEVICE=TPU python ../test/test_view_ops.py -v -k TestViewOpsXLA.test_view_view PJRT_DEVICE=TPU python ../test/test_view_ops.py -v -k TestViewOpsXLA.test_view_view * Fix linters * fixed the test * ran the linter --------- Co-authored-by: Xiongfei Wei <isaacwxf23@gmail.com> * Delete XRT from the main branch (#5240) * Delete XRT from the main branch * Remove dead import * formatting * Remove disable_xrt build option * Fix runtime init * Revert "Remove disable_xrt build option" This reverts commit ba312e7. 
* Add disable XRT option back * formatting * Prune mesh service * Remove obsolete test * Remove other run server script * Remove XRT config * Update PJRT default device test * Add a file I forgot to save * if using_pjrt -> @requires_pjrt * Remove irrelevant test case * Remove XRT env vars * fix md link * formatting * Remove extra `requires_pjrt` * merge conflicts * Add other autocast back * Add nightly build for cuda 12 (#5253) * Fix the linter command in the CI (#5254) * fix linter command * ran linter * Jack cao g/fix spmd buff is null (#5256) * Fix that non-tensor scalar can't be handled by virtual device * add test * comment * Skip calling as_strided in empty_strided_symint if the input has dynamic dimensions. (#5239) * Skip calling as_strided in empty_strided_symint. * only return empty_symint conditionally. * add a comment * Add XRT nightly builds (#5261) * Add XRT nightly builds * remove space * [OpenXLA] Migrate to pull XLA from OpenXLA (#5202) PyTorch/XLA migrate to pull XLA from OpenXLA by replacing TensorFlow with OpenXLA after deprecating XRT usage, and replace TensorFlow-pin with OpenXLA-pin to May09 * Add ToString method for both PjrtData and PjrtShardedData (#5265) * Add ToString method for both PjrtData and PjrtShardedData * on cpu same config will become replicated, don't check actual op sharding type * Update Sharded graph HLO dumping (#5266) * Enable PjRt Client Compilation with StableHLO (#5233) * Enable xla PjRt client compilation with StableHLO * add XLA_STABLEHLO_COMPILE to configuration.yaml * fix merge conflict * dummy commit to trigger ci * Revert "dummy commit to trigger ci" This reverts commit f7aec23. 
* Disable Bazel remote cache for forked PR (#5259) * disable bazel remote cache if gcloud key is empty * remove remote cache from setup.py * experiment with debug msg * fix flag * add more logs * skip remote cache if credential file is empty * add comment * add logs * add check in test and coverage script * fix condition in coverage test * advance branch pr * allow remote cache if gcloud file isn't specified explicitly * remove dummy comment * Suppress debug symbols in OpenXLA code (#5269) * [SPMD] Sharding n-d tensor on (n+1)-d Mesh (#5268) * Make TPU detection more robust (#5271) * Clean bazel stuff on distutils clean. (#5274) * Clean bazel stuff on distutils clean * Fix python formatting * Delete unused .so file, and .lds files (#5275) * [OpenXLA] Delete unused .so file and .lds files * Fix the error when export_torch_model is given a non-tensor (#5277) However the generated StableHLO graph still hardcodes the non-tensor value. this is not correct, will fix later. * Disable test_simple_model_with_different_input_shape since it is currently broken by pytorch (#5282) * Always do build_ext in python setup.py develop (#5273) Bazel should figure out that _XLAC.so is current or not, and trigger rebuild if any cpp files changed. * Remove or improve several hardcoded TPU test conditions (#5272) * Remove or improve several hardcoded TPU test conditions * Fix test condition * Add `runtime.host_index` (#5283) * Make it an error if calling sizes() on a dynamic tensor. (#4998) * Err if calling sizes() on dynamic tensor * try to set has_symbolic_sizes_strides_ * resolve merge conflict * enable CONTINUE_ON_ERROR * fixed the python test test_SizeEq_should_not_compile_for_identical_symints * fix test_index_types * set CONTINUE_ON_ERROR to true * remove some unwanted code. * add a print * directly set has_symbolic_sizes_strides_ = true * make some fixes. * fix empty_strided_symint * ran linter * change error type in the test. 
* fix comments * ran linter * Fix the error where mark_step does not materialize tensors on SPMD:0 (#5281) * Fix the error where mark_step does not materialize tensors on SPMD:0 * typo * fix test_non_tensor_scalar * Disable torch._dynamo.config.automatic_dynamic_shapes (#5285) * Set torch._dynamo.config.automatic_dynamic_shapes to False * Enable DynamoInferenceBasicTest.test_simple_model_with_different_input_shape * run linter * wrap only if sharding type is non-replicated * Handle non-tensors * run linter * Call wrap_if_sharded first * Add exception in test for unsharded tensor * fix test * Use torch.Tensor instead of torch.tensor * use .cpu() only for tensors --------- Co-authored-by: Will Cromar <wcromar@google.com> Co-authored-by: qihqi <hanq@google.com> Co-authored-by: Meghan Cowan <cowanmeg@google.com> Co-authored-by: Mateusz Lewko <mateusz.lewko@gmail.com> Co-authored-by: Jiewen Tan <jwtan@google.com> Co-authored-by: Xiongfei Wei <isaacwxf23@gmail.com> Co-authored-by: Wonjoo Lee <wonjoo@google.com> Co-authored-by: JackCaoG <59073027+JackCaoG@users.noreply.github.com> Co-authored-by: Manfei <41607353+ManfeiBai@users.noreply.github.com> Co-authored-by: Siyuan Liu <lsiyuan@google.com> Co-authored-by: stgpetrovic <stgpetrovic@gmail.com> Co-authored-by: Mohit Khatwani <118776932+khatwanimohit@users.noreply.github.com>
diff --git a/test/spmd/test_xla_distributed_checkpoint.py b/test/spmd/test_xla_distributed_checkpoint.py
@@ -1,4 +1,5 @@
 import os
+import sys
 import tempfile
 import unittest
 import test_xla_sharding_base
@@ -14,6 +15,8 @@
     create_default_global_save_plan,
 )
 from torch_xla.experimental.distributed_checkpoint import SPMDLoadPlanner, SPMDSavePlanner
+from torch_xla.experimental._distributed_checkpoint_helpers import (
+    _sharded_cpu_state_dict, _CpuShards, _is_sharded_tensor)
 
 
 class DistributedCheckpointTestBase(test_xla_sharding_base.XlaShardingTest):
@@ -244,6 +247,24 @@ def test_resolve_shard_data(self):
       self.assertTrue(torch.allclose(shard.data, resolved_data))
 
 
+class DistributedCheckpointHelpersTest(DistributedCheckpointTestBase):
+
+  def test_sharded_cpu_state_dict(self):
+    model = self.SimpleLinear().to(xm.xla_device())
+    state_dict = model.state_dict()
+    sharded_cpu_state_dict = _sharded_cpu_state_dict(state_dict)
+    self.assertCountEqual(sharded_cpu_state_dict,
+                          ['fc1.weight', 'fc1.bias', 'fc2.weight', 'fc2.bias'])
+    for name, param in sharded_cpu_state_dict.items():
+      if name == 'fc1.weight':
+        # _sharded_cpu_state_dict returns _CpuShards only for sharded tensors
+        if _is_sharded_tensor(param):
+          self.assertTrue(isinstance(param, _CpuShards))
+      else:
+        self.assertTrue(isinstance(param, torch.Tensor))
+        self.assertTrue(param.device == torch.device("cpu"))
+
+
 if __name__ == '__main__':
   test = unittest.main()
   sys.exit(0 if test.result.wasSuccessful() else 1)
diff --git a/torch_xla/experimental/_distributed_checkpoint_helpers.py b/torch_xla/experimental/_distributed_checkpoint_helpers.py
@@ -2,10 +2,14 @@
 # stable. Once the upstream makes these stable, we should take a dependency on
 # their APIs.
 
+import dataclasses
+
 import torch
+import torch_xla.experimental.xla_sharding as xs
 
 from torch.distributed.checkpoint.planner import SavePlan
 from typing import (
+    Any,
     Callable,
     Collection,
     Dict,
@@ -14,12 +18,13 @@
     MutableMapping,
     Sequence,
     Tuple,
-    TypeVar,
     Union,
     cast,
 )
-from torch.distributed.checkpoint.metadata import (
-    STATE_DICT_TYPE,)
+from torch.distributed.checkpoint.metadata import (MetadataIndex,
+                                                   STATE_DICT_TYPE)
+from torch_xla.experimental.xla_sharding import XLAShardedTensor, ShardingType
+from torch.utils._pytree import tree_map
 
 PATH_ITEM = Union[str, int]
 OBJ_PATH = Tuple[PATH_ITEM, ...]
@@ -186,4 +191,37 @@ def narrow_tensor_by_index(tensor: torch.Tensor, offsets: Sequence[int],
       # recording here for the narrow op and 'local_shard' should be a
       # leaf variable in the autograd graph.
       narrowed_tensor = narrowed_tensor.narrow(idx, offset, size)
-  return narrowed_tensor
+  return narrowed_tensor
+
+
+def _is_sharded_tensor(x: Any) -> bool:
+  """Return true if the tensor's data is sharded across multiple devices"""
+  return isinstance(
+      x, XLAShardedTensor) and x.sharding_type != ShardingType.REPLICATED
+
+
+def _unwrap_xla_sharded_tensor(x: Any) -> Any:
+  if isinstance(x, XLAShardedTensor):
+    return x.global_tensor
+  return x
+
+
+@dataclasses.dataclass
+class _CpuShards:
+  shards: List[xs.XLAShard]
+  global_shape: torch.Size
+
+
+def _sharded_cpu_state_dict(state_dict: STATE_DICT_TYPE) -> STATE_DICT_TYPE:
+  """
+  Converts a state_dict on XLA device to a sharded state_dict on CPU.
+  """
+
+  def move_state_dict_to_cpu(v):
+    v = xs.wrap_if_sharded(v)
+    if not _is_sharded_tensor(v):
+      v = _unwrap_xla_sharded_tensor(v)
+      return v.cpu() if isinstance(v, torch.Tensor) else v
+    return _CpuShards(shards=v.local_shards, global_shape=v.global_tensor.shape)
+
+  return tree_map(move_state_dict_to_cpu, state_dict)
diff --git a/torch_xla/experimental/distributed_checkpoint.py b/torch_xla/experimental/distributed_checkpoint.py
@@ -33,14 +33,15 @@
 )
 from torch.distributed.checkpoint.utils import find_state_dict_object
 from torch.utils._pytree import tree_map
-from torch_xla.experimental.xla_sharding import (XLAShardedTensor, XLAShard,
-                                                 ShardingType)
+from torch_xla.experimental.xla_sharding import XLAShardedTensor, XLAShard
 from torch_xla.experimental._distributed_checkpoint_helpers import (
     FLATTEN_MAPPING,
     flatten_state_dict,
     dedup_tensors,
+    _is_sharded_tensor,
     set_element,
     narrow_tensor_by_index,
+    _unwrap_xla_sharded_tensor,
 )
 from typing import Any, Dict, List, Tuple, Union
 
@@ -373,15 +374,3 @@ def _create_xla_read_items(sharded_state_dict: STATE_DICT_TYPE,
     chunks = [_create_chunk_from_shard_index(index) for index in shard_indices]
     items.extend(create_read_items_for_chunk_list(fqn, md, chunks))
   return items
-
-
-def _is_sharded_tensor(x: Any) -> bool:
-  """Return true if the tensor's data is sharded across multiple devices"""
-  return isinstance(
-      x, XLAShardedTensor) and x.sharding_type != ShardingType.REPLICATED
-
-
-def _unwrap_xla_sharded_tensor(x: Any) -> Any:
-  if isinstance(x, XLAShardedTensor):
-    return x.global_tensor
-  return x