Merged
16 changes: 16 additions & 0 deletions test/test_mp_all_gather.py
@@ -6,6 +6,10 @@
import torch_xla.distributed.xla_multiprocessing as xmp


def all_gather(tensor, dim):
return xm.all_gather(tensor, dim=dim)


def _mp_fn(index):
device = xm.xla_device()
world_size = xm.xrt_world_size()
@@ -14,6 +18,18 @@ def _mp_fn(index):
ordinal_tensor = torch.tensor([index], dtype=torch.float).to(device)
result = xm.all_gather(ordinal_tensor, dim=0)

cpu_result = result.cpu()
expected = torch.arange(0, world_size, dtype=torch.float)
if not cpu_result.allclose(expected):
print('xm.all_gather() produced wrong reductions', file=sys.stderr)
print(f'[{index}] {cpu_result}', file=sys.stderr)
sys.exit(1)

compiled_all_gather = torch.compile(
all_gather, backend='torchxla_trace_once', fullgraph=True)
ordinal_tensor = torch.tensor([index], dtype=torch.float).to(device)
result = compiled_all_gather(ordinal_tensor, dim=0)

cpu_result = result.cpu()
expected = torch.arange(0, world_size, dtype=torch.float)
if not cpu_result.allclose(expected):
33 changes: 31 additions & 2 deletions torch_xla/core/xla_model.py
@@ -9,6 +9,7 @@
import torch.nn.functional as F
import torch_xla
from torch_xla.experimental import pjrt
from torch_xla.experimental import tpu
import torch_xla.core.xla_env_vars as xenv
import torch_xla.debug.metrics_saver as ms
import torch_xla.utils.utils as xu
@@ -26,6 +27,27 @@
_DEVICE_CONTEXTS = dict()
_DEVICE_CONTEXTS_LOCK = threading.Lock()

# Note [Dynamo WORLD_SIZE and ORDINAL]
# Below is a workaround to cache the ordinal and world_size so that
# Dynamo won't insert graph breaks when xm.xrt_world_size() and xm.get_ordinal() are called.
_WORLD_SIZE = None
_ORDINAL = None


def _init_world_size_ordinal():
global _WORLD_SIZE, _ORDINAL

if not pjrt.using_pjrt():
return

# We don't support V3-8. See Note [V3-8 Threading]
if pjrt.device_type() == 'TPU' and tpu.version() < 4:
return

if _WORLD_SIZE is None:
_WORLD_SIZE = xrt_world_size()
_ORDINAL = get_ordinal()
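
A minimal usage sketch (not part of this diff) of the pattern the caching above enables, assuming an XLA device is already configured for the process; the function and tensor names are illustrative:

```python
import torch
import torch_xla.core.xla_model as xm

def gather_and_scale(t):
    # With _WORLD_SIZE cached, this call returns a plain Python int
    # instead of forcing Dynamo to break the graph.
    world_size = xm.xrt_world_size()
    return xm.all_gather(t, dim=0) / world_size

compiled = torch.compile(
    gather_and_scale, backend='torchxla_trace_once', fullgraph=True)
```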


class DeviceContext(object):

@@ -90,6 +112,10 @@ def xrt_world_size(defval=1):
Returns:
The number of devices which is taking part of the replication.
"""
global _WORLD_SIZE
if _WORLD_SIZE is not None:
return _WORLD_SIZE

if pjrt.using_pjrt():
return pjrt.world_size()

@@ -109,6 +135,10 @@ def get_ordinal(defval=0):
Returns:
The replication ordinal of the current thread.
"""
global _ORDINAL
if _ORDINAL is not None:
return _ORDINAL

if pjrt.using_pjrt():
return pjrt.global_ordinal()

@@ -533,8 +563,7 @@ def all_gather(value, dim=0, groups=None, output=None, pin_layout=True):
A tensor which has, in the ``dim`` dimension, all the values from the
participating replicas.
"""
if pin_layout and xla_device_hw(
value.device) in ('TPU', 'GPU', 'XPU') and output == None:
Collaborator: I think we had it because CPU was not supported at some point. Do you need to remove it because it will break dynamo?

Author: Yea.

if pin_layout and output == None:
# There is not an easy way to pin the all_gather layout on TPU and GPU, so use
# the all_reduce-based all_gather for this purpose.
return _all_gather_using_all_reduce(
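For reference, a minimal sketch (not part of the diff) of what the change above means for callers: with pin_layout=True and no output tensor, all_gather now takes the all_reduce-based path on any device, not only TPU/GPU/XPU. Variable names are illustrative:

```python
import torch
import torch_xla.core.xla_model as xm

device = xm.xla_device()
t = torch.tensor([xm.get_ordinal()], dtype=torch.float, device=device)

# pin_layout=True with output=None: after this change, the all_reduce-based
# all_gather path is used regardless of the underlying hardware.
gathered = xm.all_gather(t, dim=0, pin_layout=True)
```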
1 change: 1 addition & 0 deletions torch_xla/csrc/cross_replica_reduces.cpp
@@ -18,6 +18,7 @@
namespace torch_xla {
namespace {

// Note [V3-8 Threading]
// For V3-8 + PJRT, we have 4 processes and each process has 2 threads to manage
// the 8 cores. Therefore, we need different tokens for different threads.
std::unordered_map<int64_t, std::shared_ptr<torch::lazy::Value>>
3 changes: 3 additions & 0 deletions torch_xla/experimental/pjrt.py
@@ -235,6 +235,9 @@ def _run_thread_per_device(
def _thread_fn(device: torch.device):
torch_xla._XLAC._xla_set_default_device(device)

# See Note [Dynamo WORLD_SIZE and ORDINAL].
xm._init_world_size_ordinal()

return fn()

with concurrent.futures.ThreadPoolExecutor(