
Commit b58327e

Add topological sorting to dynamo partitions (#5472)
* Add topological sorting to dynamo partitions
* Run linter
* Update unit tests to include more in-place ops
1 parent 6270cba commit b58327e
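
The user-facing scenario behind the test changes is a model that mutates its own state in place while compiled for XLA. A minimal sketch of that scenario, assuming a working torch_xla install (the module and variable names here are illustrative, not part of the commit):

    import torch
    import torch_xla.core.xla_model as xm

    class Accumulator(torch.nn.Module):
      # Toy module that mutates its own buffer in place, like the new test's TestModel.

      def __init__(self, device=None):
        super().__init__()
        self.state = torch.zeros((5, 3), device=device)

      def forward(self, index, update):
        self.state.index_copy_(0, index, update)  # in-place op inside the compiled region
        return self.state + 1

    device = xm.xla_device()
    model = Accumulator(device).to(device)
    compiled = torch.compile(model, backend='openxla')
    out = compiled(
        torch.tensor([0, 4, 2, 1, 3], device=device),
        torch.rand(5, 3, device=device))
    print(out.cpu())

Correct results here depend on the partitioned FX graph replaying the in-place op and the later read in dependency order, which is what the sorting added in this commit enforces.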

2 files changed: +40, -22 lines


test/dynamo/test_dynamo.py

Lines changed: 30 additions & 20 deletions
@@ -101,33 +101,43 @@ def __init__(self, device=None):
         super().__init__()
         self.self_tensor = torch.zeros((5, 3), device=device)

-      def forward(self, index, copy_tensor, input_tensor):
+      def copy_(self, index, copy_tensor):
         self.self_tensor.index_copy_(0, index, copy_tensor)
+
+      def add_(self, index, other_tensor):
+        self.self_tensor.add_(other_tensor)
+
+      def abs_(self, index, other_tensor):
+        self.self_tensor.abs_()
+
+      def forward(self, index, copy_tensor, input_tensor, op_name):
+        getattr(self, op_name)(index, copy_tensor)
         output = input_tensor + self.self_tensor
         return output

     torch._dynamo.reset()
     met.clear_counters()
     met.clear_all()
     device = xm.xla_device()
-    input_tensor = torch.ones(3)
-    copy_tensor = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
-                               dtype=torch.float)
-    index = torch.tensor([0, 4, 2])
-    xla_input_tensor = input_tensor.to(device)
-    xla_copy_tensor = copy_tensor.to(device)
-    xla_index = index.to(device)

     cpu_model = TestModel()
-    res_cpu = cpu_model.forward(index, copy_tensor, input_tensor)
-
     xla_model = TestModel(device).to(device)
     compiled_model = torch.compile(xla_model, backend='openxla')
-    res_xla_dynamo = compiled_model.forward(xla_index, xla_copy_tensor,
-                                            xla_input_tensor)

-    self.assertIn('xla::index_copy', met.counter_names())
-    self.assertTrue(torch.allclose(res_cpu, res_xla_dynamo.cpu()))
+    input_tensor = torch.ones(3)
+    copy_tensor = torch.rand(5, 3)
+    index = torch.tensor([0, 4, 2, 1, 3])
+    xla_input_tensor = input_tensor.to(device)
+    xla_copy_tensor = copy_tensor.to(device)
+    xla_index = index.to(device)
+
+    in_place_ops = ['copy_', 'add_', 'abs_']
+    for in_place_op in in_place_ops:
+      res_cpu = cpu_model.forward(
+          index, copy_tensor, input_tensor, op_name=in_place_op)
+      res_xla_dynamo = compiled_model.forward(
+          xla_index, xla_copy_tensor, xla_input_tensor, op_name=in_place_op)
+      self.assertTrue(torch.allclose(res_cpu, res_xla_dynamo.cpu()))

   def test_simple_model_with_different_input_shape(self):
     met.clear_counters()
@@ -245,22 +255,22 @@ def fn_fallback(t):
     cpu_res = fn_fallback(t)
     xla_dynamo_res = dynamo_fn(t_xla)
     self.assertTrue(torch.allclose(cpu_res, xla_dynamo_res.cpu()))
-    self.assertEqual(met.metric_data('CompileTime')[0], 4)
-    self.assertEqual(met.metric_data('ExecuteTime')[0], 8)
+    self.assertEqual(met.metric_data('CompileTime')[0], 3)
+    self.assertEqual(met.metric_data('ExecuteTime')[0], 10)

     # Second tracing
     met.clear_counters()
     xla_dynamo_res_2 = dynamo_fn(t_xla)
     self.assertTrue(torch.allclose(cpu_res, xla_dynamo_res_2.cpu()))
-    self.assertEqual(met.metric_data('CompileTime')[0], 4)
-    self.assertEqual(met.metric_data('ExecuteTime')[0], 10)
+    self.assertEqual(met.metric_data('CompileTime')[0], 3)
+    self.assertEqual(met.metric_data('ExecuteTime')[0], 12)

     # Verify that dynamo can handle different inputs
     xla_dynamo_res_3 = dynamo_fn(t_xla * 3)
     cpu_res_3 = fn_fallback(t * 3)
     self.assertTrue(torch.allclose(cpu_res_3, xla_dynamo_res_3.cpu()))
-    self.assertEqual(met.metric_data('CompileTime')[0], 5)
-    self.assertEqual(met.metric_data('ExecuteTime')[0], 12)
+    self.assertEqual(met.metric_data('CompileTime')[0], 4)
+    self.assertEqual(met.metric_data('ExecuteTime')[0], 15)


 class DynamoTrainingBasicTest(unittest.TestCase):

torch_xla/core/dynamo_bridge.py

Lines changed: 10 additions & 2 deletions
@@ -10,6 +10,7 @@

 import torch
 from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
+from torch.fx.passes.utils.fuser_utils import topo_sort
 import torch_xla
 import torch_xla.core.xla_model as xm
 import torch_xla.debug.metrics as metrics
@@ -421,10 +422,17 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
           "call_function", "call_module", "call_method"
       ] and (node not in fallback_ops or node.target == operator.getitem)

-  # partition the model and execute to collect inputs
+  # partition the model
   supported_ops = XlaOperatorSupport()
-  partitioner = CapabilityBasedPartitioner(xla_model, supported_ops)
+  partitioner = CapabilityBasedPartitioner(
+      xla_model, supported_ops, allows_single_node_partition=True)
   partitions = partitioner.propose_partitions()
+
+  # propose_partitions() does not guarantee topological order, so sort it manually
+  for partition in partitions:
+    partition.nodes = topo_sort(partition.nodes)
+
+  # fuse partitions and execute to collect inputs
   partitioned_graph = partitioner.fuse_partitions(partitions)
   InputCollector(partitioned_graph).run(*xla_args)
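
For readers less familiar with the FX partitioner API used above, the same partition-sort-fuse pattern can be exercised on a toy graph. A minimal, self-contained sketch, assuming the torch.fx APIs as they existed at the time of this commit (the toy function and support class are illustrative, not torch_xla code):

    import torch
    from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
    from torch.fx.passes.operator_support import OperatorSupport
    from torch.fx.passes.utils.fuser_utils import topo_sort

    class AllCallsSupported(OperatorSupport):
      # Toy stand-in for XlaOperatorSupport: accept every call node.

      def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
        return node.op in ("call_function", "call_module", "call_method")

    def fn(x):
      y = x + 1
      z = y * 2
      return z - y

    gm = torch.fx.symbolic_trace(fn)
    partitioner = CapabilityBasedPartitioner(
        gm, AllCallsSupported(), allows_single_node_partition=True)
    partitions = partitioner.propose_partitions()

    # propose_partitions() does not promise topological order within a partition,
    # so sort each one before fusing, mirroring the diff above.
    for partition in partitions:
      partition.nodes = topo_sort(partition.nodes)

    fused = partitioner.fuse_partitions(partitions)
    print(fused.graph)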
