
Commit 22988c6

Add documentation and python API for persistent cache (#6046)
1 parent 45742d2 commit 22988c6

6 files changed, +78 -18 lines

API_GUIDE.md

Lines changed: 24 additions & 0 deletions

````diff
@@ -314,6 +314,30 @@ tensors are always loaded back to the device they were saved from, and if
 that device is unavailable the load will fail. PyTorch/XLA, like all of PyTorch,
 is under active development and this behavior may change in the future.
 
+## Compilation Caching
+
+The XLA compiler converts the traced HLO into an executable which runs on
+the devices. Compilation can be time consuming, and in cases where the HLO
+doesn't change across executions, the compilation result can be persisted to
+disk for reuse, significantly reducing development iteration time.
+
+Note that if the HLO changes between executions, a recompilation will still
+occur.
+
+This is currently an experimental opt-in API, which must be activated before
+any computations are executed. Initialization is done through the
+`initialize_cache` API:
+
+```python
+import torch_xla.runtime as xr
+xr.initialize_cache('YOUR_CACHE_PATH', readonly=False)
+```
+
+This will initialize a persistent compilation cache at the specified path. The
+`readonly` parameter can be used to control whether the worker will be able to
+write to the cache, which can be useful when a shared cache mount is used for
+an SPMD workload.
+
 ## Further Reading
 
 Additional documentation is available at the
````

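For the shared-cache SPMD scenario described above, one way to use the `readonly` flag is to let a single worker populate the cache while the others only read from it. The sketch below is illustrative rather than part of this commit: the mount path is hypothetical, and it assumes `xr.process_index()` is available to identify the current worker.

```python
import torch_xla.runtime as xr

# Hypothetical cache location on a mount shared by all SPMD workers.
CACHE_PATH = '/mnt/xla_cache'

# Only the first process writes compilation results; everyone else reads.
# Must run before any computation is traced or executed.
xr.initialize_cache(CACHE_PATH, readonly=xr.process_index() != 0)
```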
test/test_persistent_cache.py

Lines changed: 22 additions & 16 deletions

```diff
@@ -43,32 +43,34 @@ def _assert_correctness_and_metrics(t, xt, metrics):
         f'Unexpected value for counter {counter}: expected {value}, got {actual}'
 
 
-def _mp_test(rank, metrics):
+def _mp_test(rank, tmpdir, metrics):
   # In MP, the cache dir must be different for each process to avoid a race
   # condition where one process loads the compilation result of another, which
   # would break the metrics assertion.
-  os.environ['XLA_PERSISTENT_CACHE_PATH'] = \
-      os.path.join(os.environ['XLA_PERSISTENT_CACHE_PATH'], str(rank))
+  xr.initialize_cache(os.path.join(tmpdir, str(rank)))
 
   t = torch.randn(16)
   xt = t.to(xm.xla_device())
   _assert_correctness_and_metrics(t, xt, metrics)
 
 
-def _single_device_test(metrics):
+def _single_device_test(tmpdir, metrics):
+  xr.initialize_cache(tmpdir)
   t = torch.randn(16)
   xt = t.to(xm.xla_device())
   _assert_correctness_and_metrics(t, xt, metrics)
 
 
-def _spmd_replicated_test(metrics):
+def _spmd_replicated_test(tmpdir, metrics):
+  xr.initialize_cache(tmpdir)
   xr.use_spmd()
   t = torch.randn(16)
   xt = t.to(xm.xla_device())
   _assert_correctness_and_metrics(t, xt, metrics)
 
 
-def _spmd_sharded_test(metrics):
+def _spmd_sharded_test(tmpdir, metrics):
+  xr.initialize_cache(tmpdir)
   xr.use_spmd()
   t = torch.randn(16)
 
@@ -90,19 +92,23 @@ class PersistentCacheTest(parameterized.TestCase):
 
   @run_with_tmpdir
   def _run_test(self, launch_method, test_fn, tmpdir):
-    os.environ['XLA_PERSISTENT_CACHE_PATH'] = tmpdir
-
     # Run once to warm the cache
-    launch_method(test_fn, ({
-        'PersistentCacheMiss': 1,
-        'PersistentCacheHit': None
-    },))
+    launch_method(test_fn, (
+        tmpdir,
+        {
+            'PersistentCacheMiss': 1,
+            'PersistentCacheHit': None
+        },
+    ))
 
     # The second run should hit the cache
-    launch_method(test_fn, ({
-        'PersistentCacheMiss': None,
-        'PersistentCacheHit': 1
-    },))
+    launch_method(test_fn, (
+        tmpdir,
+        {
+            'PersistentCacheMiss': None,
+            'PersistentCacheHit': 1
+        },
+    ))
 
   def test_persistent_cache_mp(self):
     self._run_test(xmp.spawn, _mp_test)
```

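The test distinguishes a cold compilation from a cache load through the `PersistentCacheMiss` and `PersistentCacheHit` counters. A minimal standalone check along the same lines, with an illustrative cache path, might look like the following; on the first run the miss counter should be set, and rerunning the same script should report a hit instead.

```python
import torch
import torch_xla.core.xla_model as xm
import torch_xla.debug.metrics as met
import torch_xla.runtime as xr

xr.initialize_cache('/tmp/xla_cache')  # illustrative path

t = torch.randn(16).to(xm.xla_device())
xm.mark_step()  # flush the pending graph so compilation actually happens

print('PersistentCacheMiss:', met.counter_value('PersistentCacheMiss'))
print('PersistentCacheHit:', met.counter_value('PersistentCacheHit'))
```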
torch_xla/csrc/init_python_bindings.cpp

Lines changed: 3 additions & 0 deletions

```diff
@@ -977,6 +977,9 @@ void InitXlaModuleBindings(py::module m) {
   m.def("_xla_runtime_is_initialized", []() {
     return runtime::GetComputationClientIfInitialized() != nullptr;
   });
+  m.def("_xla_computation_cache_is_initialized", []() {
+    return XLAGraphExecutor::Get()->IsComputationCacheInitialized();
+  });
   m.def("_get_git_revs", []() { return GetRevisions(); });
   m.def("_get_xla_tensor_dimension_size",
         [](const at::Tensor& tensor, int dim) {
```

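This binding is what lets the Python layer detect whether the cache has already been created. A quick illustrative check from Python:

```python
import torch_xla

# False before the first compilation/execution; initialize_cache() must be
# called while this is still False.
print(torch_xla._XLAC._xla_computation_cache_is_initialized())
```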
torch_xla/csrc/xla_graph_executor.cpp

Lines changed: 8 additions & 2 deletions

```diff
@@ -506,9 +506,15 @@ void XLAGraphExecutor::MaybeDumpGraph(std::string name,
   }
 }
 
+bool XLAGraphExecutor::IsComputationCacheInitialized() {
+  return computation_cache_ != nullptr;
+}
+
 XLAGraphExecutor::ComputationCache* XLAGraphExecutor::GetComputationCache() {
-  static ComputationCache* cache = CreateComputationCache();
-  return cache;
+  if (computation_cache_ == nullptr) {
+    computation_cache_ = CreateComputationCache();
+  }
+  return computation_cache_;
 }
 
 void XLAGraphExecutor::ClearPendingIrs(
```

torch_xla/csrc/xla_graph_executor.h

Lines changed: 3 additions & 0 deletions

```diff
@@ -173,6 +173,7 @@ class XLAGraphExecutor : public torch::lazy::LazyGraphExecutor {
                              torch::lazy::HashReducer>;
 
   ComputationCache* GetComputationCache();
+  bool IsComputationCacheInitialized();
 
   std::vector<torch::lazy::BackendDataPtr> ExecuteComputationWithBarrier(
       torch::lazy::hash_t hash, const std::vector<at::IValue>& graph_inputs,
@@ -344,6 +345,8 @@ class XLAGraphExecutor : public torch::lazy::LazyGraphExecutor {
   std::shared_ptr<Async> SyncTensorsGraphInternal(
       std::vector<XLATensorPtr>* tensors, absl::Span<const std::string> devices,
       const SyncTensorsConfig& config, bool warm_up_cache_only = false);
+
+  ComputationCache* computation_cache_;
 };
 
 }  // namespace torch_xla
```

torch_xla/runtime.py

Lines changed: 18 additions & 0 deletions

```diff
@@ -265,3 +265,21 @@ def get_master_ip() -> str:
   if device_type() == 'TPU':
     return tpu.discover_master_worker_ip()
   raise RuntimeError(f'IP discovery not supported for device: {device_type()}')
+
+
+@requires_pjrt
+def initialize_cache(path: str, readonly: bool = False):
+  """Initializes the persistent compilation cache. This API must be called
+  before any computations have been performed.
+
+  Args:
+    path: The path at which to store the persistent cache.
+    readonly: Whether or not this worker should have write access to the cache.
+  """
+  assert not torch_xla._XLAC._xla_computation_cache_is_initialized(
+  ), "Computation cache has already been initialized"
+
+  # TODO(jonbolin): Consider moving away from environment variables to control
+  # the cache.
+  os.environ['XLA_PERSISTENT_CACHE_PATH'] = path
+  os.environ['XLA_PERSISTENT_CACHE_READ_ONLY'] = '1' if readonly else '0'
```

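Per the TODO above, the implementation currently just forwards to environment variables, so presumably the same effect can be achieved by setting them directly before any XLA work starts (bypassing the initialization guard that `initialize_cache` adds):

```python
import os

# Assumed equivalent to xr.initialize_cache('/tmp/xla_cache', readonly=True),
# provided it runs before any computation is traced or executed.
os.environ['XLA_PERSISTENT_CACHE_PATH'] = '/tmp/xla_cache'
os.environ['XLA_PERSISTENT_CACHE_READ_ONLY'] = '1'
```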