Commit 2870e93

[benchmarks] Set matrix multiplication precision. (#7748)
1 parent 6280701 commit 2870e93

10 files changed (+26, -25 lines)

benchmarks/experiment_runner.py

Lines changed: 13 additions & 1 deletion
@@ -29,6 +29,7 @@
 from benchmark_experiment import ExperimentLoader, BenchmarkExperiment
 from util import cleanup, move_to_device, randomize_input, reset_rng_state, us_to_s, ns_to_s, StrOrBool
 
+import torch_xla
 import torch_xla.core.xla_model as xm
 import torch_xla.debug.profiler as xp
 
@@ -939,6 +940,11 @@ def __str__(self):
       action="store_true",
       help="Whether to enable fast F32 multiplication in PyTorch.",
   )
+  parser.add_argument(
+      "--matmul-precision",
+      choices=["default", "high", "highest"],
+      help="Set matrix multiplication for both PyTorch and PyTorch/XLA.",
+  )
   parser.add_argument(
       "--experiment-config",
       type=str,
@@ -1009,9 +1015,15 @@ def main():
   logging.basicConfig(level=args.log_level.value, force=True)
   logger.debug(f"Parsed args: {args}")
 
+  precision = 'highest'
+  if args.matmul_precision is not None:
+    precision = args.matmul_precision
+  # --disable-tf32 flag may overwrite precision settings for BC reasons.
   if not args.disable_tf32:
     logger.warning('Enabling fast F32 multiplication for PyTorch')
-    torch.set_float32_matmul_precision('high')
+    precision = 'high'
+  torch.set_float32_matmul_precision(precision)
+  torch_xla._XLAC._xla_set_mat_mul_precision(precision)
 
   if args.profile_xla:
     logger.info(
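
Note: the runner now threads a single precision value through both frameworks. Below is a minimal sketch of the resulting behavior, assuming a working torch/torch_xla install; the `args.*` names refer to the flags parsed above.

```python
# Sketch of the precision resolution in main() after this change.
import torch
import torch_xla

precision = 'highest'                 # default when --matmul-precision is absent
# precision = args.matmul_precision   # taken from the flag when provided
# precision = 'high'                  # forced when --disable-tf32 is not passed (BC path)

torch.set_float32_matmul_precision(precision)          # PyTorch side
torch_xla._XLAC._xla_set_mat_mul_precision(precision)  # PyTorch/XLA side
```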

test/bench.py

Lines changed: 1 addition & 2 deletions
@@ -129,6 +129,5 @@ def run_benchmarks(args):
   args.benchs = benchs
 
   torch.set_default_dtype(torch.float32)
-  torch_xla._XLAC._xla_set_use_full_mat_mul_precision(
-      use_full_mat_mul_precision=True)
+  torch_xla._XLAC._xla_set_mat_mul_precision('highest')
   run_benchmarks(args)
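
Note: the test files below all receive this same mechanical replacement. As a migration sketch, the old binding took a boolean and could only select HIGHEST or DEFAULT; it is removed by this commit, so the "before" call is shown commented out for comparison only.

```python
import torch_xla

# Before: boolean toggle, only HIGHEST or DEFAULT reachable.
# torch_xla._XLAC._xla_set_use_full_mat_mul_precision(
#     use_full_mat_mul_precision=True)

# After: explicit precision string; 'high' is now expressible too.
torch_xla._XLAC._xla_set_mat_mul_precision('highest')
```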

test/pytorch_test_base.py

Lines changed: 1 addition & 2 deletions
@@ -632,8 +632,7 @@ def get_primary_device(cls):
   def setUpClass(cls):
     # Sets the primary test device to the xla_device (CPU or TPU)
     cls.primary_device = str(xm.xla_device())
-    torch_xla._XLAC._xla_set_use_full_mat_mul_precision(
-        use_full_mat_mul_precision=True)
+    torch_xla._XLAC._xla_set_mat_mul_precision('highest')
 
   def setUp(self):
     super().setUp()

test/test_gmm.py

Lines changed: 1 addition & 2 deletions
@@ -465,7 +465,6 @@ def test_gmm_backward_3(self):
   logging.getLogger().setLevel(logging.INFO)
   torch.set_default_dtype(torch.float32)
   torch.manual_seed(42)
-  torch_xla._XLAC._xla_set_use_full_mat_mul_precision(
-      use_full_mat_mul_precision=True)
+  torch_xla._XLAC._xla_set_mat_mul_precision('highest')
   test = unittest.main()
   sys.exit(0 if test.result.wasSuccessful() else 1)

test/test_mp_distributed_mm.py

Lines changed: 1 addition & 2 deletions
@@ -12,8 +12,7 @@ def _mp_fn(index):
 
   if xm.xla_device_hw(device) in ('TPU', 'CUDA'):
     world_size = xr.world_size()
-    torch_xla._XLAC._xla_set_use_full_mat_mul_precision(
-        use_full_mat_mul_precision=True)
+    torch_xla._XLAC._xla_set_mat_mul_precision('highest')
     torch.manual_seed(11)
     xm.set_rng_state(11)
 
test/test_operations.py

Lines changed: 1 addition & 2 deletions
@@ -3104,8 +3104,7 @@ def test_repeat_special(self):
 if __name__ == '__main__':
   torch.set_default_dtype(torch.float32)
   torch.manual_seed(42)
-  torch_xla._XLAC._xla_set_use_full_mat_mul_precision(
-      use_full_mat_mul_precision=True)
+  torch_xla._XLAC._xla_set_mat_mul_precision('highest')
   test = unittest.main(verbosity=FLAGS.verbosity, exit=False)
   if xu.getenv_as('METRICS_DEBUG', bool, defval=False):
     print(met.metrics_report())

test/test_operations_hlo.py

Lines changed: 1 addition & 2 deletions
@@ -71,8 +71,7 @@ def test_dropout_by_u8_mask(self):
 if __name__ == '__main__':
   torch.set_default_dtype(torch.float32)
   torch.manual_seed(42)
-  torch_xla._XLAC._xla_set_use_full_mat_mul_precision(
-      use_full_mat_mul_precision=True)
+  torch_xla._XLAC._xla_set_mat_mul_precision('highest')
   test = unittest.main(verbosity=FLAGS.verbosity, exit=False)
   if xu.getenv_as('METRICS_DEBUG', bool, defval=False):
     print(met.metrics_report())

test/test_pallas.py

Lines changed: 1 addition & 2 deletions
@@ -925,7 +925,6 @@ def test_flash_attention_sm_scale_backward(self):
   logging.getLogger().setLevel(logging.INFO)
   torch.set_default_dtype(torch.float32)
   torch.manual_seed(42)
-  torch_xla._XLAC._xla_set_use_full_mat_mul_precision(
-      use_full_mat_mul_precision=True)
+  torch_xla._XLAC._xla_set_mat_mul_precision('highest')
   test = unittest.main()
   sys.exit(0 if test.result.wasSuccessful() else 1)

test/test_pallas_spmd.py

Lines changed: 1 addition & 2 deletions
@@ -103,8 +103,7 @@ def test_flash_attention_backward_spmd_data_parallel(self):
   logging.getLogger().setLevel(logging.INFO)
   torch.set_default_dtype(torch.float32)
   torch.manual_seed(42)
-  torch_xla._XLAC._xla_set_use_full_mat_mul_precision(
-      use_full_mat_mul_precision=True)
+  torch_xla._XLAC._xla_set_mat_mul_precision('highest')
   xr.use_spmd()
   test = unittest.main()
   sys.exit(0 if test.result.wasSuccessful() else 1)

torch_xla/csrc/init_python_bindings.cpp

Lines changed: 5 additions & 8 deletions
@@ -1939,14 +1939,11 @@ void InitXlaModuleBindings(py::module m) {
         py::arg("nodes_threshold") = 100, py::arg("device") = "");
   m.def("_xla_memory_info",
         [](const std::string& device) { return GetMemoryInfo(device); });
-  m.def(
-      "_xla_set_use_full_mat_mul_precision",
-      [](bool use_full_mat_mul_precision) {
-        XlaHelpers::set_mat_mul_precision(use_full_mat_mul_precision
-                                              ? xla::PrecisionConfig::HIGHEST
-                                              : xla::PrecisionConfig::DEFAULT);
-      },
-      py::arg("use_full_mat_mul_precision") = true);
+  m.def("_xla_set_mat_mul_precision", [](const std::string& mat_mul_precision) {
+    xla::PrecisionConfig::Precision precision =
+        ConsumeValue(xla::StringToPrecision(mat_mul_precision));
+    XlaHelpers::set_mat_mul_precision(precision);
+  });
 
   py::class_<xla::XlaBuilder, op_builder::BuilderPtr>(m, "XlaBuilder");
   py::class_<op_builder::Op, op_builder::OpPtr>(m, "XlaOp");
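
Note: the new binding parses the string with xla::StringToPrecision before applying it, so the accepted spellings match the lowercase choices exposed by the --matmul-precision flag. A small usage sketch from Python:

```python
import torch_xla

# Any of the three precisions can now be selected directly.
for precision in ('default', 'high', 'highest'):
  torch_xla._XLAC._xla_set_mat_mul_precision(precision)

# A string outside this set is expected to fail inside
# ConsumeValue(StringToPrecision(...)) rather than be silently ignored.
```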
