FujitsuResearch · aki916 · Apr 5, 2026 · Apr 9, 2026 · Apr 12, 2026 · Apr 30, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,13 @@
 # Change log
 
+## [v1.1.1+mps] 2026-05-27
+
+### Apple Silicon / macOS support
+
+- **MPS quantization**: GPTQ (and AutoBit with GPTQ-only candidates) on `device="mps"`; Cholesky-heavy steps run on CPU where MPS lacks support; cross-platform `empty_cache()` via new `onecomp/utils/device.py` (`runner.py`, `quantizer/gptq/_gptq.py`, `quantizer/_quantizer.py`)
+- **MPS inference**: load saved quantized models on Mac with `QuantizedModelLoader` + Transformers `generate()` (GemLite and vLLM remain Linux + CUDA only)
+- **macOS `uv sync`**: added `darwin` to `tool.uv.environments`, Linux-only markers on CUDA extras (`cu118`–`cu130`), documented `--extra cpu` path in `README.md`
+
 ## [v1.1.1] 2026-05-21
 
 ### New Feature: Quantization progress logging

diff --git a/README.md b/README.md
@@ -121,20 +121,34 @@ curl -LsSf https://astral.sh/uv/install.sh | sh
 
 git clone https://github.com/FujitsuResearch/OneCompression.git
 cd OneCompression
-uv sync --extra cu128 --extra dev --extra visualize
 ```
 
 The `uv sync` command creates a Python virtual environment and installs all dependent libraries.
 
+#### Linux (CUDA quantization / vLLM)
+
+```bash
+uv sync --extra cu128 --extra dev --extra visualize
+```
+
 The `--extra cu128` option installs the CUDA-enabled version of PyTorch (along with `torchvision` from the same CUDA index).
 Replace `cu128` with the appropriate variant for your environment: `cpu`, `cu118`, `cu121`, `cu124`, `cu126`, `cu128`, or `cu130`.
 PyTorch will be automatically downloaded by `uv`, so you do not need to install it beforehand.
 
+#### macOS (development / MPS inference)
+
+```bash
+uv sync --extra cpu --extra dev --extra visualize
+```
+
+On macOS, use `--extra cpu` as the PyTorch variant. CUDA extras (`cu118`–`cu130`) and `--extra vllm` are Linux-only.
+After `uv sync`, you can run GPTQ quantization and Hugging Face `generate()` inference on MPS; vLLM serving still requires Linux with an NVIDIA GPU.
+
 Adding `--extra dev` installs development tools (black, pytest, pylint).
 Adding `--extra visualize` installs matplotlib for visualization features.
 Adding `--extra hydra` installs `hydra-core` for the example scripts and `model_validation/` runners that use Hydra-based configuration.
 
-To use vLLM for serving quantized models, add `--extra vllm` together with `--extra cu130`:
+To use vLLM for serving quantized models on Linux, add `--extra vllm` together with `--extra cu130`:
 
 ```bash
 uv sync --extra cu130 --extra dev --extra visualize --extra vllm
@@ -183,7 +197,10 @@ Replace `cu128` with the appropriate variant for your environment: `cpu`, `cu118
 ### Building Documentation Locally
 
 ```bash
+# Linux (run only the `uv sync` line for your platform)
 uv sync --extra cu128 --extra dev --extra docs
+# macOS
+uv sync --extra cpu --extra dev --extra docs
 uv run mkdocs serve
 ```
 

diff --git a/onecomp/qep/_qep_config.py b/onecomp/qep/_qep_config.py
@@ -23,7 +23,8 @@ class QEPConfig:
             Default is 0.01.
         perccorr (float): Correction percentage for error propagation.
             Default is 0.5.
-        device (str): Device to use for QEP computations (e.g., "cuda").
+        device (str): Device to use for QEP computations
+            (e.g., "cuda", "mps", "cpu").
             Default is "cuda:0".
         exclude_layer_keywords (list[str]): List of keywords to identify
             layers excluded from error propagation. Layers whose names

diff --git a/onecomp/qep/_quantize_with_qep_arch.py b/onecomp/qep/_quantize_with_qep_arch.py
@@ -33,6 +33,7 @@
     move_kwargs_to_device,
     expand_kwargs_batch,
 )
+from onecomp.utils.device import empty_cache
 from onecomp.utils.quantization_progress import QuantizationProgressTracker
 
 logger = getLogger(__name__)
@@ -520,6 +521,6 @@ def run_quantize_with_qep_arch(
 
         # free memory
         block_q.cpu()
-        torch.cuda.empty_cache()
+        empty_cache(device)
 
     quantizer.execute_post_processing()
diff --git a/onecomp/quantized_model_loader.py b/onecomp/quantized_model_loader.py
@@ -23,6 +23,7 @@
 from .quantizer.gptq.config import resolve_gptq_layer_wbits, resolve_gptq_layer_group_size
 from .quantizer.gptq.gptq_layer import GPTQLinear
 from .utils.dtype import needs_bfloat16
+from .utils.device import get_default_device
 from .utils.quant_config import get_quant_param
 
 logger = getLogger(__name__)
@@ -167,7 +168,7 @@ def load_quantized_model(
                 device_map_resolved = infer_auto_device_map(model)
                 model = dispatch_model(model, device_map=device_map_resolved)
             except ImportError:
-                model = model.to("cuda" if torch.cuda.is_available() else "cpu")
+                model = model.to(get_default_device())
 
         tokenizer = AutoTokenizer.from_pretrained(
             save_directory,
@@ -229,7 +230,7 @@ def load_quantized_model_pt(
                 device_map_resolved = infer_auto_device_map(model)
                 model = dispatch_model(model, device_map=device_map_resolved)
             except ImportError:
-                model = model.to("cuda" if torch.cuda.is_available() else "cpu")
+                model = model.to(get_default_device())
 
         tokenizer = AutoTokenizer.from_pretrained(
             save_directory,

diff --git a/onecomp/quantizer/_quantizer.py b/onecomp/quantizer/_quantizer.py
@@ -17,6 +17,20 @@
 import torch
 from torch.nn import Linear, Conv2d, Conv1d
 
+from onecomp.utils.device import empty_cache
+
+
+def _safe_cholesky_and_solve(hessian, rhs):  # solve hessian @ X = rhs.T via Cholesky
+    if hessian.device.type == "mps":  # MPS tensors: factorize and solve on CPU instead
+        hessian_cpu = hessian.cpu()  # both operands are copied to CPU for the solve
+        rhs_cpu = rhs.cpu()
+        cholesky = torch.linalg.cholesky(hessian_cpu)
+        delta_weight = torch.cholesky_solve(rhs_cpu.t(), cholesky).to(hessian.device)
+    else:  # any other device: factorize and solve where the tensors already live
+        cholesky = torch.linalg.cholesky(hessian)
+        delta_weight = torch.cholesky_solve(rhs.t(), cholesky)
+    return delta_weight  # untransposed solution; adjust_weight applies .t() itself
+
 
 @dataclass
 class QuantizationResult:
@@ -222,7 +236,7 @@ def quantize(
         result.quantization_time = end_time - start_time
 
         self.results[name] = result
-        torch.cuda.empty_cache()
+        empty_cache()
 
         if self.calc_quant_error:
             # Record quantization error
@@ -268,7 +282,7 @@ def quantize_with_qep(
                 percdamp=percdamp,
                 perccorr=perccorr,
             )
-            torch.cuda.empty_cache()
+            empty_cache()
 
         self.logger.debug("Quantizing layer: %s", name)
         result = self.quantize_layer(module, quant_input_activation, hessian=hessian)
@@ -282,7 +296,7 @@ def quantize_with_qep(
         result.quantization_time = end_time - start_time
 
         self.results[name] = result
-        torch.cuda.empty_cache()
+        empty_cache()
 
         if self.calc_quant_error:
             # Record quantization error
@@ -318,7 +332,7 @@ def _record_quantization_error(
             result.relative_weight_squared_error,
         ) = self.calculate_weight_quantization_error(module, dequantized_weight)
 
-        torch.cuda.empty_cache()
+        empty_cache()
 
     def adjust_weight(
         self,
@@ -363,9 +377,8 @@ def adjust_weight(
         damp = percdamp * torch.mean(torch.diag(hessian))
         diag = torch.arange(hessian.shape[0], device=hessian.device)
         hessian[diag, diag] += damp
-        cholesky = torch.linalg.cholesky(hessian)
         rhs = weight @ delta_hatX
-        delta_weight = torch.cholesky_solve(rhs.t(), cholesky).t()
+        delta_weight = _safe_cholesky_and_solve(hessian, rhs).t()
         weight = weight + (perccorr * delta_weight)
 
         if isinstance(module, Conv1d):
@@ -908,7 +921,7 @@ def calculate_output_quantization_error(
 
             del batch_diff, batch_X_T
 
-        torch.cuda.empty_cache()
+        empty_cache()
 
         # MSE = output_squared_error / (out_features * total_samples)
         mean_output_squared_error = output_squared_error / num_elements

diff --git a/onecomp/quantizer/autobit/activation_stats.py b/onecomp/quantizer/autobit/activation_stats.py
@@ -16,6 +16,7 @@
     forward_input,
     move_kwargs_to_device,
 )
+from onecomp.utils.device import get_default_device, empty_cache
 
 
 def _find_head_modules(model, blocks):
@@ -90,14 +91,14 @@ def collect_activation_stats_blockwise(
     from onecomp.calibration import prepare_calibration_dataset
 
     if device is None:
-        device = torch.device("cuda")
+        device = get_default_device()
 
     original_device = next(model.parameters()).device
     if original_device.type != "cpu":
         if logger:
             logger.info("Moving model to CPU for block-wise activation collection")
         model.to("cpu")
-        torch.cuda.empty_cache()
+        empty_cache(original_device)
 
     model_id = getattr(model.config, "_name_or_path", None)
     tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
@@ -155,7 +156,7 @@ def collect_activation_stats_blockwise(
         for h in hooks:
             h.remove()
         block.cpu()
-        torch.cuda.empty_cache()
+        empty_cache(device)
 
     # Collect b_diag
     if use_curvature_b:
@@ -206,7 +207,7 @@ def collect_activation_stats_blockwise(
             for h in hooks:
                 h.remove()
             block.cpu()
-            torch.cuda.empty_cache()
+            empty_cache(device)
 
     a_diag = {}
     b_diag = {}
@@ -274,6 +275,6 @@ def _compute_loss_grad(final_hidden, norm, lm_head, input_ids, device):
 
     norm.cpu()
     lm_head.cpu()
-    torch.cuda.empty_cache()
+    empty_cache(device)
 
     return torch.cat(all_grads)
diff --git a/onecomp/quantizer/gptq/_gptq.py b/onecomp/quantizer/gptq/_gptq.py
@@ -21,6 +21,7 @@
 
 from onecomp.quantizer._quantizer import Quantizer, QuantizationResult
 from onecomp.utils.quant_config import get_quant_param
+from onecomp.utils.device import empty_cache
 
 
 @dataclass
@@ -474,6 +475,11 @@ def _compute_inverse_hessian(
     Cholesky decomposition fails (non-positive-definite), progressively
     increases damping and retries up to *max_retries* times.
 
+    Note:
+        This function uses torch.linalg.cholesky / torch.cholesky_inverse
+        directly (without MPS-safe wrappers) because the caller (run_gptq)
+        moves hessian to CPU before calling this function when on MPS.
+
     Args:
         hessian: Square Hessian matrix (modified in-place).
         percdamp: Base damping as a fraction of the mean diagonal.
@@ -491,7 +497,7 @@ def _compute_inverse_hessian(
         try:
             cholesky_lower = torch.linalg.cholesky(hessian)
             break
-        except torch._C._LinAlgError:
+        except (torch._C._LinAlgError, RuntimeError):
             damp_scale *= 10.0
             extra = damp_scale * damp
             hessian[diag, diag] += extra
@@ -543,6 +549,11 @@ def run_gptq(  # pylint: disable=too-many-positional-arguments
     )
 
     matrix_W = layer.weight.data.clone()
+
+    if hessian.device.type == "mps":
+        hessian = hessian.cpu()
+        matrix_W = matrix_W.to("cpu")
+
     if isinstance(layer, nn.Conv2d):
         matrix_W = matrix_W.flatten(1)
     if isinstance(layer, Conv1D):
@@ -643,9 +654,10 @@ def run_gptq(  # pylint: disable=too-many-positional-arguments
         zero = quantizer.zero.to(dtype=torch.int32, device="cpu")
     perm = perm.cpu() if perm is not None else None
 
+    _device = quantized_weight.device
     del hessian, Hinv, matrix_W, Q_int
     gc.collect()
-    torch.cuda.empty_cache()
+    empty_cache(_device)
 
     return {
         "qweight": quantized_weight,

diff --git a/onecomp/quantizer/gptq/gptq_layer.py b/onecomp/quantizer/gptq/gptq_layer.py
@@ -228,7 +228,7 @@ def __init__(  # pylint: disable=too-many-positional-arguments
         zero: torch.Tensor,  # FP16
         perm: Optional[torch.Tensor] = None,  # INT64
         bias: Optional[torch.Tensor] = None,
-        device: str = "cuda",
+        device: Union[str, torch.device] = "cuda",
         pack_weights: bool = True,  # Pack INT weights for memory efficiency
         use_gemlite: Optional[bool] = None,  # GemLite flag
     ):