Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# Change log

## [v1.1.1+mps] 2026-05-27

### Apple Silicon / macOS support

- **MPS quantization**: GPTQ (and AutoBit with GPTQ-only candidates) on `device="mps"`; Cholesky-heavy steps run on CPU where MPS lacks support; cross-platform `empty_cache()` via new `onecomp/utils/device.py` (`runner.py`, `quantizer/gptq/_gptq.py`, `quantizer/_quantizer.py`)
- **MPS inference**: load saved quantized models on Mac with `QuantizedModelLoader` + Transformers `generate()` (GemLite/vLLM remain Linux + CUDA)
- **macOS `uv sync`**: added `darwin` to `tool.uv.environments`, restricted the CUDA extras (`cu118`–`cu130`) with Linux-only markers, and documented the `--extra cpu` path in `README.md`

## [v1.1.1] 2026-05-21

### New Feature: Quantization progress logging
Expand Down
21 changes: 19 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,20 +121,34 @@ curl -LsSf https://astral.sh/uv/install.sh | sh

git clone https://github.com/FujitsuResearch/OneCompression.git
cd OneCompression
uv sync --extra cu128 --extra dev --extra visualize
```

The `uv sync` command creates a Python virtual environment and installs all dependent libraries.

#### Linux (CUDA quantization / vLLM)

```bash
uv sync --extra cu128 --extra dev --extra visualize
```

The `--extra cu128` option installs the CUDA-enabled version of PyTorch (along with `torchvision` from the same CUDA index).
Replace `cu128` with the appropriate variant for your environment: `cpu`, `cu118`, `cu121`, `cu124`, `cu126`, `cu128`, or `cu130`.
PyTorch will be automatically downloaded by `uv`, so you do not need to install it beforehand.

#### macOS (development / MPS inference)

```bash
uv sync --extra cpu --extra dev --extra visualize
```

On macOS, `cpu` is the only supported PyTorch variant, so always pass `--extra cpu`. The CUDA extras (`cu118`–`cu130`) and `--extra vllm` are Linux-only.
After `uv sync`, you can run GPTQ quantization and Hugging Face `generate()` inference on MPS; vLLM serving still requires Linux with an NVIDIA GPU.

Adding `--extra dev` installs development tools (black, pytest, pylint).
Adding `--extra visualize` installs matplotlib for visualization features.
Adding `--extra hydra` installs `hydra-core` for the example scripts and `model_validation/` runners that use Hydra-based configuration.

To use vLLM for serving quantized models, add `--extra vllm` together with `--extra cu130`:
To use vLLM for serving quantized models on Linux, add `--extra vllm` together with `--extra cu130`:

```bash
uv sync --extra cu130 --extra dev --extra visualize --extra vllm
Expand Down Expand Up @@ -183,7 +197,10 @@ Replace `cu128` with the appropriate variant for your environment: `cpu`, `cu118
### Building Documentation Locally

```bash
# Linux
uv sync --extra cu128 --extra dev --extra docs
# macOS
uv sync --extra cpu --extra dev --extra docs
uv run mkdocs serve
```

Expand Down
3 changes: 2 additions & 1 deletion onecomp/qep/_qep_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ class QEPConfig:
Default is 0.01.
perccorr (float): Correction percentage for error propagation.
Default is 0.5.
device (str): Device to use for QEP computations (e.g., "cuda").
device (str): Device to use for QEP computations
(e.g., "cuda", "mps", "cpu").
Default is "cuda:0".
exclude_layer_keywords (list[str]): List of keywords to identify
layers excluded from error propagation. Layers whose names
Expand Down
3 changes: 2 additions & 1 deletion onecomp/qep/_quantize_with_qep_arch.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
move_kwargs_to_device,
expand_kwargs_batch,
)
from onecomp.utils.device import empty_cache
from onecomp.utils.quantization_progress import QuantizationProgressTracker

logger = getLogger(__name__)
Expand Down Expand Up @@ -520,6 +521,6 @@ def run_quantize_with_qep_arch(

# free memory
block_q.cpu()
torch.cuda.empty_cache()
empty_cache(device)

quantizer.execute_post_processing()
5 changes: 3 additions & 2 deletions onecomp/quantized_model_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from .quantizer.gptq.config import resolve_gptq_layer_wbits, resolve_gptq_layer_group_size
from .quantizer.gptq.gptq_layer import GPTQLinear
from .utils.dtype import needs_bfloat16
from .utils.device import get_default_device
from .utils.quant_config import get_quant_param

logger = getLogger(__name__)
Expand Down Expand Up @@ -167,7 +168,7 @@ def load_quantized_model(
device_map_resolved = infer_auto_device_map(model)
model = dispatch_model(model, device_map=device_map_resolved)
except ImportError:
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(get_default_device())

tokenizer = AutoTokenizer.from_pretrained(
save_directory,
Expand Down Expand Up @@ -229,7 +230,7 @@ def load_quantized_model_pt(
device_map_resolved = infer_auto_device_map(model)
model = dispatch_model(model, device_map=device_map_resolved)
except ImportError:
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(get_default_device())

tokenizer = AutoTokenizer.from_pretrained(
save_directory,
Expand Down
27 changes: 20 additions & 7 deletions onecomp/quantizer/_quantizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,20 @@
import torch
from torch.nn import Linear, Conv2d, Conv1d

from onecomp.utils.device import empty_cache


def _safe_cholesky_and_solve(hessian, rhs):
    """Solve ``hessian @ X = rhs.T`` via a Cholesky factorization.

    When ``hessian`` lives on an MPS device, both operands are copied to the
    CPU, the factorization and the solve run there, and the solution is moved
    back to the original device. On any other device the computation runs
    where the tensors already are.

    Args:
        hessian (torch.Tensor): Square matrix handed to
            ``torch.linalg.cholesky``.
        rhs (torch.Tensor): Right-hand side. It is transposed before the
            solve, so the caller passes it untransposed.

    Returns:
        torch.Tensor: The solution ``X``, located on ``hessian``'s original
        device. It is NOT transposed back; the caller does that if needed.

    Raises:
        Whatever ``torch.linalg.cholesky`` / ``torch.cholesky_solve`` raise
        (e.g. when the factorization fails); nothing is caught here.
    """
    target_device = hessian.device
    on_mps = target_device.type == "mps"

    if on_mps:
        # Run the Cholesky steps on the CPU instead of on MPS.
        hessian, rhs = hessian.cpu(), rhs.cpu()

    factor = torch.linalg.cholesky(hessian)
    solution = torch.cholesky_solve(rhs.t(), factor)

    # Only the MPS path left the original device, so only it is moved back.
    return solution.to(target_device) if on_mps else solution


@dataclass
class QuantizationResult:
Expand Down Expand Up @@ -222,7 +236,7 @@ def quantize(
result.quantization_time = end_time - start_time

self.results[name] = result
torch.cuda.empty_cache()
empty_cache()

if self.calc_quant_error:
# Record quantization error
Expand Down Expand Up @@ -268,7 +282,7 @@ def quantize_with_qep(
percdamp=percdamp,
perccorr=perccorr,
)
torch.cuda.empty_cache()
empty_cache()

self.logger.debug("Quantizing layer: %s", name)
result = self.quantize_layer(module, quant_input_activation, hessian=hessian)
Expand All @@ -282,7 +296,7 @@ def quantize_with_qep(
result.quantization_time = end_time - start_time

self.results[name] = result
torch.cuda.empty_cache()
empty_cache()

if self.calc_quant_error:
# Record quantization error
Expand Down Expand Up @@ -318,7 +332,7 @@ def _record_quantization_error(
result.relative_weight_squared_error,
) = self.calculate_weight_quantization_error(module, dequantized_weight)

torch.cuda.empty_cache()
empty_cache()

def adjust_weight(
self,
Expand Down Expand Up @@ -363,9 +377,8 @@ def adjust_weight(
damp = percdamp * torch.mean(torch.diag(hessian))
diag = torch.arange(hessian.shape[0], device=hessian.device)
hessian[diag, diag] += damp
cholesky = torch.linalg.cholesky(hessian)
rhs = weight @ delta_hatX
delta_weight = torch.cholesky_solve(rhs.t(), cholesky).t()
delta_weight = _safe_cholesky_and_solve(hessian, rhs).t()
weight = weight + (perccorr * delta_weight)

if isinstance(module, Conv1d):
Expand Down Expand Up @@ -908,7 +921,7 @@ def calculate_output_quantization_error(

del batch_diff, batch_X_T

torch.cuda.empty_cache()
empty_cache()

# MSE = output_squared_error / (out_features * total_samples)
mean_output_squared_error = output_squared_error / num_elements
Expand Down
11 changes: 6 additions & 5 deletions onecomp/quantizer/autobit/activation_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
forward_input,
move_kwargs_to_device,
)
from onecomp.utils.device import get_default_device, empty_cache


def _find_head_modules(model, blocks):
Expand Down Expand Up @@ -90,14 +91,14 @@ def collect_activation_stats_blockwise(
from onecomp.calibration import prepare_calibration_dataset

if device is None:
device = torch.device("cuda")
device = get_default_device()

original_device = next(model.parameters()).device
if original_device.type != "cpu":
if logger:
logger.info("Moving model to CPU for block-wise activation collection")
model.to("cpu")
torch.cuda.empty_cache()
empty_cache(original_device)

model_id = getattr(model.config, "_name_or_path", None)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
Expand Down Expand Up @@ -155,7 +156,7 @@ def collect_activation_stats_blockwise(
for h in hooks:
h.remove()
block.cpu()
torch.cuda.empty_cache()
empty_cache(device)

# Collect b_diag
if use_curvature_b:
Expand Down Expand Up @@ -206,7 +207,7 @@ def collect_activation_stats_blockwise(
for h in hooks:
h.remove()
block.cpu()
torch.cuda.empty_cache()
empty_cache(device)

a_diag = {}
b_diag = {}
Expand Down Expand Up @@ -274,6 +275,6 @@ def _compute_loss_grad(final_hidden, norm, lm_head, input_ids, device):

norm.cpu()
lm_head.cpu()
torch.cuda.empty_cache()
empty_cache(device)

return torch.cat(all_grads)
16 changes: 14 additions & 2 deletions onecomp/quantizer/gptq/_gptq.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

from onecomp.quantizer._quantizer import Quantizer, QuantizationResult
from onecomp.utils.quant_config import get_quant_param
from onecomp.utils.device import empty_cache


@dataclass
Expand Down Expand Up @@ -474,6 +475,11 @@ def _compute_inverse_hessian(
Cholesky decomposition fails (non-positive-definite), progressively
increases damping and retries up to *max_retries* times.

Note:
This function uses torch.linalg.cholesky / torch.cholesky_inverse
directly (without MPS-safe wrappers) because the caller (run_gptq)
moves hessian to CPU before calling this function when on MPS.

Args:
hessian: Square Hessian matrix (modified in-place).
percdamp: Base damping as a fraction of the mean diagonal.
Expand All @@ -491,7 +497,7 @@ def _compute_inverse_hessian(
try:
cholesky_lower = torch.linalg.cholesky(hessian)
break
except torch._C._LinAlgError:
except (torch._C._LinAlgError, RuntimeError):
damp_scale *= 10.0
extra = damp_scale * damp
hessian[diag, diag] += extra
Expand Down Expand Up @@ -543,6 +549,11 @@ def run_gptq( # pylint: disable=too-many-positional-arguments
)

matrix_W = layer.weight.data.clone()

if hessian.device.type == "mps":
hessian = hessian.cpu()
matrix_W = matrix_W.to("cpu")

if isinstance(layer, nn.Conv2d):
matrix_W = matrix_W.flatten(1)
if isinstance(layer, Conv1D):
Expand Down Expand Up @@ -643,9 +654,10 @@ def run_gptq( # pylint: disable=too-many-positional-arguments
zero = quantizer.zero.to(dtype=torch.int32, device="cpu")
perm = perm.cpu() if perm is not None else None

_device = quantized_weight.device
del hessian, Hinv, matrix_W, Q_int
gc.collect()
torch.cuda.empty_cache()
empty_cache(_device)

return {
"qweight": quantized_weight,
Expand Down
2 changes: 1 addition & 1 deletion onecomp/quantizer/gptq/gptq_layer.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ def __init__( # pylint: disable=too-many-positional-arguments
zero: torch.Tensor, # FP16
perm: Optional[torch.Tensor] = None, # INT64
bias: Optional[torch.Tensor] = None,
device: str = "cuda",
device: Union[str, torch.device] = "cuda",
pack_weights: bool = True, # Pack INT weights for memory efficiency
use_gemlite: Optional[bool] = None, # GemLite flag
):
Expand Down
Loading