Skip to content

Commit 21c43fe

Browse files
authored
Merge pull request pytorch#9 from anzr299/an/ovquantizer
[OVQuantizer] Apply Fixes and Integrate into the Llama Example Workflow
2 parents 291dcd9 + d744ae9 commit 21c43fe

7 files changed

Lines changed: 489 additions & 415 deletions

File tree

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
1-
from .quantizer import OpenVINOQuantizer, quantize_model
1+
from .quantizer import OpenVINOQuantizer, quantize_model, QuantizationMode
22

3-
__all__ = ["OpenVINOQuantizer", "quantize_model"]
3+
__all__ = ["OpenVINOQuantizer", "quantize_model", "QuantizationMode"]
Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
# Copyright (c) Intel Corporation
2+
#
3+
# Licensed under the BSD License (the "License"); you may not use this file
4+
# except in compliance with the License. See the license file found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
# mypy: disable-error-code=import-not-found
8+
9+
from abc import ABC, abstractmethod
10+
from typing import Optional, Tuple
11+
12+
import torch
13+
14+
from nncf.experimental.torch.fx.node_utils import ( # type: ignore[import-untyped]
15+
get_tensor_constant_from_node,
16+
)
17+
from nncf.experimental.torch.fx.transformations import ( # type: ignore[import-untyped]
18+
constant_update,
19+
module_insertion,
20+
node_removal,
21+
)
22+
from nncf.quantization.algorithms.weight_compression.config import ( # type: ignore[import-untyped]
23+
WeightCompressionParameters,
24+
)
25+
from nncf.quantization.algorithms.weight_compression.weight_lowering import ( # type: ignore[import-untyped]
26+
do_integer_quantization,
27+
)
28+
from nncf.tensor.tensor import Tensor as NNCFTensor # type: ignore[import-untyped]
29+
from nncf.torch.graph.transformations.commands import ( # type: ignore[import-untyped]
30+
PTTargetPoint,
31+
TargetType,
32+
)
33+
from nncf.torch.quantization.layers import ( # type: ignore[import-untyped]
34+
BaseWeightsDecompressor,
35+
INT4AsymmetricWeightsDecompressor,
36+
INT4SymmetricWeightsDecompressor,
37+
INT8AsymmetricWeightsDecompressor,
38+
INT8SymmetricWeightsDecompressor,
39+
)
40+
from torchao.quantization.pt2e import ObserverBase
41+
42+
43+
class WeightObserverBase(ObserverBase, ABC):
    """
    Common base for NNCF weight observers that compress a layer weight into
    the OpenVINO representation.

    Concrete observers only have to supply the decompressor through
    `_create_decompressor`; quantization and graph rewriting live here.
    """

    def __init__(
        self,
        wc_param: WeightCompressionParameters,
        dtype: torch.dtype,
        **kwargs,
    ) -> None:
        """
        :param wc_param: Weight compression parameters container.
        :param dtype: Target dtype for the quantization.
        :param kwargs: Extra keyword arguments; accepted but not used here.
        """
        super().__init__(dtype=dtype, is_dynamic=False)
        self._wc_param = wc_param

    def calculate_qparams(  # type: ignore[override]
        self,
        weight: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
        """
        Quantizes the given weight with the stored compression parameters.

        :param weight: FP weight to be used for calculating qparams.
        :return: A tuple of (quantized weight, scale, zero point). The zero point
            is None when the quantization routine does not produce one.
        """
        params = self._wc_param
        compressed, scale, zero_point = do_integer_quantization(
            NNCFTensor(weight),
            params.compression_config,
            reduction_axes=params.reduction_axes,
        )
        # Unwrap the NNCF tensor wrappers; the zero point is optional.
        if zero_point is not None:
            zero_point = zero_point.data
        return compressed.data, scale.data, zero_point

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Returns the input unchanged.

        :param x: Tensor passed through the observer.
        :return: The same tensor `x`.
        """
        return x

    def convert(
        self, model: torch.fx.GraphModule, observer_node: torch.fx.Node
    ) -> None:
        """
        Swaps the given observer node for a packed quantized weight followed by
        an OpenVINO specific decompression module.

        :param model: A `torch.fx.GraphModule` representing the statically traced
            model with observer nodes attached and calibrated.
        :param observer_node: The `torch.fx.Node` of the observer module attached
            to the weight that is being compressed.
        """
        fp_weight = get_tensor_constant_from_node(observer_node.args[0], model)
        compressed, scale, zero_point = self.calculate_qparams(fp_weight)
        decompressor = self._create_decompressor(
            scale, zero_point, compressed, fp_weight
        )

        # The observer wraps exactly one weight, so that weight is on input port 0.
        constant_update(
            model,
            observer_node,
            decompressor.pack_weight(compressed),
            input_port_id=0,
        )

        # NOTE(review): the input node name is deliberately read only after the
        # constant update above; presumably the update can rewire the observer's
        # input, so keep this ordering.
        compressed_weight_name = observer_node.all_input_nodes[0].name
        name_parts = compressed_weight_name.replace(".", "_").split("_")
        # Drop the last two "_"-separated pieces of the node name.
        decompressor_suffix = "_".join(name_parts[:-2])
        decompressor_name = f"{decompressor.quantization_mode}_weights_decompressor_{decompressor_suffix}"

        post_hook = PTTargetPoint(
            TargetType.OPERATOR_POST_HOOK,
            target_node_name=compressed_weight_name,
        )
        module_insertion(model, decompressor, [post_hook], decompressor_name)
        node_removal(model, observer_node, 0)

    @abstractmethod
    def _create_decompressor(
        self,
        scale: torch.Tensor,
        zero_point: Optional[torch.Tensor],
        q_weight: torch.Tensor,
        original_weight: torch.Tensor,
    ) -> BaseWeightsDecompressor:
        """
        Builds the NNCF decompressor matching the kind of quantization performed.

        :param scale: Calculated scale quantization parameter.
        :param zero_point: Calculated zero_point quantization parameter, or None.
        :param q_weight: Calculated quantized weight.
        :param original_weight: FP weight.
        :return: NNCF decompressor module that `convert` inserts after the
            compressed weight.
        """
143+
144+
145+
class INT4WeightObserver(WeightObserverBase):
    """
    Weight observer for OpenVINO INT4 weight compression.
    """

    def _create_decompressor(
        self,
        scale: torch.Tensor,
        zero_point: Optional[torch.Tensor],
        q_weight: torch.Tensor,
        original_weight: torch.Tensor,
    ) -> BaseWeightsDecompressor:
        """
        Picks the INT4 decompressor based on whether a zero point is present.

        :param scale: Calculated scale quantization parameter.
        :param zero_point: Calculated zero point, or None.
        :param q_weight: Calculated quantized weight; only its shape is used.
        :param original_weight: FP weight; its shape and dtype are used.
        :return: The asymmetric INT4 decompressor when a zero point is given,
            otherwise the symmetric one.
        """
        # Trailing arguments shared by both decompressor variants.
        shapes_and_dtype = (
            q_weight.shape,
            original_weight.shape,
            original_weight.dtype,
        )
        if zero_point is not None:
            return INT4AsymmetricWeightsDecompressor(
                scale, zero_point, *shapes_and_dtype
            )
        return INT4SymmetricWeightsDecompressor(scale, *shapes_and_dtype)
168+
169+
170+
class INT8WeightObserver(WeightObserverBase):
    """
    Per channel weight observer for OpenVINO INT8 weight compression.
    """

    def _create_decompressor(
        self,
        scale: torch.Tensor,
        zero_point: Optional[torch.Tensor],
        q_weight: torch.Tensor,
        original_weight: torch.Tensor,
    ) -> BaseWeightsDecompressor:
        """
        Picks the INT8 decompressor based on whether a zero point is present.

        :param scale: Calculated scale quantization parameter.
        :param zero_point: Calculated zero point, or None.
        :param q_weight: Calculated quantized weight; not used by this observer.
        :param original_weight: FP weight; only its dtype is used.
        :return: The asymmetric INT8 decompressor when a zero point is given,
            otherwise the symmetric one.
        """
        result_dtype = original_weight.dtype
        if zero_point is not None:
            return INT8AsymmetricWeightsDecompressor(
                scale, zero_point, result_dtype
            )
        return INT8SymmetricWeightsDecompressor(scale, result_dtype)

backends/openvino/quantizer/observers/nncf_observers.py

Lines changed: 0 additions & 176 deletions
This file was deleted.

0 commit comments

Comments
 (0)