diff --git a/src/brevitas/graph/gpfq.py b/src/brevitas/graph/gpfq.py
index e255660a0..7e80f61cb 100644
--- a/src/brevitas/graph/gpfq.py
+++ b/src/brevitas/graph/gpfq.py
@@ -10,11 +10,14 @@
 import unfoldNd
 
 from brevitas.function import get_upper_bound_on_l1_norm
+from brevitas.graph.calibrate import disable_return_quant_tensor
+from brevitas.graph.calibrate import restore_return_quant_tensor
 from brevitas.graph.gpxq import GPxQ
 from brevitas.graph.gpxq import gpxq_mode
 from brevitas.graph.gpxq import StopFwdException
 from brevitas.graph.gpxq import SUPPORTED_CONV_OP
 import brevitas.nn as qnn
+from brevitas.quant_tensor import QuantTensor
 
 
 class gpfq_mode(gpxq_mode):
@@ -89,6 +92,7 @@ def catch_stopfwd(self, *args, **kwargs):
             pass
 
         # Disable quantization
+        self.return_quant_tensor_state = disable_return_quant_tensor(self.model)
         self.disable_quant_inference.disable_param_quantization(self.model, is_training=False)
         self.disable_quant_inference.disable_act_quantization(self.model, is_training=False)
         # Collect float input
@@ -104,6 +108,7 @@ def catch_stopfwd(self, *args, **kwargs):
             self.disable_quant_inference.enable_act_quantization(self.model, is_training=False)
         else:
             self.disable_quant_inference.disable_bias_quantization(self.model, is_training=False)
+        restore_return_quant_tensor(self.model, self.return_quant_tensor_state)
 
         if self.return_forward_output:
             # If we want to return the output of the network, we need to disable all hooks
@@ -155,7 +160,7 @@ def update_batch(self, module, input, current_layer):
 
         # Update reference to current layer
         current_layer.layer_names.add(self.name)
-        is_quant_disabled = module.weight_quant.disable_quant
+        is_quant_enabled = module.weight_quant.is_quant_enabled
         inp = self.process_input(input)
         batch_size = inp.shape[0]
 
@@ -210,7 +215,7 @@ def update_batch(self, module, input, current_layer):
         inp_processed.append(inp)
         inp_processed = torch.stack(inp_processed)
 
-        if is_quant_disabled:
+        if not is_quant_enabled:
             if self.float_input is None:
                 self.float_input = inp_processed
             else:
@@ -229,6 +234,7 @@ def update_batch(self, module, input, current_layer):
             raise StopFwdException
 
     def single_layer_update(self):
+        assert not self.layer.weight_quant_requires_quant_input, "Error: GPFQ does not support weight quantizers that require quantized inputs."
         weight = self.layer.weight.data
         dev = weight.device
         dtype = weight.dtype
@@ -302,13 +308,36 @@ def __init__(
             p=p)
         self.accumulator_bit_width = accumulator_bit_width
         assert self.accumulator_bit_width is not None
-        self.requires_quant_input = True  # force true
+
+    def process_input(self, inp):
+        inp = super().process_input(inp)
+        inp = self.layer.input_quant(inp)
+
+        is_quant_enabled = self.layer.weight_quant.is_quant_enabled
+
+        # If using quantized activations, inp could be QuantTensor. In
+        # this case, we overwrite the metadata.
+        if isinstance(inp, QuantTensor):
+            if is_quant_enabled and self.quant_input is None:
+                self.quant_input = QuantTensor(
+                    value=torch.empty(
+                        1, dtype=self.layer.weight.dtype, device=self.layer.weight.device),
+                    scale=inp.scale,
+                    zero_point=inp.zero_point,
+                    bit_width=inp.bit_width,
+                    signed=inp.signed,
+                    training=inp.training)
+            inp = inp.value
+
+        return inp
 
     def single_layer_update(self):
         # raise error in case no quant-input is here
         if self.quant_input is None:
-            raise ValueError(
-                'Expected quant input to calculate L1-norm upper bound, but received None')
+            raise ValueError('Expected self.quant_input to calculate L1-norm upper bound, but received None. ' + \
+                'Make sure that either the input to the model is a QuantTensor or the layer has an input quant enabled. ' \
+                'Also, check if `use_quant_activations=True` in `gpfq_mode` when `accumulator_bit_width` is specified. ' + \
+                'Alternatively, provide a custom `a2q_layer_filter_fnc` to `gpfq_mode` to filter layers without a quant_tensor input.')
         weight = self.layer.weight.data
         dev = weight.device
         dtype = weight.dtype
@@ -328,7 +357,8 @@ def single_layer_update(self):
         T = get_upper_bound_on_l1_norm(
             torch.tensor(self.accumulator_bit_width), input_bit_width, input_is_signed)
         s = self.layer.quant_weight_scale()
-        s = s.view(self.groups, -1)  # [Groups, OC/Groups]
+        if s.ndim > 1:
+            s = s.view(self.groups, -1)  # [Groups, OC/Groups]
 
         # initialize cumulative l1-norm
         z = torch.zeros(weight.shape[:-1], device=dev)
@@ -362,8 +392,8 @@ def single_layer_update(self):
                     else:
                         q_arg = torch.zeros_like(U[group_index, :, 0])
 
-                    max_q_arg = s[group_index, :] * torch.clamp_min(T - z[group_index, :], 0.)
-                    q_arg = q_arg.sign() * torch.clamp_max(q_arg.abs(), max_q_arg)
+                    max_q_arg = s * torch.clamp_min(T - z, 0.)
+                    q_arg = q_arg.sign() * torch.clamp_max(q_arg.abs(), max_q_arg[group_index, :])
                 weight[group_index, :, permutation_list[group_index][t]] = q_arg
             q = self.get_quant_weights(t, 0, permutation_list)
             z += q.abs() / s  # increment cumulative l1-norm
diff --git a/src/brevitas/graph/gptq.py b/src/brevitas/graph/gptq.py
index 28cb12cd6..9c466f0eb 100644
--- a/src/brevitas/graph/gptq.py
+++ b/src/brevitas/graph/gptq.py
@@ -3,9 +3,10 @@
 
 from copy import deepcopy
 import math
-from typing import List, Optional, Set
+from typing import List, Optional
 import warnings
 
+from packaging import version
 import torch
 
 try:
@@ -14,6 +15,7 @@
     LinAlgError = RuntimeError
 import unfoldNd
 
+from brevitas import torch_version
 from brevitas.graph.gpxq import GPxQ
 from brevitas.graph.gpxq import gpxq_mode
 from brevitas.graph.gpxq import StopFwdException
@@ -133,6 +135,8 @@ def __init__(
             dtype=torch.float32)
         self.nsamples = 0
 
+        assert torch_version >= version.parse('1.10'), "GPTQ requires torch 1.10 or higher"
+
     def update_batch(self, module, input, current_layer):
         if self.disable_pre_forward_hook:
             return input
@@ -188,6 +192,7 @@ def update_batch(self, module, input, current_layer):
         raise StopFwdException
 
     def single_layer_update(self, percdamp=.01):
+        assert not self.layer.weight_quant_requires_quant_input, "Error: GPTQ does not support weight quantizers that require quantized inputs."
         if hasattr(self.layer, 'allocate_params'):
             self.layer.allocate_params(self.layer)
         weight = self.layer.weight.data
diff --git a/src/brevitas/graph/gpxq.py b/src/brevitas/graph/gpxq.py
index e9641a5a8..dc9d6e19b 100644
--- a/src/brevitas/graph/gpxq.py
+++ b/src/brevitas/graph/gpxq.py
@@ -11,13 +11,13 @@
 from typing import List, Optional, Set
 import warnings
 
-import torch
 from torch.fx import GraphModule as TorchGraphModule
 
 from brevitas.fx import GraphModule
+from brevitas.graph.calibrate import disable_return_quant_tensor
 from brevitas.graph.calibrate import DisableEnableQuantization
+from brevitas.graph.calibrate import restore_return_quant_tensor
 import brevitas.nn as qnn
-from brevitas.quant_tensor import QuantTensor
 
 SUPPORTED_CONV_OP = (
     qnn.QuantConv2d, qnn.QuantConv1d, qnn.QuantConvTranspose1d, qnn.QuantConvTranspose2d)
@@ -87,6 +87,7 @@ def __init__(
 
         # How many subblock to use during GPTQ for each layer
         self.disable_quant_inference = DisableEnableQuantization()
+        self.return_quant_tensor_state = dict()
         self.group_of_parallel_layers = group_of_parallel_layers
         self.return_forward_output = return_forward_output
 
@@ -146,6 +147,7 @@ def __enter__(self):
             self.gpxq_layers[name] = gpxq_module_optimizer
         if not self.use_quant_activations:
+            self.return_quant_tensor_state = disable_return_quant_tensor(self.model)
             self.disable_quant_inference.disable_act_quantization(
                 self.model, is_training=self.model.training)
             self.disable_quant_inference.disable_bias_quantization(
                 self.model, is_training=self.model.training)
@@ -165,6 +167,7 @@ def __exit__(self, type, value, traceback):
                 self.model, is_training=self.model.training)
             self.disable_quant_inference.enable_bias_quantization(
                 self.model, is_training=self.model.training)
+        restore_return_quant_tensor(self.model, self.return_quant_tensor_state)
 
     def update(self):
         for name in self.current_layer.layer_names:
@@ -207,55 +210,11 @@ def __init__(
         self.disable_pre_forward_hook = False
         # Some layers require knowledge from quant inputs to compute quant weights
         self.quant_input = None
-        self.requires_quant_input = False  # For GPFA2Q
-
-    @property
-    def layer_requires_input_quant(self):
-        # some weight quantizers require a quant input (e.g., A2Q)
-        check_1 = self.layer.weight_quant_requires_quant_input
-        # if input_quant is enabled, then we will store its information
-        check_2 = self.layer.is_input_quant_enabled
-        # GPFA2Q requires the quantized input to be stored
-        check_3 = self.requires_quant_input
-        requires_input_quant = check_1 or check_2 or check_3
-        return requires_input_quant
 
     def process_input(self, inp):
         # Input is a tuple, so we take first element
         inp = inp[0]
-        # if the quant_input is not already cached, then get
-        # metadata from QuantWBIOL module
-        if self.quant_input is None:
-            inp_scale = self.layer.quant_input_scale()
-            inp_zero_point = self.layer.quant_input_zero_point()
-            inp_bit_width = self.layer.quant_input_bit_width()
-            inp_signed = self.layer.is_quant_input_signed
-            inp_training = self.layer.training
-
-        # If using quantized activations, inp could be QuantTensor. In
-        # this case, we overwrite the metadata.
-        if isinstance(inp, QuantTensor):
-            if self.layer_requires_input_quant and (self.quant_input is None):
-                inp_scale = inp.scale
-                inp_zero_point = inp.zero_point
-                inp_bit_width = inp.bit_width
-                inp_signed = inp.signed
-                inp_training = inp.training
-            inp = inp.value
-
-        # if the layer requires an input quant and the quant input cache has
-        # yet to be populated, then populate with the collected metadata
-        if self.layer_requires_input_quant and (self.quant_input is None):
-            self.quant_input = QuantTensor(
-                value=torch.empty(
-                    1, dtype=self.layer.weight.dtype, device=self.layer.weight.device),
-                scale=inp_scale,
-                zero_point=inp_zero_point,
-                bit_width=inp_bit_width,
-                signed=inp_signed,
-                training=inp_training)
-
 
         # If input is unbatched, add batch_size = 1
         if len(inp.shape) == 1:
             warnings.warn("Found unbatched input, adding batch dimension equal to 1")
diff --git a/tests/brevitas/graph/equalization_fixtures.py b/tests/brevitas/graph/equalization_fixtures.py
index 4750fc96d..985986789 100644
--- a/tests/brevitas/graph/equalization_fixtures.py
+++ b/tests/brevitas/graph/equalization_fixtures.py
@@ -11,6 +11,8 @@
 
 from brevitas import torch_version
 from brevitas.graph.equalize import _cross_layer_equalization
+import brevitas.nn as qnn
+from brevitas.quant import Int8ActPerTensorFloat
 
 SEED = 123456
 ATOL = 1e-3
@@ -26,6 +28,7 @@
 
 IN_SIZE_CONV = (1, 3, 224, 224)
 IN_SIZE_LINEAR = (1, 224, 3)
+IN_SIZE_CONV_SMALL = (1, 3, 32, 32)
 
 
 def equalize_test(regions, merge_bias, bias_shrinkage, scale_computation_type):
@@ -374,3 +377,95 @@ def forward(self, x):
         ('layer1.0.conv1', 'layer1.1.conv1', 'layer2.0.conv1', 'layer2.0.downsample.0')],
     [('layer2.0.bn1',), ('layer2.0.conv2',)],
     [('layer4.0.bn2', 'layer4.0.downsample.1', 'layer4.1.bn2'), ('fc', 'layer4.1.conv1')],]
+
+
+@pytest_cases.fixture
+def quant_conv_with_input_quant_model():
+
+    class QuantConvModel(nn.Module):
+
+        def __init__(self) -> None:
+            super().__init__()
+            self.conv_0 = qnn.QuantConv2d(
+                3, 16, kernel_size=3)  # gpxq tests assume no quant on first layer
+            self.conv_1 = qnn.QuantConv2d(16, 32, kernel_size=3, input_quant=Int8ActPerTensorFloat)
+
+        def forward(self, x):
+            x = self.conv_0(x)
+            x = torch.relu(x)
+            x = self.conv_1(x)
+            return x
+
+    return QuantConvModel
+
+
+@pytest_cases.fixture
+def quant_convdepthconv_model():
+
+    class QuantConvDepthConvModel(nn.Module):
+
+        def __init__(self) -> None:
+            super().__init__()
+            self.conv = qnn.QuantConv2d(3, 16, kernel_size=3)
+            self.conv_0 = qnn.QuantConv2d(16, 16, kernel_size=1, groups=16)
+            self.relu = qnn.QuantReLU(return_quant_tensor=True)
+
+        def forward(self, x):
+            x = self.conv(x)
+            x = self.relu(x)
+            x = self.conv_0(x)
+            return x
+
+    return QuantConvDepthConvModel
+
+
+@pytest_cases.fixture
+def quant_residual_model():
+
+    class QuantResidualModel(nn.Module):
+
+        def __init__(self) -> None:
+            super().__init__()
+            self.conv = qnn.QuantConv2d(3, 16, kernel_size=1)
+            self.conv_0 = qnn.QuantConv2d(16, 3, kernel_size=1)
+            self.relu = qnn.QuantReLU(return_quant_tensor=True)
+
+        def forward(self, x):
+            start = x
+            x = self.conv(x)
+            x = self.relu(x)
+            x = self.conv_0(x)
+            x = start + x
+            return x
+
+    return QuantResidualModel
+
+
+@pytest_cases.fixture
+def quant_convtranspose_model():
+
+    class QuantConvTransposeModel(nn.Module):
+
+        def __init__(self) -> None:
+            super().__init__()
+            self.relu = qnn.QuantReLU(return_quant_tensor=True)
+            self.conv_0 = qnn.QuantConvTranspose2d(in_channels=3, out_channels=8, kernel_size=3)
+            self.conv_1 = qnn.QuantConvTranspose2d(in_channels=8,
+                                                   out_channels=32, kernel_size=3)
+
+        def forward(self, x):
+            x = self.conv_0(x)
+            x = self.relu(x)
+            x = self.conv_1(x)
+            return x
+
+    return QuantConvTransposeModel
+
+
+list_of_quant_fixtures = [
+    'quant_conv_with_input_quant_model',
+    'quant_convdepthconv_model',
+    'quant_residual_model',
+    'quant_convtranspose_model']
+
+toy_quant_model = fixture_union(
+    'toy_quant_model', list_of_quant_fixtures, ids=list_of_quant_fixtures)
diff --git a/tests/brevitas/graph/test_gpxq.py b/tests/brevitas/graph/test_gpxq.py
new file mode 100644
index 000000000..49d470402
--- /dev/null
+++ b/tests/brevitas/graph/test_gpxq.py
@@ -0,0 +1,149 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+
+from functools import partial
+
+import pytest
+import torch
+import torch.nn as nn
+from torch.utils.data import DataLoader
+from torch.utils.data import TensorDataset
+
+from brevitas.graph.gpfq import gpfq_mode
+from brevitas.graph.gptq import gptq_mode
+
+from .equalization_fixtures import *
+
+
+def apply_gpfq(
+        calib_loader: DataLoader,
+        model: nn.Module,
+        act_order: bool,
+        use_quant_activations: bool = True,
+        accumulator_bit_width: int = 32,
+        a2q_layer_filter_fnc=lambda x: True):
+    model.eval()
+    dtype = next(model.parameters()).dtype
+    device = next(model.parameters()).device
+    with torch.no_grad():
+        # use GPFA2Q when an accumulator bit width below 32 is specified
+        with gpfq_mode(
+                model,
+                use_quant_activations=use_quant_activations,
+                act_order=act_order,
+                use_gpfa2q=accumulator_bit_width < 32,
+                accumulator_bit_width=accumulator_bit_width,
+                a2q_layer_filter_fnc=a2q_layer_filter_fnc,
+        ) as gpfq:
+            gpfq_model = gpfq.model
+            for _ in range(gpfq.num_layers):
+                for _, (images, _) in enumerate(calib_loader):
+                    images = images.to(device)
+                    images = images.to(dtype)
+                    gpfq_model(images)
+                gpfq.update()
+
+
+def apply_gptq(
+        calib_loader: DataLoader, model: nn.Module, act_order: bool, use_quant_activations: bool):
+    model.eval()
+    dtype = next(model.parameters()).dtype
+    device = next(model.parameters()).device
+    with torch.no_grad():
+        with gptq_mode(
+                model,
+                use_quant_activations=use_quant_activations,
+                act_order=act_order,
+        ) as gptq:
+            gptq_model = gptq.model
+            for _ in range(gptq.num_layers):
+                for _, (images, _) in enumerate(calib_loader):
+                    images = images.to(device)
+                    images = images.to(dtype)
+                    gptq_model(images)
+                gptq.update()
+
+
+def custom_layer_filter_fnc(layer: nn.Module) -> bool:
+    if isinstance(layer, nn.Conv2d) and layer.in_channels == 3:
+        return False
+    elif isinstance(layer, nn.ConvTranspose2d) and layer.in_channels == 3:
+        return False
+    return True
+
+
+def identity_layer_filter_func(layer: nn.Module) -> bool:
+    return True
+
+
+filter_func_dict = {"identity": identity_layer_filter_func, "ignore_input": custom_layer_filter_fnc}
+
+apply_gpxq_func_map = {"gpfq": apply_gpfq, "gptq": apply_gptq}
+
+
+@pytest.mark.parametrize("act_order", [True, False])
+@pytest.mark.parametrize("use_quant_activations", [True, False])
+@pytest.mark.parametrize("acc_bit_width", [32, 24, 16, 12])
+@pytest.mark.parametrize("filter_func_str", filter_func_dict.keys())
+@pytest.mark.parametrize("apply_gpxq_tuple", apply_gpxq_func_map.items())
+def test_toymodels(
+        toy_quant_model,
+        act_order,
+        use_quant_activations,
+        acc_bit_width,
+        filter_func_str,
+        apply_gpxq_tuple,
+        request):
+
+    test_id = request.node.callspec.id
+
+    torch.manual_seed(SEED)
+
+    name, apply_gpxq = apply_gpxq_tuple
+
+    if (name == 'gptq' and acc_bit_width < 32):
+        pytest.skip("GPTQ does not support accumulator-aware quantization.")
+
+    if name == 'gpfq':
+        filter_func = filter_func_dict[filter_func_str]
+        apply_gpxq = partial(
+            apply_gpxq, accumulator_bit_width=acc_bit_width, a2q_layer_filter_fnc=filter_func)
+
+    model_class = toy_quant_model
+    model = model_class()
+    if 'mha' in test_id:
+        inp = torch.randn(32, *IN_SIZE_LINEAR[1:])
+    else:
+        inp = torch.randn(32, *IN_SIZE_CONV_SMALL[1:])
+    model.eval()
+    model(inp)  # test forward pass and collect scaling factors
+    dataset = TensorDataset(inp, inp)
+    calib_loader = DataLoader(dataset, batch_size=16, num_workers=0, pin_memory=True, shuffle=True)
+
+    if (name == 'gptq' and torch_version < version.parse('1.10')):
+        # GPTQ usage of linalg_cholesky() is not compatible with torch 1.9.1 and below
+        with pytest.raises(AssertionError):
+            apply_gpxq(
+                calib_loader=calib_loader,
+                model=model,
+                act_order=act_order,
+                use_quant_activations=use_quant_activations)
+
+    elif (name == 'gpfq') and (acc_bit_width < 32) and (not use_quant_activations or
+                                                        filter_func_str == 'identity'):
+        # GPFA2Q requires that the quant activations are used. GPFA2Q.single_layer_update will
+        # raise a ValueError if GPFA2Q.quant_input is None (also see GPxQ.process_input). This will
+        # happen when `use_quant_activations=False` or when the input to a model is not quantized
+        # and `a2q_layer_filter_fnc` does not properly handle it.
+        with pytest.raises(ValueError):
+            apply_gpxq(
+                calib_loader=calib_loader,
+                model=model,
+                act_order=act_order,
+                use_quant_activations=use_quant_activations)
+    else:
+        apply_gpxq(
+            calib_loader=calib_loader,
+            model=model,
+            act_order=act_order,
+            use_quant_activations=use_quant_activations)
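
For reference, a minimal calibration sketch (not part of the patch) driving the gpfq_mode options exercised by the new test above (use_quant_activations, act_order, use_gpfa2q, accumulator_bit_width, a2q_layer_filter_fnc). It mirrors apply_gpfq from tests/brevitas/graph/test_gpxq.py; the model and calibration loader are placeholders supplied by the caller.

    import torch
    from brevitas.graph.gpfq import gpfq_mode

    def calibrate_with_gpfq(model, calib_loader, accumulator_bit_width=32):
        # Placeholder driver mirroring apply_gpfq in the new test: GPFA2Q is
        # selected whenever the requested accumulator bit width is below 32.
        model.eval()
        with torch.no_grad():
            with gpfq_mode(model,
                           use_quant_activations=True,
                           act_order=False,
                           use_gpfa2q=accumulator_bit_width < 32,
                           accumulator_bit_width=accumulator_bit_width,
                           a2q_layer_filter_fnc=lambda layer: True) as gpfq:
                gpfq_model = gpfq.model
                # Run the calibration data through the model once per layer,
                # updating one layer after each pass.
                for _ in range(gpfq.num_layers):
                    for images, _ in calib_loader:
                        gpfq_model(images)
                    gpfq.update()
        return model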