diff --git a/noxfile.py b/noxfile.py
index ffb1c5fbd..59ff7122f 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -174,4 +174,4 @@ def tests_brevitas_end_to_end(session, pytorch):
     install_pytorch(pytorch, session)
     install_torchvision(pytorch, session)
     session.install('--upgrade', '-e', '.[test, ort_integration]')
-    session.run('pytest', '-v', 'tests/brevitas_end_to_end')
+    session.run('pytest', '-n', 'logical', '-v', 'tests/brevitas_end_to_end')
diff --git a/src/brevitas/__init__.py b/src/brevitas/__init__.py
index eddc35a02..fe46102a7 100644
--- a/src/brevitas/__init__.py
+++ b/src/brevitas/__init__.py
@@ -23,6 +23,12 @@ else:
 
 torch_version = version.parse(torch.__version__)
 
+try:
+    # Attempt _dynamo import
+    is_dynamo_compiling = torch._dynamo.is_compiling
+except:
+    is_dynamo_compiling = lambda: False
+
 try:
     __version__ = get_distribution(__name__).version
 except DistributionNotFound:
diff --git a/src/brevitas/core/function_wrapper/clamp.py b/src/brevitas/core/function_wrapper/clamp.py
index 70d1fc23f..163e63a22 100644
--- a/src/brevitas/core/function_wrapper/clamp.py
+++ b/src/brevitas/core/function_wrapper/clamp.py
@@ -113,6 +113,29 @@ def __init__(
         else:
             self.max_available_float = None
 
+    def inf_nan_clamp(self, x, inf_mask, p_max_val_mask, n_max_val_mask):
+
+        # if non-saturating, we need to map values greater than max_val to nan or inf
+        if self.inf_values is not None:
+            # we have inf values, so we set abs values > max_value to +- inf, and leave inf at inf
+            x[p_max_val_mask] = torch.tensor(float('inf'))
+            x[n_max_val_mask] = torch.tensor(float('-inf'))
+        elif self.nan_values is not None:
+            # no inf values, so we need to map them to NaN
+            full_max_val_mask = torch.logical_or(p_max_val_mask, n_max_val_mask)
+            x[full_max_val_mask] = torch.tensor(float('nan'))
+
+            # we also map the inf values to NaN in this case
+            x[inf_mask] = torch.tensor(float('nan'))
+        else:
+            raise RuntimeError(
+                "Clamping is not saturating, but neither `inf_values` nor `nan_values` is specified"
+            )
+        return x
+
+    def saturating_clamp(self, x, max_value, min_value):
+        return self.tensor_clamp_impl(x, min_val=min_value, max_val=max_value)
+
     @brevitas.jit.script_method
     def forward(
             self,
@@ -120,33 +143,21 @@ def forward(
             exponent_bit_width: Tensor,
             mantissa_bit_width: Tensor,
             exponent_bias: Tensor):
-        inf_mask = x.isinf()
+
         max_value = max_float(exponent_bit_width, mantissa_bit_width, exponent_bias)
         max_value = max_value if self.max_available_float is None else torch.min(
             max_value, self.max_available_float())
+        min_value = torch.tensor(0.) if not self.signed else -max_value
+
+        # Compute masks
+        inf_mask = x.isinf()
         p_max_val_mask = x > max_value
         n_max_val_mask = -x > max_value
-        min_float = torch.tensor(0.) if not self.signed else -max_value
 
         # first clamp everything to +- max_value, basically the saturating case
-        x = self.tensor_clamp_impl(x, min_val=min_float, max_val=max_value)
+        x = self.saturating_clamp(x, max_value, min_value)
 
         if not self.saturating:
-            # if non-saturating, we need to map values greater than max_val to nan or inf
-            if self.inf_values is not None:
-                # we have inf values, so we set abs values > max_value to +- inf, and leave inf at inf
-                x[p_max_val_mask] = torch.tensor(float('inf'))
-                x[n_max_val_mask] = torch.tensor(float('-inf'))
-            elif self.nan_values is not None:
-                # no inf values, so we need to map them to NaN
-                full_max_val_mask = torch.logical_or(p_max_val_mask, n_max_val_mask)
-                x[full_max_val_mask] = torch.tensor(float('nan'))
-
-                # we also map the inf values to NaN in this case
-                x[inf_mask] = torch.tensor(float('nan'))
-            else:
-                raise RuntimeError(
-                    "Clamping is not saturating, but neither `inf_values` nor `nan_values` is specified"
-                )
+            x = self.inf_nan_clamp(x, inf_mask, p_max_val_mask, n_max_val_mask)
 
         return x, self.saturating, self.inf_values, self.nan_values
diff --git a/src/brevitas/export/common/handler/qcdq.py b/src/brevitas/export/common/handler/qcdq.py
index 44061ce42..bbc03b630 100644
--- a/src/brevitas/export/common/handler/qcdq.py
+++ b/src/brevitas/export/common/handler/qcdq.py
@@ -454,7 +454,7 @@ def prepare_for_export(self, module):
             self.symbolic_kwargs['exponent_bit_width'] = module.exponent_bit_width()
             self.symbolic_kwargs['mantissa_bit_width'] = module.mantissa_bit_width()
             self.symbolic_kwargs['exponent_bias'] = module.exponent_bias()
-            self.symbolic_kwargs['saturating'] = module.saturating()
+            self.symbolic_kwargs['saturating'] = module.is_saturating()
             self.symbolic_kwargs['inf_values'] = module.inf_values()
             self.symbolic_kwargs['nan_values'] = module.nan_values()
 
@@ -659,7 +659,7 @@ def prepare_for_export(self, module):
                 'exponent_bit_width': module.exponent_bit_width(),
                 'mantissa_bit_width': module.mantissa_bit_width(),
                 'exponent_bias': module.exponent_bias(),
-                'saturating': module.saturating(),
+                'saturating': module.is_saturating(),
                 'inf_values': module.inf_values(),
                 'nan_values': module.nan_values()}
 
diff --git a/src/brevitas/export/inference/__init__.py b/src/brevitas/export/inference/__init__.py
new file mode 100644
index 000000000..0e6d113e0
--- /dev/null
+++ b/src/brevitas/export/inference/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+
+from .manager import InferenceManager
+from .manager import quant_inference_mode
diff --git a/src/brevitas/export/inference/handler.py b/src/brevitas/export/inference/handler.py
new file mode 100644
index 000000000..1416014ec
--- /dev/null
+++ b/src/brevitas/export/inference/handler.py
@@ -0,0 +1,153 @@
+# Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+
+from abc import ABC
+from abc import abstractmethod
+from typing import Tuple
+
+import torch
+
+from brevitas.function.ops import max_float
+from brevitas.function.ops import max_int
+from brevitas.function.ops import min_int
+from brevitas.proxy.float_parameter_quant import WeightFloatQuantProxyFromInjector
+from brevitas.proxy.float_runtime_quant import ActFloatQuantProxyFromInjector
+from brevitas.proxy.float_runtime_quant import ActFloatQuantProxyFromInjectorBase
+from brevitas.proxy.parameter_quant import BiasQuantProxyFromInjector
+from brevitas.proxy.parameter_quant import WeightQuantProxyFromInjector
+from brevitas.proxy.runtime_quant import ActQuantProxyFromInjector
+from brevitas.utils.torch_utils import float_internal_scale
+
+
+class InferenceHandler(torch.nn.Module, ABC):
+
+    def attach_debug_info(self, module):
+        pass
+
+    @abstractmethod
+    def prepare_for_export(self, module):
+        pass
+
+    @abstractmethod
+    def quantize(self, x):
+        pass
+
+    @abstractmethod
+    def dequantize(self, x):
+        pass
+
+
+class IntInferencetHandler(InferenceHandler):
+    handled_layer = (ActQuantProxyFromInjector, BiasQuantProxyFromInjector)
+
+    def attach_debug_info(self, module):
+        pass
+
+    def prepare_for_export(self, module):
+        if module.is_quant_enabled:
+            self.scale = module.scale()
+            self.zero_point = module.zero_point().to(self.scale.device)
+            self.bit_width = module.bit_width()
+            self.min_clamp = min_int(module.is_signed, module.is_narrow_range, self.bit_width)
+            self.max_clamp = max_int(module.is_signed, module.is_narrow_range, self.bit_width)
+
+    def quantize(self, x):
+        return torch.clamp(
+            torch.round(x / self.scale + self.zero_point), self.min_clamp, self.max_clamp)
+
+    def dequantize(self, x):
+        return (x - self.zero_point) * self.scale
+
+    def forward(self, x, unused_scale=None) -> Tuple[torch.Tensor]:
+        return self.dequantize(self.quantize(x)), self.scale, self.zero_point, self.bit_width
+
+
+class IntWeightInferencetHandler(IntInferencetHandler):
+    handled_layer = WeightQuantProxyFromInjector
+
+    def prepare_for_export(self, module):
+        if module.is_quant_enabled:
+            self.cached_weight = None
+            super().prepare_for_export(module)
+            if module._cached_weight is not None and not module.cache_inference_quant_weight_metadata_only:
+                self.cached_weight = module._cached_weight.value
+
+    def forward(self, x) -> Tuple[torch.Tensor]:
+        if self.cached_weight is not None:
+            x = self.cached_weight
+        else:
+            x = self.dequantize(self.quantize(x))
+        return x, self.scale, self.zero_point, self.bit_width
+
+
+class FloatInferencetHandler(InferenceHandler):
+    handled_layer = (ActFloatQuantProxyFromInjector, BiasQuantProxyFromInjector)
+
+    def prepare_for_export(self, module):
+        if module.is_quant_enabled:
+            self.scale = module.scale()
+            self.zero_point = module.zero_point().to(self.scale.device)
+            self.exponent_bit_width = module.exponent_bit_width()
+            self.mantissa_bit_width = module.mantissa_bit_width()
+            self.exponent_bias = module.exponent_bias()
+            self.saturating = module.is_saturating()
+            self.inf_values = module.inf_values()
+            self.nan_values = module.nan_values()
+            self.eps = torch.finfo(self.scale.dtype).tiny
+            if hasattr(module.tensor_quant, 'float_to_int_impl'):
+                self.float_to_int_impl = module.tensor_quant.float_to_int_impl
+                self.float_clamp_impl = module.tensor_quant.float_clamp_impl
+            elif hasattr(module, 'fused_activation_quant_proxy'):
+                self.float_to_int_impl = module.fused_activation_quant_proxy.tensor_quant.float_to_int_impl
+                self.float_clamp_impl = module.fused_activation_quant_proxy.tensor_quant.float_clamp_impl
+
+            self.max_clamp = max_float(
+                self.exponent_bit_width, self.mantissa_bit_width, self.exponent_bias)
+            self.min_clamp = -self.max_clamp
+            self.fp_internal_scale_min = 1. - self.exponent_bias - self.mantissa_bit_width
+            self.max_value = max_float(
+                self.exponent_bit_width, self.mantissa_bit_width, self.exponent_bias)
+            self.min_value = torch.tensor(0.) if not module.is_signed else -self.max_value
+
+    def quantize(self, x):
+        # Compute masks
+        inf_mask = x.isinf()
+        p_max_val_mask = x > self.max_value
+        n_max_val_mask = -x > self.max_value
+
+        # Quantize
+        x = x / self.scale
+        internal_scale = float_internal_scale(
+            x, self.mantissa_bit_width, self.fp_internal_scale_min, self.eps)
+        x = internal_scale * self.float_to_int_impl(x / internal_scale)
+
+        # Clamp
+        x = self.float_clamp_impl.saturating_clamp(x, self.max_value, self.min_value)
+        if not self.saturating:
+            x = self.float_clamp_impl.inf_nan_clamp(x, inf_mask, p_max_val_mask, n_max_val_mask)
+
+        return x
+
+    def dequantize(self, x):
+        return (x - self.zero_point) * self.scale
+
+    def forward(self, x) -> Tuple[torch.Tensor]:
+        return self.dequantize(self.quantize(x)), self.scale, self.zero_point, self.exponent_bit_width, self.mantissa_bit_width, self.exponent_bias, self.saturating, self.inf_values, self.nan_values
+
+
+class FloatWeightInferencetHandler(FloatInferencetHandler):
+    handled_layer = WeightFloatQuantProxyFromInjector
+
+    def prepare_for_export(self, module):
+        if module.is_quant_enabled:
+            self.cached_weight = None
+            super().prepare_for_export(module)
+            if module._cached_weight is not None and not module.cache_inference_quant_weight_metadata_only:
+                self.cached_weight = module._cached_weight.value
+
+    def forward(self, x) -> Tuple[torch.Tensor]:
+        if self.cached_weight is not None:
+            x = self.cached_weight
+        else:
+            x = self.dequantize(self.quantize(x))
+        return x, self.scale, self.zero_point, self.exponent_bit_width, self.mantissa_bit_width, self.exponent_bias, self.saturating, self.inf_values, self.nan_values
diff --git a/src/brevitas/export/inference/manager.py b/src/brevitas/export/inference/manager.py
new file mode 100644
index 000000000..936106884
--- /dev/null
+++ b/src/brevitas/export/inference/manager.py
@@ -0,0 +1,106 @@
+# Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+
+from torch.nn import Module
+import torch.nn as nn
+
+from brevitas.export.inference.handler import FloatInferencetHandler
+from brevitas.export.inference.handler import FloatWeightInferencetHandler
+from brevitas.export.inference.handler import IntInferencetHandler
+from brevitas.export.inference.handler import IntWeightInferencetHandler
+from brevitas.export.manager import _set_proxy_export_handler
+from brevitas.export.manager import _set_proxy_export_mode
+from brevitas.export.manager import _set_recurrent_layer_export_handler
+from brevitas.export.manager import _set_recurrent_layer_export_mode
+from brevitas.export.manager import BaseManager
+from brevitas.graph.calibrate import disable_return_quant_tensor
+from brevitas.graph.calibrate import restore_return_quant_tensor
+
+
+def _override_caching_mode(m: nn.Module, attr: str, enabled: bool, metadata_only: bool = True):
+    cache_var = 'cache_inference_quant_' + attr
+    cache_var_metadata_only = cache_var + '_metadata_only'
+    if hasattr(m, cache_var):
+        setattr(m, cache_var, enabled)
+        setattr(m, cache_var_metadata_only, metadata_only)
+
+
+def _override_bias_caching_mode(m: nn.Module, enabled: bool, metadata_only: bool = True):
+    _override_caching_mode(m, 'bias', enabled, metadata_only)
+
+
+def _override_act_caching_mode(m: nn.Module, enabled: bool, metadata_only: bool = True):
+    _override_caching_mode(m, 'act', enabled, metadata_only)
+
+
+def _override_weight_caching_mode(m: nn.Module, enabled: bool, metadata_only: bool = False):
+    _override_caching_mode(m, 'weight', enabled, metadata_only)
+
+
+class quant_inference_mode:
+
+    def __init__(self, model, cache_quant_weight=False, enabled=True):
+        self.model = model
+        self.enabled = enabled
+        self.cache_quant_weight = cache_quant_weight
+        self.export_manager = InferenceManager
+        self.hook_list = []
+        self.return_quant_tensor_state = dict()
+
+    def __enter__(self):
+        if self.enabled:
+            # Register the hook and store it in the list so that it can be removed by the hook itself when called
+            handle = self.model.register_forward_hook(self.hook)
+            self.hook_list.append(handle)
+
+            # Enable bias for everything. Optionally, store the fully fake-quantized weights
+            self.model.apply(
+                lambda m: _override_bias_caching_mode(m, enabled=True, metadata_only=True))
+            self.model.apply(lambda m: _override_act_caching_mode(m, enabled=True))
+            self.model.apply(
+                lambda m: _override_weight_caching_mode(
+                    m, enabled=True, metadata_only=not self.cache_quant_weight))
+
+    def __exit__(self, type, value, traceback):
+        # Disable all caching
+        # deactivate export mode
+        # restore return quant tensor
+        self.model.apply(
+            lambda m: _override_bias_caching_mode(m, enabled=False, metadata_only=False))
+        self.model.apply(
+            lambda m: _override_act_caching_mode(m, enabled=False, metadata_only=False))
+        if self.cache_quant_weight:
+            self.model.apply(
+                lambda m: _override_weight_caching_mode(m, enabled=False, metadata_only=False))
+        InferenceManager.set_export_mode(self.model, enabled=False)
+        restore_return_quant_tensor(self.model, self.return_quant_tensor_state)
+
+    def hook(self, module, inp, out):
+        # After one forward pass with caching enabled, we can:
+        # - Set the model in export mode
+        # - Attach export handlers
+        # - Disable return quant tensor since all quant metadata is cached
+        assert len(self.hook_list) == 1
+        self.hook_list[0].remove()
+        self.model.apply(InferenceManager.set_export_handler)
+        InferenceManager.set_export_mode(self.model, enabled=True)
+        self.return_quant_tensor_state = disable_return_quant_tensor(self.model)
+
+
+# Inheritance from BaseManager is not technically needed
+class InferenceManager(BaseManager):
+    handlers = [
+        IntInferencetHandler,
+        FloatInferencetHandler,
+        IntWeightInferencetHandler,
+        FloatWeightInferencetHandler]
+
+    @classmethod
+    def set_export_mode(cls, model: Module, enabled: bool):
+        _set_proxy_export_mode(model, enabled)
+        _set_recurrent_layer_export_mode(model, enabled)
+
+    @classmethod
+    def set_export_handler(cls, module: Module):
+        _set_proxy_export_handler(cls, module)
+        _set_recurrent_layer_export_handler(cls, module)
diff --git a/src/brevitas/export/manager.py b/src/brevitas/export/manager.py
index 2805c6174..7b7e7a145 100644
--- a/src/brevitas/export/manager.py
+++ b/src/brevitas/export/manager.py
@@ -166,11 +166,15 @@ def _trace_fn_dispatcher(cls, fn, input, *args, **kwargs):
 
     @classmethod
     def handler_from_module(cls, module: Module, no_inheritance=False):
         for handler in cls.handlers:
+            if not isinstance(handler.handled_layer, tuple):
+                handled_classes = (handler.handled_layer,)
+            else:
+                handled_classes = handler.handled_layer
             if no_inheritance:
-                if type(module) == handler.handled_layer:
+                if type(module) in handled_classes:
                     return handler
             else:
-                if isinstance(module, handler.handled_layer):
+                if any([isinstance(module, handler) for handler in handled_classes]):
                     return handler
         return None
diff --git a/src/brevitas/graph/calibrate.py b/src/brevitas/graph/calibrate.py
index 92228b7a3..2b1f6833e 100644
--- a/src/brevitas/graph/calibrate.py
+++ b/src/brevitas/graph/calibrate.py
@@ -58,7 +58,7 @@ def disable_return_quant_tensor(model):
 
 def restore_return_quant_tensor(model, previous_state):
     for module in model.modules():
-        if hasattr(module, 'return_quant_tensor'):
+        if hasattr(module, 'return_quant_tensor') and module in previous_state:
             module.return_quant_tensor = previous_state[module]
 
 
diff --git a/src/brevitas/nn/mixin/base.py b/src/brevitas/nn/mixin/base.py
index d64271cb5..a5c4407fd 100644
--- a/src/brevitas/nn/mixin/base.py
+++ b/src/brevitas/nn/mixin/base.py
@@ -8,12 +8,16 @@ from typing import Optional, Tuple, Union
 from warnings import warn
 
+import packaging.version
+import torch
 from torch import nn
 from torch import Tensor
 import torch.jit
 from torch.nn.utils.rnn import PackedSequence
 
 from brevitas import config
+from brevitas import is_dynamo_compiling
+from brevitas import torch_version
 from brevitas.common import ExportMixin
 from brevitas.inject import ExtendedInjector
 from brevitas.inject import Injector
@@ -85,7 +89,7 @@ def unpack_input(self, inp: Union[Tensor, QuantTensor]) -> Union[Tensor, QuantTe
             qt_class = self.get_quant_tensor_class(inp)
             if qt_class is not None:
                 inp = qt_class(*inp)
-        if not torch._C._get_tracing_state():
+        if not torch._C._get_tracing_state() and not is_dynamo_compiling():
             if isinstance(inp, QuantTensor):
                 inp = inp.set(value=inp.value.rename(None))
             else:
diff --git a/src/brevitas/proxy/float_parameter_quant.py b/src/brevitas/proxy/float_parameter_quant.py
index 4e6452792..0d6ffd106 100644
--- a/src/brevitas/proxy/float_parameter_quant.py
+++ b/src/brevitas/proxy/float_parameter_quant.py
@@ -4,6 +4,7 @@
 from torch import Tensor
 import torch.nn as nn
 
+from brevitas.core.function_wrapper.misc import Identity
 from brevitas.inject import BaseInjector as Injector
 from brevitas.proxy.parameter_quant import BiasQuantProxyFromInjectorBase
 from brevitas.proxy.parameter_quant import WeightQuantProxyFromInjectorBase
@@ -83,6 +84,13 @@ def is_fnuz(self):
         ) is None and self.exponent_bias() == 16
         return is_fnuz_e4m3 or is_fnuz_e5m2
 
+    @property
+    def input_view_impl(self):
+        if self.tensor_quant is not None:
+            return self.tensor_quant.input_view_impl
+        else:
+            return Identity()
+
 
 class WeightFloatQuantProxyFromInjector(WeightFloatQuantProxyFromInjectorBase):
 
diff --git a/src/brevitas/proxy/float_runtime_quant.py b/src/brevitas/proxy/float_runtime_quant.py
index b38f4ecdb..7350e5e32 100644
--- a/src/brevitas/proxy/float_runtime_quant.py
+++ b/src/brevitas/proxy/float_runtime_quant.py
@@ -4,6 +4,7 @@
 import torch
 import torch.nn as nn
 
+from brevitas.core.function_wrapper.misc import Identity
 from brevitas.inject import BaseInjector as Injector
 from brevitas.proxy.runtime_quant import ActQuantProxyFromInjectorBase
 from brevitas.quant_tensor import FloatQuantTensor
@@ -27,7 +28,7 @@ def mantissa_bit_width(self, force_eval=True):
     def exponent_bias(self, force_eval=True):
         return self.retrieve_attribute('exponent_bias', force_eval)
 
-    def saturating(self, force_eval=True):
+    def is_saturating(self, force_eval=True):
         return self.retrieve_attribute('saturating', force_eval)
 
     def inf_values(self, force_eval=True):
@@ -36,6 +37,13 @@ def inf_values(self, force_eval=True):
     def nan_values(self, force_eval=True):
         return self.retrieve_attribute('nan_values', force_eval)
 
+    @property
+    def input_view_impl(self):
+        if self.fused_activation_quant_proxy.tensor_quant is not None:
+            return self.fused_activation_quant_proxy.tensor_quant.input_view_impl
+        else:
+            return Identity()
+
     @property
     def is_ocp(self):
         is_e4m3 = self.mantissa_bit_width() == 3 and self.exponent_bit_width() == 4
diff --git a/src/brevitas/proxy/groupwise_int_runtime_quant.py b/src/brevitas/proxy/groupwise_int_runtime_quant.py
index ec9418e19..453cb3f9b 100644
--- a/src/brevitas/proxy/groupwise_int_runtime_quant.py
+++ b/src/brevitas/proxy/groupwise_int_runtime_quant.py
@@ -31,7 +31,7 @@ def create_quant_tensor(
             qt_args: Union[torch.Tensor, Tuple[Any]],
             x: Optional[GroupwiseIntQuantTensor] = None) -> GroupwiseIntQuantTensor:
         if x is None:
-            value, scale, zero_point, bit_width, = qt_args
+            value, scale, zero_point, bit_width = qt_args
             out = GroupwiseIntQuantTensor(
                 value,
                 scale,
diff --git a/src/brevitas/proxy/parameter_quant.py b/src/brevitas/proxy/parameter_quant.py
index 77a806ee8..f28233aed 100644
--- a/src/brevitas/proxy/parameter_quant.py
+++ b/src/brevitas/proxy/parameter_quant.py
@@ -4,7 +4,7 @@
 from abc import ABC
 from abc import ABCMeta
 from abc import abstractmethod
-from typing import Any, Optional, Tuple, Union
+from typing import Any, List, Optional, Tuple, Union
 from warnings import warn
 
 import torch
@@ -14,8 +14,11 @@
 from typing_extensions import runtime_checkable
 
 from brevitas import config
+from brevitas import is_dynamo_compiling
+from brevitas.core.function_wrapper.misc import Identity
 from brevitas.function import max_int
 from brevitas.inject import BaseInjector as Injector
+from brevitas.quant_tensor import _unpack_quant_tensor
 from brevitas.quant_tensor import IntQuantTensor
 from brevitas.quant_tensor import QuantTensor
 from brevitas.utils.quant_utils import _CachedIO
@@ -92,6 +95,13 @@ def __init__(self, quant_layer: nn.Module, quant_injector: Injector) -> None:
         self.cache_class = None  # To be redefined by each class
         self.quant_tensor_class = None  # To be redefined by each class
 
+    @property
+    def input_view_impl(self):
+        if self.tensor_quant is not None:
+            return self.tensor_quant.int_quant.input_view_impl
+        else:
+            return Identity()
+
     @property
     def cache_inference_quant_weight(self):
         return self._cache_inference_quant_weight
@@ -118,19 +128,23 @@ def forward(self, x: torch.Tensor) -> Union[Tensor, QuantTensor]:
         if self.is_quant_enabled:
             # If quant is enabled the priority is:
             # - export mode
-            # - cached weight
             # - quantization flow
             if self.export_mode:
                 out = self.export_handler(x)
-                out = self.create_quant_tensor(out)
-            elif self._cached_weight is not None and not self.cache_inference_quant_weight_metadata_only:
-                out = self._cached_weight.quant_tensor
+                if is_dynamo_compiling():
+                    out = out[0]
+                else:
+                    out = self.create_quant_tensor(out)
             else:
                 out = self.tensor_quant(x)
-                out = self.create_quant_tensor(out)
-                if not self.training and self.cache_inference_quant_weight and self._cached_weight is None:
-                    self._cached_weight = self.cache_class(
-                        out.detach(), metadata_only=self.cache_inference_quant_weight_metadata_only)
+                if is_dynamo_compiling():
+                    out = out[0]
+                else:
+                    out = self.create_quant_tensor(out)
+                    if not self.training and self.cache_inference_quant_weight and self._cached_weight is None:
+                        self._cached_weight = self.cache_class(
+                            out.detach(),
+                            metadata_only=self.cache_inference_quant_weight_metadata_only)
         else:  # quantization disabled
             out = self.apply_input_view(x)
         return out
@@ -151,9 +165,10 @@ def tracked_parameter_list(self):
 
     def get_cached(self, attr):
         if self._cached_bias is None:
-            warn(
-                "No quant bias cache found, set cache_inference_quant_bias=True and run an "
-                "inference pass first")
+            if not is_dynamo_compiling():
+                warn(
+                    "No quant bias cache found, set cache_inference_quant_bias=True and run an "
+                    "inference pass first")
             return None
         if self.training:
             warn("Cached quant bias scale is being used in training mode.")
@@ -268,7 +283,7 @@ class BiasQuantProxyFromInjector(BiasQuantProxyFromInjectorBase):
     def scale(self):
         if not self.is_quant_enabled:
             return None
-        if self.requires_input_scale and self.is_quant_enabled and self.is_quant_enabled:
+        if self.requires_input_scale and self.is_quant_enabled:
             cache = self.get_cached('scale')
             return cache
         zhs = self._zero_hw_sentinel()
@@ -335,12 +350,13 @@ def forward(
                 out, out_scale, out_zp, out_bit_width = impl(x, input_scale)
             else:
                 out, out_scale, out_zp, out_bit_width = impl(x)
-            out = IntQuantTensor(
-                out, out_scale, out_zp, out_bit_width, self.is_signed, self.training)
-            if not self.training and self.cache_inference_quant_bias:
-                cached_bias = _CachedIO(
-                    out.detach(), metadata_only=self.cache_inference_quant_bias_metadata_only)
-                self._cached_bias = cached_bias
+            if not is_dynamo_compiling():
+                out = IntQuantTensor(
+                    out, out_scale, out_zp, out_bit_width, self.is_signed, self.training)
+                if not self.training and self.cache_inference_quant_bias:
+                    cached_bias = _CachedIO(
+                        out.detach(), metadata_only=self.cache_inference_quant_bias_metadata_only)
+                    self._cached_bias = cached_bias
         else:
             out = x
         return out
diff --git a/src/brevitas/proxy/quant_proxy.py b/src/brevitas/proxy/quant_proxy.py
index 9c4255773..845bfd515 100644
--- a/src/brevitas/proxy/quant_proxy.py
+++ b/src/brevitas/proxy/quant_proxy.py
@@ -122,7 +122,7 @@ def add_tracked_module(self, module: nn.Module) -> None:
             raise RuntimeError("Trying to add None as a parent module.")
 
     def apply_input_view(self, x):
-        return self.quant_injector.input_view_impl(x)
+        return self.input_view_impl(x)
 
     def _load_from_state_dict(
             self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys,
diff --git a/src/brevitas/proxy/runtime_quant.py b/src/brevitas/proxy/runtime_quant.py
index 511f914e6..9feb593b4 100644
--- a/src/brevitas/proxy/runtime_quant.py
+++ b/src/brevitas/proxy/runtime_quant.py
@@ -13,6 +13,7 @@
 from typing_extensions import runtime_checkable
 
 import brevitas
+from brevitas import is_dynamo_compiling
 from brevitas.quant_tensor import IntQuantTensor
 from brevitas.quant_tensor import QuantTensor
 from brevitas.utils.quant_utils import _CachedIO
@@ -98,6 +99,14 @@ def __init__(self, quant_layer, quant_injector):
         self.cache_quant_io_metadata_only = True
         self.cache_class = None
 
+    @property
+    def input_view_impl(self):
+        if self.fused_activation_quant_proxy.tensor_quant is not None and not isinstance(
+                self.fused_activation_quant_proxy.tensor_quant, _TensorQuantDisabledIdentity):
+            return self.fused_activation_quant_proxy.tensor_quant.int_quant.input_view_impl
+        else:
+            return Identity()
+
     def internal_forward(self, force_eval):
         current_status = self.training
         if force_eval:
@@ -107,14 +116,17 @@ def internal_forward(self, force_eval):
         return out
 
     def retrieve_attribute(self, attribute, force_eval):
-        if self.is_quant_enabled:
+        if self._cached_act is not None:
+            return getattr(self._cached_act, attribute)
+        elif self.is_quant_enabled:
             out = self.internal_forward(force_eval)
             return getattr(out, attribute)
-        elif self._cached_act is not None:
-            return getattr(self._cached_act, attribute)
         elif self._cached_act is None:
             return None
 
+    def apply_input_view(self, x):
+        return self.input_view_impl(x)
+
     @property
     def is_quant_enabled(self):
         return self._is_quant_enabled and not self.disable_quant
@@ -176,15 +188,18 @@ def forward(self, x: Union[Tensor, QuantTensor]) -> Union[Tensor, QuantTensor]:
 
         # If y is an empty QuantTensor, we need to check if this is a passthrough proxy,
         # otherwise return a simple Tensor
-        # If the second value (i.e., scale) is None, then quant is disabled
-        if isinstance(y, tuple) and y[1] is not None:
-            out = self.create_quant_tensor(y)
-        elif self.is_passthrough_act and isinstance(x, QuantTensor):
-            # preserve quant_metadata
-            y = y[0]
-            out = self.create_quant_tensor(y, x=x)
-        else:
+        if is_dynamo_compiling():
             out = y[0]
+        else:
+            # If the second value (i.e., scale) is None, then quant is disabled
+            if y[1] is not None:
+                out = self.create_quant_tensor(y)
+            elif self.is_passthrough_act and isinstance(x, QuantTensor):
+                # preserve scale/zp/bit/sign even without output quant
+                y = y[0]
+                out = self.create_quant_tensor(y, x=x)
+            else:
+                out = y[0]
 
         if not self.training and self.cache_inference_quant_act and isinstance(out, QuantTensor):
             cached_out = self.cache_class(out.detach(), self.cache_quant_io_metadata_only)
diff --git a/src/brevitas_examples/imagenet_classification/ptq/ptq_common.py b/src/brevitas_examples/imagenet_classification/ptq/ptq_common.py
index bac596be5..9e5c90e26 100644
--- a/src/brevitas_examples/imagenet_classification/ptq/ptq_common.py
+++ b/src/brevitas_examples/imagenet_classification/ptq/ptq_common.py
@@ -22,7 +22,6 @@
 from brevitas.graph.target.flexml import quantize_flexml
 from brevitas.inject import value
 import brevitas.nn as qnn
-from brevitas.quant.experimental.float import Fp8e4m3Act
 from brevitas.quant.experimental.float import Fp8e4m3ActPerTensorFloat
 from brevitas.quant.experimental.float import Fp8e4m3ActPerTensorFloatMSE
 from brevitas.quant.experimental.float import Fp8e4m3WeightPerChannelFloat
@@ -179,7 +178,8 @@ class CNNInt8DynamicActPerTensorFloat(Int8DynamicActPerTensorFloat):
                     'asym': CNNShiftedUint8DynamicActPerTensorFloat}}},
         'po2_scale': {
             'stats': {
-                'per_group': MXInt8Act}}}},
+                'per_group': {
+                    'sym': MXInt8Act}}}}},
     'float': {
         'static': {
             'float_scale': {
diff --git a/src/brevitas_examples/imagenet_classification/ptq/ptq_evaluate.py b/src/brevitas_examples/imagenet_classification/ptq/ptq_evaluate.py
index c960a89e6..4520bc3e2 100644
--- a/src/brevitas_examples/imagenet_classification/ptq/ptq_evaluate.py
+++ b/src/brevitas_examples/imagenet_classification/ptq/ptq_evaluate.py
@@ -18,6 +18,7 @@
 from brevitas.export import export_onnx_qcdq
 from brevitas.export import export_torch_qcdq
+from brevitas.export.inference import quant_inference_mode
 from brevitas.graph.quantize import preprocess_for_quantize
 from brevitas.graph.target.flexml import preprocess_for_flexml_quantize
 from brevitas_examples.imagenet_classification.ptq.ptq_common import apply_act_equalization
@@ -267,6 +268,14 @@ def parse_type(v, default_type):
     'uint_sym_act_for_unsigned_values',
     default=True,
     help='Use unsigned act quant when possible (default: enabled)')
+add_bool_arg(parser, 'compile', default=False, help='Use torch.compile (default: disabled)')
+
+
+def generate_ref_input(args, device, dtype):
+    model_config = get_model_config(args.model_name)
+    center_crop_shape = model_config['center_crop_shape']
+    img_shape = center_crop_shape
+    return torch.ones(1, 3, img_shape, img_shape, device=device, dtype=dtype)
 
 
 def main():
@@ -474,23 +483,28 @@ def main():
 
     # Validate the quant_model on the validation dataloader
     print("Starting validation:")
-    validate(val_loader, quant_model, stable=dtype != torch.bfloat16)
+    with torch.no_grad(), quant_inference_mode(quant_model):
+        param = next(iter(quant_model.parameters()))
+        device, dtype = param.device, param.dtype
+        ref_input = generate_ref_input(args, device, dtype)
+        quant_model(ref_input)
+        compiled_model = torch.compile(quant_model, fullgraph=True, disable=not args.compile)
+        validate(val_loader, compiled_model, stable=dtype != torch.bfloat16)
 
     if args.export_onnx_qcdq or args.export_torch_qcdq:
         # Generate reference input tensor to drive the export process
-        model_config = get_model_config(args.model_name)
-        center_crop_shape = model_config['center_crop_shape']
-        img_shape = center_crop_shape
-        device, dtype = next(model.parameters()).device, next(model.parameters()).dtype
-        ref_input = torch.ones(1, 3, img_shape, img_shape, device=device, dtype=dtype)
+        param = next(iter(quant_model.parameters()))
+        device, dtype = param.device, param.dtype
+        ref_input = generate_ref_input(args, device, dtype)
         export_name = os.path.join(args.export_dir, config)
         if args.export_onnx_qcdq:
             export_name = export_name + '.onnx'
-            export_onnx_qcdq(model, ref_input, export_name, opset_version=args.onnx_opset_version)
+            export_onnx_qcdq(
+                quant_model, ref_input, export_name, opset_version=args.onnx_opset_version)
         if args.export_torch_qcdq:
             export_name = export_name + '.pt'
-            export_torch_qcdq(model, ref_input, export_name)
+            export_torch_qcdq(quant_model, ref_input, export_name)
 
 
 if __name__ == '__main__':
diff --git a/src/brevitas_examples/imagenet_classification/utils.py b/src/brevitas_examples/imagenet_classification/utils.py
index d506b8a61..460e7d77f 100644
--- a/src/brevitas_examples/imagenet_classification/utils.py
+++ b/src/brevitas_examples/imagenet_classification/utils.py
@@ -1,5 +1,3 @@
-import csv
-
 import torch
 import torchvision.datasets as datasets
 import torchvision.transforms as transforms
diff --git a/tests/brevitas_end_to_end/test_torchvision_models.py b/tests/brevitas_end_to_end/test_torchvision_models.py
index 0d76ae2db..09f0b9253 100644
--- a/tests/brevitas_end_to_end/test_torchvision_models.py
+++ b/tests/brevitas_end_to_end/test_torchvision_models.py
@@ -13,6 +13,7 @@
 from brevitas import torch_version
 from brevitas.export import export_onnx_qcdq
 from brevitas.export import export_torch_qcdq
+from brevitas.export.inference import quant_inference_mode
 from brevitas.graph.calibrate import calibration_mode
 from brevitas.graph.quantize import layerwise_quantize
 from brevitas.graph.quantize import quantize
@@ -21,9 +22,13 @@
 from brevitas_examples.imagenet_classification.ptq.ptq_common import quantize_model
 from tests.marker import requires_pt_ge
 
+TORCH_COMPILE_ATOL = 0.35
 BATCH = 1
 HEIGHT, WIDTH = 224, 224
 IN_CH = 3
+
+COMPILE_MODEL_LIST = ['efficientnet_b0', 'resnet18', 'fcn_resnet50']
+
 MODEL_LIST = [
     'vit_b_32',
     'efficientnet_b0',
@@ -68,11 +73,7 @@ def quantize_float(model):
         quant_format='float')
 
 
-@fixture
-@parametrize('model_name', MODEL_LIST)
-@parametrize('quantize_fn', [quantize, quantize_flexml, layerwise_quantize])
-def torchvision_model(model_name, quantize_fn):
-
+def shared_quant_fn(model_name, quantize_fn):
     inp = torch.randn(BATCH, IN_CH, HEIGHT, WIDTH)
 
     if torch_version <= version.parse('1.9.1') and model_name == 'regnet_x_400mf':
@@ -112,20 +113,53 @@ def torchvision_model(model_name, quantize_fn):
     return model
 
 
-@requires_pt_ge('1.8.1')
+@fixture
+@parametrize('model_name', MODEL_LIST)
+@parametrize('quantize_fn', [quantize_float, quantize, layerwise_quantize, quantize_flexml])
+def torchvision_model(model_name, quantize_fn):
+    return shared_quant_fn(model_name, quantize_fn)
+
+
+@fixture
+@parametrize('model_name', COMPILE_MODEL_LIST)
+@parametrize('quantize_fn', [quantize_float, quantize])
+def torchvision_model_compile(model_name, quantize_fn):
+    return shared_quant_fn(model_name, quantize_fn)
+
+
+@requires_pt_ge('2.2')
+def test_torchvision_compile(torchvision_model_compile):
+    torch._dynamo.config.capture_scalar_outputs = True
+    if torchvision_model_compile is None:
+        pytest.skip('Model not instantiated')
+
+    inp = torch.randn(BATCH, IN_CH, HEIGHT, WIDTH)
+
+    with torch.no_grad(), quant_inference_mode(torchvision_model_compile):
+        prehook_non_compiled_out = torchvision_model_compile(inp)
+        post_hook_non_compiled_out = torchvision_model_compile(inp)
+
+        compiled_model = torch.compile(torchvision_model_compile, fullgraph=True)
+        compiled_out = compiled_model(inp)
+
+        assert torch.allclose(prehook_non_compiled_out, post_hook_non_compiled_out)
+        assert torch.allclose(post_hook_non_compiled_out, compiled_out, atol=TORCH_COMPILE_ATOL)
+
+
 def test_torchvision_graph_quantization_flexml_qcdq_onnx(torchvision_model, request):
+    test_id = request.node.callspec.id
     if torchvision_model is None:
         pytest.skip('Model not instantiated')
+
     inp = torch.randn(BATCH, IN_CH, HEIGHT, WIDTH)
-    test_id = request.node.callspec.id
     quantize_fn_name = test_id.split("-")[0]
     torchvision_model(inp)
+
     if quantize_fn_name != 'quantize_float':
         export_onnx_qcdq(torchvision_model, args=inp)
 
 
-@requires_pt_ge('1.9.1')
 def test_torchvision_graph_quantization_flexml_qcdq_torch(torchvision_model, request):
     if torchvision_model is None:
         pytest.skip('Model not instantiated')