Adding FP8 weight export #907

Merged

merged 46 commits from feat/export_fp8 into dev on May 29, 2024
Changes from 26 commits

Commits (46)
5b168c5
placeholder version
costigt-dev Apr 11, 2024
d2b7d2d
checkpoint commit
costigt-dev Apr 12, 2024
e10e630
first working flow end to end
costigt-dev Apr 12, 2024
84e70f7
formatting
costigt-dev Apr 12, 2024
ef4c737
changes to tests
costigt-dev Apr 12, 2024
4aa4b21
added version check for test
costigt-dev Apr 12, 2024
3b05883
using existing functionality over homespun
costigt-dev Apr 12, 2024
cad5802
corrected mistake in copying and restored FloatClipMixin
costigt-dev Apr 12, 2024
4848248
fixed mistake
costigt-dev Apr 12, 2024
5188aa6
first pass activation fp8 export
costigt-dev Apr 16, 2024
29cb952
beginnings of activation fp8 export and change name of QCDQCastFloatW…
costigt-dev Apr 16, 2024
9bf9240
more changes to make naming scheme more consistent
costigt-dev Apr 16, 2024
f9406f1
added FloatFusedActivationQuantProxy
costigt-dev Apr 16, 2024
991ddb7
replaced zero_point workaround with placeholder implementation of fp8…
costigt-dev Apr 17, 2024
520db85
removed verbose flag
costigt-dev Apr 17, 2024
2bb2895
created context manager for fp8 workaround
costigt-dev Apr 17, 2024
8ffce48
added check that objects being compared are tensors in the fp8 workar…
costigt-dev Apr 17, 2024
7edf5bd
General equal implementation
Giuseppe5 May 14, 2024
bbd5362
fallback to fp32 if fp8
Giuseppe5 May 14, 2024
4bc126d
Fix for PT < 2.1
Giuseppe5 May 14, 2024
a55dcd0
Remove non existent destroy
Giuseppe5 May 14, 2024
cd6cad6
Merge branch 'dev' into feat/export_fp8
Giuseppe5 May 23, 2024
fabc8ae
Remove import
Giuseppe5 May 23, 2024
74b65a9
Fixed imports
Giuseppe5 May 23, 2024
cf1ea02
Fixed imports
Giuseppe5 May 23, 2024
cda7f1f
Fix export
Giuseppe5 May 23, 2024
8349391
more testing
Giuseppe5 May 23, 2024
11387d3
Fix
Giuseppe5 May 24, 2024
592ccd3
Fix
Giuseppe5 May 24, 2024
1fc5642
fix
Giuseppe5 May 25, 2024
58f46bc
Fix minifloat check
Giuseppe5 May 25, 2024
bd657b8
Last fix
Giuseppe5 May 25, 2024
630a3e3
Fix minifloat
Giuseppe5 May 27, 2024
38a37fb
Review
Giuseppe5 May 28, 2024
76b3193
Review 2
Giuseppe5 May 28, 2024
529470f
Merge branch 'dev' into feat/export_fp8
Giuseppe5 May 28, 2024
f2f8969
fix
Giuseppe5 May 28, 2024
44579f8
Typo
Giuseppe5 May 28, 2024
038cba9
fix tests
Giuseppe5 May 28, 2024
198c5af
Typo
Giuseppe5 May 28, 2024
c3d7d3c
fix
Giuseppe5 May 28, 2024
fef531d
last fix
Giuseppe5 May 28, 2024
6431882
Fix JIT
Giuseppe5 May 29, 2024
4b78543
Fix import
Giuseppe5 May 29, 2024
d762c99
Last fix
Giuseppe5 May 29, 2024
ac5e58c
correct skip
Giuseppe5 May 29, 2024
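For context before the diff: a minimal sketch of how the FP8 weight export added by this PR might be driven from user code. The quantizer and export entry-point names used here (Fp8e4m3WeightPerTensorFloat, export_onnx_qcdq) are assumptions based on Brevitas naming conventions and are not part of this diff.

    import torch
    from brevitas.export import export_onnx_qcdq
    from brevitas.nn import QuantLinear
    # Assumed FP8 (e4m3) weight quantizer name; the exact class may differ.
    from brevitas.quant.experimental.float import Fp8e4m3WeightPerTensorFloat

    # Linear layer whose weights are quantized to float8 e4m3.
    layer = QuantLinear(16, 32, bias=True, weight_quant=Fp8e4m3WeightPerTensorFloat)
    layer.eval()

    # Export the quantize/(clip)/dequantize (QCDQ) pattern that the new handlers emit.
    export_onnx_qcdq(layer, args=torch.randn(1, 16), export_path='fp8_linear.onnx')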
23 changes: 22 additions & 1 deletion src/brevitas/export/common/handler/base.py
@@ -4,6 +4,7 @@
from abc import ABC
from abc import abstractmethod
import math
from warnings import warn

import torch
from torch import Tensor
@@ -12,7 +13,8 @@
from brevitas.function.ops import max_int
from brevitas.function.ops import min_int

__all__ = ['BaseHandler', 'BitWidthHandlerMixin', 'ZeroPointHandlerMixin']
__all__ = [
'BaseHandler', 'BitWidthHandlerMixin', 'ZeroPointHandlerMixin', 'FloatZeroPointHandlerMixin']


class BaseHandler(Module, ABC):
@@ -38,6 +40,13 @@ def quant_axis(cls, scale):
return None


class FloatClipMixin(ABC):

@classmethod
def clip_symbolic_kwargs(cls, narrow, signed, exponent_bit_width, mantissa_bit_width):
return None


class ClipMixin(ABC):

@classmethod
@@ -112,6 +121,18 @@ def validate_neg_scalar_int_exponent(cls, scale: Tensor):
return -cls.validate_scalar_int_exponent(scale)


class FloatZeroPointHandlerMixin(ABC):

@classmethod
def zero_point_with_dtype(cls, signed, exponent_bit_width, mantissa_bit_width, zero_point):
if exponent_bit_width == 4 and mantissa_bit_width == 3:
return zero_point.type(torch.float8_e4m3fn)
elif exponent_bit_width == 5 and mantissa_bit_width == 2:
return zero_point.type(torch.float8_e5m2)
else:
return zero_point.type(torch.float32)


class ZeroPointHandlerMixin(ABC):

@classmethod
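As a standalone illustration of the dtype mapping implemented by FloatZeroPointHandlerMixin above (and reused by FloatQMixin in the next file): exponent/mantissa widths of 4/3 select torch.float8_e4m3fn, 5/2 select torch.float8_e5m2, and everything else falls back to float32. Minimal sketch; the float8 dtypes require PyTorch >= 2.1.

    import torch

    def float8_dtype(exponent_bit_width: int, mantissa_bit_width: int) -> torch.dtype:
        # Mirrors FloatZeroPointHandlerMixin.zero_point_with_dtype / FloatQMixin.signed_dtype.
        if exponent_bit_width == 4 and mantissa_bit_width == 3:
            return torch.float8_e4m3fn
        if exponent_bit_width == 5 and mantissa_bit_width == 2:
            return torch.float8_e5m2
        return torch.float32

    zero_point = torch.zeros(1)
    print(zero_point.type(float8_dtype(4, 3)).dtype)  # torch.float8_e4m3fn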
266 changes: 266 additions & 0 deletions src/brevitas/export/common/handler/qcdq.py
@@ -10,16 +10,20 @@

from brevitas.export.common import to_0dim_if_scalar
from brevitas.export.common import to_item_if_0dim
from brevitas.proxy import ActFloatQuantProxyFromInjector
from brevitas.proxy import ActQuantProxyFromInjector
from brevitas.proxy import BiasQuantProxyFromInjector
from brevitas.proxy import DecoupledWeightQuantProxyFromInjector
from brevitas.proxy import DecoupledWeightQuantWithInputProxyFromInjector
from brevitas.proxy import WeightFloatQuantProxyFromInjector
from brevitas.proxy import WeightQuantProxyFromInjector
from brevitas.proxy.runtime_quant import DynamicActQuantProxyFromInjector
from brevitas.proxy.runtime_quant import TruncQuantProxyFromInjector

from .base import BitWidthHandlerMixin
from .base import ClipMixin
from .base import FloatClipMixin
from .base import FloatZeroPointHandlerMixin
from .base import QuantAxisMixin
from .base import ZeroPointHandlerMixin

@@ -66,6 +70,25 @@ def clip_fn(self, x, min_val, max_val):
pass


class FloatQMixin(ABC):

@abstractmethod
def quantize_fn(self, x, scale, zero_point, dtype, axis):
pass

@classmethod
def signed_dtype(cls, exponent_bit_width, mantissa_bit_width, is_signed):
if exponent_bit_width is None or mantissa_bit_width is None:
return None
if exponent_bit_width == 4 and mantissa_bit_width == 3:
dtype = torch.float8_e4m3fn
elif exponent_bit_width == 5 and mantissa_bit_width == 2:
dtype = torch.float8_e5m2
else:
dtype = torch.float32
return dtype


class QMixin(BitWidthHandlerMixin, ABC):

@classmethod
@@ -110,6 +133,34 @@ def quantize_fn(self, x, dtype):
pass


class FloatCDQCastProxyHandlerMixin(QuantAxisMixin,
FloatClipMixin,
FloatZeroPointHandlerMixin,
CDQCastMixin,
ABC):

def dequantize_symbolic_kwargs(
cls, scale, zero_point, exponent_bit_width, mantissa_bit_width, is_signed):
scale_orig_shape = scale.shape
axis = cls.quant_axis(scale)
if cls.flatten_dequantize_params:
scale = scale.flatten()
scale = to_0dim_if_scalar(scale)
if cls.flatten_dequantize_params:
zero_point = zero_point.flatten()
zp = to_0dim_if_scalar(zero_point)
zp = zp.expand_as(scale)
zp = cls.zero_point_with_dtype(is_signed, exponent_bit_width, mantissa_bit_width, zp)
return {
'scale': scale,
'zero_point': zp,
'axis': axis,
# We save only the scale original shape
# as zero-point is being expanded to the same
# size as the scale
'scale_orig_shape': scale_orig_shape}


class CDQCastProxyHandlerMixin(QuantAxisMixin, ClipMixin, ZeroPointHandlerMixin, CDQCastMixin, ABC):

def dequantize_symbolic_kwargs(cls, scale, zero_point, bit_width, is_signed):
@@ -133,6 +184,128 @@ def dequantize_symbolic_kwargs(cls, scale, zero_point, bit_width, is_signed):
'scale_orig_shape': scale_orig_shape}


class FloatQCDQCastWeightQuantProxyHandlerMixin(FloatQMixin, FloatCDQCastProxyHandlerMixin):
handled_layer = WeightFloatQuantProxyFromInjector

def quantize_symbolic_kwargs(
cls, scale, zero_point, exponent_bit_width, mantissa_bit_width, is_signed):
# compute axis before redefining scale
axis = cls.quant_axis(scale)
scale = to_0dim_if_scalar(scale.flatten())
zp = to_0dim_if_scalar(zero_point.flatten())
# expand_as must go after 0-dim check
zp = zp.expand_as(scale)
zp = cls.zero_point_with_dtype(is_signed, exponent_bit_width, mantissa_bit_width, zp)
if cls.itemize_quantize_scalar_params:
scale = to_item_if_0dim(scale)
zp = to_item_if_0dim(zp)
dtype = cls.signed_dtype(exponent_bit_width, mantissa_bit_width, is_signed)
return {'scale': scale, 'zero_point': zp, 'dtype': dtype, 'axis': axis}

def prepare_quantize_from_floating_point(self, module):
quant_weight = module.tracked_module_list[0].quant_weight()
scale = quant_weight.scale
self.scale_dtype = scale.dtype
if self.scale_dtype == torch.bfloat16 or self.scale_dtype == torch.float16:
scale = self.cast_fn(scale, torch.float32)
self.symbolic_kwargs['quantize_symbolic_kwargs'] = self.quantize_symbolic_kwargs(
scale,
quant_weight.zero_point,
quant_weight.exponent_bit_width,
quant_weight.mantissa_bit_width,
module.is_signed)

def prepare_quantize_from_integer(self, module):
int_weights = {
tm.weight.data_ptr(): tm.quant_weight().int(float_datatype=False)
for tm in module.tracked_module_list}
self.symbolic_kwargs['int_weights'] = int_weights

def prepare_for_export(self, module):
if module.is_quant_enabled:
self.validate(module)
if self._export_q_node:
self.prepare_quantize_from_floating_point(module)
else:
self.prepare_quantize_from_integer(module)
# Get the first quant weight as representative
quant_weight = module.tracked_module_list[0].quant_weight()

# (B)float16 is not supported with standard Q/DQ ops, thus we store the original dtype
# of the scale and we cast it to float32.
# The original dtype is then restored during the forward pass
scale = quant_weight.scale
self.scale_dtype = scale.dtype
if self.scale_dtype == torch.bfloat16 or self.scale_dtype == torch.float16:
scale = self.cast_fn(scale, torch.float32)

self.symbolic_kwargs['exponent_bit_width'] = quant_weight.exponent_bit_width
self.symbolic_kwargs['mantissa_bit_width'] = quant_weight.mantissa_bit_width
self.symbolic_kwargs['exponent_bias'] = quant_weight.exponent_bias
self.symbolic_kwargs['saturating'] = quant_weight.saturating
self.symbolic_kwargs['inf_values'] = quant_weight.inf_values
self.symbolic_kwargs['nan_values'] = quant_weight.nan_values
self.symbolic_kwargs['clip_symbolic_kwargs'] = self.clip_symbolic_kwargs(
module.is_narrow_range,
module.is_signed,
quant_weight.exponent_bit_width,
quant_weight.mantissa_bit_width)
self.symbolic_kwargs['dequantize_symbolic_kwargs'] = self.dequantize_symbolic_kwargs(
scale,
quant_weight.zero_point,
quant_weight.exponent_bit_width,
quant_weight.mantissa_bit_width,
module.is_signed)
else:
self.symbolic_kwargs = None

def quantize_from_floating_point(self, x: Tensor):
# Workaround for equal_cpu RuntimeError
quantize_symbolic_kwargs = self.symbolic_kwargs['quantize_symbolic_kwargs']
# Before quantization, cast input to float32
if self.scale_dtype == torch.float16 or self.scale_dtype == torch.bfloat16:
x = self.cast_fn(x, torch.float32)
x = self.quantize_fn(x, *quantize_symbolic_kwargs.values())
return x

def quantize_from_integer(self, x: Tensor):
return self.symbolic_kwargs['int_weights'][x.data_ptr()]

def symbolic_execution(self, x: Tensor):
assert self.symbolic_kwargs is not None, 'Symbolic execution requires quant to be enabled'

# Copy dict to allow for popping kwargs even on shared quantizers
dequantize_symbolic_kwargs = copy(self.symbolic_kwargs['dequantize_symbolic_kwargs'])
scale = dequantize_symbolic_kwargs['scale']
zero_point = dequantize_symbolic_kwargs['zero_point']

if self._export_q_node:
x = self.quantize_from_floating_point(x)
else:
x = self.quantize_from_integer(x)
clip_symbolic_kwargs = self.symbolic_kwargs['clip_symbolic_kwargs']
exponent_bit_width = self.symbolic_kwargs['exponent_bit_width']
mantissa_bit_width = self.symbolic_kwargs['mantissa_bit_width']
exponent_bias = self.symbolic_kwargs['exponent_bias']
saturating = self.symbolic_kwargs['saturating']
inf_values = self.symbolic_kwargs['inf_values']
nan_values = self.symbolic_kwargs['nan_values']
scale_orig_shape = dequantize_symbolic_kwargs.pop('scale_orig_shape')
# Workaround to trick the tracer into believing all return values are used
self.assert_ge_zero(scale, exponent_bit_width, mantissa_bit_width)
if clip_symbolic_kwargs is not None:
x = self.clip_fn(x, *clip_symbolic_kwargs.values())
x = self.dequantize_fn(x, *dequantize_symbolic_kwargs.values())
# After dequantization, cast both input and scale to the correct dtype
if self.scale_dtype == torch.float16 or self.scale_dtype == torch.bfloat16:
x = self.cast_fn(x, self.scale_dtype)
scale = self.cast_fn(scale, self.scale_dtype)
# Restore the original shapes to guarantee correct shape propagation downstream
scale = scale.view(scale_orig_shape)
zero_point = zero_point.view_as(scale)
return x, scale, zero_point, exponent_bit_width, mantissa_bit_width, exponent_bias, saturating, inf_values, nan_values


class QCDQCastWeightQuantProxyHandlerMixin(QMixin, CDQCastProxyHandlerMixin):
handled_layer = WeightQuantProxyFromInjector

@@ -251,6 +424,99 @@ def symbolic_execution(self, x: Tensor, input_bit_width: torch.Tensor, input_is_
return super().symbolic_execution(x)


class FloatQCDQCastActQuantProxyHandlerMixin(FloatQMixin, FloatCDQCastProxyHandlerMixin, ABC):
handled_layer = ActFloatQuantProxyFromInjector

def quantize_symbolic_kwargs(
cls, scale, zero_point, exponent_bit_width, mantissa_bit_width, is_signed):
# compute axis before redefining scale
axis = cls.quant_axis(scale)
scale = to_0dim_if_scalar(scale.flatten())
zp = to_0dim_if_scalar(zero_point.flatten())
# expand_as must go after 0-dim check
zp = zp.expand_as(scale)
zp = cls.zero_point_with_dtype(is_signed, exponent_bit_width, mantissa_bit_width, zp)
if cls.itemize_quantize_scalar_params:
scale = to_item_if_0dim(scale)
zp = to_item_if_0dim(zp)
dtype = cls.signed_dtype(exponent_bit_width, mantissa_bit_width, is_signed)
return {'scale': scale, 'zero_point': zp, 'dtype': dtype, 'axis': axis}

def prepare_for_export(self, module):
if module.is_quant_enabled:
self.validate(module)
self.symbolic_kwargs['exponent_bit_width'] = module.exponent_bit_width()
self.symbolic_kwargs['mantissa_bit_width'] = module.mantissa_bit_width()
self.symbolic_kwargs['exponent_bias'] = module.exponent_bias()
self.symbolic_kwargs['saturating'] = module.saturating()
self.symbolic_kwargs['inf_values'] = module.inf_values()
self.symbolic_kwargs['nan_values'] = module.nan_values()

# (B)float16 is not supported with standard Q/DQ ops, thus we store the original dtype
# of the scale and we cast it to float32.
# The original dtype is then restored during the forward pass
scale = module.scale()
self.scale_dtype = scale.dtype
if self.scale_dtype == torch.bfloat16 or self.scale_dtype == torch.float16:
scale = self.cast_fn(scale, torch.float32)

self.symbolic_kwargs['quantize_symbolic_kwargs'] = self.quantize_symbolic_kwargs(
scale,
module.zero_point(),
module.exponent_bit_width(),
module.mantissa_bit_width(),
module.is_signed)
self.symbolic_kwargs['dequantize_symbolic_kwargs'] = self.dequantize_symbolic_kwargs(
scale,
module.zero_point(),
module.exponent_bit_width(),
module.mantissa_bit_width(),
module.is_signed)
self.symbolic_kwargs['clip_symbolic_kwargs'] = self.clip_symbolic_kwargs(
module.is_narrow_range,
module.is_signed,
module.exponent_bit_width(),
module.mantissa_bit_width())

else:
self.symbolic_kwargs = None

def symbolic_execution(self, x: Tensor):
assert self.symbolic_kwargs is not None, 'Symbolic execution requires quant to be enabled'

# Copy dict to allow for popping kwargs even on shared quantizers
dequantize_symbolic_kwargs = copy(self.symbolic_kwargs['dequantize_symbolic_kwargs'])
scale = dequantize_symbolic_kwargs['scale']
zero_point = dequantize_symbolic_kwargs['zero_point']
scale_orig_shape = dequantize_symbolic_kwargs.pop('scale_orig_shape')

quantize_symbolic_kwargs = self.symbolic_kwargs['quantize_symbolic_kwargs']
clip_symbolic_kwargs = self.symbolic_kwargs['clip_symbolic_kwargs']
exponent_bit_width = self.symbolic_kwargs['exponent_bit_width']
mantissa_bit_width = self.symbolic_kwargs['mantissa_bit_width']
exponent_bias = self.symbolic_kwargs['exponent_bias']
saturating = self.symbolic_kwargs['saturating']
inf_values = self.symbolic_kwargs['inf_values']
nan_values = self.symbolic_kwargs['nan_values']

self.assert_ge_zero(scale, exponent_bit_width, mantissa_bit_width)
# If original dtype of the input is (b)float16, cast the input to float32
if x.dtype == torch.float16 or x.dtype == torch.bfloat16:
x = self.cast_fn(x, torch.float32)
x = self.quantize_fn(x, *quantize_symbolic_kwargs.values())
if clip_symbolic_kwargs is not None:
x = self.clip_fn(x, *clip_symbolic_kwargs.values())
x = self.dequantize_fn(x, *dequantize_symbolic_kwargs.values())
# After dequantization, cast both output and scale to the correct dtype
if self.scale_dtype == torch.float16 or self.scale_dtype == torch.bfloat16:
x = self.cast_fn(x, self.scale_dtype)
scale = self.cast_fn(scale, self.scale_dtype)
# Restore the original shapes to guarantee correct shape propagation downstream
scale = scale.view(scale_orig_shape)
zero_point = zero_point.view_as(scale)
return x, scale, zero_point, exponent_bit_width, mantissa_bit_width, exponent_bias, saturating, inf_values, nan_values


class QCDQCastActQuantProxyHandlerMixin(QMixin, CDQCastProxyHandlerMixin, ABC):
handled_layer = ActQuantProxyFromInjector

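To close, a simplified reference of the dtype workaround the new handlers apply around the Q/DQ pattern, assuming a symmetric FP8 e4m3 quantizer with the zero-point omitted: a (b)float16 scale is cast to float32 before quantization and the original dtype is restored after dequantization, as in the prepare_for_export/symbolic_execution pair above. This is an illustrative sketch of the numerics, not the exported ONNX graph.

    import torch

    def fp8_qcdq_reference(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
        # Standard Q/DQ ops do not support (b)float16, so cast to float32 first.
        orig_dtype = scale.dtype
        if orig_dtype in (torch.float16, torch.bfloat16):
            x = x.to(torch.float32)
            scale = scale.to(torch.float32)
        # Quantize: divide by the scale and cast to float8 e4m3.
        q = (x / scale).to(torch.float8_e4m3fn)
        # Dequantize: cast back to float32 and rescale.
        deq = q.to(torch.float32) * scale
        # Restore the original dtype, as symbolic_execution does for the output and scale.
        return deq.to(orig_dtype)

    y = fp8_qcdq_reference(
        torch.randn(4, 4, dtype=torch.bfloat16), torch.tensor(0.1, dtype=torch.bfloat16))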