From 996c05f4d26e7720a1593c530fc7ca784dbe1722 Mon Sep 17 00:00:00 2001
From: gamzeisl
Date: Tue, 24 Sep 2024 18:07:16 +0200
Subject: [PATCH] [PyITA] Move GELU functions

---
 PyITA/ITA.py                        | 173 +---------------------------
 PyITA/gelu.py                       | 122 ++++++++++++++++++++
 PyITA/{test_ITA.py => test_gelu.py} |  29 +++++
 PyITA/util.py                       |  77 ++++++++++++-
 4 files changed, 229 insertions(+), 172 deletions(-)
 create mode 100644 PyITA/gelu.py
 rename PyITA/{test_ITA.py => test_gelu.py} (89%)

diff --git a/PyITA/ITA.py b/PyITA/ITA.py
index be809ae..85e55d8 100644
--- a/PyITA/ITA.py
+++ b/PyITA/ITA.py
@@ -32,11 +32,10 @@ from numpy.typing import ArrayLike, DTypeLike
 
 
 from .softmax import fastSoftmax, realSoftmax, streamingPartialSoftmax
+from .gelu import gelu_requantize, i_gelu_requantized, get_i_gelu_constants, get_i_gelu_requantized_constants
 from .util import (generate_matrix_mem, pack_8b_to_word, pack_array_8b_to_word, pack_hex_24b, pack_multihead_8b_to_word,
                    pack_multihead_24b_to_word, random_shuffled_tensor, requantize, split_matrix, to_hex, write_matrix,
-                   write_matrix_mem, write_matrix_mem_hex, write_vector_mem_hex)
-from typing import Optional, Tuple
-from numpy import int8 as i8, int16 as i16, int32 as i32, float32 as f32, uint8 as u8, uint16 as u16
+                   write_matrix_mem, write_matrix_mem_hex, write_vector_mem_hex, get_almost_symmetric_scaling_factor)
 
 
 class Transformer:
@@ -971,174 +970,6 @@ def export_numpy(self):
                          rqs_add = self.requant_add)
 
 
-def round(x: f32, n_bits: int = 8):
-    x_clip = np.clip(x, -2**(n_bits - 1), 2**(n_bits - 1) - 1)
-    return np.floor(x_clip + 0.5 + np.finfo(f32).eps).astype(int)
-
-
-def clip(x: f32, n_bits: int = 8) -> f32:
-    return np.clip(x, -2**(n_bits - 1), 2**(n_bits - 1) - 1)
-
-
-def round_and_clip(x: f32, n_bits: int = 8) -> f32:
-    x_rounded = np.floor(x + 0.5 + np.finfo(f32).eps)
-    x_clipped = clip(x_rounded, n_bits)
-    return x_clipped
-
-
-def round_to_i8(x: f32) -> i8:
-    x_rounded_clipped: f32 = round_and_clip(x, 8)
-    return x_rounded_clipped.astype(i8)
-
-
-def round_to_u8(x: f32) -> u8:
-    x_rounded_clipped: f32 = round_and_clip(x, 8)
-    return x_rounded_clipped.astype(u8)
-
-
-def round_to_i16(x: f32) -> i16:
-    x_rounded_clipped: f32 = round_and_clip(x, 16)
-    return x_rounded_clipped.astype(i16)
-
-
-def i_gelu(q: i8, q_1: i16, q_b: i16, q_c: i16) -> i32:
-    q_clipped = max(q, -2**7 + 1)
-    q_erf: i32 = i_erf(q_clipped, q_b, q_c)
-    q_out: i32 = q_clipped * (q_erf + q_1)
-    return q_out
-
-
-def gelu_requantize(q: i32, eps_mul: i8, eps_shift: u8, eps_add: u8) -> i8:
-    q_mul: i64 = eps_mul * q
-    shifted: f32 = q_mul / 2**float(eps_shift) + eps_add
-    q_req: i8 = round_to_i8(shifted)
-    return q_req
-
-
-def i_gelu_requantized(q: i8, q_1: i16, q_b: i16, q_c: i16, eps_mul: u8, eps_shift: u8, eps_add: u8) -> i8:
-    q_out: i32 = i_gelu(q, q_1, q_b, q_c)
-    q_req: i8 = gelu_requantize(q_out, eps_mul, eps_shift, eps_add)
-    return q_req
-
-
-def get_i_gelu_constants(S: f32) -> Tuple[i16, i16, i16, float, float, float]:
-    a: float = -0.2888
-    b: float = -1.769
-    c: float = 1
-    S_2: f32 = S / np.sqrt(2)
-    q_1: i16 = round_to_i16(1 / (a * S_2**2))
-    q_b: i16 = round_to_i16(b / S_2)
-    q_c: i16 = round_to_i16(c / (a * S_2**2))
-    return q_1, q_b, q_c, a, b, c
-
-
-def get_i_gelu_requantized_constants(S: f32, D: i32) -> Tuple[i16, i16, i16, float, float, float, u8, u8, u8, f32]:
-    q_1, q_b, q_c, a, b, c = get_i_gelu_constants(S)
-    S_2: f32 = S / np.sqrt(2)
-    S_out: f32 = S * a * S_2**2 / 2
-    # Flip sign of eps_mul to ensure its positive
-    eps_mul: u8 = round_to_u8(-S_out / S * D)
-    eps_shift: u8 = round_to_i8(np.log2(D))
-    eps_add: u8 = 0
-    # Compensate for the sign flip in eps_mul by negating S
-    return q_1, q_b, q_c, a, b, c, eps_mul, eps_shift, eps_add, -S
-
-
-def i_gelu_wrapper(q: i8, S: f32) -> Tuple[i32, f32]:
-    S_2: f32 = S / np.sqrt(2)
-    q_1, q_b, q_c, a, _, _ = get_i_gelu_constants(S)
-    q_out: i32 = i_gelu(q, q_1, q_b, q_c)
-    S_out: f32 = S * a * S_2**2 / 2
-    return q_out, S_out
-
-
-def i_gelu_wrapper_requantized(q: i8, S: f32, D: i32) -> Tuple[i8, f32]:
-    q_1, q_b, q_c, a, _, _, eps_mul, eps_shift, eps_add, S_out = get_i_gelu_requantized_constants(S, D)
-    q_out: i32 = i_gelu_requantized(q, q_1, q_b, q_c, eps_mul, eps_shift, eps_add)
-    return q_out, S_out
-
-
-def i_erf(q: i8, q_b: i16, q_c: i16) -> i32:
-    q_sgn: i8 = np.sign(q)
-    q_abs: i8 = np.abs(q)
-    q_clipped: i8 = np.clip(q_abs, 0, -q_b)
-    q_L: i32 = i_poly(q_clipped, q_b, q_c)
-    q_out: i32 = q_sgn * q_L
-    return q_out
-
-
-def i_erf_wrapper(q: i8, S: i8) -> Tuple[i32, f32]:
-    a: float = -0.2888
-    b: float = -1.769
-    c: float = 1
-    q_b: i16 = round_to_i16(b / S)
-    q_c: i16 = round_to_i16(c / (a * S**2))
-    S_out: f32 = a * S**2
-    q_out: i32 = i_erf(q, q_b, q_c)
-    return q_out, S_out
-
-
-def i_poly(q: i8, q_b: i16, q_c: i16) -> i32:
-    q16: i16 = q.astype(i16)
-    q_c32: i32 = q_c.astype(i32)
-    d: i16 = q16 + q_b
-    d_sq: i16 = d**2
-    q_out: i32 = d_sq + q_c32
-    return q_out.astype(i32)
-
-
-def i_poly_wrapper(q: i8, S: f32, a: f32, b: f32, c: f32) -> Tuple[i32, f32]:
-    q_b: i16 = round_to_i16(b / S)
-    q_c: i16 = round_to_i16(c / (a * S**2))
-    S_out: f32 = a * S**2
-    q_out: i32 = i_poly(q, q_b, q_c)
-    return q_out, S_out
-
-
-def get_scaling_factor(alpha: f32, n_bits: int = 8) -> f32:
-    S: f32 = alpha / (2**(n_bits - 1) - 1)
-    return S
-
-
-def quantize(activations: np.ndarray, alpha: f32, n_bits: int = 8, S: Optional[f32] = None) -> Tuple[np.ndarray, f32]:
-    x_q = np.clip(activations, -alpha, alpha)
-    if S is None:
-        S = get_scaling_factor(alpha, n_bits)
-    x_q = x_q / S
-    x_q = np.array(list(map(round, x_q)))
-    return x_q, S
-
-
-def dequantize(quantized_activations: np.ndarray, alpha: f32, n_bits: int = 8) -> np.ndarray:
-    S = get_scaling_factor(alpha, n_bits)
-    activations = quantized_activations * S
-    return activations
-
-
-def get_almost_symmetric_scaling_factor(clip_lo: f32, n_bits: int = 8) -> Tuple[f32, f32]:
-    if 2**n_bits == 2:
-        return 1
-    n_levels = 2**n_bits
-    scale = (-n_levels + 2) / n_levels
-    clip_hi = clip_lo * scale
-    S = clip_hi / (n_levels / 2 - 1)
-    return S, clip_hi
-
-
-def almost_symmetric_quantize(activations: np.ndarray, clip_lo: f32, n_bits: int = 8) -> Tuple[np.ndarray, f32]:
-    S, clip_hi = get_almost_symmetric_scaling_factor(clip_lo, n_bits)
-    x_q = np.clip(activations, clip_lo, clip_hi)
-    x_q = x_q / S
-    x_q = np.array(list(map(round, x_q)))
-    return x_q, S
-
-
-def almost_symmetric_dequantize(quantized_activations: np.ndarray, clip_lo: f32, n_bits: int = 8) -> np.ndarray:
-    S, _ = get_almost_symmetric_scaling_factor(clip_lo, n_bits)
-    activations = quantized_activations * S
-    return activations
-
-
 def generateTestVectors(path, **kwargs):
     s = kwargs['S']
     p = kwargs['P']

diff --git a/PyITA/gelu.py b/PyITA/gelu.py
new file mode 100644
index 0000000..034d041
--- /dev/null
+++ b/PyITA/gelu.py
@@ -0,0 +1,122 @@
+# ----------------------------------------------------------------------
+#
+# File: gelu.py
+#
+# Last edited: 24.09.2024
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from .util import (round_to_i8, round_to_u8, round_to_i16)
+from typing import Tuple
+from numpy import int8 as i8, int16 as i16, int32 as i32, int64 as i64, float32 as f32, uint8 as u8, uint16 as u16
+
+
+def i_gelu(q: i8, q_1: i16, q_b: i16, q_c: i16) -> i32:
+    q_clipped = max(q, -2**7 + 1)
+    q_erf: i32 = i_erf(q_clipped, q_b, q_c)
+    q_out: i32 = q_clipped * (q_erf + q_1)
+    return q_out
+
+
+def gelu_requantize(q: i32, eps_mul: u8, eps_shift: u8, eps_add: u8) -> i8:
+    q_mul: i64 = eps_mul * q
+    shifted: f32 = q_mul / 2**float(eps_shift) + eps_add
+    q_req: i8 = round_to_i8(shifted)
+    return q_req
+
+
+def i_gelu_requantized(q: i8, q_1: i16, q_b: i16, q_c: i16, eps_mul: u8, eps_shift: u8, eps_add: u8) -> i8:
+    q_out: i32 = i_gelu(q, q_1, q_b, q_c)
+    q_req: i8 = gelu_requantize(q_out, eps_mul, eps_shift, eps_add)
+    return q_req
+
+
+def get_i_gelu_constants(S: f32) -> Tuple[i16, i16, i16, float, float, float]:
+    a: float = -0.2888
+    b: float = -1.769
+    c: float = 1
+    S_2: f32 = S / np.sqrt(2)
+    q_1: i16 = round_to_i16(1 / (a * S_2**2))
+    q_b: i16 = round_to_i16(b / S_2)
+    q_c: i16 = round_to_i16(c / (a * S_2**2))
+    return q_1, q_b, q_c, a, b, c
+
+
+def get_i_gelu_requantized_constants(S: f32, D: i32) -> Tuple[i16, i16, i16, float, float, float, u8, u8, u8, f32]:
+    q_1, q_b, q_c, a, b, c = get_i_gelu_constants(S)
+    S_2: f32 = S / np.sqrt(2)
+    S_out: f32 = S * a * S_2**2 / 2
+    # Flip the sign of eps_mul to ensure it is positive
+    eps_mul: u8 = round_to_u8(-S_out / S * D)
+    eps_shift: u8 = round_to_u8(np.log2(D))
+    eps_add: u8 = 0
+    # Compensate for the sign flip in eps_mul by negating S
+    return q_1, q_b, q_c, a, b, c, eps_mul, eps_shift, eps_add, -S
+
+
+def i_gelu_wrapper(q: i8, S: f32) -> Tuple[i32, f32]:
+    S_2: f32 = S / np.sqrt(2)
+    q_1, q_b, q_c, a, _, _ = get_i_gelu_constants(S)
+    q_out: i32 = i_gelu(q, q_1, q_b, q_c)
+    S_out: f32 = S * a * S_2**2 / 2
+    return q_out, S_out
+
+
+def i_gelu_wrapper_requantized(q: i8, S: f32, D: i32) -> Tuple[i8, f32]:
+    q_1, q_b, q_c, a, _, _, eps_mul, eps_shift, eps_add, S_out = get_i_gelu_requantized_constants(S, D)
+    q_out: i32 = i_gelu_requantized(q, q_1, q_b, q_c, eps_mul, eps_shift, eps_add)
+    return q_out, S_out
+
+
+def i_erf(q: i8, q_b: i16, q_c: i16) -> i32:
+    q_sgn: i8 = np.sign(q)
+    q_abs: i8 = np.abs(q)
+    q_clipped: i8 = np.clip(q_abs, 0, -q_b)
+    q_L: i32 = i_poly(q_clipped, q_b, q_c)
+    q_out: i32 = q_sgn * q_L
+    return q_out
+
+
+def i_erf_wrapper(q: i8, S: i8) -> Tuple[i32, f32]:
+    a: float = -0.2888
+    b: float = -1.769
+    c: float = 1
+    q_b: i16 = round_to_i16(b / S)
+    q_c: i16 = round_to_i16(c / (a * S**2))
+    S_out: f32 = a * S**2
+    q_out: i32 = i_erf(q, q_b, q_c)
+    return q_out, S_out
+
+
+def i_poly(q: i8, q_b: i16, q_c: i16) -> i32:
+    q16: i16 = q.astype(i16)
+    q_c32: i32 = q_c.astype(i32)
+    d: i16 = q16 + q_b
+    d_sq: i16 = d**2
+    q_out: i32 = d_sq + q_c32
+    return q_out.astype(i32)
+
+
+def i_poly_wrapper(q: i8, S: f32, a: f32, b: f32, c: f32) -> Tuple[i32, f32]:
+    q_b: i16 = round_to_i16(b / S)
+    q_c: i16 = round_to_i16(c / (a * S**2))
+    S_out: f32 = a * S**2
+    q_out: i32 = i_poly(q, q_b, q_c)
+    return q_out, S_out

diff --git a/PyITA/test_ITA.py b/PyITA/test_gelu.py
similarity index 89%
rename from PyITA/test_ITA.py
rename to PyITA/test_gelu.py
index a9f4722..fb1804a 100644
--- a/PyITA/test_ITA.py
+++ b/PyITA/test_gelu.py
@@ -1,3 +1,28 @@
+# ----------------------------------------------------------------------
+#
+# File: test_gelu.py
+#
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+#
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This file tests the integer quantization of the GELU function.
+
 import pytest
 import torch
 import numpy as np
@@ -6,6 +31,8 @@
 import matplotlib.pyplot as plt
 import seaborn as sns
 
+from .util import *
+from .gelu import *
 from .ITA import *
 
 N_SAMPLES = 75
@@ -41,6 +68,8 @@ def plot(data: pd.DataFrame, title: str, quantized_y_label: str, expected_y_labe
     ax.set_xlabel('$x$')
     ax.set_ylabel('Value')
     filename = os.path.join(plot_dir, f'{title}.png')
+    if not os.path.exists(plot_dir):
+        os.makedirs(plot_dir)
     plt.savefig(filename)
 
 
diff --git a/PyITA/util.py b/PyITA/util.py
index db1253b..a8526bf 100644
--- a/PyITA/util.py
+++ b/PyITA/util.py
@@ -25,11 +25,12 @@
 # limitations under the License.
 
 import os
-from typing import SupportsIndex, Tuple, Union
+from typing import Optional, SupportsIndex, Tuple, Union
 
 import numpy as np
 from numpy.typing import DTypeLike
+from numpy import int8 as i8, int16 as i16, int32 as i32, float32 as f32, uint8 as u8, uint16 as u16
 
 
 def random_shuffled_tensor(shape, bitwidth: int, type: DTypeLike = np.int8, scaling = 1 / 4) -> np.ndarray:
     """
@@ -450,3 +451,77 @@ def split_matrix(m: np.ndarray, block_shape: Tuple[SupportsIndex, SupportsIndex]
         return res
     else:
         raise ValueError("Matrix must be 2D")
+
+
+def round(x: f32, n_bits: int = 8):
+    x_clip = np.clip(x, -2**(n_bits - 1), 2**(n_bits - 1) - 1)
+    return np.floor(x_clip + 0.5 + np.finfo(f32).eps).astype(int)
+
+
+def clip(x: f32, n_bits: int = 8) -> f32:
+    return np.clip(x, -2**(n_bits - 1), 2**(n_bits - 1) - 1)
+
+
+def round_and_clip(x: f32, n_bits: int = 8) -> f32:
+    x_rounded = np.floor(x + 0.5 + np.finfo(f32).eps)
+    x_clipped = clip(x_rounded, n_bits)
+    return x_clipped
+
+
+def round_to_i8(x: f32) -> i8:
+    x_rounded_clipped: f32 = round_and_clip(x, 8)
+    return x_rounded_clipped.astype(i8)
+
+
+def round_to_u8(x: f32) -> u8:
+    x_rounded_clipped: f32 = round_and_clip(x, 8)
+    return x_rounded_clipped.astype(u8)
+
+
+def round_to_i16(x: f32) -> i16:
+    x_rounded_clipped: f32 = round_and_clip(x, 16)
+    return x_rounded_clipped.astype(i16)
+
+
+def get_scaling_factor(alpha: f32, n_bits: int = 8) -> f32:
+    S: f32 = alpha / (2**(n_bits - 1) - 1)
+    return S
+
+
+def quantize(activations: np.ndarray, alpha: f32, n_bits: int = 8, S: Optional[f32] = None) -> Tuple[np.ndarray, f32]:
+    x_q = np.clip(activations, -alpha, alpha)
+    if S is None:
+        S = get_scaling_factor(alpha, n_bits)
+    x_q = x_q / S
+    x_q = np.array(list(map(round, x_q)))
+    return x_q, S
+
+
+def dequantize(quantized_activations: np.ndarray, alpha: f32, n_bits: int = 8) -> np.ndarray:
+    S = get_scaling_factor(alpha, n_bits)
+    activations = quantized_activations * S
+    return activations
+
+
+def get_almost_symmetric_scaling_factor(clip_lo: f32, n_bits: int = 8) -> Tuple[f32, f32]:
+    if n_bits == 1:
+        return 1.0, clip_lo
+    n_levels = 2**n_bits
+    scale = (-n_levels + 2) / n_levels
+    clip_hi = clip_lo * scale
+    S = clip_hi / (n_levels / 2 - 1)
+    return S, clip_hi
+
+
+def almost_symmetric_quantize(activations: np.ndarray, clip_lo: f32, n_bits: int = 8) -> Tuple[np.ndarray, f32]:
+    S, clip_hi = get_almost_symmetric_scaling_factor(clip_lo, n_bits)
+    x_q = np.clip(activations, clip_lo, clip_hi)
+    x_q = x_q / S
+    x_q = np.array(list(map(round, x_q)))
+    return x_q, S
+
+
+def almost_symmetric_dequantize(quantized_activations: np.ndarray, clip_lo: f32, n_bits: int = 8) -> np.ndarray:
+    S, _ = get_almost_symmetric_scaling_factor(clip_lo, n_bits)
+    activations = quantized_activations * S
+    return activations
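
The moved GELU path can be exercised end to end: quantize a float input with util.quantize, push each 8-bit code through gelu.i_gelu_wrapper_requantized, and dequantize with the returned output scale. The sketch below is illustrative and not part of the patch: it assumes PyITA is importable, and the clipping range alpha = 4.0 and power-of-two requantization divisor D = 2**20 are assumed values (chosen so that eps_mul fits into a uint8).

    # Minimal usage sketch for the relocated GELU helpers (illustrative
    # values, not part of the patch).
    import numpy as np
    from math import erf

    from PyITA.util import quantize
    from PyITA.gelu import i_gelu_wrapper_requantized

    alpha = 4.0   # assumed activation clipping range
    D = 2**20     # assumed power-of-two requantization divisor

    x = np.linspace(-3.0, 3.0, 7).astype(np.float32)
    x_q, S = quantize(x, alpha)   # 8-bit integer codes and their scale S

    # Run the requantized integer GELU per code; every call returns the same S_out.
    outs = [i_gelu_wrapper_requantized(np.int8(q), S, D) for q in x_q]
    S_out = outs[0][1]
    y_int = np.array([q_out for q_out, _ in outs], dtype=np.float64) * S_out

    # Float reference: GELU(x) = x/2 * (1 + erf(x / sqrt(2)))
    y_ref = x / 2 * (1 + np.array([erf(v / np.sqrt(2)) for v in x]))
    print(np.abs(y_int - y_ref).max())   # small polynomial-approximation error

Note that i_gelu_wrapper_requantized returns a negated scale to compensate for the sign flip applied to eps_mul, so multiplying the output codes by the returned S_out already lands back in the float domain.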
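The almost-symmetric helpers moved into util.py define an 8-bit grid whose negative clipping bound lands exactly on code -128: from clip_lo they derive clip_hi = clip_lo * (2 - n_levels) / n_levels and a step S = clip_hi / (n_levels / 2 - 1), which for n_bits = 8 reduces to S = -clip_lo / 128. A round-trip sketch with an assumed bound clip_lo = -4.0 (again illustrative, not part of the patch):

    # Round-trip sketch for the almost-symmetric quantizer; clip_lo = -4.0 is
    # an assumed clipping bound, not a value prescribed by the patch.
    import numpy as np

    from PyITA.util import (almost_symmetric_dequantize, almost_symmetric_quantize,
                            get_almost_symmetric_scaling_factor)

    clip_lo = -4.0
    S, clip_hi = get_almost_symmetric_scaling_factor(clip_lo)
    print(S, clip_hi)   # 0.03125 (= -clip_lo / 128) and 3.96875 (= 127 * S)

    x = np.array([-5.0, clip_lo, -0.016, 0.0, 1.0, clip_hi, 10.0], dtype=np.float32)
    x_q, _ = almost_symmetric_quantize(x, clip_lo)
    print(x_q)          # codes in [-128, 127]; clip_lo -> -128, clip_hi -> 127

    x_hat = almost_symmetric_dequantize(x_q, clip_lo)
    print(np.abs(x_hat - np.clip(x, clip_lo, clip_hi)).max())   # at most S / 2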