Skip to content

Commit

Permalink
Fix float fdiv BiF implementation
Browse files Browse the repository at this point in the history
.
  • Loading branch information
igorban-intel authored and igcbot committed Aug 8, 2024
1 parent 277e089 commit d568692
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 67 deletions.
155 changes: 102 additions & 53 deletions IGC/VectorCompiler/lib/BiF/Library/Math/F32/fdiv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,43 +52,52 @@ CM_NODEBUG CM_INLINE mask<N> check_special(vector<float, N> a,
}

template <int N> CM_NODEBUG CM_INLINE mask<N> check_is_nan(vector<float, N> q) {
vector<uint32_t, N> q_int = q.template format<uint32_t>();
vector<float, N> q_abs = math::absolute(q.cl_vector());
vector<uint32_t, N> q_int = q_abs.template format<uint32_t>();
return (q_int > exp_bitmask);
}

template <int N>
CM_NODEBUG CM_INLINE mask<N> check_is_denormak(vector<float, N> q) {
CM_NODEBUG CM_INLINE mask<N> check_is_denormal(vector<float, N> q) {
vector<uint32_t, N> q_int = q.template format<uint32_t>();
return ((q_int & exp_bitmask) == 0);
}

template <int N>
CM_NODEBUG CM_INLINE vector<float, N> normalize_exp(vector<float, N> a) {
// normalize (scale by 2^32)
vector<uint32_t, N> local_s = normalize_bitmask;
vector<float, N> local_s_f = local_s.template format<float>();
auto local_a = a * local_s_f;
// normalize (scale by 2^32)
return local_a;
}

template <int N>
CM_NODEBUG CM_INLINE vector<int32_t, N> get_exp_diff(vector<float, N> a,
vector<float, N> b) {
vector<float, N> a_abs = math::absolute(a.cl_vector());
vector<float, N> b_abs = math::absolute(b.cl_vector());
vector<uint32_t, N> a_int = a_abs.template format<uint32_t>();
vector<uint32_t, N> b_int = a_abs.template format<uint32_t>();
CM_NODEBUG CM_INLINE static vector<int32_t, N>
get_exp_diff(vector<float, N> a, vector<float, N> b) {
vector<float, N> a_abs = math::absolute(a);
vector<float, N> b_abs = math::absolute(b);
vector<int32_t, N> a_int = a_abs.template format<int32_t>();
vector<int32_t, N> b_int = b_abs.template format<int32_t>();
vector<int32_t, N> diff = a_int - b_int;
return diff >> exp_shift;
diff = diff >> exp_shift;
return diff;
}

template <int N>
CM_NODEBUG CM_NOINLINE static cl_vector<float, N>
__impl_div_ieee_step_7__rtz_(cl_vector<float, N> cr1, cl_vector<float, N> cy3,
cl_vector<float, N> cq1) {
vector<float, N> r1 = cr1, y3 = cy3, q1 = cq1;
return math::mad(r1, y3, q1).cl_vector();
}

template <int N>
CM_NODEBUG CM_INLINE vector<float, N> cut_mantissa_and_sign(vector<float, N> a,
unsigned or_data) {
vector<uint32_t, N> a_int = a.template format<uint32_t>();
auto mantissa_and_sign = a_int & (mantissa_bitmask | sign_bit);
vector<uint32_t, N> zero = or_data;
mantissa_and_sign |= zero;
mantissa_and_sign |= or_data;
return mantissa_and_sign.template format<float>();
}

Expand All @@ -110,10 +119,21 @@ template <int N>
CM_NODEBUG CM_INLINE vector<float, N> or_with_pred(vector<float, N> a,
mask<N> mask) {
vector<uint32_t, N> a_int = a.template format<uint32_t>();
a_int |= mask;
vector<uint32_t, N> mask_or = 0x1;
mask_or.merge(0x0, mask);
a_int |= mask_or;
return a_int.template format<float>();
}

template <int N>
CM_NODEBUG CM_INLINE vector<uint32_t, N> or_with_pred(vector<uint32_t, N> a_int,
mask<N> mask) {
vector<uint32_t, N> mask_or = 0x1;
mask_or.merge(0x0, mask);
a_int |= mask_or;
return a_int;
}

template <int N>
CM_NODEBUG CM_INLINE vector<float, N>
__impl_fdiv_ieee_special_div(vector<float, N> a, vector<float, N> b,
Expand Down Expand Up @@ -141,7 +161,7 @@ __impl_fdiv_ieee_special_div(vector<float, N> a, vector<float, N> b,
// a is Inf?
auto a_isInf = (exp_a == exp_mask) & (filled_out == 0);
// return NaN_Indef if y is also Inf; Inf with proper sign otherwise
result.merge(xor_sign(a, sgn_x) - xor_sign(b, sgn_y), a_isInf);
result.merge(xor_sign(a, sgn_y) - xor_sign(b, sgn_x), a_isInf);
filled_out |= a_isInf;

auto exp_b = get_exp(b);
Expand All @@ -152,31 +172,23 @@ __impl_fdiv_ieee_special_div(vector<float, N> a, vector<float, N> b,
filled_out |= b_isInf;

// b is 0?
mask<N> b_isZero = (xor_sign(b, sgn_y) == 0.0f) & (filled_out == 0);
mask<N> b_isZero = (b_abs == 0.0f) & (filled_out == 0);
if (b_isZero.any()) {
mask<N> a_isZero = (xor_sign(a, sgn_x) == 0.0f);
mask<N> a_isZero = (a_abs == 0.0f);
auto mq = xor_sign(b, exp_bitmask);
mq.merge(xor_sign(mq, sgn_x), (a_isZero == 0));
mq.merge(a * mq, a_isZero);
result.merge(mq, b_isZero);
filled_out |= b_isZero;
}
// a is 0?
mask<N> a_isZero = (xor_sign(a, sgn_x) == 0.0f) & (filled_out == 0);
mask<N> a_isZero = (a_abs == 0.0f) & (filled_out == 0);
result.merge(a * b, a_isZero);
filled_out |= a_isZero;

return result;
}

template <int N>
CM_NODEBUG CM_NOINLINE static cl_vector<float, N>
__impl_div_ieee_step_7__rtz_(cl_vector<float, N> cr1, cl_vector<float, N> cy3,
cl_vector<float, N> cq1) {
vector<float, N> r1 = cr1, y3 = cy3, q1 = cq1;
return math::mad(r1, y3, q1).cl_vector();
}

template <int N>
CM_NODEBUG CM_INLINE vector<float, N>
__impl_fdiv_ieee_undeflow_case(vector<float, N> mr1, vector<float, N> y1,
Expand All @@ -190,7 +202,7 @@ __impl_fdiv_ieee_undeflow_case(vector<float, N> mr1, vector<float, N> y1,
// is normal quotient exact?
// test whether r1==0
// auto inexact_bit = ((mr1.w << 1)) ? 1 : 0;
mask<N> r1_is_not_zero = mr1 != 0.0;
mask<N> r1_is_zero = mr1 == 0.0f;

// shift amount is -(ediff5+126)=-itmp1
// shift_amount >= 1
Expand All @@ -199,31 +211,53 @@ __impl_fdiv_ieee_undeflow_case(vector<float, N> mr1, vector<float, N> y1,
// if shift_amount>1, then inexact bit is OR-ed into sticky bit (last mantissa
// bit) for round-to-nearest and shift_amount==1, last mantissa bit is a round
// bit and inexact_bit is used to complete the rounding
mask<N> ammount_ge1 = (shift_amount > 1) | r1_is_not_zero;
mask<N> ammount_ge1 = (shift_amount > 1) | r1_is_zero;
if (ammount_ge1.any()) {
// signed mantissa of quotient
auto mq_u = cut_mantissa_and_sign(mq, exp_float_min);
mq_u = or_with_pred(mq_u, (r1_is_not_zero == 0));
auto mq_u_local = cut_mantissa_and_sign(mq, exp_float_min);
mq_u_local = or_with_pred(mq_u_local, (r1_is_zero == 0));
// perform scaling in user rounding mode
mq_u = mq_u * scaling_factor(itmp1);
result.merge(mq_u, ammount_ge1);
mq_u_local = mq_u_local * scaling_factor(itmp1);
result.merge(mq_u_local, ammount_ge1);
}

// mantissa
auto mq_u = cut_mantissa(mq);
auto sgn_q = get_sign(mq);
// will shift result by 2

vector<uint32_t, N> mq_u_uint = mq_u.template format<uint32_t>();
mq_u_uint = (mq_u_uint << 1) | sgn_q;
mq_u_uint = or_with_pred(mq_u_uint, (r1_is_zero == 0));

vector<uint32_t, N> s_e2 = 0x3e800000;
auto s_e1 = sgn_q | exp_float_min;

// force UF flag to be set
mask<N> mq_u_isDenormal = (mq_u_uint & exp_float_min) == 0;
vector<uint32_t, N> s_e3 = 0x00c00000;
// ensure mq_u.f is not denormal (so it is not flushed to zero)
s_e3.merge(0x00a00000, mq_u_isDenormal);
mq_u_uint |= exp_float_min;
// add sign to shifted leading bit + correction
s_e3 |= sgn_q;
vector<float, N> s_e3_float = s_e3.template format<float>();
mq = mq_u_uint.template format<float>();

vector<float, N> s_e2_f = s_e2.template format<float>();
mq = math::mad(mq, s_e2_f, s_e3_float);
// eliminate leading bit (but UF flag will not be set)
vector<float, N> s_e1_float = s_e1.template format<float>();
mq = mq - s_e1_float;

// force UF flag to be set
s_e1_float *= s_e1_float;
vector<uint32_t, N> mq_u_int = s_e1_float.template format<uint32_t>();

// artificial dependency
sgn_q |= (mq_u_int & mantissa_loss);
vector<float, N> sgn_q_float = sgn_q.template format<float>();
result.merge(sgn_q_float, ammount_ge1 == 0);
vector<uint32_t, N> mq_uint = mq.template format<uint32_t>();
mq_uint |= (mq_u_int & mantissa_loss);
mq = mq_uint.template format<float>();
result.merge(mq, ammount_ge1 == 0);
return result;
}

Expand All @@ -232,60 +266,61 @@ CM_NODEBUG CM_INLINE vector<float, N>
__impl_fdiv_ieee_long_path(vector<float, N> a, vector<float, N> b) {

vector<float, N> result;
vector<int32_t, N> ediff5;

auto expon_x = get_exp(a);
auto expon_y = get_exp(b);
vector<int32_t, N> ediff5;

// filter out Inf/NaN, zeroes, denormals
mask<N> special_div = check_special(a, b);
mask<N> filled_out = false;
if (special_div.any()) {
auto special_div_res = __impl_fdiv_ieee_special_div(a, b, filled_out);
result.merge(special_div_res, filled_out);

if (filled_out.all())
return result;
special_div &= (filled_out == 0);
// for denormal inputs, zeroes, NaNs, Inf
// a denormal, or b denormal
// initialize scale exponents

vector<uint32_t, N> isx = 0;
vector<uint32_t, N> isy = 0;

auto a_isDenormal = check_is_denormak(a) & special_div;
auto a_isDenormal = check_is_denormal(a) & special_div;
if (a_isDenormal.any()) {
a.merge(normalize_exp(a), a_isDenormal);
expon_x.merge(get_exp(a) - 32, a_isDenormal);
isx.merge(32, a_isDenormal);
}
auto b_isDenormal = check_is_denormak(b) & special_div;
auto b_isDenormal = check_is_denormal(b) & special_div;
if (b_isDenormal.any()) {
b.merge(normalize_exp(b), b_isDenormal);
expon_y.merge(get_exp(b) - 32, b_isDenormal);
isy.merge(32, b_isDenormal);
}
// used to detect gradual underflow
ediff5.merge(get_exp_diff(a, b) - isx + isy, special_div);
// signed mantissas, needed for long path computation
// return to long computation path
}
// signed mantissas, needed for long path computation
// return to long computation path
vector<float, N> ma = cut_mantissa_and_sign(a, exp_float_zero);
vector<float, N> mb = cut_mantissa_and_sign(b, exp_float_zero);

ediff5.merge(get_exp_diff(a, b), (special_div == 0));
vector<uint32_t, N> e_diff = expon_x - expon_y;
// signed mantissas
auto ma = cut_mantissa_and_sign(a, exp_float_zero);
auto mb = cut_mantissa_and_sign(b, exp_float_zero);
vector<uint32_t, N> ediff = expon_x - expon_y;

// will check whether quotient is in gradual underflow range
vector<int32_t, N> itmp1 = ediff5 + 126;
vector<int32_t, N> itmp2 = ediff5 + 126 + 25;
vector<int32_t, N> ig_uf = itmp1 & (~itmp2);
mask<N> gradual_underflow = ig_uf < 0;

mask<N> gradual_underflow = (ediff5 < 126 + 25) & (ediff5 >= 101);
const vector<float, N> one = 1.0f;
// perform division on signed mantissas: ma.f/mb.f
auto y0 = math::reciprocal(mb);
// Step(1), q0=a*y0
auto q0 = ma * y0;
// Step(2), e0=(1-b*y0)
const vector<float, N> one = 1.0f;
auto e0 = math::mad(-mb, y0, one);
// Step(3), y1=y0+e0*y0
auto y1 = math::mad(e0, y0, y0);
Expand All @@ -306,7 +341,21 @@ __impl_fdiv_ieee_long_path(vector<float, N> a, vector<float, N> b) {
// Step(7), q=q1+r1*y1
auto mq = math::mad(mr1, y1, q1);

vector<int32_t, N> se_diff = e_diff.template format<int32_t>();
// scale by 2^exponent_q
// split exponent difference into smaller parts (so that scale factors can be
// represented in SP format) three scale factors are needed when inputs may be
// denormal limit range of e_diff, so that two scale factors are sufficient
// (to avoid incorrect flag settings)
mask<N> fix_exp = (ediff + 126) > (126 * 2 + 126);
if (fix_exp.any()) {
vector<int32_t, N> sgn_e = ediff.template format<int32_t>();
sgn_e.merge(0, fix_exp == 0);
sgn_e = sgn_e >> 11;
// set e_diff to 124*2 with proper sign
ediff.merge(sgn_e ^ (124 + 124 + sgn_e), fix_exp);
}

vector<int32_t, N> se_diff = ediff.template format<int32_t>();
auto es1 = (se_diff) >> 1;
auto es2 = se_diff - es1;

Expand All @@ -316,8 +365,6 @@ __impl_fdiv_ieee_long_path(vector<float, N> a, vector<float, N> b) {
umq += (ues1 << exp_shift);
// one more scale factor
auto s_e2 = scaling_factor(es2);

// result
mq = umq.template format<float>();
mq *= s_e2;

Expand All @@ -337,8 +384,10 @@ CM_NODEBUG CM_INLINE vector<float, N> __impl_fdiv_ieee(vector<float, N> a,
// check exponent ranges
// Main path will be taken for expon_x in [bias-62, bias+63] and expon_y in
// [bias-63, bias+63]
mask<N> x_long_path = (expon_x + 62 - exp_bias) >= (64 + 62);
mask<N> y_long_path = (expon_y + 63 - exp_bias) >= (64 + 63);
auto exp_x_bias = math::absolute((expon_x + 62 - exp_bias).cl_vector());
auto exp_y_bias = math::absolute((expon_y + 63 - exp_bias).cl_vector());
mask<N> x_long_path = exp_x_bias >= (64 + 62);
mask<N> y_long_path = exp_y_bias >= (64 + 63);
mask<N> long_path = x_long_path | y_long_path;

if (long_path.any())
Expand Down
24 changes: 10 additions & 14 deletions IGC/VectorCompiler/test/GenXBuiltinFunctions/fdiv_f32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,24 @@
;
;============================ end_copyright_notice =============================

; RUN: %opt %use_old_pass_manager% -enable-debugify -GenXBuiltinFunctions \
; RUN: -march=genx64 -mtriple=spir64-unknown-unknown -mcpu=XeLPG -S < %s 2>&1 \
; RUN: | FileCheck %s
; RUN: %opt %use_old_pass_manager% -GenXBuiltinFunctions -march=genx64 \
; RUN: -vc-builtins-bif-path=%VC_BUILTINS_BIF_XeLPG% -mcpu=XeLPG \
; RUN: -mtriple=spir64-unknown-unknown -S < %s 2>&1 | FileCheck %s

; CHECK-NOT: WARNING
; CHECK: CheckModuleDebugify: PASS
; RUN: %opt %use_old_pass_manager% -GenXBuiltinFunctions -march=genx64 \
; RUN: -vc-builtins-bif-path=%VC_BUILTINS_BIF_XeHPC% -mcpu=XeHPC \
; RUN: -mtriple=spir64-unknown-unknown -S < %s 2>&1 | FileCheck %s \
; RUN: --check-prefix=CHECK-NOEMU

; Function Attrs: nofree nosync nounwind readnone
declare <32 x float> @llvm.genx.ieee.div.v32f32(<32 x float>, <32 x float>)

define dllexport spir_kernel void @test_kernel(<32 x float> %l, <32 x float> %r) {
; CHECK: = fdiv <32 x float> %l, %r
; CHECK-NOEMU: = fdiv <32 x float> %l, %r
%1 = fdiv <32 x float> %l, %r
; CHECK: = call <32 x float> @__vc_builtin_fdiv_v32f32(<32 x float> %l, <32 x float> %r)
; CHECK-NOEMU: = call <32 x float> @llvm.genx.ieee.div.v32f32
%2 = call <32 x float> @llvm.genx.ieee.div.v32f32(<32 x float> %l, <32 x float> %r)
ret void
}

; COM: The presence of these __vc_builtin_* funcitions is a HACK to trick VC
; COM: backend into thinking that we have built-in routines
define <32 x float> @__vc_builtin_fdiv_v32f32(<32 x float> %l, <32 x float> %r) #0 {
ret <32 x float> zeroinitializer
}

attributes #0 = { "VC.Builtin" }
}

0 comments on commit d568692

Please sign in to comment.