From d56869288f68f5551beca144dabbfcf6dae6df92 Mon Sep 17 00:00:00 2001 From: "Gorban, Igor" Date: Thu, 8 Aug 2024 20:55:07 +0000 Subject: [PATCH] Fix float fdiv BiF implementation . --- .../lib/BiF/Library/Math/F32/fdiv.cpp | 155 ++++++++++++------ .../test/GenXBuiltinFunctions/fdiv_f32.ll | 24 ++- 2 files changed, 112 insertions(+), 67 deletions(-) diff --git a/IGC/VectorCompiler/lib/BiF/Library/Math/F32/fdiv.cpp b/IGC/VectorCompiler/lib/BiF/Library/Math/F32/fdiv.cpp index 7ff5b6c897c8..f1d66bcd973d 100644 --- a/IGC/VectorCompiler/lib/BiF/Library/Math/F32/fdiv.cpp +++ b/IGC/VectorCompiler/lib/BiF/Library/Math/F32/fdiv.cpp @@ -52,34 +52,44 @@ CM_NODEBUG CM_INLINE mask check_special(vector a, } template CM_NODEBUG CM_INLINE mask check_is_nan(vector q) { - vector q_int = q.template format(); + vector q_abs = math::absolute(q.cl_vector()); + vector q_int = q_abs.template format(); return (q_int > exp_bitmask); } template -CM_NODEBUG CM_INLINE mask check_is_denormak(vector q) { +CM_NODEBUG CM_INLINE mask check_is_denormal(vector q) { vector q_int = q.template format(); return ((q_int & exp_bitmask) == 0); } template CM_NODEBUG CM_INLINE vector normalize_exp(vector a) { + // normalize (scale by 2^32) vector local_s = normalize_bitmask; vector local_s_f = local_s.template format(); auto local_a = a * local_s_f; - // normalize (scale by 2^32) return local_a; } template -CM_NODEBUG CM_INLINE vector get_exp_diff(vector a, - vector b) { - vector a_abs = math::absolute(a.cl_vector()); - vector b_abs = math::absolute(b.cl_vector()); - vector a_int = a_abs.template format(); - vector b_int = a_abs.template format(); +CM_NODEBUG CM_INLINE static vector +get_exp_diff(vector a, vector b) { + vector a_abs = math::absolute(a); + vector b_abs = math::absolute(b); + vector a_int = a_abs.template format(); + vector b_int = b_abs.template format(); vector diff = a_int - b_int; - return diff >> exp_shift; + diff = diff >> exp_shift; + return diff; +} + +template +CM_NODEBUG CM_NOINLINE static cl_vector +__impl_div_ieee_step_7__rtz_(cl_vector cr1, cl_vector cy3, + cl_vector cq1) { + vector r1 = cr1, y3 = cy3, q1 = cq1; + return math::mad(r1, y3, q1).cl_vector(); } template @@ -87,8 +97,7 @@ CM_NODEBUG CM_INLINE vector cut_mantissa_and_sign(vector a, unsigned or_data) { vector a_int = a.template format(); auto mantissa_and_sign = a_int & (mantissa_bitmask | sign_bit); - vector zero = or_data; - mantissa_and_sign |= zero; + mantissa_and_sign |= or_data; return mantissa_and_sign.template format(); } @@ -110,10 +119,21 @@ template CM_NODEBUG CM_INLINE vector or_with_pred(vector a, mask mask) { vector a_int = a.template format(); - a_int |= mask; + vector mask_or = 0x1; + mask_or.merge(0x0, mask); + a_int |= mask_or; return a_int.template format(); } +template +CM_NODEBUG CM_INLINE vector or_with_pred(vector a_int, + mask mask) { + vector mask_or = 0x1; + mask_or.merge(0x0, mask); + a_int |= mask_or; + return a_int; +} + template CM_NODEBUG CM_INLINE vector __impl_fdiv_ieee_special_div(vector a, vector b, @@ -141,7 +161,7 @@ __impl_fdiv_ieee_special_div(vector a, vector b, // a is Inf? auto a_isInf = (exp_a == exp_mask) & (filled_out == 0); // return NaN_Indef if y is also Inf; Inf with proper sign otherwise - result.merge(xor_sign(a, sgn_x) - xor_sign(b, sgn_y), a_isInf); + result.merge(xor_sign(a, sgn_y) - xor_sign(b, sgn_x), a_isInf); filled_out |= a_isInf; auto exp_b = get_exp(b); @@ -152,9 +172,9 @@ __impl_fdiv_ieee_special_div(vector a, vector b, filled_out |= b_isInf; // b is 0? 
- mask b_isZero = (xor_sign(b, sgn_y) == 0.0f) & (filled_out == 0); + mask b_isZero = (b_abs == 0.0f) & (filled_out == 0); if (b_isZero.any()) { - mask a_isZero = (xor_sign(a, sgn_x) == 0.0f); + mask a_isZero = (a_abs == 0.0f); auto mq = xor_sign(b, exp_bitmask); mq.merge(xor_sign(mq, sgn_x), (a_isZero == 0)); mq.merge(a * mq, a_isZero); @@ -162,21 +182,13 @@ __impl_fdiv_ieee_special_div(vector a, vector b, filled_out |= b_isZero; } // a is 0? - mask a_isZero = (xor_sign(a, sgn_x) == 0.0f) & (filled_out == 0); + mask a_isZero = (a_abs == 0.0f) & (filled_out == 0); result.merge(a * b, a_isZero); filled_out |= a_isZero; return result; } -template -CM_NODEBUG CM_NOINLINE static cl_vector -__impl_div_ieee_step_7__rtz_(cl_vector cr1, cl_vector cy3, - cl_vector cq1) { - vector r1 = cr1, y3 = cy3, q1 = cq1; - return math::mad(r1, y3, q1).cl_vector(); -} - template CM_NODEBUG CM_INLINE vector __impl_fdiv_ieee_undeflow_case(vector mr1, vector y1, @@ -190,7 +202,7 @@ __impl_fdiv_ieee_undeflow_case(vector mr1, vector y1, // is normal quotient exact? // test whether r1==0 // auto inexact_bit = ((mr1.w << 1)) ? 1 : 0; - mask r1_is_not_zero = mr1 != 0.0; + mask r1_is_zero = mr1 == 0.0f; // shift amount is -(ediff5+126)=-itmp1 // shift_amount >= 1 @@ -199,31 +211,53 @@ __impl_fdiv_ieee_undeflow_case(vector mr1, vector y1, // if shift_amount>1, then inexact bit is OR-ed into sticky bit (last mantissa // bit) for round-to-nearest and shift_amount==1, last mantissa bit is a round // bit and inexact_bit is used to complete the rounding - mask ammount_ge1 = (shift_amount > 1) | r1_is_not_zero; + mask ammount_ge1 = (shift_amount > 1) | r1_is_zero; if (ammount_ge1.any()) { // signed mantissa of quotient - auto mq_u = cut_mantissa_and_sign(mq, exp_float_min); - mq_u = or_with_pred(mq_u, (r1_is_not_zero == 0)); + auto mq_u_local = cut_mantissa_and_sign(mq, exp_float_min); + mq_u_local = or_with_pred(mq_u_local, (r1_is_zero == 0)); // perform scaling in user rounding mode - mq_u = mq_u * scaling_factor(itmp1); - result.merge(mq_u, ammount_ge1); + mq_u_local = mq_u_local * scaling_factor(itmp1); + result.merge(mq_u_local, ammount_ge1); } // mantissa auto mq_u = cut_mantissa(mq); auto sgn_q = get_sign(mq); // will shift result by 2 + + vector mq_u_uint = mq_u.template format(); + mq_u_uint = (mq_u_uint << 1) | sgn_q; + mq_u_uint = or_with_pred(mq_u_uint, (r1_is_zero == 0)); + + vector s_e2 = 0x3e800000; auto s_e1 = sgn_q | exp_float_min; - // force UF flag to be set + mask mq_u_isDenormal = (mq_u_uint & exp_float_min) == 0; + vector s_e3 = 0x00c00000; + // ensure mq_u.f is not denormal (so it is not flushed to zero) + s_e3.merge(0x00a00000, mq_u_isDenormal); + mq_u_uint |= exp_float_min; + // add sign to shifted leading bit + correction + s_e3 |= sgn_q; + vector s_e3_float = s_e3.template format(); + mq = mq_u_uint.template format(); + + vector s_e2_f = s_e2.template format(); + mq = math::mad(mq, s_e2_f, s_e3_float); + // eliminate leading bit (but UF flag will not be set) vector s_e1_float = s_e1.template format(); + mq = mq - s_e1_float; + + // force UF flag to be set s_e1_float *= s_e1_float; vector mq_u_int = s_e1_float.template format(); // artificial dependency - sgn_q |= (mq_u_int & mantissa_loss); - vector sgn_q_float = sgn_q.template format(); - result.merge(sgn_q_float, ammount_ge1 == 0); + vector mq_uint = mq.template format(); + mq_uint |= (mq_u_int & mantissa_loss); + mq = mq_uint.template format(); + result.merge(mq, ammount_ge1 == 0); return result; } @@ -232,9 +266,10 @@ CM_NODEBUG CM_INLINE vector 
__impl_fdiv_ieee_long_path(vector a, vector b) { vector result; + vector ediff5; + auto expon_x = get_exp(a); auto expon_y = get_exp(b); - vector ediff5; // filter out Inf/NaN, zeroes, denormals mask special_div = check_special(a, b); @@ -242,21 +277,22 @@ __impl_fdiv_ieee_long_path(vector a, vector b) { if (special_div.any()) { auto special_div_res = __impl_fdiv_ieee_special_div(a, b, filled_out); result.merge(special_div_res, filled_out); - + if (filled_out.all()) + return result; + special_div &= (filled_out == 0); // for denormal inputs, zeroes, NaNs, Inf // a denormal, or b denormal // initialize scale exponents - vector isx = 0; vector isy = 0; - auto a_isDenormal = check_is_denormak(a) & special_div; + auto a_isDenormal = check_is_denormal(a) & special_div; if (a_isDenormal.any()) { a.merge(normalize_exp(a), a_isDenormal); expon_x.merge(get_exp(a) - 32, a_isDenormal); isx.merge(32, a_isDenormal); } - auto b_isDenormal = check_is_denormak(b) & special_div; + auto b_isDenormal = check_is_denormal(b) & special_div; if (b_isDenormal.any()) { b.merge(normalize_exp(b), b_isDenormal); expon_y.merge(get_exp(b) - 32, b_isDenormal); @@ -264,28 +300,27 @@ __impl_fdiv_ieee_long_path(vector a, vector b) { } // used to detect gradual underflow ediff5.merge(get_exp_diff(a, b) - isx + isy, special_div); - // signed mantissas, needed for long path computation - // return to long computation path } + // signed mantissas, needed for long path computation + // return to long computation path + vector ma = cut_mantissa_and_sign(a, exp_float_zero); + vector mb = cut_mantissa_and_sign(b, exp_float_zero); ediff5.merge(get_exp_diff(a, b), (special_div == 0)); - vector e_diff = expon_x - expon_y; - // signed mantissas - auto ma = cut_mantissa_and_sign(a, exp_float_zero); - auto mb = cut_mantissa_and_sign(b, exp_float_zero); + vector ediff = expon_x - expon_y; // will check whether quotient is in gradual underflow range vector itmp1 = ediff5 + 126; vector itmp2 = ediff5 + 126 + 25; vector ig_uf = itmp1 & (~itmp2); + mask gradual_underflow = ig_uf < 0; - mask gradual_underflow = (ediff5 < 126 + 25) & (ediff5 >= 101); - const vector one = 1.0f; // perform division on signed mantissas: ma.f/mb.f auto y0 = math::reciprocal(mb); // Step(1), q0=a*y0 auto q0 = ma * y0; // Step(2), e0=(1-b*y0) + const vector one = 1.0f; auto e0 = math::mad(-mb, y0, one); // Step(3), y1=y0+e0*y0 auto y1 = math::mad(e0, y0, y0); @@ -306,7 +341,21 @@ __impl_fdiv_ieee_long_path(vector a, vector b) { // Step(7), q=q1+r1*y1 auto mq = math::mad(mr1, y1, q1); - vector se_diff = e_diff.template format(); + // scale by 2^exponent_q + // split exponent difference into smaller parts (so that scale factors can be + // represented in SP format) three scale factors are needed when inputs may be + // denormal limit range of e_diff, so that two scale factors are sufficient + // (to avoid incorrect flag settings) + mask fix_exp = (ediff + 126) > (126 * 2 + 126); + if (fix_exp.any()) { + vector sgn_e = ediff.template format(); + sgn_e.merge(0, fix_exp == 0); + sgn_e = sgn_e >> 11; + // set e_diff to 124*2 with proper sign + ediff.merge(sgn_e ^ (124 + 124 + sgn_e), fix_exp); + } + + vector se_diff = ediff.template format(); auto es1 = (se_diff) >> 1; auto es2 = se_diff - es1; @@ -316,8 +365,6 @@ __impl_fdiv_ieee_long_path(vector a, vector b) { umq += (ues1 << exp_shift); // one more scale factor auto s_e2 = scaling_factor(es2); - - // result mq = umq.template format(); mq *= s_e2; @@ -337,8 +384,10 @@ CM_NODEBUG CM_INLINE vector __impl_fdiv_ieee(vector 
a, // check exponent ranges // Main path will be taken for expon_x in [bias-62, bias+63] and expon_y in // [bias-63, bias+63] - mask x_long_path = (expon_x + 62 - exp_bias) >= (64 + 62); - mask y_long_path = (expon_y + 63 - exp_bias) >= (64 + 63); + auto exp_x_bias = math::absolute((expon_x + 62 - exp_bias).cl_vector()); + auto exp_y_bias = math::absolute((expon_y + 63 - exp_bias).cl_vector()); + mask x_long_path = exp_x_bias >= (64 + 62); + mask y_long_path = exp_y_bias >= (64 + 63); mask long_path = x_long_path | y_long_path; if (long_path.any()) diff --git a/IGC/VectorCompiler/test/GenXBuiltinFunctions/fdiv_f32.ll b/IGC/VectorCompiler/test/GenXBuiltinFunctions/fdiv_f32.ll index 14dc78fa1ee2..ab8ef2697147 100644 --- a/IGC/VectorCompiler/test/GenXBuiltinFunctions/fdiv_f32.ll +++ b/IGC/VectorCompiler/test/GenXBuiltinFunctions/fdiv_f32.ll @@ -6,28 +6,24 @@ ; ;============================ end_copyright_notice ============================= -; RUN: %opt %use_old_pass_manager% -enable-debugify -GenXBuiltinFunctions \ -; RUN: -march=genx64 -mtriple=spir64-unknown-unknown -mcpu=XeLPG -S < %s 2>&1 \ -; RUN: | FileCheck %s +; RUN: %opt %use_old_pass_manager% -GenXBuiltinFunctions -march=genx64 \ +; RUN: -vc-builtins-bif-path=%VC_BUILTINS_BIF_XeLPG% -mcpu=XeLPG \ +; RUN: -mtriple=spir64-unknown-unknown -S < %s 2>&1 | FileCheck %s -; CHECK-NOT: WARNING -; CHECK: CheckModuleDebugify: PASS +; RUN: %opt %use_old_pass_manager% -GenXBuiltinFunctions -march=genx64 \ +; RUN: -vc-builtins-bif-path=%VC_BUILTINS_BIF_XeHPC% -mcpu=XeHPC \ +; RUN: -mtriple=spir64-unknown-unknown -S < %s 2>&1 | FileCheck %s \ +; RUN: --check-prefix=CHECK-NOEMU ; Function Attrs: nofree nosync nounwind readnone declare <32 x float> @llvm.genx.ieee.div.v32f32(<32 x float>, <32 x float>) define dllexport spir_kernel void @test_kernel(<32 x float> %l, <32 x float> %r) { ; CHECK: = fdiv <32 x float> %l, %r + ; CHECK-NOEMU: = fdiv <32 x float> %l, %r %1 = fdiv <32 x float> %l, %r ; CHECK: = call <32 x float> @__vc_builtin_fdiv_v32f32(<32 x float> %l, <32 x float> %r) + ; CHECK-NOEMU: = call <32 x float> @llvm.genx.ieee.div.v32f32 %2 = call <32 x float> @llvm.genx.ieee.div.v32f32(<32 x float> %l, <32 x float> %r) ret void -} - -; COM: The presence of these __vc_builtin_* funcitions is a HACK to trick VC -; COM: backend into thinking that we have built-in routines -define <32 x float> @__vc_builtin_fdiv_v32f32(<32 x float> %l, <32 x float> %r) #0 { - ret <32 x float> zeroinitializer -} - -attributes #0 = { "VC.Builtin" } \ No newline at end of file +} \ No newline at end of file
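
Note on the bit-level classification helpers touched by this patch: for IEEE-754 binary32, a value is NaN exactly when its exponent field is all ones and its mantissa is non-zero, and it is denormal (or zero) exactly when its exponent field is all zeroes. The NaN test must therefore drop the sign bit before comparing against the exponent mask, which is what the patched check_is_nan does via math::absolute; the denormal test needs no such step because exp_bitmask already excludes the sign bit. A minimal scalar sketch follows (reference helpers only, not the BiF code; the names are illustrative):

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

static inline uint32_t f32_bits(float q) {
  uint32_t bits;
  std::memcpy(&bits, &q, sizeof(bits));
  return bits;
}

// NaN: all exponent bits set and a non-zero mantissa. Without clearing the
// sign bit first, any negative float (e.g. -1.0f = 0xBF800000) compares
// greater than 0x7F800000 and would be misreported as NaN.
static inline bool ref_is_nan(float q) {
  return (f32_bits(q) & 0x7FFFFFFFu) > 0x7F800000u;
}

// Denormal or zero: exponent field is all zeroes; the mask already excludes
// the sign bit, so no absolute value is needed here.
static inline bool ref_is_denormal_or_zero(float q) {
  return (f32_bits(q) & 0x7F800000u) == 0u;
}

int main() {
  assert(!ref_is_nan(-1.0f) && ref_is_nan(std::nanf("")));
  assert(ref_is_denormal_or_zero(0.0f) && ref_is_denormal_or_zero(1e-45f));
  assert(!ref_is_denormal_or_zero(1.0f));
  return 0;
}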
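
The long path divides the signed mantissas with a reciprocal estimate refined by fused multiply-adds (the Step(1)..Step(7) comments: q0 = a*y0, e0 = 1 - b*y0, y1 = y0 + e0*y0, ..., q = q1 + r1*y1). The sketch below illustrates that style of refinement in scalar double precision; the intermediate steps not visible in this diff, the rounding-mode control, and the exponent scaling are assumptions here, not the actual BiF sequence:

#include <cmath>
#include <cstdio>

static double refined_div(double a, double b) {
  double y0 = 1.0 / b;               // stand-in for the reciprocal estimate
  double q0 = a * y0;                // Step(1): q0 = a*y0
  double e0 = std::fma(-b, y0, 1.0); // Step(2): e0 = 1 - b*y0
  double y1 = std::fma(e0, y0, y0);  // Step(3): y1 = y0 + e0*y0
  double r0 = std::fma(-b, q0, a);   // residual of the initial quotient
  double q1 = std::fma(r0, y1, q0);  // corrected quotient
  double r1 = std::fma(-b, q1, a);   // residual of the corrected quotient
  return std::fma(r1, y1, q1);       // Step(7): q = q1 + r1*y1
}

int main() {
  std::printf("%.17g\n", refined_div(1.0, 3.0));
  return 0;
}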