From 84754d4d4c4748ce3ea5f8dc592a966654d86fdc Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 30 Dec 2024 06:49:15 +0100 Subject: [PATCH 01/25] Fancy macros which cannot be much fancier without breaking the osaca parser --- numerics/sin_cos.cpp | 113 ++++++++++++++++++++++++++++++++----------- numerics/sin_cos.hpp | 4 +- 2 files changed, 86 insertions(+), 31 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index 7bd4567711..c2d1982d30 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -16,8 +16,61 @@ #if PRINCIPIA_USE_OSACA_SIN || PRINCIPIA_USE_OSACA_COS #include "intel/iacaMarks.h" +static bool iaca_breaker = false; +#define IACA_FUNCTION_DOUBLE(arg) \ + volatile double iaca_result = arg; \ + IACA_VC64_START; \ + IACA_LOOP: \ + arg = iaca_result +#define IACA_RETURN(result) \ + iaca_result = result; \ + if (iaca_breaker) { \ + goto IACA_END_LOOP; \ + } else { \ + goto IACA_LOOP; \ + } +#define IACA_FUNCTION_END \ + goto IACA_LOOP; \ + IACA_END_LOOP: \ + IACA_VC64_END; \ + return iaca_result +#define IACA_IF(condition) \ + if constexpr (volatile bool IACA_computed_condition = (condition); \ + [] { UNDER_IACA_HYPOTHESES(return (condition)); }()) + +#define UNDER_IACA_HYPOTHESES(statement) \ + do { \ + constexpr bool UseHardwareFMA = true; \ + constexpr double θ = 0.1; \ + /* From argument reduction. */ \ + constexpr double n_double = θ * (2 / π); \ + constexpr double reduction_value = θ - n_double * π_over_2_high; \ + constexpr double reduction_error = n_double * π_over_2_low; \ + /* Used to determine whether a better argument reduction is needed. */ \ + constexpr DoublePrecision θ_reduced = \ + QuickTwoDifference(reduction_value, reduction_error); \ + /* Used in Sin to detect the near-0 case. */ \ + constexpr double abs_x = \ + θ_reduced.value > 0 ? θ_reduced.value : -θ_reduced.value; \ + /* Used throughout the top-level functions. */ \ + constexpr std::int64_t quadrant = \ + static_cast(n_double) & 0b11; \ + /* Used in DetectDangerousRounding. */ \ + constexpr double normalized_error = 0; \ + /* Not NaN is the only part that matters; used at the end of the top-level \ + * functions to determine whether to call the slow path. */ \ + constexpr double value = 1; \ + { statement; } \ + } while (false) + +#else +#define IACA_FUNCTION_DOUBLE(arg) (double const arg) +#define IACA_RETURN(result) return result +#define IACA_IF (condition) if (condition) +#define UNDER_IACA_HYPOTHESES(statement) #endif + namespace principia { namespace numerics { namespace _sin_cos { @@ -75,8 +128,8 @@ inline std::int64_t AccurateTableIndex(double const abs_x) { template double FusedMultiplyAdd(double const a, double const b, double const c) { - if ((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) || - (fma_policy == FMAPolicy::Auto && UseHardwareFMA)) { + IACA_IF ((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) || + (fma_policy == FMAPolicy::Auto && UseHardwareFMA)) { using quantities::_elementary_functions::FusedMultiplyAdd; return FusedMultiplyAdd(a, b, c); } else { @@ -86,8 +139,8 @@ double FusedMultiplyAdd(double const a, double const b, double const c) { template double FusedNegatedMultiplyAdd(double const a, double const b, double const c) { - if ((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) || - (fma_policy == FMAPolicy::Auto && UseHardwareFMA)) { + IACA_IF ((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) || + (fma_policy == FMAPolicy::Auto && UseHardwareFMA)) { using quantities::_elementary_functions::FusedNegatedMultiplyAdd; return FusedNegatedMultiplyAdd(a, b, c); } else { @@ -110,7 +163,8 @@ inline double DetectDangerousRounding(double const x, double const Δx) { _mm_castsi128_pd(_mm_sub_epi64(error_128i, value_exponent_128i))); // TODO(phl): Error analysis to refine the thresholds. Also, can we avoid // negative numbers? - if (normalized_error < -0x1.ffffp971 && normalized_error > -0x1.00008p972) { + IACA_IF (normalized_error < -0x1.ffffp971 && + normalized_error > -0x1.00008p972) { #if _DEBUG LOG(ERROR) << std::setprecision(25) << x << " " << std::hexfloat << value << " " << error << " " << normalized_error; @@ -124,12 +178,12 @@ inline double DetectDangerousRounding(double const x, double const Δx) { inline void Reduce(Argument const θ, DoublePrecision& θ_reduced, std::int64_t& quadrant) { - if (θ < π / 4 && θ > -π / 4) { + IACA_IF (θ < π / 4 && θ > -π / 4) { θ_reduced.value = θ; θ_reduced.error = 0; quadrant = 0; return; - } else if (θ <= π_over_2_threshold && θ >= -π_over_2_threshold) { + } else IACA_IF (θ <= π_over_2_threshold && θ >= -π_over_2_threshold) { // We are not very sensitive to rounding errors in this expression, because // in the worst case it could cause the reduced angle to jump from the // vicinity of π / 4 to the vicinity of -π / 4 with appropriate adjustment @@ -142,7 +196,7 @@ inline void Reduce(Argument const θ, Argument const error = n_double * π_over_2_low; θ_reduced = QuickTwoDifference(value, error); // TODO(phl): Error analysis needed to find the right bounds. - if (θ_reduced.value < -0x1.0p-30 || θ_reduced.value > 0x1.0p-30) { + IACA_IF (θ_reduced.value < -0x1.0p-30 || θ_reduced.value > 0x1.0p-30) { quadrant = n & 0b11; return; } @@ -180,7 +234,7 @@ Value SinImplementation(DoublePrecision const θ_reduced) { auto const& x = θ_reduced.value; auto const& e = θ_reduced.error; double const abs_x = std::abs(x); - if (abs_x < sin_near_zero_cutoff) { + IACA_IF (abs_x < sin_near_zero_cutoff) { double const x² = x * x; double const x³ = x² * x; double const x³_term = FusedMultiplyAdd( @@ -257,8 +311,8 @@ Value __cdecl Sin(Argument const θ) { std::int64_t quadrant; Reduce(θ, θ_reduced, quadrant); double value; - if (UseHardwareFMA) { - if (quadrant & 0b1) { + IACA_IF (UseHardwareFMA) { + IACA_IF (quadrant & 0b1) { value = CosImplementation(θ_reduced); } else { #if PRINCIPIA_USE_OSACA_SIN @@ -270,55 +324,56 @@ Value __cdecl Sin(Argument const θ) { #endif } } else { - if (quadrant & 0b1) { + IACA_IF (quadrant & 0b1) { value = CosImplementation(θ_reduced); } else { value = SinImplementation(θ_reduced); } } - if (value != value) { + IACA_IF (value != value) { return cr_sin(θ); - } else if (quadrant & 0b10) { + } + IACA_IF (quadrant & 0b10) { return -value; } else { return value; } } +namespace IACA_assumptions { +constexpr std::int64_t quadrant = 1; +} // namespace IACA_assumptions + #if PRINCIPIA_INLINE_SIN_COS FORCE_INLINE(inline) #endif -Value __cdecl Cos(Argument const θ) { +Value __cdecl Cos(Argument θ) { + IACA_FUNCTION_DOUBLE(θ); DoublePrecision θ_reduced; std::int64_t quadrant; Reduce(θ, θ_reduced, quadrant); double value; - if (UseHardwareFMA) { - if (quadrant & 0b1) { + IACA_IF (UseHardwareFMA) { + IACA_IF (quadrant & 0b1) { value = SinImplementation(θ_reduced); } else { -#if PRINCIPIA_USE_OSACA_COS - IACA_VC64_START; -#endif value = CosImplementation(θ_reduced); -#if PRINCIPIA_USE_OSACA_COS - IACA_VC64_END; -#endif } } else { - if (quadrant & 0b1) { + IACA_IF (quadrant & 0b1) { value = SinImplementation(θ_reduced); } else { value = CosImplementation(θ_reduced); } } - if (value != value) { - return cr_cos(θ); - } else if (quadrant == 1 || quadrant == 2) { - return -value; + IACA_IF (value != value) { + IACA_RETURN(cr_cos(θ)); + } else IACA_IF (quadrant == 1 || quadrant == 2) { + IACA_RETURN(-value); } else { - return value; + IACA_RETURN(value); } + IACA_FUNCTION_END; } } // namespace internal diff --git a/numerics/sin_cos.hpp b/numerics/sin_cos.hpp index 07a26ee280..4451c15830 100644 --- a/numerics/sin_cos.hpp +++ b/numerics/sin_cos.hpp @@ -7,9 +7,9 @@ namespace numerics { namespace _sin_cos { namespace internal { -#define PRINCIPIA_INLINE_SIN_COS 0 +#define PRINCIPIA_INLINE_SIN_COS 1 #define PRINCIPIA_USE_OSACA_SIN 0 -#define PRINCIPIA_USE_OSACA_COS 0 +#define PRINCIPIA_USE_OSACA_COS 1 #if PRINCIPIA_INLINE_SIN_COS FORCE_INLINE(inline) From 121bcede048e5b43a6c2bfd8230aeede0d645df8 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 00:17:38 +0100 Subject: [PATCH 02/25] Now it does not work --- numerics/numerics.vcxproj | 9 +++++++++ numerics/sin_cos.cpp | 37 ++++++++++++++++++------------------- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/numerics/numerics.vcxproj b/numerics/numerics.vcxproj index 6e3b3e9b8b..940f3d0125 100644 --- a/numerics/numerics.vcxproj +++ b/numerics/numerics.vcxproj @@ -10,6 +10,15 @@ + + + + + + + + + diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index c2d1982d30..4433ae573c 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -16,25 +16,24 @@ #if PRINCIPIA_USE_OSACA_SIN || PRINCIPIA_USE_OSACA_COS #include "intel/iacaMarks.h" -static bool iaca_breaker = false; -#define IACA_FUNCTION_DOUBLE(arg) \ - volatile double iaca_result = arg; \ - IACA_VC64_START; \ - IACA_LOOP: \ - arg = iaca_result -#define IACA_RETURN(result) \ - iaca_result = result; \ - if (iaca_breaker) { \ - goto IACA_END_LOOP; \ - } else { \ - goto IACA_LOOP; \ - } -#define IACA_FUNCTION_END \ - goto IACA_LOOP; \ - IACA_END_LOOP: \ - IACA_VC64_END; \ - return iaca_result -#define IACA_IF(condition) \ +static bool const iaca_loop_terminator = false; +#define IACA_FUNCTION_DOUBLE(arg) \ + double iaca_result = arg; \ + IACA_VC64_START; \ + IACA_LOOP : \ + arg = iaca_result +#define IACA_RETURN(result) IACA_RETURN_AT(__LINE__, result) +#define IACA_RETURN_AT(line, result) IACA_RETURN_AT2(line, result) +#define IACA_RETURN_AT2(line, result) \ + if (iaca_loop_terminator) { \ + goto IACA_END_LOOP_##line; \ + } \ + iaca_result = (result); \ + goto IACA_LOOP; \ + IACA_VC64_END; \ + IACA_END_LOOP_##line : return iaca_result +#define IACA_FUNCTION_END + #define IACA_IF(condition) \ if constexpr (volatile bool IACA_computed_condition = (condition); \ [] { UNDER_IACA_HYPOTHESES(return (condition)); }()) From 13be66bbdd6fbf4eea26ac1b5f999cb7059ea2ca Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 01:16:04 +0100 Subject: [PATCH 03/25] good enough --- numerics/sin_cos.cpp | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index 4433ae573c..bf2326b10c 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -16,24 +16,21 @@ #if PRINCIPIA_USE_OSACA_SIN || PRINCIPIA_USE_OSACA_COS #include "intel/iacaMarks.h" -static bool const iaca_loop_terminator = false; +static bool iaca_loop_terminator = false; #define IACA_FUNCTION_DOUBLE(arg) \ - double iaca_result = arg; \ + double iaca_loop_carry = arg; \ IACA_VC64_START; \ - IACA_LOOP : \ - arg = iaca_result -#define IACA_RETURN(result) IACA_RETURN_AT(__LINE__, result) -#define IACA_RETURN_AT(line, result) IACA_RETURN_AT2(line, result) -#define IACA_RETURN_AT2(line, result) \ - if (iaca_loop_terminator) { \ - goto IACA_END_LOOP_##line; \ - } \ - iaca_result = (result); \ - goto IACA_LOOP; \ - IACA_VC64_END; \ - IACA_END_LOOP_##line : return iaca_result -#define IACA_FUNCTION_END - #define IACA_IF(condition) \ + IACA_LOOP: \ + arg = iaca_loop_carry +#define IACA_RETURN(result) \ + iaca_loop_carry = (result); \ + if (!iaca_loop_terminator) { \ + goto IACA_LOOP; \ + } \ + volatile double iaca_result = iaca_loop_carry; \ + IACA_VC64_END; \ + return iaca_result +#define IACA_IF(condition) \ if constexpr (volatile bool IACA_computed_condition = (condition); \ [] { UNDER_IACA_HYPOTHESES(return (condition)); }()) @@ -372,7 +369,6 @@ Value __cdecl Cos(Argument θ) { } else { IACA_RETURN(value); } - IACA_FUNCTION_END; } } // namespace internal From fb5e651101b0875f154d7d3d0e922ed7d91a8822 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 01:30:22 +0100 Subject: [PATCH 04/25] both functions --- numerics/sin_cos.cpp | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index bf2326b10c..e1f5d6a6f0 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -14,14 +14,31 @@ #include "numerics/polynomial_evaluators.hpp" #include "quantities/elementary_functions.hpp" +#if PRINCIPIA_USE_OSACA_SIN +#define IACA_SIN_BEGIN IACA_FUNCTION_BEGIN +#define IACA_RETURN_SIN IACA_RETURN +#else +#define IACA_SIN_BEGIN(arg) +#define IACA_RETURN_SIN(result) return (result) +#endif + +#if PRINCIPIA_USE_OSACA_COS +#define IACA_COS_BEGIN IACA_FUNCTION_BEGIN +#define IACA_RETURN_COS IACA_RETURN +#else +#define IACA_COS_BEGIN(arg) +#define IACA_RETURN_COS(result) return (result) +#endif + #if PRINCIPIA_USE_OSACA_SIN || PRINCIPIA_USE_OSACA_COS #include "intel/iacaMarks.h" static bool iaca_loop_terminator = false; -#define IACA_FUNCTION_DOUBLE(arg) \ +#define IACA_FUNCTION_BEGIN(arg) \ double iaca_loop_carry = arg; \ IACA_VC64_START; \ IACA_LOOP: \ arg = iaca_loop_carry + #define IACA_RETURN(result) \ iaca_loop_carry = (result); \ if (!iaca_loop_terminator) { \ @@ -60,10 +77,7 @@ static bool iaca_loop_terminator = false; } while (false) #else -#define IACA_FUNCTION_DOUBLE(arg) (double const arg) -#define IACA_RETURN(result) return result #define IACA_IF (condition) if (condition) -#define UNDER_IACA_HYPOTHESES(statement) #endif @@ -303,6 +317,7 @@ Value CosImplementation(DoublePrecision const θ_reduced) { FORCE_INLINE(inline) #endif Value __cdecl Sin(Argument const θ) { + IACA_SIN_BEGIN(θ); DoublePrecision θ_reduced; std::int64_t quadrant; Reduce(θ, θ_reduced, quadrant); @@ -327,12 +342,11 @@ Value __cdecl Sin(Argument const θ) { } } IACA_IF (value != value) { - return cr_sin(θ); - } - IACA_IF (quadrant & 0b10) { - return -value; + IACA_RETURN_SIN(cr_sin(θ)); + } else IACA_IF(quadrant & 0b10) { + IACA_RETURN_SIN(-value); } else { - return value; + IACA_RETURN_SIN(value); } } @@ -344,7 +358,7 @@ constexpr std::int64_t quadrant = 1; FORCE_INLINE(inline) #endif Value __cdecl Cos(Argument θ) { - IACA_FUNCTION_DOUBLE(θ); + IACA_COS_BEGIN(θ); DoublePrecision θ_reduced; std::int64_t quadrant; Reduce(θ, θ_reduced, quadrant); @@ -363,11 +377,11 @@ Value __cdecl Cos(Argument θ) { } } IACA_IF (value != value) { - IACA_RETURN(cr_cos(θ)); + IACA_RETURN_COS(cr_cos(θ)); } else IACA_IF (quadrant == 1 || quadrant == 2) { - IACA_RETURN(-value); + IACA_RETURN_COS(-value); } else { - IACA_RETURN(value); + IACA_RETURN_COS(value); } } From e2c5cec3c140b20daeb3ad6106dedb97f6ed2c59 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 01:34:52 +0100 Subject: [PATCH 05/25] OSACA --- numerics/sin_cos.cpp | 118 +++++++++++++++++++++---------------------- 1 file changed, 59 insertions(+), 59 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index e1f5d6a6f0..612b61b2ed 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -15,43 +15,43 @@ #include "quantities/elementary_functions.hpp" #if PRINCIPIA_USE_OSACA_SIN -#define IACA_SIN_BEGIN IACA_FUNCTION_BEGIN -#define IACA_RETURN_SIN IACA_RETURN +#define OSACA_SIN_BEGIN OSACA_FUNCTION_BEGIN +#define OSACA_RETURN_SIN OSACA_RETURN #else -#define IACA_SIN_BEGIN(arg) -#define IACA_RETURN_SIN(result) return (result) +#define OSACA_SIN_BEGIN(arg) +#define OSACA_RETURN_SIN(result) return (result) #endif #if PRINCIPIA_USE_OSACA_COS -#define IACA_COS_BEGIN IACA_FUNCTION_BEGIN -#define IACA_RETURN_COS IACA_RETURN +#define OSACA_COS_BEGIN OSACA_FUNCTION_BEGIN +#define OSACA_RETURN_COS OSACA_RETURN #else -#define IACA_COS_BEGIN(arg) -#define IACA_RETURN_COS(result) return (result) +#define OSACA_COS_BEGIN(arg) +#define OSACA_RETURN_COS(result) return (result) #endif #if PRINCIPIA_USE_OSACA_SIN || PRINCIPIA_USE_OSACA_COS #include "intel/iacaMarks.h" -static bool iaca_loop_terminator = false; -#define IACA_FUNCTION_BEGIN(arg) \ - double iaca_loop_carry = arg; \ +static bool OSACA_loop_terminator = false; +#define OSACA_FUNCTION_BEGIN(arg) \ + double OSACA_loop_carry = arg; \ IACA_VC64_START; \ - IACA_LOOP: \ - arg = iaca_loop_carry - -#define IACA_RETURN(result) \ - iaca_loop_carry = (result); \ - if (!iaca_loop_terminator) { \ - goto IACA_LOOP; \ - } \ - volatile double iaca_result = iaca_loop_carry; \ - IACA_VC64_END; \ - return iaca_result -#define IACA_IF(condition) \ - if constexpr (volatile bool IACA_computed_condition = (condition); \ - [] { UNDER_IACA_HYPOTHESES(return (condition)); }()) - -#define UNDER_IACA_HYPOTHESES(statement) \ + OSACA_LOOP: \ + arg = OSACA_loop_carry + +#define OSACA_RETURN(result) \ + OSACA_loop_carry = (result); \ + if (!OSACA_loop_terminator) { \ + goto OSACA_LOOP; \ + } \ + volatile double OSACA_result = OSACA_loop_carry; \ + IACA_VC64_END; \ + return OSACA_result +#define OSACA_IF(condition) \ + if constexpr (volatile bool OSACA_computed_condition = (condition); \ + [] { UNDER_OSACA_HYPOTHESES(return (condition)); }()) + +#define UNDER_OSACA_HYPOTHESES(statement) \ do { \ constexpr bool UseHardwareFMA = true; \ constexpr double θ = 0.1; \ @@ -77,7 +77,7 @@ static bool iaca_loop_terminator = false; } while (false) #else -#define IACA_IF (condition) if (condition) +#define OSACA_IF (condition) if (condition) #endif @@ -138,7 +138,7 @@ inline std::int64_t AccurateTableIndex(double const abs_x) { template double FusedMultiplyAdd(double const a, double const b, double const c) { - IACA_IF ((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) || + OSACA_IF ((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) || (fma_policy == FMAPolicy::Auto && UseHardwareFMA)) { using quantities::_elementary_functions::FusedMultiplyAdd; return FusedMultiplyAdd(a, b, c); @@ -149,7 +149,7 @@ double FusedMultiplyAdd(double const a, double const b, double const c) { template double FusedNegatedMultiplyAdd(double const a, double const b, double const c) { - IACA_IF ((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) || + OSACA_IF ((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) || (fma_policy == FMAPolicy::Auto && UseHardwareFMA)) { using quantities::_elementary_functions::FusedNegatedMultiplyAdd; return FusedNegatedMultiplyAdd(a, b, c); @@ -173,7 +173,7 @@ inline double DetectDangerousRounding(double const x, double const Δx) { _mm_castsi128_pd(_mm_sub_epi64(error_128i, value_exponent_128i))); // TODO(phl): Error analysis to refine the thresholds. Also, can we avoid // negative numbers? - IACA_IF (normalized_error < -0x1.ffffp971 && + OSACA_IF (normalized_error < -0x1.ffffp971 && normalized_error > -0x1.00008p972) { #if _DEBUG LOG(ERROR) << std::setprecision(25) << x << " " << std::hexfloat << value @@ -188,12 +188,12 @@ inline double DetectDangerousRounding(double const x, double const Δx) { inline void Reduce(Argument const θ, DoublePrecision& θ_reduced, std::int64_t& quadrant) { - IACA_IF (θ < π / 4 && θ > -π / 4) { + OSACA_IF (θ < π / 4 && θ > -π / 4) { θ_reduced.value = θ; θ_reduced.error = 0; quadrant = 0; return; - } else IACA_IF (θ <= π_over_2_threshold && θ >= -π_over_2_threshold) { + } else OSACA_IF (θ <= π_over_2_threshold && θ >= -π_over_2_threshold) { // We are not very sensitive to rounding errors in this expression, because // in the worst case it could cause the reduced angle to jump from the // vicinity of π / 4 to the vicinity of -π / 4 with appropriate adjustment @@ -206,7 +206,7 @@ inline void Reduce(Argument const θ, Argument const error = n_double * π_over_2_low; θ_reduced = QuickTwoDifference(value, error); // TODO(phl): Error analysis needed to find the right bounds. - IACA_IF (θ_reduced.value < -0x1.0p-30 || θ_reduced.value > 0x1.0p-30) { + OSACA_IF (θ_reduced.value < -0x1.0p-30 || θ_reduced.value > 0x1.0p-30) { quadrant = n & 0b11; return; } @@ -244,7 +244,7 @@ Value SinImplementation(DoublePrecision const θ_reduced) { auto const& x = θ_reduced.value; auto const& e = θ_reduced.error; double const abs_x = std::abs(x); - IACA_IF (abs_x < sin_near_zero_cutoff) { + OSACA_IF (abs_x < sin_near_zero_cutoff) { double const x² = x * x; double const x³ = x² * x; double const x³_term = FusedMultiplyAdd( @@ -262,8 +262,8 @@ Value SinImplementation(DoublePrecision const θ_reduced) { // [GB91] incorporates `e` in the computation of `h`. However, `x` and `e` // don't overlap and in the first interval `x` and `h` may be of the same // order of magnitude. Instead we incorporate the terms in `e` and `e * h` - // later in the computation. Note that the terms in `e * h²` and higher are - // *not* computed correctly below because they don't matter. + // later in the computation. Note that the terms in `e * h²` and higher are + // *not* computed correctly below because they don't matter. double const h = x - x₀; DoublePrecision const sin_x₀_plus_h_cos_x₀ = @@ -317,71 +317,71 @@ Value CosImplementation(DoublePrecision const θ_reduced) { FORCE_INLINE(inline) #endif Value __cdecl Sin(Argument const θ) { - IACA_SIN_BEGIN(θ); + OSACA_SIN_BEGIN(θ); DoublePrecision θ_reduced; std::int64_t quadrant; Reduce(θ, θ_reduced, quadrant); double value; - IACA_IF (UseHardwareFMA) { - IACA_IF (quadrant & 0b1) { + OSACA_IF (UseHardwareFMA) { + OSACA_IF (quadrant & 0b1) { value = CosImplementation(θ_reduced); } else { #if PRINCIPIA_USE_OSACA_SIN - IACA_VC64_START; + OSACA_VC64_START; #endif value = SinImplementation(θ_reduced); #if PRINCIPIA_USE_OSACA_SIN - IACA_VC64_END; + OSACA_VC64_END; #endif } } else { - IACA_IF (quadrant & 0b1) { + OSACA_IF (quadrant & 0b1) { value = CosImplementation(θ_reduced); } else { value = SinImplementation(θ_reduced); } } - IACA_IF (value != value) { - IACA_RETURN_SIN(cr_sin(θ)); - } else IACA_IF(quadrant & 0b10) { - IACA_RETURN_SIN(-value); + OSACA_IF (value != value) { + OSACA_RETURN_SIN(cr_sin(θ)); + } else OSACA_IF(quadrant & 0b10) { + OSACA_RETURN_SIN(-value); } else { - IACA_RETURN_SIN(value); + OSACA_RETURN_SIN(value); } } -namespace IACA_assumptions { +namespace OSACA_assumptions { constexpr std::int64_t quadrant = 1; -} // namespace IACA_assumptions +} // namespace OSACA_assumptions #if PRINCIPIA_INLINE_SIN_COS FORCE_INLINE(inline) #endif Value __cdecl Cos(Argument θ) { - IACA_COS_BEGIN(θ); + OSACA_COS_BEGIN(θ); DoublePrecision θ_reduced; std::int64_t quadrant; Reduce(θ, θ_reduced, quadrant); double value; - IACA_IF (UseHardwareFMA) { - IACA_IF (quadrant & 0b1) { + OSACA_IF (UseHardwareFMA) { + OSACA_IF (quadrant & 0b1) { value = SinImplementation(θ_reduced); } else { value = CosImplementation(θ_reduced); } } else { - IACA_IF (quadrant & 0b1) { + OSACA_IF (quadrant & 0b1) { value = SinImplementation(θ_reduced); } else { value = CosImplementation(θ_reduced); } } - IACA_IF (value != value) { - IACA_RETURN_COS(cr_cos(θ)); - } else IACA_IF (quadrant == 1 || quadrant == 2) { - IACA_RETURN_COS(-value); + OSACA_IF (value != value) { + OSACA_RETURN_COS(cr_cos(θ)); + } else OSACA_IF (quadrant == 1 || quadrant == 2) { + OSACA_RETURN_COS(-value); } else { - IACA_RETURN_COS(value); + OSACA_RETURN_COS(value); } } From cea3826731a84de4d7d81f6dac2c10f095bb6740 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 01:48:30 +0100 Subject: [PATCH 06/25] volatile input --- numerics/sin_cos.cpp | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index 612b61b2ed..acbcc2d1c5 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -33,10 +33,15 @@ #if PRINCIPIA_USE_OSACA_SIN || PRINCIPIA_USE_OSACA_COS #include "intel/iacaMarks.h" static bool OSACA_loop_terminator = false; -#define OSACA_FUNCTION_BEGIN(arg) \ - double OSACA_loop_carry = arg; \ - IACA_VC64_START; \ - OSACA_LOOP: \ +#define OSACA_FUNCTION_BEGIN(arg) \ + double volatile OSACA_input = arg; \ + /* Putting a load of the input from memory in the analysed section makes the \ + * dependency graph clearer, but adds a potentially spurious move to the \ + * loop-carried latency. Remove the `volatile` above to carry the loop \ + * through registers.*/ \ + IACA_VC64_START; \ + double OSACA_loop_carry = OSACA_input; \ + OSACA_LOOP: \ arg = OSACA_loop_carry #define OSACA_RETURN(result) \ @@ -44,11 +49,11 @@ static bool OSACA_loop_terminator = false; if (!OSACA_loop_terminator) { \ goto OSACA_LOOP; \ } \ - volatile double OSACA_result = OSACA_loop_carry; \ + double volatile OSACA_result = OSACA_loop_carry; \ IACA_VC64_END; \ return OSACA_result #define OSACA_IF(condition) \ - if constexpr (volatile bool OSACA_computed_condition = (condition); \ + if constexpr (bool volatile OSACA_computed_condition = (condition); \ [] { UNDER_OSACA_HYPOTHESES(return (condition)); }()) #define UNDER_OSACA_HYPOTHESES(statement) \ From 72bc9184afe74c8a69c995158ca2928d172da66e Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 01:50:38 +0100 Subject: [PATCH 07/25] case --- numerics/sin_cos.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index acbcc2d1c5..266139068c 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -41,13 +41,13 @@ static bool OSACA_loop_terminator = false; * through registers.*/ \ IACA_VC64_START; \ double OSACA_loop_carry = OSACA_input; \ - OSACA_LOOP: \ + OSACA_loop: \ arg = OSACA_loop_carry #define OSACA_RETURN(result) \ OSACA_loop_carry = (result); \ if (!OSACA_loop_terminator) { \ - goto OSACA_LOOP; \ + goto OSACA_loop; \ } \ double volatile OSACA_result = OSACA_loop_carry; \ IACA_VC64_END; \ From 4c86f368a963cca336ae5accf5abc70262df5e0a Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 02:04:29 +0100 Subject: [PATCH 08/25] flags to 0 --- numerics/sin_cos.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numerics/sin_cos.hpp b/numerics/sin_cos.hpp index 4451c15830..07a26ee280 100644 --- a/numerics/sin_cos.hpp +++ b/numerics/sin_cos.hpp @@ -7,9 +7,9 @@ namespace numerics { namespace _sin_cos { namespace internal { -#define PRINCIPIA_INLINE_SIN_COS 1 +#define PRINCIPIA_INLINE_SIN_COS 0 #define PRINCIPIA_USE_OSACA_SIN 0 -#define PRINCIPIA_USE_OSACA_COS 1 +#define PRINCIPIA_USE_OSACA_COS 0 #if PRINCIPIA_INLINE_SIN_COS FORCE_INLINE(inline) From dfbeabe6069f0f30e132038f57d62d9e18c2e65b Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 02:10:52 +0100 Subject: [PATCH 09/25] no namespace --- numerics/sin_cos.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index 4096065b23..88db80cd70 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -356,10 +356,6 @@ Value __cdecl Sin(Argument const θ) { } } -namespace OSACA_assumptions { -constexpr std::int64_t quadrant = 1; -} // namespace OSACA_assumptions - #if PRINCIPIA_INLINE_SIN_COS FORCE_INLINE(inline) #endif From 0850aa2da505cf9b83407943a33f26ce87dc3d45 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 02:30:59 +0100 Subject: [PATCH 10/25] Revert spurious vcxproj change --- numerics/numerics.vcxproj | 9 --------- 1 file changed, 9 deletions(-) diff --git a/numerics/numerics.vcxproj b/numerics/numerics.vcxproj index 940f3d0125..6e3b3e9b8b 100644 --- a/numerics/numerics.vcxproj +++ b/numerics/numerics.vcxproj @@ -10,15 +10,6 @@ - - - - - - - - - From 47e269bb76158eb4518eb19e0020b1309bff9711 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 02:51:53 +0100 Subject: [PATCH 11/25] Try to appease the linter --- numerics/sin_cos.cpp | 94 ++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index 88db80cd70..0f3436d610 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -35,10 +35,10 @@ static bool OSACA_loop_terminator = false; #define OSACA_FUNCTION_BEGIN(arg) \ double volatile OSACA_input = arg; \ - /* Putting a load of the input from memory in the analysed section makes the \ - * dependency graph clearer, but adds a potentially spurious move to the \ - * loop-carried latency. Remove the `volatile` above to carry the loop \ - * through registers.*/ \ + /* Putting a load of the input from memory in the analysed section makes */ \ + /* the dependency graph clearer, but adds a potentially spurious move to */ \ + /* the loop-carried latency. Remove the `volatile` above to carry the */ \ + /* loop through registers.*/ \ IACA_VC64_START; \ double OSACA_loop_carry = OSACA_input; \ OSACA_loop: \ @@ -56,33 +56,33 @@ static bool OSACA_loop_terminator = false; if constexpr (bool volatile OSACA_computed_condition = (condition); \ [] { UNDER_OSACA_HYPOTHESES(return (condition)); }()) -#define UNDER_OSACA_HYPOTHESES(statement) \ - do { \ - constexpr bool UseHardwareFMA = true; \ - constexpr double θ = 0.1; \ - /* From argument reduction. */ \ - constexpr double n_double = θ * (2 / π); \ - constexpr double reduction_value = θ - n_double * π_over_2_high; \ - constexpr double reduction_error = n_double * π_over_2_low; \ - /* Used to determine whether a better argument reduction is needed. */ \ - constexpr DoublePrecision θ_reduced = \ - QuickTwoDifference(reduction_value, reduction_error); \ - /* Used in Sin to detect the near-0 case. */ \ - constexpr double abs_x = \ - θ_reduced.value > 0 ? θ_reduced.value : -θ_reduced.value; \ - /* Used throughout the top-level functions. */ \ - constexpr std::int64_t quadrant = \ - static_cast(n_double) & 0b11; \ - /* Used in DetectDangerousRounding. */ \ - constexpr double normalized_error = 0; \ - /* Not NaN is the only part that matters; used at the end of the top-level \ - * functions to determine whether to call the slow path. */ \ - constexpr double value = 1; \ - { statement; } \ +#define UNDER_OSACA_HYPOTHESES(statement) \ + do { \ + constexpr bool UseHardwareFMA = true; \ + constexpr double θ = 0.1; \ + /* From argument reduction. */ \ + constexpr double n_double = θ * (2 / π); \ + constexpr double reduction_value = θ - n_double * π_over_2_high; \ + constexpr double reduction_error = n_double * π_over_2_low; \ + /* Used to determine whether a better argument reduction is needed. */ \ + constexpr DoublePrecision θ_reduced = \ + QuickTwoDifference(reduction_value, reduction_error); \ + /* Used in Sin to detect the near-0 case. */ \ + constexpr double abs_x = \ + θ_reduced.value > 0 ? θ_reduced.value : -θ_reduced.value; \ + /* Used throughout the top-level functions. */ \ + constexpr std::int64_t quadrant = \ + static_cast(n_double) & 0b11; \ + /* Used in DetectDangerousRounding. */ \ + constexpr double normalized_error = 0; \ + /* Not NaN is the only part that matters; used at the end of the */ \ + /* top-level functions to determine whether to call the slow path. */ \ + constexpr double value = 1; \ + { statement; } \ } while (false) #else -#define OSACA_IF (condition) if (condition) +#define OSACA_IF_(condition) if (condition) #endif @@ -143,8 +143,8 @@ inline std::int64_t AccurateTableIndex(double const abs_x) { template double FusedMultiplyAdd(double const a, double const b, double const c) { - OSACA_IF ((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) || - (fma_policy == FMAPolicy::Auto && UseHardwareFMA)) { + OSACA_IF_((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) || + (fma_policy == FMAPolicy::Auto && UseHardwareFMA)) { using quantities::_elementary_functions::FusedMultiplyAdd; return FusedMultiplyAdd(a, b, c); } else { @@ -154,8 +154,8 @@ double FusedMultiplyAdd(double const a, double const b, double const c) { template double FusedNegatedMultiplyAdd(double const a, double const b, double const c) { - OSACA_IF ((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) || - (fma_policy == FMAPolicy::Auto && UseHardwareFMA)) { + OSACA_IF_((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) || + (fma_policy == FMAPolicy::Auto && UseHardwareFMA)) { using quantities::_elementary_functions::FusedNegatedMultiplyAdd; return FusedNegatedMultiplyAdd(a, b, c); } else { @@ -178,7 +178,7 @@ inline double DetectDangerousRounding(double const x, double const Δx) { _mm_castsi128_pd(_mm_sub_epi64(error_128i, value_exponent_128i))); // TODO(phl): Error analysis to refine the thresholds. Also, can we avoid // negative numbers? - OSACA_IF (normalized_error < -0x1.ffffp971 && + OSACA_IF_(normalized_error < -0x1.ffffp971 && normalized_error > -0x1.00008p972) { #if _DEBUG LOG(ERROR) << std::setprecision(25) << x << " " << std::hexfloat << value @@ -193,12 +193,12 @@ inline double DetectDangerousRounding(double const x, double const Δx) { inline void Reduce(Argument const θ, DoublePrecision& θ_reduced, std::int64_t& quadrant) { - OSACA_IF (θ < π / 4 && θ > -π / 4) { + OSACA_IF_(θ < π / 4 && θ > -π / 4) { θ_reduced.value = θ; θ_reduced.error = 0; quadrant = 0; return; - } else OSACA_IF (θ <= π_over_2_threshold && θ >= -π_over_2_threshold) { + } else OSACA_IF_(θ <= π_over_2_threshold && θ >= -π_over_2_threshold) { // We are not very sensitive to rounding errors in this expression, because // in the worst case it could cause the reduced angle to jump from the // vicinity of π / 4 to the vicinity of -π / 4 with appropriate adjustment @@ -211,7 +211,7 @@ inline void Reduce(Argument const θ, Argument const error = n_double * π_over_2_low; θ_reduced = QuickTwoDifference(value, error); // TODO(phl): Error analysis needed to find the right bounds. - OSACA_IF (θ_reduced.value < -0x1.0p-30 || θ_reduced.value > 0x1.0p-30) { + OSACA_IF_(θ_reduced.value < -0x1.0p-30 || θ_reduced.value > 0x1.0p-30) { quadrant = n & 0b11; return; } @@ -249,7 +249,7 @@ Value SinImplementation(DoublePrecision const θ_reduced) { auto const& x = θ_reduced.value; auto const& e = θ_reduced.error; double const abs_x = std::abs(x); - OSACA_IF (abs_x < sin_near_zero_cutoff) { + OSACA_IF_(abs_x < sin_near_zero_cutoff) { double const x² = x * x; double const x³ = x² * x; double const x³_term = FusedMultiplyAdd( @@ -328,8 +328,8 @@ Value __cdecl Sin(Argument const θ) { std::int64_t quadrant; Reduce(θ, θ_reduced, quadrant); double value; - OSACA_IF (UseHardwareFMA) { - OSACA_IF (quadrant & 0b1) { + OSACA_IF_(UseHardwareFMA) { + OSACA_IF_(quadrant & 0b1) { value = CosImplementation(θ_reduced); } else { #if PRINCIPIA_USE_OSACA_SIN @@ -341,15 +341,15 @@ Value __cdecl Sin(Argument const θ) { #endif } } else { - OSACA_IF (quadrant & 0b1) { + OSACA_IF_(quadrant & 0b1) { value = CosImplementation(θ_reduced); } else { value = SinImplementation(θ_reduced); } } - OSACA_IF (value != value) { + OSACA_IF_(value != value) { OSACA_RETURN_SIN(cr_sin(θ)); - } else OSACA_IF(quadrant & 0b10) { + } else OSACA_IF_(quadrant & 0b10) { OSACA_RETURN_SIN(-value); } else { OSACA_RETURN_SIN(value); @@ -365,22 +365,22 @@ Value __cdecl Cos(Argument θ) { std::int64_t quadrant; Reduce(θ, θ_reduced, quadrant); double value; - OSACA_IF (UseHardwareFMA) { - OSACA_IF (quadrant & 0b1) { + OSACA_IF_(UseHardwareFMA) { + OSACA_IF_(quadrant & 0b1) { value = SinImplementation(θ_reduced); } else { value = CosImplementation(θ_reduced); } } else { - OSACA_IF (quadrant & 0b1) { + OSACA_IF_(quadrant & 0b1) { value = SinImplementation(θ_reduced); } else { value = CosImplementation(θ_reduced); } } - OSACA_IF (value != value) { + OSACA_IF_(value != value) { OSACA_RETURN_COS(cr_cos(θ)); - } else OSACA_IF (quadrant == 1 || quadrant == 2) { + } else OSACA_IF_(quadrant == 1 || quadrant == 2) { OSACA_RETURN_COS(-value); } else { OSACA_RETURN_COS(value); From 7c8082237d2f0495c303351d34fc33973554e235 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 02:59:12 +0100 Subject: [PATCH 12/25] _ --- numerics/sin_cos.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index 0f3436d610..b226fedcce 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -52,7 +52,7 @@ static bool OSACA_loop_terminator = false; double volatile OSACA_result = OSACA_loop_carry; \ IACA_VC64_END; \ return OSACA_result -#define OSACA_IF(condition) \ +#define OSACA_IF_(condition) \ if constexpr (bool volatile OSACA_computed_condition = (condition); \ [] { UNDER_OSACA_HYPOTHESES(return (condition)); }()) From 4b58a41c6838765a3073d76c5a8e0027c3b33339 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 03:16:26 +0100 Subject: [PATCH 13/25] Some comments --- numerics/sin_cos.cpp | 48 ++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index b226fedcce..34c67ca9af 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -52,7 +52,7 @@ static bool OSACA_loop_terminator = false; double volatile OSACA_result = OSACA_loop_carry; \ IACA_VC64_END; \ return OSACA_result -#define OSACA_IF_(condition) \ +#define OSACA_IF(condition) \ if constexpr (bool volatile OSACA_computed_condition = (condition); \ [] { UNDER_OSACA_HYPOTHESES(return (condition)); }()) @@ -82,9 +82,11 @@ static bool OSACA_loop_terminator = false; } while (false) #else -#define OSACA_IF_(condition) if (condition) +#define OSACA_IF(condition) if (condition) #endif +#define OSACA_ELSE_IF else OSACA_IF + namespace principia { namespace numerics { @@ -143,7 +145,7 @@ inline std::int64_t AccurateTableIndex(double const abs_x) { template double FusedMultiplyAdd(double const a, double const b, double const c) { - OSACA_IF_((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) || + OSACA_IF((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) || (fma_policy == FMAPolicy::Auto && UseHardwareFMA)) { using quantities::_elementary_functions::FusedMultiplyAdd; return FusedMultiplyAdd(a, b, c); @@ -154,7 +156,7 @@ double FusedMultiplyAdd(double const a, double const b, double const c) { template double FusedNegatedMultiplyAdd(double const a, double const b, double const c) { - OSACA_IF_((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) || + OSACA_IF((fma_policy == FMAPolicy::Force && CanEmitFMAInstructions) || (fma_policy == FMAPolicy::Auto && UseHardwareFMA)) { using quantities::_elementary_functions::FusedNegatedMultiplyAdd; return FusedNegatedMultiplyAdd(a, b, c); @@ -178,7 +180,7 @@ inline double DetectDangerousRounding(double const x, double const Δx) { _mm_castsi128_pd(_mm_sub_epi64(error_128i, value_exponent_128i))); // TODO(phl): Error analysis to refine the thresholds. Also, can we avoid // negative numbers? - OSACA_IF_(normalized_error < -0x1.ffffp971 && + OSACA_IF(normalized_error < -0x1.ffffp971 && normalized_error > -0x1.00008p972) { #if _DEBUG LOG(ERROR) << std::setprecision(25) << x << " " << std::hexfloat << value @@ -193,12 +195,12 @@ inline double DetectDangerousRounding(double const x, double const Δx) { inline void Reduce(Argument const θ, DoublePrecision& θ_reduced, std::int64_t& quadrant) { - OSACA_IF_(θ < π / 4 && θ > -π / 4) { + OSACA_IF(θ < π / 4 && θ > -π / 4) { θ_reduced.value = θ; θ_reduced.error = 0; quadrant = 0; return; - } else OSACA_IF_(θ <= π_over_2_threshold && θ >= -π_over_2_threshold) { + } OSACA_ELSE_IF(θ <= π_over_2_threshold && θ >= -π_over_2_threshold) { // We are not very sensitive to rounding errors in this expression, because // in the worst case it could cause the reduced angle to jump from the // vicinity of π / 4 to the vicinity of -π / 4 with appropriate adjustment @@ -211,7 +213,7 @@ inline void Reduce(Argument const θ, Argument const error = n_double * π_over_2_low; θ_reduced = QuickTwoDifference(value, error); // TODO(phl): Error analysis needed to find the right bounds. - OSACA_IF_(θ_reduced.value < -0x1.0p-30 || θ_reduced.value > 0x1.0p-30) { + OSACA_IF(θ_reduced.value < -0x1.0p-30 || θ_reduced.value > 0x1.0p-30) { quadrant = n & 0b11; return; } @@ -249,7 +251,7 @@ Value SinImplementation(DoublePrecision const θ_reduced) { auto const& x = θ_reduced.value; auto const& e = θ_reduced.error; double const abs_x = std::abs(x); - OSACA_IF_(abs_x < sin_near_zero_cutoff) { + OSACA_IF(abs_x < sin_near_zero_cutoff) { double const x² = x * x; double const x³ = x² * x; double const x³_term = FusedMultiplyAdd( @@ -328,28 +330,22 @@ Value __cdecl Sin(Argument const θ) { std::int64_t quadrant; Reduce(θ, θ_reduced, quadrant); double value; - OSACA_IF_(UseHardwareFMA) { - OSACA_IF_(quadrant & 0b1) { + OSACA_IF(UseHardwareFMA) { + OSACA_IF(quadrant & 0b1) { value = CosImplementation(θ_reduced); } else { -#if PRINCIPIA_USE_OSACA_SIN - OSACA_VC64_START; -#endif value = SinImplementation(θ_reduced); -#if PRINCIPIA_USE_OSACA_SIN - OSACA_VC64_END; -#endif } } else { - OSACA_IF_(quadrant & 0b1) { + OSACA_IF(quadrant & 0b1) { value = CosImplementation(θ_reduced); } else { value = SinImplementation(θ_reduced); } } - OSACA_IF_(value != value) { + OSACA_IF(value != value) { OSACA_RETURN_SIN(cr_sin(θ)); - } else OSACA_IF_(quadrant & 0b10) { + } OSACA_ELSE_IF(quadrant & 0b10) { OSACA_RETURN_SIN(-value); } else { OSACA_RETURN_SIN(value); @@ -359,28 +355,28 @@ Value __cdecl Sin(Argument const θ) { #if PRINCIPIA_INLINE_SIN_COS FORCE_INLINE(inline) #endif -Value __cdecl Cos(Argument θ) { +Value __cdecl Cos(Argument const θ) { OSACA_COS_BEGIN(θ); DoublePrecision θ_reduced; std::int64_t quadrant; Reduce(θ, θ_reduced, quadrant); double value; - OSACA_IF_(UseHardwareFMA) { - OSACA_IF_(quadrant & 0b1) { + OSACA_IF(UseHardwareFMA) { + OSACA_IF(quadrant & 0b1) { value = SinImplementation(θ_reduced); } else { value = CosImplementation(θ_reduced); } } else { - OSACA_IF_(quadrant & 0b1) { + OSACA_IF(quadrant & 0b1) { value = SinImplementation(θ_reduced); } else { value = CosImplementation(θ_reduced); } } - OSACA_IF_(value != value) { + OSACA_IF(value != value) { OSACA_RETURN_COS(cr_cos(θ)); - } else OSACA_IF_(quadrant == 1 || quadrant == 2) { + } OSACA_ELSE_IF(quadrant == 1 || quadrant == 2) { OSACA_RETURN_COS(-value); } else { OSACA_RETURN_COS(value); From 8066703cd1726bb4e225ae2e4390e734d312c477 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 03:22:52 +0100 Subject: [PATCH 14/25] no const --- numerics/sin_cos.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index 34c67ca9af..76690554a5 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -324,7 +324,7 @@ Value CosImplementation(DoublePrecision const θ_reduced) { #if PRINCIPIA_INLINE_SIN_COS FORCE_INLINE(inline) #endif -Value __cdecl Sin(Argument const θ) { +Value __cdecl Sin(Argument θ) { OSACA_SIN_BEGIN(θ); DoublePrecision θ_reduced; std::int64_t quadrant; @@ -355,7 +355,7 @@ Value __cdecl Sin(Argument const θ) { #if PRINCIPIA_INLINE_SIN_COS FORCE_INLINE(inline) #endif -Value __cdecl Cos(Argument const θ) { +Value __cdecl Cos(Argument θ) { OSACA_COS_BEGIN(θ); DoublePrecision θ_reduced; std::int64_t quadrant; From f806cc602a3e3fa11138dad135265138ba97e962 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 03:24:20 +0100 Subject: [PATCH 15/25] More comments --- numerics/sin_cos.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index 76690554a5..fc829a11e5 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -33,6 +33,7 @@ #if PRINCIPIA_USE_OSACA_SIN || PRINCIPIA_USE_OSACA_COS #include "intel/iacaMarks.h" static bool OSACA_loop_terminator = false; + #define OSACA_FUNCTION_BEGIN(arg) \ double volatile OSACA_input = arg; \ /* Putting a load of the input from memory in the analysed section makes */ \ @@ -52,6 +53,7 @@ static bool OSACA_loop_terminator = false; double volatile OSACA_result = OSACA_loop_carry; \ IACA_VC64_END; \ return OSACA_result + #define OSACA_IF(condition) \ if constexpr (bool volatile OSACA_computed_condition = (condition); \ [] { UNDER_OSACA_HYPOTHESES(return (condition)); }()) @@ -181,7 +183,7 @@ inline double DetectDangerousRounding(double const x, double const Δx) { // TODO(phl): Error analysis to refine the thresholds. Also, can we avoid // negative numbers? OSACA_IF(normalized_error < -0x1.ffffp971 && - normalized_error > -0x1.00008p972) { + normalized_error > -0x1.00008p972) { #if _DEBUG LOG(ERROR) << std::setprecision(25) << x << " " << std::hexfloat << value << " " << error << " " << normalized_error; From 4217151fb3c4d817bcfe7550386212b4c4482289 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 04:53:34 +0100 Subject: [PATCH 16/25] comments --- numerics/sin_cos.cpp | 140 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 117 insertions(+), 23 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index fc829a11e5..316965f77b 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -14,24 +14,83 @@ #include "numerics/polynomial_evaluators.hpp" #include "quantities/elementary_functions.hpp" -#if PRINCIPIA_USE_OSACA_SIN -#define OSACA_SIN_BEGIN OSACA_FUNCTION_BEGIN -#define OSACA_RETURN_SIN OSACA_RETURN -#else -#define OSACA_SIN_BEGIN(arg) -#define OSACA_RETURN_SIN(result) return (result) -#endif +#define PRINCIPIA_USE_OSACA PRINCIPIA_USE_OSACA_SIN || PRINCIPIA_USE_OSACA_COS -#if PRINCIPIA_USE_OSACA_COS -#define OSACA_COS_BEGIN OSACA_FUNCTION_BEGIN -#define OSACA_RETURN_COS OSACA_RETURN -#else -#define OSACA_COS_BEGIN(arg) -#define OSACA_RETURN_COS(result) return (result) -#endif +#if PRINCIPIA_USE_OSACA -#if PRINCIPIA_USE_OSACA_SIN || PRINCIPIA_USE_OSACA_COS #include "intel/iacaMarks.h" + +// The macros OSACA_FUNCTION_BEGIN and OSACA_RETURN are used to analyse the +// latency of a double -> double function, as measured, e.g., by the +// nanobenchmarks; this notionally corresponds to the duration of an iteration +// of a loop x = f(x). +// The latency-critical path of the function is reported as the loop-carried +// dependency by OSACA, and as the critical path by IACA in throughput analysis +// mode. +// OSACA and IACA are only suitable for benchmarking a single path. Any if +// statements, including in inline functions called by the function being +// analysed, should be replaced with OSACA_IF. The path should be determined by +// providing any necessary constexpr expressions in UNDER_OSACA_HYPOTHESES. +// if OSACA_EVALUATE_CONDITIONS is set to 1, code will be generated to evaluate +// the conditions for the branches taken (outside the loop-carried path, as they +// would be with correct branch prediction). +// OSACA_EVALUATE_CONDITIONS can be set to 0 to get a more streamlined +// dependency graph when the conditions are irrelevant. Note that this can +// result in artificially low port pressures. +#if 0 +// Usage: +double f(double const x) { + double const abs_x = std::abs(x); + if (abs_x < 0.1) { + return x; + } else if (x < 0) { + return f_negative(x); + } else { + return f_positive(x); + } +} +// becomes: +double f(double x) { // The argument cannot be const. + OSACA_FUNCTION_BEGIN(x); + double const abs_x = std::abs(x); + OSACA_IF(abs_x < 0.1) { + OSACA_RETURN(x); + } OSACA_ELSE_IF (x < 0) { // `else OSACA_IF` works, but upsets the linter. + OSACA_RETURN(f_negative(x)); + } else { + OSACA_RETURN(f_positive(x)); + } +} +// To analyse it near x = 5: +#define UNDER_OSACA_HYPOTHESES(statement) \ + do { \ + { \ + constexpr double x = 5; \ + /* To avoid inconsistent definitions, constexpr code can be used as */ \ + /* needed. */ \ + constexpr double abs_x = x > 0 ? x : -x; \ + statement; \ + } \ + } while (false) +#endif + +#define OSACA_EVALUATE_CONDITIONS 1 + +// In principle, the loop-carried dependency for function latency analysis is +// achieved by wrapping the body of the function in an infinite loop, assigning +// the result to the argument at each iteration, and adding IACA markers outside +// the loop. There are some subtleties: +// — We need to trick the compiler into believing the loop is finite, so that it +// doesn’t optimize away the end marker or even the function. This is +// achieved by exiting based on the value of OSACA_loop_terminator. +// — Return statements may be in if statements, and there may be several of +// them, so they cannot be the end of a loop started unconditionally. Instead +// we loop with goto. +// — Some volatile reads and writes are used to clarify identity of the +// registers in the generated code (where the names of OSACA_input and +// OSACA_result appear in movsd instructions) and to improve the structure of +// the generated graph. + static bool OSACA_loop_terminator = false; #define OSACA_FUNCTION_BEGIN(arg) \ @@ -54,10 +113,51 @@ static bool OSACA_loop_terminator = false; IACA_VC64_END; \ return OSACA_result -#define OSACA_IF(condition) \ - if constexpr (bool volatile OSACA_computed_condition = (condition); \ +// The branch not taken, determined by evaluating the condition +// `UNDER_OSACA_HYPOTHESES`, is eliminated by `if constexpr`; the condition is +// also compiled normally and assigned to a boolean; whether this results in any +// generated code depends on `OSACA_EVALUATE_CONDITIONS`. Note that, with +// `OSACA_EVALUATE_CONDITIONS`, in `OSACA_IF(p) { } OSACA_ELSE_IF(q) { }`, if +// `p` holds `UNDER_OSACA_HYPOTHESES`, code is generated to evalutae `p`, but +// not `q`. + +#define OSACA_IF(condition) \ + if constexpr (bool OSACA_CONDITION_QUALIFIER OSACA_computed_condition = \ + (condition); \ [] { UNDER_OSACA_HYPOTHESES(return (condition)); }()) +#if OSACA_EVALUATE_CONDITIONS +#define OSACA_CONDITION_QUALIFIER volatile +#else +#define OSACA_CONDITION_QUALIFIER +#endif + +#else + +#define OSACA_IF(condition) if (condition) + +#endif // PRINCIPIA_USE_OSACA + +#define OSACA_ELSE_IF else OSACA_IF // NOLINT + +// Sin- and Cos-specific definitions: + +#if PRINCIPIA_USE_OSACA_SIN +#define OSACA_SIN_BEGIN OSACA_FUNCTION_BEGIN +#define OSACA_RETURN_SIN OSACA_RETURN +#else +#define OSACA_SIN_BEGIN(arg) +#define OSACA_RETURN_SIN(result) return (result) +#endif + +#if PRINCIPIA_USE_OSACA_COS +#define OSACA_COS_BEGIN OSACA_FUNCTION_BEGIN +#define OSACA_RETURN_COS OSACA_RETURN +#else +#define OSACA_COS_BEGIN(arg) +#define OSACA_RETURN_COS(result) return (result) +#endif + #define UNDER_OSACA_HYPOTHESES(statement) \ do { \ constexpr bool UseHardwareFMA = true; \ @@ -83,12 +183,6 @@ static bool OSACA_loop_terminator = false; { statement; } \ } while (false) -#else -#define OSACA_IF(condition) if (condition) -#endif - -#define OSACA_ELSE_IF else OSACA_IF - namespace principia { namespace numerics { From eaaf4eecc28a8801df31b8a33fd6a4f69f66d5e2 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 04:55:38 +0100 Subject: [PATCH 17/25] lint --- numerics/sin_cos.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index 316965f77b..1f8d4db202 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -55,7 +55,7 @@ double f(double x) { // The argument cannot be const. double const abs_x = std::abs(x); OSACA_IF(abs_x < 0.1) { OSACA_RETURN(x); - } OSACA_ELSE_IF (x < 0) { // `else OSACA_IF` works, but upsets the linter. + } OSACA_ELSE_IF(x < 0) { // `else OSACA_IF` works, but upsets the linter. OSACA_RETURN(f_negative(x)); } else { OSACA_RETURN(f_positive(x)); From c1ab8c74853b14bfd60620eab24ddf29db1ba4ab Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 05:04:58 +0100 Subject: [PATCH 18/25] comments intensify --- numerics/sin_cos.cpp | 53 ++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 19 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index 1f8d4db202..5b6fc48269 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -39,28 +39,43 @@ // result in artificially low port pressures. #if 0 // Usage: -double f(double const x) { - double const abs_x = std::abs(x); - if (abs_x < 0.1) { - return x; - } else if (x < 0) { - return f_negative(x); - } else { - return f_positive(x); + double f(double const x) { + double const abs_x = std::abs(x); + if (abs_x < 0.1) { + return x; + } else if (x < 0) { + return -f_positive(x); + } else { + return f_positive(x); + } + } + double f_positive(x) { + if (x > 3) { + return 1; + } else { + // … + } } -} // becomes: -double f(double x) { // The argument cannot be const. - OSACA_FUNCTION_BEGIN(x); - double const abs_x = std::abs(x); - OSACA_IF(abs_x < 0.1) { - OSACA_RETURN(x); - } OSACA_ELSE_IF(x < 0) { // `else OSACA_IF` works, but upsets the linter. - OSACA_RETURN(f_negative(x)); - } else { - OSACA_RETURN(f_positive(x)); + double f(double x) { // The argument cannot be const. + OSACA_FUNCTION_BEGIN(x); + double const abs_x = std::abs(x); + OSACA_IF(abs_x < 0.1) { + OSACA_RETURN(x); + } OSACA_ELSE_IF(x < 0) { // `else OSACA_IF` works, but upsets the linter. + OSACA_RETURN(-f_positive(x)); + } else { + OSACA_RETURN(f_positive(x)); + } + } + // Other functions can return normally, but need to use OSACA_IF: + double f_positive(x) { + OSACA_IF(x > 3) { + return 1; + } else { + // … + } } -} // To analyse it near x = 5: #define UNDER_OSACA_HYPOTHESES(statement) \ do { \ From ea8a155c17cdec9b4fc546a45154f37e28428091 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 05:09:20 +0100 Subject: [PATCH 19/25] unnest --- numerics/sin_cos.cpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index 5b6fc48269..97f20e22b0 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -77,15 +77,13 @@ } } // To analyse it near x = 5: -#define UNDER_OSACA_HYPOTHESES(statement) \ - do { \ - { \ - constexpr double x = 5; \ - /* To avoid inconsistent definitions, constexpr code can be used as */ \ - /* needed. */ \ - constexpr double abs_x = x > 0 ? x : -x; \ - statement; \ - } \ +#define UNDER_OSACA_HYPOTHESES(statement) \ + do { \ + constexpr double x = 5; \ + /* To avoid inconsistent definitions, constexpr code can be used as */ \ + /* needed. */ \ + constexpr double abs_x = x > 0 ? x : -x; \ + statement; \ } while (false) #endif From de2cba28c3482bd9cdaad8e8820a17c06032cc0e Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 05:13:49 +0100 Subject: [PATCH 20/25] =?UTF-8?q?=CE=BB?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- numerics/sin_cos.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index 97f20e22b0..c4693d9469 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -77,14 +77,14 @@ } } // To analyse it near x = 5: -#define UNDER_OSACA_HYPOTHESES(statement) \ - do { \ +#define UNDER_OSACA_HYPOTHESES(expression) \ + [] { \ constexpr double x = 5; \ /* To avoid inconsistent definitions, constexpr code can be used as */ \ /* needed. */ \ constexpr double abs_x = x > 0 ? x : -x; \ - statement; \ - } while (false) + return (expression); \ + }() #endif #define OSACA_EVALUATE_CONDITIONS 1 @@ -137,7 +137,7 @@ static bool OSACA_loop_terminator = false; #define OSACA_IF(condition) \ if constexpr (bool OSACA_CONDITION_QUALIFIER OSACA_computed_condition = \ (condition); \ - [] { UNDER_OSACA_HYPOTHESES(return (condition)); }()) + UNDER_OSACA_HYPOTHESES(condition)) #if OSACA_EVALUATE_CONDITIONS #define OSACA_CONDITION_QUALIFIER volatile @@ -171,8 +171,8 @@ static bool OSACA_loop_terminator = false; #define OSACA_RETURN_COS(result) return (result) #endif -#define UNDER_OSACA_HYPOTHESES(statement) \ - do { \ +#define UNDER_OSACA_HYPOTHESES(expression) \ + [] { \ constexpr bool UseHardwareFMA = true; \ constexpr double θ = 0.1; \ /* From argument reduction. */ \ @@ -193,8 +193,8 @@ static bool OSACA_loop_terminator = false; /* Not NaN is the only part that matters; used at the end of the */ \ /* top-level functions to determine whether to call the slow path. */ \ constexpr double value = 1; \ - { statement; } \ - } while (false) + return expression; \ + }() namespace principia { From fa127b56de95f0b8f61293155fdf4564e3b758a7 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 05:40:13 +0100 Subject: [PATCH 21/25] fix the example --- numerics/sin_cos.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index c4693d9469..a6f200afdb 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -44,13 +44,13 @@ if (abs_x < 0.1) { return x; } else if (x < 0) { - return -f_positive(x); + return -f_positive(abs_x); } else { - return f_positive(x); + return f_positive(abs_x); } } - double f_positive(x) { - if (x > 3) { + double f_positive(double const abs_x) { + if (abs_x > 3) { return 1; } else { // … @@ -63,14 +63,15 @@ OSACA_IF(abs_x < 0.1) { OSACA_RETURN(x); } OSACA_ELSE_IF(x < 0) { // `else OSACA_IF` works, but upsets the linter. - OSACA_RETURN(-f_positive(x)); + OSACA_RETURN(-f_positive(abs_x)); } else { - OSACA_RETURN(f_positive(x)); + OSACA_RETURN(f_positive(abs_x)); } } - // Other functions can return normally, but need to use OSACA_IF: - double f_positive(x) { - OSACA_IF(x > 3) { + // Other functions can have const arguments and return normally, but need to + // use OSACA_IF: + double f_positive(double const abs_x) { + OSACA_IF(abs_x > 3) { return 1; } else { // … From 1d0b1edd9e3b696ceeda88aca78cec85f2c4147b Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 14:07:47 +0100 Subject: [PATCH 22/25] OSACA_CARRY_LOOP_THROUGH_REGISTER --- numerics/sin_cos.cpp | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index a6f200afdb..4fca1719f1 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -31,7 +31,7 @@ // statements, including in inline functions called by the function being // analysed, should be replaced with OSACA_IF. The path should be determined by // providing any necessary constexpr expressions in UNDER_OSACA_HYPOTHESES. -// if OSACA_EVALUATE_CONDITIONS is set to 1, code will be generated to evaluate +// If OSACA_EVALUATE_CONDITIONS is set to 1, code will be generated to evaluate // the conditions for the branches taken (outside the loop-carried path, as they // would be with correct branch prediction). // OSACA_EVALUATE_CONDITIONS can be set to 0 to get a more streamlined @@ -79,7 +79,7 @@ } // To analyse it near x = 5: #define UNDER_OSACA_HYPOTHESES(expression) \ - [] { \ + [&] { \ constexpr double x = 5; \ /* To avoid inconsistent definitions, constexpr code can be used as */ \ /* needed. */ \ @@ -87,9 +87,6 @@ return (expression); \ }() #endif - -#define OSACA_EVALUATE_CONDITIONS 1 - // In principle, the loop-carried dependency for function latency analysis is // achieved by wrapping the body of the function in an infinite loop, assigning // the result to the argument at each iteration, and adding IACA markers outside @@ -104,18 +101,24 @@ // registers in the generated code (where the names of OSACA_input and // OSACA_result appear in movsd instructions) and to improve the structure of // the generated graph. +// +// Putting a load of the input from memory in the analysed section makes the +// OSACA dependency graph clearer. However: +// — it adds a spurious move to the latency; +// — some tools (IACA, LLVM-MCA) cannot see the dependency through memory. +// Set OSACA_CARRY_LOOP_THROUGH_REGISTER to 1 to carry the loop dependency +// through a register instead. + +#define OSACA_EVALUATE_CONDITIONS 1 +#define OSACA_CARRY_LOOP_THROUGH_REGISTER 0 static bool OSACA_loop_terminator = false; -#define OSACA_FUNCTION_BEGIN(arg) \ - double volatile OSACA_input = arg; \ - /* Putting a load of the input from memory in the analysed section makes */ \ - /* the dependency graph clearer, but adds a potentially spurious move to */ \ - /* the loop-carried latency. Remove the `volatile` above to carry the */ \ - /* loop through registers.*/ \ - IACA_VC64_START; \ - double OSACA_loop_carry = OSACA_input; \ - OSACA_loop: \ +#define OSACA_FUNCTION_BEGIN(arg) \ + double OSACA_INPUT_QUALIFIER OSACA_input = arg; \ + IACA_VC64_START; \ + double OSACA_loop_carry = OSACA_input; \ + OSACA_loop: \ arg = OSACA_loop_carry #define OSACA_RETURN(result) \ @@ -127,6 +130,12 @@ static bool OSACA_loop_terminator = false; IACA_VC64_END; \ return OSACA_result +#if OSACA_CARRY_LOOP_THROUGH_REGISTER +#define OSACA_INPUT_QUALIFIER +#else +#define OSACA_INPUT_QUALIFIER volatile +#endif + // The branch not taken, determined by evaluating the condition // `UNDER_OSACA_HYPOTHESES`, is eliminated by `if constexpr`; the condition is // also compiled normally and assigned to a boolean; whether this results in any @@ -173,7 +182,7 @@ static bool OSACA_loop_terminator = false; #endif #define UNDER_OSACA_HYPOTHESES(expression) \ - [] { \ + [&] { \ constexpr bool UseHardwareFMA = true; \ constexpr double θ = 0.1; \ /* From argument reduction. */ \ From 8423ed95977359e8a9c8dd8c03f5568c005189ae Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 14:53:59 +0100 Subject: [PATCH 23/25] Review comments --- numerics/sin_cos.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index 4fca1719f1..b256da352e 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -23,7 +23,7 @@ // The macros OSACA_FUNCTION_BEGIN and OSACA_RETURN are used to analyse the // latency of a double -> double function, as measured, e.g., by the // nanobenchmarks; this notionally corresponds to the duration of an iteration -// of a loop x = f(x). +// of a loop `x = f(x)`. // The latency-critical path of the function is reported as the loop-carried // dependency by OSACA, and as the critical path by IACA in throughput analysis // mode. @@ -93,14 +93,14 @@ // the loop. There are some subtleties: // — We need to trick the compiler into believing the loop is finite, so that it // doesn’t optimize away the end marker or even the function. This is -// achieved by exiting based on the value of OSACA_loop_terminator. +// achieved by exiting based on the value of `OSACA_loop_terminator`. // — Return statements may be in if statements, and there may be several of // them, so they cannot be the end of a loop started unconditionally. Instead // we loop with goto. // — Some volatile reads and writes are used to clarify identity of the -// registers in the generated code (where the names of OSACA_input and -// OSACA_result appear in movsd instructions) and to improve the structure of -// the generated graph. +// registers in the generated code (where the names of `OSACA_input` and +// 'OSACA_result' appear in movsd instructions) and to improve the structure +// of the generated graph. // // Putting a load of the input from memory in the analysed section makes the // OSACA dependency graph clearer. However: @@ -141,7 +141,7 @@ static bool OSACA_loop_terminator = false; // also compiled normally and assigned to a boolean; whether this results in any // generated code depends on `OSACA_EVALUATE_CONDITIONS`. Note that, with // `OSACA_EVALUATE_CONDITIONS`, in `OSACA_IF(p) { } OSACA_ELSE_IF(q) { }`, if -// `p` holds `UNDER_OSACA_HYPOTHESES`, code is generated to evalutae `p`, but +// `p` holds `UNDER_OSACA_HYPOTHESES`, code is generated to evaluate `p`, but // not `q`. #define OSACA_IF(condition) \ @@ -155,7 +155,7 @@ static bool OSACA_loop_terminator = false; #define OSACA_CONDITION_QUALIFIER #endif -#else +#else // if !PRINCIPIA_USE_OSACA #define OSACA_IF(condition) if (condition) From 610363a3ae2661446dec6a17d7eea83efc039977 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 16:35:16 +0100 Subject: [PATCH 24/25] Easier switching between functions --- base/macros.hpp | 9 +++++ functions/sin_cos_test.cpp | 4 +- numerics/sin_cos.cpp | 76 +++++++++++++++++++------------------- numerics/sin_cos.hpp | 7 +++- 4 files changed, 52 insertions(+), 44 deletions(-) diff --git a/base/macros.hpp b/base/macros.hpp index 222a405301..d965266fe4 100644 --- a/base/macros.hpp +++ b/base/macros.hpp @@ -9,6 +9,15 @@ namespace base { #define STRINGIFY(X) #X #define STRINGIFY_EXPANSION(X) STRINGIFY(X) +#define PRINCIPIA_CONCATENATE_SENTINEL(X) X##0x4C##45##4E##49##54##4E##45##53 +// True if X is #defined to nothing, false if X is #defined to an identifier or +// is not defined. This macro should not be used with macros that expand to +// something other than an identifier. +#define PRINCIPIA_MACRO_IS_EMPTY(X) \ + (PRINCIPIA_CONCATENATE_SENTINEL(X) == \ + ('S' << 000 | 'E' << 010 | 'N' << 020 | 'T' << 030 | 'I' << 040 | \ + 'N' << 050 | 'E' << 060 | 'L' << 070)) + // See http://goo.gl/2EVxN4 for a partial overview of compiler detection and // version macros. #if defined(_MSC_VER) && defined(__clang__) diff --git a/functions/sin_cos_test.cpp b/functions/sin_cos_test.cpp index 9a605e8980..9a1d524567 100644 --- a/functions/sin_cos_test.cpp +++ b/functions/sin_cos_test.cpp @@ -33,15 +33,13 @@ class SinCosTest : public ::testing::Test { }; // Defined in sin_cos.hpp -#if PRINCIPIA_USE_OSACA_SIN || PRINCIPIA_USE_OSACA_COS +#if PRINCIPIA_USE_OSACA // A convenient skeleton for analysing code with OSACA. Note that to speed-up // analysis, we disable all the other tests when using OSACA. TEST_F(SinCosTest, DISABLED_OSACA) { static_assert(PRINCIPIA_INLINE_SIN_COS == 1, "Must force inlining to use OSACA"); - static_assert(PRINCIPIA_USE_OSACA_SIN + PRINCIPIA_USE_OSACA_COS <= 1, - "Must use OSACA for at most one function"); auto osaca_sin = [](double const a) { return Sin(a); }; diff --git a/numerics/sin_cos.cpp b/numerics/sin_cos.cpp index b256da352e..b79dda44df 100644 --- a/numerics/sin_cos.cpp +++ b/numerics/sin_cos.cpp @@ -14,8 +14,6 @@ #include "numerics/polynomial_evaluators.hpp" #include "quantities/elementary_functions.hpp" -#define PRINCIPIA_USE_OSACA PRINCIPIA_USE_OSACA_SIN || PRINCIPIA_USE_OSACA_COS - #if PRINCIPIA_USE_OSACA #include "intel/iacaMarks.h" @@ -78,6 +76,7 @@ } } // To analyse it near x = 5: +#define OSACA_ANALYSED_FUNCTION f #define UNDER_OSACA_HYPOTHESES(expression) \ [&] { \ constexpr double x = 5; \ @@ -114,21 +113,34 @@ static bool OSACA_loop_terminator = false; -#define OSACA_FUNCTION_BEGIN(arg) \ - double OSACA_INPUT_QUALIFIER OSACA_input = arg; \ - IACA_VC64_START; \ - double OSACA_loop_carry = OSACA_input; \ - OSACA_loop: \ +#define OSACA_FUNCTION_BEGIN(arg) \ + double OSACA_INPUT_QUALIFIER OSACA_input = arg; \ + if constexpr (std::string_view(__func__) == \ + STRINGIFY_EXPANSION(OSACA_ANALYSED_FUNCTION)) { \ + IACA_VC64_START; \ + } \ + double OSACA_loop_carry = OSACA_input; \ + _Pragma("warning(push)"); \ + _Pragma("warning(disable : 4102)"); \ + OSACA_loop: \ + _Pragma("warning(pop)"); \ arg = OSACA_loop_carry -#define OSACA_RETURN(result) \ - OSACA_loop_carry = (result); \ - if (!OSACA_loop_terminator) { \ - goto OSACA_loop; \ - } \ - double volatile OSACA_result = OSACA_loop_carry; \ - IACA_VC64_END; \ - return OSACA_result +#define OSACA_RETURN(result) \ + do { \ + if constexpr (std::string_view(__func__) == \ + STRINGIFY_EXPANSION(OSACA_ANALYSED_FUNCTION)) { \ + OSACA_loop_carry = (result); \ + if (!OSACA_loop_terminator) { \ + goto OSACA_loop; \ + } \ + double volatile OSACA_result = OSACA_loop_carry; \ + IACA_VC64_END; \ + return OSACA_result; \ + } else { \ + return (result); \ + } \ + } while (false) #if OSACA_CARRY_LOOP_THROUGH_REGISTER #define OSACA_INPUT_QUALIFIER @@ -157,6 +169,8 @@ static bool OSACA_loop_terminator = false; #else // if !PRINCIPIA_USE_OSACA +#define OSACA_FUNCTION_BEGIN(arg) +#define OSACA_RETURN(result) return (result) #define OSACA_IF(condition) if (condition) #endif // PRINCIPIA_USE_OSACA @@ -165,22 +179,6 @@ static bool OSACA_loop_terminator = false; // Sin- and Cos-specific definitions: -#if PRINCIPIA_USE_OSACA_SIN -#define OSACA_SIN_BEGIN OSACA_FUNCTION_BEGIN -#define OSACA_RETURN_SIN OSACA_RETURN -#else -#define OSACA_SIN_BEGIN(arg) -#define OSACA_RETURN_SIN(result) return (result) -#endif - -#if PRINCIPIA_USE_OSACA_COS -#define OSACA_COS_BEGIN OSACA_FUNCTION_BEGIN -#define OSACA_RETURN_COS OSACA_RETURN -#else -#define OSACA_COS_BEGIN(arg) -#define OSACA_RETURN_COS(result) return (result) -#endif - #define UNDER_OSACA_HYPOTHESES(expression) \ [&] { \ constexpr bool UseHardwareFMA = true; \ @@ -444,7 +442,7 @@ Value CosImplementation(DoublePrecision const θ_reduced) { FORCE_INLINE(inline) #endif Value __cdecl Sin(Argument θ) { - OSACA_SIN_BEGIN(θ); + OSACA_FUNCTION_BEGIN(θ); DoublePrecision θ_reduced; std::int64_t quadrant; Reduce(θ, θ_reduced, quadrant); @@ -463,11 +461,11 @@ Value __cdecl Sin(Argument θ) { } } OSACA_IF(value != value) { - OSACA_RETURN_SIN(cr_sin(θ)); + OSACA_RETURN(cr_sin(θ)); } OSACA_ELSE_IF(quadrant & 0b10) { - OSACA_RETURN_SIN(-value); + OSACA_RETURN(-value); } else { - OSACA_RETURN_SIN(value); + OSACA_RETURN(value); } } @@ -475,7 +473,7 @@ Value __cdecl Sin(Argument θ) { FORCE_INLINE(inline) #endif Value __cdecl Cos(Argument θ) { - OSACA_COS_BEGIN(θ); + OSACA_FUNCTION_BEGIN(θ); DoublePrecision θ_reduced; std::int64_t quadrant; Reduce(θ, θ_reduced, quadrant); @@ -494,11 +492,11 @@ Value __cdecl Cos(Argument θ) { } } OSACA_IF(value != value) { - OSACA_RETURN_COS(cr_cos(θ)); + OSACA_RETURN(cr_cos(θ)); } OSACA_ELSE_IF(quadrant == 1 || quadrant == 2) { - OSACA_RETURN_COS(-value); + OSACA_RETURN(-value); } else { - OSACA_RETURN_COS(value); + OSACA_RETURN(value); } } diff --git a/numerics/sin_cos.hpp b/numerics/sin_cos.hpp index c4a4b6fc4a..7d3a7eeb01 100644 --- a/numerics/sin_cos.hpp +++ b/numerics/sin_cos.hpp @@ -8,8 +8,11 @@ namespace _sin_cos { namespace internal { #define PRINCIPIA_INLINE_SIN_COS 0 -#define PRINCIPIA_USE_OSACA_SIN 0 -#define PRINCIPIA_USE_OSACA_COS 0 +#define OSACA_ANALYSED_FUNCTION + +#if defined(OSACA_ANALYSED_FUNCTION) +#define PRINCIPIA_USE_OSACA !PRINCIPIA_MACRO_IS_EMPTY(OSACA_ANALYSED_FUNCTION) +#endif #if PRINCIPIA_INLINE_SIN_COS FORCE_INLINE(inline) From ce7bc2a447f4db1cd080f6e48678da56df7bf8d5 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Tue, 31 Dec 2024 16:39:35 +0100 Subject: [PATCH 25/25] lint --- base/macros.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/base/macros.hpp b/base/macros.hpp index d965266fe4..f64df6057a 100644 --- a/base/macros.hpp +++ b/base/macros.hpp @@ -13,10 +13,10 @@ namespace base { // True if X is #defined to nothing, false if X is #defined to an identifier or // is not defined. This macro should not be used with macros that expand to // something other than an identifier. -#define PRINCIPIA_MACRO_IS_EMPTY(X) \ - (PRINCIPIA_CONCATENATE_SENTINEL(X) == \ - ('S' << 000 | 'E' << 010 | 'N' << 020 | 'T' << 030 | 'I' << 040 | \ - 'N' << 050 | 'E' << 060 | 'L' << 070)) +#define PRINCIPIA_MACRO_IS_EMPTY(X) \ + (PRINCIPIA_CONCATENATE_SENTINEL(X) == \ + ('S' << 000 | 'E' << 010 | 'N' << 020 | 'T' << 030 | 'I' << 040 | \ + 'N' << 050 | 'E' << 060 | 'L' << 070)) // See http://goo.gl/2EVxN4 for a partial overview of compiler detection and // version macros.