Skip to content

Commit

Permalink
Merge pull request #4102 from pleroy/Optimization
Browse files Browse the repository at this point in the history
Micro-optimizations of the Stehlé-Zimmermann search
  • Loading branch information
pleroy authored Sep 28, 2024
2 parents 8e00a8f + d5c5562 commit 68a0099
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 83 deletions.
10 changes: 6 additions & 4 deletions functions/accurate_table_generator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ using namespace principia::base::_thread_pool;
using namespace principia::numerics::_polynomial_in_monomial_basis;

using AccurateFunction = std::function<cpp_bin_float_50(cpp_rational const&)>;
using ApproximateFunction = std::function<double(cpp_rational const&)>;

template<typename ArgValue, int degree>
using AccuratePolynomial =
Expand All @@ -41,7 +42,7 @@ template<std::int64_t zeroes>
absl::StatusOr<cpp_rational> StehléZimmermannSimultaneousSearch(
std::array<AccurateFunction, 2> const& functions,
std::array<AccuratePolynomial<cpp_rational, 2>, 2> const& polynomials,
std::array<AccurateFunction, 2> const& remainders,
std::array<ApproximateFunction, 2> const& remainders,
cpp_rational const& starting_argument,
std::int64_t N,
std::int64_t T);
Expand All @@ -54,7 +55,7 @@ template<std::int64_t zeroes>
absl::StatusOr<cpp_rational> StehléZimmermannSimultaneousFullSearch(
std::array<AccurateFunction, 2> const& functions,
std::array<AccuratePolynomial<cpp_rational, 2>, 2> const& polynomials,
std::array<AccurateFunction, 2> const& remainders,
std::array<ApproximateFunction, 2> const& remainders,
cpp_rational const& starting_argument,
ThreadPool<void>* search_pool = nullptr);

Expand All @@ -67,7 +68,7 @@ StehléZimmermannSimultaneousMultisearch(
std::array<AccurateFunction, 2> const& functions,
std::vector<std::array<AccuratePolynomial<cpp_rational, 2>, 2>> const&
polynomials,
std::vector<std::array<AccurateFunction, 2>> const& remainders,
std::vector<std::array<ApproximateFunction, 2>> const& remainders,
std::vector<cpp_rational> const& starting_arguments);

// Same as above, but instead of accumulating all the results and returning them
Expand All @@ -78,7 +79,7 @@ void StehléZimmermannSimultaneousStreamingMultisearch(
std::array<AccurateFunction, 2> const& functions,
std::vector<std::array<AccuratePolynomial<cpp_rational, 2>, 2>> const&
polynomials,
std::vector<std::array<AccurateFunction, 2>> const& remainders,
std::vector<std::array<ApproximateFunction, 2>> const& remainders,
std::vector<cpp_rational> const& starting_arguments,
std::function<void(/*index=*/std::int64_t,
absl::StatusOr<cpp_rational>)> const& callback);
Expand All @@ -87,6 +88,7 @@ void StehléZimmermannSimultaneousStreamingMultisearch(

using internal::AccurateFunction;
using internal::AccuratePolynomial;
using internal::ApproximateFunction;
using internal::GalExhaustiveMultisearch;
using internal::GalExhaustiveSearch;
using internal::StehléZimmermannSimultaneousFullSearch;
Expand Down
24 changes: 12 additions & 12 deletions functions/accurate_table_generator_body.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ using namespace principia::numerics::_matrix_views;
using namespace principia::quantities::_elementary_functions;
using namespace principia::quantities::_quantities;

constexpr std::int64_t T_max = 16;
constexpr std::int64_t T_max = 4;
static_assert(T_max >= 1);

template<std::int64_t zeroes>
Expand Down Expand Up @@ -68,7 +68,7 @@ bool AllFunctionValuesHaveDesiredZeroes(
struct StehléZimmermannSpecification {
std::array<AccurateFunction, 2> functions;
std::array<AccuratePolynomial<cpp_rational, 2>, 2> polynomials;
std::array<AccurateFunction, 2> remainders;
std::array<ApproximateFunction, 2> remainders;
cpp_rational argument;
};

Expand Down Expand Up @@ -136,7 +136,7 @@ StehléZimmermannSpecification ScaleToBinade01(

std::array<double, 2> function_scales;
std::array<AccurateFunction, 2> scaled_functions;
std::array<AccurateFunction, 2> scaled_remainders;
std::array<ApproximateFunction, 2> scaled_remainders;
for (std::int64_t i = 0; i < scaled_functions.size(); ++i) {
function_scales[i] = compute_scale(functions[i](lower_bound),
functions[i](upper_bound));
Expand Down Expand Up @@ -439,7 +439,7 @@ template<std::int64_t zeroes>
absl::StatusOr<cpp_rational> StehléZimmermannSimultaneousSearch(
std::array<AccurateFunction, 2> const& functions,
std::array<AccuratePolynomial<cpp_rational, 2>, 2> const& polynomials,
std::array<AccurateFunction, 2> const& remainders,
std::array<ApproximateFunction, 2> const& remainders,
cpp_rational const& starting_argument,
std::int64_t const N,
std::int64_t const T) {
Expand Down Expand Up @@ -473,17 +473,17 @@ absl::StatusOr<cpp_rational> StehléZimmermannSimultaneousSearch(
// Step 2: compute ε. We use the remainders provided by the clients. Note
// that we could save on the number of evaluations by providing both bounds to
// a single call.
cpp_bin_float_50 ε = 0;
double ε = 0;
auto const T_over_N = cpp_rational(T, N);
for (std::int64_t i = 0; i < remainders.size(); ++i) {
auto const T_over_N = cpp_rational(T, N);
ε = std::max(ε, abs(N * remainders[i](starting_argument - T_over_N)));
ε = std::max(ε, abs(N * remainders[i](starting_argument + T_over_N)));
ε = std::max(ε, std::abs(N * remainders[i](starting_argument - T_over_N)));
ε = std::max(ε, std::abs(N * remainders[i](starting_argument + T_over_N)));
}
VLOG(3) << "ε = " << ε;

// Step 3, first part: compute Mʹ and C. Give up is C is 0, which may happen
// if ε is too large.
auto const Mʹ = static_cast<std::int64_t>(floor(M / (2 + 2 * M * ε)));
auto const Mʹ = static_cast<std::int64_t>(std::floor(M / (2 + 2 * M * ε)));
auto const C = 3 * Mʹ;
if (C == 0) {
return absl::FailedPreconditionError("Error too large");
Expand Down Expand Up @@ -616,7 +616,7 @@ template<std::int64_t zeroes>
absl::StatusOr<cpp_rational> StehléZimmermannSimultaneousFullSearch(
std::array<AccurateFunction, 2> const& functions,
std::array<AccuratePolynomial<cpp_rational, 2>, 2> const& polynomials,
std::array<AccurateFunction, 2> const& remainders,
std::array<ApproximateFunction, 2> const& remainders,
cpp_rational const& starting_argument,
ThreadPool<void>* const search_pool) {
// Start by scaling the specification of the search. The rest of this
Expand Down Expand Up @@ -772,7 +772,7 @@ StehléZimmermannSimultaneousMultisearch(
std::array<AccurateFunction, 2> const& functions,
std::vector<std::array<AccuratePolynomial<cpp_rational, 2>, 2>> const&
polynomials,
std::vector<std::array<AccurateFunction, 2>> const& remainders,
std::vector<std::array<ApproximateFunction, 2>> const& remainders,
std::vector<cpp_rational> const& starting_arguments) {
std::vector<absl::StatusOr<cpp_rational>> result;
result.resize(starting_arguments.size());
Expand All @@ -793,7 +793,7 @@ void StehléZimmermannSimultaneousStreamingMultisearch(
std::array<AccurateFunction, 2> const& functions,
std::vector<std::array<AccuratePolynomial<cpp_rational, 2>, 2>> const&
polynomials,
std::vector<std::array<AccurateFunction, 2>> const& remainders,
std::vector<std::array<ApproximateFunction, 2>> const& remainders,
std::vector<cpp_rational> const& starting_arguments,
std::function<void(/*index=*/std::int64_t,
absl::StatusOr<cpp_rational>)> const& callback) {
Expand Down
130 changes: 63 additions & 67 deletions functions/accurate_table_generator_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,17 +145,15 @@ TEST_F(AccurateTableGeneratorTest, StehléZimmermannSinCos15) {

// Use the Lagrange form of the remainder. u may be above or below u₀. Note
// that we use the fact that the functions are monotonic.
auto const remainder_sin_taylor2 =
[u₀ = cpp_rational(u₀)](cpp_rational const& u) {
auto const Δu = u - u₀;
auto const Δu³ = static_cast<cpp_bin_float_50>(Δu * Δu * Δu);
return Δu³ * (-Cos(std::min(u, u₀) / 4) / 16) / Factorial(3);
auto const remainder_sin_taylor2 = [u₀](cpp_rational const& u) {
auto const Δu = static_cast<double>(u) - u₀;
auto const Δu³ = Δu * Δu * Δu;
return Δu³ * (-std::cos(std::min(u₀ + Δu, u₀) / 4) / 16) / Factorial(3);
};
auto const remainder_cos_taylor2 =
[u₀ = cpp_rational(u₀)](cpp_rational const& u) {
auto const Δu = u - u₀;
auto const Δu³ = static_cast<cpp_bin_float_50>(Δu * Δu * Δu);
return Δu³ * (Sin(std::max(u, u₀) / 4) / 64) / Factorial(3);
auto const remainder_cos_taylor2 = [u₀](cpp_rational const& u) {
auto const Δu = static_cast<double>(u) - u₀;
auto const Δu³ = Δu * Δu * Δu;
return Δu³ * (std::sin(std::max(u₀ + Δu, u₀) / 4) / 64) / Factorial(3);
};

auto const u = StehléZimmermannSimultaneousSearch<15>(
Expand Down Expand Up @@ -207,17 +205,15 @@ TEST_F(AccurateTableGeneratorTest, StehléZimmermannFullSinCos5NoScaling) {
-cpp_rational(Cos(u₀ / 4) / 32)},
u₀);

auto const remainder_sin_taylor2 =
[u₀ = cpp_rational(u₀)](cpp_rational const& u) {
auto const Δu = u - u₀;
auto const Δu³ = static_cast<cpp_bin_float_50>(Δu * Δu * Δu);
return Δu³ * (-Cos(std::min(u, u₀) / 4) / 16) / Factorial(3);
auto const remainder_sin_taylor2 = [u₀](cpp_rational const& u) {
auto const Δu = static_cast<double>(u) - u₀;
auto const Δu³ = Δu * Δu * Δu;
return Δu³ * (-std::cos(std::min(u₀ + Δu, u₀) / 4) / 16) / Factorial(3);
};
auto const remainder_cos_taylor2 =
[u₀ = cpp_rational(u₀)](cpp_rational const& u) {
auto const Δu = u - u₀;
auto const Δu³ = static_cast<cpp_bin_float_50>(Δu * Δu * Δu);
return Δu³ * (Sin(std::max(u, u₀) / 4) / 64) / Factorial(3);
auto const remainder_cos_taylor2 = [u₀](cpp_rational const& u) {
auto const Δu = static_cast<double>(u) - u₀;
auto const Δu³ = Δu * Δu * Δu;
return Δu³ * (std::sin(std::max(u₀ + Δu, u₀) / 4) / 64) / Factorial(3);
};

auto const u = StehléZimmermannSimultaneousFullSearch<5>(
Expand Down Expand Up @@ -267,17 +263,15 @@ TEST_F(AccurateTableGeneratorTest, StehléZimmermannFullSinCos15NoScaling) {
-cpp_rational(Cos(u₀ / 4) / 32)},
u₀);

auto const remainder_sin_taylor2 =
[u₀ = cpp_rational(u₀)](cpp_rational const& u) {
auto const Δu = u - u₀;
auto const Δu³ = static_cast<cpp_bin_float_50>(Δu * Δu * Δu);
return Δu³ * (-Cos(std::min(u, u₀) / 4) / 16) / Factorial(3);
auto const remainder_sin_taylor2 = [u₀](cpp_rational const& u) {
auto const Δu = static_cast<double>(u) - u₀;
auto const Δu³ = Δu * Δu * Δu;
return Δu³ * (-std::cos(std::min(u₀ + Δu, u₀) / 4) / 16) / Factorial(3);
};
auto const remainder_cos_taylor2 =
[u₀ = cpp_rational(u₀)](cpp_rational const& u) {
auto const Δu = u - u₀;
auto const Δu³ = static_cast<cpp_bin_float_50>(Δu * Δu * Δu);
return Δu³ * (Sin(std::max(u, u₀) / 4) / 64) / Factorial(3);
auto const remainder_cos_taylor2 = [u₀](cpp_rational const& u) {
auto const Δu = static_cast<double>(u) - u₀;
auto const Δu³ = Δu * Δu * Δu;
return Δu³ * (std::sin(std::max(u₀ + Δu, u₀) / 4) / 64) / Factorial(3);
};

auto const u = StehléZimmermannSimultaneousFullSearch<15>(
Expand Down Expand Up @@ -319,17 +313,15 @@ TEST_F(AccurateTableGeneratorTest, StehléZimmermannFullSinCos15WithScaling) {
-cpp_rational(Cos(x₀) / 2)},
x₀);

auto const remainder_sin_taylor2 =
[x₀ = cpp_rational(x₀)](cpp_rational const& x) {
auto const Δx = x - x₀;
auto const Δx³ = static_cast<cpp_bin_float_50>(Δx * Δx * Δx);
return -Δx³ * -Cos(std::min(x, x₀)) / Factorial(3);
auto const remainder_sin_taylor2 = [x₀](cpp_rational const& x) {
auto const Δx = static_cast<double>(x) - x₀;
auto const Δx³ = Δx * Δx * Δx;
return -Δx³ * -std::cos(std::min(x₀ + Δx, x₀)) / Factorial(3);
};
auto const remainder_cos_taylor2 =
[x₀ = cpp_rational(x₀)](cpp_rational const& x) {
auto const Δx = x - x₀;
auto const Δx³ = static_cast<cpp_bin_float_50>(Δx * Δx * Δx);
return Δx³ * Sin(std::max(x, x₀)) / Factorial(3);
auto const remainder_cos_taylor2 = [x₀](cpp_rational const& x) {
auto const Δx = static_cast<double>(x) - x₀;
auto const Δx³ = Δx * Δx * Δx;
return Δx³ * std::sin(std::max(x₀ + Δx, x₀)) / Factorial(3);
};

auto const x = StehléZimmermannSimultaneousFullSearch<15>(
Expand Down Expand Up @@ -365,7 +357,7 @@ TEST_F(AccurateTableGeneratorTest, StehléZimmermannMultisearchSinCos15) {
static constexpr std::int64_t index_end = 100;
std::vector<cpp_rational> starting_arguments;
std::vector<std::array<AccuratePolynomial<cpp_rational, 2>, 2>> polynomials;
std::vector<std::array<AccurateFunction, 2>> remainders;
std::vector<std::array<ApproximateFunction, 2>> remainders;
for (std::int64_t i = index_begin; i < index_end; ++i) {
auto const x₀ = i / 128.0;
AccuratePolynomial<cpp_rational, 2> const sin_taylor2(
Expand All @@ -379,18 +371,16 @@ TEST_F(AccurateTableGeneratorTest, StehléZimmermannMultisearchSinCos15) {
-cpp_rational(Cos(x₀) / 2)},
x₀);

auto const remainder_sin_taylor2 =
[x₀ = cpp_rational(x₀)](cpp_rational const& x) {
auto const Δx = x - x₀;
auto const Δx³ = static_cast<cpp_bin_float_50>(Δx * Δx * Δx);
return -Δx³ * Cos(std::min(x, x₀)) / Factorial(3);
};
auto const remainder_cos_taylor2 =
[x₀ = cpp_rational(x₀)](cpp_rational const& x) {
auto const Δx = x - x₀;
auto const Δx³ = static_cast<cpp_bin_float_50>(Δx * Δx * Δx);
return Δx³ * Sin(std::max(x, x₀)) / Factorial(3);
};
auto const remainder_sin_taylor2 = [x₀](cpp_rational const& x) {
auto const Δx = static_cast<double>(x) - x₀;
auto const Δx³ = Δx * Δx * Δx;
return -Δx³ * -std::cos(std::min(x₀ + Δx, x₀)) / Factorial(3);
};
auto const remainder_cos_taylor2 = [x₀](cpp_rational const& x) {
auto const Δx = static_cast<double>(x) - x₀;
auto const Δx³ = Δx * Δx * Δx;
return Δx³ * std::sin(std::max(x₀ + Δx, x₀)) / Factorial(3);
};

starting_arguments.push_back(x₀);
polynomials.push_back({sin_taylor2, cos_taylor2});
Expand Down Expand Up @@ -450,9 +440,17 @@ TEST_F(AccurateTableGeneratorTest, DISABLED_SECULAR_SinCos18) {
CHECK_LT(centre(i_max) - h, π / 4);
CHECK_LT(π / 4, centre(i_max) + h);

// No need for fancy angle reduction as the angles are small.
AccurateFunction const accurate_sin = [](cpp_rational const& x) {
return sin(static_cast<cpp_bin_float_50>(x));
};
AccurateFunction const accurate_cos = [](cpp_rational const& x) {
return cos(static_cast<cpp_bin_float_50>(x));
};

std::vector<cpp_rational> starting_arguments;
std::vector<std::array<AccuratePolynomial<cpp_rational, 2>, 2>> polynomials;
std::vector<std::array<AccurateFunction, 2>> remainders;
std::vector<std::array<ApproximateFunction, 2>> remainders;
for (std::int64_t i = i_min; i <= i_max; ++i) {
double const x₀ = centre(i);
AccuratePolynomial<cpp_rational, 2> const sin_taylor2(
Expand All @@ -468,26 +466,24 @@ TEST_F(AccurateTableGeneratorTest, DISABLED_SECULAR_SinCos18) {

// The remainders don't need to be extremely precise, so for speed
// they are computed using double.
auto const remainder_sin_taylor2 =
[x₀ = static_cast<double>(cpp_rational(x₀))](cpp_rational const& x) {
auto const Δx = static_cast<double>(x) - x₀;
auto const Δx³ = Δx * Δx * Δx;
return -Δx³ * -std::cos(std::min(x₀ + Δx, x₀)) / Factorial(3);
};
auto const remainder_cos_taylor2 =
[x₀ = static_cast<double>(cpp_rational(x₀))](cpp_rational const& x) {
auto const Δx = static_cast<double>(x) - x₀;
auto const Δx³ = Δx * Δx * Δx;
return Δx³ * std::sin(std::max(x₀ + Δx, x₀)) / Factorial(3);
};
auto const remainder_sin_taylor2 = [x₀](cpp_rational const& x) {
auto const Δx = static_cast<double>(x) - x₀;
auto const Δx³ = Δx * Δx * Δx;
return -Δx³ * -std::cos(std::min(x₀ + Δx, x₀)) / Factorial(3);
};
auto const remainder_cos_taylor2 = [x₀](cpp_rational const& x) {
auto const Δx = static_cast<double>(x) - x₀;
auto const Δx³ = Δx * Δx * Δx;
return Δx³ * std::sin(std::max(x₀ + Δx, x₀)) / Factorial(3);
};

starting_arguments.push_back(x₀);
polynomials.push_back({sin_taylor2, cos_taylor2});
remainders.push_back({remainder_sin_taylor2, remainder_cos_taylor2});
}

StehléZimmermannSimultaneousStreamingMultisearch<18>(
{Sin, Cos},
{accurate_sin, accurate_cos},
polynomials,
remainders,
starting_arguments,
Expand Down

0 comments on commit 68a0099

Please sign in to comment.