From f1463b45d029bee43c1f990da56e3fb5486daed1 Mon Sep 17 00:00:00 2001 From: JacksonCampolattaro Date: Sat, 29 Jun 2024 18:52:20 +0200 Subject: [PATCH] Organize different gravity derivative implementations, including Christoph's generated code. --- README.md | 13 +- benchmarks/CMakeLists.txt | 5 + benchmarks/multipoleMoment.cpp | 67 +- doc/pages/main.md | 13 +- include/symtensor/SymmetricTensorBase.h | 25 +- .../symtensor/{gravity.h => gravity/direct.h} | 120 +--- include/symtensor/gravity/einsum.h | 101 +++ include/symtensor/gravity/tensorlib.h | 45 ++ include/symtensor/gravity/tensorlib_autogen.h | 618 ++++++++++++++++++ 9 files changed, 847 insertions(+), 160 deletions(-) rename include/symtensor/{gravity.h => gravity/direct.h} (56%) create mode 100644 include/symtensor/gravity/einsum.h create mode 100644 include/symtensor/gravity/tensorlib.h create mode 100644 include/symtensor/gravity/tensorlib_autogen.h diff --git a/README.md b/README.md index c9e81b9..7a3e612 100644 --- a/README.md +++ b/README.md @@ -7,15 +7,14 @@ ![MIT license](https://img.shields.io/badge/license-MIT-A51931) ![C++20](https://img.shields.io/badge/standard-C%2B%2B20-blue?logo=cplusplus) -_For more information, see the [documentation](https://jackson.campolattaro.nl/symtensor)!_ +_For more information, see the [documentation](https://jackson.campolattaro.nl/symtensor)._ -Symtensor is designed for working with tensors which are... -- **Symmetric** — only the upper triangle needs to be stored. -- **Small** — indices have semantic meaning, similar to GLM's `mat3x3` type. -- **High-rank** — not just vectors and matrices, these tensors can have any number of dimensions. +Symtensor is designed for working with tensors which are **Symmetric**, **Small** and **High-rank**, and for the composition of those tensors into Multipoles. -These are unusual needs for a tensor library. The design comes from symtensor's original purpose as part of an implementation of the Fast Multipole Method for an [_n_-body gravity simulation](https://github.com/JacksonCampolattaro/n-body). Popular tensor libraries like [GLM](https://github.com/g-truc/glm), [Eigen](https://github.com/libigl/eigen), and [Fastor](https://github.com/romeric/Fastor) are immediately ruled out for this purpose. +These are unusual needs for a tensor library. The design comes from symtensor's original purpose as part of an implementation of the Fast Multipole Method for an [_n_-body gravity simulation](https://github.com/JacksonCampolattaro/n-body). Popular tensor libraries like [GLM](https://github.com/g-truc/glm), [Eigen](https://github.com/libigl/eigen), and [Fastor](https://github.com/romeric/Fastor) are ruled out for this purpose because they don't support condensed storage of symmetric tensors. -There are a handful of projects which aim to solve the same problems as this one, including [Tensor](https://github.com/thenumbernine/Tensor), [GADGET-4's symmetric tensor library](https://github.com/weiguangcui/Gadget4/blob/Gadget4-Simba/src/data/symtensors.h), and [fmmgen](https://github.com/rpep/fmmgen). Tensor is feature-rich, but has a broader maths focus and provides no benchmarks. GADGET-4's library has been proven to perform well for _n_-body simulations, but is ultimately limited by its handwritten operators -- it can't be used for high-order multipoles. Fmmgen uses a script to generate efficient multipole and symmetric tensor code which makes it very flexible, but the code it produces is entirely based on C-style arrays and not intended to be human-readable. +There are a handful of projects which aim to solve the same problems as this one, including [Tensor](https://github.com/thenumbernine/Tensor), [GADGET-4's symmetric tensor library](https://github.com/weiguangcui/Gadget4/blob/Gadget4-Simba/src/data/symtensors.h), and [fmmgen](https://github.com/rpep/fmmgen). Tensor is feature-rich, but has a broader maths focus and doesn't provide benchmarks. GADGET-4's library has been proven to perform well for _n_-body simulations, but is ultimately limited by its handwritten operators -- it can't be used for high-order multipoles. Fmmgen uses a script to generate efficient multipole and symmetric tensor code which makes it very flexible, but the code it produces is based on C-style arrays and not intended to be human-readable. The goal of this project is to produce a library which combines the modern sensibilities of _Tensor_ with the performance of _GADGET-4_'s library and the flexibility of _fmmgen_. + +Eventually, I intend to incorporate symtensor into a larger _n_-body solver built from the ground up. To this end, [Christoph Peters](https://momentsingraphics.de/About.html) has been a big help by generating derivatives of the Newton's gravity equation up to arbitrarily high orders. diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index a74cb98..8cb0db0 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -27,3 +27,8 @@ target_link_libraries(benchmarks PRIVATE Catch2::Catch2WithMain symtensor::symtensor ) +target_compile_options( + benchmarks PRIVATE + $<$:-fassociative-math> + $<$:-fassociative-math> +) diff --git a/benchmarks/multipoleMoment.cpp b/benchmarks/multipoleMoment.cpp index 380d7ff..4352fdb 100644 --- a/benchmarks/multipoleMoment.cpp +++ b/benchmarks/multipoleMoment.cpp @@ -5,7 +5,10 @@ #include #include -#include + +#include "symtensor/gravity/direct.h" +#include "symtensor/gravity/einsum.h" +#include "symtensor/gravity/tensorlib.h" using namespace symtensor; @@ -16,31 +19,39 @@ TEST_CASE("benchmark: Multipole moment gravity approximation", "[MultipoleMoment TEST_CASE("benchmark: Gravity derivatives construction", "[Gravity]") { auto R = glm::vec3{1.0, 2.0, 3.0}; - - constexpr std::array input = {1, 2, 2, 3, 1}; - constexpr auto result = filter(); - - // Print the result (for non-constexpr context) - for (const auto &value: result) { - std::cout << value << std::endl; - } - - CHECK((gravity::D<2>(R) - gravity::derivative<2>(R)).norm() < 1e-7); - CHECK((gravity::D<3>(R) - gravity::derivative<3>(R)).norm() < 1e-7); - CHECK((gravity::D<4>(R) - gravity::derivative<4>(R)).norm() < 1e-7); - CHECK((gravity::D<5>(R) - gravity::derivative<5>(R)).norm() < 1e-7); // todo - -// BENCHMARK("D' Construction") { return gravity::D<1>(R); }; -// BENCHMARK("D' Construction (improved)") { return gravity::derivative<1>(R); }; -// BENCHMARK("D'' Construction") { return gravity::D<2>(R); }; -// BENCHMARK("D'' Construction (improved)") { return gravity::derivative<2>(R); }; -// BENCHMARK("D''' Construction") { return gravity::D<3>(R); }; -// BENCHMARK("D''' Construction (improved)") { return gravity::derivative<3>(R); }; -// BENCHMARK("D'''' Construction") { return gravity::D<4>(R); }; -// BENCHMARK("D'''' Construction (improved)") { return gravity::derivative<4>(R); }; -// BENCHMARK("D''''' Construction") { return gravity::D<5>(R); }; -// BENCHMARK("D''''' Construction (improved)") { return gravity::derivative<5>(R); }; // todo - - BENCHMARK("D1-4 Construction") { return gravity::Ds<4>(R); }; - BENCHMARK("D1-4 Construction (improved)") { return gravity::derivatives<4>(R); }; + // Einsum must match direct form + CHECK((gravity::direct::derivative<2>(R) - gravity::einsum::derivative<2>(R)).norm() < 1e-7); + CHECK((gravity::direct::derivative<3>(R) - gravity::einsum::derivative<3>(R)).norm() < 1e-7); + CHECK((gravity::direct::derivative<4>(R) - gravity::einsum::derivative<4>(R)).norm() < 1e-7); + // CHECK((gravity::D<5>(R) - gravity::derivative<5>(R)).norm() < 1e-7); // todo: unimplemented + + // Tensorlib must match direct form + CHECK((gravity::direct::derivative<2>(R) - gravity::tensorlib::derivative<2>(R)).norm() < 1e-7); + CHECK((gravity::direct::derivative<3>(R) - gravity::tensorlib::derivative<3>(R)).norm() < 1e-7); + CHECK((gravity::direct::derivative<4>(R) - gravity::tensorlib::derivative<4>(R)).norm() < 1e-7); + CHECK((gravity::direct::derivative<5>(R) - gravity::tensorlib::derivative<5>(R)).norm() < 1e-7); + + BENCHMARK("D' (direct)") { return gravity::direct::derivative<1>(R); }; + BENCHMARK("D' (einsum)") { return gravity::einsum::derivative<1>(R); }; + BENCHMARK("D' (tensorlib)") { return gravity::tensorlib::derivative<1>(R); }; + BENCHMARK("D'' (direct)") { return gravity::direct::derivative<2>(R); }; + BENCHMARK("D'' (einsum)") { return gravity::einsum::derivative<2>(R); }; + BENCHMARK("D'' (tensorlib)") { return gravity::tensorlib::derivative<2>(R); }; + BENCHMARK("D''' (direct)") { return gravity::direct::derivative<3>(R); }; + BENCHMARK("D''' (einsum)") { return gravity::einsum::derivative<3>(R); }; + BENCHMARK("D''' (tensorlib)") { return gravity::tensorlib::derivative<3>(R); }; + BENCHMARK("D'''' (direct)") { return gravity::direct::derivative<4>(R); }; + BENCHMARK("D'''' (einsum)") { return gravity::einsum::derivative<4>(R); }; + BENCHMARK("D'''' (tensorlib)") { return gravity::tensorlib::derivative<4>(R); }; + + BENCHMARK("D' - D'''' (direct)") { return gravity::direct::derivatives<4>(R); }; + BENCHMARK("D' - D'''' (einsum)") { return gravity::einsum::derivatives<4>(R); }; + BENCHMARK("D' - D'''' (tensorlib)") { return gravity::tensorlib::derivatives<4>(R); }; + + BENCHMARK("D' - D''''' (direct)") { return gravity::direct::derivatives<5>(R); }; + //BENCHMARK("D' - D''''' (einsum)") { return gravity::einsum::derivatives<5>(R); }; // todo: unimplemented + BENCHMARK("D' - D''''' (tensorlib)") { return gravity::tensorlib::derivatives<5>(R); }; + + // BENCHMARK("D1-5 Construction (direct)") { return gravity::direct::derivatives<5>(R); }; + // BENCHMARK("D1-5 Construction (tensorlib)") { return gravity::tensorlib::derivatives<5>(R); }; } diff --git a/doc/pages/main.md b/doc/pages/main.md index 3a95202..f1fae55 100644 --- a/doc/pages/main.md +++ b/doc/pages/main.md @@ -11,13 +11,12 @@ ![C++20](https://img.shields.io/badge/standard-C%2B%2B20-blue?logo=cplusplus) @m_enddiv -Symtensor is designed for working with tensors which are... -- **Symmetric** — only the upper triangle needs to be stored. -- **Small** — indices have semantic meaning, similar to GLM's `mat3x3` type. -- **High-rank** — not just vectors and matrices, these tensors can have any number of dimensions. +Symtensor is designed for working with tensors which are **Symmetric**, **Small** and **High-rank**, and for the composition of those tensors into Multipoles. -These are unusual needs for a tensor library. The design comes from symtensor's original purpose as part of an implementation of the Fast Multipole Method for an [_n_-body gravity simulation](https://github.com/JacksonCampolattaro/n-body). Popular tensor libraries like [GLM](https://github.com/g-truc/glm), [Eigen](https://github.com/libigl/eigen), and [Fastor](https://github.com/romeric/Fastor) are immediately ruled out for this purpose. +These are unusual needs for a tensor library. The design comes from symtensor's original purpose as part of an implementation of the Fast Multipole Method for an [_n_-body gravity simulation](https://github.com/JacksonCampolattaro/n-body). Popular tensor libraries like [GLM](https://github.com/g-truc/glm), [Eigen](https://github.com/libigl/eigen), and [Fastor](https://github.com/romeric/Fastor) are ruled out for this purpose because they don't support condensed storage of symmetric tensors. -There are a handful of projects which aim to solve the same problems as this one, including [Tensor](https://github.com/thenumbernine/Tensor), [GADGET-4's symmetric tensor library](https://github.com/weiguangcui/Gadget4/blob/Gadget4-Simba/src/data/symtensors.h), and [fmmgen](https://github.com/rpep/fmmgen). Tensor is feature-rich, but has a broader maths focus and provides no benchmarks. GADGET-4's library has been proven to perform well for _n_-body simulations, but is ultimately limited by its handwritten operators -- it can't be used for high-order multipoles. Fmmgen uses a script to generate efficient multipole and symmetric tensor code which makes it very flexible, but the code it produces is entirely based on C-style arrays and not intended to be human-readable. +There are a handful of projects which aim to solve the same problems as this one, including [Tensor](https://github.com/thenumbernine/Tensor), [GADGET-4's symmetric tensor library](https://github.com/weiguangcui/Gadget4/blob/Gadget4-Simba/src/data/symtensors.h), and [fmmgen](https://github.com/rpep/fmmgen). Tensor is feature-rich, but has a broader maths focus and doesn't provide benchmarks. GADGET-4's library has been proven to perform well for _n_-body simulations, but is ultimately limited by its handwritten operators -- it can't be used for high-order multipoles. Fmmgen uses a script to generate efficient multipole and symmetric tensor code which makes it very flexible, but the code it produces is based on C-style arrays and not intended to be human-readable. -The goal of this project is to produce a library which combines the modern sensibilities of _Tensor_ with the performance of _GADGET-4_'s library and the flexibility of _fmmgen_. \ No newline at end of file +The goal of this project is to produce a library which combines the modern sensibilities of _Tensor_ with the performance of _GADGET-4_'s library and the flexibility of _fmmgen_. + +Eventually, I intend to incorporate symtensor into a larger _n_-body solver built from the ground up. To this end, [Christoph Peters](https://momentsingraphics.de/About.html) has been a big help by generating derivatives of the Newton's gravity equation up to arbitrarily high orders. diff --git a/include/symtensor/SymmetricTensorBase.h b/include/symtensor/SymmetricTensorBase.h index 31a63bc..d9682fa 100644 --- a/include/symtensor/SymmetricTensorBase.h +++ b/include/symtensor/SymmetricTensorBase.h @@ -167,6 +167,13 @@ namespace symtensor { static_assert(sizeof...(s) == NumUniqueValues); } + /** + * @brief Constructor from an std::array of scalars. + * + * @param s an array of scalar values to initialize the tensor. + */ + explicit constexpr SymmetricTensorBase(const std::array &values) : _data{values} {} + /** * @brief (Implicit) constructor from a sequence of values. * @@ -692,15 +699,15 @@ namespace symtensor { static_assert(NumValues > 0); // Use a lookup table when the range of options is small enough -// if constexpr (R < 3) -// return [&](std::index_sequence) { -// return as_lookup_table< -// decltype([](std::array ind) consteval { return symtensor::flatIndex(ind, D); }), -// std::array, -// lexicographicalIndices(i)... -// >(indices); -// }(std::make_index_sequence()); -// else + // if constexpr (R < 3) + // return [&](std::index_sequence) { + // return as_lookup_table< + // decltype([](std::array ind) consteval { return symtensor::flatIndex(ind, D); }), + // std::array, + // lexicographicalIndices(i)... + // >(indices); + // }(std::make_index_sequence()); + // else return symtensor::flatIndex(indices, D); } diff --git a/include/symtensor/gravity.h b/include/symtensor/gravity/direct.h similarity index 56% rename from include/symtensor/gravity.h rename to include/symtensor/gravity/direct.h index 5d9b60d..e4545c0 100644 --- a/include/symtensor/gravity.h +++ b/include/symtensor/gravity/direct.h @@ -4,104 +4,14 @@ #include -#include -#include +#include "symtensor/symtensor.h" +#include "symtensor/util.h" -namespace symtensor::gravity { +namespace symtensor::gravity::direct { - template - inline constexpr auto product_of_elements(const Vector &v) { - // todo: maybe don't use std::apply for this? - return std::apply([&](auto ...i) { - return (v[static_cast(i)] * ...); - }, index); - } - - template - ALWAYS_INLINE auto derivative_at(const Vector &R) { - - auto r2 = glm::length2(R); - auto r = sqrt(r2); - auto inv_r2 = 1.0f / r2; - auto g1 = -inv_r2 / r; - auto g2 = -3.0f * g1 * inv_r2; - auto g3 = -5.0f * g2 * inv_r2; - auto g4 = -7.0f * g3 * inv_r2; - auto g5 = -9.0f * g4 * inv_r2; - -// auto r = glm::length(R); -// auto g1 = -1.0f / pow<3>(r); -// auto g2 = 3.0f / pow<5>(r); -// auto g3 = -15.0f / pow<7>(r); -// auto g4 = 105.0f / pow<9>(r); -// auto g5 = -945.0f / pow<11>(r); - - auto cartesian_product_term = product_of_elements(R); - if constexpr (N == 1) { - return R[static_cast(index[0])] * g1; - } else if constexpr (N == 2) { - return g1 * kronecker_delta(index) + g2 * cartesian_product_term; - } else if constexpr (N == 3) { - constexpr auto partitions = binomial_partitions<1>(index); - constexpr auto r_indices = std::get<0>(unzip(partitions)); - constexpr auto kronecker_indices = std::get<1>(unzip(partitions)); - constexpr auto repeats = repeats_table(kronecker_indices); - auto kronecker_product_term = [&](std::index_sequence) LAMBDA_ALWAYS_INLINE { - return (( - force_consteval() && (repeats[i] > 0)> ? - product_of_elements(R) * repeats[i] : 0 - ) + ...); - }(std::make_index_sequence()); - return g2 * kronecker_product_term + g3 * cartesian_product_term; - } else if constexpr (N == 4) { - auto result = g4 * cartesian_product_term; - constexpr auto partitions = binomial_partitions<2>(index); - constexpr auto kronecker_indices = std::get<0>(unzip(partitions)); - constexpr auto r_indices = std::get<1>(unzip(partitions)); - { - constexpr auto kronecker_term = [&](std::index_sequence) constexpr { - return ((kronecker_delta(kronecker_indices[i]) & kronecker_delta(r_indices[i])) + ...); - }(std::make_index_sequence()); - result += g2 * kronecker_term; - } - { - constexpr auto repeats = repeats_table(kronecker_indices); - auto kronecker_product_term = [&](std::index_sequence) LAMBDA_ALWAYS_INLINE { - return (( - force_consteval() && (repeats[i] > 0)> ? - product_of_elements(R) * repeats[i] : 0 - ) + ...); - }(std::make_index_sequence()); - result += g3 * kronecker_product_term; - } - - return result; - } else if constexpr (N == 5) { - auto result = g5 * cartesian_product_term; - return result; - } - return 0.0f; - } - - template - ALWAYS_INLINE auto derivative(const Vector &R) { - return SymmetricTensor3f::NullaryExpression([&]() LAMBDA_ALWAYS_INLINE { - return derivative_at(R); - }); - }; - - template - ALWAYS_INLINE auto derivatives(const Vector &R) { - return [&](std::index_sequence) LAMBDA_ALWAYS_INLINE { -// return Multipole3f(derivative(R) ...); - return std::make_tuple(derivative(R) ...); - }(std::make_index_sequence()); - }; - - - // todo: this needs to be regularized -- there must be a pattern! + // An implementation of gravity derivatives in the style of GADGET-4 template - ALWAYS_INLINE auto D(const Vector &R) { + ALWAYS_INLINE auto derivative(const Vector &R) { auto r = glm::length(R); @@ -181,10 +91,10 @@ namespace symtensor::gravity { } else if constexpr (Order == 5) { - SymmetricTensor3f<2> R2 = SymmetricTensor3f<2>::CartesianPower(R); - SymmetricTensor3f<3> R3 = SymmetricTensor3f<3>::CartesianPower(R); + SymmetricTensor3f<2> R2 = SymmetricTensor3f < 2 > ::CartesianPower(R); + SymmetricTensor3f<3> R3 = SymmetricTensor3f < 3 > ::CartesianPower(R); - auto A = g5 * SymmetricTensor3f<5>::CartesianPower(R); + auto A = g5 * SymmetricTensor3f < 5 > ::CartesianPower(R); // ~~~ @@ -251,19 +161,11 @@ namespace symtensor::gravity { } template - ALWAYS_INLINE auto Ds(const Vector &R) { + ALWAYS_INLINE auto derivatives(const Vector &R) { return [&](std::index_sequence) LAMBDA_ALWAYS_INLINE { - return std::make_tuple(D(R) ...); + return std::make_tuple(derivative(R) ...); }(std::make_index_sequence()); }; - auto derivative4(const glm::vec3 &R) { - return derivatives<4>(R); - } - - auto D4(const glm::vec3 &R) { - return Ds<4>(R); - } - -} +} \ No newline at end of file diff --git a/include/symtensor/gravity/einsum.h b/include/symtensor/gravity/einsum.h new file mode 100644 index 0000000..d0a2530 --- /dev/null +++ b/include/symtensor/gravity/einsum.h @@ -0,0 +1,101 @@ +#pragma once + +#define GLM_ENABLE_EXPERIMENTAL + +#include + +#include "symtensor/symtensor.h" +#include "symtensor/util.h" + +namespace symtensor::gravity::einsum { + + template + inline constexpr auto product_of_elements(const Vector &v) { + return std::apply([=](auto ...i) { + return (v[static_cast(i)] * ...); + }, index); + } + + template + ALWAYS_INLINE auto derivative_at(const Vector &R) { + + auto r2 = glm::length2(R); + auto r = std::sqrt(r2); + auto inv_r2 = 1.0f / r2; + auto g1 = -inv_r2 / r; + auto g2 = -3.0f * g1 * inv_r2; + auto g3 = -5.0f * g2 * inv_r2; + auto g4 = -7.0f * g3 * inv_r2; + auto g5 = -9.0f * g4 * inv_r2; + + // auto r = glm::length(R); + // auto g1 = -1.0f / pow<3>(r); + // auto g2 = 3.0f / pow<5>(r); + // auto g3 = -15.0f / pow<7>(r); + // auto g4 = 105.0f / pow<9>(r); + // auto g5 = -945.0f / pow<11>(r); + + auto cartesian_product_term = product_of_elements(R); + if constexpr (N == 1) { + return R[static_cast(index[0])] * g1; + } else if constexpr (N == 2) { + return g1 * kronecker_delta(index) + g2 * cartesian_product_term; + } else if constexpr (N == 3) { + constexpr auto partitions = binomial_partitions<1>(index); + constexpr auto r_indices = std::get<0>(unzip(partitions)); + constexpr auto kronecker_indices = std::get<1>(unzip(partitions)); + constexpr auto repeats = repeats_table(kronecker_indices); + auto kronecker_product_term = [=](std::index_sequence) LAMBDA_ALWAYS_INLINE { + return (( + force_consteval() && (repeats[i] > 0)> ? + product_of_elements(R) * repeats[i] : 0 + ) + ...); + }(std::make_index_sequence()); + return g2 * kronecker_product_term + g3 * cartesian_product_term; + } else if constexpr (N == 4) { + auto result = g4 * cartesian_product_term; + constexpr auto partitions = binomial_partitions<2>(index); + constexpr auto kronecker_indices = std::get<0>(unzip(partitions)); + constexpr auto r_indices = std::get<1>(unzip(partitions)); + { + constexpr auto kronecker_term = [=](std::index_sequence) constexpr { + return ((kronecker_delta(kronecker_indices[i]) & kronecker_delta(r_indices[i])) + ...); + }(std::make_index_sequence()); + result += g2 * kronecker_term; + } + { + constexpr auto repeats = repeats_table(kronecker_indices); + auto kronecker_product_term = [=](std::index_sequence) LAMBDA_ALWAYS_INLINE { + return (( + force_consteval() && (repeats[i] > 0)> ? + product_of_elements(R) * repeats[i] : 0 + ) + ...); + }(std::make_index_sequence()); + result += g3 * kronecker_product_term; + } + + return result; + } else if constexpr (N == 5) { + auto result = g5 * cartesian_product_term; + return result; + } + return 0.0f; + } + + template + ALWAYS_INLINE auto derivative(const Vector &R) { + return SymmetricTensor3f::NullaryExpression([&]() LAMBDA_ALWAYS_INLINE { + return derivative_at(R); + }); + }; + + template + ALWAYS_INLINE auto derivatives(const Vector &R) { + return [&](std::index_sequence) LAMBDA_ALWAYS_INLINE { + // return Multipole3f(derivative(R) ...); + return std::make_tuple(derivative(R) ...); + }(std::make_index_sequence()); + }; + + +} diff --git a/include/symtensor/gravity/tensorlib.h b/include/symtensor/gravity/tensorlib.h new file mode 100644 index 0000000..3b1f347 --- /dev/null +++ b/include/symtensor/gravity/tensorlib.h @@ -0,0 +1,45 @@ +#pragma once + +#define GLM_ENABLE_EXPERIMENTAL + +#include + +#include "symtensor/symtensor.h" +#include "symtensor/util.h" + +#include "tensorlib_autogen.h" +#include "symtensor/MultipoleMoment.h" + +namespace symtensor::gravity::tensorlib { + + ALWAYS_INLINE auto derivatives5(const glm::vec3 &R) { + // todo: Not a fan of this. + float d0{}; + a_3 point{R.x, R.y, R.z}; + auto d = Multipole3f<5>{}; + auto &[st1, st2, st3, st4, st5] = d; + get_multipole_derivatives( + d0, + reinterpret_cast(st1), + reinterpret_cast(st2), + reinterpret_cast(st3), + reinterpret_cast(st4), + reinterpret_cast(st5), + point + ); + return d; + } + + template + ALWAYS_INLINE auto derivative(const Vector &R) { + return derivatives5(R).template tensor(); + }; + + template + ALWAYS_INLINE auto derivatives(const Vector &R) { + auto m = Multipole3f{}; + return [&](std::index_sequence) LAMBDA_ALWAYS_INLINE { + return Multipole3f(derivative(R) ...); + }(std::make_index_sequence()); + }; +} diff --git a/include/symtensor/gravity/tensorlib_autogen.h b/include/symtensor/gravity/tensorlib_autogen.h new file mode 100644 index 0000000..3dad655 --- /dev/null +++ b/include/symtensor/gravity/tensorlib_autogen.h @@ -0,0 +1,618 @@ +#pragma once +#include "../platform.h" + +/** + * The following is generated from tensor diagram notation by Christoph Peters. + */ + +#include + + +struct A_3 { + float a[3]; +}; + +struct AA_3 { + float a[6]; +}; + +struct AAA_3 { + float a[10]; +}; + +struct AAAA_3 { + float a[15]; +}; + +struct AAAAA_3 { + float a[21]; +}; + +struct a_3 { + float a[3]; +}; + +struct aa_3A_3 { + float a[6]; +}; + +ALWAYS_INLINE A_3 einsum_a_ab_out_b_known_1(const a_3& t_0) { + A_3 result; + result.a[0] = t_0.a[0]; + result.a[1] = t_0.a[1]; + result.a[2] = t_0.a[2]; + return result; +} + +ALWAYS_INLINE aa_3A_3 einsum_a_b_ab_group_0_1(const A_3& t_0) { + aa_3A_3 result; + result.a[0] = t_0.a[0] * t_0.a[0]; + result.a[1] = t_0.a[0] * t_0.a[1]; + result.a[2] = t_0.a[0] * t_0.a[2]; + result.a[3] = t_0.a[1] * t_0.a[1]; + result.a[4] = t_0.a[1] * t_0.a[2]; + result.a[5] = t_0.a[2] * t_0.a[2]; + return result; +} + +ALWAYS_INLINE AAA_3 einsum_a_bc_abc(const A_3& t_0, const aa_3A_3& t_1) { + AAA_3 result; + result.a[0] = t_0.a[0] * t_1.a[0]; + result.a[1] = t_0.a[0] * t_1.a[1]; + result.a[2] = t_0.a[0] * t_1.a[2]; + result.a[3] = t_0.a[0] * t_1.a[3]; + result.a[4] = t_0.a[0] * t_1.a[4]; + result.a[5] = t_0.a[0] * t_1.a[5]; + result.a[6] = t_0.a[1] * t_1.a[3]; + result.a[7] = t_0.a[1] * t_1.a[4]; + result.a[8] = t_0.a[1] * t_1.a[5]; + result.a[9] = t_0.a[2] * t_1.a[5]; + return result; +} + +ALWAYS_INLINE AAAA_3 einsum_a_bcd_abcd(const A_3& t_0, const AAA_3& t_1) { + AAAA_3 result; + result.a[0] = t_0.a[0] * t_1.a[0]; + result.a[1] = t_0.a[0] * t_1.a[1]; + result.a[2] = t_0.a[0] * t_1.a[2]; + result.a[3] = t_0.a[0] * t_1.a[3]; + result.a[4] = t_0.a[0] * t_1.a[4]; + result.a[5] = t_0.a[0] * t_1.a[5]; + result.a[6] = t_0.a[0] * t_1.a[6]; + result.a[7] = t_0.a[0] * t_1.a[7]; + result.a[8] = t_0.a[0] * t_1.a[8]; + result.a[9] = t_0.a[0] * t_1.a[9]; + result.a[10] = t_0.a[1] * t_1.a[6]; + result.a[11] = t_0.a[1] * t_1.a[7]; + result.a[12] = t_0.a[1] * t_1.a[8]; + result.a[13] = t_0.a[1] * t_1.a[9]; + result.a[14] = t_0.a[2] * t_1.a[9]; + return result; +} + +ALWAYS_INLINE AAAAA_3 einsum_a_bcde_abcde(const A_3& t_0, const AAAA_3& t_1) { + AAAAA_3 result; + result.a[0] = t_0.a[0] * t_1.a[0]; + result.a[1] = t_0.a[0] * t_1.a[1]; + result.a[2] = t_0.a[0] * t_1.a[2]; + result.a[3] = t_0.a[0] * t_1.a[3]; + result.a[4] = t_0.a[0] * t_1.a[4]; + result.a[5] = t_0.a[0] * t_1.a[5]; + result.a[6] = t_0.a[0] * t_1.a[6]; + result.a[7] = t_0.a[0] * t_1.a[7]; + result.a[8] = t_0.a[0] * t_1.a[8]; + result.a[9] = t_0.a[0] * t_1.a[9]; + result.a[10] = t_0.a[0] * t_1.a[10]; + result.a[11] = t_0.a[0] * t_1.a[11]; + result.a[12] = t_0.a[0] * t_1.a[12]; + result.a[13] = t_0.a[0] * t_1.a[13]; + result.a[14] = t_0.a[0] * t_1.a[14]; + result.a[15] = t_0.a[1] * t_1.a[10]; + result.a[16] = t_0.a[1] * t_1.a[11]; + result.a[17] = t_0.a[1] * t_1.a[12]; + result.a[18] = t_0.a[1] * t_1.a[13]; + result.a[19] = t_0.a[1] * t_1.a[14]; + result.a[20] = t_0.a[2] * t_1.a[14]; + return result; +} + +ALWAYS_INLINE float einsum_a_a(const a_3& t_0, const A_3& t_1) { + float result; + result = t_0.a[0] * t_1.a[0]; + result += t_0.a[1] * t_1.a[1]; + result += t_0.a[2] * t_1.a[2]; + return result; +} + +ALWAYS_INLINE float scale(const float& factor, const float& tensor) { + float result; + result = factor * tensor; + return result; +} + +ALWAYS_INLINE A_3 einsum_0_a_0_a_known_0_2(const A_3& t_1) { + A_3 result; + result.a[0] = t_1.a[0]; + result.a[1] = t_1.a[1]; + result.a[2] = t_1.a[2]; + return result; +} + +ALWAYS_INLINE A_3 einsum_0_a_a(const float& t_0, const A_3& t_1) { + A_3 result; + result.a[0] = t_0 * t_1.a[0]; + result.a[1] = t_0 * t_1.a[1]; + result.a[2] = t_0 * t_1.a[2]; + return result; +} + +ALWAYS_INLINE AA_3 einsum_0_ab_0_ab_known_0_2(const aa_3A_3& t_1) { + AA_3 result; + result.a[0] = t_1.a[0]; + result.a[1] = t_1.a[1]; + result.a[2] = t_1.a[2]; + result.a[3] = t_1.a[3]; + result.a[4] = t_1.a[4]; + result.a[5] = t_1.a[5]; + return result; +} + +ALWAYS_INLINE AA_3 einsum_0_ab_ab(const float& t_0, const AA_3& t_1) { + AA_3 result; + result.a[0] = t_0 * t_1.a[0]; + result.a[1] = t_0 * t_1.a[1]; + result.a[2] = t_0 * t_1.a[2]; + result.a[3] = t_0 * t_1.a[3]; + result.a[4] = t_0 * t_1.a[4]; + result.a[5] = t_0 * t_1.a[5]; + return result; +} + +ALWAYS_INLINE AA_3 einsum_0_ab_ab_known_1(const float& t_0) { + AA_3 result; + result.a[0] = t_0; + result.a[3] = t_0; + result.a[5] = t_0; + result.a[1] = 0; + result.a[2] = 0; + result.a[4] = 0; + return result; +} + +ALWAYS_INLINE AA_3 add(const AA_3& lhs, const AA_3& rhs) { + AA_3 result; + result.a[0] = lhs.a[0] + rhs.a[0]; + result.a[1] = lhs.a[1] + rhs.a[1]; + result.a[2] = lhs.a[2] + rhs.a[2]; + result.a[3] = lhs.a[3] + rhs.a[3]; + result.a[4] = lhs.a[4] + rhs.a[4]; + result.a[5] = lhs.a[5] + rhs.a[5]; + return result; +} + +ALWAYS_INLINE AAA_3 einsum_0_a_bc_abc_known_0_2(const A_3& t_1) { + AAA_3 result; + result.a[0] = 3 * t_1.a[0]; + result.a[3] = t_1.a[0]; + result.a[5] = t_1.a[0]; + result.a[1] = t_1.a[1]; + result.a[6] = 3 * t_1.a[1]; + result.a[8] = t_1.a[1]; + result.a[2] = t_1.a[2]; + result.a[7] = t_1.a[2]; + result.a[9] = 3 * t_1.a[2]; + result.a[4] = 0; + return result; +} + +ALWAYS_INLINE AAA_3 einsum_0_abc_abc(const float& t_0, const AAA_3& t_1) { + AAA_3 result; + result.a[0] = t_0 * t_1.a[0]; + result.a[1] = t_0 * t_1.a[1]; + result.a[2] = t_0 * t_1.a[2]; + result.a[3] = t_0 * t_1.a[3]; + result.a[4] = t_0 * t_1.a[4]; + result.a[5] = t_0 * t_1.a[5]; + result.a[6] = t_0 * t_1.a[6]; + result.a[7] = t_0 * t_1.a[7]; + result.a[8] = t_0 * t_1.a[8]; + result.a[9] = t_0 * t_1.a[9]; + return result; +} + +ALWAYS_INLINE AAA_3 einsum_0_abc_0_abc_known_0_2(const AAA_3& t_1) { + AAA_3 result; + result.a[0] = t_1.a[0]; + result.a[1] = t_1.a[1]; + result.a[2] = t_1.a[2]; + result.a[3] = t_1.a[3]; + result.a[4] = t_1.a[4]; + result.a[5] = t_1.a[5]; + result.a[6] = t_1.a[6]; + result.a[7] = t_1.a[7]; + result.a[8] = t_1.a[8]; + result.a[9] = t_1.a[9]; + return result; +} + +ALWAYS_INLINE AAA_3 add(const AAA_3& lhs, const AAA_3& rhs) { + AAA_3 result; + result.a[0] = lhs.a[0] + rhs.a[0]; + result.a[1] = lhs.a[1] + rhs.a[1]; + result.a[2] = lhs.a[2] + rhs.a[2]; + result.a[3] = lhs.a[3] + rhs.a[3]; + result.a[4] = lhs.a[4] + rhs.a[4]; + result.a[5] = lhs.a[5] + rhs.a[5]; + result.a[6] = lhs.a[6] + rhs.a[6]; + result.a[7] = lhs.a[7] + rhs.a[7]; + result.a[8] = lhs.a[8] + rhs.a[8]; + result.a[9] = lhs.a[9] + rhs.a[9]; + return result; +} + +ALWAYS_INLINE AAAA_3 einsum_0_ab_cd_abcd_known_0_2(const aa_3A_3& t_1) { + AAAA_3 result; + result.a[0] = 6 * t_1.a[0]; + result.a[3] = t_1.a[0]; + result.a[5] = t_1.a[0]; + result.a[1] = 3 * t_1.a[1]; + result.a[6] = 3 * t_1.a[1]; + result.a[8] = t_1.a[1]; + result.a[2] = 3 * t_1.a[2]; + result.a[7] = t_1.a[2]; + result.a[9] = 3 * t_1.a[2]; + result.a[3] += t_1.a[3]; + result.a[10] = 6 * t_1.a[3]; + result.a[12] = t_1.a[3]; + result.a[4] = t_1.a[4]; + result.a[11] = 3 * t_1.a[4]; + result.a[13] = 3 * t_1.a[4]; + result.a[5] += t_1.a[5]; + result.a[12] += t_1.a[5]; + result.a[14] = 6 * t_1.a[5]; + return result; +} + +ALWAYS_INLINE AAAA_3 einsum_0_abcd_abcd(const float& t_0, const AAAA_3& t_1) { + AAAA_3 result; + result.a[0] = t_0 * t_1.a[0]; + result.a[1] = t_0 * t_1.a[1]; + result.a[2] = t_0 * t_1.a[2]; + result.a[3] = t_0 * t_1.a[3]; + result.a[4] = t_0 * t_1.a[4]; + result.a[5] = t_0 * t_1.a[5]; + result.a[6] = t_0 * t_1.a[6]; + result.a[7] = t_0 * t_1.a[7]; + result.a[8] = t_0 * t_1.a[8]; + result.a[9] = t_0 * t_1.a[9]; + result.a[10] = t_0 * t_1.a[10]; + result.a[11] = t_0 * t_1.a[11]; + result.a[12] = t_0 * t_1.a[12]; + result.a[13] = t_0 * t_1.a[13]; + result.a[14] = t_0 * t_1.a[14]; + return result; +} + +ALWAYS_INLINE AAAA_3 einsum_0_abcd_abcd_known_1(const float& t_0) { + AAAA_3 result; + result.a[0] = 3 * t_0; + result.a[3] = t_0; + result.a[5] = t_0; + result.a[10] = 3 * t_0; + result.a[12] = t_0; + result.a[14] = 3 * t_0; + result.a[1] = 0; + result.a[2] = 0; + result.a[4] = 0; + result.a[6] = 0; + result.a[7] = 0; + result.a[8] = 0; + result.a[9] = 0; + result.a[11] = 0; + result.a[13] = 0; + return result; +} + +ALWAYS_INLINE AAAA_3 einsum_0_abcd_0_abcd_known_0_2(const AAAA_3& t_1) { + AAAA_3 result; + result.a[0] = t_1.a[0]; + result.a[1] = t_1.a[1]; + result.a[2] = t_1.a[2]; + result.a[3] = t_1.a[3]; + result.a[4] = t_1.a[4]; + result.a[5] = t_1.a[5]; + result.a[6] = t_1.a[6]; + result.a[7] = t_1.a[7]; + result.a[8] = t_1.a[8]; + result.a[9] = t_1.a[9]; + result.a[10] = t_1.a[10]; + result.a[11] = t_1.a[11]; + result.a[12] = t_1.a[12]; + result.a[13] = t_1.a[13]; + result.a[14] = t_1.a[14]; + return result; +} + +ALWAYS_INLINE AAAA_3 add(const AAAA_3& lhs, const AAAA_3& rhs) { + AAAA_3 result; + result.a[0] = lhs.a[0] + rhs.a[0]; + result.a[1] = lhs.a[1] + rhs.a[1]; + result.a[2] = lhs.a[2] + rhs.a[2]; + result.a[3] = lhs.a[3] + rhs.a[3]; + result.a[4] = lhs.a[4] + rhs.a[4]; + result.a[5] = lhs.a[5] + rhs.a[5]; + result.a[6] = lhs.a[6] + rhs.a[6]; + result.a[7] = lhs.a[7] + rhs.a[7]; + result.a[8] = lhs.a[8] + rhs.a[8]; + result.a[9] = lhs.a[9] + rhs.a[9]; + result.a[10] = lhs.a[10] + rhs.a[10]; + result.a[11] = lhs.a[11] + rhs.a[11]; + result.a[12] = lhs.a[12] + rhs.a[12]; + result.a[13] = lhs.a[13] + rhs.a[13]; + result.a[14] = lhs.a[14] + rhs.a[14]; + return result; +} + +ALWAYS_INLINE AAAAA_3 einsum_0_a_bcde_abcde_known_0_2(const A_3& t_1) { + AAAAA_3 result; + result.a[0] = 15 * t_1.a[0]; + result.a[3] = 3 * t_1.a[0]; + result.a[5] = 3 * t_1.a[0]; + result.a[10] = 3 * t_1.a[0]; + result.a[12] = t_1.a[0]; + result.a[14] = 3 * t_1.a[0]; + result.a[1] = 3 * t_1.a[1]; + result.a[6] = 3 * t_1.a[1]; + result.a[8] = t_1.a[1]; + result.a[15] = 15 * t_1.a[1]; + result.a[17] = 3 * t_1.a[1]; + result.a[19] = 3 * t_1.a[1]; + result.a[2] = 3 * t_1.a[2]; + result.a[7] = t_1.a[2]; + result.a[9] = 3 * t_1.a[2]; + result.a[16] = 3 * t_1.a[2]; + result.a[18] = 3 * t_1.a[2]; + result.a[20] = 15 * t_1.a[2]; + result.a[4] = 0; + result.a[11] = 0; + result.a[13] = 0; + return result; +} + +ALWAYS_INLINE AAAAA_3 einsum_0_abcde_abcde(const float& t_0, const AAAAA_3& t_1) { + AAAAA_3 result; + result.a[0] = t_0 * t_1.a[0]; + result.a[1] = t_0 * t_1.a[1]; + result.a[2] = t_0 * t_1.a[2]; + result.a[3] = t_0 * t_1.a[3]; + result.a[4] = t_0 * t_1.a[4]; + result.a[5] = t_0 * t_1.a[5]; + result.a[6] = t_0 * t_1.a[6]; + result.a[7] = t_0 * t_1.a[7]; + result.a[8] = t_0 * t_1.a[8]; + result.a[9] = t_0 * t_1.a[9]; + result.a[10] = t_0 * t_1.a[10]; + result.a[11] = t_0 * t_1.a[11]; + result.a[12] = t_0 * t_1.a[12]; + result.a[13] = t_0 * t_1.a[13]; + result.a[14] = t_0 * t_1.a[14]; + result.a[15] = t_0 * t_1.a[15]; + result.a[16] = t_0 * t_1.a[16]; + result.a[17] = t_0 * t_1.a[17]; + result.a[18] = t_0 * t_1.a[18]; + result.a[19] = t_0 * t_1.a[19]; + result.a[20] = t_0 * t_1.a[20]; + return result; +} + +ALWAYS_INLINE AAAAA_3 einsum_0_abc_de_abcde_known_0_2(const AAA_3& t_1) { + AAAAA_3 result; + result.a[0] = 10 * t_1.a[0]; + result.a[3] = t_1.a[0]; + result.a[5] = t_1.a[0]; + result.a[1] = 6 * t_1.a[1]; + result.a[6] = 3 * t_1.a[1]; + result.a[8] = t_1.a[1]; + result.a[2] = 6 * t_1.a[2]; + result.a[7] = t_1.a[2]; + result.a[9] = 3 * t_1.a[2]; + result.a[3] += 3 * t_1.a[3]; + result.a[10] = 6 * t_1.a[3]; + result.a[12] = t_1.a[3]; + result.a[4] = 3 * t_1.a[4]; + result.a[11] = 3 * t_1.a[4]; + result.a[13] = 3 * t_1.a[4]; + result.a[5] += 3 * t_1.a[5]; + result.a[12] += t_1.a[5]; + result.a[14] = 6 * t_1.a[5]; + result.a[6] += t_1.a[6]; + result.a[15] = 10 * t_1.a[6]; + result.a[17] = t_1.a[6]; + result.a[7] += t_1.a[7]; + result.a[16] = 6 * t_1.a[7]; + result.a[18] = 3 * t_1.a[7]; + result.a[8] += t_1.a[8]; + result.a[17] += 3 * t_1.a[8]; + result.a[19] = 6 * t_1.a[8]; + result.a[9] += t_1.a[9]; + result.a[18] += t_1.a[9]; + result.a[20] = 10 * t_1.a[9]; + return result; +} + +ALWAYS_INLINE AAAAA_3 einsum_0_abcde_0_abcde_known_0_2(const AAAAA_3& t_1) { + AAAAA_3 result; + result.a[0] = t_1.a[0]; + result.a[1] = t_1.a[1]; + result.a[2] = t_1.a[2]; + result.a[3] = t_1.a[3]; + result.a[4] = t_1.a[4]; + result.a[5] = t_1.a[5]; + result.a[6] = t_1.a[6]; + result.a[7] = t_1.a[7]; + result.a[8] = t_1.a[8]; + result.a[9] = t_1.a[9]; + result.a[10] = t_1.a[10]; + result.a[11] = t_1.a[11]; + result.a[12] = t_1.a[12]; + result.a[13] = t_1.a[13]; + result.a[14] = t_1.a[14]; + result.a[15] = t_1.a[15]; + result.a[16] = t_1.a[16]; + result.a[17] = t_1.a[17]; + result.a[18] = t_1.a[18]; + result.a[19] = t_1.a[19]; + result.a[20] = t_1.a[20]; + return result; +} + +ALWAYS_INLINE AAAAA_3 add(const AAAAA_3& lhs, const AAAAA_3& rhs) { + AAAAA_3 result; + result.a[0] = lhs.a[0] + rhs.a[0]; + result.a[1] = lhs.a[1] + rhs.a[1]; + result.a[2] = lhs.a[2] + rhs.a[2]; + result.a[3] = lhs.a[3] + rhs.a[3]; + result.a[4] = lhs.a[4] + rhs.a[4]; + result.a[5] = lhs.a[5] + rhs.a[5]; + result.a[6] = lhs.a[6] + rhs.a[6]; + result.a[7] = lhs.a[7] + rhs.a[7]; + result.a[8] = lhs.a[8] + rhs.a[8]; + result.a[9] = lhs.a[9] + rhs.a[9]; + result.a[10] = lhs.a[10] + rhs.a[10]; + result.a[11] = lhs.a[11] + rhs.a[11]; + result.a[12] = lhs.a[12] + rhs.a[12]; + result.a[13] = lhs.a[13] + rhs.a[13]; + result.a[14] = lhs.a[14] + rhs.a[14]; + result.a[15] = lhs.a[15] + rhs.a[15]; + result.a[16] = lhs.a[16] + rhs.a[16]; + result.a[17] = lhs.a[17] + rhs.a[17]; + result.a[18] = lhs.a[18] + rhs.a[18]; + result.a[19] = lhs.a[19] + rhs.a[19]; + result.a[20] = lhs.a[20] + rhs.a[20]; + return result; +} + +ALWAYS_INLINE void get_multipole_derivatives(float& derivative_0, A_3& derivative_1, AA_3& derivative_2, AAA_3& derivative_3, AAAA_3& derivative_4, AAAAA_3& derivative_5, const a_3& point) { + // + A_3 pA_power_1 = einsum_a_ab_out_b_known_1(point); + // fma: 6 + aa_3A_3 pA_power_2 = einsum_a_b_ab_group_0_1(pA_power_1); + // fma: 10 + AAA_3 pA_power_3 = einsum_a_bc_abc(pA_power_1, pA_power_2); + // fma: 15 + AAAA_3 pA_power_4 = einsum_a_bcd_abcd(pA_power_1, pA_power_3); + // fma: 21 + AAAAA_3 pA_power_5 = einsum_a_bcde_abcde(pA_power_1, pA_power_4); + // fma: 3 + float tmp = einsum_a_a(point, pA_power_1); + // sqrt: 1 + float g_0 = 1 / std::sqrt(tmp); + // fma: 1 + float g_power_1 = scale(1, g_0); + // fma: 1 + float g_power_2 = g_power_1 * g_0; + // fma: 1 + float g_power_3 = g_power_2 * g_0; + // fma: 1 + float g_power_4 = g_power_3 * g_0; + // fma: 1 + float g_power_5 = g_power_4 * g_0; + // fma: 1 + float g_power_6 = g_power_5 * g_0; + // fma: 1 + float g_power_7 = g_power_6 * g_0; + // fma: 1 + float g_power_8 = g_power_7 * g_0; + // fma: 1 + float g_power_9 = g_power_8 * g_0; + // fma: 1 + float g_power_10 = g_power_9 * g_0; + // fma: 1 + float g_power_11 = g_power_10 * g_0; + // fma: 1 + float tmp_01 = scale(1, g_power_1); + // + float g_0_01 = tmp_01; + // fma: 1 + float tmp_02 = scale(-1, g_power_3); + // + A_3 tmp_03 = einsum_0_a_0_a_known_0_2(pA_power_1); + // fma: 3 + A_3 g_1 = einsum_0_a_a(tmp_02, tmp_03); + // fma: 1 + float tmp_04 = scale(3, g_power_5); + // + AA_3 tmp_05 = einsum_0_ab_0_ab_known_0_2(pA_power_2); + // fma: 6 + AA_3 tmp_06 = einsum_0_ab_ab(tmp_04, tmp_05); + // fma: 1 + float tmp_07 = scale(-1, g_power_3); + // + AA_3 tmp_08 = einsum_0_ab_ab_known_1(tmp_07); + // fma: 6 + AA_3 g_2 = add(tmp_06, tmp_08); + // fma: 1 + float tmp_09 = scale(3, g_power_5); + // fma: 3 + AAA_3 tmp_10 = einsum_0_a_bc_abc_known_0_2(pA_power_1); + // fma: 10 + AAA_3 tmp_11 = einsum_0_abc_abc(tmp_09, tmp_10); + // fma: 1 + float tmp_12 = scale(-15, g_power_7); + // + AAA_3 tmp_13 = einsum_0_abc_0_abc_known_0_2(pA_power_3); + // fma: 10 + AAA_3 tmp_14 = einsum_0_abc_abc(tmp_12, tmp_13); + // fma: 10 + AAA_3 g_3 = add(tmp_11, tmp_14); + // fma: 1 + float tmp_15 = scale(-15, g_power_7); + // fma: 9 + AAAA_3 tmp_16 = einsum_0_ab_cd_abcd_known_0_2(pA_power_2); + // fma: 15 + AAAA_3 tmp_17 = einsum_0_abcd_abcd(tmp_15, tmp_16); + // fma: 1 + float tmp_18 = scale(3, g_power_5); + // fma: 3 + AAAA_3 tmp_19 = einsum_0_abcd_abcd_known_1(tmp_18); + // fma: 1 + float tmp_20 = scale(1.05e+02f, g_power_9); + // + AAAA_3 tmp_21 = einsum_0_abcd_0_abcd_known_0_2(pA_power_4); + // fma: 15 + AAAA_3 tmp_22 = einsum_0_abcd_abcd(tmp_20, tmp_21); + // fma: 15 + AAAA_3 tmp_23 = add(tmp_17, tmp_19); + // fma: 15 + AAAA_3 g_4 = add(tmp_23, tmp_22); + // fma: 1 + float tmp_24 = scale(-15, g_power_7); + // fma: 15 + AAAAA_3 tmp_25 = einsum_0_a_bcde_abcde_known_0_2(pA_power_1); + // fma: 21 + AAAAA_3 tmp_26 = einsum_0_abcde_abcde(tmp_24, tmp_25); + // fma: 1 + float tmp_27 = scale(1.05e+02f, g_power_9); + // fma: 21 + AAAAA_3 tmp_28 = einsum_0_abc_de_abcde_known_0_2(pA_power_3); + // fma: 21 + AAAAA_3 tmp_29 = einsum_0_abcde_abcde(tmp_27, tmp_28); + // fma: 1 + float remaining_factor = scale(-9.45e+02f, g_power_11); + // + AAAAA_3 tmp_30 = einsum_0_abcde_0_abcde_known_0_2(pA_power_5); + // fma: 21 + AAAAA_3 term = einsum_0_abcde_abcde(remaining_factor, tmp_30); + // fma: 21 + AAAAA_3 tmp_31 = add(tmp_26, tmp_29); + // fma: 21 + AAAAA_3 g_5 = add(tmp_31, term); + derivative_0 = g_0_01; + derivative_1 = g_1; + derivative_2 = g_2; + derivative_3 = g_3; + derivative_4 = g_4; + derivative_5 = g_5; + // fma: 339, sqrt: 1 +} +