From f1463b45d029bee43c1f990da56e3fb5486daed1 Mon Sep 17 00:00:00 2001
From: JacksonCampolattaro <jackcamp@vt.edu>
Date: Sat, 29 Jun 2024 18:52:20 +0200
Subject: [PATCH] Organize different gravity derivative implementations,
 including Christoph's generated code.

---
 README.md                                     |  13 +-
 benchmarks/CMakeLists.txt                     |   5 +
 benchmarks/multipoleMoment.cpp                |  67 +-
 doc/pages/main.md                             |  13 +-
 include/symtensor/SymmetricTensorBase.h       |  25 +-
 .../symtensor/{gravity.h => gravity/direct.h} | 120 +---
 include/symtensor/gravity/einsum.h            | 101 +++
 include/symtensor/gravity/tensorlib.h         |  45 ++
 include/symtensor/gravity/tensorlib_autogen.h | 618 ++++++++++++++++++
 9 files changed, 847 insertions(+), 160 deletions(-)
 rename include/symtensor/{gravity.h => gravity/direct.h} (56%)
 create mode 100644 include/symtensor/gravity/einsum.h
 create mode 100644 include/symtensor/gravity/tensorlib.h
 create mode 100644 include/symtensor/gravity/tensorlib_autogen.h

diff --git a/README.md b/README.md
index c9e81b9..7a3e612 100644
--- a/README.md
+++ b/README.md
@@ -7,15 +7,14 @@
 ![MIT license](https://img.shields.io/badge/license-MIT-A51931) 
 ![C++20](https://img.shields.io/badge/standard-C%2B%2B20-blue?logo=cplusplus)
 
-_For more information, see the [documentation](https://jackson.campolattaro.nl/symtensor)!_
+_For more information, see the [documentation](https://jackson.campolattaro.nl/symtensor)._
 
-Symtensor is designed for working with tensors which are...
-- **Symmetric** — only the upper triangle needs to be stored.
-- **Small** — indices have semantic meaning, similar to GLM's `mat3x3` type.
-- **High-rank** — not just vectors and matrices, these tensors can have any number of dimensions.
+Symtensor is designed for working with tensors which are **Symmetric**, **Small** and **High-rank**, and for the composition of those tensors into Multipoles.
 
-These are unusual needs for a tensor library. The design comes from symtensor's original purpose as part of an implementation of the Fast Multipole Method for an [_n_-body gravity simulation](https://github.com/JacksonCampolattaro/n-body). Popular tensor libraries like [GLM](https://github.com/g-truc/glm), [Eigen](https://github.com/libigl/eigen), and [Fastor](https://github.com/romeric/Fastor) are immediately ruled out for this purpose.
+These are unusual needs for a tensor library. The design comes from symtensor's original purpose as part of an implementation of the Fast Multipole Method for an [_n_-body gravity simulation](https://github.com/JacksonCampolattaro/n-body). Popular tensor libraries like [GLM](https://github.com/g-truc/glm), [Eigen](https://github.com/libigl/eigen), and [Fastor](https://github.com/romeric/Fastor) are ruled out for this purpose because they don't support condensed storage of symmetric tensors.
 
-There are a handful of projects which aim to solve the same problems as this one, including [Tensor](https://github.com/thenumbernine/Tensor), [GADGET-4's symmetric tensor library](https://github.com/weiguangcui/Gadget4/blob/Gadget4-Simba/src/data/symtensors.h), and [fmmgen](https://github.com/rpep/fmmgen). Tensor is feature-rich, but has a broader maths focus and provides no benchmarks. GADGET-4's library has been proven to perform well for _n_-body simulations, but is ultimately limited by its handwritten operators -- it can't be used for high-order multipoles. Fmmgen uses a script to generate efficient multipole and symmetric tensor code which makes it very flexible, but the code it produces is entirely based on C-style arrays and not intended to be human-readable.
+There are a handful of projects which aim to solve the same problems as this one, including [Tensor](https://github.com/thenumbernine/Tensor), [GADGET-4's symmetric tensor library](https://github.com/weiguangcui/Gadget4/blob/Gadget4-Simba/src/data/symtensors.h), and [fmmgen](https://github.com/rpep/fmmgen). Tensor is feature-rich, but has a broader maths focus and doesn't provide benchmarks. GADGET-4's library has been proven to perform well for _n_-body simulations, but is ultimately limited by its handwritten operators -- it can't be used for high-order multipoles. Fmmgen uses a script to generate efficient multipole and symmetric tensor code which makes it very flexible, but the code it produces is based on C-style arrays and not intended to be human-readable.
 
 The goal of this project is to produce a library which combines the modern sensibilities of _Tensor_ with the performance of _GADGET-4_'s library and the flexibility of _fmmgen_.
+
+Eventually, I intend to incorporate symtensor into a larger _n_-body solver built from the ground up. To this end, [Christoph Peters](https://momentsingraphics.de/About.html) has been a big help by generating derivatives of the Newton's gravity equation up to arbitrarily high orders. 
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index a74cb98..8cb0db0 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -27,3 +27,8 @@ target_link_libraries(benchmarks PRIVATE
         Catch2::Catch2WithMain
         symtensor::symtensor
 )
+target_compile_options(
+        benchmarks PRIVATE
+        $<$<CXX_COMPILER_ID:Clang>:-fassociative-math>
+        $<$<CXX_COMPILER_ID:AppleClang>:-fassociative-math>
+)
diff --git a/benchmarks/multipoleMoment.cpp b/benchmarks/multipoleMoment.cpp
index 380d7ff..4352fdb 100644
--- a/benchmarks/multipoleMoment.cpp
+++ b/benchmarks/multipoleMoment.cpp
@@ -5,7 +5,10 @@
 #include <glm/geometric.hpp>
 
 #include <symtensor/Multipole.h>
-#include <symtensor/gravity.h>
+
+#include "symtensor/gravity/direct.h"
+#include "symtensor/gravity/einsum.h"
+#include "symtensor/gravity/tensorlib.h"
 
 using namespace symtensor;
 
@@ -16,31 +19,39 @@ TEST_CASE("benchmark: Multipole moment gravity approximation", "[MultipoleMoment
 TEST_CASE("benchmark: Gravity derivatives construction", "[Gravity]") {
     auto R = glm::vec3{1.0, 2.0, 3.0};
 
-
-    constexpr std::array<int, 5> input = {1, 2, 2, 3, 1};
-    constexpr auto result = filter<input, [&](auto v) constexpr { return v != 3; }>();
-
-    // Print the result (for non-constexpr context)
-    for (const auto &value: result) {
-        std::cout << value << std::endl;
-    }
-
-    CHECK((gravity::D<2>(R) - gravity::derivative<2>(R)).norm() < 1e-7);
-    CHECK((gravity::D<3>(R) - gravity::derivative<3>(R)).norm() < 1e-7);
-    CHECK((gravity::D<4>(R) - gravity::derivative<4>(R)).norm() < 1e-7);
-    CHECK((gravity::D<5>(R) - gravity::derivative<5>(R)).norm() < 1e-7); // todo
-
-//    BENCHMARK("D' Construction") { return gravity::D<1>(R); };
-//    BENCHMARK("D' Construction (improved)") { return gravity::derivative<1>(R); };
-//    BENCHMARK("D'' Construction") { return gravity::D<2>(R); };
-//    BENCHMARK("D'' Construction (improved)") { return gravity::derivative<2>(R); };
-//    BENCHMARK("D''' Construction") { return gravity::D<3>(R); };
-//    BENCHMARK("D''' Construction (improved)") { return gravity::derivative<3>(R); };
-//    BENCHMARK("D'''' Construction") { return gravity::D<4>(R); };
-//    BENCHMARK("D'''' Construction (improved)") { return gravity::derivative<4>(R); };
-//    BENCHMARK("D''''' Construction") { return gravity::D<5>(R); };
-//    BENCHMARK("D''''' Construction (improved)") { return gravity::derivative<5>(R); }; // todo
-
-    BENCHMARK("D1-4 Construction") { return gravity::Ds<4>(R); };
-    BENCHMARK("D1-4 Construction (improved)") { return gravity::derivatives<4>(R); };
+    // Einsum must match direct form
+    CHECK((gravity::direct::derivative<2>(R) - gravity::einsum::derivative<2>(R)).norm() < 1e-7);
+    CHECK((gravity::direct::derivative<3>(R) - gravity::einsum::derivative<3>(R)).norm() < 1e-7);
+    CHECK((gravity::direct::derivative<4>(R) - gravity::einsum::derivative<4>(R)).norm() < 1e-7);
+    // CHECK((gravity::D<5>(R) - gravity::derivative<5>(R)).norm() < 1e-7); // todo: unimplemented
+
+    // Tensorlib must match direct form
+    CHECK((gravity::direct::derivative<2>(R) - gravity::tensorlib::derivative<2>(R)).norm() < 1e-7);
+    CHECK((gravity::direct::derivative<3>(R) - gravity::tensorlib::derivative<3>(R)).norm() < 1e-7);
+    CHECK((gravity::direct::derivative<4>(R) - gravity::tensorlib::derivative<4>(R)).norm() < 1e-7);
+    CHECK((gravity::direct::derivative<5>(R) - gravity::tensorlib::derivative<5>(R)).norm() < 1e-7);
+
+    BENCHMARK("D' (direct)") { return gravity::direct::derivative<1>(R); };
+    BENCHMARK("D' (einsum)") { return gravity::einsum::derivative<1>(R); };
+    BENCHMARK("D' (tensorlib)") { return gravity::tensorlib::derivative<1>(R); };
+    BENCHMARK("D'' (direct)") { return gravity::direct::derivative<2>(R); };
+    BENCHMARK("D'' (einsum)") { return gravity::einsum::derivative<2>(R); };
+    BENCHMARK("D'' (tensorlib)") { return gravity::tensorlib::derivative<2>(R); };
+    BENCHMARK("D''' (direct)") { return gravity::direct::derivative<3>(R); };
+    BENCHMARK("D''' (einsum)") { return gravity::einsum::derivative<3>(R); };
+    BENCHMARK("D''' (tensorlib)") { return gravity::tensorlib::derivative<3>(R); };
+    BENCHMARK("D'''' (direct)") { return gravity::direct::derivative<4>(R); };
+    BENCHMARK("D'''' (einsum)") { return gravity::einsum::derivative<4>(R); };
+    BENCHMARK("D'''' (tensorlib)") { return gravity::tensorlib::derivative<4>(R); };
+
+    BENCHMARK("D' - D'''' (direct)") { return gravity::direct::derivatives<4>(R); };
+    BENCHMARK("D' - D'''' (einsum)") { return gravity::einsum::derivatives<4>(R); };
+    BENCHMARK("D' - D'''' (tensorlib)") { return gravity::tensorlib::derivatives<4>(R); };
+
+    BENCHMARK("D' - D''''' (direct)") { return gravity::direct::derivatives<5>(R); };
+    //BENCHMARK("D' - D''''' (einsum)") { return gravity::einsum::derivatives<5>(R); }; // todo: unimplemented
+    BENCHMARK("D' - D''''' (tensorlib)") { return gravity::tensorlib::derivatives<5>(R); };
+
+    //    BENCHMARK("D1-5 Construction (direct)") { return gravity::direct::derivatives<5>(R); };
+    //    BENCHMARK("D1-5 Construction (tensorlib)") { return gravity::tensorlib::derivatives<5>(R); };
 }
diff --git a/doc/pages/main.md b/doc/pages/main.md
index 3a95202..f1fae55 100644
--- a/doc/pages/main.md
+++ b/doc/pages/main.md
@@ -11,13 +11,12 @@
 ![C++20](https://img.shields.io/badge/standard-C%2B%2B20-blue?logo=cplusplus)
 @m_enddiv
 
-Symtensor is designed for working with tensors which are...
-- **Symmetric** — only the upper triangle needs to be stored.
-- **Small** — indices have semantic meaning, similar to GLM's `mat3x3` type.
-- **High-rank** — not just vectors and matrices, these tensors can have any number of dimensions.
+Symtensor is designed for working with tensors which are **Symmetric**, **Small** and **High-rank**, and for the composition of those tensors into Multipoles.
 
-These are unusual needs for a tensor library. The design comes from symtensor's original purpose as part of an implementation of the Fast Multipole Method for an [_n_-body gravity simulation](https://github.com/JacksonCampolattaro/n-body). Popular tensor libraries like [GLM](https://github.com/g-truc/glm), [Eigen](https://github.com/libigl/eigen), and [Fastor](https://github.com/romeric/Fastor) are immediately ruled out for this purpose.
+These are unusual needs for a tensor library. The design comes from symtensor's original purpose as part of an implementation of the Fast Multipole Method for an [_n_-body gravity simulation](https://github.com/JacksonCampolattaro/n-body). Popular tensor libraries like [GLM](https://github.com/g-truc/glm), [Eigen](https://github.com/libigl/eigen), and [Fastor](https://github.com/romeric/Fastor) are ruled out for this purpose because they don't support condensed storage of symmetric tensors.
 
-There are a handful of projects which aim to solve the same problems as this one, including [Tensor](https://github.com/thenumbernine/Tensor), [GADGET-4's symmetric tensor library](https://github.com/weiguangcui/Gadget4/blob/Gadget4-Simba/src/data/symtensors.h), and [fmmgen](https://github.com/rpep/fmmgen). Tensor is feature-rich, but has a broader maths focus and provides no benchmarks. GADGET-4's library has been proven to perform well for _n_-body simulations, but is ultimately limited by its handwritten operators -- it can't be used for high-order multipoles. Fmmgen uses a script to generate efficient multipole and symmetric tensor code which makes it very flexible, but the code it produces is entirely based on C-style arrays and not intended to be human-readable.
+There are a handful of projects which aim to solve the same problems as this one, including [Tensor](https://github.com/thenumbernine/Tensor), [GADGET-4's symmetric tensor library](https://github.com/weiguangcui/Gadget4/blob/Gadget4-Simba/src/data/symtensors.h), and [fmmgen](https://github.com/rpep/fmmgen). Tensor is feature-rich, but has a broader maths focus and doesn't provide benchmarks. GADGET-4's library has been proven to perform well for _n_-body simulations, but is ultimately limited by its handwritten operators -- it can't be used for high-order multipoles. Fmmgen uses a script to generate efficient multipole and symmetric tensor code which makes it very flexible, but the code it produces is based on C-style arrays and not intended to be human-readable.
 
-The goal of this project is to produce a library which combines the modern sensibilities of _Tensor_ with the performance of _GADGET-4_'s library and the flexibility of _fmmgen_.
\ No newline at end of file
+The goal of this project is to produce a library which combines the modern sensibilities of _Tensor_ with the performance of _GADGET-4_'s library and the flexibility of _fmmgen_.
+
+Eventually, I intend to incorporate symtensor into a larger _n_-body solver built from the ground up. To this end, [Christoph Peters](https://momentsingraphics.de/About.html) has been a big help by generating derivatives of the Newton's gravity equation up to arbitrarily high orders.
diff --git a/include/symtensor/SymmetricTensorBase.h b/include/symtensor/SymmetricTensorBase.h
index 31a63bc..d9682fa 100644
--- a/include/symtensor/SymmetricTensorBase.h
+++ b/include/symtensor/SymmetricTensorBase.h
@@ -167,6 +167,13 @@ namespace symtensor {
             static_assert(sizeof...(s) == NumUniqueValues);
         }
 
+        /**
+         * @brief Constructor from an std::array of scalars.
+         *
+         * @param s an array of scalar values to initialize the tensor.
+         */
+        explicit constexpr SymmetricTensorBase(const std::array<S, NumUniqueValues> &values) : _data{values} {}
+
         /**
          * @brief (Implicit) constructor from a sequence of values.
          *
@@ -692,15 +699,15 @@ namespace symtensor {
             static_assert(NumValues > 0);
 
             // Use a lookup table when the range of options is small enough
-//            if constexpr (R < 3)
-//                return [&]<std::size_t... i>(std::index_sequence<i...>) {
-//                    return as_lookup_table<
-//                            decltype([](std::array<I, R> ind) consteval { return symtensor::flatIndex(ind, D); }),
-//                            std::array<I, R>,
-//                            lexicographicalIndices(i)...
-//                    >(indices);
-//                }(std::make_index_sequence<NumValues>());
-//            else
+            //            if constexpr (R < 3)
+            //                return [&]<std::size_t... i>(std::index_sequence<i...>) {
+            //                    return as_lookup_table<
+            //                            decltype([](std::array<I, R> ind) consteval { return symtensor::flatIndex(ind, D); }),
+            //                            std::array<I, R>,
+            //                            lexicographicalIndices(i)...
+            //                    >(indices);
+            //                }(std::make_index_sequence<NumValues>());
+            //            else
             return symtensor::flatIndex(indices, D);
         }
 
diff --git a/include/symtensor/gravity.h b/include/symtensor/gravity/direct.h
similarity index 56%
rename from include/symtensor/gravity.h
rename to include/symtensor/gravity/direct.h
index 5d9b60d..e4545c0 100644
--- a/include/symtensor/gravity.h
+++ b/include/symtensor/gravity/direct.h
@@ -4,104 +4,14 @@
 
 #include <glm/gtx/norm.hpp>
 
-#include <symtensor/symtensor.h>
-#include <symtensor/util.h>
+#include "symtensor/symtensor.h"
+#include "symtensor/util.h"
 
-namespace symtensor::gravity {
+namespace symtensor::gravity::direct {
 
-    template<auto index, indexable Vector>
-    inline constexpr auto product_of_elements(const Vector &v) {
-        // todo: maybe don't use std::apply for this?
-        return std::apply([&](auto ...i) {
-            return (v[static_cast<std::size_t>(i)] * ...);
-        }, index);
-    }
-
-    template<auto index, std::size_t N, indexable Vector>
-    ALWAYS_INLINE auto derivative_at(const Vector &R) {
-
-        auto r2 = glm::length2(R);
-        auto r = sqrt(r2);
-        auto inv_r2 = 1.0f / r2;
-        auto g1 = -inv_r2 / r;
-        auto g2 = -3.0f * g1 * inv_r2;
-        auto g3 = -5.0f * g2 * inv_r2;
-        auto g4 = -7.0f * g3 * inv_r2;
-        auto g5 = -9.0f * g4 * inv_r2;
-
-//        auto r = glm::length(R);
-//        auto g1 = -1.0f / pow<3>(r);
-//        auto g2 = 3.0f / pow<5>(r);
-//        auto g3 = -15.0f / pow<7>(r);
-//        auto g4 = 105.0f / pow<9>(r);
-//        auto g5 = -945.0f / pow<11>(r);
-
-        auto cartesian_product_term = product_of_elements<index>(R);
-        if constexpr (N == 1) {
-            return R[static_cast<std::size_t>(index[0])] * g1;
-        } else if constexpr (N == 2) {
-            return g1 * kronecker_delta<float>(index) + g2 * cartesian_product_term;
-        } else if constexpr (N == 3) {
-            constexpr auto partitions = binomial_partitions<1>(index);
-            constexpr auto r_indices = std::get<0>(unzip(partitions));
-            constexpr auto kronecker_indices = std::get<1>(unzip(partitions));
-            constexpr auto repeats = repeats_table(kronecker_indices);
-            auto kronecker_product_term = [&]<auto... i>(std::index_sequence<i...>) LAMBDA_ALWAYS_INLINE {
-                return ((
-                        force_consteval<kronecker_delta<kronecker_indices[i]>() && (repeats[i] > 0)> ?
-                        product_of_elements<r_indices[i]>(R) * repeats[i] : 0
-                ) + ...);
-            }(std::make_index_sequence<kronecker_indices.size()>());
-            return g2 * kronecker_product_term + g3 * cartesian_product_term;
-        } else if constexpr (N == 4) {
-            auto result = g4 * cartesian_product_term;
-            constexpr auto partitions = binomial_partitions<2>(index);
-            constexpr auto kronecker_indices = std::get<0>(unzip(partitions));
-            constexpr auto r_indices = std::get<1>(unzip(partitions));
-            {
-                constexpr auto kronecker_term = [&]<auto... i>(std::index_sequence<i...>) constexpr {
-                    return ((kronecker_delta(kronecker_indices[i]) & kronecker_delta(r_indices[i])) + ...);
-                }(std::make_index_sequence<kronecker_indices.size() / 2>());
-                result += g2 * kronecker_term;
-            }
-            {
-                constexpr auto repeats = repeats_table(kronecker_indices);
-                auto kronecker_product_term = [&]<auto... i>(std::index_sequence<i...>) LAMBDA_ALWAYS_INLINE {
-                    return ((
-                            force_consteval<kronecker_delta<kronecker_indices[i]>() && (repeats[i] > 0)> ?
-                            product_of_elements<r_indices[i]>(R) * repeats[i] : 0
-                    ) + ...);
-                }(std::make_index_sequence<kronecker_indices.size()>());
-                result += g3 * kronecker_product_term;
-            }
-
-            return result;
-        } else if constexpr (N == 5) {
-            auto result = g5 * cartesian_product_term;
-            return result;
-        }
-        return 0.0f;
-    }
-
-    template<std::size_t N, indexable Vector>
-    ALWAYS_INLINE auto derivative(const Vector &R) {
-        return SymmetricTensor3f<N>::NullaryExpression([&]<auto index>()  LAMBDA_ALWAYS_INLINE {
-            return derivative_at<index, N>(R);
-        });
-    };
-
-    template<std::size_t N, indexable Vector>
-    ALWAYS_INLINE auto derivatives(const Vector &R) {
-        return [&]<auto... i>(std::index_sequence<i...>) LAMBDA_ALWAYS_INLINE {
-//            return Multipole3f<N>(derivative<i + 1>(R) ...);
-            return std::make_tuple(derivative<i + 1>(R) ...);
-        }(std::make_index_sequence<N>());
-    };
-
-
-    // todo: this needs to be regularized -- there must be a pattern!
+    // An implementation of gravity derivatives in the style of GADGET-4
     template<std::size_t Order, indexable Vector>
-    ALWAYS_INLINE auto D(const Vector &R) {
+    ALWAYS_INLINE auto derivative(const Vector &R) {
 
         auto r = glm::length(R);
 
@@ -181,10 +91,10 @@ namespace symtensor::gravity {
         } else if constexpr (Order == 5) {
 
 
-            SymmetricTensor3f<2> R2 = SymmetricTensor3f<2>::CartesianPower(R);
-            SymmetricTensor3f<3> R3 = SymmetricTensor3f<3>::CartesianPower(R);
+            SymmetricTensor3f<2> R2 = SymmetricTensor3f < 2 > ::CartesianPower(R);
+            SymmetricTensor3f<3> R3 = SymmetricTensor3f < 3 > ::CartesianPower(R);
 
-            auto A = g5 * SymmetricTensor3f<5>::CartesianPower(R);
+            auto A = g5 * SymmetricTensor3f < 5 > ::CartesianPower(R);
 
             // ~~~
 
@@ -251,19 +161,11 @@ namespace symtensor::gravity {
     }
 
     template<std::size_t N, indexable Vector>
-    ALWAYS_INLINE auto Ds(const Vector &R) {
+    ALWAYS_INLINE auto derivatives(const Vector &R) {
         return [&]<auto... i>(std::index_sequence<i...>) LAMBDA_ALWAYS_INLINE {
-            return std::make_tuple(D<i + 1>(R) ...);
+            return std::make_tuple(derivative<i + 1>(R) ...);
         }(std::make_index_sequence<N>());
     };
 
 
-    auto derivative4(const glm::vec3 &R) {
-        return derivatives<4>(R);
-    }
-
-    auto D4(const glm::vec3 &R) {
-        return Ds<4>(R);
-    }
-
-}
+}
\ No newline at end of file
diff --git a/include/symtensor/gravity/einsum.h b/include/symtensor/gravity/einsum.h
new file mode 100644
index 0000000..d0a2530
--- /dev/null
+++ b/include/symtensor/gravity/einsum.h
@@ -0,0 +1,101 @@
+#pragma once
+
+#define GLM_ENABLE_EXPERIMENTAL
+
+#include <glm/gtx/norm.hpp>
+
+#include "symtensor/symtensor.h"
+#include "symtensor/util.h"
+
+namespace symtensor::gravity::einsum {
+
+    template<auto index, indexable Vector>
+    inline constexpr auto product_of_elements(const Vector &v) {
+        return std::apply([=](auto ...i) {
+            return (v[static_cast<std::size_t>(i)] * ...);
+        }, index);
+    }
+
+    template<auto index, std::size_t N, indexable Vector>
+    ALWAYS_INLINE auto derivative_at(const Vector &R) {
+
+        auto r2 = glm::length2(R);
+        auto r = std::sqrt(r2);
+        auto inv_r2 = 1.0f / r2;
+        auto g1 = -inv_r2 / r;
+        auto g2 = -3.0f * g1 * inv_r2;
+        auto g3 = -5.0f * g2 * inv_r2;
+        auto g4 = -7.0f * g3 * inv_r2;
+        auto g5 = -9.0f * g4 * inv_r2;
+
+        //        auto r = glm::length(R);
+        //        auto g1 = -1.0f / pow<3>(r);
+        //        auto g2 = 3.0f / pow<5>(r);
+        //        auto g3 = -15.0f / pow<7>(r);
+        //        auto g4 = 105.0f / pow<9>(r);
+        //        auto g5 = -945.0f / pow<11>(r);
+
+        auto cartesian_product_term = product_of_elements<index>(R);
+        if constexpr (N == 1) {
+            return R[static_cast<std::size_t>(index[0])] * g1;
+        } else if constexpr (N == 2) {
+            return g1 * kronecker_delta<float>(index) + g2 * cartesian_product_term;
+        } else if constexpr (N == 3) {
+            constexpr auto partitions = binomial_partitions<1>(index);
+            constexpr auto r_indices = std::get<0>(unzip(partitions));
+            constexpr auto kronecker_indices = std::get<1>(unzip(partitions));
+            constexpr auto repeats = repeats_table(kronecker_indices);
+            auto kronecker_product_term = [=]<auto... i>(std::index_sequence<i...>) LAMBDA_ALWAYS_INLINE {
+                    return ((
+                    force_consteval<kronecker_delta<kronecker_indices[i]>() && (repeats[i] > 0)> ?
+                    product_of_elements<r_indices[i]>(R) * repeats[i] : 0
+                    ) + ...);
+            }(std::make_index_sequence<kronecker_indices.size()>());
+            return g2 * kronecker_product_term + g3 * cartesian_product_term;
+        } else if constexpr (N == 4) {
+            auto result = g4 * cartesian_product_term;
+            constexpr auto partitions = binomial_partitions<2>(index);
+            constexpr auto kronecker_indices = std::get<0>(unzip(partitions));
+            constexpr auto r_indices = std::get<1>(unzip(partitions));
+            {
+                constexpr auto kronecker_term = [=]<auto... i>(std::index_sequence<i...>) constexpr {
+                    return ((kronecker_delta(kronecker_indices[i]) & kronecker_delta(r_indices[i])) + ...);
+                }(std::make_index_sequence<kronecker_indices.size() / 2>());
+                result += g2 * kronecker_term;
+            }
+            {
+                constexpr auto repeats = repeats_table(kronecker_indices);
+                auto kronecker_product_term = [=]<auto... i>(std::index_sequence<i...>) LAMBDA_ALWAYS_INLINE {
+                        return ((
+                        force_consteval<kronecker_delta<kronecker_indices[i]>() && (repeats[i] > 0)> ?
+                        product_of_elements<r_indices[i]>(R) * repeats[i] : 0
+                        ) + ...);
+                }(std::make_index_sequence<kronecker_indices.size()>());
+                result += g3 * kronecker_product_term;
+            }
+
+            return result;
+        } else if constexpr (N == 5) {
+            auto result = g5 * cartesian_product_term;
+            return result;
+        }
+        return 0.0f;
+    }
+
+    template<std::size_t N, indexable Vector>
+    ALWAYS_INLINE auto derivative(const Vector &R) {
+        return SymmetricTensor3f<N>::NullaryExpression([&]<auto index>()  LAMBDA_ALWAYS_INLINE {
+                return derivative_at<index, N>(R);
+        });
+    };
+
+    template<std::size_t N, indexable Vector>
+    ALWAYS_INLINE auto derivatives(const Vector &R) {
+        return [&]<auto... i>(std::index_sequence<i...>) LAMBDA_ALWAYS_INLINE {
+                //                        return Multipole3f<N>(derivative<i + 1>(R) ...);
+                return std::make_tuple(derivative<i + 1>(R) ...);
+        }(std::make_index_sequence<N>());
+    };
+
+
+}
diff --git a/include/symtensor/gravity/tensorlib.h b/include/symtensor/gravity/tensorlib.h
new file mode 100644
index 0000000..3b1f347
--- /dev/null
+++ b/include/symtensor/gravity/tensorlib.h
@@ -0,0 +1,45 @@
+#pragma once
+
+#define GLM_ENABLE_EXPERIMENTAL
+
+#include <glm/gtx/norm.hpp>
+
+#include "symtensor/symtensor.h"
+#include "symtensor/util.h"
+
+#include "tensorlib_autogen.h"
+#include "symtensor/MultipoleMoment.h"
+
+namespace symtensor::gravity::tensorlib {
+
+    ALWAYS_INLINE auto derivatives5(const glm::vec3 &R) {
+        // todo: Not a fan of this.
+        float d0{};
+        a_3 point{R.x, R.y, R.z};
+        auto d = Multipole3f<5>{};
+        auto &[st1, st2, st3, st4, st5] = d;
+        get_multipole_derivatives(
+                d0,
+                reinterpret_cast<A_3 &>(st1),
+                reinterpret_cast<AA_3 &>(st2),
+                reinterpret_cast<AAA_3 &>(st3),
+                reinterpret_cast<AAAA_3 &>(st4),
+                reinterpret_cast<AAAAA_3 &>(st5),
+                point
+        );
+        return d;
+    }
+
+    template<std::size_t N, indexable Vector>
+    ALWAYS_INLINE auto derivative(const Vector &R) {
+        return derivatives5(R).template tensor<N>();
+    };
+
+    template<std::size_t N, indexable Vector>
+    ALWAYS_INLINE auto derivatives(const Vector &R) {
+        auto m = Multipole3f<N>{};
+        return [&]<auto... i>(std::index_sequence<i...>) LAMBDA_ALWAYS_INLINE {
+            return Multipole3f<N>(derivative<i + 1>(R) ...);
+        }(std::make_index_sequence<N>());
+    };
+}
diff --git a/include/symtensor/gravity/tensorlib_autogen.h b/include/symtensor/gravity/tensorlib_autogen.h
new file mode 100644
index 0000000..3dad655
--- /dev/null
+++ b/include/symtensor/gravity/tensorlib_autogen.h
@@ -0,0 +1,618 @@
+#pragma once
+#include "../platform.h"
+
+/**
+ * The following is generated from tensor diagram notation by Christoph Peters.
+ */
+
+#include <cmath>
+
+
+struct A_3 {
+	float a[3];
+};
+
+struct AA_3 {
+	float a[6];
+};
+
+struct AAA_3 {
+	float a[10];
+};
+
+struct AAAA_3 {
+	float a[15];
+};
+
+struct AAAAA_3 {
+	float a[21];
+};
+
+struct a_3 {
+	float a[3];
+};
+
+struct aa_3A_3 {
+	float a[6];
+};
+
+ALWAYS_INLINE A_3 einsum_a_ab_out_b_known_1(const a_3& t_0) {
+	A_3 result;
+	result.a[0] =  t_0.a[0];
+	result.a[1] =  t_0.a[1];
+	result.a[2] =  t_0.a[2];
+	return result;
+}
+
+ALWAYS_INLINE aa_3A_3 einsum_a_b_ab_group_0_1(const A_3& t_0) {
+	aa_3A_3 result;
+	result.a[0] =  t_0.a[0] * t_0.a[0];
+	result.a[1] =  t_0.a[0] * t_0.a[1];
+	result.a[2] =  t_0.a[0] * t_0.a[2];
+	result.a[3] =  t_0.a[1] * t_0.a[1];
+	result.a[4] =  t_0.a[1] * t_0.a[2];
+	result.a[5] =  t_0.a[2] * t_0.a[2];
+	return result;
+}
+
+ALWAYS_INLINE AAA_3 einsum_a_bc_abc(const A_3& t_0, const aa_3A_3& t_1) {
+	AAA_3 result;
+	result.a[0] =  t_0.a[0] * t_1.a[0];
+	result.a[1] =  t_0.a[0] * t_1.a[1];
+	result.a[2] =  t_0.a[0] * t_1.a[2];
+	result.a[3] =  t_0.a[0] * t_1.a[3];
+	result.a[4] =  t_0.a[0] * t_1.a[4];
+	result.a[5] =  t_0.a[0] * t_1.a[5];
+	result.a[6] =  t_0.a[1] * t_1.a[3];
+	result.a[7] =  t_0.a[1] * t_1.a[4];
+	result.a[8] =  t_0.a[1] * t_1.a[5];
+	result.a[9] =  t_0.a[2] * t_1.a[5];
+	return result;
+}
+
+ALWAYS_INLINE AAAA_3 einsum_a_bcd_abcd(const A_3& t_0, const AAA_3& t_1) {
+	AAAA_3 result;
+	result.a[0] =  t_0.a[0] * t_1.a[0];
+	result.a[1] =  t_0.a[0] * t_1.a[1];
+	result.a[2] =  t_0.a[0] * t_1.a[2];
+	result.a[3] =  t_0.a[0] * t_1.a[3];
+	result.a[4] =  t_0.a[0] * t_1.a[4];
+	result.a[5] =  t_0.a[0] * t_1.a[5];
+	result.a[6] =  t_0.a[0] * t_1.a[6];
+	result.a[7] =  t_0.a[0] * t_1.a[7];
+	result.a[8] =  t_0.a[0] * t_1.a[8];
+	result.a[9] =  t_0.a[0] * t_1.a[9];
+	result.a[10] =  t_0.a[1] * t_1.a[6];
+	result.a[11] =  t_0.a[1] * t_1.a[7];
+	result.a[12] =  t_0.a[1] * t_1.a[8];
+	result.a[13] =  t_0.a[1] * t_1.a[9];
+	result.a[14] =  t_0.a[2] * t_1.a[9];
+	return result;
+}
+
+ALWAYS_INLINE AAAAA_3 einsum_a_bcde_abcde(const A_3& t_0, const AAAA_3& t_1) {
+	AAAAA_3 result;
+	result.a[0] =  t_0.a[0] * t_1.a[0];
+	result.a[1] =  t_0.a[0] * t_1.a[1];
+	result.a[2] =  t_0.a[0] * t_1.a[2];
+	result.a[3] =  t_0.a[0] * t_1.a[3];
+	result.a[4] =  t_0.a[0] * t_1.a[4];
+	result.a[5] =  t_0.a[0] * t_1.a[5];
+	result.a[6] =  t_0.a[0] * t_1.a[6];
+	result.a[7] =  t_0.a[0] * t_1.a[7];
+	result.a[8] =  t_0.a[0] * t_1.a[8];
+	result.a[9] =  t_0.a[0] * t_1.a[9];
+	result.a[10] =  t_0.a[0] * t_1.a[10];
+	result.a[11] =  t_0.a[0] * t_1.a[11];
+	result.a[12] =  t_0.a[0] * t_1.a[12];
+	result.a[13] =  t_0.a[0] * t_1.a[13];
+	result.a[14] =  t_0.a[0] * t_1.a[14];
+	result.a[15] =  t_0.a[1] * t_1.a[10];
+	result.a[16] =  t_0.a[1] * t_1.a[11];
+	result.a[17] =  t_0.a[1] * t_1.a[12];
+	result.a[18] =  t_0.a[1] * t_1.a[13];
+	result.a[19] =  t_0.a[1] * t_1.a[14];
+	result.a[20] =  t_0.a[2] * t_1.a[14];
+	return result;
+}
+
+ALWAYS_INLINE float einsum_a_a(const a_3& t_0, const A_3& t_1) {
+	float result;
+	result =  t_0.a[0] * t_1.a[0];
+	result += t_0.a[1] * t_1.a[1];
+	result += t_0.a[2] * t_1.a[2];
+	return result;
+}
+
+ALWAYS_INLINE float scale(const float& factor, const float& tensor) {
+	float result;
+	result = factor * tensor;
+	return result;
+}
+
+ALWAYS_INLINE A_3 einsum_0_a_0_a_known_0_2(const A_3& t_1) {
+	A_3 result;
+	result.a[0] =  t_1.a[0];
+	result.a[1] =  t_1.a[1];
+	result.a[2] =  t_1.a[2];
+	return result;
+}
+
+ALWAYS_INLINE A_3 einsum_0_a_a(const float& t_0, const A_3& t_1) {
+	A_3 result;
+	result.a[0] =  t_0 * t_1.a[0];
+	result.a[1] =  t_0 * t_1.a[1];
+	result.a[2] =  t_0 * t_1.a[2];
+	return result;
+}
+
+ALWAYS_INLINE AA_3 einsum_0_ab_0_ab_known_0_2(const aa_3A_3& t_1) {
+	AA_3 result;
+	result.a[0] =  t_1.a[0];
+	result.a[1] =  t_1.a[1];
+	result.a[2] =  t_1.a[2];
+	result.a[3] =  t_1.a[3];
+	result.a[4] =  t_1.a[4];
+	result.a[5] =  t_1.a[5];
+	return result;
+}
+
+ALWAYS_INLINE AA_3 einsum_0_ab_ab(const float& t_0, const AA_3& t_1) {
+	AA_3 result;
+	result.a[0] =  t_0 * t_1.a[0];
+	result.a[1] =  t_0 * t_1.a[1];
+	result.a[2] =  t_0 * t_1.a[2];
+	result.a[3] =  t_0 * t_1.a[3];
+	result.a[4] =  t_0 * t_1.a[4];
+	result.a[5] =  t_0 * t_1.a[5];
+	return result;
+}
+
+ALWAYS_INLINE AA_3 einsum_0_ab_ab_known_1(const float& t_0) {
+	AA_3 result;
+	result.a[0] =  t_0;
+	result.a[3] =  t_0;
+	result.a[5] =  t_0;
+	result.a[1] = 0;
+	result.a[2] = 0;
+	result.a[4] = 0;
+	return result;
+}
+
+ALWAYS_INLINE AA_3 add(const AA_3& lhs, const AA_3& rhs) {
+	AA_3 result;
+	result.a[0] = lhs.a[0] + rhs.a[0];
+	result.a[1] = lhs.a[1] + rhs.a[1];
+	result.a[2] = lhs.a[2] + rhs.a[2];
+	result.a[3] = lhs.a[3] + rhs.a[3];
+	result.a[4] = lhs.a[4] + rhs.a[4];
+	result.a[5] = lhs.a[5] + rhs.a[5];
+	return result;
+}
+
+ALWAYS_INLINE AAA_3 einsum_0_a_bc_abc_known_0_2(const A_3& t_1) {
+	AAA_3 result;
+	result.a[0] =  3 * t_1.a[0];
+	result.a[3] =  t_1.a[0];
+	result.a[5] =  t_1.a[0];
+	result.a[1] =  t_1.a[1];
+	result.a[6] =  3 * t_1.a[1];
+	result.a[8] =  t_1.a[1];
+	result.a[2] =  t_1.a[2];
+	result.a[7] =  t_1.a[2];
+	result.a[9] =  3 * t_1.a[2];
+	result.a[4] = 0;
+	return result;
+}
+
+ALWAYS_INLINE AAA_3 einsum_0_abc_abc(const float& t_0, const AAA_3& t_1) {
+	AAA_3 result;
+	result.a[0] =  t_0 * t_1.a[0];
+	result.a[1] =  t_0 * t_1.a[1];
+	result.a[2] =  t_0 * t_1.a[2];
+	result.a[3] =  t_0 * t_1.a[3];
+	result.a[4] =  t_0 * t_1.a[4];
+	result.a[5] =  t_0 * t_1.a[5];
+	result.a[6] =  t_0 * t_1.a[6];
+	result.a[7] =  t_0 * t_1.a[7];
+	result.a[8] =  t_0 * t_1.a[8];
+	result.a[9] =  t_0 * t_1.a[9];
+	return result;
+}
+
+ALWAYS_INLINE AAA_3 einsum_0_abc_0_abc_known_0_2(const AAA_3& t_1) {
+	AAA_3 result;
+	result.a[0] =  t_1.a[0];
+	result.a[1] =  t_1.a[1];
+	result.a[2] =  t_1.a[2];
+	result.a[3] =  t_1.a[3];
+	result.a[4] =  t_1.a[4];
+	result.a[5] =  t_1.a[5];
+	result.a[6] =  t_1.a[6];
+	result.a[7] =  t_1.a[7];
+	result.a[8] =  t_1.a[8];
+	result.a[9] =  t_1.a[9];
+	return result;
+}
+
+ALWAYS_INLINE AAA_3 add(const AAA_3& lhs, const AAA_3& rhs) {
+	AAA_3 result;
+	result.a[0] = lhs.a[0] + rhs.a[0];
+	result.a[1] = lhs.a[1] + rhs.a[1];
+	result.a[2] = lhs.a[2] + rhs.a[2];
+	result.a[3] = lhs.a[3] + rhs.a[3];
+	result.a[4] = lhs.a[4] + rhs.a[4];
+	result.a[5] = lhs.a[5] + rhs.a[5];
+	result.a[6] = lhs.a[6] + rhs.a[6];
+	result.a[7] = lhs.a[7] + rhs.a[7];
+	result.a[8] = lhs.a[8] + rhs.a[8];
+	result.a[9] = lhs.a[9] + rhs.a[9];
+	return result;
+}
+
+ALWAYS_INLINE AAAA_3 einsum_0_ab_cd_abcd_known_0_2(const aa_3A_3& t_1) {
+	AAAA_3 result;
+	result.a[0] =  6 * t_1.a[0];
+	result.a[3] =  t_1.a[0];
+	result.a[5] =  t_1.a[0];
+	result.a[1] =  3 * t_1.a[1];
+	result.a[6] =  3 * t_1.a[1];
+	result.a[8] =  t_1.a[1];
+	result.a[2] =  3 * t_1.a[2];
+	result.a[7] =  t_1.a[2];
+	result.a[9] =  3 * t_1.a[2];
+	result.a[3] += t_1.a[3];
+	result.a[10] =  6 * t_1.a[3];
+	result.a[12] =  t_1.a[3];
+	result.a[4] =  t_1.a[4];
+	result.a[11] =  3 * t_1.a[4];
+	result.a[13] =  3 * t_1.a[4];
+	result.a[5] += t_1.a[5];
+	result.a[12] += t_1.a[5];
+	result.a[14] =  6 * t_1.a[5];
+	return result;
+}
+
+ALWAYS_INLINE AAAA_3 einsum_0_abcd_abcd(const float& t_0, const AAAA_3& t_1) {
+	AAAA_3 result;
+	result.a[0] =  t_0 * t_1.a[0];
+	result.a[1] =  t_0 * t_1.a[1];
+	result.a[2] =  t_0 * t_1.a[2];
+	result.a[3] =  t_0 * t_1.a[3];
+	result.a[4] =  t_0 * t_1.a[4];
+	result.a[5] =  t_0 * t_1.a[5];
+	result.a[6] =  t_0 * t_1.a[6];
+	result.a[7] =  t_0 * t_1.a[7];
+	result.a[8] =  t_0 * t_1.a[8];
+	result.a[9] =  t_0 * t_1.a[9];
+	result.a[10] =  t_0 * t_1.a[10];
+	result.a[11] =  t_0 * t_1.a[11];
+	result.a[12] =  t_0 * t_1.a[12];
+	result.a[13] =  t_0 * t_1.a[13];
+	result.a[14] =  t_0 * t_1.a[14];
+	return result;
+}
+
+ALWAYS_INLINE AAAA_3 einsum_0_abcd_abcd_known_1(const float& t_0) {
+	AAAA_3 result;
+	result.a[0] =  3 * t_0;
+	result.a[3] =  t_0;
+	result.a[5] =  t_0;
+	result.a[10] =  3 * t_0;
+	result.a[12] =  t_0;
+	result.a[14] =  3 * t_0;
+	result.a[1] = 0;
+	result.a[2] = 0;
+	result.a[4] = 0;
+	result.a[6] = 0;
+	result.a[7] = 0;
+	result.a[8] = 0;
+	result.a[9] = 0;
+	result.a[11] = 0;
+	result.a[13] = 0;
+	return result;
+}
+
+ALWAYS_INLINE AAAA_3 einsum_0_abcd_0_abcd_known_0_2(const AAAA_3& t_1) {
+	AAAA_3 result;
+	result.a[0] =  t_1.a[0];
+	result.a[1] =  t_1.a[1];
+	result.a[2] =  t_1.a[2];
+	result.a[3] =  t_1.a[3];
+	result.a[4] =  t_1.a[4];
+	result.a[5] =  t_1.a[5];
+	result.a[6] =  t_1.a[6];
+	result.a[7] =  t_1.a[7];
+	result.a[8] =  t_1.a[8];
+	result.a[9] =  t_1.a[9];
+	result.a[10] =  t_1.a[10];
+	result.a[11] =  t_1.a[11];
+	result.a[12] =  t_1.a[12];
+	result.a[13] =  t_1.a[13];
+	result.a[14] =  t_1.a[14];
+	return result;
+}
+
+ALWAYS_INLINE AAAA_3 add(const AAAA_3& lhs, const AAAA_3& rhs) {
+	AAAA_3 result;
+	result.a[0] = lhs.a[0] + rhs.a[0];
+	result.a[1] = lhs.a[1] + rhs.a[1];
+	result.a[2] = lhs.a[2] + rhs.a[2];
+	result.a[3] = lhs.a[3] + rhs.a[3];
+	result.a[4] = lhs.a[4] + rhs.a[4];
+	result.a[5] = lhs.a[5] + rhs.a[5];
+	result.a[6] = lhs.a[6] + rhs.a[6];
+	result.a[7] = lhs.a[7] + rhs.a[7];
+	result.a[8] = lhs.a[8] + rhs.a[8];
+	result.a[9] = lhs.a[9] + rhs.a[9];
+	result.a[10] = lhs.a[10] + rhs.a[10];
+	result.a[11] = lhs.a[11] + rhs.a[11];
+	result.a[12] = lhs.a[12] + rhs.a[12];
+	result.a[13] = lhs.a[13] + rhs.a[13];
+	result.a[14] = lhs.a[14] + rhs.a[14];
+	return result;
+}
+
+ALWAYS_INLINE AAAAA_3 einsum_0_a_bcde_abcde_known_0_2(const A_3& t_1) {
+	AAAAA_3 result;
+	result.a[0] =  15 * t_1.a[0];
+	result.a[3] =  3 * t_1.a[0];
+	result.a[5] =  3 * t_1.a[0];
+	result.a[10] =  3 * t_1.a[0];
+	result.a[12] =  t_1.a[0];
+	result.a[14] =  3 * t_1.a[0];
+	result.a[1] =  3 * t_1.a[1];
+	result.a[6] =  3 * t_1.a[1];
+	result.a[8] =  t_1.a[1];
+	result.a[15] =  15 * t_1.a[1];
+	result.a[17] =  3 * t_1.a[1];
+	result.a[19] =  3 * t_1.a[1];
+	result.a[2] =  3 * t_1.a[2];
+	result.a[7] =  t_1.a[2];
+	result.a[9] =  3 * t_1.a[2];
+	result.a[16] =  3 * t_1.a[2];
+	result.a[18] =  3 * t_1.a[2];
+	result.a[20] =  15 * t_1.a[2];
+	result.a[4] = 0;
+	result.a[11] = 0;
+	result.a[13] = 0;
+	return result;
+}
+
+ALWAYS_INLINE AAAAA_3 einsum_0_abcde_abcde(const float& t_0, const AAAAA_3& t_1) {
+	AAAAA_3 result;
+	result.a[0] =  t_0 * t_1.a[0];
+	result.a[1] =  t_0 * t_1.a[1];
+	result.a[2] =  t_0 * t_1.a[2];
+	result.a[3] =  t_0 * t_1.a[3];
+	result.a[4] =  t_0 * t_1.a[4];
+	result.a[5] =  t_0 * t_1.a[5];
+	result.a[6] =  t_0 * t_1.a[6];
+	result.a[7] =  t_0 * t_1.a[7];
+	result.a[8] =  t_0 * t_1.a[8];
+	result.a[9] =  t_0 * t_1.a[9];
+	result.a[10] =  t_0 * t_1.a[10];
+	result.a[11] =  t_0 * t_1.a[11];
+	result.a[12] =  t_0 * t_1.a[12];
+	result.a[13] =  t_0 * t_1.a[13];
+	result.a[14] =  t_0 * t_1.a[14];
+	result.a[15] =  t_0 * t_1.a[15];
+	result.a[16] =  t_0 * t_1.a[16];
+	result.a[17] =  t_0 * t_1.a[17];
+	result.a[18] =  t_0 * t_1.a[18];
+	result.a[19] =  t_0 * t_1.a[19];
+	result.a[20] =  t_0 * t_1.a[20];
+	return result;
+}
+
+ALWAYS_INLINE AAAAA_3 einsum_0_abc_de_abcde_known_0_2(const AAA_3& t_1) {
+	AAAAA_3 result;
+	result.a[0] =  10 * t_1.a[0];
+	result.a[3] =  t_1.a[0];
+	result.a[5] =  t_1.a[0];
+	result.a[1] =  6 * t_1.a[1];
+	result.a[6] =  3 * t_1.a[1];
+	result.a[8] =  t_1.a[1];
+	result.a[2] =  6 * t_1.a[2];
+	result.a[7] =  t_1.a[2];
+	result.a[9] =  3 * t_1.a[2];
+	result.a[3] += 3 * t_1.a[3];
+	result.a[10] =  6 * t_1.a[3];
+	result.a[12] =  t_1.a[3];
+	result.a[4] =  3 * t_1.a[4];
+	result.a[11] =  3 * t_1.a[4];
+	result.a[13] =  3 * t_1.a[4];
+	result.a[5] += 3 * t_1.a[5];
+	result.a[12] += t_1.a[5];
+	result.a[14] =  6 * t_1.a[5];
+	result.a[6] += t_1.a[6];
+	result.a[15] =  10 * t_1.a[6];
+	result.a[17] =  t_1.a[6];
+	result.a[7] += t_1.a[7];
+	result.a[16] =  6 * t_1.a[7];
+	result.a[18] =  3 * t_1.a[7];
+	result.a[8] += t_1.a[8];
+	result.a[17] += 3 * t_1.a[8];
+	result.a[19] =  6 * t_1.a[8];
+	result.a[9] += t_1.a[9];
+	result.a[18] += t_1.a[9];
+	result.a[20] =  10 * t_1.a[9];
+	return result;
+}
+
+ALWAYS_INLINE AAAAA_3 einsum_0_abcde_0_abcde_known_0_2(const AAAAA_3& t_1) {
+	AAAAA_3 result;
+	result.a[0] =  t_1.a[0];
+	result.a[1] =  t_1.a[1];
+	result.a[2] =  t_1.a[2];
+	result.a[3] =  t_1.a[3];
+	result.a[4] =  t_1.a[4];
+	result.a[5] =  t_1.a[5];
+	result.a[6] =  t_1.a[6];
+	result.a[7] =  t_1.a[7];
+	result.a[8] =  t_1.a[8];
+	result.a[9] =  t_1.a[9];
+	result.a[10] =  t_1.a[10];
+	result.a[11] =  t_1.a[11];
+	result.a[12] =  t_1.a[12];
+	result.a[13] =  t_1.a[13];
+	result.a[14] =  t_1.a[14];
+	result.a[15] =  t_1.a[15];
+	result.a[16] =  t_1.a[16];
+	result.a[17] =  t_1.a[17];
+	result.a[18] =  t_1.a[18];
+	result.a[19] =  t_1.a[19];
+	result.a[20] =  t_1.a[20];
+	return result;
+}
+
+ALWAYS_INLINE AAAAA_3 add(const AAAAA_3& lhs, const AAAAA_3& rhs) {
+	AAAAA_3 result;
+	result.a[0] = lhs.a[0] + rhs.a[0];
+	result.a[1] = lhs.a[1] + rhs.a[1];
+	result.a[2] = lhs.a[2] + rhs.a[2];
+	result.a[3] = lhs.a[3] + rhs.a[3];
+	result.a[4] = lhs.a[4] + rhs.a[4];
+	result.a[5] = lhs.a[5] + rhs.a[5];
+	result.a[6] = lhs.a[6] + rhs.a[6];
+	result.a[7] = lhs.a[7] + rhs.a[7];
+	result.a[8] = lhs.a[8] + rhs.a[8];
+	result.a[9] = lhs.a[9] + rhs.a[9];
+	result.a[10] = lhs.a[10] + rhs.a[10];
+	result.a[11] = lhs.a[11] + rhs.a[11];
+	result.a[12] = lhs.a[12] + rhs.a[12];
+	result.a[13] = lhs.a[13] + rhs.a[13];
+	result.a[14] = lhs.a[14] + rhs.a[14];
+	result.a[15] = lhs.a[15] + rhs.a[15];
+	result.a[16] = lhs.a[16] + rhs.a[16];
+	result.a[17] = lhs.a[17] + rhs.a[17];
+	result.a[18] = lhs.a[18] + rhs.a[18];
+	result.a[19] = lhs.a[19] + rhs.a[19];
+	result.a[20] = lhs.a[20] + rhs.a[20];
+	return result;
+}
+
+ALWAYS_INLINE void get_multipole_derivatives(float& derivative_0, A_3& derivative_1, AA_3& derivative_2, AAA_3& derivative_3, AAAA_3& derivative_4, AAAAA_3& derivative_5, const a_3& point) {
+	// 
+	A_3 pA_power_1 = einsum_a_ab_out_b_known_1(point);
+	// fma: 6
+	aa_3A_3 pA_power_2 = einsum_a_b_ab_group_0_1(pA_power_1);
+	// fma: 10
+	AAA_3 pA_power_3 = einsum_a_bc_abc(pA_power_1, pA_power_2);
+	// fma: 15
+	AAAA_3 pA_power_4 = einsum_a_bcd_abcd(pA_power_1, pA_power_3);
+	// fma: 21
+	AAAAA_3 pA_power_5 = einsum_a_bcde_abcde(pA_power_1, pA_power_4);
+	// fma: 3
+	float tmp = einsum_a_a(point, pA_power_1);
+	// sqrt: 1
+	float g_0 = 1 / std::sqrt(tmp);
+	// fma: 1
+	float g_power_1 = scale(1, g_0);
+	// fma: 1
+	float g_power_2 = g_power_1 * g_0;
+	// fma: 1
+	float g_power_3 = g_power_2 * g_0;
+	// fma: 1
+	float g_power_4 = g_power_3 * g_0;
+	// fma: 1
+	float g_power_5 = g_power_4 * g_0;
+	// fma: 1
+	float g_power_6 = g_power_5 * g_0;
+	// fma: 1
+	float g_power_7 = g_power_6 * g_0;
+	// fma: 1
+	float g_power_8 = g_power_7 * g_0;
+	// fma: 1
+	float g_power_9 = g_power_8 * g_0;
+	// fma: 1
+	float g_power_10 = g_power_9 * g_0;
+	// fma: 1
+	float g_power_11 = g_power_10 * g_0;
+	// fma: 1
+	float tmp_01 = scale(1, g_power_1);
+	// 
+	float g_0_01 = tmp_01;
+	// fma: 1
+	float tmp_02 = scale(-1, g_power_3);
+	// 
+	A_3 tmp_03 = einsum_0_a_0_a_known_0_2(pA_power_1);
+	// fma: 3
+	A_3 g_1 = einsum_0_a_a(tmp_02, tmp_03);
+	// fma: 1
+	float tmp_04 = scale(3, g_power_5);
+	// 
+	AA_3 tmp_05 = einsum_0_ab_0_ab_known_0_2(pA_power_2);
+	// fma: 6
+	AA_3 tmp_06 = einsum_0_ab_ab(tmp_04, tmp_05);
+	// fma: 1
+	float tmp_07 = scale(-1, g_power_3);
+	// 
+	AA_3 tmp_08 = einsum_0_ab_ab_known_1(tmp_07);
+	// fma: 6
+	AA_3 g_2 = add(tmp_06, tmp_08);
+	// fma: 1
+	float tmp_09 = scale(3, g_power_5);
+	// fma: 3
+	AAA_3 tmp_10 = einsum_0_a_bc_abc_known_0_2(pA_power_1);
+	// fma: 10
+	AAA_3 tmp_11 = einsum_0_abc_abc(tmp_09, tmp_10);
+	// fma: 1
+	float tmp_12 = scale(-15, g_power_7);
+	// 
+	AAA_3 tmp_13 = einsum_0_abc_0_abc_known_0_2(pA_power_3);
+	// fma: 10
+	AAA_3 tmp_14 = einsum_0_abc_abc(tmp_12, tmp_13);
+	// fma: 10
+	AAA_3 g_3 = add(tmp_11, tmp_14);
+	// fma: 1
+	float tmp_15 = scale(-15, g_power_7);
+	// fma: 9
+	AAAA_3 tmp_16 = einsum_0_ab_cd_abcd_known_0_2(pA_power_2);
+	// fma: 15
+	AAAA_3 tmp_17 = einsum_0_abcd_abcd(tmp_15, tmp_16);
+	// fma: 1
+	float tmp_18 = scale(3, g_power_5);
+	// fma: 3
+	AAAA_3 tmp_19 = einsum_0_abcd_abcd_known_1(tmp_18);
+	// fma: 1
+	float tmp_20 = scale(1.05e+02f, g_power_9);
+	// 
+	AAAA_3 tmp_21 = einsum_0_abcd_0_abcd_known_0_2(pA_power_4);
+	// fma: 15
+	AAAA_3 tmp_22 = einsum_0_abcd_abcd(tmp_20, tmp_21);
+	// fma: 15
+	AAAA_3 tmp_23 = add(tmp_17, tmp_19);
+	// fma: 15
+	AAAA_3 g_4 = add(tmp_23, tmp_22);
+	// fma: 1
+	float tmp_24 = scale(-15, g_power_7);
+	// fma: 15
+	AAAAA_3 tmp_25 = einsum_0_a_bcde_abcde_known_0_2(pA_power_1);
+	// fma: 21
+	AAAAA_3 tmp_26 = einsum_0_abcde_abcde(tmp_24, tmp_25);
+	// fma: 1
+	float tmp_27 = scale(1.05e+02f, g_power_9);
+	// fma: 21
+	AAAAA_3 tmp_28 = einsum_0_abc_de_abcde_known_0_2(pA_power_3);
+	// fma: 21
+	AAAAA_3 tmp_29 = einsum_0_abcde_abcde(tmp_27, tmp_28);
+	// fma: 1
+	float remaining_factor = scale(-9.45e+02f, g_power_11);
+	// 
+	AAAAA_3 tmp_30 = einsum_0_abcde_0_abcde_known_0_2(pA_power_5);
+	// fma: 21
+	AAAAA_3 term = einsum_0_abcde_abcde(remaining_factor, tmp_30);
+	// fma: 21
+	AAAAA_3 tmp_31 = add(tmp_26, tmp_29);
+	// fma: 21
+	AAAAA_3 g_5 = add(tmp_31, term);
+	derivative_0 = g_0_01;
+	derivative_1 = g_1;
+	derivative_2 = g_2;
+	derivative_3 = g_3;
+	derivative_4 = g_4;
+	derivative_5 = g_5;
+	// fma: 339, sqrt: 1
+}
+