From 26f7d3a7169af732bde67488226fff4a6fba5996 Mon Sep 17 00:00:00 2001
From: PabloIbannez <p.ibanez.fre@gmail.com>
Date: Mon, 19 Feb 2024 16:31:21 +0100
Subject: [PATCH 01/15] Updated quaternion and tensor types. Added qualifier

---
 src/utils/quaternion.cuh | 196 ++++++++++++++++++++-------------------
 src/utils/tensor.cuh     |  26 +++++-
 2 files changed, 124 insertions(+), 98 deletions(-)

diff --git a/src/utils/quaternion.cuh b/src/utils/quaternion.cuh
index 2c384549..bdaec2a4 100644
--- a/src/utils/quaternion.cuh
+++ b/src/utils/quaternion.cuh
@@ -1,7 +1,7 @@
 /* P. Palacios Alonso 2021
 Some quaternion algebra and useful functions
 Notation:
-   n(real)  - Scalar part 
+   n(real)  - Scalar part
    v(real3) - vectorial part
    q(real4) - quaternion
  */
@@ -13,146 +13,150 @@ Notation:
 
 namespace uammd{
   class Quat{
-  public:
-    
-    real n;
-    real3 v;
-    
-    QUATTR Quat();
-    QUATTR Quat(real4 q);
-    QUATTR Quat(real n, real3 v);
-    QUATTR Quat(real n, real vx, real vy, real vz);
-
-    QUATTR Quat operator+(Quat q1);
-    QUATTR void operator+=(Quat q);
-    QUATTR Quat operator-(Quat q);
-    QUATTR void operator-=(Quat q);
-    QUATTR Quat operator*(Quat q);
-    QUATTR void operator*=(Quat q);
-    QUATTR Quat operator*(real scalar);
-    QUATTR void operator*=(real scalar);
-    QUATTR Quat operator/(real scalar);
-    QUATTR void operator/=(real scalar);
-    QUATTR void operator=(real4 q);
-    
-    QUATTR real3 getVx(); //Returns the first vector of the reference system encoded by the quaternion
-    QUATTR real3 getVy(); //Returns the second vector of the reference system encoded by the quaternion
-    QUATTR real3 getVz(); //Returns the third vector of the reference system encoded by the quaternion
-    QUATTR real4 to_real4();
-
-    QUATTR Quat getConjugate();
+    public:
+
+      real n;
+      real3 v;
+
+      QUATTR Quat();
+      QUATTR Quat(const real4& q);
+      QUATTR Quat(real n, const real3& v);
+      QUATTR Quat(real n, real vx, real vy, real vz);
+
+      QUATTR Quat operator+  (const Quat& q) const;
+      QUATTR void operator+= (const Quat& q);
+      QUATTR Quat operator-  (const Quat& q) const;
+      QUATTR void operator-= (const Quat& q);
+      QUATTR Quat operator*  (const Quat& q) const;
+      QUATTR void operator*= (const Quat& q);
+      QUATTR Quat operator*  (real scalar) const;
+      QUATTR void operator*= (real scalar);
+      QUATTR Quat operator/  (real scalar) const;
+      QUATTR void operator/= (real scalar);
+      QUATTR void operator=  (const real4& q);
+
+      QUATTR real3 getVx() const;
+      QUATTR real3 getVy() const;
+      QUATTR real3 getVz() const;
+      QUATTR real4 to_real4() const;
+
+      QUATTR Quat getConjugate() const;
   };
-  
+
   QUATTR Quat::Quat(){
     n = real(1.0);
     v = real3();
   }
-  
-  QUATTR Quat::Quat(real4 q){
+
+  QUATTR Quat::Quat(const real4& q){
     n = q.x;
     v.x = q.y;
     v.y = q.z;
     v.z = q.w;
   }
-  
-  QUATTR Quat::Quat(real n, real3 v){
+
+  QUATTR Quat::Quat(real n, const real3& v){
     this->n = n;
     this->v = v;
   }
-  
-  QUATTR Quat::Quat(real n, real vx, real vy, real vz){
+
+  QUATTR Quat::Quat(real n,
+                    real vx,real vy,real vz){
     this -> n = n;
     v.x = vx;
     v.y = vy;
-    v.z = vz;	
+    v.z = vz;
   }
-  
-  QUATTR Quat Quat::operator+(Quat q){
-    return Quat(n+q.n,v+q.v);    
+
+  QUATTR Quat Quat::operator+(const Quat& q) const {
+    return Quat(n+q.n,v+q.v);
   }
 
-  QUATTR void Quat::operator+=(Quat q){
+  QUATTR void Quat::operator+=(const Quat& q){
     n+=q.n;
     v+=q.v;
   }
-  
-  QUATTR Quat Quat::operator-(Quat q){
-    return Quat(n-q.n,v-q.v);    
-    }
-  
-  QUATTR void Quat::operator-=(Quat q){
+
+  QUATTR Quat Quat::operator-(const Quat& q) const {
+    return Quat(n-q.n,v-q.v);
+  }
+
+  QUATTR void Quat::operator-=(const Quat& q){
     n-=q.n;
     v-=q.v;
   }
-  
-  QUATTR Quat Quat::operator*(Quat q){
+
+  QUATTR Quat Quat::operator*(const Quat& q) const {
     /*
-      Product of two quaternions:
-      q3 = q1*q2 = (n1*n2 - v1*v2, n1*v2 + n2*v1 + v1 x v2)
-    */      
-    return Quat(n*q.n-dot(v,q.v),n*q.v+v*q.n+cross(v,q.v));    
+       Product of two quaternions:
+       q3 = q1*q2 = (n1*n2 - v1*v2, n1*v2 + n2*v1 + v1 x v2)
+     */
+    return Quat(n*q.n-dot(v,q.v),n*q.v+v*q.n+cross(v,q.v));
   }
-  
-  QUATTR void Quat::operator*=(Quat q){
-    n=n*q.n-dot(v,q.v);
-    v=n*q.v+v*q.n+cross(q.v,v);
+
+  QUATTR void Quat::operator*=(const Quat& q){
+    real  n_new = n*q.n-dot(v,q.v);
+    real3 v_new = n*q.v+v*q.n+cross(v,q.v);
+
+    n = n_new;
+    v = v_new;
   }
-  
-  QUATTR Quat Quat::operator*(real scalar){
-    return Quat(scalar*n,scalar*v);    
+
+  QUATTR Quat Quat::operator*(real scalar) const {
+    return Quat(scalar*n,scalar*v);
   }
-  
+
   QUATTR void Quat::operator*=(real scalar){
-      n*=scalar;
-      v*=scalar;
+    n*=scalar;
+    v*=scalar;
   }
-  
-  QUATTR Quat operator*(real scalar, Quat q){
-    return  Quat(scalar*q.n,scalar*q.v);    
+
+  QUATTR Quat operator*(real scalar, const Quat& q){
+    return  Quat(scalar*q.n,scalar*q.v);
   }
-  
-  QUATTR Quat Quat::operator/(real scalar){
-    return Quat(n/scalar,v/scalar);    
+
+  QUATTR Quat Quat::operator/(real scalar) const {
+    return Quat(n/scalar,v/scalar);
   }
-  
+
   QUATTR void Quat::operator/=(real scalar){
     n/=scalar;
     v/=scalar;
   }
-  
-  QUATTR void Quat::operator=(real4 q){
+
+  QUATTR void Quat::operator=(const real4& q){
     n = q.x;
     v.x = q.y;
     v.y = q.z;
     v.z = q.w;
   }
-  
-  QUATTR real3 Quat::getVx(){
+
+  QUATTR real3 Quat::getVx() const {
     real a = n;
     real b = v.x;
     real c = v.y;
     real d = v.z;
-    real3 vx = make_real3(a*a+b*b-c*c-d*d,2*(b*c+a*d),2*(b*d-a*c));
+    real3 vx = make_real3(a*a+b*b-c*c-d*d,real(2.0)*(b*c+a*d),real(2.0)*(b*d-a*c));
     return vx;
   }
-  
-  QUATTR real3 Quat::getVy(){
+
+  QUATTR real3 Quat::getVy() const {
     real a = n;
     real b = v.x;
     real c = v.y;
     real d = v.z;
-    return make_real3(2*(b*c-a*d),a*a-b*b+c*c-d*d,2*(c*d+a*b));
+    return make_real3(real(2.0)*(b*c-a*d),a*a-b*b+c*c-d*d,real(2.0)*(c*d+a*b));
   }
-  
-  QUATTR real3 Quat::getVz(){
+
+  QUATTR real3 Quat::getVz() const {
     real a = n;
     real b = v.x;
     real c = v.y;
     real d = v.z;
-    return make_real3(2*(b*d+a*c),2*(c*d-a*b),a*a-b*b-c*c+d*d);  
+    return make_real3(real(2.0)*(b*d+a*c),real(2.0)*(c*d-a*b),a*a-b*b-c*c+d*d);
   }
-  
-  QUATTR real4 Quat::to_real4(){
+
+  QUATTR real4 Quat::to_real4() const {
     real4 r4;
     r4.x = n;
     r4.y = v.x;
@@ -160,32 +164,32 @@ namespace uammd{
     r4.w = v.z;
     return r4;
   }
-  
+
   QUATTR real4 make_real4(Quat q){
     return q.to_real4();
   }
 
-  QUATTR Quat Quat::getConjugate(){
+  QUATTR Quat Quat::getConjugate() const {
     return Quat(n,-v);
   }
 
-  /* 
-     Returns the quaternion that encondes a rotation of ang radians 
-     around the axis vrot 	 
+  /*
+     Returns the quaternion that encondes a rotation of ang radians
+     around the axis vrot
      q = (cos(phi/2),vrot·sin(phi/2))
-  */
+   */
   QUATTR Quat rotVec2Quaternion(real3 vrot, real phi){
     real norm = (dot(vrot,vrot));
-    if (norm==0) return Quat(1.0,0.,0.,0.);
+    if (norm==real(0.0)) return Quat(1.0, 0., 0., 0.);
     vrot*=rsqrt(norm); // The rotation axis must be a unitary vector
     real cphi2, sphi2;
     real* cphi2_ptr = &cphi2;
     real* sphi2_ptr = &sphi2;
-    sincos(phi*0.5,sphi2_ptr,cphi2_ptr);
+    sincos(phi*real(0.5),sphi2_ptr,cphi2_ptr);
     Quat q = Quat(cphi2,sphi2*vrot);
     return q;
   }
-  
+
   QUATTR Quat rotVec2Quaternion(real3 vrot){
     // If no angle is given the rotation angle is the modulus of vrot
     real phi = sqrt(dot(vrot,vrot));
@@ -197,10 +201,10 @@ namespace uammd{
      To speed up the computation, we write the rotation in the next form:
      v' = v + 2*q_n*(q_v x v) + 2*q_v * (q_v x v).
      See https://fgiesen.wordpress.com/2019/02/09/rotating-a-single-vector-using-a-quaternion/
-  */
-  QUATTR real3 rotateVector(Quat q, real3 v){
-    real3 aux = 2*cross(q.v,v);
-    return v+q.n*aux+cross(q.v,aux);    
+   */
+  QUATTR real3 rotateVector(const Quat& q, const real3& v){
+    real3 aux = real(2.0)*cross(q.v,v);
+    return v+q.n*aux+cross(q.v,aux);
   }
 }
 
diff --git a/src/utils/tensor.cuh b/src/utils/tensor.cuh
index 0acba92c..e67c729b 100644
--- a/src/utils/tensor.cuh
+++ b/src/utils/tensor.cuh
@@ -36,9 +36,15 @@ namespace uammd{
       this->zz = t.zz;
     }
 
-    VECATTR real3 diag(){return {xx,yy,zz};}
+    VECATTR real3 diag() const {return {xx,yy,zz};}
 
-    VECATTR real trace(){return xx+yy+zz;}
+    VECATTR real trace() const {return xx+yy+zz;}
+
+    VECATTR tensor3 transpose() const {
+      return tensor3(xx,yx,zx,
+                     xy,yy,zy,
+                     xz,yz,zz);
+    }
 
   };
 
@@ -152,6 +158,22 @@ namespace uammd{
     return sub;
   }
 
+  //Define the unary minus operator
+
+  VECATTR  tensor3 operator -(const tensor3 &a){
+    tensor3 sub;
+    sub.xx = -a.xx;
+    sub.yx = -a.yx;
+    sub.zx = -a.zx;
+    sub.xy = -a.xy;
+    sub.yy = -a.yy;
+    sub.zy = -a.zy;
+    sub.xz = -a.xz;
+    sub.yz = -a.yz;
+    sub.zz = -a.zz;
+    return sub;
+  }
+
   VECATTR  void operator -=(tensor3 &a, const real &b){
     a.xx -= b;
     a.yx -= b;

From b0b86e4781c617000206b084b7939e181520a1a1 Mon Sep 17 00:00:00 2001
From: PabloPalaciosAlonso <pablo.pal@hotmail.com>
Date: Tue, 20 Feb 2024 14:37:06 +0100
Subject: [PATCH 02/15] Extend BVP modules to work either with real or complex
 numbers

---
 .../BDHI/DoublyPeriodic/DPStokesSlab.cuh      |   2 +-
 .../DoublyPeriodic/StokesSlab/Correction.cuh  |   2 +-
 .../StokesSlab/initialization.cu              |   2 +-
 .../DoublyPeriodic/PoissonSlab/BVPPoisson.cuh |   4 +-
 .../BVPSchurComplementMatrices.cuh            |  25 +-
 src/misc/BoundaryValueProblem/BVPSolver.cuh   | 111 ++++--
 src/misc/BoundaryValueProblem/KBPENTA.cuh     |  15 +-
 src/misc/BoundaryValueProblem/MatrixUtils.h   | 183 ++++++----
 src/utils/atomics.cuh                         |  28 +-
 src/utils/vector.cuh                          | 335 ++++++++++++++++++
 test/misc/bvp/bvp.cu                          |   2 +-
 11 files changed, 565 insertions(+), 144 deletions(-)

diff --git a/src/Integrator/BDHI/DoublyPeriodic/DPStokesSlab.cuh b/src/Integrator/BDHI/DoublyPeriodic/DPStokesSlab.cuh
index a02a22b4..aad075a4 100644
--- a/src/Integrator/BDHI/DoublyPeriodic/DPStokesSlab.cuh
+++ b/src/Integrator/BDHI/DoublyPeriodic/DPStokesSlab.cuh
@@ -169,7 +169,7 @@ else throw std::runtime_error("[DPStokesSlab] Can only average in direction X (0
       real gw;
       real tolerance;
       WallMode mode;
-      shared_ptr<BVP::BatchedBVPHandler> bvpSolver;
+      shared_ptr<BVP::BatchedBVPHandler<real>> bvpSolver;
 
     };
 
diff --git a/src/Integrator/BDHI/DoublyPeriodic/StokesSlab/Correction.cuh b/src/Integrator/BDHI/DoublyPeriodic/StokesSlab/Correction.cuh
index f6e5f6cf..c7735e3c 100644
--- a/src/Integrator/BDHI/DoublyPeriodic/StokesSlab/Correction.cuh
+++ b/src/Integrator/BDHI/DoublyPeriodic/StokesSlab/Correction.cuh
@@ -506,7 +506,7 @@ namespace uammd{
     }
 
     auto computeZeroModeBoundaryConditions(int nz, real H, WallMode mode){
-      BVP::SchurBoundaryCondition bcs(nz, H);
+      BVP::SchurBoundaryCondition<real> bcs(nz, H);
       if(mode == WallMode::bottom){
 	correction_ns::TopBoundaryConditions top(0, H);
 	correction_ns::BottomBoundaryConditions bot(0, H);
diff --git a/src/Integrator/BDHI/DoublyPeriodic/StokesSlab/initialization.cu b/src/Integrator/BDHI/DoublyPeriodic/StokesSlab/initialization.cu
index d89ed298..d16b5bf3 100644
--- a/src/Integrator/BDHI/DoublyPeriodic/StokesSlab/initialization.cu
+++ b/src/Integrator/BDHI/DoublyPeriodic/StokesSlab/initialization.cu
@@ -144,7 +144,7 @@ namespace uammd{
       auto botBC = thrust::make_transform_iterator(thrust::make_counting_iterator<int>(0), botdispatch);
       int numberSystems = (nk.x/2+1)*nk.y;
       int nz = grid.cellDim.z;
-      this->bvpSolver = std::make_shared<BVP::BatchedBVPHandler>(klist, topBC, botBC, numberSystems, halfH, nz);
+      this->bvpSolver = std::make_shared<BVP::BatchedBVPHandler<real>>(klist, topBC, botBC, numberSystems, halfH, nz);
       CudaCheckError();
     }
 
diff --git a/src/Interactor/DoublyPeriodic/PoissonSlab/BVPPoisson.cuh b/src/Interactor/DoublyPeriodic/PoissonSlab/BVPPoisson.cuh
index 161ff7ab..b1aea296 100644
--- a/src/Interactor/DoublyPeriodic/PoissonSlab/BVPPoisson.cuh
+++ b/src/Interactor/DoublyPeriodic/PoissonSlab/BVPPoisson.cuh
@@ -90,7 +90,7 @@ namespace uammd{
     }
 
     class BVPPoissonSlab{
-      std::shared_ptr<BVP::BatchedBVPHandler> bvpSolver;
+      std::shared_ptr<BVP::BatchedBVPHandler<real>> bvpSolver;
       real2 Lxy;
       real H;
       int3 cellDim;
@@ -150,7 +150,7 @@ namespace uammd{
 						   BoundaryConditionsDispatch<BottomBoundaryConditions, decltype(klist)>(klist, H));
 
 	int numberSystems = (nk.x/2+1)*nk.y;
-	this->bvpSolver = std::make_shared<BVP::BatchedBVPHandler>(klist, topBC, bottomBC, numberSystems, H, cellDim.z);
+	this->bvpSolver = std::make_shared<BVP::BatchedBVPHandler<real>>(klist, topBC, bottomBC, numberSystems, H, cellDim.z);
 	CudaCheckError();
       }
     };
diff --git a/src/misc/BoundaryValueProblem/BVPSchurComplementMatrices.cuh b/src/misc/BoundaryValueProblem/BVPSchurComplementMatrices.cuh
index 4611db81..02ddc15c 100644
--- a/src/misc/BoundaryValueProblem/BVPSchurComplementMatrices.cuh
+++ b/src/misc/BoundaryValueProblem/BVPSchurComplementMatrices.cuh
@@ -117,6 +117,7 @@ namespace uammd{
       }
     };
 
+    template <typename T>
     class SchurBoundaryCondition{
       int nz;
       real H;
@@ -125,8 +126,8 @@ namespace uammd{
       SchurBoundaryCondition(int nz, real H):nz(nz), H(H), bcs(nz){}
 
       template<class TopBC, class BottomBC>
-      std::vector<real> computeBoundaryConditionMatrix(const TopBC &top, const BottomBC &bottom){
-	std::vector<real> CandD(2*nz+4, 0);
+      std::vector<T> computeBoundaryConditionMatrix(const TopBC &top, const BottomBC &bottom){
+	std::vector<T> CandD(2*nz+4, 0);
 	auto topRow = computeTopRow(top, bottom);
 	auto bottomRow = computeBottomRow(top, bottom);
 	std::copy(topRow.begin(), topRow.end()-2, CandD.begin());
@@ -141,8 +142,8 @@ namespace uammd{
     private:
 
       template<class TopBC, class BottomBC>
-      std::vector<real> computeTopRow(const TopBC &top, const BottomBC &bottom){
-	std::vector<real> topRow(nz+2, 0);
+      std::vector<T> computeTopRow(const TopBC &top, const BottomBC &bottom){
+	std::vector<T> topRow(nz+2, 0);
 	auto tfi = bcs.topFirstIntegral();
 	auto tsi = bcs.topSecondIntegral();
 	auto tfiFactor = top.getFirstIntegralFactor();
@@ -156,8 +157,8 @@ namespace uammd{
       }
 
       template<class TopBC, class BottomBC>
-      std::vector<real> computeBottomRow(const TopBC &top, const BottomBC &bottom){
-	std::vector<real> bottomRow(nz+2, 0);
+      std::vector<T> computeBottomRow(const TopBC &top, const BottomBC &bottom){
+	std::vector<T> bottomRow(nz+2, 0);
 	auto bfi = bcs.bottomFirstIntegral();
 	auto bsi = bcs.bottomSecondIntegral();
 	auto bfiFactor = bottom.getFirstIntegralFactor();
@@ -172,10 +173,11 @@ namespace uammd{
 
     };
 
-    std::vector<real> computeSecondIntegralMatrix(real k, real H, int nz){
-      std::vector<real> A(nz*nz, 0);
+    template<class T>
+    std::vector<T> computeSecondIntegralMatrix(T k, real H, int nz){
+      std::vector<T> A(nz*nz, 0);
       SecondIntegralMatrix sim(nz);
-      real kH2 = k*k*H*H;
+      T kH2 = k*k*H*H;
       fori(0, nz){
 	forj(0,nz){
 	  A[i+nz*j] = (i==j) - kH2*sim.getElement(i, j);
@@ -184,9 +186,10 @@ namespace uammd{
       return std::move(A);
     }
 
-    std::vector<real> computeInverseSecondIntegralMatrix(real k, real H, int nz){
+    template<class T>
+    std::vector<T> computeInverseSecondIntegralMatrix(T k, real H, int nz){
       if(k==0){
-	std::vector<real> invA(nz*nz, 0);
+	std::vector<T> invA(nz*nz, 0);
 	fori(0, nz){
 	  invA[i+nz*i] = 1;
 	}
diff --git a/src/misc/BoundaryValueProblem/BVPSolver.cuh b/src/misc/BoundaryValueProblem/BVPSolver.cuh
index 4e54b094..7959fd9b 100644
--- a/src/misc/BoundaryValueProblem/BVPSolver.cuh
+++ b/src/misc/BoundaryValueProblem/BVPSolver.cuh
@@ -42,39 +42,40 @@ namespace uammd{
   namespace BVP{
 
     namespace detail{
+      template <typename U>
       class SubsystemSolver{
 	int nz;
 	real H;
-	StorageHandle<real> CinvA_storage;
-	StorageHandle<real4> CinvABmD_storage;
+	StorageHandle<U> CinvA_storage;
+	StorageHandle<U> CinvABmD_storage;
 
       public:
 
 	SubsystemSolver(int nz, real H):nz(nz), H(H){}
 
 	void registerRequiredStorage(StorageRegistration &memoryManager){
-	  CinvA_storage =    memoryManager.registerStorageRequirement<real>(2*nz+2);
-	  CinvABmD_storage = memoryManager.registerStorageRequirement<real4>(1);
+	  CinvA_storage =    memoryManager.registerStorageRequirement<U>(2*nz+2);
+	  CinvABmD_storage = memoryManager.registerStorageRequirement<U>(4);
 	}
 
 	template<class TopBC, class BottomBC>
-	void precompute(real k, const TopBC &top, const BottomBC &bottom, StorageRetriever &memoryManager){
+	void precompute(U k, const TopBC &top, const BottomBC &bottom, StorageRetriever &memoryManager){
 	  auto CinvA_it = memoryManager.retrieveStorage(CinvA_storage);
 	  auto CinvABmD_it = memoryManager.retrieveStorage(CinvABmD_storage);
 	  auto invA = computeInverseSecondIntegralMatrix(k, H, nz);
-	  SchurBoundaryCondition bcs(nz, H);
+	  SchurBoundaryCondition<U> bcs(nz, H);
 	  auto CandD = bcs.computeBoundaryConditionMatrix(top, bottom);
-	  real4 D = make_real4(CandD[2*nz], CandD[2*nz+1], CandD[2*nz+2], CandD[2*nz+3]);
+	  U D[4] = {CandD[2*nz], CandD[2*nz+1], CandD[2*nz+2], CandD[2*nz+3]};
 	  auto CinvA = matmul(CandD, nz, 2, invA, nz, nz);
 	  std::copy(CinvA.begin(), CinvA.end(), CinvA_it);
-	  real4 CinvAB;
-	  real B00 = -k*k*H*H;
-	  real B11 = -k*k*H*H;
-	  CinvAB.x = CinvA[0]*B00;
-	  CinvAB.y = CinvA[1]*B11;
-	  CinvAB.z = CinvA[0+nz]*B00;
-	  CinvAB.w = CinvA[1+nz]*B11;
-	  CinvABmD_it[0] = CinvAB - D;
+	  U CinvAB[4];
+	  U B00 = -k*k*H*H;
+	  U B11 = -k*k*H*H;
+	  CinvAB[0] = CinvA[0]*B00;
+	  CinvAB[1] = CinvA[1]*B11;
+	  CinvAB[2] = CinvA[0+nz]*B00;
+	  CinvAB[3] = CinvA[1+nz]*B11;
+	  fori(0, 4) CinvABmD_it[i] = CinvAB[i] - D[i];
 	}
 
 	template<class T, class FnIterator>
@@ -83,7 +84,10 @@ namespace uammd{
 					   StorageRetriever &memoryManager){
 	  const auto CinvA = memoryManager.retrieveStorage(CinvA_storage);
 	  const auto CinvAfmab = computeRightHandSide(alpha, beta, fn, CinvA);
-	  const real4 CinvABmD = *(memoryManager.retrieveStorage(CinvABmD_storage));
+	  const auto CinvABmD_it = memoryManager.retrieveStorage(CinvABmD_storage);
+	  U CinvABmD[4] = {CinvABmD_it[0], CinvABmD_it[1],
+				       CinvABmD_it[2], CinvABmD_it[3]};
+
 	  const auto c0d0 = solveSubsystem(CinvABmD, CinvAfmab);
 	  return c0d0;
 	}
@@ -91,8 +95,8 @@ namespace uammd{
       private:
 
 	template<class T>
-	__device__ thrust::pair<T,T> solveSubsystem(real4 CinvABmD, thrust::pair<T,T> CinvAfmab) const{
-	  auto c0d0 = solve2x2System(CinvABmD, CinvAfmab);
+	__device__ thrust::pair<T,T> solveSubsystem(U CinvABmD[4], thrust::pair<T,T> CinvAfmab) const{
+	  auto c0d0 = solve2x2System<T>(CinvABmD, CinvAfmab);
 	  return c0d0;
 	}
 
@@ -113,25 +117,33 @@ namespace uammd{
 
       };
 
+      template<typename U>
       class PentadiagonalSystemSolver{
 	int nz;
 	real H;
-	KBPENTA_mod pentasolve;
+	KBPENTA_mod<U> pentasolve;
       public:
 
+	static_assert(
+		      std::is_same<U, float>::value  || std::is_same<U, thrust::complex<float>>::value ||
+		      std::is_same<U, double>::value || std::is_same<U, thrust::complex<double>>::value,
+		      "PentadiagonalSystemSolver is expected to work only with real numbers or thrust::complex<> numbers"
+		      );
+	
 	PentadiagonalSystemSolver(int nz, real H):
 	  nz(nz), H(H), pentasolve(nz){}
-
+	
+      
 	void registerRequiredStorage(StorageRegistration &memoryManager){
 	  pentasolve.registerRequiredStorage(memoryManager);
 	}
 
-	void precompute(real k, StorageRetriever &memoryManager){
-	  real diagonal[nz];
-	  real diagonal_p2[nz]; diagonal_p2[nz-2] = diagonal_p2[nz-1] = 0;
-	  real diagonal_m2[nz]; diagonal_m2[0] = diagonal_m2[1] = 0;
+	void precompute(U k, StorageRetriever &memoryManager){
+	  U diagonal[nz];
+	  U diagonal_p2[nz]; diagonal_p2[nz-2] = diagonal_p2[nz-1] = 0;
+	  U diagonal_m2[nz]; diagonal_m2[0] = diagonal_m2[1] = 0;
 	  SecondIntegralMatrix sim(nz);
-	  const real kH2 = k*k*H*H;
+	  const U kH2 = k*k*H*H;
 	  for(int i = 0; i<nz; i++){
 	    diagonal[i] = 1.0 - kH2*sim.getElement(i,i);
 	    if(i<nz-2) diagonal_p2[i] = -kH2*sim.getElement(i+2, i);
@@ -148,23 +160,31 @@ namespace uammd{
       };
     }
 
+    template<typename U>
     class BoundaryValueProblemSolver{
-      detail::PentadiagonalSystemSolver pent;
-      detail::SubsystemSolver sub;
-      StorageHandle<real> waveVector;
+      detail::PentadiagonalSystemSolver<U> pent;
+      detail::SubsystemSolver<U> sub;
+      StorageHandle<U> waveVector;
       int nz;
       real H;
     public:
+
+      static_assert(
+		    std::is_same<U, float>::value  || std::is_same<U, thrust::complex<float>>::value ||
+		    std::is_same<U, double>::value || std::is_same<U, thrust::complex<double>>::value,
+		    "BoundaryValueProblemSolver is expected to work only with real numbers or thrust::complex<> numbers"
+		    );
+      
       BoundaryValueProblemSolver(int nz, real H): nz(nz), H(H), sub(nz, H), pent(nz, H){}
 
       void registerRequiredStorage(StorageRegistration &mem){
-	waveVector = mem.registerStorageRequirement<real>(1);
+	waveVector = mem.registerStorageRequirement<U>(1);
 	sub.registerRequiredStorage(mem);
 	pent.registerRequiredStorage(mem);
       }
 
       template<class TopBC, class BottomBC>
-      void precompute(StorageRetriever &mem, real k, const TopBC &top, const BottomBC &bot){
+      void precompute(StorageRetriever &mem, U k, const TopBC &top, const BottomBC &bot){
 	auto k_access = mem.retrieveStorage(waveVector);
 	k_access[0] = k;
 	pent.precompute(k, mem);
@@ -177,10 +197,10 @@ namespace uammd{
 			    AnIterator& an,
 			    CnIterator& cn,
 			    StorageRetriever &mem){
-        const real k = *(mem.retrieveStorage(waveVector));
+        const auto k = *(mem.retrieveStorage(waveVector));
 	T c0, d0;
 	thrust::tie(c0, d0) = sub.solve(fn, alpha, beta, mem);
-	const real kH2 = k*k*H*H;
+	const auto kH2 = k*k*H*H;
 	fn[0] += kH2*c0;
 	fn[1] += kH2*d0;
 	pent.solve(fn, an, mem);
@@ -201,17 +221,25 @@ namespace uammd{
 
     };
 
+    template<typename U>
     class BatchedBVPHandler;
 
+    template<typename U>
     struct BatchedBVPGPUSolver{
     private:
       int numberSystems;
-      BoundaryValueProblemSolver bvpSolver;
+      BoundaryValueProblemSolver<U> bvpSolver;
       char* gpuMemory;
-      friend class BatchedBVPHandler;
-      BatchedBVPGPUSolver(int numberSystems, BoundaryValueProblemSolver bvpSolver, char *raw):
+      friend class BatchedBVPHandler<U>;
+      BatchedBVPGPUSolver(int numberSystems, BoundaryValueProblemSolver<U> bvpSolver, char *raw):
 	numberSystems(numberSystems), bvpSolver(bvpSolver), gpuMemory(raw){}
     public:
+      
+      static_assert(
+		    std::is_same<U, float>::value  || std::is_same<U, thrust::complex<float>>::value ||
+		    std::is_same<U, double>::value || std::is_same<U, thrust::complex<double>>::value,
+		    "BatchedBVPGPUHandler is expected to work only with real numbers or thrust::complex<> numbers"
+		    );
 
       template<class T, class FnIterator, class AnIterator, class CnIterator>
       __device__ void solve(int instance,
@@ -225,12 +253,19 @@ namespace uammd{
 
     };
 
+    template <typename U>
     class BatchedBVPHandler{
       int numberSystems;
-      BoundaryValueProblemSolver bvp;
+      BoundaryValueProblemSolver<U> bvp;
       thrust::device_vector<char> gpuMemory;
     public:
 
+      static_assert(
+		    std::is_same<U, float>::value  || std::is_same<U, thrust::complex<float>>::value ||
+		    std::is_same<U, double>::value || std::is_same<U, thrust::complex<double>>::value,
+		    "BatchedBVPHandler is expected to work only with real numbers or thrust::complex<> numbers"
+		    );
+
       template<class WaveVectorIterator, class BatchedTopBC, class BatchedBottomBC>
       BatchedBVPHandler(const WaveVectorIterator &klist,
 			BatchedTopBC top, BatchedBottomBC bot,
@@ -240,9 +275,9 @@ namespace uammd{
 	precompute(klist, top, bot);
       }
 
-      BatchedBVPGPUSolver getGPUSolver(){
+      BatchedBVPGPUSolver<U> getGPUSolver(){
 	auto raw = thrust::raw_pointer_cast(gpuMemory.data());
-	BatchedBVPGPUSolver d_solver(numberSystems, bvp, raw);
+	BatchedBVPGPUSolver<U> d_solver(numberSystems, bvp, raw);
 	return d_solver;
       }
 
diff --git a/src/misc/BoundaryValueProblem/KBPENTA.cuh b/src/misc/BoundaryValueProblem/KBPENTA.cuh
index 636b217b..0bd69733 100644
--- a/src/misc/BoundaryValueProblem/KBPENTA.cuh
+++ b/src/misc/BoundaryValueProblem/KBPENTA.cuh
@@ -6,20 +6,27 @@ namespace uammd{
   namespace BVP{
 
 //Algorithm adapted from  http://dx.doi.org/10.1080/00207160802326507 for a special case of only three diagonals being non zero
+    template<typename U>
     class KBPENTA_mod{
-      StorageHandle<real> storageHandle;
+      StorageHandle<U> storageHandle;
       int nz;
     public:
 
+      static_assert(
+		    std::is_same<U, float>::value  || std::is_same<U, thrust::complex<float>>::value ||
+		    std::is_same<U, double>::value || std::is_same<U, thrust::complex<double>>::value,
+		    "KBPENTA_mode is expected to work only with real numbers or thrust::complex<> numbers"
+		    );
+
       KBPENTA_mod(int nz): nz(nz){}
 
       void registerRequiredStorage(StorageRegistration &memoryManager){
-	storageHandle =  memoryManager.registerStorageRequirement<real>(3*nz+2);
+	storageHandle =  memoryManager.registerStorageRequirement<U>(3*nz+2);
       }
 
-      void store(real *diagonal, real *diagonal_p2, real *diagonal_m2, StorageRetriever &memoryManager){
+      void store(U *diagonal, U *diagonal_p2, U *diagonal_m2, StorageRetriever &memoryManager){
 	auto storage = memoryManager.retrieveStorage(storageHandle);
-	std::vector<real> beta(nz+1, 0);
+	std::vector<U> beta(nz+1, 0);
 	beta[0] = 0;
 	beta[1] = diagonal[nz-nz];
 	beta[2] = diagonal[nz-(nz-1)];
diff --git a/src/misc/BoundaryValueProblem/MatrixUtils.h b/src/misc/BoundaryValueProblem/MatrixUtils.h
index 7a1689af..25903c71 100644
--- a/src/misc/BoundaryValueProblem/MatrixUtils.h
+++ b/src/misc/BoundaryValueProblem/MatrixUtils.h
@@ -1,5 +1,5 @@
 /*Raul P. Pelaez 2019-2021. Boundary Value Problem Matrix algebra utilities
- */
+*/
 #ifndef BVP_MATRIX_UTILS_H
 #define BVP_MATRIX_UTILS_H
 #include "utils/cufftPrecisionAgnostic.h"
@@ -12,95 +12,129 @@
 #else
 #include<lapacke.h>
 #endif
-
+#include<thrust/complex.h>
 namespace uammd{
   namespace BVP{
-
+    using complex = thrust::complex<real>;
     template<class T> struct LapackeUAMMD;
     template<>
-    struct LapackeUAMMD<float>{
-      template<class ...T> static int getrf(T... args){return LAPACKE_sgetrf(args...);}
-      template<class ...T> static int getri(T... args){return LAPACKE_sgetri(args...);}
-    };
+      struct LapackeUAMMD<float>{
+        template<class ...T> static int getrf(T... args){return LAPACKE_sgetrf(args...);}
+        template<class ...T> static int getri(T... args){return LAPACKE_sgetri(args...);}
+      };
     template<>
-    struct LapackeUAMMD<double>{
-      template<class ...T> static int getrf(T... args){return LAPACKE_dgetrf(args...);}
-      template<class ...T> static int getri(T... args){return LAPACKE_dgetri(args...);}
-    };
+      struct LapackeUAMMD<double>{
+        template<class ...T> static int getrf(T... args){return LAPACKE_dgetrf(args...);}
+        template<class ...T> static int getri(T... args){return LAPACKE_dgetri(args...);}
+      };
 
     template<>
-    struct LapackeUAMMD<lapack_complex_float>{
-      template<class ...T> static int getrf(T... args){return LAPACKE_cgetrf(args...);}
-      template<class ...T> static int getri(T... args){return LAPACKE_cgetri(args...);}
-    };
+      struct LapackeUAMMD<lapack_complex_float>{
+        template<class ...T> static int getrf(T... args){return LAPACKE_cgetrf(args...);}
+        template<class ...T> static int getri(T... args){return LAPACKE_cgetri(args...);}
+      };
     template<>
-    struct LapackeUAMMD<lapack_complex_double>{
-      template<class ...T> static int getrf(T... args){return LAPACKE_zgetrf(args...);}
-      template<class ...T> static int getri(T... args){return LAPACKE_zgetri(args...);}
-    };
+      struct LapackeUAMMD<lapack_complex_double>{
+        template<class ...T> static int getrf(T... args){return LAPACKE_zgetrf(args...);}
+        template<class ...T> static int getri(T... args){return LAPACKE_zgetri(args...);}
+      };
     template<>
-    struct LapackeUAMMD<cufftComplex_t<float>>{
-      static int getrf(int matrix_layout, lapack_int n, lapack_int m,
-		       cufftComplex_t<float>* a, lapack_int lda,
-		       lapack_int* ipiv){
-	return LAPACKE_cgetrf(matrix_layout, n, m, (lapack_complex_float*)(a), lda, ipiv);
-      }
-      static int getri(int matrix_layout, lapack_int n,
-		       cufftComplex_t<float>* a, lapack_int lda,
-		       lapack_int* ipiv){
-	return LAPACKE_cgetri(matrix_layout, n, (lapack_complex_float*)(a), lda, ipiv);
-      }
-    };
+      struct LapackeUAMMD<cufftComplex_t<float>>{
+        static int getrf(int matrix_layout, lapack_int n, lapack_int m,
+            cufftComplex_t<float>* a, lapack_int lda,
+            lapack_int* ipiv){
+          return LAPACKE_cgetrf(matrix_layout, n, m, (lapack_complex_float*)(a), lda, ipiv);
+        }
+        static int getri(int matrix_layout, lapack_int n,
+            cufftComplex_t<float>* a, lapack_int lda,
+            lapack_int* ipiv){
+          return LAPACKE_cgetri(matrix_layout, n, (lapack_complex_float*)(a), lda, ipiv);
+        }
+      };
     template<>
-    struct LapackeUAMMD<cufftComplex_t<double>>{
-      static int getrf(int matrix_layout, lapack_int n, lapack_int m,
-		       cufftComplex_t<double>* a, lapack_int lda,
-		       lapack_int* ipiv){
-	return LAPACKE_zgetrf(matrix_layout, n, m, (lapack_complex_double*)(a), lda, ipiv);
-      }
-      static int getri(int matrix_layout, lapack_int n,
-		       cufftComplex_t<double>* a, lapack_int lda,
-		       lapack_int* ipiv){
-	return LAPACKE_zgetri(matrix_layout, n, (lapack_complex_double*)(a), lda, ipiv);
-      }
-    };
+      struct LapackeUAMMD<cufftComplex_t<double>>{
+        static int getrf(int matrix_layout, lapack_int n, lapack_int m,
+            cufftComplex_t<double>* a, lapack_int lda,
+            lapack_int* ipiv){
+          return LAPACKE_zgetrf(matrix_layout, n, m, (lapack_complex_double*)(a), lda, ipiv);
+        }
+        static int getri(int matrix_layout, lapack_int n,
+            cufftComplex_t<double>* a, lapack_int lda,
+            lapack_int* ipiv){
+          return LAPACKE_zgetri(matrix_layout, n, (lapack_complex_double*)(a), lda, ipiv);
+        }
+      };
+    template<>
+      struct LapackeUAMMD<thrust::complex<float>>{
+        static int getrf(int matrix_layout, lapack_int n, lapack_int m,
+            thrust::complex<float>* a, lapack_int lda,
+            lapack_int* ipiv){
+          return LAPACKE_cgetrf(matrix_layout, n, m, (lapack_complex_float*)(a), lda, ipiv);
+        }
+        static int getri(int matrix_layout, lapack_int n,
+            thrust::complex<float>* a, lapack_int lda,
+            lapack_int* ipiv){
+          return LAPACKE_cgetri(matrix_layout, n, (lapack_complex_float*)(a), lda, ipiv);
+        }
+      };
+    template<>
+      struct LapackeUAMMD<thrust::complex<double>>{
+        static int getrf(int matrix_layout, lapack_int n, lapack_int m,
+            thrust::complex<double>* a, lapack_int lda,
+            lapack_int* ipiv){
+          return LAPACKE_zgetrf(matrix_layout, n, m, (lapack_complex_double*)(a), lda, ipiv);
+        }
+        static int getri(int matrix_layout, lapack_int n,
+            thrust::complex<double>* a, lapack_int lda,
+            lapack_int* ipiv){
+          return LAPACKE_zgetri(matrix_layout, n, (lapack_complex_double*)(a), lda, ipiv);
+        }
+      };
 
 
     template<class T>
-    std::vector<T> invertSquareMatrix(const std::vector<T> &A, lapack_int N){
-      lapack_int pivotArray[N];
-      int errorHandler;
-      auto invA = A;
-      errorHandler = LapackeUAMMD<T>::getrf(LAPACK_ROW_MAJOR, N, N, invA.data(), N, pivotArray);
-      if(errorHandler){
-	throw std::runtime_error("Lapacke getrf failed with error code: " + std::to_string(errorHandler));
+      std::vector<T> invertSquareMatrix(const std::vector<T> &A, lapack_int N){
+        lapack_int pivotArray[N];
+        int errorHandler;
+        auto invA = A;
+        errorHandler = LapackeUAMMD<T>::getrf(LAPACK_ROW_MAJOR, N, N, invA.data(), N, pivotArray);
+        if(errorHandler){
+          throw std::runtime_error("Lapacke getrf failed with error code: " + std::to_string(errorHandler));
+        }
+        errorHandler = LapackeUAMMD<T>::getri(LAPACK_ROW_MAJOR, N, invA.data(), N, pivotArray);
+        if(errorHandler){
+          throw std::runtime_error("Lapacke getri failed with error code: " + std::to_string(errorHandler));
+        }
+        return invA;
       }
-      errorHandler = LapackeUAMMD<T>::getri(LAPACK_ROW_MAJOR, N, invA.data(), N, pivotArray);
-      if(errorHandler){
-	throw std::runtime_error("Lapacke getri failed with error code: " + std::to_string(errorHandler));
-      }
-      return invA;
-    }
 
     template<class T, class T2>
-    std::vector<typename T::value_type> matmul(const T &A, int ncol_a, int nrow_a,
-					       const T2 &B, int ncol_b, int nrow_b){
-      std::vector<typename T::value_type> C;
-      C.resize(ncol_b*nrow_a);
-      for(int i = 0; i<nrow_a; i++){
-	for(int j = 0; j<ncol_b; j++){
-	  real tmp = 0;
-	  for(int k = 0; k<ncol_a; k++){
-	    tmp += A[k+ncol_a*i] * B [j+ncol_b*k];
-	  }
-	  C[j+ncol_b*i] = tmp;
-	}
+      auto matmul(const T &A, int ncol_a, int nrow_a,
+          const T2 &B, int ncol_b, int nrow_b){
+        using type = typename T::value_type;
+        std::vector<type> C;
+        C.resize(ncol_b*nrow_a);
+        for(int i = 0; i<nrow_a; i++){
+          for(int j = 0; j<ncol_b; j++){
+            type tmp{};
+            for(int k = 0; k<ncol_a; k++){
+              tmp += A[k+ncol_a*i] * B [j+ncol_b*k];
+            }
+            C[j+ncol_b*i] = tmp;
+          }
+        }
+        return C;
       }
-      return C;
-    }
-
     //Solves the linear system A*x = b
-    //Given A as a real4 (2x2 matrix) and b as two numbers of an arbitrary type (could be complex, real...)
+    //Given A (2x2 matrix) and b as two numbers of an arbitrary type (could be complex, real...)
+    template<class T, class R>
+    __device__ thrust::pair<T,T> solve2x2System(thrust::complex<R> A[4], thrust::pair<T,T> b){
+      const auto det  = A[0]*A[3] - A[1]*A[2];
+      const T c0 = (b.first*A[3] - A[1]*b.second)/det;
+      const T d0 = (b.second*A[0] - b.first*A[2])/det;
+      return thrust::make_pair(c0, d0);
+    }
+ 
     template<class T>
     __device__ thrust::pair<T,T> solve2x2System(real4 A, thrust::pair<T,T> b){
       const real det  = A.x*A.w - A.y*A.z;
@@ -109,6 +143,11 @@ namespace uammd{
       return thrust::make_pair(c0, d0);
     }
 
+    template<class T>
+    __device__ thrust::pair<T,T> solve2x2System(real A[4], thrust::pair<T,T> b){
+      real4 A_r4 = {A[0], A[1], A[2], A[3]};
+      return solve2x2System(A_r4, b);
+    }
   }
 }
 #endif
diff --git a/src/utils/atomics.cuh b/src/utils/atomics.cuh
index fc592524..6797debd 100644
--- a/src/utils/atomics.cuh
+++ b/src/utils/atomics.cuh
@@ -7,8 +7,6 @@ namespace uammd{
   template<class T>
   inline __device__ T atomicAdd(T* address, T val){ return ::atomicAdd(address, val);}
 
-
-#ifndef SINGLE_PRECISION
 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 600
   inline __device__ double atomicAdd(double* address, double val){
     unsigned long long int* address_as_ull =
@@ -23,7 +21,20 @@ namespace uammd{
     return __longlong_as_double(old);
   }
 #endif
-#endif
+
+    inline __device__ float2 atomicAdd(float2* address, float2 val){
+    float2 newval;
+    if(val.x) newval.x = atomicAdd(&(*address).x, val.x);
+    if(val.y) newval.y = atomicAdd(&(*address).y, val.y);
+    return newval;
+  }
+
+  inline __device__ double2 atomicAdd(double2* address, double2 val){
+    double2 newval;
+    if(val.x) newval.x = atomicAdd(&(*address).x, val.x);
+    if(val.y) newval.y = atomicAdd(&(*address).y, val.y);
+    return newval;
+  }
 
   inline __device__ real4 atomicAdd(real4* address, real4 val){
     real4 newval;
@@ -41,21 +52,12 @@ namespace uammd{
     if(val.z) newval.z = atomicAdd(&(*address).z, val.z);
     return newval;
   }
-
-  inline __device__ real2 atomicAdd(real2* address, real2 val){
-    real2 newval;
-    if(val.x) newval.x = atomicAdd(&(*address).x, val.x);
-    if(val.y) newval.y = atomicAdd(&(*address).y, val.y);
-    return newval;
-  }
-
-
+  
   template<class T, class T2>
   inline __device__ T2 atomicAdd(T &ref, T2 val){
     return atomicAdd(&ref, val);
   }
 
-
   template<class T, class T2>
   inline __device__ T2 atomicAdd(thrust::tuple<T&,T&> &&refs, T2 val){
     T2 newval;
diff --git a/src/utils/vector.cuh b/src/utils/vector.cuh
index df4ed98c..43a4330f 100644
--- a/src/utils/vector.cuh
+++ b/src/utils/vector.cuh
@@ -976,4 +976,339 @@ namespace uammd{
   }
 }
 
+namespace uammd{
+////////////////////////////////////VEC2///////////////////////////////////////
+  template<class T>
+  struct vec2{
+    T x,y;
+  };
+
+  template<class T>
+  vec2<T> make_vec2(T x, T y){ return {x, y};}
+
+  template<class T>
+  VECATTR  vec2<T> operator +(const vec2<T> &a, const vec2<T> &b){
+    return {a.x + b.x, a.y + b.y};
+  }
+
+  template<class T>
+  VECATTR  vec2<T> operator +(const vec2<T> &a, const T &b){
+    return {a.x + b, a.y + b};
+  }
+
+  template<class T>
+  VECATTR  vec2<T> operator -(const vec2<T> &a, const vec2<T> &b){
+    return {a.x - b.x, a.y - b.y};
+  }
+
+
+  template<class T>
+  VECATTR  vec2<T> operator -(const vec2<T> &a, const T &b){
+    return {a.x - b, a.y - b};
+  }
+
+  template<class T>
+  VECATTR  vec2<T> operator -(const T &b, const vec2<T> &a){
+    return {b-a.x, b-a.y};
+  }
+
+
+  template<class T>
+  VECATTR  vec2<T> operator *(const vec2<T> &a, const vec2<T> &b){
+    return {a.x * b.x, a.y * b.y};
+  }
+
+
+  template<class T>
+  VECATTR  vec2<T> operator *(const vec2<T> &a, const T &b){
+    return {a.x * b, a.y * b};
+  }
+
+  template<class T>
+  VECATTR  vec2<T> operator /(const vec2<T> &a, const vec2<T> &b){
+    return {a.x / b.x, a.y / b.y};
+  }
+
+  template<class T>
+  VECATTR  vec2<T> operator /(const vec2<T> &a, const T &b){
+    return {a.x/b, a.y/b};
+  }
+
+
+  template<class T>
+  VECATTR  vec2<T> operator /(const T &b, const vec2<T> &a){
+    return {b / a.x, b / a.y};
+  }
+
+  template<class T>
+  VECATTR  void operator +=(vec2<T> &a, const vec2<T> &b){
+    a = a + b;
+  }
+
+  template<class T>
+  VECATTR  vec2<T> operator +(const T &b, const vec2<T> &a){return a+b;}
+
+  template<class T>
+  VECATTR  void operator +=(vec2<T> &a, const T &b){
+    a = a+b;
+  }
+
+  template<class T>
+  VECATTR  void operator -=(vec2<T> &a, const vec2<T> &b){
+    a = a-b;
+  }
+  template<class T>
+  VECATTR  void operator -=(vec2<T> &a, const T &b){
+    a=a-b;
+  }
+  template<class T>
+  VECATTR  void operator *=(vec2<T> &a, const vec2<T> &b){
+    a = a*b;
+  }
+
+  template<class T>
+  VECATTR  vec2<T> operator *(const T &b, const vec2<T> &a){
+    return a*b;
+  }
+
+  template<class T>
+  VECATTR  void operator *=(vec2<T> &a, const T &b){
+    a=a*b;
+  }
+
+  template<class T>
+  VECATTR  void operator /=(vec2<T> &a, const vec2<T> &b){
+    a = a/b;
+  }
+
+  template<class T>
+  VECATTR  void operator /=(vec2<T> &a, const T &b){
+    a = a/b;
+  }
+
+////////////////////////////////////VEC3///////////////////////////////////////
+  template<class T>
+  struct vec3{
+    T x,y,z;
+  };
+
+  template<class T>
+  vec3<T> make_vec3(T x, T y, T z){ return {x, y, z};}
+
+  template<class T>
+  VECATTR  vec3<T> operator +(const vec3<T> &a, const vec3<T> &b){
+    return {a.x + b.x, a.y + b.y, a.z + b.z};
+  }
+
+  template<class T>
+  VECATTR  vec3<T> operator +(const vec3<T> &a, const T &b){
+    return {a.x + b, a.y + b, a.z + b};
+  }
+
+  template<class T>
+  VECATTR  vec3<T> operator -(const vec3<T> &a, const vec3<T> &b){
+    return {a.x - b.x, a.y - b.y, a.z - b.z};
+  }
+
+
+  template<class T>
+  VECATTR  vec3<T> operator -(const vec3<T> &a, const T &b){
+    return {a.x - b, a.y - b, a.z - b};
+  }
+
+  template<class T>
+  VECATTR  vec3<T> operator -(const T &b, const vec3<T> &a){
+    return {b-a.x, b-a.y, b-a.z};
+  }
+
+
+  template<class T>
+  VECATTR  vec3<T> operator *(const vec3<T> &a, const vec3<T> &b){
+    return {a.x * b.x, a.y * b.y, a.z * b.z};
+  }
+
+
+  template<class T>
+  VECATTR  vec3<T> operator *(const vec3<T> &a, const T &b){
+    return {a.x * b, a.y * b, a.z * b};
+  }
+
+  template<class T>
+  VECATTR  vec3<T> operator /(const vec3<T> &a, const vec3<T> &b){
+    return {a.x / b.x, a.y / b.y, a.z / b.z};
+  }
+
+  template<class T>
+  VECATTR  vec3<T> operator /(const vec3<T> &a, const T &b){
+    return {a.x/b, a.y/b, a.z/b};
+  }
+
+
+  template<class T>
+  VECATTR  vec3<T> operator /(const T &b, const vec3<T> &a){
+    return {b / a.x, b / a.y, b / a.z};
+  }
+
+
+
+  template<class T>
+  VECATTR  void operator +=(vec3<T> &a, const vec3<T> &b){
+    a = a + b;
+  }
+
+  template<class T>
+  VECATTR  vec3<T> operator +(const T &b, const vec3<T> &a){return a+b;}
+
+  template<class T>
+  VECATTR  void operator +=(vec3<T> &a, const T &b){
+    a = a+b;
+  }
+
+  template<class T>
+  VECATTR  void operator -=(vec3<T> &a, const vec3<T> &b){
+    a = a-b;
+  }
+  template<class T>
+  VECATTR  void operator -=(vec3<T> &a, const T &b){
+    a=a-b;
+  }
+  template<class T>
+  VECATTR  void operator *=(vec3<T> &a, const vec3<T> &b){
+    a = a*b;
+  }
+
+  template<class T>
+  VECATTR  vec3<T> operator *(const T &b, const vec3<T> &a){
+    return a*b;
+  }
+
+  template<class T>
+  VECATTR  void operator *=(vec3<T> &a, const T &b){
+    a=a*b;
+  }
+
+  template<class T>
+  VECATTR  void operator /=(vec3<T> &a, const vec3<T> &b){
+    a = a/b;
+  }
+
+  template<class T>
+  VECATTR  void operator /=(vec3<T> &a, const T &b){
+    a = a/b;
+  }
+
+  /////////////////////////////////VEC4//////////////////////////
+
+  template<class T>
+  struct vec4{
+    T x,y,z,w;
+  };
+
+
+
+  template<class T>
+  vec4<T> make_vec4(T x, T y, T z, T w){ return {x, y, z, w};}
+
+  template<class T>
+  VECATTR  vec4<T> operator +(const vec4<T> &a, const vec4<T> &b){
+    return {a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w};
+  }
+
+  template<class T>
+  VECATTR  vec4<T> operator +(const vec4<T> &a, const T &b){
+    return {a.x + b, a.y + b, a.z + b, a.w + b};
+  }
+
+  template<class T>
+  VECATTR  vec4<T> operator -(const vec4<T> &a, const vec4<T> &b){
+    return {a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w};
+  }
+
+
+  template<class T>
+  VECATTR  vec4<T> operator -(const vec4<T> &a, const T &b){
+    return {a.x - b, a.y - b, a.z - b, a.w - b};
+  }
+
+  template<class T>
+  VECATTR  vec4<T> operator -(const T &b, const vec4<T> &a){
+    return {b-a.x, b-a.y, b-a.z, b-a.w};
+  }
+
+
+  template<class T>
+  VECATTR  vec4<T> operator *(const vec4<T> &a, const vec4<T> &b){
+    return {a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w};
+  }
+
+
+  template<class T>
+  VECATTR  vec4<T> operator *(const vec4<T> &a, const T &b){
+    return {a.x * b, a.y * b, a.z * b, a.w * b};
+  }
+
+  template<class T>
+  VECATTR  vec4<T> operator /(const vec4<T> &a, const vec4<T> &b){
+    return {a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w};
+  }
+
+  template<class T>
+  VECATTR  vec4<T> operator /(const vec4<T> &a, const T &b){
+    return {a.x/b, a.y/b, a.z/b, a.w/b};
+  }
+
+
+  template<class T>
+  VECATTR  vec4<T> operator /(const T &b, const vec4<T> &a){
+    return {b / a.x, b / a.y, b / a.z, b / a.w};
+  }
+
+
+
+  template<class T>
+  VECATTR  void operator +=(vec4<T> &a, const vec4<T> &b){
+    a = a + b;
+  }
+
+  template<class T>
+  VECATTR  vec4<T> operator +(const T &b, const vec4<T> &a){return a+b;}
+
+  template<class T>
+  VECATTR  void operator +=(vec4<T> &a, const T &b){
+    a = a+b;
+  }
+
+  template<class T>
+  VECATTR  void operator -=(vec4<T> &a, const vec4<T> &b){
+    a = a-b;
+  }
+  template<class T>
+  VECATTR  void operator -=(vec4<T> &a, const T &b){
+    a=a-b;
+  }
+  template<class T>
+  VECATTR  void operator *=(vec4<T> &a, const vec4<T> &b){
+    a = a*b;
+  }
+
+  template<class T>
+  VECATTR  vec4<T> operator *(const T &b, const vec4<T> &a){
+    return a*b;
+  }
+
+  template<class T>
+  VECATTR  void operator *=(vec4<T> &a, const T &b){
+    a=a*b;
+  }
+
+  template<class T>
+  VECATTR  void operator /=(vec4<T> &a, const vec4<T> &b){
+    a = a/b;
+  }
+
+  template<class T>
+  VECATTR  void operator /=(vec4<T> &a, const T &b){
+    a = a/b;
+  }
+}
 #endif
diff --git a/test/misc/bvp/bvp.cu b/test/misc/bvp/bvp.cu
index 3f55625f..8aea4ae8 100644
--- a/test/misc/bvp/bvp.cu
+++ b/test/misc/bvp/bvp.cu
@@ -202,7 +202,7 @@ auto getParameters(){
 }
 
 auto createBVP(std::vector<real> klist, Parameters par){
-  auto bvp = std::make_shared<BVP::BatchedBVPHandler> (klist,
+  auto bvp = std::make_shared<BVP::BatchedBVPHandler<real>> (klist,
 			     make_boundary_dispatcher<TopBoundaryConditions>(klist, par.H),
 			     make_boundary_dispatcher<BottomBoundaryConditions>(klist, par.H),
 			     klist.size(), par.H, par.nz);

From 88b87ea2f1253cffec911de4e47238c992e677ef Mon Sep 17 00:00:00 2001
From: PabloIbannez <p.ibanez.fre@gmail.com>
Date: Tue, 20 Feb 2024 14:59:24 +0100
Subject: [PATCH 03/15] Fixed BVP type issues

---
 src/misc/BoundaryValueProblem/BVPSolver.cuh | 40 ++++++++-------------
 src/misc/BoundaryValueProblem/KBPENTA.cuh   | 12 +++----
 test/misc/bvp/bvp.cu                        |  2 +-
 3 files changed, 20 insertions(+), 34 deletions(-)

diff --git a/src/misc/BoundaryValueProblem/BVPSolver.cuh b/src/misc/BoundaryValueProblem/BVPSolver.cuh
index 7959fd9b..83b4c2a7 100644
--- a/src/misc/BoundaryValueProblem/BVPSolver.cuh
+++ b/src/misc/BoundaryValueProblem/BVPSolver.cuh
@@ -124,16 +124,10 @@ namespace uammd{
 	KBPENTA_mod<U> pentasolve;
       public:
 
-	static_assert(
-		      std::is_same<U, float>::value  || std::is_same<U, thrust::complex<float>>::value ||
-		      std::is_same<U, double>::value || std::is_same<U, thrust::complex<double>>::value,
-		      "PentadiagonalSystemSolver is expected to work only with real numbers or thrust::complex<> numbers"
-		      );
-	
 	PentadiagonalSystemSolver(int nz, real H):
 	  nz(nz), H(H), pentasolve(nz){}
-	
-      
+
+
 	void registerRequiredStorage(StorageRegistration &memoryManager){
 	  pentasolve.registerRequiredStorage(memoryManager);
 	}
@@ -169,12 +163,6 @@ namespace uammd{
       real H;
     public:
 
-      static_assert(
-		    std::is_same<U, float>::value  || std::is_same<U, thrust::complex<float>>::value ||
-		    std::is_same<U, double>::value || std::is_same<U, thrust::complex<double>>::value,
-		    "BoundaryValueProblemSolver is expected to work only with real numbers or thrust::complex<> numbers"
-		    );
-      
       BoundaryValueProblemSolver(int nz, real H): nz(nz), H(H), sub(nz, H), pent(nz, H){}
 
       void registerRequiredStorage(StorageRegistration &mem){
@@ -221,6 +209,10 @@ namespace uammd{
 
     };
 
+    using BoundaryValueProblemSolverReal          = BoundaryValueProblemSolver<real>;
+    using BoundaryValueProblemSolverComplex       = BoundaryValueProblemSolver<thrust::complex<real>>;
+    using BoundaryValueProblemSolverComplexDouble = BoundaryValueProblemSolver<thrust::complex<double>>;
+
     template<typename U>
     class BatchedBVPHandler;
 
@@ -234,12 +226,6 @@ namespace uammd{
       BatchedBVPGPUSolver(int numberSystems, BoundaryValueProblemSolver<U> bvpSolver, char *raw):
 	numberSystems(numberSystems), bvpSolver(bvpSolver), gpuMemory(raw){}
     public:
-      
-      static_assert(
-		    std::is_same<U, float>::value  || std::is_same<U, thrust::complex<float>>::value ||
-		    std::is_same<U, double>::value || std::is_same<U, thrust::complex<double>>::value,
-		    "BatchedBVPGPUHandler is expected to work only with real numbers or thrust::complex<> numbers"
-		    );
 
       template<class T, class FnIterator, class AnIterator, class CnIterator>
       __device__ void solve(int instance,
@@ -253,6 +239,10 @@ namespace uammd{
 
     };
 
+    using BatchedBVPGPUSolverReal          = BatchedBVPGPUSolver<real>;
+    using BatchedBVPGPUSolverComplex       = BatchedBVPGPUSolver<thrust::complex<real>>;
+    using BatchedBVPGPUSolverComplexDouble = BatchedBVPGPUSolver<thrust::complex<double>>;
+
     template <typename U>
     class BatchedBVPHandler{
       int numberSystems;
@@ -260,12 +250,6 @@ namespace uammd{
       thrust::device_vector<char> gpuMemory;
     public:
 
-      static_assert(
-		    std::is_same<U, float>::value  || std::is_same<U, thrust::complex<float>>::value ||
-		    std::is_same<U, double>::value || std::is_same<U, thrust::complex<double>>::value,
-		    "BatchedBVPHandler is expected to work only with real numbers or thrust::complex<> numbers"
-		    );
-
       template<class WaveVectorIterator, class BatchedTopBC, class BatchedBottomBC>
       BatchedBVPHandler(const WaveVectorIterator &klist,
 			BatchedTopBC top, BatchedBottomBC bot,
@@ -302,6 +286,10 @@ namespace uammd{
 
     };
 
+    using BatchedBVPHandlerReal          = BatchedBVPHandler<real>;
+    using BatchedBVPHandlerComplex       = BatchedBVPHandler<thrust::complex<real>>;
+    using BatchedBVPHandlerComplexDouble = BatchedBVPHandler<thrust::complex<double>>;
+
   }
 
 }
diff --git a/src/misc/BoundaryValueProblem/KBPENTA.cuh b/src/misc/BoundaryValueProblem/KBPENTA.cuh
index 0bd69733..b80a5c84 100644
--- a/src/misc/BoundaryValueProblem/KBPENTA.cuh
+++ b/src/misc/BoundaryValueProblem/KBPENTA.cuh
@@ -5,19 +5,13 @@
 namespace uammd{
   namespace BVP{
 
-//Algorithm adapted from  http://dx.doi.org/10.1080/00207160802326507 for a special case of only three diagonals being non zero
+    //Algorithm adapted from  http://dx.doi.org/10.1080/00207160802326507 for a special case of only three diagonals being non zero
     template<typename U>
     class KBPENTA_mod{
       StorageHandle<U> storageHandle;
       int nz;
     public:
 
-      static_assert(
-		    std::is_same<U, float>::value  || std::is_same<U, thrust::complex<float>>::value ||
-		    std::is_same<U, double>::value || std::is_same<U, thrust::complex<double>>::value,
-		    "KBPENTA_mode is expected to work only with real numbers or thrust::complex<> numbers"
-		    );
-
       KBPENTA_mod(int nz): nz(nz){}
 
       void registerRequiredStorage(StorageRegistration &memoryManager){
@@ -63,6 +57,10 @@ namespace uammd{
 
     };
 
+    using KBPENTA_mod_real           = KBPENTA_mod<real>;
+    using KBPENTA_mod_complex        = KBPENTA_mod<thrust::complex<real>>;
+    using KBPENTA_mod_complex_double = KBPENTA_mod<thrust::complex<double>>;
+
   }
 }
 #endif
diff --git a/test/misc/bvp/bvp.cu b/test/misc/bvp/bvp.cu
index 8aea4ae8..825da1ae 100644
--- a/test/misc/bvp/bvp.cu
+++ b/test/misc/bvp/bvp.cu
@@ -202,7 +202,7 @@ auto getParameters(){
 }
 
 auto createBVP(std::vector<real> klist, Parameters par){
-  auto bvp = std::make_shared<BVP::BatchedBVPHandler<real>> (klist,
+  auto bvp = std::make_shared<BVP::BatchedBVPHandlerReal> (klist,
 			     make_boundary_dispatcher<TopBoundaryConditions>(klist, par.H),
 			     make_boundary_dispatcher<BottomBoundaryConditions>(klist, par.H),
 			     klist.size(), par.H, par.nz);

From acb189e42a7c504ed4c04c40dae3deb4e9609a2a Mon Sep 17 00:00:00 2001
From: PabloPalaciosAlonso <pablo.pal@hotmail.com>
Date: Tue, 20 Feb 2024 16:40:10 +0100
Subject: [PATCH 04/15] Add alias to call BVP classes

---
 docs/BoundaryValueProblemSolver.rst           | 30 +++++++----
 .../BDHI/DoublyPeriodic/DPStokesSlab.cuh      |  2 +-
 .../DoublyPeriodic/StokesSlab/Correction.cuh  |  2 +-
 .../StokesSlab/initialization.cu              |  2 +-
 .../DoublyPeriodic/PoissonSlab/BVPPoisson.cuh |  4 +-
 .../BVPSchurComplementMatrices.cuh            |  8 ++-
 src/misc/BoundaryValueProblem/BVPSolver.cuh   | 51 +++++++++----------
 src/misc/BoundaryValueProblem/KBPENTA.cuh     |  3 +-
 8 files changed, 57 insertions(+), 45 deletions(-)

diff --git a/docs/BoundaryValueProblemSolver.rst b/docs/BoundaryValueProblemSolver.rst
index 04867405..d2f95bf4 100644
--- a/docs/BoundaryValueProblemSolver.rst
+++ b/docs/BoundaryValueProblemSolver.rst
@@ -19,8 +19,7 @@ With the boundary conditions:
 
 Being :math:`\alpha_n,\beta_n` some arbitrary parameters. These parameters can take any value (including zero) as long as the resulting BVP remains well defined. For instance, making  :math:`\alpha_1,\beta_1` equal to zero at the same time results in a system with no unique solution.
 
-Both :math:`y(z)` and :math:`f(z)` can be complex-valued. In the main branches of the UAMMD repository only :math:`\alpha_2,\beta_2` can be complex numbers (the rest of the parameters, including :math:`k`, must be real valued). The branch complex_bvp contains an adaptation of the solver that allows every parameter to be complex. As a side note, should the need arise in the future, it would be fairly easy to modify the solver as to make the type of the parameters a template argument.
-
+Both :math:`y(z)` and :math:`f(z)` can be real or complex-valued as well as :math:`\alpha_2,\beta_2,k` and any other parameter.
 .. hint::
 
    Initialization of the batched BVP solver requires inverting a dense matrix for each value of :math:`k`, which can become quite expensive. The solver tries to mitigate this cost by inverting these matrices in parallel, but experience suggests that letting it use more than 4 cores is counter-productive.
@@ -37,10 +36,9 @@ Usage
 ------
 
 Every function and class in the BVP solver source code lies under the :cpp:`uammd::BVP` namespace.
-The BVP solver library exposes two main classes:
-
+The BVP solver library exposes two main template classes , designed to manage both real and complex-valued numbers. To ensure consistent behavior and avoid unexpected results, it is recommended to use the complex number types defined in the Thrust library, specifically thrust::complex<real>, when working with complex numbers.
 
-.. cpp:class:: BatchedBVPHandler
+.. cpp:class:: template<class T> BatchedBVPHandler<T>
 
    Used to initialize and hold the information for a group of boundary value problems. Each subproblem can have different parameters (mainly :math:`\alpha_n,\beta_n,k` in the above equations.
    This class cannot be used in device code, its only used for initialization. See :cpp:any:`BatchedBVPGPUSolver`.
@@ -56,14 +54,14 @@ The BVP solver library exposes two main classes:
 		  :param H: The location of the boundary conditions (goes from -H to H).
 		  :param nz: Number of elements of a subproblem (all subproblems must have the same size).
 
-   .. cpp:function::  BatchedBVPGPUSolver getGPUSolver();
+   .. cpp:function::  template<class T> BatchedBVPGPUSolver_impl<T> getGPUSolver();
 
       Provides an instance of the solver to be used in the GPU.
 
-.. cpp:class:: BatchedBVPGPUSolver
+.. cpp:class:: template<class T> BatchedBVPGPUSolver<T>
 
-   While :cpp:any:`BatchedBVPHandler` is used to initialize and store the different subproblems, this class is used to actually solve the subproblems in a CUDA kernel.
-   This class has no public constructors, the only way to get an instance to it is via :cpp:any:`BatchedBVPHandler::getGPUSolver`.
+   While :cpp:any:`BatchedBVPHandler<T>` is used to initialize and store the different subproblems, this class is used to actually solve the subproblems in a CUDA kernel.
+   This class has no public constructors, the only way to get an instance to it is via :cpp:any:`BatchedBVPHandler<T>::getGPUSolver`.
 
    .. cpp:function:: template<class T, class FnIterator, class AnIterator, class CnIterator>   __device__ void solve(int instance,		    const FnIterator& fn,			    T alpha_2, T beta_2,			    AnIterator& an,			    CnIterator& cn);
 
@@ -93,6 +91,20 @@ Initialization requires an iterator to a special type of functor that provides t
 
       Returns :math:`\alpha_1` or :math:`\beta_1`, depending on which BC this class represents.
 
+Aliases for Real and Complex Types
+----------------------------------
+
+To facilitate the use of the BVP solver with real and complex numbers, the following aliases are defined:
+
+.. code-block:: cpp
+
+    using BatchedBVPHandlerReal     = BatchedBVPHandler_impl<real>;
+    using BatchedBVPHandlerrComplex = BatchedBVPHandler_impl<thrust::complex<real>>;
+
+    using BatchedBVPGPUSolverReal    = BatchedBVPGPUSolver_impl<real>;
+    using BatchedBVPGPUSolverComplex = BatchedBVPGPUSolver_impl<thrust::complex<real>>;
+
+These aliases allow for a more intuitive and type-safe way to work with the BVP solver for different numerical types.
 
 Example
 ++++++++
diff --git a/src/Integrator/BDHI/DoublyPeriodic/DPStokesSlab.cuh b/src/Integrator/BDHI/DoublyPeriodic/DPStokesSlab.cuh
index aad075a4..58aa5dd7 100644
--- a/src/Integrator/BDHI/DoublyPeriodic/DPStokesSlab.cuh
+++ b/src/Integrator/BDHI/DoublyPeriodic/DPStokesSlab.cuh
@@ -169,7 +169,7 @@ else throw std::runtime_error("[DPStokesSlab] Can only average in direction X (0
       real gw;
       real tolerance;
       WallMode mode;
-      shared_ptr<BVP::BatchedBVPHandler<real>> bvpSolver;
+      shared_ptr<BVP::BatchedBVPHandlerReal> bvpSolver;
 
     };
 
diff --git a/src/Integrator/BDHI/DoublyPeriodic/StokesSlab/Correction.cuh b/src/Integrator/BDHI/DoublyPeriodic/StokesSlab/Correction.cuh
index c7735e3c..a0a2a07d 100644
--- a/src/Integrator/BDHI/DoublyPeriodic/StokesSlab/Correction.cuh
+++ b/src/Integrator/BDHI/DoublyPeriodic/StokesSlab/Correction.cuh
@@ -506,7 +506,7 @@ namespace uammd{
     }
 
     auto computeZeroModeBoundaryConditions(int nz, real H, WallMode mode){
-      BVP::SchurBoundaryCondition<real> bcs(nz, H);
+      BVP::SchurBoundaryConditionReal bcs(nz, H);
       if(mode == WallMode::bottom){
 	correction_ns::TopBoundaryConditions top(0, H);
 	correction_ns::BottomBoundaryConditions bot(0, H);
diff --git a/src/Integrator/BDHI/DoublyPeriodic/StokesSlab/initialization.cu b/src/Integrator/BDHI/DoublyPeriodic/StokesSlab/initialization.cu
index d16b5bf3..253a9ba0 100644
--- a/src/Integrator/BDHI/DoublyPeriodic/StokesSlab/initialization.cu
+++ b/src/Integrator/BDHI/DoublyPeriodic/StokesSlab/initialization.cu
@@ -144,7 +144,7 @@ namespace uammd{
       auto botBC = thrust::make_transform_iterator(thrust::make_counting_iterator<int>(0), botdispatch);
       int numberSystems = (nk.x/2+1)*nk.y;
       int nz = grid.cellDim.z;
-      this->bvpSolver = std::make_shared<BVP::BatchedBVPHandler<real>>(klist, topBC, botBC, numberSystems, halfH, nz);
+      this->bvpSolver = std::make_shared<BVP::BatchedBVPHandlerReal>(klist, topBC, botBC, numberSystems, halfH, nz);
       CudaCheckError();
     }
 
diff --git a/src/Interactor/DoublyPeriodic/PoissonSlab/BVPPoisson.cuh b/src/Interactor/DoublyPeriodic/PoissonSlab/BVPPoisson.cuh
index b1aea296..42c3e2d9 100644
--- a/src/Interactor/DoublyPeriodic/PoissonSlab/BVPPoisson.cuh
+++ b/src/Interactor/DoublyPeriodic/PoissonSlab/BVPPoisson.cuh
@@ -90,7 +90,7 @@ namespace uammd{
     }
 
     class BVPPoissonSlab{
-      std::shared_ptr<BVP::BatchedBVPHandler<real>> bvpSolver;
+      std::shared_ptr<BVP::BatchedBVPHandlerReal> bvpSolver;
       real2 Lxy;
       real H;
       int3 cellDim;
@@ -150,7 +150,7 @@ namespace uammd{
 						   BoundaryConditionsDispatch<BottomBoundaryConditions, decltype(klist)>(klist, H));
 
 	int numberSystems = (nk.x/2+1)*nk.y;
-	this->bvpSolver = std::make_shared<BVP::BatchedBVPHandler<real>>(klist, topBC, bottomBC, numberSystems, H, cellDim.z);
+	this->bvpSolver = std::make_shared<BVP::BatchedBVPHandlerReal>(klist, topBC, bottomBC, numberSystems, H, cellDim.z);
 	CudaCheckError();
       }
     };
diff --git a/src/misc/BoundaryValueProblem/BVPSchurComplementMatrices.cuh b/src/misc/BoundaryValueProblem/BVPSchurComplementMatrices.cuh
index 02ddc15c..d405493c 100644
--- a/src/misc/BoundaryValueProblem/BVPSchurComplementMatrices.cuh
+++ b/src/misc/BoundaryValueProblem/BVPSchurComplementMatrices.cuh
@@ -118,12 +118,12 @@ namespace uammd{
     };
 
     template <typename T>
-    class SchurBoundaryCondition{
+    class SchurBoundaryCondition_impl{
       int nz;
       real H;
       BCRows bcs;
     public:
-      SchurBoundaryCondition(int nz, real H):nz(nz), H(H), bcs(nz){}
+      SchurBoundaryCondition_impl(int nz, real H):nz(nz), H(H), bcs(nz){}
 
       template<class TopBC, class BottomBC>
       std::vector<T> computeBoundaryConditionMatrix(const TopBC &top, const BottomBC &bottom){
@@ -173,6 +173,10 @@ namespace uammd{
 
     };
 
+    using SchurBoundaryConditionReal          = SchurBoundaryCondition_impl<real>;
+    using SchurBoundaryConditionComplex       = SchurBoundaryCondition_impl<thrust::complex<real>>;
+    using SchurBoundaryConditionComplexDouble = SchurBoundaryCondition_impl<thrust::complex<double>>;
+
     template<class T>
     std::vector<T> computeSecondIntegralMatrix(T k, real H, int nz){
       std::vector<T> A(nz*nz, 0);
diff --git a/src/misc/BoundaryValueProblem/BVPSolver.cuh b/src/misc/BoundaryValueProblem/BVPSolver.cuh
index 83b4c2a7..bd23c2b3 100644
--- a/src/misc/BoundaryValueProblem/BVPSolver.cuh
+++ b/src/misc/BoundaryValueProblem/BVPSolver.cuh
@@ -63,7 +63,7 @@ namespace uammd{
 	  auto CinvA_it = memoryManager.retrieveStorage(CinvA_storage);
 	  auto CinvABmD_it = memoryManager.retrieveStorage(CinvABmD_storage);
 	  auto invA = computeInverseSecondIntegralMatrix(k, H, nz);
-	  SchurBoundaryCondition<U> bcs(nz, H);
+	  SchurBoundaryCondition_impl<U> bcs(nz, H);
 	  auto CandD = bcs.computeBoundaryConditionMatrix(top, bottom);
 	  U D[4] = {CandD[2*nz], CandD[2*nz+1], CandD[2*nz+2], CandD[2*nz+3]};
 	  auto CinvA = matmul(CandD, nz, 2, invA, nz, nz);
@@ -155,7 +155,7 @@ namespace uammd{
     }
 
     template<typename U>
-    class BoundaryValueProblemSolver{
+    class BoundaryValueProblemSolver_impl{
       detail::PentadiagonalSystemSolver<U> pent;
       detail::SubsystemSolver<U> sub;
       StorageHandle<U> waveVector;
@@ -163,7 +163,7 @@ namespace uammd{
       real H;
     public:
 
-      BoundaryValueProblemSolver(int nz, real H): nz(nz), H(H), sub(nz, H), pent(nz, H){}
+      BoundaryValueProblemSolver_impl(int nz, real H): nz(nz), H(H), sub(nz, H), pent(nz, H){}
 
       void registerRequiredStorage(StorageRegistration &mem){
 	waveVector = mem.registerStorageRequirement<U>(1);
@@ -209,21 +209,20 @@ namespace uammd{
 
     };
 
-    using BoundaryValueProblemSolverReal          = BoundaryValueProblemSolver<real>;
-    using BoundaryValueProblemSolverComplex       = BoundaryValueProblemSolver<thrust::complex<real>>;
-    using BoundaryValueProblemSolverComplexDouble = BoundaryValueProblemSolver<thrust::complex<double>>;
-
+    using BoundaryValueProblemSolverReal          = BoundaryValueProblemSolver_impl<real>;
+    using BoundaryValueProblemSolverComplex       = BoundaryValueProblemSolver_impl<thrust::complex<real>>;
+    
     template<typename U>
-    class BatchedBVPHandler;
+    class BatchedBVPHandler_impl;
 
     template<typename U>
-    struct BatchedBVPGPUSolver{
+    struct BatchedBVPGPUSolver_impl{
     private:
       int numberSystems;
-      BoundaryValueProblemSolver<U> bvpSolver;
+      BoundaryValueProblemSolver_impl<U> bvpSolver;
       char* gpuMemory;
-      friend class BatchedBVPHandler<U>;
-      BatchedBVPGPUSolver(int numberSystems, BoundaryValueProblemSolver<U> bvpSolver, char *raw):
+      friend class BatchedBVPHandler_impl<U>;
+      BatchedBVPGPUSolver_impl(int numberSystems, BoundaryValueProblemSolver_impl<U> bvpSolver, char *raw):
 	numberSystems(numberSystems), bvpSolver(bvpSolver), gpuMemory(raw){}
     public:
 
@@ -239,29 +238,28 @@ namespace uammd{
 
     };
 
-    using BatchedBVPGPUSolverReal          = BatchedBVPGPUSolver<real>;
-    using BatchedBVPGPUSolverComplex       = BatchedBVPGPUSolver<thrust::complex<real>>;
-    using BatchedBVPGPUSolverComplexDouble = BatchedBVPGPUSolver<thrust::complex<double>>;
-
+    using BatchedBVPGPUSolverReal          = BatchedBVPGPUSolver_impl<real>;
+    using BatchedBVPGPUSolverComplex       = BatchedBVPGPUSolver_impl<thrust::complex<real>>;
+    
     template <typename U>
-    class BatchedBVPHandler{
+    class BatchedBVPHandler_impl{
       int numberSystems;
-      BoundaryValueProblemSolver<U> bvp;
+      BoundaryValueProblemSolver_impl<U> bvp;
       thrust::device_vector<char> gpuMemory;
     public:
 
       template<class WaveVectorIterator, class BatchedTopBC, class BatchedBottomBC>
-      BatchedBVPHandler(const WaveVectorIterator &klist,
-			BatchedTopBC top, BatchedBottomBC bot,
-			int numberSystems, real H, int nz):
+      BatchedBVPHandler_impl(const WaveVectorIterator &klist,
+			     BatchedTopBC top, BatchedBottomBC bot,
+			     int numberSystems, real H, int nz):
 	numberSystems(numberSystems),
 	bvp(nz, H){
 	precompute(klist, top, bot);
       }
 
-      BatchedBVPGPUSolver<U> getGPUSolver(){
+      BatchedBVPGPUSolver_impl<U> getGPUSolver(){
 	auto raw = thrust::raw_pointer_cast(gpuMemory.data());
-	BatchedBVPGPUSolver<U> d_solver(numberSystems, bvp, raw);
+	BatchedBVPGPUSolver_impl<U> d_solver(numberSystems, bvp, raw);
 	return d_solver;
       }
 
@@ -286,10 +284,9 @@ namespace uammd{
 
     };
 
-    using BatchedBVPHandlerReal          = BatchedBVPHandler<real>;
-    using BatchedBVPHandlerComplex       = BatchedBVPHandler<thrust::complex<real>>;
-    using BatchedBVPHandlerComplexDouble = BatchedBVPHandler<thrust::complex<double>>;
-
+    using BatchedBVPHandlerReal          = BatchedBVPHandler_impl<real>;
+    using BatchedBVPHandlerComplex       = BatchedBVPHandler_impl<thrust::complex<real>>;
+    
   }
 
 }
diff --git a/src/misc/BoundaryValueProblem/KBPENTA.cuh b/src/misc/BoundaryValueProblem/KBPENTA.cuh
index b80a5c84..52f67c0b 100644
--- a/src/misc/BoundaryValueProblem/KBPENTA.cuh
+++ b/src/misc/BoundaryValueProblem/KBPENTA.cuh
@@ -59,8 +59,7 @@ namespace uammd{
 
     using KBPENTA_mod_real           = KBPENTA_mod<real>;
     using KBPENTA_mod_complex        = KBPENTA_mod<thrust::complex<real>>;
-    using KBPENTA_mod_complex_double = KBPENTA_mod<thrust::complex<double>>;
-
+    
   }
 }
 #endif

From 56eb7235e843f399f5d78350ec4c3111a65739af Mon Sep 17 00:00:00 2001
From: PabloIbannez <p.ibanez.fre@gmail.com>
Date: Fri, 7 Jun 2024 18:42:18 +0200
Subject: [PATCH 05/15] Made functions inline to resolve multiple definition
 issues during compilation and linking

---
 src/System/Log.h      | 2 +-
 src/System/System.h   | 8 ++++----
 src/utils/cxx_utils.h | 2 +-
 src/utils/exception.h | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/System/Log.h b/src/System/Log.h
index e2cd189d..5d9e8f62 100644
--- a/src/System/Log.h
+++ b/src/System/Log.h
@@ -18,7 +18,7 @@ namespace uammd{
     constexpr int maxLogLevel = 6;
 #endif
 
-    ElementType getLogLevelInfo(int level){
+    inline ElementType getLogLevelInfo(int level){
       static const std::map<int, ElementType> printMap{
 	{CRITICAL , std::make_tuple(stderr, "\e[101m[CRITICAL] ", "\e[0m\n")},
 	{ERROR , std::make_tuple(stderr, "\e[91m[ERROR] \e[0m", "\n")},
diff --git a/src/System/System.h b/src/System/System.h
index 4eb7cd83..a10e37c9 100644
--- a/src/System/System.h
+++ b/src/System/System.h
@@ -68,8 +68,8 @@ namespace uammd{
     SystemParameters sysPar;
     Timer tim;
 
-    void printWelcome();
-    void printFarewell();
+    inline void printWelcome();
+    inline void printFarewell();
 
     int m_argc = 0;
     char ** m_argv = nullptr;
@@ -202,7 +202,7 @@ namespace uammd{
 
   };
 
-  void System::printWelcome(){
+  inline void System::printWelcome(){
     std::string separator;
     fori(0,60) separator += "━";
     separator += "┓";
@@ -223,7 +223,7 @@ namespace uammd{
     log<System::MESSAGE>("Computation started at %s", std::ctime(&time));
   }
 
-  void System::printFarewell(){
+  inline void System::printFarewell(){
     cudaDeviceSynchronize();
     std::time_t time = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
     log<System::MESSAGE>("Computation finished at %s",std::ctime(&time));
diff --git a/src/utils/cxx_utils.h b/src/utils/cxx_utils.h
index 5dbe9a72..a44d2c3a 100644
--- a/src/utils/cxx_utils.h
+++ b/src/utils/cxx_utils.h
@@ -27,7 +27,7 @@ namespace uammd{
   }
   namespace printUtils{
     //Transform size in bytes to a pretty string in B, KB, MB...
-    std::string prettySize(size_t size) {
+    inline std::string prettySize(size_t size) {
       static const char *SIZES[] = { "B", "KB", "MB", "GB" };
       uint div = 0;
       size_t rem = 0;
diff --git a/src/utils/exception.h b/src/utils/exception.h
index 5b424531..1c68172f 100644
--- a/src/utils/exception.h
+++ b/src/utils/exception.h
@@ -10,7 +10,7 @@
 namespace uammd{
   using exception = std::exception;
 
-  void backtrace_nested_exception(const uammd::exception& e, int level =  0){
+  inline void backtrace_nested_exception(const uammd::exception& e, int level =  0){
 
     Logging::log<Logging::EXCEPTION>(std::string(level,' ') + "level " +
 				     std::to_string(level) + " exception: " + e.what());

From 40e6fa017e64b7ca2cec1652b54575ae13a31dfd Mon Sep 17 00:00:00 2001
From: PabloIbannez <p.ibanez.fre@gmail.com>
Date: Tue, 18 Jun 2024 19:01:17 +0200
Subject: [PATCH 06/15] Now preable.h path can be given by its path

---
 src/uammd.cuh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/uammd.cuh b/src/uammd.cuh
index e3fa8639..fc402041 100644
--- a/src/uammd.cuh
+++ b/src/uammd.cuh
@@ -9,7 +9,11 @@
 #include "System/System.h"
 
 #ifdef UAMMD_EXTENSIONS
-#include "../extensions/preamble.h"
+  #ifdef UAMMD_EXTENSIONS_PREAMBLE
+    #include UAMMD_EXTENSIONS_PREAMBLE
+  #else
+    #include "../extensions/preamble.h"
+  #endif
 #endif
 
 #include "ParticleData/ParticleData.cuh"

From 17492dd6a82a39ef1dda81274285f0cbf0954541 Mon Sep 17 00:00:00 2001
From: PabloIbannez <p.ibanez.fre@gmail.com>
Date: Tue, 18 Jun 2024 19:19:53 +0200
Subject: [PATCH 07/15] Fixed extensions include

---
 src/uammd.cuh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/uammd.cuh b/src/uammd.cuh
index fc402041..07fbcaa2 100644
--- a/src/uammd.cuh
+++ b/src/uammd.cuh
@@ -8,9 +8,13 @@
 #include "global/defines.h"
 #include "System/System.h"
 
+// Helper macro to handle indirect inclusion
+#define STRINGIFY(x) #x
+#define INCLUDE_FILE(x) STRINGIFY(x)
+
 #ifdef UAMMD_EXTENSIONS
   #ifdef UAMMD_EXTENSIONS_PREAMBLE
-    #include UAMMD_EXTENSIONS_PREAMBLE
+    #include INCLUDE_FILE(UAMMD_EXTENSIONS_PREAMBLE)
   #else
     #include "../extensions/preamble.h"
   #endif

From 7ef9b4fbd397a237aa284783537772268cb64835 Mon Sep 17 00:00:00 2001
From: PabloIbannez <p.ibanez.fre@gmail.com>
Date: Wed, 19 Jun 2024 16:04:19 +0200
Subject: [PATCH 08/15] Made functions inline to resolve multiple definition
 issues during compilation and linking

---
 src/ParticleData/ParticleGroup.cuh | 5 +++--
 src/utils/ParticleSorter.cuh       | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/ParticleData/ParticleGroup.cuh b/src/ParticleData/ParticleGroup.cuh
index 0318a696..1122ab74 100644
--- a/src/ParticleData/ParticleGroup.cuh
+++ b/src/ParticleData/ParticleGroup.cuh
@@ -135,7 +135,7 @@ namespace uammd{
 
   namespace ParticleGroup_ns{
     //Updates the indices of the particles in a group using pd->getIdOrderedIndices()
-    __global__ void updateGroupIndices(//An array that stores the indices of the particles in the group per id.
+    inline __global__ void updateGroupIndices(//An array that stores the indices of the particles in the group per id.
 				       const int * __restrict__ id2index,
 				       //Out: the current ParticleData indices of the particles in the group
 				       int * __restrict__ particlesIndices,
@@ -511,7 +511,8 @@ namespace uammd{
   }
 
   namespace ParticleGroup_ns{
-    __global__  void IdsFromIndices(const int *indices, const int *index2Id, int* groupParticleIds, int N){
+
+    inline __global__  void IdsFromIndices(const int *indices, const int *index2Id, int* groupParticleIds, int N){
       int tid = blockIdx.x*blockDim.x + threadIdx.x;
       if(tid>=N) return;
       int index = indices[tid];
diff --git a/src/utils/ParticleSorter.cuh b/src/utils/ParticleSorter.cuh
index b50a826e..1e968410 100644
--- a/src/utils/ParticleSorter.cuh
+++ b/src/utils/ParticleSorter.cuh
@@ -79,7 +79,7 @@ namespace uammd{
       }
     };
 
-    int clz(uint n){
+    inline int clz(uint n){
       n |= (n >>  1);
       n |= (n >>  2);
       n |= (n >>  4);

From d1c37f9b9069b5055801ded690a12a811bf56214 Mon Sep 17 00:00:00 2001
From: PabloIbannez <p.ibanez.fre@gmail.com>
Date: Wed, 19 Jun 2024 16:42:09 +0200
Subject: [PATCH 09/15] Made functions inline to resolve multiple definition
 issues during compilation and linking

---
 src/ParticleData/ParticleGroup.cuh | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/ParticleData/ParticleGroup.cuh b/src/ParticleData/ParticleGroup.cuh
index 1122ab74..f246e1f6 100644
--- a/src/ParticleData/ParticleGroup.cuh
+++ b/src/ParticleData/ParticleGroup.cuh
@@ -372,7 +372,7 @@ namespace uammd{
   };
 
   template<class ParticleSelector>
-  ParticleGroup::ParticleGroup(ParticleSelector selector,
+  inline ParticleGroup::ParticleGroup(ParticleSelector selector,
 			       std::shared_ptr<ParticleData> pd, std::string name):
     pd(pd), sys(pd->getSystem()), name(name){
     sys->log<System::MESSAGE>("[ParticleGroup] Group %s created with selector %s",
@@ -398,7 +398,7 @@ namespace uammd{
 
   //Specialization of a particle group with an All selector
   template<>
-  ParticleGroup::ParticleGroup(particle_selector::All selector,
+  inline ParticleGroup::ParticleGroup(particle_selector::All selector,
 			       std::shared_ptr<ParticleData> pd,
 			       std::string name):
     pd(pd), sys(pd->getSystem()), name(name){
@@ -411,13 +411,13 @@ namespace uammd{
 			      name.c_str(), numberParticles);
   }
 
-  ParticleGroup::ParticleGroup(std::shared_ptr<ParticleData> pd,
+  inline ParticleGroup::ParticleGroup(std::shared_ptr<ParticleData> pd,
 			       std::string name):
     ParticleGroup(particle_selector::All(), pd, name){}
 
   //Specialization of an empty particle group
   template<>
-  ParticleGroup::ParticleGroup(particle_selector::None selector,
+  inline ParticleGroup::ParticleGroup(particle_selector::None selector,
 			       std::shared_ptr<ParticleData> pd,
 			       std::string name):
     pd(pd), sys(pd->getSystem()), name(name){
@@ -430,7 +430,7 @@ namespace uammd{
 
   //Constructor of ParticleGroup when an ID list is provided
   template<class InputIterator>
-  ParticleGroup::ParticleGroup(InputIterator begin, InputIterator end,
+  inline ParticleGroup::ParticleGroup(InputIterator begin, InputIterator end,
 			       std::shared_ptr<ParticleData> pd,
 			       std::string name):
     pd(pd), sys(pd->getSystem()), name(name){
@@ -455,7 +455,7 @@ namespace uammd{
 
   //This is trivial with  pd->getIdOrderedIndices()!
   //Handle a reordering of the particles (which invalids the previous relation between IDs and indices)
-  void ParticleGroup::computeIndexList(bool forceUpdate){
+  inline void ParticleGroup::computeIndexList(bool forceUpdate){
     if(numberParticles==0) return;
     if(this->needsIndexListUpdate || forceUpdate){//Update only if needed
       sys->log<System::DEBUG>("[ParticleGroup] Updating group %s after last particle sorting", name.c_str());
@@ -474,7 +474,7 @@ namespace uammd{
     }
   }
   //Add particles to the group via an array with ids
-  void ParticleGroup::addParticlesById(access::location loc, const int *ids, int N){
+  inline void ParticleGroup::addParticlesById(access::location loc, const int *ids, int N){
     sys->log<System::DEBUG1>("[ParticleGroup] Adding %d particles to group %s via ids!", N, name.c_str());
     int numberParticlesPrev = numberParticles;
     numberParticles += N;
@@ -522,7 +522,7 @@ namespace uammd{
 
   }
   //Add particles to the group via an array with the current indices of the particles in pd (faster)
-  void ParticleGroup::addParticlesByCurrentIndex(access::location loc, const int *indices, int N){
+  inline void ParticleGroup::addParticlesByCurrentIndex(access::location loc, const int *indices, int N){
     sys->log<System::DEBUG1>("[ParticleGroup] Adding %d particles to group %s via indices!", N, name.c_str());
     if(N==0) return;
     int numberParticlesPrev = numberParticles;

From 7cc88e5a2d53e9affd452cc5543363f0794443ff Mon Sep 17 00:00:00 2001
From: PabloIbannez <p.ibanez.fre@gmail.com>
Date: Wed, 19 Jun 2024 16:53:22 +0200
Subject: [PATCH 10/15] Made functions inline to resolve multiple definition
 issues during compilation and linking

---
 src/ParticleData/ParticleData.cuh | 6 +++---
 src/utils/Grid.cuh                | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/ParticleData/ParticleData.cuh b/src/ParticleData/ParticleData.cuh
index 2bbe2ccd..33630fee 100644
--- a/src/ParticleData/ParticleData.cuh
+++ b/src/ParticleData/ParticleData.cuh
@@ -316,7 +316,7 @@ namespace uammd{
 #define INIT_PROPERTIES_T(NAME, name) ,  name(BOOST_PP_STRINGIZE(NAME), sys)
 #define INIT_PROPERTIES(r,data, tuple) INIT_PROPERTIES_T(PROPNAME_CAPS(tuple), PROPNAME(tuple))
 
-  ParticleData::ParticleData(int numberParticles, shared_ptr<System> sys):
+  inline ParticleData::ParticleData(int numberParticles, shared_ptr<System> sys):
     numberParticles(numberParticles),
     originalOrderIndexCPUNeedsUpdate(true),
     sys(sys)
@@ -334,7 +334,7 @@ namespace uammd{
   }
 
   //Sort the particles to improve data locality
-  void ParticleData::sortParticles(cudaStream_t st=0){
+  inline void ParticleData::sortParticles(cudaStream_t st=0){
     sys->log<System::DEBUG>("[ParticleData] Sorting particles...");
 
     {
@@ -364,7 +364,7 @@ namespace uammd{
 
 
 
-  void ParticleData::changeNumParticles(int Nnew){
+  inline void ParticleData::changeNumParticles(int Nnew){
     sys->log<System::CRITICAL>("[ParticleData] CHANGE PARTICLES FUNCTIONALITY NOT IMPLEMENTED YET!!!");
     sys->log<System::DEBUG>("[ParticleData] Adding/Removing particles...");
     this->numberParticles = Nnew;
diff --git a/src/utils/Grid.cuh b/src/utils/Grid.cuh
index 5405b50e..243d29f1 100644
--- a/src/utils/Grid.cuh
+++ b/src/utils/Grid.cuh
@@ -123,7 +123,7 @@ namespace uammd{
   };
 
   //Looks for the closest (equal or greater) number of nodes of the form 2^a*3^b*5^c*7^d*11^e
-  int3 nextFFTWiseSize3D(int3 size){
+  inline int3 nextFFTWiseSize3D(int3 size){
     int* cdim = &size.x;
     using integral = uint64_t;
     integral max_dim = std::max({size.x, size.y, size.z});

From e3db524de131a772e1bee616dbfb3d6b4f317336 Mon Sep 17 00:00:00 2001
From: PabloIbannez <p.ibanez.fre@gmail.com>
Date: Sun, 23 Jun 2024 15:08:28 +0200
Subject: [PATCH 11/15] Made functions inline to resolve multiple definition
 issues during compilation and linking

---
 src/Interactor/NeighbourList/BasicList/BasicListBase.cuh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Interactor/NeighbourList/BasicList/BasicListBase.cuh b/src/Interactor/NeighbourList/BasicList/BasicListBase.cuh
index 568213a7..71e6426a 100644
--- a/src/Interactor/NeighbourList/BasicList/BasicListBase.cuh
+++ b/src/Interactor/NeighbourList/BasicList/BasicListBase.cuh
@@ -34,6 +34,7 @@ This class maintains a CellList and uses it to generate a neighbour list. It sho
 
 namespace uammd{
   namespace BasicNeighbourList_ns{
+    inline
     __global__ void fillBasicNeighbourList(CellList_ns::NeighbourContainer ni,
 			      int *neighbourList, int* numberNeighbours,
 			      int maxNeighboursPerParticle,

From e96f1a91c1d285b6222ea76c731462ea34d953fa Mon Sep 17 00:00:00 2001
From: PabloIbannez <p.ibanez.fre@gmail.com>
Date: Tue, 25 Jun 2024 11:44:11 +0200
Subject: [PATCH 12/15] Made functions inline to resolve multiple definition
 issues during compilation and linking

---
 src/Integrator/BDHI/BDHI_FCM.cu               | 13 ++++++++++---
 src/Integrator/BDHI/FCM/FCM_impl.cuh          | 17 +++++++++++++++++
 src/Integrator/BDHI/FCM/utils.cuh             |  7 +++++++
 src/Integrator/VerletNVE.cu                   |  9 +++++++++
 src/Interactor/Potential/DPD.cuh              |  2 ++
 src/misc/IBM_kernels.cuh                      |  5 ++++-
 src/misc/LanczosAlgorithm/LanczosAlgorithm.cu |  6 ++++++
 src/utils/cublasDebug.h                       |  2 +-
 src/utils/cufftDebug.h                        |  1 +
 src/utils/cufftPrecisionAgnostic.h            |  9 +++++++++
 src/utils/curandDebug.h                       |  1 +
 11 files changed, 67 insertions(+), 5 deletions(-)

diff --git a/src/Integrator/BDHI/BDHI_FCM.cu b/src/Integrator/BDHI/BDHI_FCM.cu
index 61287b01..4d2e6a14 100644
--- a/src/Integrator/BDHI/BDHI_FCM.cu
+++ b/src/Integrator/BDHI/BDHI_FCM.cu
@@ -4,6 +4,7 @@
 #include"BDHI_FCM.cuh"
 namespace uammd{
   namespace BDHI{
+    inline
     auto FCMIntegrator::computeHydrodynamicDisplacements(){
       auto pos = pd->getPos(access::location::gpu, access::mode::read);
       auto force = pd->getForce(access::location::gpu, access::mode::read);
@@ -15,6 +16,7 @@ namespace uammd{
 						   temperature, 1.0/sqrt(dt), st);
     }
 
+    inline
     void FCMIntegrator::updateInteractors(){
       for(auto forceComp: interactors) forceComp->updateSimulationTime(steps*dt);
       if(steps==1){
@@ -26,6 +28,7 @@ namespace uammd{
       }
     }
 
+    inline
     void FCMIntegrator::resetForces(){
       int numberParticles = pg->getNumberParticles();
       auto force = pd->getForce(access::location::gpu, access::mode::write);
@@ -34,6 +37,7 @@ namespace uammd{
       CudaCheckError();
     }
 
+    inline
     void FCMIntegrator::resetTorques(){
       int numberParticles = pg->getNumberParticles();
       auto torque = pd->getTorque(access::location::gpu, access::mode::write);
@@ -42,6 +46,7 @@ namespace uammd{
       CudaCheckError();
     }
 
+    inline
     void FCMIntegrator::computeCurrentForces(){
       resetForces();
       if (pd->isDirAllocated()) resetTorques();
@@ -56,6 +61,7 @@ namespace uammd{
       /*With all the terms computed, update the positions*/
       /*T=0 case is templated*/
       template<class IndexIterator>
+      inline
       __global__ void integrateEulerMaruyamaD(real4* pos,
 					      real4* dir,
 					      IndexIterator indexIterator,
@@ -75,7 +81,7 @@ namespace uammd{
 	/*Write to global memory*/
 	pos[i] = make_real4(p,c);
 	/*Update the orientation*/
-	if(dir){ 
+	if(dir){
 	  Quat dirc = dir[i];
 	  //printf("W %f %f %f\n", angularV[id].x, angularV[id].y, angularV[id].z);
 	  //printf("V %f %f %f\n", linearV[id].x, linearV[id].y, linearV[id].z);
@@ -86,6 +92,7 @@ namespace uammd{
       }
     }
 
+    inline
     void FCMIntegrator::forwardTime(){
       steps++;
       sys->log<System::DEBUG1>("[BDHI::FCM] Performing integration step %d", steps);
@@ -97,12 +104,12 @@ namespace uammd{
       auto angularVelocities = disp.second;
       auto indexIter = pg->getIndexIterator(access::location::gpu);
       auto pos = pd->getPos(access::location::gpu, access::mode::readwrite);
-      auto dir = pd->getDirIfAllocated(access::location::gpu, access::mode::readwrite);      
+      auto dir = pd->getDirIfAllocated(access::location::gpu, access::mode::readwrite);
       real3* d_linearV = thrust::raw_pointer_cast(linearVelocities.data());
       real3* d_angularV = dir.raw()?thrust::raw_pointer_cast(angularVelocities.data()):nullptr;
       int BLOCKSIZE = 128; /*threads per block*/
       int nthreads = BLOCKSIZE<numberParticles?BLOCKSIZE:numberParticles;
-      int nblocks = numberParticles/nthreads + ((numberParticles%nthreads!=0)?1:0);      
+      int nblocks = numberParticles/nthreads + ((numberParticles%nthreads!=0)?1:0);
       FCM_ns::integrateEulerMaruyamaD<<<nblocks, nthreads, 0, st>>>(pos.raw(),
 								    dir.raw(),
 								    indexIter,
diff --git a/src/Integrator/BDHI/FCM/FCM_impl.cuh b/src/Integrator/BDHI/FCM/FCM_impl.cuh
index 989b0ba3..36fd9890 100644
--- a/src/Integrator/BDHI/FCM/FCM_impl.cuh
+++ b/src/Integrator/BDHI/FCM/FCM_impl.cuh
@@ -224,6 +224,7 @@ namespace uammd{
       };
 
       template<class IterPos, class IterForce, class Kernel>
+      inline
       cached_vector<real3> spreadForces(IterPos& pos, IterForce& force,
 					int numberParticles,
 					std::shared_ptr<Kernel> kernel,
@@ -248,6 +249,7 @@ namespace uammd{
       };
 
       template<class T, class T3, class Container>
+      inline
       auto getCoordinateVector(Container &v, int coord){
 	cached_vector<T> v_a(v.size());
 	T* ptr= (T*)thrust::raw_pointer_cast(v.data());
@@ -264,7 +266,9 @@ namespace uammd{
 	  return {thrust::get<0>(a), thrust::get<1>(a),thrust::get<2>(a)};
 	}
       };
+
       template<class T3, class Container>
+      inline
       auto interleave(Container &a, Container &b, Container &c){
 	auto zip = thrust::make_zip_iterator(thrust::make_tuple(a.begin(), b.begin(), c.begin()));
 	cached_vector<T3> res(a.size());
@@ -272,6 +276,7 @@ namespace uammd{
 	return res;
       }
 
+      inline
       cached_vector<complex3> forwardTransform(cached_vector<real3>& gridReal,
 						    int3 n,
 						    cufftHandle plan, cudaStream_t st){
@@ -286,6 +291,7 @@ namespace uammd{
       }
 
 
+      inline
       __global__ void addTorqueCurl(complex3 *gridTorquesFourier, complex3* gridVelsFourier, Grid grid){
 	int id = blockDim.x*blockIdx.x + threadIdx.x;
 	const int3 nk = grid.cellDim;
@@ -307,6 +313,7 @@ namespace uammd{
 
 
       template<class IterPos, class IterTorque, class Kernel>
+      inline
       void addSpreadTorquesFourier(IterPos& pos, IterTorque& torque, int numberParticles,
 				   Grid grid,
 				   std::shared_ptr<Kernel> kernel,
@@ -339,6 +346,7 @@ namespace uammd{
 	 Output:gridVels = B·FFTf·S·F -> B \propto (I-k^k/|k|^2)
        */
       /*A thread per fourier node*/
+      inline
       __global__ void forceFourier2Vel(const complex3 * gridForces, /*Input array*/
 				       complex3 * gridVels, /*Output array, can be the same as input*/
 				       real vis,
@@ -361,6 +369,7 @@ namespace uammd{
         gridVels[id] = projectFourier(k2, dk, factor)*(B/real(ncells.x*ncells.y*ncells.z));
       }
 
+      inline
       void convolveFourier(cached_vector<complex3>& gridVelsFourier, real viscosity, Grid grid, cudaStream_t st){
 	System::log<System::DEBUG2>("[BDHI::FCM] Wave space velocity scaling");
 	/*Scale the wave space grid forces, transforming in velocities -> B·FFT·S·F*/
@@ -379,6 +388,7 @@ namespace uammd{
 	This kernel gets v_k = gridVelsFourier = B·FFtt·S·F as input and adds 1/√σ·√B(k)·dWw.
 	Keeping special care that v_k = v*_{N-k}, which implies that dWw_k = dWw*_{N-k}
       */
+      inline
       __global__ void fourierBrownianNoise(complex3 * gridVelsFourier,
 					   Grid grid,
 					   real prefactor,/* sqrt(2·T/dt)*/
@@ -451,6 +461,7 @@ namespace uammd{
 	}
 }
 
+      inline
       void addBrownianNoise(cached_vector<complex3>& gridVelsFourier,
 			    real temperature, real viscosity, real prefactor,
 			    uint seed,
@@ -487,6 +498,7 @@ namespace uammd{
 	}
       }
 
+      inline
       cached_vector<real3> inverseTransform(cached_vector<complex3>& gridFourier,
 					    int3 n, cufftHandle plan, cudaStream_t st){
 	int nx = 2*(n.x/2+1);
@@ -501,6 +513,7 @@ namespace uammd{
       }
 
       template<class IterPos, class Kernel>
+      inline
       cached_vector<real3> interpolateVelocity(IterPos& pos, cached_vector<real3>& gridVels,
 					       Grid grid, std::shared_ptr<Kernel> kernel,
 					       int numberParticles, cudaStream_t st){
@@ -524,6 +537,7 @@ namespace uammd{
       // = 0.5( i*k_y*V_z - i*k_z(V_y), i*k_z(V_x) - i*k_x*V_z, i*k_x*V_y - i*k_y*V_x)
       //Overwrite the output vector with the angular velocities in Fourier space
       //The input velocity vector is overwritten
+      inline
       __global__ void computeVelocityCurlFourier(const complex3 *gridVelsFourier,
 						 complex3* gridAngVelsFourier,
 						 Grid grid){
@@ -549,6 +563,7 @@ namespace uammd{
 	gridAngVelsFourier[id] = gridAng;
       }
 
+      inline
       cached_vector<complex3> computeGridAngularVelocityFourier(cached_vector<complex3>& gridVelsFourier,
 								Grid grid,  cudaStream_t st){
 	const int3 n = grid.cellDim;
@@ -565,6 +580,7 @@ namespace uammd{
       }
 
       template<class IterPos, class Kernel>
+      inline
       cached_vector<real3> interpolateAngularVelocity(IterPos& pos,
 						      cached_vector<real3>& gridAngVels,
 						      Grid grid,
@@ -584,6 +600,7 @@ namespace uammd{
     }
 
     template<class Kernel, class KernelTorque>
+    inline
     std::pair<cached_vector<real3>, cached_vector<real3>>
     FCM_impl<Kernel, KernelTorque>::computeHydrodynamicDisplacements(real4* pos,
 								     real4* force, real4* torque,
diff --git a/src/Integrator/BDHI/FCM/utils.cuh b/src/Integrator/BDHI/FCM/utils.cuh
index c3d628ab..00cddd2d 100644
--- a/src/Integrator/BDHI/FCM/utils.cuh
+++ b/src/Integrator/BDHI/FCM/utils.cuh
@@ -22,6 +22,7 @@ namespace uammd{
       using complex = cufftComplex_t<real>;
       using complex3 = cufftComplex3_t<real>;
 
+      inline
       __device__ int3 indexToWaveNumber(int i, int3 nk){
 	int ikx = i%(nk.x/2+1);
 	int iky = (i/(nk.x/2+1))%nk.y;
@@ -32,10 +33,12 @@ namespace uammd{
 	return make_int3(ikx, iky, ikz);
       }
 
+      inline
       __device__ real3 waveNumberToWaveVector(int3 ik, real3 L){
 	return (real(2.0)*real(M_PI)/L)*make_real3(ik.x, ik.y, ik.z);
       }
 
+      inline
       __device__ real3 getGradientFourier(int3 ik, int3 nk, real3 L){
 	const bool isUnpairedX = ik.x == (nk.x - ik.x);
 	const bool isUnpairedY = ik.y == (nk.y - ik.y);
@@ -55,6 +58,7 @@ namespace uammd{
 	  unpaired modes set to zero
 	fr is the factor to project
       */
+      inline
       __device__ real3 projectFourier(real k2, real3 dk, real3 fr){
 	const real invk2 = real(1.0)/k2;
 	real3 vr = fr - dk*dot(fr, dk*invk2);
@@ -68,6 +72,7 @@ namespace uammd{
 	  unpaired modes set to zero
 	fr is the factor to project
       */
+      inline
       __device__ complex3 projectFourier(real k2, real3 dk, complex3 factor){
 	real3 re = projectFourier(k2, dk, make_real3(factor.x.x, factor.y.x, factor.z.x));
 	real3 imag = projectFourier(k2, dk, make_real3(factor.x.y, factor.y.y, factor.z.y));
@@ -77,6 +82,7 @@ namespace uammd{
 
       /*Compute gaussian complex noise dW, std = prefactor -> ||z||^2 = <x^2>/sqrt(2)+<y^2>/sqrt(2) = prefactor*/
       /*A complex random number for each direction*/
+      inline
       __device__ complex3 generateNoise(real prefactor, uint id, uint seed1, uint seed2){	  //Uncomment to use uniform numbers instead of gaussian
 	Saru saru(id, seed1, seed2);
 	complex3 noise;
@@ -91,6 +97,7 @@ namespace uammd{
 	return noise;
       }
 
+      inline
       __device__ bool isNyquistWaveNumber(int3 cell, int3 ncells){
 	/*Beware of nyquist points! They only appear with even cell dimensions
 	  There are 8 nyquist points at most (cell=0,0,0 is excluded at the start of the kernel)
diff --git a/src/Integrator/VerletNVE.cu b/src/Integrator/VerletNVE.cu
index 8689e602..698eb7fe 100644
--- a/src/Integrator/VerletNVE.cu
+++ b/src/Integrator/VerletNVE.cu
@@ -30,6 +30,7 @@
 
 namespace uammd{
 
+  inline
   VerletNVE::VerletNVE(shared_ptr<ParticleGroup> pg, VerletNVE::Parameters par):
     Integrator(pg, "VerletNVE"),
     dt(par.dt), energy(par.energy), is2D(par.is2D),
@@ -56,6 +57,7 @@ namespace uammd{
     CudaSafeCall(cudaStreamCreate(&stream));
   }
 
+  inline
   VerletNVE::~VerletNVE(){
     cudaStreamDestroy(stream);
   }
@@ -88,6 +90,7 @@ namespace uammd{
       }
   }
 
+  inline
   void VerletNVE::initializeVelocities(){
     //In the first step, compute energy in the system
     //in order to adapt the initial kinetic energy to match the input total energy
@@ -131,6 +134,7 @@ namespace uammd{
   }
 
   template<int step>
+  inline
   void VerletNVE::callIntegrate(){
     int numberParticles = pg->getNumberParticles();
     int Nthreads = 128;
@@ -152,6 +156,7 @@ namespace uammd{
 								       numberParticles, dt, is2D);
   }
 
+  inline
   void VerletNVE::resetForces(){
     int numberParticles = pg->getNumberParticles();
     auto force = pd->getForce(access::location::gpu, access::mode::write);
@@ -159,6 +164,7 @@ namespace uammd{
     thrust::fill(thrust::cuda::par.on(stream), force_gr, force_gr + numberParticles, real4());
   }
 
+  inline
   void VerletNVE::firstStepPreparation(){
     if(initVelocities){
       initializeVelocities();
@@ -173,6 +179,7 @@ namespace uammd{
   }
 
   //Move the particles in my group 1 dt in time.
+  inline
   void VerletNVE::forwardTime(){
     steps++;
     sys->log<System::DEBUG1>("[VerletNVE] Performing integration step %d", steps);
@@ -191,6 +198,7 @@ namespace uammd{
 
   namespace verletnve_ns{
     template<class VelocityIterator, class MassIterator, class EnergyIterator>
+    inline
     __global__ void sumKineticEnergy(const VelocityIterator vel,
 				     EnergyIterator energy,
 				     const MassIterator mass,
@@ -203,6 +211,7 @@ namespace uammd{
   };
 
 
+  inline
   real VerletNVE::sumEnergy(){
     int numberParticles = pg->getNumberParticles();
     auto vel_gr = pg->getPropertyIterator(pd->getVel(access::location::gpu, access::mode::read));
diff --git a/src/Interactor/Potential/DPD.cuh b/src/Interactor/Potential/DPD.cuh
index 6a8f544c..7860ec76 100644
--- a/src/Interactor/Potential/DPD.cuh
+++ b/src/Interactor/Potential/DPD.cuh
@@ -168,11 +168,13 @@ namespace uammd{
     };
 
     template<>
+    inline
     void DPD_impl<DefaultDissipation>::printGamma(){
       System::log<System::MESSAGE>("[Potential::DPD] gamma: %f", gamma.gamma);
     }
 
     template<class T>
+    inline
     void DPD_impl<T>::printGamma(){
       System::log<System::MESSAGE>("[Potential::DPD] Using %s for dissipation", type_name<T>().c_str());
     }
diff --git a/src/misc/IBM_kernels.cuh b/src/misc/IBM_kernels.cuh
index 22ba9214..904ebd23 100644
--- a/src/misc/IBM_kernels.cuh
+++ b/src/misc/IBM_kernels.cuh
@@ -41,6 +41,7 @@ namespace uammd{
 
       //Sum all values in a container using Kahan Summation, which increases floating point accuracy
       template<class Container>
+      inline
       auto kahanSum(Container &v){
 	auto c = v[0]*0;
 	auto sum = c;
@@ -55,6 +56,7 @@ namespace uammd{
 
       //Integrate the function foo(x) from x=rmin to x=rmax using the Simpson rule with Nr intervals
       template<class Foo>
+      inline
       auto integrate(Foo foo, real rmin, real rmax, int Nr){
 	using T = decltype(foo(rmin));
 	if(Nr%2 == 1) Nr++; //Need an even number of points
@@ -74,6 +76,7 @@ namespace uammd{
     }
 
     //[1] Taken from https://arxiv.org/pdf/1712.04732.pdf
+    inline
     __host__ __device__ real BM(real zz, real alpha, real beta){
       const real z = zz/alpha;
       const real z2 = z*z;
@@ -114,7 +117,7 @@ namespace uammd{
 	const real invh;
 	static constexpr int support = 3;
 	threePoint(real h):invh(1.0/h){}
-	__host__ __device__ real phi(real rr, real3 pos = real3()) const{
+	inline __host__ __device__ real phi(real rr, real3 pos = real3()) const{
 	  const real r = fabs(rr)*invh;
 	  if(r<real(0.5)){
 	    constexpr real onediv3 = real(1/3.0);
diff --git a/src/misc/LanczosAlgorithm/LanczosAlgorithm.cu b/src/misc/LanczosAlgorithm/LanczosAlgorithm.cu
index 015b0d87..fa198513 100644
--- a/src/misc/LanczosAlgorithm/LanczosAlgorithm.cu
+++ b/src/misc/LanczosAlgorithm/LanczosAlgorithm.cu
@@ -193,16 +193,19 @@ namespace uammd{
       };
     }
 
+    inline
     Solver::Solver():
       check_convergence_steps(3){
       //Init cuBLAS for Lanczos process
       CublasSafeCall(cublasCreate_v2(&cublas_handle));
     }
 
+    inline
     Solver::~Solver(){
       CublasSafeCall(cublasDestroy(cublas_handle));
     }
 
+    inline
     real Solver::runIterations(MatrixDot *dot, real *Bz, const real*z, int numberIterations, int N){
       oldBz.resize((N+1), real());
       /*Lanczos iterations for Krylov decomposition*/
@@ -221,6 +224,7 @@ namespace uammd{
       return currentResidual;
     }
 
+    inline
     int Solver::run(MatrixDot *dot, real *Bz, const real*z, real tolerance, int N, cudaStream_t st){
       oldBz.resize((N+1));
       detail::device_fill(oldBz.begin(), oldBz.end(), real());
@@ -247,6 +251,7 @@ namespace uammd{
       throw std::runtime_error("[Lanczos] Could not converge");
     }
 
+    inline
     real Solver::computeError(real *Bz, int N, int iter){
       /*Compute error as in eq 27 in [1]
 	Error = ||Bz_i - Bz_{i-1}||_2 / ||Bz_{i-1}||_2
@@ -272,6 +277,7 @@ namespace uammd{
       return Error;
     }
 
+    inline
     void Solver::registerRequiredStepsForConverge(int steps_needed){
       this->lastRunRequiredSteps = steps_needed;
       if(steps_needed-2 > check_convergence_steps){
diff --git a/src/utils/cublasDebug.h b/src/utils/cublasDebug.h
index 9293087a..a50b14ef 100644
--- a/src/utils/cublasDebug.h
+++ b/src/utils/cublasDebug.h
@@ -8,7 +8,7 @@
 #define CublasSafeCall(err) __cublasSafeCall(err, __FILE__, __LINE__)
 
 
-
+inline
 const char* cublasGetErrorString(cublasStatus_t status){
   switch(status){
   case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
diff --git a/src/utils/cufftDebug.h b/src/utils/cufftDebug.h
index 07e63abc..3056bb62 100644
--- a/src/utils/cufftDebug.h
+++ b/src/utils/cufftDebug.h
@@ -12,6 +12,7 @@
 #define CufftSafeCall(err) __cufftSafeCall(err, __FILE__, __LINE__)
 
 namespace uammd{
+  inline
   const char* cufftGetErrorString(cufftResult_t err){
     switch (err) {
     case CUFFT_INVALID_PLAN:   return "CUFFT_INVALID_PLAN\n";
diff --git a/src/utils/cufftPrecisionAgnostic.h b/src/utils/cufftPrecisionAgnostic.h
index 953778b2..e5b6f6b0 100644
--- a/src/utils/cufftPrecisionAgnostic.h
+++ b/src/utils/cufftPrecisionAgnostic.h
@@ -28,14 +28,17 @@ namespace uammd{
   template<> struct CUFFT_Real2Complex<float>{static constexpr cufftType value=CUFFT_R2C;};
 
   template<class prec>
+  inline
   cufftResult cufftExecReal2Complex(cufftHandle &plan, cufftReal_t<prec>* d_in, cufftComplex_t<prec> *d_out);
 
   template<>
+  inline
   cufftResult cufftExecReal2Complex<float>(cufftHandle &plan, cufftReal_t<float>* d_in, cufftComplex_t<float> *d_out){
     return cufftExecR2C(plan, d_in, d_out);
   }
 
   template<>
+  inline
   cufftResult cufftExecReal2Complex<double>(cufftHandle &plan, cufftReal_t<double>* d_in, cufftComplex_t<double> *d_out){
     return cufftExecD2Z(plan, d_in, d_out);
   }
@@ -45,14 +48,17 @@ namespace uammd{
   template<> struct CUFFT_Complex2Real<float>{static constexpr cufftType value=CUFFT_C2R;};
 
   template<class real>
+  inline
   cufftResult cufftExecComplex2Real(cufftHandle &plan, cufftComplex_t<real>* d_in, cufftReal_t<real> *d_out);
 
   template<>
+  inline
   cufftResult cufftExecComplex2Real<float>(cufftHandle &plan, cufftComplex_t<float> *d_in, cufftReal_t<float> *d_out){
     return cufftExecC2R(plan, d_in, d_out);
   }
 
   template<>
+  inline
   cufftResult cufftExecComplex2Real<double>(cufftHandle &plan, cufftComplex_t<double> *d_in, cufftReal_t<double> *d_out){
     return cufftExecZ2D(plan, d_in, d_out);
   }
@@ -62,14 +68,17 @@ namespace uammd{
   template<> struct CUFFT_Complex2Complex<float>{static constexpr cufftType value=CUFFT_C2C;};
 
   template<class real>
+  inline
   cufftResult cufftExecComplex2Complex(cufftHandle &plan, cufftComplex_t<real>* d_in, cufftComplex_t<real> *d_out, int direction);
 
   template<>
+  inline
   cufftResult cufftExecComplex2Complex<float>(cufftHandle &plan, cufftComplex_t<float> *d_in, cufftComplex_t<float> *d_out, int direction){
     return cufftExecC2C(plan, d_in, d_out, direction);
   }
 
   template<>
+  inline
   cufftResult cufftExecComplex2Complex<double>(cufftHandle &plan, cufftComplex_t<double> *d_in, cufftComplex_t<double> *d_out, int direction){
     return cufftExecZ2Z(plan, d_in, d_out, direction);
   }
diff --git a/src/utils/curandDebug.h b/src/utils/curandDebug.h
index 0f3b2b95..567a90c0 100644
--- a/src/utils/curandDebug.h
+++ b/src/utils/curandDebug.h
@@ -8,6 +8,7 @@
 
 #define CurandSafeCall(err) __curandSafeCall(err, __FILE__, __LINE__)
 
+inline
 const char* curandGetErrorString(curandStatus_t err){
   switch (err) {
   case CURAND_STATUS_VERSION_MISMATCH:    return "CURAND_STATUS_VERSION_MISMATCH";

From afb0b2010bbc8579a13d49b9c20495b9028f22a9 Mon Sep 17 00:00:00 2001
From: PabloIbannez <p.ibanez.fre@gmail.com>
Date: Fri, 26 Jul 2024 17:09:05 +0200
Subject: [PATCH 13/15] Made functions inline to resolve multiple definition
 issues during compilation and linking

---
 src/misc/BoundaryValueProblem/BVPMemory.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/misc/BoundaryValueProblem/BVPMemory.cuh b/src/misc/BoundaryValueProblem/BVPMemory.cuh
index 1fafcdc9..d4f49227 100644
--- a/src/misc/BoundaryValueProblem/BVPMemory.cuh
+++ b/src/misc/BoundaryValueProblem/BVPMemory.cuh
@@ -19,7 +19,7 @@ namespace uammd{
     };
 
     namespace detail{
-      size_t computeExtraAlignment(size_t addressOffset, size_t alignment){
+    inline size_t computeExtraAlignment(size_t addressOffset, size_t alignment){
 	size_t extraAlignment = 0;
 	bool isMissAligned = addressOffset%alignment != 0;
 	if(isMissAligned)
@@ -120,4 +120,4 @@ namespace uammd{
 
   }
 }
-#endif
\ No newline at end of file
+#endif

From 8b7436e609b2b05e5775285f2ea202f43f1b0188 Mon Sep 17 00:00:00 2001
From: pabloibannez <p.ibanez.fre@gmail.com>
Date: Sun, 6 Oct 2024 17:06:31 +0200
Subject: [PATCH 14/15] Updated blas include

---
 src/misc/BoundaryValueProblem/MatrixUtils.h | 2 +-
 src/misc/lapack_and_blas_defines.h          | 4 ++--
 src/uammd.cuh                               | 6 +-----
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/src/misc/BoundaryValueProblem/MatrixUtils.h b/src/misc/BoundaryValueProblem/MatrixUtils.h
index 25903c71..d9d062e4 100644
--- a/src/misc/BoundaryValueProblem/MatrixUtils.h
+++ b/src/misc/BoundaryValueProblem/MatrixUtils.h
@@ -10,7 +10,7 @@
 #ifdef USE_MKL
 #include<mkl.h>
 #else
-#include<lapacke.h>
+#include<openblas/lapacke.h>
 #endif
 #include<thrust/complex.h>
 namespace uammd{
diff --git a/src/misc/lapack_and_blas_defines.h b/src/misc/lapack_and_blas_defines.h
index 6df2ee72..af27d0a1 100644
--- a/src/misc/lapack_and_blas_defines.h
+++ b/src/misc/lapack_and_blas_defines.h
@@ -4,8 +4,8 @@
 #ifdef USE_MKL
 #include<mkl.h>
 #else
-#include<lapacke.h>
-#include<cblas.h>
+#include<openblas/lapacke.h>
+#include<openblas/cblas.h>
 #endif
 #ifdef SINGLE_PRECISION
 #define LAPACKE_steqr LAPACKE_ssteqr
diff --git a/src/uammd.cuh b/src/uammd.cuh
index 07fbcaa2..fc402041 100644
--- a/src/uammd.cuh
+++ b/src/uammd.cuh
@@ -8,13 +8,9 @@
 #include "global/defines.h"
 #include "System/System.h"
 
-// Helper macro to handle indirect inclusion
-#define STRINGIFY(x) #x
-#define INCLUDE_FILE(x) STRINGIFY(x)
-
 #ifdef UAMMD_EXTENSIONS
   #ifdef UAMMD_EXTENSIONS_PREAMBLE
-    #include INCLUDE_FILE(UAMMD_EXTENSIONS_PREAMBLE)
+    #include UAMMD_EXTENSIONS_PREAMBLE
   #else
     #include "../extensions/preamble.h"
   #endif

From d6cf1a4baa16629a3da2fb2b9508ce6be5bb34fd Mon Sep 17 00:00:00 2001
From: Pablo Ibanez Freire <pablo@ibannez.ftmc.uam.es>
Date: Thu, 14 Nov 2024 12:19:49 +0100
Subject: [PATCH 15/15] Fixed PSE

---
 src/Integrator/BDHI/PSE/initialization.cu | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/Integrator/BDHI/PSE/initialization.cu b/src/Integrator/BDHI/PSE/initialization.cu
index 06f0dd8e..883674a5 100644
--- a/src/Integrator/BDHI/PSE/initialization.cu
+++ b/src/Integrator/BDHI/PSE/initialization.cu
@@ -6,7 +6,7 @@ Initialization
 #include"Integrator/BDHI/BDHI_PSE.cuh"
 namespace uammd{
   namespace BDHI{
-    namespace pse_ns{      
+    namespace pse_ns{
       void checkInputValidity(BDHI::PSE::Parameters par){
 	real3 L = par.box.boxSize;
 	if(L.x == real(0.0) && L.y == real(0.0) && L.z == real(0.0)){
@@ -16,10 +16,10 @@ namespace uammd{
 	if(L.x != L.y || L.y != L.z || L.x != L.z){
 	  System::log<System::WARNING>("[BDHI::PSE] Non cubic boxes are not really tested!");
 	}
-	
-	
+
+
       }
-      
+
       long double computeSelfMobility(PSE::Parameters par){
 	//O(a^8) accuracy. See Hashimoto 1959.
 	//With a Gaussian this expression has a minimum deviation from measuraments of 7e-7*rh at L=64*rh.
@@ -34,11 +34,12 @@ namespace uammd{
 	long double a6pref = 16.0l*M_PIl*M_PIl/45.0l + 630.0L*b*b;
 	return  1.0l/(6.0l*M_PIl*par.viscosity*rh)*(1.0l-c*a+(4.0l/3.0l)*M_PIl*a3-a6pref*a3*a3);
       }
-      
+
     }
-  
-    PSE::PSE(shared_ptr<ParticleGroup> pg, Parameters par):
-      pg(pg), hydrodynamicRadius(par.hydrodynamicRadius){
+
+PSE::PSE(shared_ptr<ParticleGroup> pg, Parameters par):
+      pg(pg), hydrodynamicRadius(par.hydrodynamicRadius),
+      temperature(par.temperature), dt(par.dt){
       System::log<System::MESSAGE>("[BDHI::PSE] Initialized");
       this->M0 = pse_ns::computeSelfMobility(par);
       System::log<System::MESSAGE>("[BDHI::PSE] Self mobility: %f", M0);