diff --git a/VERSION b/VERSION index de6ee6401..b3d91f9cf 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -5.8.7 +5.9.0 diff --git a/src/Beta_projectors/beta_projectors_base.hpp b/src/Beta_projectors/beta_projectors_base.hpp index 10ec0fc43..63477f687 100644 --- a/src/Beta_projectors/beta_projectors_base.hpp +++ b/src/Beta_projectors/beta_projectors_base.hpp @@ -38,7 +38,7 @@ extern "C" void create_beta_gk_gpu(int num_atoms, double_complex* beta_gk); #endif -enum beta_desc_idx +enum beta_desc_idx { nbf = 0, offset = 1, @@ -416,13 +416,13 @@ inline void Beta_projectors_base::local_inner_aux(double_complex { utils::timer t1("sirius::Beta_projectors_base::local_inner_aux"); linalg2(ctx_.blas_linalg_t()).gemm('C', 'N', nbeta__, n__, num_gkvec_loc(), - &linalg_const::one(), - beta_pw_coeffs_a_ptr__, - num_gkvec_loc(), - phi__.pw_coeffs(ispn__).prime().at(ctx_.preferred_memory_t(), 0, idx0__), - phi__.pw_coeffs(ispn__).prime().ld(), - &linalg_const::zero(), - beta_phi__.at(ctx_.preferred_memory_t()), beta_phi__.ld()); + &linalg_const::one(), + beta_pw_coeffs_a_ptr__, + num_gkvec_loc(), + phi__.pw_coeffs(ispn__).prime().at(ctx_.preferred_memory_t(), 0, idx0__), + phi__.pw_coeffs(ispn__).prime().ld(), + &linalg_const::zero(), + beta_phi__.at(ctx_.preferred_memory_t()), beta_phi__.ld()); auto pp = utils::get_env("SIRIUS_PRINT_PERFORMANCE"); if (pp && gkvec_.comm().rank() == 0) { @@ -443,13 +443,13 @@ inline void Beta_projectors_base::local_inner_aux(double* beta_pw_coeffs matrix& beta_phi__) const { linalg2(ctx_.blas_linalg_t()).gemm('C', 'N', nbeta__, n__, 2 * num_gkvec_loc(), - &linalg_const::two(), - beta_pw_coeffs_a_ptr__, - 2 * num_gkvec_loc(), - reinterpret_cast(phi__.pw_coeffs(ispn__).prime().at(ctx_.preferred_memory_t(), 0, idx0__)), - 2 * phi__.pw_coeffs(ispn__).prime().ld(), - &linalg_const::zero(), - beta_phi__.at(ctx_.preferred_memory_t()), beta_phi__.ld()); + &linalg_const::two(), + beta_pw_coeffs_a_ptr__, + 2 * num_gkvec_loc(), + reinterpret_cast(phi__.pw_coeffs(ispn__).prime().at(ctx_.preferred_memory_t(), 0, idx0__)), + 2 * phi__.pw_coeffs(ispn__).prime().ld(), + &linalg_const::zero(), + beta_phi__.at(ctx_.preferred_memory_t()), beta_phi__.ld()); /* rank 0 has to do some extra work for Gamma-point case */ if (gkvec_.comm().rank() == 0) { diff --git a/src/Hamiltonian/non_local_operator.hpp b/src/Hamiltonian/non_local_operator.hpp index 63361d0c7..5a5c33899 100644 --- a/src/Hamiltonian/non_local_operator.hpp +++ b/src/Hamiltonian/non_local_operator.hpp @@ -33,92 +33,80 @@ namespace sirius { template class Non_local_operator { - protected: - Simulation_context const& ctx_; + protected: + Simulation_context const& ctx_; - device_t pu_; + device_t pu_; - int packed_mtrx_size_; + int packed_mtrx_size_; - mdarray packed_mtrx_offset_; + mdarray packed_mtrx_offset_; - /// Non-local operator matrix. - mdarray op_; + /// Non-local operator matrix. 
+ mdarray op_; - mdarray work_; + mdarray work_; - bool is_null_{false}; + bool is_null_{false}; - /* copy assigment operrator is forbidden */ - Non_local_operator& operator=(Non_local_operator const& src) = delete; - /* copy constructor is forbidden */ - Non_local_operator(Non_local_operator const& src) = delete; + /* copy assigment operrator is forbidden */ + Non_local_operator& operator=(Non_local_operator const& src) = delete; + /* copy constructor is forbidden */ + Non_local_operator(Non_local_operator const& src) = delete; - public: - Non_local_operator(Simulation_context const& ctx__) - : ctx_(ctx__) - { - PROFILE("sirius::Non_local_operator::Non_local_operator"); - pu_ = this->ctx_.processing_unit(); - auto& uc = this->ctx_.unit_cell(); - packed_mtrx_offset_ = mdarray(uc.num_atoms()); - packed_mtrx_size_ = 0; - for (int ia = 0; ia < uc.num_atoms(); ia++) { - int nbf = uc.atom(ia).mt_basis_size(); - packed_mtrx_offset_(ia) = packed_mtrx_size_; - packed_mtrx_size_ += nbf * nbf; - } + public: + Non_local_operator(Simulation_context const& ctx__) + : ctx_(ctx__) + { + PROFILE("sirius::Non_local_operator::Non_local_operator"); + pu_ = this->ctx_.processing_unit(); + auto& uc = this->ctx_.unit_cell(); + packed_mtrx_offset_ = mdarray(uc.num_atoms()); + packed_mtrx_size_ = 0; + for (int ia = 0; ia < uc.num_atoms(); ia++) { + int nbf = uc.atom(ia).mt_basis_size(); + packed_mtrx_offset_(ia) = packed_mtrx_size_; + packed_mtrx_size_ += nbf * nbf; + } - switch (pu_) { - case device_t::GPU: { - packed_mtrx_offset_.allocate(memory_t::device); - packed_mtrx_offset_.copy_to(memory_t::device); - break; - } - case device_t::CPU: break; + switch (pu_) { + case device_t::GPU: { + packed_mtrx_offset_.allocate(memory_t::device).copy_to(memory_t::device); + break; + } + case device_t::CPU: { + break; } } + } - ~Non_local_operator() - { - } + ~Non_local_operator() + { + } - inline void apply(int chunk__, - int ispn_block__, - Wave_functions& op_phi__, - int idx0__, - int n__, - Beta_projectors_base& beta_, - matrix& beta_phi__); - - inline void apply_one_atom(int chunk__, - int ispn_block__, - Wave_functions& op_phi__, - int idx0__, - int n__, - Beta_projectors_base& beta_, - matrix& beta_phi__, - int i); - - inline T operator()(int xi1__, int xi2__, int ia__) - { - return (*this)(xi1__, xi2__, 0, ia__); - } + /// Apply chunk of beta-projectors to all wave functions. + inline void apply(int chunk__, int ispn_block__, Wave_functions& op_phi__, int idx0__, int n__, + Beta_projectors_base& beta__, matrix& beta_phi__); - inline T operator()(int xi1__, int xi2__, int ispn__, int ia__) - { - int nbf = this->ctx_.unit_cell().atom(ia__).mt_basis_size(); - return op_(packed_mtrx_offset_(ia__) + xi2__ * nbf + xi1__, ispn__); - } + /// Apply beta projectors from one atom in a chunk of beta projectors to all wave-functions. 
+ inline void apply(int chunk__, int ia__, int ispn_block__, Wave_functions& op_phi__, int idx0__, int n__, + Beta_projectors_base& beta__, matrix& beta_phi__); + + inline T operator()(int xi1__, int xi2__, int ia__) + { + return (*this)(xi1__, xi2__, 0, ia__); + } + + inline T operator()(int xi1__, int xi2__, int ispn__, int ia__) + { + int nbf = this->ctx_.unit_cell().atom(ia__).mt_basis_size(); + return op_(packed_mtrx_offset_(ia__) + xi2__ * nbf + xi1__, ispn__); + } }; -template<> -inline void Non_local_operator::apply(int chunk__, - int ispn_block__, - Wave_functions& op_phi__, - int idx0__, - int n__, - Beta_projectors_base& beta_, +template <> +inline void Non_local_operator::apply(int chunk__, int ispn_block__, Wave_functions& op_phi__, + int idx0__, int n__, Beta_projectors_base& beta__, matrix& beta_phi__) { PROFILE("sirius::Non_local_operator::apply"); @@ -129,9 +117,9 @@ inline void Non_local_operator::apply(int chunk__, int jspn = ispn_block__ & 1; - auto& beta_gk = beta_.pw_coeffs_a(); - int num_gkvec_loc = beta_.num_gkvec_loc(); - int nbeta = beta_.chunk(chunk__).num_beta_; + auto& beta_gk = beta__.pw_coeffs_a(); + int num_gkvec_loc = beta__.num_gkvec_loc(); + int nbeta = beta__.chunk(chunk__).num_beta_; if (static_cast(nbeta * n__) > work_.size()) { work_ = mdarray(nbeta * n__); @@ -145,28 +133,27 @@ inline void Non_local_operator::apply(int chunk__, switch (pu_) { case device_t::CPU: { mem = memory_t::host; - la = linalg_t::blas; + la = linalg_t::blas; break; } case device_t::GPU: { mem = memory_t::device; - la = linalg_t::cublas; + la = linalg_t::cublas; break; } } /* compute O * for atoms in a chunk */ #pragma omp parallel for - for (int i = 0; i < beta_.chunk(chunk__).num_atoms_; i++) { + for (int i = 0; i < beta__.chunk(chunk__).num_atoms_; i++) { /* number of beta functions for a given atom */ - int nbf = beta_.chunk(chunk__).desc_(beta_desc_idx::nbf, i); - int offs = beta_.chunk(chunk__).desc_(beta_desc_idx::offset, i); - int ia = beta_.chunk(chunk__).desc_(beta_desc_idx::ia, i); + int nbf = beta__.chunk(chunk__).desc_(beta_desc_idx::nbf, i); + int offs = beta__.chunk(chunk__).desc_(beta_desc_idx::offset, i); + int ia = beta__.chunk(chunk__).desc_(beta_desc_idx::ia, i); linalg2(la).gemm('N', 'N', nbf, n__, nbf, &linalg_const::one(), - op_.at(mem, packed_mtrx_offset_(ia), ispn_block__), nbf, - beta_phi__.at(mem, offs, 0), nbeta, - &linalg_const::zero(), - work_.at(mem, offs), nbeta, stream_id(omp_get_thread_num())); + op_.at(mem, packed_mtrx_offset_(ia), ispn_block__), nbf, beta_phi__.at(mem, offs, 0), nbeta, + &linalg_const::zero(), work_.at(mem, offs), nbeta, + stream_id(omp_get_thread_num())); } switch (pu_) { case device_t::GPU: { @@ -176,16 +163,16 @@ inline void Non_local_operator::apply(int chunk__, acc::sync_stream(stream_id(omp_get_thread_num())); #endif break; - } - case device_t::CPU: break; + case device_t::CPU: { + break; + } } /* compute * O * and add to op_phi */ linalg2(ctx_.blas_linalg_t()).gemm('N', 'N', num_gkvec_loc, n__, nbeta, &linalg_const::one(), - beta_gk.at(mem), num_gkvec_loc, - work_.at(mem), nbeta, + beta_gk.at(mem), num_gkvec_loc, work_.at(mem), nbeta, &linalg_const::one(), op_phi__.pw_coeffs(jspn).prime().at(ctx_.preferred_memory_t(), 0, idx0__), op_phi__.pw_coeffs(jspn).prime().ld()); @@ -196,27 +183,23 @@ inline void Non_local_operator::apply(int chunk__, acc::sync_stream(stream_id(-1)); #endif break; - } - case device_t::CPU: break; + case device_t::CPU: { + break; + } } } -template<> -inline void 
Non_local_operator::apply_one_atom(int chunk__, - int ispn_block__, - Wave_functions& op_phi__, - int idx0__, - int n__, - Beta_projectors_base& beta_, - matrix& beta_phi__, - const int i) +template <> +inline void Non_local_operator::apply(int chunk__, int ia__, int ispn_block__, Wave_functions& op_phi__, + int idx0__, int n__, Beta_projectors_base& beta__, + matrix& beta_phi__) { int jspn = ispn_block__ & 1; - auto& beta_gk = beta_.pw_coeffs_a(); - int num_gkvec_loc = beta_.num_gkvec_loc(); - int nbeta = beta_.chunk(chunk__).num_beta_; + auto& beta_gk = beta__.pw_coeffs_a(); + int num_gkvec_loc = beta__.num_gkvec_loc(); + int nbeta = beta__.chunk(chunk__).num_beta_; if (static_cast(nbeta * n__) > work_.size()) { work_ = mdarray(nbeta * n__); @@ -225,69 +208,56 @@ inline void Non_local_operator::apply_one_atom(int chunk__, } } - int nbf = beta_.chunk(chunk__).desc_(beta_desc_idx::nbf, i); - int offs = beta_.chunk(chunk__).desc_(beta_desc_idx::offset, i); - int ia = beta_.chunk(chunk__).desc_(beta_desc_idx::ia, i); + int nbf = beta__.chunk(chunk__).desc_(beta_desc_idx::nbf, ia__); + int offs = beta__.chunk(chunk__).desc_(beta_desc_idx::offset, ia__); + int ia = beta__.chunk(chunk__).desc_(beta_desc_idx::ia, ia__); - work_.zero(); + /* setup linear algebra parameters */ + memory_t mem{memory_t::none}; + linalg_t la{linalg_t::none}; switch (pu_) { - case device_t::CPU: { - linalg::gemm(0, 0, nbf, - n__, nbf, - op_.at(memory_t::host, packed_mtrx_offset_(ia), ispn_block__), - nbf, - beta_phi__.at(memory_t::host, offs, 0), - nbeta, - work_.at(memory_t::host), - nbf); - /* compute * O * and add to op_phi */ - linalg::gemm(0, 0, - num_gkvec_loc, - n__, nbf, - linalg_const::one(), - beta_gk.at(memory_t::host, 0, offs), - num_gkvec_loc, - work_.at(memory_t::host), nbf, - linalg_const::one(), - op_phi__.pw_coeffs(jspn).prime().at(memory_t::host, 0, idx0__), - op_phi__.pw_coeffs(jspn).prime().ld()); - - break; + case device_t::CPU: { + mem = memory_t::host; + la = linalg_t::blas; + break; + } + case device_t::GPU: { + mem = memory_t::device; + la = linalg_t::cublas; + break; + } } - case device_t::GPU: { + + linalg2(la).gemm('N', 'N', nbf, n__, nbf, + &linalg_const::one(), + op_.at(mem, packed_mtrx_offset_(ia), ispn_block__), nbf, + beta_phi__.at(mem, offs, 0), nbeta, + &linalg_const::zero(), + work_.at(mem), nbf); + + linalg2(ctx_.blas_linalg_t()).gemm('N', 'N', num_gkvec_loc, n__, nbf, + &linalg_const::one(), + beta_gk.at(mem, 0, offs), num_gkvec_loc, + work_.at(mem), nbf, + &linalg_const::one(), + op_phi__.pw_coeffs(jspn).prime().at(ctx_.preferred_memory_t(), 0, idx0__), + op_phi__.pw_coeffs(jspn).prime().ld()); + switch (pu_) { + case device_t::CPU: { + break; + } + case device_t::GPU: { #ifdef __GPU - linalg::gemm(0, 0, nbf, - n__, nbf, - op_.at(memory_t::device, packed_mtrx_offset_(ia), ispn_block__), - nbf, - beta_phi__.at(memory_t::device, offs, 0), - nbeta, - work_.at(memory_t::device), - nbf, -1); - linalg::gemm(0, 0, - num_gkvec_loc, - n__, nbf, - &linalg_const::one(), - beta_gk.at(memory_t::device, 0, offs), - beta_gk.ld(), work_.at(memory_t::device), nbf, - &linalg_const::one(), - op_phi__.pw_coeffs(jspn).prime().at(memory_t::device, 0, idx0__), - op_phi__.pw_coeffs(jspn).prime().ld()); - acc::sync_stream(stream_id(-1)); + acc::sync_stream(stream_id(-1)); #endif - break; - } + break; + } } } -template<> -inline void Non_local_operator::apply(int chunk__, - int ispn_block__, - Wave_functions& op_phi__, - int idx0__, - int n__, - Beta_projectors_base& beta_, - matrix& beta_phi__) +template 
<> +inline void Non_local_operator::apply(int chunk__, int ispn_block__, Wave_functions& op_phi__, int idx0__, + int n__, Beta_projectors_base& beta__, matrix& beta_phi__) { PROFILE("sirius::Non_local_operator::apply"); @@ -297,9 +267,9 @@ inline void Non_local_operator::apply(int chunk__, int jspn = ispn_block__ & 1; - auto& beta_gk = beta_.pw_coeffs_a(); - int num_gkvec_loc = beta_.num_gkvec_loc(); - int nbeta = beta_.chunk(chunk__).num_beta_; + auto& beta_gk = beta__.pw_coeffs_a(); + int num_gkvec_loc = beta__.num_gkvec_loc(); + int nbeta = beta__.chunk(chunk__).num_beta_; if (static_cast(nbeta * n__) > work_.size()) { work_ = mdarray(nbeta * n__); @@ -308,66 +278,77 @@ inline void Non_local_operator::apply(int chunk__, } } - /* compute O * */ - #pragma omp parallel for - for (int i = 0; i < beta_.chunk(chunk__).num_atoms_; i++) { - /* number of beta functions for a given atom */ - int nbf = beta_.chunk(chunk__).desc_(beta_desc_idx::nbf, i); - int offs = beta_.chunk(chunk__).desc_(beta_desc_idx::offset, i); - int ia = beta_.chunk(chunk__).desc_(beta_desc_idx::ia, i); - - switch (pu_) { - case CPU: { - linalg::gemm(0, 0, nbf, n__, nbf, op_.at(memory_t::host, packed_mtrx_offset_(ia), ispn_block__), nbf, - beta_phi__.at(memory_t::host, offs, 0), nbeta, work_.at(memory_t::host, offs), nbeta); - break; - } - case GPU: { - #ifdef __GPU - linalg::gemm(0, 0, nbf, n__, nbf, op_.at(memory_t::device, packed_mtrx_offset_(ia), ispn_block__), nbf, - beta_phi__.at(memory_t::device, offs, 0), nbeta, work_.at(memory_t::device, offs), nbeta, omp_get_thread_num()); - break; - #endif - } + /* setup linear algebra parameters */ + memory_t mem{memory_t::none}; + linalg_t la{linalg_t::none}; + switch (pu_) { + case device_t::CPU: { + mem = memory_t::host; + la = linalg_t::blas; + break; + } + case device_t::GPU: { + mem = memory_t::device; + la = linalg_t::cublas; + break; } } - /* compute * O * and add to op_phi */ + /* compute O * for atoms in a chunk */ + #pragma omp parallel for + for (int i = 0; i < beta__.chunk(chunk__).num_atoms_; i++) { + /* number of beta functions for a given atom */ + int nbf = beta__.chunk(chunk__).desc_(beta_desc_idx::nbf, i); + int offs = beta__.chunk(chunk__).desc_(beta_desc_idx::offset, i); + int ia = beta__.chunk(chunk__).desc_(beta_desc_idx::ia, i); + linalg2(la).gemm('N', 'N', nbf, n__, nbf, + &linalg_const::one(), + op_.at(mem, packed_mtrx_offset_(ia), ispn_block__), nbf, + beta_phi__.at(mem, offs, 0), nbeta, + &linalg_const::zero(), + work_.at(mem, offs), nbeta, + stream_id(omp_get_thread_num())); + } switch (pu_) { - case CPU: { - linalg::gemm(0, 0, 2 * num_gkvec_loc, n__, nbeta, 1.0, reinterpret_cast(beta_gk.at(memory_t::host)), - 2 * num_gkvec_loc, work_.at(memory_t::host), nbeta, 1.0, - reinterpret_cast(op_phi__.pw_coeffs(jspn).prime().at(memory_t::host, 0, idx0__)), - 2 * op_phi__.pw_coeffs(jspn).prime().ld()); - break; - } - case GPU: { - #ifdef __GPU + case device_t::GPU: { +#ifdef __GPU /* wait for previous zgemms */ #pragma omp parallel acc::sync_stream(stream_id(omp_get_thread_num())); +#endif + break; + } + case device_t::CPU: { + break; + } + } - linalg::gemm(0, 0, 2 * num_gkvec_loc, n__, nbeta, &linalg_const::one(), - reinterpret_cast(beta_gk.template at(memory_t::device)), 2 * num_gkvec_loc, work_.at(memory_t::device), nbeta, - &linalg_const::one(), - reinterpret_cast(op_phi__.pw_coeffs(jspn).prime().at(memory_t::device, 0, idx0__)), - 2 * num_gkvec_loc); + /* compute * O * and add to op_phi */ + linalg2(ctx_.blas_linalg_t()).gemm('N', 'N', 2 * 
num_gkvec_loc, n__, nbeta, + &linalg_const::one(), + reinterpret_cast(beta_gk.at(mem)), 2 * num_gkvec_loc, + work_.at(mem), nbeta, + &linalg_const::one(), + reinterpret_cast(op_phi__.pw_coeffs(jspn).prime().at(ctx_.preferred_memory_t(), 0, idx0__)), + 2 * op_phi__.pw_coeffs(jspn).prime().ld()); + + switch (pu_) { + case device_t::GPU: { +#ifdef __GPU acc::sync_stream(stream_id(-1)); - #endif +#endif + break; + } + case device_t::CPU: { break; } } } -template<> -inline void Non_local_operator::apply_one_atom(int chunk__, - int ispn_block__, - Wave_functions& op_phi__, - int idx0__, - int n__, - Beta_projectors_base& beta_, - matrix& beta_phi__, - const int i) +template <> +inline void Non_local_operator::apply(int chunk__, int ia__, int ispn_block__, Wave_functions& op_phi__, + int idx0__, int n__, Beta_projectors_base& beta__, + matrix& beta_phi__) { TERMINATE_NOT_IMPLEMENTED; } @@ -375,211 +356,209 @@ inline void Non_local_operator::apply_one_atom(int chunk__, template class D_operator : public Non_local_operator { - private: - void initialize() - { - auto& uc = this->ctx_.unit_cell(); - - for (int ia = 0; ia < uc.num_atoms(); ia++) { - int nbf = uc.atom(ia).mt_basis_size(); - if (uc.atom(ia).type().spin_orbit_coupling()) { - - // the pseudo potential contains information about - // spin orbit coupling so we use a different formula - // Eq.19 PRB 71 115106 for calculating the D matrix - - // Note that the D matrices are stored and - // calculated in the up-down basis already not the - // (Veff,Bx,By,Bz) one. - for (int xi2 = 0; xi2 < nbf; xi2++) { - for (int xi1 = 0; xi1 < nbf; xi1++) { - int idx = xi2 * nbf + xi1; - for (int s = 0; s < 4; s++) { - this->op_(this->packed_mtrx_offset_(ia) + idx, s) = - //type_wrapper::bypass(uc.atom(ia).d_mtrx_so(xi1, xi2, s)); - utils::zero_if_not_complex(uc.atom(ia).d_mtrx_so(xi1, xi2, s)); - } + private: + void initialize() + { + auto& uc = this->ctx_.unit_cell(); + + for (int ia = 0; ia < uc.num_atoms(); ia++) { + int nbf = uc.atom(ia).mt_basis_size(); + if (uc.atom(ia).type().spin_orbit_coupling()) { + + // the pseudo potential contains information about + // spin orbit coupling so we use a different formula + // Eq.19 PRB 71 115106 for calculating the D matrix + + // Note that the D matrices are stored and + // calculated in the up-down basis already not the + // (Veff,Bx,By,Bz) one. + for (int xi2 = 0; xi2 < nbf; xi2++) { + for (int xi1 = 0; xi1 < nbf; xi1++) { + int idx = xi2 * nbf + xi1; + for (int s = 0; s < 4; s++) { + this->op_(this->packed_mtrx_offset_(ia) + idx, s) = + // type_wrapper::bypass(uc.atom(ia).d_mtrx_so(xi1, xi2, s)); + utils::zero_if_not_complex(uc.atom(ia).d_mtrx_so(xi1, xi2, s)); } } - } else { - // No spin orbit coupling for this atom \f[D = D(V_{eff}) - // I + D(B_x) \sigma_x + D(B_y) sigma_y + D(B_z) - // sigma_z\f] since the D matrices are calculated that - // way. 
- for (int xi2 = 0; xi2 < nbf; xi2++) { - for (int xi1 = 0; xi1 < nbf; xi1++) { - int idx = xi2 * nbf + xi1; - switch (this->ctx_.num_mag_dims()) { - case 3: { - double bx = uc.atom(ia).d_mtrx(xi1, xi2, 2); - double by = uc.atom(ia).d_mtrx(xi1, xi2, 3); - this->op_(this->packed_mtrx_offset_(ia) + idx, 2) = - //type_wrapper::bypass(double_complex(bx, -by)); - utils::zero_if_not_complex(double_complex(bx, -by)); - this->op_(this->packed_mtrx_offset_(ia) + idx, 3) = - //type_wrapper::bypass(double_complex(bx, by)); - utils::zero_if_not_complex(double_complex(bx, by)); - } - case 1: { - double v = uc.atom(ia).d_mtrx(xi1, xi2, 0); - double bz = uc.atom(ia).d_mtrx(xi1, xi2, 1); - this->op_(this->packed_mtrx_offset_(ia) + idx, 0) = v + bz; - this->op_(this->packed_mtrx_offset_(ia) + idx, 1) = v - bz; - break; - } - case 0: { - this->op_(this->packed_mtrx_offset_(ia) + idx, 0) = uc.atom(ia).d_mtrx(xi1, xi2, 0); - break; - } - default: { - TERMINATE("wrong number of magnetic dimensions"); - } + } + } else { + // No spin orbit coupling for this atom \f[D = D(V_{eff}) + // I + D(B_x) \sigma_x + D(B_y) sigma_y + D(B_z) + // sigma_z\f] since the D matrices are calculated that + // way. + for (int xi2 = 0; xi2 < nbf; xi2++) { + for (int xi1 = 0; xi1 < nbf; xi1++) { + int idx = xi2 * nbf + xi1; + switch (this->ctx_.num_mag_dims()) { + case 3: { + double bx = uc.atom(ia).d_mtrx(xi1, xi2, 2); + double by = uc.atom(ia).d_mtrx(xi1, xi2, 3); + this->op_(this->packed_mtrx_offset_(ia) + idx, 2) = + // type_wrapper::bypass(double_complex(bx, -by)); + utils::zero_if_not_complex(double_complex(bx, -by)); + this->op_(this->packed_mtrx_offset_(ia) + idx, 3) = + // type_wrapper::bypass(double_complex(bx, by)); + utils::zero_if_not_complex(double_complex(bx, by)); + } + case 1: { + double v = uc.atom(ia).d_mtrx(xi1, xi2, 0); + double bz = uc.atom(ia).d_mtrx(xi1, xi2, 1); + this->op_(this->packed_mtrx_offset_(ia) + idx, 0) = v + bz; + this->op_(this->packed_mtrx_offset_(ia) + idx, 1) = v - bz; + break; + } + case 0: { + this->op_(this->packed_mtrx_offset_(ia) + idx, 0) = uc.atom(ia).d_mtrx(xi1, xi2, 0); + break; + } + default: { + TERMINATE("wrong number of magnetic dimensions"); } } } } } + } - if (this->ctx_.control().print_checksum_ && this->ctx_.comm().rank() == 0) { - auto cs = this->op_.checksum(); - utils::print_checksum("D_operator", cs); - } - - if (this->pu_ == GPU) { - this->op_.allocate(memory_t::device).copy_to(memory_t::device); - } + if (this->ctx_.control().print_checksum_ && this->ctx_.comm().rank() == 0) { + auto cs = this->op_.checksum(); + utils::print_checksum("D_operator", cs); } - public: - D_operator(Simulation_context const& ctx_) - : Non_local_operator(ctx_) - { - this->op_ = mdarray(this->packed_mtrx_size_, ctx_.num_mag_dims() + 1); - this->op_.zero(); - /* D-matrix is complex in non-collinear case */ - if (ctx_.num_mag_dims() == 3) { - assert((std::is_same::value)); - } - initialize(); + if (this->pu_ == GPU) { + this->op_.allocate(memory_t::device).copy_to(memory_t::device); } + } + public: + D_operator(Simulation_context const& ctx_) + : Non_local_operator(ctx_) + { + this->op_ = mdarray(this->packed_mtrx_size_, ctx_.num_mag_dims() + 1); + this->op_.zero(); + /* D-matrix is complex in non-collinear case */ + if (ctx_.num_mag_dims() == 3) { + assert((std::is_same::value)); + } + initialize(); + } }; template class Q_operator : public Non_local_operator { - private: - - void initialize() - { - auto& uc = this->ctx_.unit_cell(); - for (int ia = 0; ia < uc.num_atoms(); ia++) { - int iat = 
uc.atom(ia).type().id(); - if (!uc.atom_type(iat).augment()) { - continue; - } - int nbf = uc.atom(ia).mt_basis_size(); - for (int xi2 = 0; xi2 < nbf; xi2++) { - for (int xi1 = 0; xi1 < nbf; xi1++) { - /* The ultra soft pseudo potential has spin orbit coupling incorporated to it, so we - need to rotate the Q matrix */ - if (this->ctx_.unit_cell().atom_type(iat).spin_orbit_coupling()) { - /* this is nothing else than Eq.18 of Ref PRB 71, 115106 */ - for (auto si = 0; si < 2; si++) { - for (auto sj = 0; sj < 2; sj++) { - - double_complex result(0, 0); - - for (int xi2p = 0; xi2p < nbf; xi2p++) { - if (uc.atom(ia).type().compare_index_beta_functions(xi2, xi2p)) { - for (int xi1p = 0; xi1p < nbf; xi1p++) { - /* The F coefficients are already "block diagonal" so we do a full - summation. We actually rotate the q_matrices only */ - if (uc.atom(ia).type().compare_index_beta_functions(xi1, xi1p)) { - result += this->ctx_.augmentation_op(iat).q_mtrx(xi1p, xi2p) * - (uc.atom(ia).type().f_coefficients(xi1, xi1p, sj, 0) * - uc.atom(ia).type().f_coefficients(xi2p, xi2, 0, si) + - uc.atom(ia).type().f_coefficients(xi1, xi1p, sj, 1) * - uc.atom(ia).type().f_coefficients(xi2p, xi2, 1, si)); - } + private: + void initialize() + { + auto& uc = this->ctx_.unit_cell(); + for (int ia = 0; ia < uc.num_atoms(); ia++) { + int iat = uc.atom(ia).type().id(); + if (!uc.atom_type(iat).augment()) { + continue; + } + int nbf = uc.atom(ia).mt_basis_size(); + for (int xi2 = 0; xi2 < nbf; xi2++) { + for (int xi1 = 0; xi1 < nbf; xi1++) { + /* The ultra soft pseudo potential has spin orbit coupling incorporated to it, so we + need to rotate the Q matrix */ + if (this->ctx_.unit_cell().atom_type(iat).spin_orbit_coupling()) { + /* this is nothing else than Eq.18 of Ref PRB 71, 115106 */ + for (auto si = 0; si < 2; si++) { + for (auto sj = 0; sj < 2; sj++) { + + double_complex result(0, 0); + + for (int xi2p = 0; xi2p < nbf; xi2p++) { + if (uc.atom(ia).type().compare_index_beta_functions(xi2, xi2p)) { + for (int xi1p = 0; xi1p < nbf; xi1p++) { + /* The F coefficients are already "block diagonal" so we do a full + summation. We actually rotate the q_matrices only */ + if (uc.atom(ia).type().compare_index_beta_functions(xi1, xi1p)) { + result += this->ctx_.augmentation_op(iat).q_mtrx(xi1p, xi2p) * + (uc.atom(ia).type().f_coefficients(xi1, xi1p, sj, 0) * + uc.atom(ia).type().f_coefficients(xi2p, xi2, 0, si) + + uc.atom(ia).type().f_coefficients(xi1, xi1p, sj, 1) * + uc.atom(ia).type().f_coefficients(xi2p, xi2, 1, si)); } } } - - /* the order of the index is important */ - const int ind = (si == sj) ? si : sj + 2; - /* this gives - ind = 0 if si = up and sj = up - ind = 1 if si = sj = down - ind = 2 if si = down and sj = up - ind = 3 if si = up and sj = down */ - this->op_(this->packed_mtrx_offset_(ia) + xi2 * nbf + xi1, ind) = - //type_wrapper::bypass(result); - utils::zero_if_not_complex(result); } + + /* the order of the index is important */ + const int ind = (si == sj) ? 
si : sj + 2; + /* this gives + ind = 0 if si = up and sj = up + ind = 1 if si = sj = down + ind = 2 if si = down and sj = up + ind = 3 if si = up and sj = down */ + this->op_(this->packed_mtrx_offset_(ia) + xi2 * nbf + xi1, ind) = + // type_wrapper::bypass(result); + utils::zero_if_not_complex(result); } - } else { - for (int ispn = 0; ispn < this->ctx_.num_spins(); ispn++) { - this->op_(this->packed_mtrx_offset_(ia) + xi2 * nbf + xi1, ispn) = - this->ctx_.augmentation_op(iat).q_mtrx(xi1, xi2); - } + } + } else { + for (int ispn = 0; ispn < this->ctx_.num_spins(); ispn++) { + this->op_(this->packed_mtrx_offset_(ia) + xi2 * nbf + xi1, ispn) = + this->ctx_.augmentation_op(iat).q_mtrx(xi1, xi2); } } } } - if (this->ctx_.control().print_checksum_ && this->ctx_.comm().rank() == 0) { - auto cs = this->op_.checksum(); - utils::print_checksum("Q_operator", cs); - } - - if (this->pu_ == device_t::GPU) { - this->op_.allocate(memory_t::device).copy_to(memory_t::device); - } + } + if (this->ctx_.control().print_checksum_ && this->ctx_.comm().rank() == 0) { + auto cs = this->op_.checksum(); + utils::print_checksum("Q_operator", cs); } - public: - Q_operator(Simulation_context const& ctx_) - : Non_local_operator(ctx_) - { - /* Q-operator is independent of spin if there is no spin-orbit; however, it simplifies the apply() - * method if the Q-operator has a spin index */ - this->op_ = mdarray(this->packed_mtrx_size_, ctx_.num_mag_dims() + 1); - this->op_.zero(); - initialize(); + if (this->pu_ == device_t::GPU) { + this->op_.allocate(memory_t::device).copy_to(memory_t::device); } + } + + public: + Q_operator(Simulation_context const& ctx_) + : Non_local_operator(ctx_) + { + /* Q-operator is independent of spin if there is no spin-orbit; however, it simplifies the apply() + * method if the Q-operator has a spin index */ + this->op_ = mdarray(this->packed_mtrx_size_, ctx_.num_mag_dims() + 1); + this->op_.zero(); + initialize(); + } }; template class P_operator : public Non_local_operator { - public: - P_operator(Simulation_context const& ctx_, mdarray& p_mtrx__) - : Non_local_operator(ctx_) - { - /* Q-operator is independent of spin */ - this->op_ = mdarray(this->packed_mtrx_size_, 1); - this->op_.zero(); - - auto& uc = ctx_.unit_cell(); - for (int ia = 0; ia < uc.num_atoms(); ia++) { - int iat = uc.atom(ia).type().id(); - if (!uc.atom_type(iat).augment()) { - continue; - } - int nbf = uc.atom(ia).mt_basis_size(); - for (int xi2 = 0; xi2 < nbf; xi2++) { - for (int xi1 = 0; xi1 < nbf; xi1++) { - this->op_(this->packed_mtrx_offset_(ia) + xi2 * nbf + xi1, 0) = -p_mtrx__(xi1, xi2, iat).real(); - } - } + public: + P_operator(Simulation_context const& ctx_, mdarray& p_mtrx__) + : Non_local_operator(ctx_) + { + /* Q-operator is independent of spin */ + this->op_ = mdarray(this->packed_mtrx_size_, 1); + this->op_.zero(); + + auto& uc = ctx_.unit_cell(); + for (int ia = 0; ia < uc.num_atoms(); ia++) { + int iat = uc.atom(ia).type().id(); + if (!uc.atom_type(iat).augment()) { + continue; } - if (this->pu_ == GPU) { - this->op_.allocate(memory_t::device); - this->op_.copy_to(memory_t::device); + int nbf = uc.atom(ia).mt_basis_size(); + for (int xi2 = 0; xi2 < nbf; xi2++) { + for (int xi1 = 0; xi1 < nbf; xi1++) { + this->op_(this->packed_mtrx_offset_(ia) + xi2 * nbf + xi1, 0) = -p_mtrx__(xi1, xi2, iat).real(); + } } } + if (this->pu_ == GPU) { + this->op_.allocate(memory_t::device); + this->op_.copy_to(memory_t::device); + } + } }; -} +} // namespace sirius #endif diff --git 
a/src/Hubbard/hubbard_occupancies_derivatives.hpp b/src/Hubbard/hubbard_occupancies_derivatives.hpp index 0c068c389..a23da7129 100644 --- a/src/Hubbard/hubbard_occupancies_derivatives.hpp +++ b/src/Hubbard/hubbard_occupancies_derivatives.hpp @@ -164,21 +164,22 @@ void Hubbard::compute_occupancies_derivatives(K_point& kp, // compute Q_ij <\beta_i|\phi> |d \beta_j> and add it to d\phi { - // < beta | phi> for this chunk - auto beta_phi = - kp.beta_projectors().inner(chunk__, phi, 0, 0, this->number_of_hubbard_orbitals()); - q_op.apply_one_atom(chunk__, 0, dphi, 0, this->number_of_hubbard_orbitals(), bp_grad_, beta_phi, i); + /* for this chunk */ + auto beta_phi = kp.beta_projectors().inner(chunk__, phi, 0, 0, + this->number_of_hubbard_orbitals()); + q_op.apply(chunk__, i, 0, dphi, 0, this->number_of_hubbard_orbitals(), bp_grad_, beta_phi); } // compute Q_ij |\beta_j> and add it to d\phi { - // < dbeta | phi> for this chunk - auto dbeta_phi = bp_grad_.inner(chunk__, phi, 0, 0, this->number_of_hubbard_orbitals()); + /* for this chunk */ + auto dbeta_phi = bp_grad_.inner(chunk__, phi, 0, 0, + this->number_of_hubbard_orbitals()); /* apply Q operator (diagonal in spin) */ /* Effectively compute Q_ij |beta_j> and add it dphi */ - q_op.apply_one_atom(chunk__, 0, dphi, 0, this->number_of_hubbard_orbitals(), kp.beta_projectors(), dbeta_phi, - i); + q_op.apply(chunk__, i, 0, dphi, 0, this->number_of_hubbard_orbitals(), + kp.beta_projectors(), dbeta_phi); } } } diff --git a/src/Potential/xc.hpp b/src/Potential/xc.hpp index 8fb82df16..f4a1b9527 100644 --- a/src/Potential/xc.hpp +++ b/src/Potential/xc.hpp @@ -488,7 +488,9 @@ inline void Potential::xc_rg_nonmagnetic(Density const& density__) rhomin = std::min(rhomin, d); rho.f_rg(ir) = std::max(d, 0.0); } - if (rhomin < 0.0 && std::abs(rhomin) > 1e-9) { + ctx_.fft().comm().allreduce(&rhomin, 1); + /* even a small negative density is a sign of something bing wrong; don't remove this check */ + if (rhomin < 0.0 && ctx_.comm().rank() == 0) { std::stringstream s; s << "Interstitial charge density has negative values" << std::endl << "most negatve value : " << rhomin; @@ -501,11 +503,11 @@ inline void Potential::xc_rg_nonmagnetic(Density const& density__) utils::print_hash("rho", h); } } - + Smooth_periodic_function_gradient grad_rho; Smooth_periodic_function lapl_rho; Smooth_periodic_function grad_rho_grad_rho; - + if (is_gga) { /* use fft_transfrom of the base class (Smooth_periodic_function) */ rho.fft_transform(-1); @@ -521,7 +523,7 @@ inline void Potential::xc_rg_nonmagnetic(Density const& density__) /* product of gradients */ grad_rho_grad_rho = dot(grad_rho, grad_rho); - + /* Laplacian in real space */ lapl_rho.fft_transform(1); @@ -576,7 +578,7 @@ inline void Potential::xc_rg_nonmagnetic(Density const& density__) if (ixc.is_gga()) { std::vector vrho_t(spl_np_t.local_size()); std::vector vsigma_t(spl_np_t.local_size()); - + ixc.get_gga(spl_np_t.local_size(), &rho.f_rg(spl_np_t.global_offset()), &grad_rho_grad_rho.f_rg(spl_np_t.global_offset()), @@ -674,7 +676,8 @@ inline void Potential::xc_rg_magnetic(Density const& density__) } t1.stop(); - if (rhomin < 0.0 && std::abs(rhomin) > 1e-9) { + ctx_.fft().comm().allreduce(&rhomin, 1); + if (rhomin < 0.0 && ctx_.comm().rank() == 0) { std::stringstream s; s << "Interstitial charge density has negative values" << std::endl << "most negatve value : " << rhomin; diff --git a/src/SDDK/communicator.hpp b/src/SDDK/communicator.hpp index 1c40a97d4..61754f5db 100644 --- a/src/SDDK/communicator.hpp +++ 
b/src/SDDK/communicator.hpp
@@ -48,7 +48,8 @@ namespace sddk {
 enum class mpi_op_t
 {
     sum,
-    max
+    max,
+    min
 };
 
 template <mpi_op_t op>
@@ -72,6 +73,15 @@ struct mpi_op_wrapper
     }
 };
 
+template <>
+struct mpi_op_wrapper<mpi_op_t::min>
+{
+    static MPI_Op kind()
+    {
+        return MPI_MIN;
+    }
+};
+
 template <typename T>
 struct mpi_type_wrapper;
 
diff --git a/src/constants.hpp b/src/constants.hpp
index d82594937..b2caed061 100644
--- a/src/constants.hpp
+++ b/src/constants.hpp
@@ -28,8 +28,8 @@
 #include
 
 const int major_version = 5;
-const int minor_version = 8;
-const int revision = 7;
+const int minor_version = 9;
+const int revision = 0;
 
 /// NIST value for the inverse fine structure (http://physics.nist.gov/cuu/Constants/index.html)
 const double speed_of_light = 137.035999139;
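
Note on the Gamma-point GEMMs above (the real-coefficient branch of Beta_projectors_base::local_inner_aux and the Non_local_operator<double>::apply specialization): because phi(-G) = conj(phi(G)) at the Gamma point, the complex inner product over all G-vectors reduces to a real dot product over the stored half set of coefficients with a prefactor of two, minus the doubly counted G = 0 term. That is why the calls reinterpret the complex arrays as double arrays of length 2 * num_gkvec_loc and let rank 0 apply the correction afterwards. A minimal standalone sketch of that identity (hypothetical helper, not part of this patch):

#include <complex>
#include <vector>

/* Compute sum_G conj(b_G) * p_G for real functions at the Gamma point, where only the
   G >= 0 half of the coefficients is stored. Viewing each complex array as 2*N doubles,
   2 * (real dot product) equals 2 * sum_G Re(conj(b_G) * p_G); the G = 0 term is then
   counted twice and is subtracted once (it is purely real for a real function). */
double gamma_point_inner(std::vector<std::complex<double>> const& b,
                         std::vector<std::complex<double>> const& p)
{
    double s{0};
    for (std::size_t i = 0; i < b.size(); i++) {
        s += 2.0 * (b[i].real() * p[i].real() + b[i].imag() * p[i].imag());
    }
    /* remove the doubly counted G = 0 contribution */
    s -= b[0].real() * p[0].real();
    return s;
}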
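
For reference, the non-spin-orbit branch of D_operator::initialize() above stores the spin blocks of \f[ D = D(V_{eff}) I + D(B_x) \sigma_x + D(B_y) \sigma_y + D(B_z) \sigma_z, \f] which expands with the standard Pauli matrices to
\f[
    D_{\uparrow\uparrow} = D(V_{eff}) + D(B_z), \quad
    D_{\downarrow\downarrow} = D(V_{eff}) - D(B_z), \quad
    D_{\uparrow\downarrow} = D(B_x) - i D(B_y), \quad
    D_{\downarrow\uparrow} = D(B_x) + i D(B_y).
\f]
The case 1 and case 0 branches keep only the two diagonal (collinear) components.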
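
The negative-density checks in xc_rg_nonmagnetic() and xc_rg_magnetic() now reduce rhomin over the FFT communicator before warning, so the test sees the global minimum of the interstitial density rather than a per-rank value; the new mpi_op_t::min / MPI_MIN wrapper added to communicator.hpp supports exactly this reduction. A minimal plain-MPI sketch of the same pattern (hypothetical helper, not the sddk::Communicator API):

#include <mpi.h>
#include <algorithm>
#include <vector>

/* Each rank scans only its local portion of the real-space density grid; a MIN all-reduce
   makes the global minimum available everywhere, so a single rank can print the warning. */
double global_density_minimum(std::vector<double> const& rho_local, MPI_Comm comm)
{
    double local_min = rho_local.empty() ? 0.0
                                         : *std::min_element(rho_local.begin(), rho_local.end());
    double global_min{0};
    MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, comm);
    return global_min;
}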