diff --git a/VERSION b/VERSION index de6ee6401..b3d91f9cf 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -5.8.7 +5.9.0 diff --git a/src/Beta_projectors/beta_projectors_base.hpp b/src/Beta_projectors/beta_projectors_base.hpp index 10ec0fc43..63477f687 100644 --- a/src/Beta_projectors/beta_projectors_base.hpp +++ b/src/Beta_projectors/beta_projectors_base.hpp @@ -38,7 +38,7 @@ extern "C" void create_beta_gk_gpu(int num_atoms, double_complex* beta_gk); #endif -enum beta_desc_idx +enum beta_desc_idx { nbf = 0, offset = 1, @@ -416,13 +416,13 @@ inline void Beta_projectors_base::local_inner_aux(double_complex { utils::timer t1("sirius::Beta_projectors_base::local_inner_aux"); linalg2(ctx_.blas_linalg_t()).gemm('C', 'N', nbeta__, n__, num_gkvec_loc(), - &linalg_const::one(), - beta_pw_coeffs_a_ptr__, - num_gkvec_loc(), - phi__.pw_coeffs(ispn__).prime().at(ctx_.preferred_memory_t(), 0, idx0__), - phi__.pw_coeffs(ispn__).prime().ld(), - &linalg_const::zero(), - beta_phi__.at(ctx_.preferred_memory_t()), beta_phi__.ld()); + &linalg_const::one(), + beta_pw_coeffs_a_ptr__, + num_gkvec_loc(), + phi__.pw_coeffs(ispn__).prime().at(ctx_.preferred_memory_t(), 0, idx0__), + phi__.pw_coeffs(ispn__).prime().ld(), + &linalg_const::zero(), + beta_phi__.at(ctx_.preferred_memory_t()), beta_phi__.ld()); auto pp = utils::get_env("SIRIUS_PRINT_PERFORMANCE"); if (pp && gkvec_.comm().rank() == 0) { @@ -443,13 +443,13 @@ inline void Beta_projectors_base::local_inner_aux(double* beta_pw_coeffs matrix& beta_phi__) const { linalg2(ctx_.blas_linalg_t()).gemm('C', 'N', nbeta__, n__, 2 * num_gkvec_loc(), - &linalg_const::two(), - beta_pw_coeffs_a_ptr__, - 2 * num_gkvec_loc(), - reinterpret_cast(phi__.pw_coeffs(ispn__).prime().at(ctx_.preferred_memory_t(), 0, idx0__)), - 2 * phi__.pw_coeffs(ispn__).prime().ld(), - &linalg_const::zero(), - beta_phi__.at(ctx_.preferred_memory_t()), beta_phi__.ld()); + &linalg_const::two(), + beta_pw_coeffs_a_ptr__, + 2 * num_gkvec_loc(), + reinterpret_cast(phi__.pw_coeffs(ispn__).prime().at(ctx_.preferred_memory_t(), 0, idx0__)), + 2 * phi__.pw_coeffs(ispn__).prime().ld(), + &linalg_const::zero(), + beta_phi__.at(ctx_.preferred_memory_t()), beta_phi__.ld()); /* rank 0 has to do some extra work for Gamma-point case */ if (gkvec_.comm().rank() == 0) { diff --git a/src/Hamiltonian/non_local_operator.hpp b/src/Hamiltonian/non_local_operator.hpp index 63361d0c7..5a5c33899 100644 --- a/src/Hamiltonian/non_local_operator.hpp +++ b/src/Hamiltonian/non_local_operator.hpp @@ -33,92 +33,80 @@ namespace sirius { template class Non_local_operator { - protected: - Simulation_context const& ctx_; + protected: + Simulation_context const& ctx_; - device_t pu_; + device_t pu_; - int packed_mtrx_size_; + int packed_mtrx_size_; - mdarray packed_mtrx_offset_; + mdarray packed_mtrx_offset_; - /// Non-local operator matrix. - mdarray op_; + /// Non-local operator matrix. 
+ mdarray op_; - mdarray work_; + mdarray work_; - bool is_null_{false}; + bool is_null_{false}; - /* copy assigment operrator is forbidden */ - Non_local_operator& operator=(Non_local_operator const& src) = delete; - /* copy constructor is forbidden */ - Non_local_operator(Non_local_operator const& src) = delete; + /* copy assigment operrator is forbidden */ + Non_local_operator& operator=(Non_local_operator const& src) = delete; + /* copy constructor is forbidden */ + Non_local_operator(Non_local_operator const& src) = delete; - public: - Non_local_operator(Simulation_context const& ctx__) - : ctx_(ctx__) - { - PROFILE("sirius::Non_local_operator::Non_local_operator"); - pu_ = this->ctx_.processing_unit(); - auto& uc = this->ctx_.unit_cell(); - packed_mtrx_offset_ = mdarray(uc.num_atoms()); - packed_mtrx_size_ = 0; - for (int ia = 0; ia < uc.num_atoms(); ia++) { - int nbf = uc.atom(ia).mt_basis_size(); - packed_mtrx_offset_(ia) = packed_mtrx_size_; - packed_mtrx_size_ += nbf * nbf; - } + public: + Non_local_operator(Simulation_context const& ctx__) + : ctx_(ctx__) + { + PROFILE("sirius::Non_local_operator::Non_local_operator"); + pu_ = this->ctx_.processing_unit(); + auto& uc = this->ctx_.unit_cell(); + packed_mtrx_offset_ = mdarray(uc.num_atoms()); + packed_mtrx_size_ = 0; + for (int ia = 0; ia < uc.num_atoms(); ia++) { + int nbf = uc.atom(ia).mt_basis_size(); + packed_mtrx_offset_(ia) = packed_mtrx_size_; + packed_mtrx_size_ += nbf * nbf; + } - switch (pu_) { - case device_t::GPU: { - packed_mtrx_offset_.allocate(memory_t::device); - packed_mtrx_offset_.copy_to(memory_t::device); - break; - } - case device_t::CPU: break; + switch (pu_) { + case device_t::GPU: { + packed_mtrx_offset_.allocate(memory_t::device).copy_to(memory_t::device); + break; + } + case device_t::CPU: { + break; } } + } - ~Non_local_operator() - { - } + ~Non_local_operator() + { + } - inline void apply(int chunk__, - int ispn_block__, - Wave_functions& op_phi__, - int idx0__, - int n__, - Beta_projectors_base& beta_, - matrix& beta_phi__); - - inline void apply_one_atom(int chunk__, - int ispn_block__, - Wave_functions& op_phi__, - int idx0__, - int n__, - Beta_projectors_base& beta_, - matrix& beta_phi__, - int i); - - inline T operator()(int xi1__, int xi2__, int ia__) - { - return (*this)(xi1__, xi2__, 0, ia__); - } + /// Apply chunk of beta-projectors to all wave functions. + inline void apply(int chunk__, int ispn_block__, Wave_functions& op_phi__, int idx0__, int n__, + Beta_projectors_base& beta__, matrix& beta_phi__); - inline T operator()(int xi1__, int xi2__, int ispn__, int ia__) - { - int nbf = this->ctx_.unit_cell().atom(ia__).mt_basis_size(); - return op_(packed_mtrx_offset_(ia__) + xi2__ * nbf + xi1__, ispn__); - } + /// Apply beta projectors from one atom in a chunk of beta projectors to all wave-functions. 
+ inline void apply(int chunk__, int ia__, int ispn_block__, Wave_functions& op_phi__, int idx0__, int n__, + Beta_projectors_base& beta__, matrix& beta_phi__); + + inline T operator()(int xi1__, int xi2__, int ia__) + { + return (*this)(xi1__, xi2__, 0, ia__); + } + + inline T operator()(int xi1__, int xi2__, int ispn__, int ia__) + { + int nbf = this->ctx_.unit_cell().atom(ia__).mt_basis_size(); + return op_(packed_mtrx_offset_(ia__) + xi2__ * nbf + xi1__, ispn__); + } }; -template<> -inline void Non_local_operator::apply(int chunk__, - int ispn_block__, - Wave_functions& op_phi__, - int idx0__, - int n__, - Beta_projectors_base& beta_, +template <> +inline void Non_local_operator::apply(int chunk__, int ispn_block__, Wave_functions& op_phi__, + int idx0__, int n__, Beta_projectors_base& beta__, matrix& beta_phi__) { PROFILE("sirius::Non_local_operator::apply"); @@ -129,9 +117,9 @@ inline void Non_local_operator::apply(int chunk__, int jspn = ispn_block__ & 1; - auto& beta_gk = beta_.pw_coeffs_a(); - int num_gkvec_loc = beta_.num_gkvec_loc(); - int nbeta = beta_.chunk(chunk__).num_beta_; + auto& beta_gk = beta__.pw_coeffs_a(); + int num_gkvec_loc = beta__.num_gkvec_loc(); + int nbeta = beta__.chunk(chunk__).num_beta_; if (static_cast(nbeta * n__) > work_.size()) { work_ = mdarray(nbeta * n__); @@ -145,28 +133,27 @@ inline void Non_local_operator::apply(int chunk__, switch (pu_) { case device_t::CPU: { mem = memory_t::host; - la = linalg_t::blas; + la = linalg_t::blas; break; } case device_t::GPU: { mem = memory_t::device; - la = linalg_t::cublas; + la = linalg_t::cublas; break; } } /* compute O * for atoms in a chunk */ #pragma omp parallel for - for (int i = 0; i < beta_.chunk(chunk__).num_atoms_; i++) { + for (int i = 0; i < beta__.chunk(chunk__).num_atoms_; i++) { /* number of beta functions for a given atom */ - int nbf = beta_.chunk(chunk__).desc_(beta_desc_idx::nbf, i); - int offs = beta_.chunk(chunk__).desc_(beta_desc_idx::offset, i); - int ia = beta_.chunk(chunk__).desc_(beta_desc_idx::ia, i); + int nbf = beta__.chunk(chunk__).desc_(beta_desc_idx::nbf, i); + int offs = beta__.chunk(chunk__).desc_(beta_desc_idx::offset, i); + int ia = beta__.chunk(chunk__).desc_(beta_desc_idx::ia, i); linalg2(la).gemm('N', 'N', nbf, n__, nbf, &linalg_const::one(), - op_.at(mem, packed_mtrx_offset_(ia), ispn_block__), nbf, - beta_phi__.at(mem, offs, 0), nbeta, - &linalg_const::zero(), - work_.at(mem, offs), nbeta, stream_id(omp_get_thread_num())); + op_.at(mem, packed_mtrx_offset_(ia), ispn_block__), nbf, beta_phi__.at(mem, offs, 0), nbeta, + &linalg_const::zero(), work_.at(mem, offs), nbeta, + stream_id(omp_get_thread_num())); } switch (pu_) { case device_t::GPU: { @@ -176,16 +163,16 @@ inline void Non_local_operator::apply(int chunk__, acc::sync_stream(stream_id(omp_get_thread_num())); #endif break; - } - case device_t::CPU: break; + case device_t::CPU: { + break; + } } /* compute * O * and add to op_phi */ linalg2(ctx_.blas_linalg_t()).gemm('N', 'N', num_gkvec_loc, n__, nbeta, &linalg_const::one(), - beta_gk.at(mem), num_gkvec_loc, - work_.at(mem), nbeta, + beta_gk.at(mem), num_gkvec_loc, work_.at(mem), nbeta, &linalg_const::one(), op_phi__.pw_coeffs(jspn).prime().at(ctx_.preferred_memory_t(), 0, idx0__), op_phi__.pw_coeffs(jspn).prime().ld()); @@ -196,27 +183,23 @@ inline void Non_local_operator::apply(int chunk__, acc::sync_stream(stream_id(-1)); #endif break; - } - case device_t::CPU: break; + case device_t::CPU: { + break; + } } } -template<> -inline void 
Non_local_operator::apply_one_atom(int chunk__, - int ispn_block__, - Wave_functions& op_phi__, - int idx0__, - int n__, - Beta_projectors_base& beta_, - matrix& beta_phi__, - const int i) +template <> +inline void Non_local_operator::apply(int chunk__, int ia__, int ispn_block__, Wave_functions& op_phi__, + int idx0__, int n__, Beta_projectors_base& beta__, + matrix& beta_phi__) { int jspn = ispn_block__ & 1; - auto& beta_gk = beta_.pw_coeffs_a(); - int num_gkvec_loc = beta_.num_gkvec_loc(); - int nbeta = beta_.chunk(chunk__).num_beta_; + auto& beta_gk = beta__.pw_coeffs_a(); + int num_gkvec_loc = beta__.num_gkvec_loc(); + int nbeta = beta__.chunk(chunk__).num_beta_; if (static_cast(nbeta * n__) > work_.size()) { work_ = mdarray(nbeta * n__); @@ -225,69 +208,56 @@ inline void Non_local_operator::apply_one_atom(int chunk__, } } - int nbf = beta_.chunk(chunk__).desc_(beta_desc_idx::nbf, i); - int offs = beta_.chunk(chunk__).desc_(beta_desc_idx::offset, i); - int ia = beta_.chunk(chunk__).desc_(beta_desc_idx::ia, i); + int nbf = beta__.chunk(chunk__).desc_(beta_desc_idx::nbf, ia__); + int offs = beta__.chunk(chunk__).desc_(beta_desc_idx::offset, ia__); + int ia = beta__.chunk(chunk__).desc_(beta_desc_idx::ia, ia__); - work_.zero(); + /* setup linear algebra parameters */ + memory_t mem{memory_t::none}; + linalg_t la{linalg_t::none}; switch (pu_) { - case device_t::CPU: { - linalg::gemm(0, 0, nbf, - n__, nbf, - op_.at(memory_t::host, packed_mtrx_offset_(ia), ispn_block__), - nbf, - beta_phi__.at(memory_t::host, offs, 0), - nbeta, - work_.at(memory_t::host), - nbf); - /* compute * O * and add to op_phi */ - linalg::gemm(0, 0, - num_gkvec_loc, - n__, nbf, - linalg_const::one(), - beta_gk.at(memory_t::host, 0, offs), - num_gkvec_loc, - work_.at(memory_t::host), nbf, - linalg_const::one(), - op_phi__.pw_coeffs(jspn).prime().at(memory_t::host, 0, idx0__), - op_phi__.pw_coeffs(jspn).prime().ld()); - - break; + case device_t::CPU: { + mem = memory_t::host; + la = linalg_t::blas; + break; + } + case device_t::GPU: { + mem = memory_t::device; + la = linalg_t::cublas; + break; + } } - case device_t::GPU: { + + linalg2(la).gemm('N', 'N', nbf, n__, nbf, + &linalg_const::one(), + op_.at(mem, packed_mtrx_offset_(ia), ispn_block__), nbf, + beta_phi__.at(mem, offs, 0), nbeta, + &linalg_const::zero(), + work_.at(mem), nbf); + + linalg2(ctx_.blas_linalg_t()).gemm('N', 'N', num_gkvec_loc, n__, nbf, + &linalg_const::one(), + beta_gk.at(mem, 0, offs), num_gkvec_loc, + work_.at(mem), nbf, + &linalg_const::one(), + op_phi__.pw_coeffs(jspn).prime().at(ctx_.preferred_memory_t(), 0, idx0__), + op_phi__.pw_coeffs(jspn).prime().ld()); + switch (pu_) { + case device_t::CPU: { + break; + } + case device_t::GPU: { #ifdef __GPU - linalg::gemm(0, 0, nbf, - n__, nbf, - op_.at(memory_t::device, packed_mtrx_offset_(ia), ispn_block__), - nbf, - beta_phi__.at(memory_t::device, offs, 0), - nbeta, - work_.at(memory_t::device), - nbf, -1); - linalg::gemm(0, 0, - num_gkvec_loc, - n__, nbf, - &linalg_const::one(), - beta_gk.at(memory_t::device, 0, offs), - beta_gk.ld(), work_.at(memory_t::device), nbf, - &linalg_const::one(), - op_phi__.pw_coeffs(jspn).prime().at(memory_t::device, 0, idx0__), - op_phi__.pw_coeffs(jspn).prime().ld()); - acc::sync_stream(stream_id(-1)); + acc::sync_stream(stream_id(-1)); #endif - break; - } + break; + } } } -template<> -inline void Non_local_operator::apply(int chunk__, - int ispn_block__, - Wave_functions& op_phi__, - int idx0__, - int n__, - Beta_projectors_base& beta_, - matrix& beta_phi__) +template 
<> +inline void Non_local_operator::apply(int chunk__, int ispn_block__, Wave_functions& op_phi__, int idx0__, + int n__, Beta_projectors_base& beta__, matrix& beta_phi__) { PROFILE("sirius::Non_local_operator::apply"); @@ -297,9 +267,9 @@ inline void Non_local_operator::apply(int chunk__, int jspn = ispn_block__ & 1; - auto& beta_gk = beta_.pw_coeffs_a(); - int num_gkvec_loc = beta_.num_gkvec_loc(); - int nbeta = beta_.chunk(chunk__).num_beta_; + auto& beta_gk = beta__.pw_coeffs_a(); + int num_gkvec_loc = beta__.num_gkvec_loc(); + int nbeta = beta__.chunk(chunk__).num_beta_; if (static_cast(nbeta * n__) > work_.size()) { work_ = mdarray(nbeta * n__); @@ -308,66 +278,77 @@ inline void Non_local_operator::apply(int chunk__, } } - /* compute O * */ - #pragma omp parallel for - for (int i = 0; i < beta_.chunk(chunk__).num_atoms_; i++) { - /* number of beta functions for a given atom */ - int nbf = beta_.chunk(chunk__).desc_(beta_desc_idx::nbf, i); - int offs = beta_.chunk(chunk__).desc_(beta_desc_idx::offset, i); - int ia = beta_.chunk(chunk__).desc_(beta_desc_idx::ia, i); - - switch (pu_) { - case CPU: { - linalg::gemm(0, 0, nbf, n__, nbf, op_.at(memory_t::host, packed_mtrx_offset_(ia), ispn_block__), nbf, - beta_phi__.at(memory_t::host, offs, 0), nbeta, work_.at(memory_t::host, offs), nbeta); - break; - } - case GPU: { - #ifdef __GPU - linalg::gemm(0, 0, nbf, n__, nbf, op_.at(memory_t::device, packed_mtrx_offset_(ia), ispn_block__), nbf, - beta_phi__.at(memory_t::device, offs, 0), nbeta, work_.at(memory_t::device, offs), nbeta, omp_get_thread_num()); - break; - #endif - } + /* setup linear algebra parameters */ + memory_t mem{memory_t::none}; + linalg_t la{linalg_t::none}; + switch (pu_) { + case device_t::CPU: { + mem = memory_t::host; + la = linalg_t::blas; + break; + } + case device_t::GPU: { + mem = memory_t::device; + la = linalg_t::cublas; + break; } } - /* compute * O * and add to op_phi */ + /* compute O * for atoms in a chunk */ + #pragma omp parallel for + for (int i = 0; i < beta__.chunk(chunk__).num_atoms_; i++) { + /* number of beta functions for a given atom */ + int nbf = beta__.chunk(chunk__).desc_(beta_desc_idx::nbf, i); + int offs = beta__.chunk(chunk__).desc_(beta_desc_idx::offset, i); + int ia = beta__.chunk(chunk__).desc_(beta_desc_idx::ia, i); + linalg2(la).gemm('N', 'N', nbf, n__, nbf, + &linalg_const::one(), + op_.at(mem, packed_mtrx_offset_(ia), ispn_block__), nbf, + beta_phi__.at(mem, offs, 0), nbeta, + &linalg_const::zero(), + work_.at(mem, offs), nbeta, + stream_id(omp_get_thread_num())); + } switch (pu_) { - case CPU: { - linalg::gemm(0, 0, 2 * num_gkvec_loc, n__, nbeta, 1.0, reinterpret_cast(beta_gk.at(memory_t::host)), - 2 * num_gkvec_loc, work_.at(memory_t::host), nbeta, 1.0, - reinterpret_cast(op_phi__.pw_coeffs(jspn).prime().at(memory_t::host, 0, idx0__)), - 2 * op_phi__.pw_coeffs(jspn).prime().ld()); - break; - } - case GPU: { - #ifdef __GPU + case device_t::GPU: { +#ifdef __GPU /* wait for previous zgemms */ #pragma omp parallel acc::sync_stream(stream_id(omp_get_thread_num())); +#endif + break; + } + case device_t::CPU: { + break; + } + } - linalg::gemm(0, 0, 2 * num_gkvec_loc, n__, nbeta, &linalg_const::one(), - reinterpret_cast(beta_gk.template at(memory_t::device)), 2 * num_gkvec_loc, work_.at(memory_t::device), nbeta, - &linalg_const::one(), - reinterpret_cast(op_phi__.pw_coeffs(jspn).prime().at(memory_t::device, 0, idx0__)), - 2 * num_gkvec_loc); + /* compute * O * and add to op_phi */ + linalg2(ctx_.blas_linalg_t()).gemm('N', 'N', 2 * 
num_gkvec_loc, n__, nbeta, + &linalg_const::one(), + reinterpret_cast(beta_gk.at(mem)), 2 * num_gkvec_loc, + work_.at(mem), nbeta, + &linalg_const::one(), + reinterpret_cast(op_phi__.pw_coeffs(jspn).prime().at(ctx_.preferred_memory_t(), 0, idx0__)), + 2 * op_phi__.pw_coeffs(jspn).prime().ld()); + + switch (pu_) { + case device_t::GPU: { +#ifdef __GPU acc::sync_stream(stream_id(-1)); - #endif +#endif + break; + } + case device_t::CPU: { break; } } } -template<> -inline void Non_local_operator::apply_one_atom(int chunk__, - int ispn_block__, - Wave_functions& op_phi__, - int idx0__, - int n__, - Beta_projectors_base& beta_, - matrix& beta_phi__, - const int i) +template <> +inline void Non_local_operator::apply(int chunk__, int ia__, int ispn_block__, Wave_functions& op_phi__, + int idx0__, int n__, Beta_projectors_base& beta__, + matrix& beta_phi__) { TERMINATE_NOT_IMPLEMENTED; } @@ -375,211 +356,209 @@ inline void Non_local_operator::apply_one_atom(int chunk__, template class D_operator : public Non_local_operator { - private: - void initialize() - { - auto& uc = this->ctx_.unit_cell(); - - for (int ia = 0; ia < uc.num_atoms(); ia++) { - int nbf = uc.atom(ia).mt_basis_size(); - if (uc.atom(ia).type().spin_orbit_coupling()) { - - // the pseudo potential contains information about - // spin orbit coupling so we use a different formula - // Eq.19 PRB 71 115106 for calculating the D matrix - - // Note that the D matrices are stored and - // calculated in the up-down basis already not the - // (Veff,Bx,By,Bz) one. - for (int xi2 = 0; xi2 < nbf; xi2++) { - for (int xi1 = 0; xi1 < nbf; xi1++) { - int idx = xi2 * nbf + xi1; - for (int s = 0; s < 4; s++) { - this->op_(this->packed_mtrx_offset_(ia) + idx, s) = - //type_wrapper::bypass(uc.atom(ia).d_mtrx_so(xi1, xi2, s)); - utils::zero_if_not_complex(uc.atom(ia).d_mtrx_so(xi1, xi2, s)); - } + private: + void initialize() + { + auto& uc = this->ctx_.unit_cell(); + + for (int ia = 0; ia < uc.num_atoms(); ia++) { + int nbf = uc.atom(ia).mt_basis_size(); + if (uc.atom(ia).type().spin_orbit_coupling()) { + + // the pseudo potential contains information about + // spin orbit coupling so we use a different formula + // Eq.19 PRB 71 115106 for calculating the D matrix + + // Note that the D matrices are stored and + // calculated in the up-down basis already not the + // (Veff,Bx,By,Bz) one. + for (int xi2 = 0; xi2 < nbf; xi2++) { + for (int xi1 = 0; xi1 < nbf; xi1++) { + int idx = xi2 * nbf + xi1; + for (int s = 0; s < 4; s++) { + this->op_(this->packed_mtrx_offset_(ia) + idx, s) = + // type_wrapper::bypass(uc.atom(ia).d_mtrx_so(xi1, xi2, s)); + utils::zero_if_not_complex(uc.atom(ia).d_mtrx_so(xi1, xi2, s)); } } - } else { - // No spin orbit coupling for this atom \f[D = D(V_{eff}) - // I + D(B_x) \sigma_x + D(B_y) sigma_y + D(B_z) - // sigma_z\f] since the D matrices are calculated that - // way. 
- for (int xi2 = 0; xi2 < nbf; xi2++) { - for (int xi1 = 0; xi1 < nbf; xi1++) { - int idx = xi2 * nbf + xi1; - switch (this->ctx_.num_mag_dims()) { - case 3: { - double bx = uc.atom(ia).d_mtrx(xi1, xi2, 2); - double by = uc.atom(ia).d_mtrx(xi1, xi2, 3); - this->op_(this->packed_mtrx_offset_(ia) + idx, 2) = - //type_wrapper::bypass(double_complex(bx, -by)); - utils::zero_if_not_complex(double_complex(bx, -by)); - this->op_(this->packed_mtrx_offset_(ia) + idx, 3) = - //type_wrapper::bypass(double_complex(bx, by)); - utils::zero_if_not_complex(double_complex(bx, by)); - } - case 1: { - double v = uc.atom(ia).d_mtrx(xi1, xi2, 0); - double bz = uc.atom(ia).d_mtrx(xi1, xi2, 1); - this->op_(this->packed_mtrx_offset_(ia) + idx, 0) = v + bz; - this->op_(this->packed_mtrx_offset_(ia) + idx, 1) = v - bz; - break; - } - case 0: { - this->op_(this->packed_mtrx_offset_(ia) + idx, 0) = uc.atom(ia).d_mtrx(xi1, xi2, 0); - break; - } - default: { - TERMINATE("wrong number of magnetic dimensions"); - } + } + } else { + // No spin orbit coupling for this atom \f[D = D(V_{eff}) + // I + D(B_x) \sigma_x + D(B_y) sigma_y + D(B_z) + // sigma_z\f] since the D matrices are calculated that + // way. + for (int xi2 = 0; xi2 < nbf; xi2++) { + for (int xi1 = 0; xi1 < nbf; xi1++) { + int idx = xi2 * nbf + xi1; + switch (this->ctx_.num_mag_dims()) { + case 3: { + double bx = uc.atom(ia).d_mtrx(xi1, xi2, 2); + double by = uc.atom(ia).d_mtrx(xi1, xi2, 3); + this->op_(this->packed_mtrx_offset_(ia) + idx, 2) = + // type_wrapper::bypass(double_complex(bx, -by)); + utils::zero_if_not_complex(double_complex(bx, -by)); + this->op_(this->packed_mtrx_offset_(ia) + idx, 3) = + // type_wrapper::bypass(double_complex(bx, by)); + utils::zero_if_not_complex(double_complex(bx, by)); + } + case 1: { + double v = uc.atom(ia).d_mtrx(xi1, xi2, 0); + double bz = uc.atom(ia).d_mtrx(xi1, xi2, 1); + this->op_(this->packed_mtrx_offset_(ia) + idx, 0) = v + bz; + this->op_(this->packed_mtrx_offset_(ia) + idx, 1) = v - bz; + break; + } + case 0: { + this->op_(this->packed_mtrx_offset_(ia) + idx, 0) = uc.atom(ia).d_mtrx(xi1, xi2, 0); + break; + } + default: { + TERMINATE("wrong number of magnetic dimensions"); } } } } } + } - if (this->ctx_.control().print_checksum_ && this->ctx_.comm().rank() == 0) { - auto cs = this->op_.checksum(); - utils::print_checksum("D_operator", cs); - } - - if (this->pu_ == GPU) { - this->op_.allocate(memory_t::device).copy_to(memory_t::device); - } + if (this->ctx_.control().print_checksum_ && this->ctx_.comm().rank() == 0) { + auto cs = this->op_.checksum(); + utils::print_checksum("D_operator", cs); } - public: - D_operator(Simulation_context const& ctx_) - : Non_local_operator(ctx_) - { - this->op_ = mdarray(this->packed_mtrx_size_, ctx_.num_mag_dims() + 1); - this->op_.zero(); - /* D-matrix is complex in non-collinear case */ - if (ctx_.num_mag_dims() == 3) { - assert((std::is_same::value)); - } - initialize(); + if (this->pu_ == GPU) { + this->op_.allocate(memory_t::device).copy_to(memory_t::device); } + } + public: + D_operator(Simulation_context const& ctx_) + : Non_local_operator(ctx_) + { + this->op_ = mdarray(this->packed_mtrx_size_, ctx_.num_mag_dims() + 1); + this->op_.zero(); + /* D-matrix is complex in non-collinear case */ + if (ctx_.num_mag_dims() == 3) { + assert((std::is_same::value)); + } + initialize(); + } }; template class Q_operator : public Non_local_operator { - private: - - void initialize() - { - auto& uc = this->ctx_.unit_cell(); - for (int ia = 0; ia < uc.num_atoms(); ia++) { - int iat = 
uc.atom(ia).type().id(); - if (!uc.atom_type(iat).augment()) { - continue; - } - int nbf = uc.atom(ia).mt_basis_size(); - for (int xi2 = 0; xi2 < nbf; xi2++) { - for (int xi1 = 0; xi1 < nbf; xi1++) { - /* The ultra soft pseudo potential has spin orbit coupling incorporated to it, so we - need to rotate the Q matrix */ - if (this->ctx_.unit_cell().atom_type(iat).spin_orbit_coupling()) { - /* this is nothing else than Eq.18 of Ref PRB 71, 115106 */ - for (auto si = 0; si < 2; si++) { - for (auto sj = 0; sj < 2; sj++) { - - double_complex result(0, 0); - - for (int xi2p = 0; xi2p < nbf; xi2p++) { - if (uc.atom(ia).type().compare_index_beta_functions(xi2, xi2p)) { - for (int xi1p = 0; xi1p < nbf; xi1p++) { - /* The F coefficients are already "block diagonal" so we do a full - summation. We actually rotate the q_matrices only */ - if (uc.atom(ia).type().compare_index_beta_functions(xi1, xi1p)) { - result += this->ctx_.augmentation_op(iat).q_mtrx(xi1p, xi2p) * - (uc.atom(ia).type().f_coefficients(xi1, xi1p, sj, 0) * - uc.atom(ia).type().f_coefficients(xi2p, xi2, 0, si) + - uc.atom(ia).type().f_coefficients(xi1, xi1p, sj, 1) * - uc.atom(ia).type().f_coefficients(xi2p, xi2, 1, si)); - } + private: + void initialize() + { + auto& uc = this->ctx_.unit_cell(); + for (int ia = 0; ia < uc.num_atoms(); ia++) { + int iat = uc.atom(ia).type().id(); + if (!uc.atom_type(iat).augment()) { + continue; + } + int nbf = uc.atom(ia).mt_basis_size(); + for (int xi2 = 0; xi2 < nbf; xi2++) { + for (int xi1 = 0; xi1 < nbf; xi1++) { + /* The ultra soft pseudo potential has spin orbit coupling incorporated to it, so we + need to rotate the Q matrix */ + if (this->ctx_.unit_cell().atom_type(iat).spin_orbit_coupling()) { + /* this is nothing else than Eq.18 of Ref PRB 71, 115106 */ + for (auto si = 0; si < 2; si++) { + for (auto sj = 0; sj < 2; sj++) { + + double_complex result(0, 0); + + for (int xi2p = 0; xi2p < nbf; xi2p++) { + if (uc.atom(ia).type().compare_index_beta_functions(xi2, xi2p)) { + for (int xi1p = 0; xi1p < nbf; xi1p++) { + /* The F coefficients are already "block diagonal" so we do a full + summation. We actually rotate the q_matrices only */ + if (uc.atom(ia).type().compare_index_beta_functions(xi1, xi1p)) { + result += this->ctx_.augmentation_op(iat).q_mtrx(xi1p, xi2p) * + (uc.atom(ia).type().f_coefficients(xi1, xi1p, sj, 0) * + uc.atom(ia).type().f_coefficients(xi2p, xi2, 0, si) + + uc.atom(ia).type().f_coefficients(xi1, xi1p, sj, 1) * + uc.atom(ia).type().f_coefficients(xi2p, xi2, 1, si)); } } } - - /* the order of the index is important */ - const int ind = (si == sj) ? si : sj + 2; - /* this gives - ind = 0 if si = up and sj = up - ind = 1 if si = sj = down - ind = 2 if si = down and sj = up - ind = 3 if si = up and sj = down */ - this->op_(this->packed_mtrx_offset_(ia) + xi2 * nbf + xi1, ind) = - //type_wrapper::bypass(result); - utils::zero_if_not_complex(result); } + + /* the order of the index is important */ + const int ind = (si == sj) ? 
si : sj + 2; + /* this gives + ind = 0 if si = up and sj = up + ind = 1 if si = sj = down + ind = 2 if si = down and sj = up + ind = 3 if si = up and sj = down */ + this->op_(this->packed_mtrx_offset_(ia) + xi2 * nbf + xi1, ind) = + // type_wrapper::bypass(result); + utils::zero_if_not_complex(result); } - } else { - for (int ispn = 0; ispn < this->ctx_.num_spins(); ispn++) { - this->op_(this->packed_mtrx_offset_(ia) + xi2 * nbf + xi1, ispn) = - this->ctx_.augmentation_op(iat).q_mtrx(xi1, xi2); - } + } + } else { + for (int ispn = 0; ispn < this->ctx_.num_spins(); ispn++) { + this->op_(this->packed_mtrx_offset_(ia) + xi2 * nbf + xi1, ispn) = + this->ctx_.augmentation_op(iat).q_mtrx(xi1, xi2); } } } } - if (this->ctx_.control().print_checksum_ && this->ctx_.comm().rank() == 0) { - auto cs = this->op_.checksum(); - utils::print_checksum("Q_operator", cs); - } - - if (this->pu_ == device_t::GPU) { - this->op_.allocate(memory_t::device).copy_to(memory_t::device); - } + } + if (this->ctx_.control().print_checksum_ && this->ctx_.comm().rank() == 0) { + auto cs = this->op_.checksum(); + utils::print_checksum("Q_operator", cs); } - public: - Q_operator(Simulation_context const& ctx_) - : Non_local_operator(ctx_) - { - /* Q-operator is independent of spin if there is no spin-orbit; however, it simplifies the apply() - * method if the Q-operator has a spin index */ - this->op_ = mdarray(this->packed_mtrx_size_, ctx_.num_mag_dims() + 1); - this->op_.zero(); - initialize(); + if (this->pu_ == device_t::GPU) { + this->op_.allocate(memory_t::device).copy_to(memory_t::device); } + } + + public: + Q_operator(Simulation_context const& ctx_) + : Non_local_operator(ctx_) + { + /* Q-operator is independent of spin if there is no spin-orbit; however, it simplifies the apply() + * method if the Q-operator has a spin index */ + this->op_ = mdarray(this->packed_mtrx_size_, ctx_.num_mag_dims() + 1); + this->op_.zero(); + initialize(); + } }; template class P_operator : public Non_local_operator { - public: - P_operator(Simulation_context const& ctx_, mdarray& p_mtrx__) - : Non_local_operator(ctx_) - { - /* Q-operator is independent of spin */ - this->op_ = mdarray(this->packed_mtrx_size_, 1); - this->op_.zero(); - - auto& uc = ctx_.unit_cell(); - for (int ia = 0; ia < uc.num_atoms(); ia++) { - int iat = uc.atom(ia).type().id(); - if (!uc.atom_type(iat).augment()) { - continue; - } - int nbf = uc.atom(ia).mt_basis_size(); - for (int xi2 = 0; xi2 < nbf; xi2++) { - for (int xi1 = 0; xi1 < nbf; xi1++) { - this->op_(this->packed_mtrx_offset_(ia) + xi2 * nbf + xi1, 0) = -p_mtrx__(xi1, xi2, iat).real(); - } - } + public: + P_operator(Simulation_context const& ctx_, mdarray& p_mtrx__) + : Non_local_operator(ctx_) + { + /* Q-operator is independent of spin */ + this->op_ = mdarray(this->packed_mtrx_size_, 1); + this->op_.zero(); + + auto& uc = ctx_.unit_cell(); + for (int ia = 0; ia < uc.num_atoms(); ia++) { + int iat = uc.atom(ia).type().id(); + if (!uc.atom_type(iat).augment()) { + continue; } - if (this->pu_ == GPU) { - this->op_.allocate(memory_t::device); - this->op_.copy_to(memory_t::device); + int nbf = uc.atom(ia).mt_basis_size(); + for (int xi2 = 0; xi2 < nbf; xi2++) { + for (int xi1 = 0; xi1 < nbf; xi1++) { + this->op_(this->packed_mtrx_offset_(ia) + xi2 * nbf + xi1, 0) = -p_mtrx__(xi1, xi2, iat).real(); + } } } + if (this->pu_ == GPU) { + this->op_.allocate(memory_t::device); + this->op_.copy_to(memory_t::device); + } + } }; -} +} // namespace sirius #endif diff --git 
a/src/Hubbard/hubbard_occupancies_derivatives.hpp b/src/Hubbard/hubbard_occupancies_derivatives.hpp index 0c068c389..a23da7129 100644 --- a/src/Hubbard/hubbard_occupancies_derivatives.hpp +++ b/src/Hubbard/hubbard_occupancies_derivatives.hpp @@ -164,21 +164,22 @@ void Hubbard::compute_occupancies_derivatives(K_point& kp, // compute Q_ij <\beta_i|\phi> |d \beta_j> and add it to d\phi { - // < beta | phi> for this chunk - auto beta_phi = - kp.beta_projectors().inner(chunk__, phi, 0, 0, this->number_of_hubbard_orbitals()); - q_op.apply_one_atom(chunk__, 0, dphi, 0, this->number_of_hubbard_orbitals(), bp_grad_, beta_phi, i); + /* for this chunk */ + auto beta_phi = kp.beta_projectors().inner(chunk__, phi, 0, 0, + this->number_of_hubbard_orbitals()); + q_op.apply(chunk__, i, 0, dphi, 0, this->number_of_hubbard_orbitals(), bp_grad_, beta_phi); } // compute Q_ij |\beta_j> and add it to d\phi { - // < dbeta | phi> for this chunk - auto dbeta_phi = bp_grad_.inner(chunk__, phi, 0, 0, this->number_of_hubbard_orbitals()); + /* for this chunk */ + auto dbeta_phi = bp_grad_.inner(chunk__, phi, 0, 0, + this->number_of_hubbard_orbitals()); /* apply Q operator (diagonal in spin) */ /* Effectively compute Q_ij |beta_j> and add it dphi */ - q_op.apply_one_atom(chunk__, 0, dphi, 0, this->number_of_hubbard_orbitals(), kp.beta_projectors(), dbeta_phi, - i); + q_op.apply(chunk__, i, 0, dphi, 0, this->number_of_hubbard_orbitals(), + kp.beta_projectors(), dbeta_phi); } } } diff --git a/src/Potential/xc.hpp b/src/Potential/xc.hpp index 8fb82df16..f4a1b9527 100644 --- a/src/Potential/xc.hpp +++ b/src/Potential/xc.hpp @@ -488,7 +488,9 @@ inline void Potential::xc_rg_nonmagnetic(Density const& density__) rhomin = std::min(rhomin, d); rho.f_rg(ir) = std::max(d, 0.0); } - if (rhomin < 0.0 && std::abs(rhomin) > 1e-9) { + ctx_.fft().comm().allreduce(&rhomin, 1); + /* even a small negative density is a sign of something bing wrong; don't remove this check */ + if (rhomin < 0.0 && ctx_.comm().rank() == 0) { std::stringstream s; s << "Interstitial charge density has negative values" << std::endl << "most negatve value : " << rhomin; @@ -501,11 +503,11 @@ inline void Potential::xc_rg_nonmagnetic(Density const& density__) utils::print_hash("rho", h); } } - + Smooth_periodic_function_gradient grad_rho; Smooth_periodic_function lapl_rho; Smooth_periodic_function grad_rho_grad_rho; - + if (is_gga) { /* use fft_transfrom of the base class (Smooth_periodic_function) */ rho.fft_transform(-1); @@ -521,7 +523,7 @@ inline void Potential::xc_rg_nonmagnetic(Density const& density__) /* product of gradients */ grad_rho_grad_rho = dot(grad_rho, grad_rho); - + /* Laplacian in real space */ lapl_rho.fft_transform(1); @@ -576,7 +578,7 @@ inline void Potential::xc_rg_nonmagnetic(Density const& density__) if (ixc.is_gga()) { std::vector vrho_t(spl_np_t.local_size()); std::vector vsigma_t(spl_np_t.local_size()); - + ixc.get_gga(spl_np_t.local_size(), &rho.f_rg(spl_np_t.global_offset()), &grad_rho_grad_rho.f_rg(spl_np_t.global_offset()), @@ -674,7 +676,8 @@ inline void Potential::xc_rg_magnetic(Density const& density__) } t1.stop(); - if (rhomin < 0.0 && std::abs(rhomin) > 1e-9) { + ctx_.fft().comm().allreduce(&rhomin, 1); + if (rhomin < 0.0 && ctx_.comm().rank() == 0) { std::stringstream s; s << "Interstitial charge density has negative values" << std::endl << "most negatve value : " << rhomin; diff --git a/src/SDDK/communicator.hpp b/src/SDDK/communicator.hpp index 1c40a97d4..61754f5db 100644 --- a/src/SDDK/communicator.hpp +++ 
b/src/SDDK/communicator.hpp
@@ -48,7 +48,8 @@ namespace sddk {
 enum class mpi_op_t
 {
     sum,
-    max
+    max,
+    min
 };
 
 template <mpi_op_t op>
@@ -72,6 +73,15 @@ struct mpi_op_wrapper
     }
 };
 
+template <>
+struct mpi_op_wrapper<mpi_op_t::min>
+{
+    static MPI_Op kind()
+    {
+        return MPI_MIN;
+    }
+};
+
 template <typename T>
 struct mpi_type_wrapper;
 
diff --git a/src/constants.hpp b/src/constants.hpp
index d82594937..b2caed061 100644
--- a/src/constants.hpp
+++ b/src/constants.hpp
@@ -28,8 +28,8 @@
 #include
 
 const int major_version = 5;
-const int minor_version = 8;
-const int revision = 7;
+const int minor_version = 9;
+const int revision = 0;
 
 /// NIST value for the inverse fine structure (http://physics.nist.gov/cuu/Constants/index.html)
 const double speed_of_light = 137.035999139;
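
Note on the Gamma-point GEMMs above (the real-coefficient branch of Beta_projectors_base::local_inner_aux and the Non_local_operator<double>::apply specialization): because phi(-G) = conj(phi(G)) at the Gamma point, the complex inner product over all G-vectors reduces to a real dot product over the stored half set of coefficients with a prefactor of two, minus the doubly counted G = 0 term. That is why the calls reinterpret the complex arrays as double arrays of length 2 * num_gkvec_loc and let rank 0 apply the correction afterwards. A minimal standalone sketch of that identity (hypothetical helper, not part of this patch):

#include <complex>
#include <vector>

/* Compute sum_G conj(b_G) * p_G for real functions at the Gamma point, where only the
   G >= 0 half of the coefficients is stored. Viewing each complex array as 2*N doubles,
   2 * (real dot product) equals 2 * sum_G Re(conj(b_G) * p_G); the G = 0 term is then
   counted twice and is subtracted once (it is purely real for a real function). */
double gamma_point_inner(std::vector<std::complex<double>> const& b,
                         std::vector<std::complex<double>> const& p)
{
    double s{0};
    for (std::size_t i = 0; i < b.size(); i++) {
        s += 2.0 * (b[i].real() * p[i].real() + b[i].imag() * p[i].imag());
    }
    /* remove the doubly counted G = 0 contribution */
    s -= b[0].real() * p[0].real();
    return s;
}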
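
For reference, the non-spin-orbit branch of D_operator::initialize() above stores the spin blocks of \f[ D = D(V_{eff}) I + D(B_x) \sigma_x + D(B_y) \sigma_y + D(B_z) \sigma_z, \f] which expands with the standard Pauli matrices to
\f[
    D_{\uparrow\uparrow} = D(V_{eff}) + D(B_z), \quad
    D_{\downarrow\downarrow} = D(V_{eff}) - D(B_z), \quad
    D_{\uparrow\downarrow} = D(B_x) - i D(B_y), \quad
    D_{\downarrow\uparrow} = D(B_x) + i D(B_y).
\f]
The case 1 and case 0 branches keep only the two diagonal (collinear) components.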
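
The negative-density checks in xc_rg_nonmagnetic() and xc_rg_magnetic() now reduce rhomin over the FFT communicator before warning, so the test sees the global minimum of the interstitial density rather than a per-rank value; the new mpi_op_t::min / MPI_MIN wrapper added to communicator.hpp supports exactly this reduction. A minimal plain-MPI sketch of the same pattern (hypothetical helper, not the sddk::Communicator API):

#include <mpi.h>
#include <algorithm>
#include <vector>

/* Each rank scans only its local portion of the real-space density grid; a MIN all-reduce
   makes the global minimum available everywhere, so a single rank can print the warning. */
double global_density_minimum(std::vector<double> const& rho_local, MPI_Comm comm)
{
    double local_min = rho_local.empty() ? 0.0
                                         : *std::min_element(rho_local.begin(), rho_local.end());
    double global_min{0};
    MPI_Allreduce(&local_min, &global_min, 1, MPI_DOUBLE, MPI_MIN, comm);
    return global_min;
}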