Alignment check before calling xxx_sparse functions

qianmou2718 · Jan 9, 2023 · fb038b4 · fb038b4
1 parent 9bbfe84
commit fb038b4
Show file tree

Hide file tree

Showing 3 changed files with 30 additions and 18 deletions.
diff --git a/pyscf/dft/gen_grid.py b/pyscf/dft/gen_grid.py
@@ -560,7 +560,7 @@ def build(self, mol=None, with_non0tab=False, sort_grids=True, **kwargs):
             self.coords = self.coords[idx]
             self.weights = self.weights[idx]
 
-        if self.alignment:
+        if self.alignment > 1:
             padding = _padding_size(self.size, self.alignment)
             logger.debug(self, 'Padding %d grids', padding)
             if padding > 0:
@@ -620,7 +620,7 @@ def prune_by_density_(self, rho, threshold=0):
                          self.weights.size - numpy.count_nonzero(idx))
             self.coords  = numpy.asarray(self.coords [idx], order='C')
             self.weights = numpy.asarray(self.weights[idx], order='C')
-            if self.alignment:
+            if self.alignment > 1:
                 padding = _padding_size(self.size, self.alignment)
                 logger.debug(self, 'prune_by_density_: %d padding grids', padding)
                 if padding > 0:
@@ -668,4 +668,6 @@ def _default_ang(nuc, level=3):
                          (65, 65, 65, 65, 65, 65, 65 ),))   # 9
 
 def _padding_size(ngrids, alignment):
+    if alignment <= 1:
+        return 0
     return (ngrids + alignment - 1) // alignment * alignment - ngrids
diff --git a/pyscf/dft/numint.py b/pyscf/dft/numint.py
@@ -34,7 +34,7 @@
         warnings.warn('XC functional libraries (libxc or XCfun) are not available.')
         raise
 
-from pyscf.dft.gen_grid import BLKSIZE, NBINS, CUTOFF, make_mask
+from pyscf.dft.gen_grid import BLKSIZE, NBINS, CUTOFF, ALIGNMENT_UNIT, make_mask
 from pyscf.dft import xc_deriv
 from pyscf import __config__
 
@@ -830,8 +830,7 @@ def _sparse_enough(screen_index):
     return numpy.count_nonzero(screen_index) < screen_index.size * threshold
 
 def _dot_ao_ao_dense(ao1, ao2, wv, out=None):
-    '''Returns bra.T.dot(ket) while sparsity is explicitly considered.
-    Note the return may have ~1e-13 difference to _dot_ao_ao.
+    '''Returns (bra*wv).T.dot(ket)
     '''
     assert ao1.flags.f_contiguous
     assert ao2.flags.f_contiguous
@@ -857,13 +856,13 @@ def _dot_ao_ao_sparse(ao1, ao2, wv, nbins, screen_index, pair_mask, ao_loc,
     '''Returns (bra*wv).T.dot(ket) while sparsity is explicitly considered.
     Note the return may have ~1e-13 difference to _dot_ao_ao.
     '''
-    if screen_index is None or pair_mask is None:
+    ngrids, nao = ao1.shape
+    if screen_index is None or pair_mask is None or ngrids % ALIGNMENT_UNIT != 0:
         return _dot_ao_ao_dense(ao1, ao2, wv, out)
 
     assert ao1.flags.f_contiguous
     assert ao2.flags.f_contiguous
     assert ao1.dtype == ao2.dtype == numpy.double
-    ngrids, nao = ao1.shape
     nbas = screen_index.shape[1]
     if out is None:
         out = numpy.zeros((nao, nao), dtype=ao1.dtype)
@@ -898,12 +897,12 @@ def _dot_ao_dm_sparse(ao, dm, nbins, screen_index, pair_mask, ao_loc):
     ao matrix, (numpy.dot(ao, dm)*ao).sum(axis=1), their value can be matched up
     to ~1e-13.
     '''
-    if screen_index is None or pair_mask is None:
+    ngrids, nao = ao.shape
+    if screen_index is None or pair_mask is None or ngrids % ALIGNMENT_UNIT != 0:
         return lib.dot(dm.T, ao.T).T
 
     assert ao.flags.f_contiguous
     assert ao.dtype == dm.dtype == numpy.double
-    ngrids, nao = ao.shape
     nbas = screen_index.shape[1]
     dm = numpy.asarray(dm, order='C')
     out = _empty_aligned((nao, ngrids)).T
@@ -936,6 +935,9 @@ def _scale_ao_sparse(ao, wv, screen_index, ao_loc, out=None):
         ngrids, nao = ao.shape
         comp = 1
     nbas = screen_index.shape[1]
+    if ngrids % ALIGNMENT_UNIT != 0:
+        return _scale_ao(ao, wv, out=out)
+
     if out is None:
         out = _empty_aligned((nao, ngrids)).T
     else:
@@ -954,13 +956,13 @@ def _contract_rho_sparse(bra, ket, screen_index, ao_loc):
     '''Returns numpy.einsum('gi,gi->g', bra, ket) while sparsity is explicitly
     considered. Note the return may have ~1e-13 difference to _contract_rho.
     '''
-    if screen_index is None:
+    ngrids, nao = bra.shape
+    if screen_index is None or ngrids % ALIGNMENT_UNIT != 0:
         return _contract_rho(bra, ket)
 
     assert bra.flags.f_contiguous
     assert ket.flags.f_contiguous
     assert bra.dtype == ket.dtype == numpy.double
-    ngrids, nao = bra.shape
     nbas = screen_index.shape[1]
     rho = numpy.empty(ngrids)
     libdft.VXCdcontract_rho_sparse(
@@ -2527,6 +2529,9 @@ def _uks_mgga_wv2(rho0, rho1, fxc, kxc, weight):
     return wva, wvb
 
 def _empty_aligned(shape, alignment=8):
+    if alignment <= 1:
+        return numpy.empty(shape)
+
     size = numpy.prod(shape)
     buf = numpy.empty(size + alignment - 1)
     align8 = alignment * 8
@@ -2642,8 +2647,9 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
     # NOTE to index grids.non0tab, the blksize needs to be an integer
     # multiplier of BLKSIZE
     if blksize is None:
-        blksize = int(max_memory*1e6/((comp+1)*nao*8*BLKSIZE))*BLKSIZE
-        blksize = max(BLKSIZE, min(blksize, ngrids, BLKSIZE*1200))
+        blksize = int(max_memory*1e6/((comp+1)*nao*8*BLKSIZE))
+        blksize = max(4, min(blksize, ngrids//BLKSIZE+1, 1200)) * BLKSIZE
+    assert blksize % BLKSIZE == 0
 
     if non0tab is None and mol is grids.mol:
         non0tab = grids.non0tab
@@ -2653,6 +2659,9 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
         non0tab[:] = NBINS + 1  # Corresponding to AO value ~= 1
     screen_index = non0tab
 
+    # the xxx_sparse() functions require ngrids 8-byte aligned
+    allow_sparse = ngrids % ALIGNMENT_UNIT == 0
+
     if buf is None:
         buf = _empty_aligned(comp * blksize * nao)
     for ip0, ip1 in lib.prange(0, ngrids, blksize):
@@ -2662,7 +2671,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
         # TODO: pass grids.cutoff to eval_ao
         ao = ni.eval_ao(mol, coords, deriv=deriv, non0tab=mask,
                         cutoff=grids.cutoff, out=buf)
-        if not _sparse_enough(mask):
+        if not allow_sparse and not _sparse_enough(mask):
             # Unset mask for dense AO tensor. It determines which eval_rho
             # to be called in make_rho
             mask = None

diff --git a/pyscf/pbc/dft/numint.py b/pyscf/pbc/dft/numint.py
@@ -24,7 +24,7 @@
 from pyscf.dft.numint import eval_mat, _dot_ao_ao, _dot_ao_dm, _tau_dot
 from pyscf.dft.numint import _scale_ao, _contract_rho
 from pyscf.dft.numint import OCCDROP
-from pyscf.dft.gen_grid import NBINS, CUTOFF
+from pyscf.dft.gen_grid import NBINS, CUTOFF, ALIGNMENT_UNIT
 from pyscf.pbc.dft.gen_grid import make_mask, BLKSIZE
 from pyscf.pbc.lib.kpts_helper import member, is_zero
 
@@ -966,10 +966,11 @@ def block_loop(self, cell, grids, nao=None, deriv=0, kpt=numpy.zeros(3),
         grids_weights = grids.weights
         ngrids = grids_coords.shape[0]
         comp = (deriv+1)*(deriv+2)*(deriv+3)//6
-# NOTE to index grids.non0tab, the blksize needs to be the integer multiplier of BLKSIZE
+        # NOTE to index grids.non0tab, blksize needs to be integer multiplier of BLKSIZE
         if blksize is None:
-            blksize = int(max_memory*1e6/(comp*2*nao*16*BLKSIZE))*BLKSIZE
-            blksize = max(BLKSIZE, min(blksize, ngrids, BLKSIZE*2400))
+            blksize = int(max_memory*1e6/(comp*2*nao*16*BLKSIZE))
+            blksize = max(4, min(blksize, ngrids//BLKSIZE+1, 2400)) * BLKSIZE
+        assert blksize % BLKSIZE == 0
         if non0tab is None:
             non0tab = grids.non0tab
         if non0tab is None: