Merge pull request #28 from sunqm/to_cpu

Add a method "to_cpu" to convert GPU module to CPU module
pyscf · Oct 9, 2023 · 2f51c01 · 2f51c01
2 parents cbe7fc0 + bcdadc7
commit 2f51c01
Show file tree

Hide file tree

Showing 44 changed files with 693 additions and 655 deletions.
diff --git a/.flake8 b/.flake8
@@ -0,0 +1,29 @@
+[flake8]
+# https://flake8.pycqa.org/en/2.5.5/warnings.html#error-codes
+ignore =
+        # Indentation:
+        E126, E127, E128, E129,
+        # Whitespaces:
+        E201, E202, E203, E211, E221, E222, E225, E226, E228, E231, E241, E251, E271,
+        # Comments:
+        E261, E262, E265, E266,
+        # Blank lines:
+        E301, E302, E303, E305, E306,
+        # Imports:
+        E401, E402,
+        # Other:
+        E701, E731, E741, E275,
+        F401, C901, W391, W503, W504, W291, W292, W293
+
+exclude = test, tests, .git, __pycache__, build, dist, __init__.py .eggs, *.egg
+max-line-length = 160
+
+per-file-ignores =
+    pyscf/dft/libxc.py: E122,E501
+    pyscf/dft/xcfun.py: E122,E501
+    pyscf/grad/sacasscf.py: E501
+    pyscf/lo/ibo.py: E501
+    pyscf/pbc/cc/kccsd_t.py: E501
+    pyscf/pbc/mpicc/kccsd_rhf.py: E501
+    pyscf/pbc/mpicc/kintermediates_rhf.py: E501
+    pyscf/pbc/tools/pywannier90.py: E501
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -0,0 +1,18 @@
+name: Lint
+
+on: [push, pull_request]
+
+jobs:
+  flake:
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: 3.9
+    - name: Install flake8
+      run: pip install "flake8>=3.7.0"
+    - name: Static analysis
+      run: flake8 --config .flake8 gpu4pyscf
diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py
@@ -1 +1 @@
-__version__ = '0.5.2'
+__version__ = '0.6.0'
diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py
@@ -18,9 +18,10 @@
 import cupy
 import ctypes
 import numpy as np
-from pyscf import lib, __config__
+from pyscf import lib
 from pyscf.df import df, addons
-from gpu4pyscf.lib.cupy_helper import *
+from gpu4pyscf.lib.cupy_helper import (
+    cholesky, tag_array, get_avail_mem, cart2sph, solve_triangular)
 from gpu4pyscf.df import int3c2e, df_jk
 from gpu4pyscf.lib import logger
 from gpu4pyscf import __config__
@@ -31,60 +32,65 @@
 LINEAR_DEP_TOL = 1e-7
 
 class DF(df.DF):
-    device = 'gpu'
+    from gpu4pyscf.lib.utils import to_gpu, device
+
     def __init__(self, mol, auxbasis=None):
         super().__init__(mol, auxbasis)
         self.auxmol = None
         self.intopt = None
         self.nao = None
         self.naux = None
         self.cd_low = None
-        self.intopt = None
         self._cderi = None
-
+
+    def to_cpu(self):
+        from gpu4pyscf.lib.utils import to_cpu
+        obj = to_cpu(self)
+        return obj.reset()
+
     def build(self, direct_scf_tol=1e-14, omega=None):
         mol = self.mol
         auxmol = self.auxmol
         self.nao = mol.nao
-        
+
         # cache indices for better performance
         nao = mol.nao
         tril_row, tril_col = cupy.tril_indices(nao)
         tril_row = cupy.asarray(tril_row)
         tril_col = cupy.asarray(tril_col)
-        
+
         self.tril_row = tril_row
         self.tril_col = tril_col
-        
+
         idx = np.arange(nao)
         self.diag_idx = cupy.asarray(idx*(idx+1)//2+idx)
-        
+
         t0 = (logger.process_clock(), logger.perf_counter())
         log = logger.new_logger(mol, mol.verbose)
         if auxmol is None:
             self.auxmol = auxmol = addons.make_auxmol(mol, self.auxbasis)
-        
+
         if omega and omega > 1e-10:
             with auxmol.with_range_coulomb(omega):
                 j2c_cpu = auxmol.intor('int2c2e', hermi=1)
         else:
             j2c_cpu = auxmol.intor('int2c2e', hermi=1)
         j2c = cupy.asarray(j2c_cpu)
-        t0 = log.timer_debug1('2c2e', *t0)    
+        t0 = log.timer_debug1('2c2e', *t0)
         intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
         intopt.build(direct_scf_tol, diag_block_with_triu=False, aosym=True, group_size=256)
-        t1 = log.timer_debug1('prepare intopt', *t0)
+        log.timer_debug1('prepare intopt', *t0)
         self.j2c = j2c.copy()
         j2c = j2c[cupy.ix_(intopt.sph_aux_idx, intopt.sph_aux_idx)]
         try:
             self.cd_low = cholesky(j2c)
             self.cd_low = tag_array(self.cd_low, tag='cd')
-        except:
+        except Exception:
             w, v = cupy.linalg.eigh(j2c)
             idx = w > LINEAR_DEP_TOL
             self.cd_low = (v[:,idx] / cupy.sqrt(w[idx]))
             self.cd_low = tag_array(self.cd_low, tag='eig')
-        
+
         v = w = None
         naux = self.naux = self.cd_low.shape[1]
         log.debug('size of aux basis %d', naux)
@@ -99,7 +105,7 @@ def get_jk(self, dm, hermi=1, with_j=True, with_k=True,
         if omega is None:
             return df_jk.get_jk(self, dm, hermi, with_j, with_k, direct_scf_tol)
         assert omega >= 0.0
-        
+
         # A temporary treatment for RSH-DF integrals
         key = '%.6f' % omega
         if key in self._rsh_df:
@@ -109,7 +115,7 @@ def get_jk(self, dm, hermi=1, with_j=True, with_k=True,
             logger.info(self, 'Create RSH-DF object %s for omega=%s', rsh_df, omega)
 
         return df_jk.get_jk(rsh_df, dm, hermi, with_j, with_k, direct_scf_tol, omega=omega)
-    
+
     def get_blksize(self, extra=0, nao=None):
         '''
         extra for pre-calculated space for other variables
@@ -124,7 +130,7 @@ def get_blksize(self, extra=0, nao=None):
             raise RuntimeError("Not enough GPU memory")
         return blksize
 
-    
+
     def loop(self, blksize=None):
         '''
         loop over all cderi and unpack
@@ -137,7 +143,7 @@ def loop(self, blksize=None):
         rows = self.intopt.cderi_row
         cols = self.intopt.cderi_col
         buf_prefetch = None
-        
+
         data_stream = cupy.cuda.stream.Stream(non_blocking=True)
         compute_stream = cupy.cuda.get_current_stream()
         #compute_stream = cupy.cuda.stream.Stream()
@@ -160,32 +166,30 @@ def loop(self, blksize=None):
             yield buf2, buf.T
             compute_stream.wait_event(stop_event)
             cupy.cuda.Device().synchronize()
-            
+
             if buf_prefetch is not None:
                 buf = buf_prefetch
 
     def reset(self, mol=None):
         '''
         reset object for scanner
         '''
-        if mol is not None:
-            self.mol = mol
-        self.auxmol = None
-        self._cderi = None
-        self._rsh_df = {}
+        super().reset(mol)
         self.intopt = None
+        self.nao = None
+        self.naux = None
+        self.cd_low = None
+        self._cderi = None
         return self
 
 def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
     '''
     Returns:
         2D array of (naux,nao*(nao+1)/2) in C-contiguous
     '''
-    nao = mol.nao
     naoaux, naux = cd_low.shape
     npair = len(intopt.cderi_row)
     log = logger.new_logger(mol, mol.verbose)
-    t0 = (logger.process_clock(), logger.perf_counter())
     nq = len(intopt.log_qs)
 
     # if the matrix exceeds the limit, store CDERI in CPU memory
@@ -194,7 +198,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
     if naux * npair * 8 < 0.4 * avail_mem:
         try:
             cderi = cupy.empty([naux, npair], order='C')
-        except:
+        except Exception:
             use_gpu_memory = False
     else:
         use_gpu_memory = False
@@ -204,7 +208,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
         # TODO: async allocate memory
         mem = cupy.cuda.alloc_pinned_memory(naux * npair * 8)
         cderi = np.ndarray([naux, npair], dtype=np.float64, order='C', buffer=mem)
-    
+
     data_stream = cupy.cuda.stream.Stream(non_blocking=False)
     count = 0
     nq = len(intopt.log_qs)
@@ -217,7 +221,8 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
         lj = intopt.angular[cpj]
         i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1]
         j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1]
-        ni = i1 - i0; nj = j1 - j0
+        ni = i1 - i0
+        nj = j1 - j0
         if sr_only:
             # TODO: in-place implementation or short-range kernel
             ints_slices = cupy.zeros([naoaux, nj, ni], order='C')
@@ -241,22 +246,22 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
 
         if lj>1: ints_slices = cart2sph(ints_slices, axis=1, ang=lj)
         if li>1: ints_slices = cart2sph(ints_slices, axis=2, ang=li)
-        
+
         i0, i1 = intopt.sph_ao_loc[cpi], intopt.sph_ao_loc[cpi+1]
         j0, j1 = intopt.sph_ao_loc[cpj], intopt.sph_ao_loc[cpj+1]
-        
+
         row = intopt.ao_pairs_row[cp_ij_id] - i0
         col = intopt.ao_pairs_col[cp_ij_id] - j0
         if cpi == cpj:
             ints_slices = ints_slices + ints_slices.transpose([0,2,1])
         ints_slices = ints_slices[:,col,row]
-        
+
         if cd_low.tag == 'eig':
             cderi_block = cupy.dot(cd_low.T, ints_slices)
             ints_slices = None
         elif cd_low.tag == 'cd':
             cderi_block = solve_triangular(cd_low, ints_slices)
-        
+
         ij0, ij1 = count, count+cderi_block.shape[1]
         count = ij1
         if isinstance(cderi, cupy.ndarray):
@@ -265,7 +270,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
             with data_stream:
                 for i in range(naux):
                     cderi_block[i].get(out=cderi[i,ij0:ij1])
-            
+
         t1 = log.timer_debug1(f'solve {cp_ij_id} / {nq}', *t1)
 
     cupy.cuda.Device().synchronize()