Skip to content

Commit

Permalink
Merge pull request #28 from sunqm/to_cpu
Browse files Browse the repository at this point in the history
Add a method "to_cpu" to convert GPU module to CPU module
  • Loading branch information
wxj6000 authored Oct 9, 2023
2 parents cbe7fc0 + bcdadc7 commit 2f51c01
Show file tree
Hide file tree
Showing 44 changed files with 693 additions and 655 deletions.
29 changes: 29 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
[flake8]
# https://flake8.pycqa.org/en/2.5.5/warnings.html#error-codes
ignore =
# Indentation:
E126, E127, E128, E129,
# Whitespaces:
E201, E202, E203, E211, E221, E222, E225, E226, E228, E231, E241, E251, E271,
# Comments:
E261, E262, E265, E266,
# Blank lines:
E301, E302, E303, E305, E306,
# Imports:
E401, E402,
# Other:
E701, E731, E741, E275,
F401, C901, W391, W503, W504, W291, W292, W293

exclude = test, tests, .git, __pycache__, build, dist, __init__.py .eggs, *.egg
max-line-length = 160

per-file-ignores =
pyscf/dft/libxc.py: E122,E501
pyscf/dft/xcfun.py: E122,E501
pyscf/grad/sacasscf.py: E501
pyscf/lo/ibo.py: E501
pyscf/pbc/cc/kccsd_t.py: E501
pyscf/pbc/mpicc/kccsd_rhf.py: E501
pyscf/pbc/mpicc/kintermediates_rhf.py: E501
pyscf/pbc/tools/pywannier90.py: E501
18 changes: 18 additions & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
name: Lint

on: [push, pull_request]

jobs:
flake:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Install flake8
run: pip install "flake8>=3.7.0"
- name: Static analysis
run: flake8 --config .flake8 gpu4pyscf
2 changes: 1 addition & 1 deletion gpu4pyscf/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.5.2'
__version__ = '0.6.0'
73 changes: 39 additions & 34 deletions gpu4pyscf/df/df.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@
import cupy
import ctypes
import numpy as np
from pyscf import lib, __config__
from pyscf import lib
from pyscf.df import df, addons
from gpu4pyscf.lib.cupy_helper import *
from gpu4pyscf.lib.cupy_helper import (
cholesky, tag_array, get_avail_mem, cart2sph, solve_triangular)
from gpu4pyscf.df import int3c2e, df_jk
from gpu4pyscf.lib import logger
from gpu4pyscf import __config__
Expand All @@ -31,60 +32,65 @@
LINEAR_DEP_TOL = 1e-7

class DF(df.DF):
device = 'gpu'
from gpu4pyscf.lib.utils import to_gpu, device

def __init__(self, mol, auxbasis=None):
super().__init__(mol, auxbasis)
self.auxmol = None
self.intopt = None
self.nao = None
self.naux = None
self.cd_low = None
self.intopt = None
self._cderi = None


def to_cpu(self):
from gpu4pyscf.lib.utils import to_cpu
obj = to_cpu(self)
return obj.reset()

def build(self, direct_scf_tol=1e-14, omega=None):
mol = self.mol
auxmol = self.auxmol
self.nao = mol.nao

# cache indices for better performance
nao = mol.nao
tril_row, tril_col = cupy.tril_indices(nao)
tril_row = cupy.asarray(tril_row)
tril_col = cupy.asarray(tril_col)

self.tril_row = tril_row
self.tril_col = tril_col

idx = np.arange(nao)
self.diag_idx = cupy.asarray(idx*(idx+1)//2+idx)

t0 = (logger.process_clock(), logger.perf_counter())
log = logger.new_logger(mol, mol.verbose)
if auxmol is None:
self.auxmol = auxmol = addons.make_auxmol(mol, self.auxbasis)

if omega and omega > 1e-10:
with auxmol.with_range_coulomb(omega):
j2c_cpu = auxmol.intor('int2c2e', hermi=1)
else:
j2c_cpu = auxmol.intor('int2c2e', hermi=1)
j2c = cupy.asarray(j2c_cpu)
t0 = log.timer_debug1('2c2e', *t0)
t0 = log.timer_debug1('2c2e', *t0)
intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
intopt.build(direct_scf_tol, diag_block_with_triu=False, aosym=True, group_size=256)
t1 = log.timer_debug1('prepare intopt', *t0)
log.timer_debug1('prepare intopt', *t0)
self.j2c = j2c.copy()
j2c = j2c[cupy.ix_(intopt.sph_aux_idx, intopt.sph_aux_idx)]
try:
self.cd_low = cholesky(j2c)
self.cd_low = tag_array(self.cd_low, tag='cd')
except:
except Exception:
w, v = cupy.linalg.eigh(j2c)
idx = w > LINEAR_DEP_TOL
self.cd_low = (v[:,idx] / cupy.sqrt(w[idx]))
self.cd_low = tag_array(self.cd_low, tag='eig')

v = w = None
naux = self.naux = self.cd_low.shape[1]
log.debug('size of aux basis %d', naux)
Expand All @@ -99,7 +105,7 @@ def get_jk(self, dm, hermi=1, with_j=True, with_k=True,
if omega is None:
return df_jk.get_jk(self, dm, hermi, with_j, with_k, direct_scf_tol)
assert omega >= 0.0

# A temporary treatment for RSH-DF integrals
key = '%.6f' % omega
if key in self._rsh_df:
Expand All @@ -109,7 +115,7 @@ def get_jk(self, dm, hermi=1, with_j=True, with_k=True,
logger.info(self, 'Create RSH-DF object %s for omega=%s', rsh_df, omega)

return df_jk.get_jk(rsh_df, dm, hermi, with_j, with_k, direct_scf_tol, omega=omega)

def get_blksize(self, extra=0, nao=None):
'''
extra for pre-calculated space for other variables
Expand All @@ -124,7 +130,7 @@ def get_blksize(self, extra=0, nao=None):
raise RuntimeError("Not enough GPU memory")
return blksize


def loop(self, blksize=None):
'''
loop over all cderi and unpack
Expand All @@ -137,7 +143,7 @@ def loop(self, blksize=None):
rows = self.intopt.cderi_row
cols = self.intopt.cderi_col
buf_prefetch = None

data_stream = cupy.cuda.stream.Stream(non_blocking=True)
compute_stream = cupy.cuda.get_current_stream()
#compute_stream = cupy.cuda.stream.Stream()
Expand All @@ -160,32 +166,30 @@ def loop(self, blksize=None):
yield buf2, buf.T
compute_stream.wait_event(stop_event)
cupy.cuda.Device().synchronize()

if buf_prefetch is not None:
buf = buf_prefetch

def reset(self, mol=None):
'''
reset object for scanner
'''
if mol is not None:
self.mol = mol
self.auxmol = None
self._cderi = None
self._rsh_df = {}
super().reset(mol)
self.intopt = None
self.nao = None
self.naux = None
self.cd_low = None
self._cderi = None
return self

def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
'''
Returns:
2D array of (naux,nao*(nao+1)/2) in C-contiguous
'''
nao = mol.nao
naoaux, naux = cd_low.shape
npair = len(intopt.cderi_row)
log = logger.new_logger(mol, mol.verbose)
t0 = (logger.process_clock(), logger.perf_counter())
nq = len(intopt.log_qs)

# if the matrix exceeds the limit, store CDERI in CPU memory
Expand All @@ -194,7 +198,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
if naux * npair * 8 < 0.4 * avail_mem:
try:
cderi = cupy.empty([naux, npair], order='C')
except:
except Exception:
use_gpu_memory = False
else:
use_gpu_memory = False
Expand All @@ -204,7 +208,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
# TODO: async allocate memory
mem = cupy.cuda.alloc_pinned_memory(naux * npair * 8)
cderi = np.ndarray([naux, npair], dtype=np.float64, order='C', buffer=mem)

data_stream = cupy.cuda.stream.Stream(non_blocking=False)
count = 0
nq = len(intopt.log_qs)
Expand All @@ -217,7 +221,8 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
lj = intopt.angular[cpj]
i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1]
j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1]
ni = i1 - i0; nj = j1 - j0
ni = i1 - i0
nj = j1 - j0
if sr_only:
# TODO: in-place implementation or short-range kernel
ints_slices = cupy.zeros([naoaux, nj, ni], order='C')
Expand All @@ -241,22 +246,22 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):

if lj>1: ints_slices = cart2sph(ints_slices, axis=1, ang=lj)
if li>1: ints_slices = cart2sph(ints_slices, axis=2, ang=li)

i0, i1 = intopt.sph_ao_loc[cpi], intopt.sph_ao_loc[cpi+1]
j0, j1 = intopt.sph_ao_loc[cpj], intopt.sph_ao_loc[cpj+1]

row = intopt.ao_pairs_row[cp_ij_id] - i0
col = intopt.ao_pairs_col[cp_ij_id] - j0
if cpi == cpj:
ints_slices = ints_slices + ints_slices.transpose([0,2,1])
ints_slices = ints_slices[:,col,row]

if cd_low.tag == 'eig':
cderi_block = cupy.dot(cd_low.T, ints_slices)
ints_slices = None
elif cd_low.tag == 'cd':
cderi_block = solve_triangular(cd_low, ints_slices)

ij0, ij1 = count, count+cderi_block.shape[1]
count = ij1
if isinstance(cderi, cupy.ndarray):
Expand All @@ -265,7 +270,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
with data_stream:
for i in range(naux):
cderi_block[i].get(out=cderi[i,ij0:ij1])

t1 = log.timer_debug1(f'solve {cp_ij_id} / {nq}', *t1)

cupy.cuda.Device().synchronize()
Expand Down
Loading

0 comments on commit 2f51c01

Please sign in to comment.