Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a method "to_cpu" to convert GPU module to CPU module #28

Merged
merged 4 commits into from
Oct 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
[flake8]
# https://flake8.pycqa.org/en/2.5.5/warnings.html#error-codes
ignore =
# Indentation:
E126, E127, E128, E129,
# Whitespaces:
E201, E202, E203, E211, E221, E222, E225, E226, E228, E231, E241, E251, E271,
# Comments:
E261, E262, E265, E266,
# Blank lines:
E301, E302, E303, E305, E306,
# Imports:
E401, E402,
# Other:
E701, E731, E741, E275,
F401, C901, W391, W503, W504, W291, W292, W293

exclude = test, tests, .git, __pycache__, build, dist, __init__.py, .eggs, *.egg
max-line-length = 160

per-file-ignores =
pyscf/dft/libxc.py: E122,E501
pyscf/dft/xcfun.py: E122,E501
pyscf/grad/sacasscf.py: E501
pyscf/lo/ibo.py: E501
pyscf/pbc/cc/kccsd_t.py: E501
pyscf/pbc/mpicc/kccsd_rhf.py: E501
pyscf/pbc/mpicc/kintermediates_rhf.py: E501
pyscf/pbc/tools/pywannier90.py: E501
18 changes: 18 additions & 0 deletions .github/workflows/lint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
name: Lint

on: [push, pull_request]

jobs:
flake:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Install flake8
run: pip install "flake8>=3.7.0"
- name: Static analysis
run: flake8 --config .flake8 gpu4pyscf
2 changes: 1 addition & 1 deletion gpu4pyscf/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.5.2'
__version__ = '0.6.0'
73 changes: 39 additions & 34 deletions gpu4pyscf/df/df.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,10 @@
import cupy
import ctypes
import numpy as np
from pyscf import lib, __config__
from pyscf import lib
from pyscf.df import df, addons
from gpu4pyscf.lib.cupy_helper import *
from gpu4pyscf.lib.cupy_helper import (
cholesky, tag_array, get_avail_mem, cart2sph, solve_triangular)
from gpu4pyscf.df import int3c2e, df_jk
from gpu4pyscf.lib import logger
from gpu4pyscf import __config__
Expand All @@ -31,60 +32,65 @@
LINEAR_DEP_TOL = 1e-7

class DF(df.DF):
device = 'gpu'
from gpu4pyscf.lib.utils import to_gpu, device

def __init__(self, mol, auxbasis=None):
super().__init__(mol, auxbasis)
self.auxmol = None
self.intopt = None
self.nao = None
self.naux = None
self.cd_low = None
self.intopt = None
self._cderi = None


def to_cpu(self):
from gpu4pyscf.lib.utils import to_cpu
obj = to_cpu(self)
return obj.reset()

def build(self, direct_scf_tol=1e-14, omega=None):
mol = self.mol
auxmol = self.auxmol
self.nao = mol.nao

# cache indices for better performance
nao = mol.nao
tril_row, tril_col = cupy.tril_indices(nao)
tril_row = cupy.asarray(tril_row)
tril_col = cupy.asarray(tril_col)

self.tril_row = tril_row
self.tril_col = tril_col

idx = np.arange(nao)
self.diag_idx = cupy.asarray(idx*(idx+1)//2+idx)

t0 = (logger.process_clock(), logger.perf_counter())
log = logger.new_logger(mol, mol.verbose)
if auxmol is None:
self.auxmol = auxmol = addons.make_auxmol(mol, self.auxbasis)

if omega and omega > 1e-10:
with auxmol.with_range_coulomb(omega):
j2c_cpu = auxmol.intor('int2c2e', hermi=1)
else:
j2c_cpu = auxmol.intor('int2c2e', hermi=1)
j2c = cupy.asarray(j2c_cpu)
t0 = log.timer_debug1('2c2e', *t0)
t0 = log.timer_debug1('2c2e', *t0)
intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e')
intopt.build(direct_scf_tol, diag_block_with_triu=False, aosym=True, group_size=256)
t1 = log.timer_debug1('prepare intopt', *t0)
log.timer_debug1('prepare intopt', *t0)
self.j2c = j2c.copy()
j2c = j2c[cupy.ix_(intopt.sph_aux_idx, intopt.sph_aux_idx)]
try:
self.cd_low = cholesky(j2c)
self.cd_low = tag_array(self.cd_low, tag='cd')
except:
except Exception:
w, v = cupy.linalg.eigh(j2c)
idx = w > LINEAR_DEP_TOL
self.cd_low = (v[:,idx] / cupy.sqrt(w[idx]))
self.cd_low = tag_array(self.cd_low, tag='eig')

v = w = None
naux = self.naux = self.cd_low.shape[1]
log.debug('size of aux basis %d', naux)
Expand All @@ -99,7 +105,7 @@ def get_jk(self, dm, hermi=1, with_j=True, with_k=True,
if omega is None:
return df_jk.get_jk(self, dm, hermi, with_j, with_k, direct_scf_tol)
assert omega >= 0.0

# A temporary treatment for RSH-DF integrals
key = '%.6f' % omega
if key in self._rsh_df:
Expand All @@ -109,7 +115,7 @@ def get_jk(self, dm, hermi=1, with_j=True, with_k=True,
logger.info(self, 'Create RSH-DF object %s for omega=%s', rsh_df, omega)

return df_jk.get_jk(rsh_df, dm, hermi, with_j, with_k, direct_scf_tol, omega=omega)

def get_blksize(self, extra=0, nao=None):
'''
extra for pre-calculated space for other variables
Expand All @@ -124,7 +130,7 @@ def get_blksize(self, extra=0, nao=None):
raise RuntimeError("Not enough GPU memory")
return blksize


def loop(self, blksize=None):
'''
loop over all cderi and unpack
Expand All @@ -137,7 +143,7 @@ def loop(self, blksize=None):
rows = self.intopt.cderi_row
cols = self.intopt.cderi_col
buf_prefetch = None

data_stream = cupy.cuda.stream.Stream(non_blocking=True)
compute_stream = cupy.cuda.get_current_stream()
#compute_stream = cupy.cuda.stream.Stream()
Expand All @@ -160,32 +166,30 @@ def loop(self, blksize=None):
yield buf2, buf.T
compute_stream.wait_event(stop_event)
cupy.cuda.Device().synchronize()

if buf_prefetch is not None:
buf = buf_prefetch

def reset(self, mol=None):
'''
reset object for scanner
'''
if mol is not None:
self.mol = mol
self.auxmol = None
self._cderi = None
self._rsh_df = {}
super().reset(mol)
self.intopt = None
self.nao = None
self.naux = None
self.cd_low = None
self._cderi = None
return self

def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
'''
Returns:
2D array of (naux,nao*(nao+1)/2) in C-contiguous
'''
nao = mol.nao
naoaux, naux = cd_low.shape
npair = len(intopt.cderi_row)
log = logger.new_logger(mol, mol.verbose)
t0 = (logger.process_clock(), logger.perf_counter())
nq = len(intopt.log_qs)

# if the matrix exceeds the limit, store CDERI in CPU memory
Expand All @@ -194,7 +198,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
if naux * npair * 8 < 0.4 * avail_mem:
try:
cderi = cupy.empty([naux, npair], order='C')
except:
except Exception:
use_gpu_memory = False
else:
use_gpu_memory = False
Expand All @@ -204,7 +208,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
# TODO: async allocate memory
mem = cupy.cuda.alloc_pinned_memory(naux * npair * 8)
cderi = np.ndarray([naux, npair], dtype=np.float64, order='C', buffer=mem)

data_stream = cupy.cuda.stream.Stream(non_blocking=False)
count = 0
nq = len(intopt.log_qs)
Expand All @@ -217,7 +221,8 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
lj = intopt.angular[cpj]
i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1]
j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1]
ni = i1 - i0; nj = j1 - j0
ni = i1 - i0
nj = j1 - j0
if sr_only:
# TODO: in-place implementation or short-range kernel
ints_slices = cupy.zeros([naoaux, nj, ni], order='C')
Expand All @@ -241,22 +246,22 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):

if lj>1: ints_slices = cart2sph(ints_slices, axis=1, ang=lj)
if li>1: ints_slices = cart2sph(ints_slices, axis=2, ang=li)

i0, i1 = intopt.sph_ao_loc[cpi], intopt.sph_ao_loc[cpi+1]
j0, j1 = intopt.sph_ao_loc[cpj], intopt.sph_ao_loc[cpj+1]

row = intopt.ao_pairs_row[cp_ij_id] - i0
col = intopt.ao_pairs_col[cp_ij_id] - j0
if cpi == cpj:
ints_slices = ints_slices + ints_slices.transpose([0,2,1])
ints_slices = ints_slices[:,col,row]

if cd_low.tag == 'eig':
cderi_block = cupy.dot(cd_low.T, ints_slices)
ints_slices = None
elif cd_low.tag == 'cd':
cderi_block = solve_triangular(cd_low, ints_slices)

ij0, ij1 = count, count+cderi_block.shape[1]
count = ij1
if isinstance(cderi, cupy.ndarray):
Expand All @@ -265,7 +270,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False):
with data_stream:
for i in range(naux):
cderi_block[i].get(out=cderi[i,ij0:ij1])

t1 = log.timer_debug1(f'solve {cp_ij_id} / {nq}', *t1)

cupy.cuda.Device().synchronize()
Expand Down
Loading