From 2859f445099b3358a1fff7867ee5c53abd4a0b9d Mon Sep 17 00:00:00 2001 From: Qiming Sun Date: Sat, 30 Sep 2023 15:38:13 -0700 Subject: [PATCH 1/4] Add method to_cpu --- gpu4pyscf/df/df.py | 45 +++++++++++++++------------- gpu4pyscf/df/df_jk.py | 33 +++++++++++---------- gpu4pyscf/df/grad/rhf.py | 16 +++++----- gpu4pyscf/df/grad/rks.py | 29 ++++++++++--------- gpu4pyscf/df/hessian/rhf.py | 52 ++++++++++++++++++--------------- gpu4pyscf/df/hessian/rks.py | 9 +++++- gpu4pyscf/dft/gen_grid.py | 22 ++++++++------ gpu4pyscf/dft/gks.py | 4 ++- gpu4pyscf/dft/numint.py | 5 +++- gpu4pyscf/dft/rks.py | 43 +++++++++++++-------------- gpu4pyscf/dft/roks.py | 4 ++- gpu4pyscf/dft/uks.py | 4 ++- gpu4pyscf/grad/rhf.py | 4 ++- gpu4pyscf/hessian/rhf.py | 22 ++++++++------ gpu4pyscf/hessian/rks.py | 50 ++++++++++++++++++-------------- gpu4pyscf/lib/utils.py | 20 +++++++++++++ gpu4pyscf/scf/ghf.py | 4 ++- gpu4pyscf/scf/hf.py | 58 ++++++++++++++++++++----------------- gpu4pyscf/scf/rohf.py | 4 ++- gpu4pyscf/scf/uhf.py | 4 ++- 20 files changed, 255 insertions(+), 177 deletions(-) diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index fc01e28f..2d4e8038 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -24,6 +24,7 @@ from gpu4pyscf.df import int3c2e, df_jk from gpu4pyscf.lib import logger from gpu4pyscf import __config__ +from gpu4pyscf.lib.utils import to_cpu from cupyx import scipy MIN_BLK_SIZE = getattr(__config__, 'min_ao_blksize', 128) @@ -32,6 +33,7 @@ class DF(df.DF): device = 'gpu' + def __init__(self, mol, auxbasis=None): super().__init__(mol, auxbasis) self.auxmol = None @@ -39,38 +41,41 @@ def __init__(self, mol, auxbasis=None): self.nao = None self.naux = None self.cd_low = None - self.intopt = None self._cderi = None - + + def to_cpu(self): + obj = to_cpu(self) + return obj.reset() + def build(self, direct_scf_tol=1e-14, omega=None): mol = self.mol auxmol = self.auxmol self.nao = mol.nao - + # cache indices for better performance nao = mol.nao tril_row, tril_col = cupy.tril_indices(nao) tril_row = cupy.asarray(tril_row) tril_col = cupy.asarray(tril_col) - + self.tril_row = tril_row self.tril_col = tril_col - + idx = np.arange(nao) self.diag_idx = cupy.asarray(idx*(idx+1)//2+idx) - + t0 = (logger.process_clock(), logger.perf_counter()) log = logger.new_logger(mol, mol.verbose) if auxmol is None: self.auxmol = auxmol = addons.make_auxmol(mol, self.auxbasis) - + if omega and omega > 1e-10: with auxmol.with_range_coulomb(omega): j2c_cpu = auxmol.intor('int2c2e', hermi=1) else: j2c_cpu = auxmol.intor('int2c2e', hermi=1) j2c = cupy.asarray(j2c_cpu) - t0 = log.timer_debug1('2c2e', *t0) + t0 = log.timer_debug1('2c2e', *t0) intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') intopt.build(direct_scf_tol, diag_block_with_triu=False, aosym=True, group_size=256) t1 = log.timer_debug1('prepare intopt', *t0) @@ -84,7 +89,7 @@ def build(self, direct_scf_tol=1e-14, omega=None): idx = w > LINEAR_DEP_TOL self.cd_low = (v[:,idx] / cupy.sqrt(w[idx])) self.cd_low = tag_array(self.cd_low, tag='eig') - + v = w = None naux = self.naux = self.cd_low.shape[1] log.debug('size of aux basis %d', naux) @@ -99,7 +104,7 @@ def get_jk(self, dm, hermi=1, with_j=True, with_k=True, if omega is None: return df_jk.get_jk(self, dm, hermi, with_j, with_k, direct_scf_tol) assert omega >= 0.0 - + # A temporary treatment for RSH-DF integrals key = '%.6f' % omega if key in self._rsh_df: @@ -109,7 +114,7 @@ def get_jk(self, dm, hermi=1, with_j=True, with_k=True, logger.info(self, 'Create RSH-DF object %s for omega=%s', rsh_df, omega) return df_jk.get_jk(rsh_df, dm, hermi, with_j, with_k, direct_scf_tol, omega=omega) - + def get_blksize(self, extra=0, nao=None): ''' extra for pre-calculated space for other variables @@ -124,7 +129,7 @@ def get_blksize(self, extra=0, nao=None): raise RuntimeError("Not enough GPU memory") return blksize - + def loop(self, blksize=None): ''' loop over all cderi and unpack @@ -137,7 +142,7 @@ def loop(self, blksize=None): rows = self.intopt.cderi_row cols = self.intopt.cderi_col buf_prefetch = None - + data_stream = cupy.cuda.stream.Stream(non_blocking=True) compute_stream = cupy.cuda.get_current_stream() #compute_stream = cupy.cuda.stream.Stream() @@ -160,7 +165,7 @@ def loop(self, blksize=None): yield buf2, buf.T compute_stream.wait_event(stop_event) cupy.cuda.Device().synchronize() - + if buf_prefetch is not None: buf = buf_prefetch @@ -204,7 +209,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False): # TODO: async allocate memory mem = cupy.cuda.alloc_pinned_memory(naux * npair * 8) cderi = np.ndarray([naux, npair], dtype=np.float64, order='C', buffer=mem) - + data_stream = cupy.cuda.stream.Stream(non_blocking=False) count = 0 nq = len(intopt.log_qs) @@ -241,22 +246,22 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False): if lj>1: ints_slices = cart2sph(ints_slices, axis=1, ang=lj) if li>1: ints_slices = cart2sph(ints_slices, axis=2, ang=li) - + i0, i1 = intopt.sph_ao_loc[cpi], intopt.sph_ao_loc[cpi+1] j0, j1 = intopt.sph_ao_loc[cpj], intopt.sph_ao_loc[cpj+1] - + row = intopt.ao_pairs_row[cp_ij_id] - i0 col = intopt.ao_pairs_col[cp_ij_id] - j0 if cpi == cpj: ints_slices = ints_slices + ints_slices.transpose([0,2,1]) ints_slices = ints_slices[:,col,row] - + if cd_low.tag == 'eig': cderi_block = cupy.dot(cd_low.T, ints_slices) ints_slices = None elif cd_low.tag == 'cd': cderi_block = solve_triangular(cd_low, ints_slices) - + ij0, ij1 = count, count+cderi_block.shape[1] count = ij1 if isinstance(cderi, cupy.ndarray): @@ -265,7 +270,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False): with data_stream: for i in range(naux): cderi_block[i].get(out=cderi[i,ij0:ij1]) - + t1 = log.timer_debug1(f'solve {cp_ij_id} / {nq}', *t1) cupy.cuda.Device().synchronize() diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py index 954bfa2e..eebec753 100644 --- a/gpu4pyscf/df/df_jk.py +++ b/gpu4pyscf/df/df_jk.py @@ -24,6 +24,7 @@ from pyscf.scf import dhf from pyscf.df import df_jk, addons from gpu4pyscf.lib.cupy_helper import contract, solve_triangular, take_last2d, transpose_sum, load_library, get_avail_mem +from gpu4pyscf.lib.utils import to_cpu from gpu4pyscf.dft import rks, numint from gpu4pyscf.scf import hf from gpu4pyscf.df import df, int3c2e @@ -85,7 +86,7 @@ def _density_fit(mf, auxbasis=None, with_df=None, only_dfj=False): ''' assert isinstance(mf, scf.hf.SCF) - + if with_df is None: if isinstance(mf, dhf.UHF): with_df = df.DF4C(mf.mol) @@ -120,7 +121,9 @@ class DensityFitting(df_jk._DFHF, mf_class): Set mf.with_df = None to switch off density fitting mode. See also the documents of class %s for other SCF attributes. ''' % mf_class - + + to_cpu = to_cpu + def __init__(self, mf, dfobj, only_dfj): self.__dict__.update(mf.__dict__) self._eri = None @@ -130,7 +133,7 @@ def __init__(self, mf, dfobj, only_dfj): self.with_df = dfobj self.only_dfj = only_dfj self._keys = self._keys.union(['with_df', 'only_dfj']) - + init_workflow = init_workflow def reset(self, mol=None): @@ -153,14 +156,14 @@ def get_jk(self, mol=None, dm=None, hermi=1, with_j=True, with_k=True, else: vj, vk = mf_class.get_jk(self, mol, dm, hermi, with_j, with_k, omega) return vj, vk - + def get_veff(self, mol=None, dm=None, dm_last=None, vhf_last=0, hermi=1): ''' effective potential ''' if mol is None: mol = self.mol if dm is None: dm = self.make_rdm1() - + # for DFT if mf_class == rks.RKS: return rks._get_veff(self, dm=dm) @@ -172,7 +175,7 @@ def get_veff(self, mol=None, dm=None, dm_last=None, vhf_last=0, hermi=1): else: vj, vk = self.get_jk(mol, dm, hermi=hermi) return vj - vk * .5 - + def energy_elec(self, dm=None, h1e=None, vhf=None): ''' electronic energy @@ -185,10 +188,10 @@ def energy_elec(self, dm=None, h1e=None, vhf=None): e1 = cupy.sum(h1e*dm) ecoul = self.ecoul exc = self.exc - e2 = ecoul + exc + e2 = ecoul + exc #logger.debug(self, f'E1 = {e1}, Ecoul = {ecoul}, Exc = {exc}') return e1+e2, e2 - + e1 = cupy.einsum('ij,ji->', h1e, dm).real e_coul = cupy.einsum('ij,ji->', vhf, dm).real * .5 self.scf_summary['e1'] = e1 @@ -204,7 +207,7 @@ def energy_tot(self, dm, h1e, vhf=None): e_tot = self.energy_elec(dm, h1e, vhf)[0] + nuc self.scf_summary['nuc'] = nuc.real return e_tot - + def nuc_grad_method(self): if mf_class == rks.RKS: from gpu4pyscf.df.grad import rks as rks_grad @@ -213,7 +216,7 @@ def nuc_grad_method(self): from gpu4pyscf.df.grad import rhf as rhf_grad return rhf_grad.Gradients(self) raise NotImplementedError() - + def Hessian(self): from gpu4pyscf.df.hessian import rhf, rks @@ -237,7 +240,7 @@ def _cderi(self, x): @property def auxbasis(self): return getattr(self.with_df, 'auxbasis', None) - + return DensityFitting(mf, with_df, only_dfj) def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e-14, omega=None): @@ -257,8 +260,8 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e- dms = dms_tag.reshape([-1,nao,nao]) nset = dms.shape[0] t0 = (logger.process_clock(), logger.perf_counter()) - if dfobj._cderi is None: - log.warn('CDERI not found, build...') + if dfobj._cderi is None: + log.warn('CDERI not found, build...') dfobj.build(direct_scf_tol=direct_scf_tol, omega=omega) nao, naux = dfobj.nao, dfobj.naux @@ -291,7 +294,7 @@ def get_j(cderi_sparse): blksize = dfobj.get_blksize(extra=nao*nocc) for cderi, cderi_sparse in dfobj.loop(blksize=blksize): # leading dimension is 1 - if with_j: + if with_j: vj += get_j(cderi_sparse) if with_k: rhok = contract('Lij,jk->Lki', cderi, occ_coeff) @@ -358,7 +361,7 @@ def _get_jk(dfobj, dm, hermi=1, with_j=True, with_k=True, else: rsh_df = dfobj._rsh_df[key] = copy.copy(dfobj).reset() logger.info(dfobj, 'Create RSH-DF object %s for omega=%s', rsh_df, omega) - + with rsh_df.mol.with_range_coulomb(omega): return get_jk(rsh_df, dm, hermi, with_j, with_k, direct_scf_tol) diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py index 037da620..19a78a06 100644 --- a/gpu4pyscf/df/grad/rhf.py +++ b/gpu4pyscf/df/grad/rhf.py @@ -21,7 +21,7 @@ from pyscf import lib, scf, gto from gpu4pyscf.scf.hf import _get_jk from gpu4pyscf.df import int3c2e -from gpu4pyscf.lib.utils import patch_cpu_kernel +from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu from gpu4pyscf.lib.cupy_helper import print_mem_info, solve_triangular, tag_array, unpack_tril, contract, load_library from gpu4pyscf.grad.rhf import _grad_elec from gpu4pyscf import __config__ @@ -51,12 +51,12 @@ def _get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omeg auxmol = with_df.auxmol intopt = with_df.intopt - + nao, naux = mol.nao, with_df.naux - + log = logger.new_logger(mol, mol.verbose) t0 = (logger.process_clock(), logger.perf_counter()) - + if isinstance(mf_grad.base, scf.rohf.ROHF): raise NotImplementedError() mo_coeff = cupy.asarray(mf_grad.base.mo_coeff) @@ -73,14 +73,14 @@ def _get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omeg cols = with_df.intopt.cderi_col dm_sparse = dm[rows, cols] dm_sparse[with_df.intopt.cderi_diag] *= .5 - + blksize = with_df.get_blksize() if with_j: rhoj = cupy.empty([naux]) if with_k: rhok = cupy.empty([naux, nocc, nocc], order='C') p0 = p1 = 0 - + for cderi, cderi_sparse in with_df.loop(blksize=blksize): p1 = p0 + cderi.shape[0] if with_j: @@ -136,7 +136,7 @@ def _get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omeg # rebuild with aosym intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, \ group_size_aux=block_size)#, group_size=block_size) - + # sph2cart for ao cart2sph = intopt.cart2sph orbo_cart = cart2sph @ orbo @@ -225,6 +225,8 @@ def _get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omeg class Gradients(rhf.Gradients): + to_cpu = to_cpu + device = 'gpu' get_jk = patch_cpu_kernel(rhf.Gradients.get_jk)(_get_jk) grad_elec = patch_cpu_kernel(rhf.Gradients.grad_elec)(_grad_elec) diff --git a/gpu4pyscf/df/grad/rks.py b/gpu4pyscf/df/grad/rks.py index 20bc74d0..af0f6a44 100644 --- a/gpu4pyscf/df/grad/rks.py +++ b/gpu4pyscf/df/grad/rks.py @@ -20,25 +20,24 @@ from pyscf.df.grad import rks from gpu4pyscf.grad import rks as rks_grad from gpu4pyscf.df.grad.rhf import _get_jk, _grad_elec -from gpu4pyscf.lib.utils import patch_cpu_kernel from gpu4pyscf.lib.cupy_helper import contract, tag_array from gpu4pyscf.lib import logger def _get_veff(ks_grad, mol=None, dm=None): - + '''Coulomb + XC functional ''' if mol is None: mol = ks_grad.mol if dm is None: dm = ks_grad.base.make_rdm1() t0 = (logger.process_clock(), logger.perf_counter()) - + mf = ks_grad.base ni = mf._numint if ks_grad.grids is not None: grids = ks_grad.grids else: grids = mf.grids - + if grids.coords is None: grids.build(sort_grids=False) #grids.build(with_non0tab=True) @@ -56,7 +55,7 @@ def _get_veff(ks_grad, mol=None, dm=None): raise NotImplementedError #enabling range-separated hybrids omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin) - + mem_now = lib.current_memory()[0] max_memory = max(2000, ks_grad.max_memory*.9-mem_now) if ks_grad.grid_response: @@ -80,12 +79,12 @@ def _get_veff(ks_grad, mol=None, dm=None): max_memory=max_memory, verbose=ks_grad.verbose) vxc += vnlc t0 = logger.timer(ks_grad, 'vxc total', *t0) - + # this can be moved into vxc calculations occ_coeff = cupy.asarray(mf.mo_coeff[:, mf.mo_occ>0.5], order='C') tmp = contract('nij,jk->nik', vxc, occ_coeff) vxc = 2.0*contract('nik,ik->ni', tmp, occ_coeff) - + aoslices = mol.aoslice_by_atom() vxc = [vxc[:,p0:p1].sum(axis=1) for p0, p1 in aoslices[:,2:]] vxc = cupy.asarray(vxc) @@ -96,7 +95,7 @@ def _get_veff(ks_grad, mol=None, dm=None): e1_aux = vjaux else: vj, vk, vjaux, vkaux = ks_grad.get_jk(mol, dm) - + if ks_grad.auxbasis_response: vk_aux = vkaux * hyb vk *= hyb @@ -105,11 +104,11 @@ def _get_veff(ks_grad, mol=None, dm=None): vk += vk_lr * (alpha - hyb) if ks_grad.auxbasis_response: vk_aux += vkaux_lr * (alpha - hyb) - + vxc += vj - vk * .5 if ks_grad.auxbasis_response: e1_aux = vjaux - vk_aux * .5 - + if ks_grad.auxbasis_response: logger.debug1(ks_grad, 'sum(auxbasis response) %s', e1_aux.sum(axis=0)) else: @@ -118,6 +117,8 @@ def _get_veff(ks_grad, mol=None, dm=None): return vxc class Gradients(rks.Gradients): + to_cpu = to_cpu + device = 'gpu' get_jk = patch_cpu_kernel(rks.Gradients.get_jk)(_get_jk) get_veff = patch_cpu_kernel(rks.Gradients.get_veff)(_get_veff) @@ -130,7 +131,7 @@ def get_j(self, mol=None, dm=None, hermi=0, omega=None): def get_k(self, mol=None, dm=None, hermi=0, omega=None): _, vk, _, vkaux = self.get_jk(mol, dm, with_j=False, omega=omega) return vk, vkaux - + def get_dispersion(self): if self.base.disp[:2].upper() == 'D3': from pyscf import lib @@ -139,19 +140,19 @@ def get_dispersion(self): d3 = disp.DFTD3Dispersion(self.mol, xc=self.base.xc, version=self.base.disp) _, g_d3 = d3.kernel() return g_d3 - + if self.base.disp[:2].upper() == 'D4': from pyscf.data.elements import charge atoms = numpy.array([ charge(a[0]) for a in self.mol._atom]) coords = self.mol.atom_coords() - + from pyscf import lib with lib.with_omp_threads(1): from dftd4.interface import DampingParam, DispersionModel model = DispersionModel(atoms, coords) res = model.get_dispersion(DampingParam(method=self.base.xc), grad=True) return res.get("gradient") - + def extra_force(self, atom_id, envs): if self.auxbasis_response: return envs['dvhf'].aux[atom_id] diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index 63e7696f..7fba97c5 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -41,6 +41,7 @@ from gpu4pyscf.lib.cupy_helper import contract, tag_array, release_gpu_stack from gpu4pyscf.df import int3c2e from gpu4pyscf.lib import logger +from gpu4pyscf.lib.utils import to_cpu def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None, max_memory=4000, verbose=None): @@ -68,7 +69,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, mocc = mo_coeff[:,mo_occ>0] mocc_2 = cupy.einsum('pi,i->pi', mocc, mo_occ[mo_occ>0]**.5) dm0 = cupy.dot(mocc, mocc.T) * 2 - + hcore_deriv = hessobj.hcore_generator(mol) # ------------------------------------ @@ -82,7 +83,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, naux = auxmol.nao auxslices = auxmol.aoslice_by_atom() aoslices = mol.aoslice_by_atom() - + if omega and omega > 1e-10: with auxmol.with_range_coulomb(omega): int2c = auxmol.intor('int2c2e', aosym='s1') @@ -91,13 +92,13 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, int2c = auxmol.intor('int2c2e', aosym='s1') int2c_ip1 = auxmol.intor('int2c2e_ip1', aosym='s1') t1 = log.timer_debug1('intermediate variables with int2c2e and int2c2e_ip1', *t1) - + # ================================ sorted AO begin =============================================== intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size_aux=128, group_size=128) sph_ao_idx = intopt.sph_ao_idx sph_aux_idx = intopt.sph_aux_idx - + mocc_2 = mocc_2[sph_ao_idx, :] dm0 = dm0[cupy.ix_(sph_ao_idx, sph_ao_idx)] dm0_tag = tag_array(dm0, occ_coeff=mocc_2) @@ -115,7 +116,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, if hessobj.auxbasis_response: hj_ao_aux = cupy.zeros([nao,naux,3,3]) hk_ao_aux = cupy.zeros([nao,naux,3,3]) - + # int3c contributions wj, wk_Pl_ = int3c2e.get_int3c2e_wjk(mol, auxmol, dm0_tag, omega=omega) rhoj0_P = contract('pq,q->p', int2c_inv, wj) @@ -172,20 +173,20 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, wk1_I = rhok0_tmp = None wk1_Pko = rhok1_Pko = int2c_tmp = None t1 = log.timer_debug1('intermediate variables with int3c2e_ip1', *t1) - + cupy.get_default_memory_pool().free_all_blocks() # int3c_ipip1 contributions hj_ao_diag, hk_ao_diag = int3c2e.get_int3c2e_ipip1_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, omega=omega) hj_ao_diag *= 2.0 t1 = log.timer_debug1('intermediate variables with int3c2e_ipip1', *t1) - # int3c_ipvip1 contributions + # int3c_ipvip1 contributions # (11|0), (0|00) without response of RI basis hj, hk = int3c2e.get_int3c2e_ipvip1_hjk(intopt, rhoj0_P, rhok0_P__, dm0_tag, omega=omega) hj_ao_ao += 2.0*hj hk_ao_ao += hk t1 = log.timer_debug1('intermediate variables with int3c2e_ipvip1', *t1) - + # int3c_ip1ip2 contributions # (10|1), (0|0)(0|00) if hessobj.auxbasis_response: @@ -219,7 +220,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, rho2c_0 = cupy.einsum('pij,qji->pq', rhok0_P__, rhok0_P__) hk_aux_diag -= .5 * cupy.einsum('pq,xpq->px', rho2c_0, int2c_ipip1).reshape(-1,3,3) int2c_ipip1 = None - + if omega and omega > 1e-10: with auxmol.with_range_coulomb(omega): int2c_ip1ip2 = auxmol.intor('int2c2e_ip1ip2', aosym='s1') @@ -239,7 +240,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, rhoj1 = cupy.einsum('px,pq->xpq', wj_ip2, int2c_inv) # (0|0)(1|00) rhoj0_01 = cupy.einsum('xp,pq->xpq', wj0_01, int2c_inv) # (0|1)(0|00) rhoj0_10 = cupy.einsum('p,xpq->xpq', rhoj0_P, int2c_ip1_inv) # (1|0)(0|00) - + hj_aux_aux += .5 * cupy.einsum('xpr,yqr->pqxy', rhoj0_10, wj0_10) # (00|0)(1|0), (0|1)(0|00) hj_aux_aux -= cupy.einsum('xpq,yq->pqxy', rhoj1, wj0_01) # (00|1), (1|0)(0|00) hj_aux_aux += .5 * cupy.einsum('xpq,qy->pqxy', rhoj1, wj_ip2) # (00|1), (1|00) @@ -261,7 +262,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, hk_aux_aux += cupy.einsum('xpq,yqp->pqxy', int2c_ip1_inv, rho2c0_10) # (00|0)(1|0)(1|0)(0|00) hk_aux_aux -= cupy.einsum('pqxy,pq->pqxy', rho2c1_10, int2c_inv) # (00|1)(1|0)(0|00) hk_aux_aux -= cupy.einsum('pqx,yqp->pqxy', rho2c_10, int2c_ip1_inv) # (00|1)(0|1)(0|00) - hk_aux_aux += .5 * cupy.einsum('pqxy,pq->pqxy', rho2c_11, int2c_inv) # (00|1)(1|00) + hk_aux_aux += .5 * cupy.einsum('pqxy,pq->pqxy', rho2c_11, int2c_inv) # (00|1)(1|00) rho2c_0 = rho2c_10 = rho2c_11 = rho2c0_10 = rho2c1_10 = rho2c0_11 = int2c_ip_ip = None wk_ip2_P__ = int2c_ip1_inv = None ao_idx = np.argsort(intopt.sph_ao_idx) @@ -273,11 +274,11 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, if hessobj.auxbasis_response: rev_ao_aux = cupy.ix_(ao_idx, aux_idx) hj_ao_aux = hj_ao_aux[rev_ao_aux] - if hessobj.auxbasis_response > 1: + if hessobj.auxbasis_response > 1: rev_aux_aux = cupy.ix_(aux_idx, aux_idx) hj_aux_diag = hj_aux_diag[aux_idx] hj_aux_aux = hj_aux_aux[rev_aux_aux] - + if with_k: hk_ao_diag = hk_ao_diag[ao_idx] hk_ao_ao = hk_ao_ao[rev_ao_ao] @@ -351,7 +352,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, e1[j0,i0] = e1[i0,j0].T ej[j0,i0] = ej[i0,j0].T ek[j0,i0] = ek[i0,j0].T - + log.timer('RHF partial hessian', *time0) return e1, ej, ek @@ -387,13 +388,13 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, auxmol = df.addons.make_auxmol(mol, auxbasis=mf.with_df.auxbasis) aoslices = mol.aoslice_by_atom() auxslices = auxmol.aoslice_by_atom() - + naux = auxmol.nao nao, nmo = mo_coeff.shape mocc = mo_coeff[:,mo_occ>0] nocc = mocc.shape[1] dm0 = cupy.dot(mocc, mocc.T) * 2 - + if omega and omega > 1e-10: with auxmol.with_range_coulomb(omega): int2c = auxmol.intor('int2c2e', aosym='s1') @@ -407,12 +408,12 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, sph_aux_idx = intopt.sph_aux_idx rev_ao_idx = np.argsort(intopt.sph_ao_idx) rev_aux_idx = np.argsort(intopt.sph_aux_idx) - + mocc = mocc[sph_ao_idx, :] mo_coeff = mo_coeff[sph_ao_idx,:] dm0 = dm0[cupy.ix_(sph_ao_idx, sph_ao_idx)] dm0_tag = tag_array(dm0, occ_coeff=mocc) - + int2c = int2c[cupy.ix_(sph_aux_idx, sph_aux_idx)] int2c_inv = cupy.linalg.pinv(int2c, rcond=1e-12) @@ -423,19 +424,19 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, if with_k: rhok0_P__ = contract('pq,qij->pij', int2c_inv, wk_P__) wj = wk_Pl_ = wk_P__ = int2c_inv = int2c = None - + # int3c_ip1 contributions vj1_buf, vk1_buf, vj1_ao, vk1_ao = int3c2e.get_int3c2e_ip1_vjk(intopt, rhoj0, rhok0_Pl_, dm0_tag, aoslices, omega=omega) vj1_buf = vj1_buf[cupy.ix_(numpy.arange(3), rev_ao_idx, rev_ao_idx)] vk1_buf = vk1_buf[cupy.ix_(numpy.arange(3), rev_ao_idx, rev_ao_idx)] - + vj1_int3c_ip1 = -cupy.einsum('nxiq,ip->nxpq', vj1_ao, mo_coeff) vk1_int3c_ip1 = -cupy.einsum('nxiq,ip->nxpq', vk1_ao, mo_coeff) vj1_ao = vk1_ao = None t0 = log.timer_debug1('Fock matrix due to int3c2e_ip1', *t0) # -------------------------- - # int3c_ip2 contribution + # int3c_ip2 contribution # -------------------------- cupy.get_default_memory_pool().free_all_blocks() if hessobj.auxbasis_response: @@ -493,13 +494,13 @@ def _ao2mo(mat): shl0, shl1, p0, p1 = aoslices[ia] vj1_ao = cupy.zeros([3,nao,nao]) vk1_ao = cupy.zeros([3,nao,nao]) - + vj1_ao[:,p0:p1,:] -= vj1_buf[:,p0:p1,:] vj1_ao[:,:,p0:p1] -= vj1_buf[:,p0:p1,:].transpose(0,2,1) if with_k: vk1_ao[:,p0:p1,:] -= vk1_buf[:,p0:p1,:] vk1_ao[:,:,p0:p1] -= vk1_buf[:,p0:p1,:].transpose(0,2,1) - + h1 = hcore_deriv(ia) h1 = _ao2mo(h1) vj1 = vj1_int3c[ia] + _ao2mo(vj1_ao) @@ -509,9 +510,12 @@ def _ao2mo(mat): class Hessian(rhf_hess.Hessian): '''Non-relativistic restricted Hartree-Fock hessian''' + + to_cpu = to_cpu + def __init__(self, mf): self.auxbasis_response = 1 rhf_hess.Hessian.__init__(self, mf) partial_hess_elec = partial_hess_elec - make_h1 = make_h1 \ No newline at end of file + make_h1 = make_h1 diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py index 50a046c0..75de5577 100644 --- a/gpu4pyscf/df/hessian/rks.py +++ b/gpu4pyscf/df/hessian/rks.py @@ -33,6 +33,7 @@ from gpu4pyscf.hessian import rks as rks_hess from gpu4pyscf.df.hessian import rhf as df_rhf_hess from gpu4pyscf.lib import logger +from gpu4pyscf.lib.utils import to_cpu def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None, max_memory=4000, verbose=None): @@ -63,7 +64,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst, max_memory, verbose, True, omega=omega)[2] de2 -= (alpha - hyb) * ek_lr - + max_memory = None veff_diag = rks_hess._get_vxc_diag(hessobj, mo_coeff, mo_occ, max_memory) t1 = log.timer_debug1('computing veff_diag', *t1) @@ -111,10 +112,16 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): class Hessian(rks_hess.Hessian): '''Non-relativistic RKS hessian''' + def __init__(self, mf): self.auxbasis_response = 1 rks_hess.Hessian.__init__(self, mf) + def to_cpu(self): + from pyscf.df.hessian.rks import Hessian + obj = to_cpu(self) + return obj.view(Hessian) + partial_hess_elec = partial_hess_elec make_h1 = make_h1 diff --git a/gpu4pyscf/dft/gen_grid.py b/gpu4pyscf/dft/gen_grid.py index 7a71b6ef..34c55711 100644 --- a/gpu4pyscf/dft/gen_grid.py +++ b/gpu4pyscf/dft/gen_grid.py @@ -38,6 +38,8 @@ from cupyx.scipy.spatial.distance import cdist from gpu4pyscf.dft import radi from gpu4pyscf.lib.cupy_helper import load_library +from gpu4pyscf.lib.utils import to_cpu + libdft = lib.load_library('libdft') libgdft = load_library('libgdft') @@ -289,7 +291,7 @@ def gen_atomic_grids(mol, atom_grid={}, radi_method=radi.gauss_chebyshev, libdft.MakeAngularGrid(grid.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(n)) ang_grids[n] = grid - + angs = numpy.array(angs) coords = [] vol = [] @@ -309,7 +311,7 @@ def gen_atomic_grids(mol, atom_grid={}, radi_method=radi.gauss_chebyshev, ''' atom_grids_tab[symb] = (cupy.vstack(coords), cupy.hstack(vol)) #print(atom_grids_tab[symb][0].shape, atom_grids_tab[symb][1].shape) - + return atom_grids_tab def get_partition(mol, atom_grids_tab, @@ -346,7 +348,7 @@ def get_partition(mol, atom_grids_tab, for i in range(mol.natm) for j in range(mol.natm)]) p_radii_table = f_radii_table.ctypes.data_as(ctypes.c_void_p) - + def gen_grid_partition0(coords): coords = numpy.asarray(coords, order='F') ngrids = coords.shape[0] @@ -365,14 +367,14 @@ def gen_grid_partition(coords): rinv = 1.0/atm_dist cupy.fill_diagonal(rinv, 0.0) g = rinv[:,:,None] * r12 - + if f_radii_adjust is not None: g = f_radii_adjust(g) - + g = (3.0 - g*g) * g * .5 g = (3.0 - g*g) * g * .5 g = (3.0 - g*g) * g * .5 - + pbecke = cupy.prod(0.5 * (1.0 - g), axis=1) * 2 return pbecke else: @@ -457,7 +459,7 @@ def arg_group_grids(mol, coords, box_size=GROUP_BOX_SIZE): box_ids[box_ids[:,0] > boxes[0], 0] = boxes[0] box_ids[box_ids[:,1] > boxes[1], 1] = boxes[1] box_ids[box_ids[:,2] > boxes[2], 2] = boxes[2] - + rev_idx, counts = numpy.unique(box_ids, axis=0, return_inverse=True, return_counts=True)[1:3] return rev_idx.argsort(kind='stable') @@ -550,6 +552,8 @@ class Grids(lib.StreamObject): alignment = ALIGNMENT_UNIT cutoff = CUTOFF + to_cpu = to_cpu + def __init__(self, mol): self.mol = mol self.stdout = mol.stdout @@ -593,7 +597,7 @@ def dump_flags(self, verbose=None): if self.atom_grid: logger.info(self, 'User specified grid scheme %s', str(self.atom_grid)) return self - + def build(self, mol=None, with_non0tab=False, sort_grids=True, **kwargs): if mol is None: mol = self.mol if self.verbose >= logger.WARN: @@ -718,4 +722,4 @@ def _default_ang(nuc, level=3): def _padding_size(ngrids, alignment): if alignment <= 1: return 0 - return (ngrids + alignment - 1) // alignment * alignment - ngrids \ No newline at end of file + return (ngrids + alignment - 1) // alignment * alignment - ngrids diff --git a/gpu4pyscf/dft/gks.py b/gpu4pyscf/dft/gks.py index fa45552d..4b79a992 100644 --- a/gpu4pyscf/dft/gks.py +++ b/gpu4pyscf/dft/gks.py @@ -18,9 +18,11 @@ from pyscf.dft import gks from gpu4pyscf.dft import numint from gpu4pyscf.scf.ghf import get_jk, _eigh -from gpu4pyscf.lib.utils import patch_cpu_kernel +from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu class GKS(gks.GKS): + to_cpu = to_cpu + def __init__(self, mol, xc='LDA,VWN'): super().__init__(mol, xc) self._numint = numint.NumInt() diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py index e26043a7..996a8096 100644 --- a/gpu4pyscf/dft/numint.py +++ b/gpu4pyscf/dft/numint.py @@ -25,7 +25,7 @@ from pyscf.dft import numint from pyscf.gto.eval_gto import NBINS, CUTOFF, make_screen_index from gpu4pyscf.scf.hf import basis_seg_contraction -from gpu4pyscf.lib.utils import patch_cpu_kernel +from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem, load_library, add_sparse, release_gpu_stack from gpu4pyscf.dft import xc_deriv, xc_alias, libxc from gpu4pyscf import __config__ @@ -1074,7 +1074,10 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, yield ao, sindex, weight, coords class NumInt(numint.NumInt): + to_cpu = to_cpu + device = 'gpu' + def __init__(self, xc='LDA'): super().__init__() self.gdftopt = None diff --git a/gpu4pyscf/dft/rks.py b/gpu4pyscf/dft/rks.py index 38f55083..69e83d24 100644 --- a/gpu4pyscf/dft/rks.py +++ b/gpu4pyscf/dft/rks.py @@ -27,7 +27,7 @@ from gpu4pyscf.scf import diis from gpu4pyscf.lib import logger from gpu4pyscf.dft import numint, gen_grid -from gpu4pyscf.lib.utils import patch_cpu_kernel +from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu from gpu4pyscf.lib.cupy_helper import load_library, tag_array libcupy_helper = load_library('libcupy_helper') @@ -36,18 +36,18 @@ def prune_small_rho_grids_(ks, mol, dm, grids): rho = ks._numint.get_rho(mol, dm, grids, ks.max_memory) - + threshold = ks.small_rho_cutoff '''Prune grids if the electron density on the grid is small''' if threshold == 0: return grids mol = grids.mol - + n = cupy.dot(rho, grids.weights) - if abs(n-mol.nelectron) < gen_grid.NELEC_ERROR_TOL*n: + if abs(n-mol.nelectron) < gen_grid.NELEC_ERROR_TOL*n: rho *= grids.weights idx = cupy.abs(rho) > threshold / grids.weights.size - + logger.debug(grids, 'Drop grids %d', grids.weights.size - cupy.count_nonzero(idx)) grids.coords = cupy.asarray(grids.coords [idx], order='C') grids.weights = cupy.asarray(grids.weights[idx], order='C') @@ -80,9 +80,9 @@ def initialize_grids(ks, mol=None, dm=None): # dm.ndim == 2 indicates ground state isinstance(dm, cupy.ndarray) and dm.ndim == 2): # Filter grids the first time setup grids - ks.grids = prune_small_rho_grids_(ks, ks.mol, dm, ks.grids) + ks.grids = prune_small_rho_grids_(ks, ks.mol, dm, ks.grids) t0 = logger.timer_debug1(ks, 'setting up grids', *t0) - + is_nlc = ks.nlc or ks._numint.libxc.is_nlc(ks.xc) if is_nlc and ks.nlcgrids.coords is None: if ks.nlcgrids.coords is None: @@ -125,14 +125,14 @@ def _get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1): matrix Veff = J + Vxc. Veff can be a list matrices, if the input dm is a list of density matrices. ''' - + if mol is None: mol = ks.mol if dm is None: dm = ks.make_rdm1() t0 = (logger.process_clock(), logger.perf_counter()) if ks.grids.coords is None: ks.grids.ao_values = None initialize_grids(ks, mol, dm) - + if hasattr(ks, 'screen_tol') and ks.screen_tol is not None: ks.direct_scf_tol = ks.screen_tol ground_state = (isinstance(dm, cupy.ndarray) and dm.ndim == 2) @@ -151,7 +151,7 @@ def _get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1): xc = ks.nlc n, enlc, vnlc = ni.nr_nlc_vxc(mol, ks.nlcgrids, xc, dm, max_memory=max_memory) - + exc += enlc vxc += vnlc #logger.debug(ks, 'nelec by numeric integration = %s', n) @@ -168,7 +168,7 @@ def _get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1): vj += vhf_last.vj else: vj = ks.get_j(mol, dm, hermi) - + vxc += vj else: if (ks._eri is None and ks.direct_scf and @@ -192,7 +192,7 @@ def _get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1): vxc += vj - vk * .5 if ground_state: exc -= cupy.einsum('ij,ji', dm, vk).real * .5 * .5 - + if ground_state: ecoul = cupy.einsum('ij,ji', dm, vj).real * .5 else: @@ -204,6 +204,8 @@ def _get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1): return vxc class RKS(rks.RKS, scf.hf.RHF): + to_cpu = to_cpu + def __init__(self, mol, xc='LDA,VWN', disp=None): super().__init__(mol, xc) self._numint = numint.NumInt(xc=xc) @@ -217,20 +219,20 @@ def device(self): @device.setter def device(self, value): self._numint.device = value - + def get_dispersion(self): if self.disp is None: return 0.0 - + if self.disp[:2].upper() == 'D3': - # multi-threads in DFTD3 conflicts with PyTorch, set it to be 1 for safty + # multi-threads in DFTD3 conflicts with PyTorch, set it to be 1 for safty from pyscf import lib with lib.with_omp_threads(1): import dftd3.pyscf as disp d3 = disp.DFTD3Dispersion(self.mol, xc=self.xc, version=self.disp) e_d3, _ = d3.kernel() return e_d3 - + if self.disp[:2].upper() == 'D4': from pyscf.data.elements import charge atoms = numpy.array([ charge(a[0]) for a in self.mol._atom]) @@ -241,7 +243,7 @@ def get_dispersion(self): model = DispersionModel(atoms, coords) res = model.get_dispersion(DampingParam(method=self.xc), grad=False) return res.get("energy") - + def reset(self, mol=None): pyscf_hf.SCF.reset(self, mol) self.grids.reset(mol) @@ -253,18 +255,17 @@ def energy_elec(self, dm=None, h1e=None, vhf=None): if dm is None: dm = self.make_rdm1() if h1e is None: h1e = self.get_hcore() if vhf is None: vhf = self.get_veff(self.mol, dm) - + e1 = cupy.sum(h1e*dm) ecoul = self.ecoul exc = self.exc e2 = ecoul + exc return e1+e2, e2 - + def energy_tot(self, dm, h1e, vhf=None): nuc = self.energy_nuc() e_tot = self.energy_elec(dm, h1e, vhf)[0] + nuc self.scf_summary['nuc'] = nuc.real return e_tot - + get_veff = patch_cpu_kernel(rks.RKS.get_veff)(_get_veff) - \ No newline at end of file diff --git a/gpu4pyscf/dft/roks.py b/gpu4pyscf/dft/roks.py index 8f8c3e71..495eca39 100644 --- a/gpu4pyscf/dft/roks.py +++ b/gpu4pyscf/dft/roks.py @@ -18,9 +18,11 @@ from pyscf.dft import roks from gpu4pyscf.dft import numint from gpu4pyscf.scf.hf import _get_jk, _eigh -from gpu4pyscf.lib.utils import patch_cpu_kernel +from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu class ROKS(roks.ROKS): + to_cpu = to_cpu + def __init__(self, mol, xc='LDA,VWN'): super().__init__(mol, xc) self._numint = numint.NumInt() diff --git a/gpu4pyscf/dft/uks.py b/gpu4pyscf/dft/uks.py index 1d094a96..d5c22843 100644 --- a/gpu4pyscf/dft/uks.py +++ b/gpu4pyscf/dft/uks.py @@ -18,9 +18,11 @@ from pyscf.dft import uks from gpu4pyscf.dft import numint from gpu4pyscf.scf.hf import _get_jk, _eigh -from gpu4pyscf.lib.utils import patch_cpu_kernel +from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu class UKS(uks.UKS): + to_cpu = to_cpu + def __init__(self, mol, xc='LDA,VWN'): super().__init__(mol, xc) self._numint = numint.NumInt() diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py index e2976ef1..af1190de 100644 --- a/gpu4pyscf/grad/rhf.py +++ b/gpu4pyscf/grad/rhf.py @@ -23,7 +23,7 @@ from pyscf.grad import rhf from gpu4pyscf.lib.cupy_helper import load_library from gpu4pyscf.scf.hf import _VHFOpt -from gpu4pyscf.lib.utils import patch_cpu_kernel +from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu from gpu4pyscf.lib.cupy_helper import tag_array from gpu4pyscf.df import int3c2e #TODO: move int3c2e to out of df @@ -548,6 +548,8 @@ def calculate_h1e(h1_gpu, s1_gpu): return de.get() class Gradients(rhf.Gradients): + to_cpu = to_cpu + device = 'gpu' grad_elec = patch_cpu_kernel(rhf.Gradients.grad_elec)(_grad_elec) grad_nuc = patch_cpu_kernel(rhf.Gradients.grad_nuc)(_grad_nuc) diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index 4774e412..db22b716 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -15,7 +15,7 @@ # # Author: Qiming Sun # modified by: Xiaojie Wu -# +# ''' @@ -38,13 +38,14 @@ from gpu4pyscf.scf import cphf from gpu4pyscf.lib.cupy_helper import contract, tag_array from gpu4pyscf.lib import logger +from gpu4pyscf.lib.utils import to_cpu def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, mo1=None, mo_e1=None, h1ao=None, atmlst=None, max_memory=4000, verbose=None): log = logger.new_logger(hessobj, verbose) time0 = t1 = (logger.process_clock(), logger.perf_counter()) - + mol = hessobj.mol mf = hessobj.base if mo_energy is None: mo_energy = mf.mo_energy @@ -84,10 +85,10 @@ def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, s1ao = cupy.zeros((3,nao,nao)) s1ao[:,p0:p1] += s1a[:,p0:p1] s1ao[:,:,p0:p1] += s1a[:,p0:p1].transpose(0,2,1) - + tmp = cupy.einsum('xpq,pi->xiq', s1ao, mocc) s1oo = cupy.einsum('xiq,qj->xij', tmp, mocc) - + #s1oo = cupy.einsum('xpq,pi,qj->xij', s1ao, mocc, mocc) s1mo = cupy.einsum('xij,ip->xpj', s1ao, mo_coeff) @@ -105,7 +106,7 @@ def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, de2[j0,i0] = de2[i0,j0].T log.timer('RHF hessian', *time0) - + return de2 def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, @@ -127,7 +128,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, if mo_occ is None: mo_occ = mf.mo_occ if mo_coeff is None: mo_coeff = mf.mo_coeff if atmlst is None: atmlst = range(mol.natm) - + nao, nmo = mo_coeff.shape mocc = mo_coeff[:,mo_occ>0] dm0 = numpy.dot(mocc, mocc.T) * 2 @@ -176,7 +177,7 @@ def _partial_hess_ejk(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, vj1 = vj1.reshape(3,3,nao,nao) vk1 = vk1.reshape(3,3,nao,nao) t1 = log.timer_debug1('contracting int2e_ipvip1 for atom %d'%ia, *t1) - + ej[i0,i0] += cupy.einsum('xypq,pq->xy', vj1_diag[:,:,p0:p1], dm0[p0:p1])*2 ek[i0,i0] += cupy.einsum('xypq,pq->xy', vk1_diag[:,:,p0:p1], dm0[p0:p1]) e1[i0,i0] -= cupy.einsum('xypq,pq->xy', s1aa[:,:,p0:p1], dme0[p0:p1])*2 @@ -284,7 +285,7 @@ def _get_jk(mol, intor, comp, aosym, script_dms, intor = mol._add_suffix(intor) scripts = script_dms[::2] dms = script_dms[1::2] - + vs = _vhf.direct_bindm(intor, aosym, scripts, dms, comp, mol._atm, mol._bas, mol._env, vhfopt=vhfopt, cintopt=cintopt, shls_slice=shls_slice) @@ -368,7 +369,7 @@ def gen_vind(mf, mo_coeff, mo_occ): mocc = mo_coeff[:,mo_occ>0] nocc = mocc.shape[1] vresp = mf.gen_response(mo_coeff, mo_occ, hermi=1) - + def fx(mo1): mo1 = cupy.asarray(mo1) mo1 = mo1.reshape(-1,nmo,nocc) @@ -471,6 +472,9 @@ def h_op(x): class Hessian(rhf_hess.Hessian): '''Non-relativistic restricted Hartree-Fock hessian''' + + to_cpu = to_cpu + def __init__(self, scf_method): self.verbose = scf_method.verbose self.stdout = scf_method.stdout diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py index b7059599..ab5cfec6 100644 --- a/gpu4pyscf/hessian/rks.py +++ b/gpu4pyscf/hessian/rks.py @@ -32,6 +32,7 @@ # import pyscf.grad.rks to activate nuc_grad_method method from gpu4pyscf.grad import rks # noqa +from gpu4pyscf.lib.utils import to_cpu def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, @@ -62,7 +63,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst, max_memory, verbose, abs(hyb) > 1e-10) de2 += ej - hyb * ek # (A,B,dR_A,dR_B) - + mem_now = lib.current_memory()[0] max_memory = max(2000, mf.max_memory*.9-mem_now) veff_diag = _get_vxc_diag(hessobj, mo_coeff, mo_occ, max_memory) @@ -189,7 +190,7 @@ def _get_vxc_diag(hessobj, mo_coeff, mo_occ, max_memory): grids = mf.grids if grids.coords is None: grids.build(with_non0tab=True) - + # move data to GPU mo_occ = cupy.asarray(mo_occ) mo_coeff = cupy.asarray(mo_coeff) @@ -199,11 +200,11 @@ def _get_vxc_diag(hessobj, mo_coeff, mo_occ, max_memory): xctype = ni._xc_type(mf.xc) shls_slice = (0, mol.nbas) ao_loc = mol.ao_loc_nr() - + opt = getattr(ni, 'gdftopt', None) if opt is None: raise RuntimeError("DFT Options are not initialized") - + coeff = cupy.asarray(opt.coeff) mo_coeff = coeff @ mo_coeff nao = mo_coeff.shape[0] @@ -236,17 +237,17 @@ def contract_(mat, ao, aoidx, wv, mask): wv = weight * vxc #:aow = numpy.einsum('npi,np->pi', ao[:4], wv[:4]) aow = numint._scale_ao(ao[:4], wv[:4]) - + for i in range(6): vmat[i] += numint._dot_ao_ao(mol, ao[i+4], aow, mask, shls_slice, ao_loc) - + contract_(vmat[0], ao, [XXX,XXY,XXZ], wv, mask) contract_(vmat[1], ao, [XXY,XYY,XYZ], wv, mask) contract_(vmat[2], ao, [XXZ,XYZ,XZZ], wv, mask) contract_(vmat[3], ao, [XYY,YYY,YYZ], wv, mask) contract_(vmat[4], ao, [XYZ,YYZ,YZZ], wv, mask) contract_(vmat[5], ao, [XZZ,YZZ,ZZZ], wv, mask) - + rho = vxc = wv = aow = None elif xctype == 'MGGA': def contract_(mat, ao, aoidx, wv, mask): @@ -281,19 +282,19 @@ def contract_(mat, ao, aoidx, wv, mask): vmat[i] += numint._dot_ao_ao(mol, ao[j], aow[1], mask, shls_slice, ao_loc) for i, j in enumerate([XXZ, XYZ, XZZ, YYZ, YZZ, ZZZ]): vmat[i] += numint._dot_ao_ao(mol, ao[j], aow[2], mask, shls_slice, ao_loc) - + vmat = vmat[[0,1,2, 1,3,4, 2,4,5]] vmat = cupy.einsum('pi,npq,qj->nij', coeff, vmat, coeff) - + return vmat.reshape(3,3,nao_sph,nao_sph) def _make_dR_rho1(ao, ao_dm0, atm_id, aoslices, xctype): # TODO: hard coded ao = ao.transpose([0,2,1]) ao_dm0 = [x.T for x in ao_dm0] - + p0, p1 = aoslices[atm_id][2:] ngrids = ao[0].shape[0] if xctype == 'GGA': @@ -316,7 +317,7 @@ def _make_dR_rho1(ao, ao_dm0, atm_id, aoslices, xctype): rho1[:,4] *= .5 else: raise RuntimeError - + ao_dm0_0 = ao_dm0[0][:,p0:p1] # (d_X \nabla_x mu) nu DM_{mu,nu} rho1[:,0] = cupy.einsum('xpi,pi->xp', ao[1:4,:,p0:p1], ao_dm0_0) @@ -377,7 +378,7 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): coeff = cupy.asarray(opt.coeff) dm0 = mf.make_rdm1(mo_coeff, mo_occ) - + vmat = cupy.zeros((mol.natm,3,3,nao,nao)) ipip = cupy.zeros((3,3,nao,nao)) if xctype == 'LDA': @@ -432,12 +433,12 @@ def _get_vxc_deriv2(hessobj, mo_coeff, mo_occ, max_memory): aow = [numint._scale_ao(ao[:4], wv[i,:4]) for i in range(3)] _d1d2_dot_(vmat[ia], mol, ao[1:4], aow, mask, ao_loc, False) ao_dm0 = aow = None - + for ia in range(mol.natm): p0, p1 = aoslices[ia][2:] vmat[ia,:,:,:,p0:p1] += ipip[:,:,:,p0:p1] vmat[ia,:,:,:,p0:p1] += ipip[:,:,p0:p1].transpose(1,0,3,2) - + elif xctype == 'MGGA': XX, XY, XZ = 4, 5, 6 YX, YY, YZ = 5, 7, 8 @@ -494,7 +495,7 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): grids = hessobj.grids else: grids = mf.grids - + if grids.coords is None: grids.build(with_non0tab=True) @@ -515,7 +516,7 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): raise RuntimeError("DFT Options are not initialized") coeff = cupy.asarray(opt.coeff) dm0 = mf.make_rdm1(mo_coeff, mo_occ) - + v_ip = cupy.zeros((3,nao,nao)) vmat = cupy.zeros((mol.natm,3,nao,nao)) max_memory = max(2000, max_memory-vmat.size*8/1e6) @@ -592,14 +593,14 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): aow = [numint._scale_ao(ao[j], wv[i,4]) for i in range(3)] vmat[ia] += rks_grad._d1_dot_(aow, ao[j].T) ao_dm0 = aow = None - + for ia in range(mol.natm): p0, p1 = aoslices[ia][2:] vmat[ia,:,p0:p1] += v_ip[:,p0:p1] vmat[ia] = -vmat[ia] - vmat[ia].transpose(0,2,1) vmat = cupy.einsum("kxij,jq->kxiq", vmat, mocc) vmat = cupy.einsum("kxiq,ip->kxpq", vmat, mo_coeff) - + return vmat @@ -611,6 +612,11 @@ def __init__(self, mf): self.grid_response = False self._keys = self._keys.union(['grids']) + def to_cpu(self): + from pyscf.hessian.rks import Hessian + obj = to_cpu(self) + return obj.view(Hessian) + def get_dispersion(self): if self.base.disp[:2].upper() == 'D3': from pyscf import lib @@ -627,7 +633,7 @@ def get_dispersion(self): mol.set_geom_(coords, unit='Bohr') d3 = disp.DFTD3Dispersion(mol, xc=self.base.xc, version=self.base.disp) _, g1 = d3.kernel() - + coords[i,j] -= 2.0*eps mol.set_geom_(coords, unit='Bohr') d3 = disp.DFTD3Dispersion(mol, xc=self.base.xc, version=self.base.disp) @@ -636,7 +642,7 @@ def get_dispersion(self): coords[i,j] += eps h_d3[i,:,j,:] = (g1 - g2)/(2.0*eps) return h_d3 - + if self.base.disp[:2].upper() == 'D4': from pyscf.data.elements import charge atoms = numpy.array([ charge(a[0]) for a in self.mol._atom]) @@ -656,7 +662,7 @@ def get_dispersion(self): model = DispersionModel(atoms, coords) res = model.get_dispersion(params, grad=True) g1 = res.get("gradient") - + coords[i,j] -= 2.0*eps mol.set_geom_(coords, unit='Bohr') model = DispersionModel(atoms, coords) @@ -672,4 +678,4 @@ def get_dispersion(self): make_h1 = make_h1 from pyscf import dft -dft.rks.RKS.Hessian = dft.rks_symm.RKS.Hessian = lib.class_as_method(Hessian) \ No newline at end of file +dft.rks.RKS.Hessian = dft.rks_symm.RKS.Hessian = lib.class_as_method(Hessian) diff --git a/gpu4pyscf/lib/utils.py b/gpu4pyscf/lib/utils.py index 8fa7081a..982a824d 100644 --- a/gpu4pyscf/lib/utils.py +++ b/gpu4pyscf/lib/utils.py @@ -16,6 +16,7 @@ # along with this program. If not, see . import functools +import cupy def patch_cpu_kernel(cpu_kernel): '''Generate a decorator to patch cpu function to gpu function''' @@ -29,3 +30,22 @@ def hybrid_kernel(method, *args, **kwargs): hybrid_kernel.__package__ = 'gpu4pyscf' return hybrid_kernel return patch + +def to_cpu(method): + # Search for the class in pyscf closest to the one defined in gpu4pyscf + for pyscf_cls in method.__class__.__mro__: + if 'gpu4pyscf' not in pyscf_cls: + break + method = method.view(pyscf_cls) + + keys = [cls._keys for cls in pyscf_cls.__mro__[:-1] if hasattr(cls, '_keys')] + if keys: + keys = set(keys).intersection(method.__dict__) + + for key in keys: + val = getattr(method, key) + if isinstance(val, cupy.ndarray): + setattr(method, key, cupy.asnumpy(val)) + elif hasattr(val, 'to_cpu'): + setattr(method, key, val.to_cpu()) + return method diff --git a/gpu4pyscf/scf/ghf.py b/gpu4pyscf/scf/ghf.py index 1f90ff65..4ce11c2c 100644 --- a/gpu4pyscf/scf/ghf.py +++ b/gpu4pyscf/scf/ghf.py @@ -18,7 +18,7 @@ from pyscf.scf import ghf from gpu4pyscf.scf.hf import get_jk as _get_jk_nr from gpu4pyscf.scf.hf import _eigh -from gpu4pyscf.lib.utils import patch_cpu_kernel +from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu def get_jk(mol=None, dm=None, hermi=0, with_j=True, with_k=True, omega=None): @@ -37,6 +37,8 @@ def jkbuild(mol, dm, hermi, with_j, with_k, omega=None): return vj, vk class GHF(ghf.GHF): + to_cpu = to_cpu + device = 'gpu' @patch_cpu_kernel(ghf.GHF.get_jk) diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py index e9ac93fc..61956ead 100644 --- a/gpu4pyscf/scf/hf.py +++ b/gpu4pyscf/scf/hf.py @@ -28,7 +28,7 @@ from pyscf.lib import logger from pyscf.scf import hf, jk, _vhf from gpu4pyscf import lib -from gpu4pyscf.lib.utils import patch_cpu_kernel +from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu from gpu4pyscf.lib.cupy_helper import eigh, load_library, tag_array from gpu4pyscf.scf import diis @@ -86,13 +86,13 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None, l_ctr_ao_locs = vhfopt.mol.ao_loc[l_ctr_shell_locs] dm_ctr_cond = np.max( [pyscf_lib.condense('absmax', x, l_ctr_ao_locs) for x in dms.get()], axis=0) - + dm_shl = cupy.zeros([l_ctr_shell_locs[-1], l_ctr_shell_locs[-1]]) assert dms.flags.c_contiguous size_l = np.array([1,3,6,10,15,21,28]) l_ctr = vhfopt.uniq_l_ctr[:,0] r = 0 - for i, li in enumerate(l_ctr): + for i, li in enumerate(l_ctr): i0 = l_ctr_ao_locs[i] i1 = l_ctr_ao_locs[i+1] ni_shls = (i1-i0)//size_l[li] @@ -134,7 +134,7 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None, dm_ctr_cond[cpi,cpl], dm_ctr_cond[cpj,cpl]) if sub_dm_cond < direct_scf_tol * 1e3: continue - + #log_cutoff = np.log(direct_scf_tol / sub_dm_cond) log_cutoff = np.log(direct_scf_tol) sub_dm_cond = np.log(sub_dm_cond) @@ -224,11 +224,11 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None, for i, v in enumerate(vk1): vk[i] += coeff.T.dot(v).dot(coeff) cput0 = log.timer_debug1('get_jk pass 2 for l>4 basis on cpu', *cput0) - + if FREE_CUPY_CACHE: coeff = dms = None cupy.get_default_memory_pool().free_all_blocks() - + if dm0.ndim == 2: if with_j: vj = vj[0] @@ -274,7 +274,7 @@ def _get_jk(mf, mol=None, dm=None, hermi=1, with_j=True, with_k=True, getattr(mf.opt, '_dmcondname', 'CVHFsetnr_direct_scf_dm')) vhfopt.build(mf.direct_scf_tol) mf._opt_gpu_omega = vhfopt - + vj, vk = get_jk(mol, dm, hermi, vhfopt, with_j, with_k, omega, verbose=log) log.timer('vj and vk on gpu', *cput0) return vj, vk @@ -362,7 +362,7 @@ def _kernel(mf, conv_tol=1e-10, conv_tol_grad=None, if(conv_tol_grad is None): conv_tol_grad = conv_tol**.5 logger.info(mf, 'Set gradient conv threshold to %g', conv_tol_grad) - + if(dm0 is None): dm0 = mf.get_init_guess(mol) dm = cupy.asarray(dm0, order='C') @@ -380,13 +380,13 @@ def _kernel(mf, conv_tol=1e-10, conv_tol_grad=None, else: h1e = cupy.asarray(mf.get_hcore(mol)) s1e = cupy.asarray(mf.get_ovlp(mol)) - + vhf = mf.get_veff(mol, dm) e_tot = mf.energy_tot(dm, h1e, vhf) logger.info(mf, 'init E= %.15g', e_tot) t1 = log.timer_debug1('total prep', *t0) scf_conv = False - + if isinstance(mf.diis, lib.diis.DIIS): mf_diis = mf.diis elif mf.diis: @@ -398,13 +398,13 @@ def _kernel(mf, conv_tol=1e-10, conv_tol_grad=None, _, mf_diis.Corth = mf.eig(fock, s1e) else: mf_diis = None - + t_beg = time.time() for cycle in range(mf.max_cycle): t0 = (logger.process_clock(), logger.perf_counter()) dm_last = dm last_hf_e = e_tot - + f = mf.get_fock(h1e, s1e, vhf, dm, cycle, mf_diis); t1 = log.timer_debug1('DIIS', *t0) mo_energy, mo_coeff = eigh(f, s1e); t1 = log.timer_debug1('eig', *t1) mo_occ = mf.get_occ(mo_energy, mo_coeff) @@ -412,7 +412,7 @@ def _kernel(mf, conv_tol=1e-10, conv_tol_grad=None, vhf = mf.get_veff(mol, dm, dm_last, vhf); t1 = log.timer_debug1('veff', *t1) e_tot = mf.energy_tot(dm, h1e, vhf); t1 = log.timer_debug1('energy', *t1) - norm_ddm = cupy.linalg.norm(dm-dm_last) + norm_ddm = cupy.linalg.norm(dm-dm_last) t1 = log.timer_debug1('total', *t0) logger.info(mf, 'cycle= %d E= %.15g delta_E= %4.3g |ddm|= %4.3g', cycle+1, e_tot, e_tot-last_hf_e, norm_ddm) @@ -421,10 +421,10 @@ def _kernel(mf, conv_tol=1e-10, conv_tol_grad=None, if(e_diff < conv_tol and norm_gorb < conv_tol_grad): scf_conv = True break - + if(cycle == mf.max_cycle): logger.warn("SCF failed to converge") - + t_end = time.time() mf.scf_time = t_end - t_beg # for dispersion correction @@ -521,7 +521,7 @@ def _quad_moment(mf, mol=None, dm=None, unit='Debye-Ang'): nao = mol.nao with mol.with_common_orig((0,0,0)): ao_quad = mol.intor_symmetric('int1e_rr').reshape(3,3,nao,nao) - + el_quad = np.einsum('xyij,ji->xy', ao_quad, dm).real # Nuclear contribution @@ -530,13 +530,15 @@ def _quad_moment(mf, mol=None, dm=None, unit='Debye-Ang'): nucl_quad = np.einsum('i,ix,iy->xy', charges, coords, coords) mol_quad = nucl_quad - el_quad - + if unit.upper() == 'DEBYE-ANG': mol_quad *= nist.AU2DEBYE * nist.BOHR return mol_quad class RHF(hf.RHF): + to_cpu = to_cpu + screen_tol = 1e-14 device = 'gpu' DIIS = diis.SCF_DIIS @@ -549,7 +551,7 @@ class RHF(hf.RHF): get_grad = patch_cpu_kernel(hf.RHF.get_grad)(_get_grad) gen_response = _gen_rhf_response quad_moment = _quad_moment - + def scf(self, dm0=None, **kwargs): cput0 = (logger.process_clock(), logger.perf_counter()) @@ -574,8 +576,10 @@ def scf(self, dm0=None, **kwargs): self._finalize() return self.e_tot kernel = pyscf_lib.alias(scf, alias_name='kernel') - + class _VHFOpt(_vhf.VHFOpt): + to_cpu = to_cpu + def __init__(self, mol, intor, prescreen='CVHFnoscreen', qcondname='CVHFsetnr_direct_scf', dmcondname=None): self.mol, self.coeff = basis_seg_contraction(mol) @@ -594,12 +598,12 @@ def build(self, cutoff=1e-13, group_size=None, diag_block_with_triu=False): l_ctrs = mol._bas[:,[gto.ANG_OF, gto.NPRIM_OF]] uniq_l_ctr, _, inv_idx, l_ctr_counts = np.unique( l_ctrs, return_index=True, return_inverse=True, return_counts=True, axis=0) - + # Limit the number of AOs in each group if group_size is not None: uniq_l_ctr, l_ctr_counts = _split_l_ctr_groups( uniq_l_ctr, l_ctr_counts, group_size) - + if mol.verbose >= logger.DEBUG: logger.debug1(mol, 'Number of shells for each [l, nctr] group') for l_ctr, n in zip(uniq_l_ctr, l_ctr_counts): @@ -616,12 +620,12 @@ def build(self, cutoff=1e-13, group_size=None, diag_block_with_triu=False): self.coeff = self.coeff[ao_idx] # Sort basis inplace mol._bas = mol._bas[sorted_idx] - + # Initialize vhfopt after reordering mol._bas _vhf.VHFOpt.__init__(self, mol, self._intor, self._prescreen, self._qcondname, self._dmcondname) self.direct_scf_tol = cutoff - + lmax = uniq_l_ctr[:,0].max() nbas_by_l = [l_ctr_counts[uniq_l_ctr[:,0]==l].sum() for l in range(lmax+1)] l_slices = np.append(0, np.cumsum(nbas_by_l)) @@ -647,7 +651,7 @@ def build(self, cutoff=1e-13, group_size=None, diag_block_with_triu=False): if uniq_l_ctr[i,0] > LMAX_ON_GPU: # no integrals with h functions should be evaluated on GPU continue - + for q0, q1 in zip(l_ctr_offsets[:i], l_ctr_offsets[1:i+1]): q_sub = q_cond[p0:p1,q0:q1] idx = np.argwhere(q_sub > cutoff) @@ -690,7 +694,7 @@ def build(self, cutoff=1e-13, group_size=None, diag_block_with_triu=False): ishs = ishs[idx] jshs = jshs[idx] s_index = s_index[idx] - + ishs += p0 jshs += p0 pair2bra.append(ishs) @@ -698,7 +702,7 @@ def build(self, cutoff=1e-13, group_size=None, diag_block_with_triu=False): bins.append(_make_bins(s_index, nbins=nbins)) bins_floor.append(bin_floor) log_qs.append(cupy.asarray(log_q[idx])) - + # TODO self.pair2bra = pair2bra self.pair2ket = pair2ket @@ -811,7 +815,7 @@ def basis_seg_contraction(mol, allow_replica=False): pmol._bas = np.asarray(np.vstack(_bas), dtype=np.int32) pmol._env = _env contr_coeff = scipy.linalg.block_diag(*contr_coeff) - + if not mol.cart: contr_coeff = contr_coeff.dot(mol.cart2sph_coeff()) return pmol, contr_coeff diff --git a/gpu4pyscf/scf/rohf.py b/gpu4pyscf/scf/rohf.py index b5e13a03..b833efbc 100644 --- a/gpu4pyscf/scf/rohf.py +++ b/gpu4pyscf/scf/rohf.py @@ -17,10 +17,12 @@ from pyscf.scf import rohf from gpu4pyscf.scf.hf import _get_jk, _eigh -from gpu4pyscf.lib.utils import patch_cpu_kernel +from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu class ROHF(rohf.ROHF): + to_cpu = to_cpu + device = 'gpu' get_jk = patch_cpu_kernel(rohf.ROHF.get_jk)(_get_jk) _eigh = patch_cpu_kernel(rohf.ROHF._eigh)(_eigh) diff --git a/gpu4pyscf/scf/uhf.py b/gpu4pyscf/scf/uhf.py index 81fdc369..956654ac 100644 --- a/gpu4pyscf/scf/uhf.py +++ b/gpu4pyscf/scf/uhf.py @@ -17,9 +17,11 @@ from pyscf.scf import uhf from gpu4pyscf.scf.hf import _get_jk, _eigh -from gpu4pyscf.lib.utils import patch_cpu_kernel +from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu class UHF(uhf.UHF): + to_cpu = to_cpu + device = 'gpu' get_jk = patch_cpu_kernel(uhf.UHF.get_jk)(_get_jk) _eigh = patch_cpu_kernel(uhf.UHF._eigh)(_eigh) From b34a5ed481cb8b222536ead24d43d0e683e56abc Mon Sep 17 00:00:00 2001 From: Qiming Sun Date: Sun, 8 Oct 2023 11:08:17 -0700 Subject: [PATCH 2/4] Add to_gpu --- gpu4pyscf/df/df.py | 4 ++- gpu4pyscf/df/df_jk.py | 3 +- gpu4pyscf/df/grad/rhf.py | 3 +- gpu4pyscf/df/grad/rks.py | 1 + gpu4pyscf/df/hessian/rhf.py | 3 +- gpu4pyscf/df/hessian/rks.py | 5 ++- gpu4pyscf/dft/gen_grid.py | 3 +- gpu4pyscf/dft/gks.py | 3 +- gpu4pyscf/dft/numint.py | 65 +++++++++++++++++++------------------ gpu4pyscf/dft/rks.py | 3 +- gpu4pyscf/dft/roks.py | 3 +- gpu4pyscf/dft/uks.py | 3 +- gpu4pyscf/grad/rhf.py | 11 ++++--- gpu4pyscf/hessian/rhf.py | 3 +- gpu4pyscf/hessian/rks.py | 3 ++ gpu4pyscf/lib/utils.py | 5 +++ gpu4pyscf/scf/ghf.py | 3 +- gpu4pyscf/scf/hf.py | 3 +- gpu4pyscf/scf/rohf.py | 3 +- gpu4pyscf/scf/uhf.py | 3 +- 20 files changed, 81 insertions(+), 52 deletions(-) diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index 2d4e8038..2315984f 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -24,7 +24,7 @@ from gpu4pyscf.df import int3c2e, df_jk from gpu4pyscf.lib import logger from gpu4pyscf import __config__ -from gpu4pyscf.lib.utils import to_cpu +from gpu4pyscf.lib.utils import to_cpu, to_gpu from cupyx import scipy MIN_BLK_SIZE = getattr(__config__, 'min_ao_blksize', 128) @@ -47,6 +47,8 @@ def to_cpu(self): obj = to_cpu(self) return obj.reset() + to_gpu = to_gpu + def build(self, direct_scf_tol=1e-14, omega=None): mol = self.mol auxmol = self.auxmol diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py index eebec753..84e370b2 100644 --- a/gpu4pyscf/df/df_jk.py +++ b/gpu4pyscf/df/df_jk.py @@ -24,7 +24,7 @@ from pyscf.scf import dhf from pyscf.df import df_jk, addons from gpu4pyscf.lib.cupy_helper import contract, solve_triangular, take_last2d, transpose_sum, load_library, get_avail_mem -from gpu4pyscf.lib.utils import to_cpu +from gpu4pyscf.lib.utils import to_cpu, to_gpu from gpu4pyscf.dft import rks, numint from gpu4pyscf.scf import hf from gpu4pyscf.df import df, int3c2e @@ -123,6 +123,7 @@ class DensityFitting(df_jk._DFHF, mf_class): ''' % mf_class to_cpu = to_cpu + to_gpu = to_gpu def __init__(self, mf, dfobj, only_dfj): self.__dict__.update(mf.__dict__) diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py index 19a78a06..164762d7 100644 --- a/gpu4pyscf/df/grad/rhf.py +++ b/gpu4pyscf/df/grad/rhf.py @@ -21,7 +21,7 @@ from pyscf import lib, scf, gto from gpu4pyscf.scf.hf import _get_jk from gpu4pyscf.df import int3c2e -from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu +from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu, to_gpu from gpu4pyscf.lib.cupy_helper import print_mem_info, solve_triangular, tag_array, unpack_tril, contract, load_library from gpu4pyscf.grad.rhf import _grad_elec from gpu4pyscf import __config__ @@ -226,6 +226,7 @@ def _get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omeg class Gradients(rhf.Gradients): to_cpu = to_cpu + to_gpu = to_gpu device = 'gpu' get_jk = patch_cpu_kernel(rhf.Gradients.get_jk)(_get_jk) diff --git a/gpu4pyscf/df/grad/rks.py b/gpu4pyscf/df/grad/rks.py index af0f6a44..1765ac86 100644 --- a/gpu4pyscf/df/grad/rks.py +++ b/gpu4pyscf/df/grad/rks.py @@ -118,6 +118,7 @@ def _get_veff(ks_grad, mol=None, dm=None): class Gradients(rks.Gradients): to_cpu = to_cpu + to_gpu = to_gpu device = 'gpu' get_jk = patch_cpu_kernel(rks.Gradients.get_jk)(_get_jk) diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index 7fba97c5..f19718c9 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -41,7 +41,7 @@ from gpu4pyscf.lib.cupy_helper import contract, tag_array, release_gpu_stack from gpu4pyscf.df import int3c2e from gpu4pyscf.lib import logger -from gpu4pyscf.lib.utils import to_cpu +from gpu4pyscf.lib.utils import to_cpu, to_gpu def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None, max_memory=4000, verbose=None): @@ -512,6 +512,7 @@ class Hessian(rhf_hess.Hessian): '''Non-relativistic restricted Hartree-Fock hessian''' to_cpu = to_cpu + to_gpu = to_gpu def __init__(self, mf): self.auxbasis_response = 1 diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py index 75de5577..3b4bf62c 100644 --- a/gpu4pyscf/df/hessian/rks.py +++ b/gpu4pyscf/df/hessian/rks.py @@ -33,7 +33,7 @@ from gpu4pyscf.hessian import rks as rks_hess from gpu4pyscf.df.hessian import rhf as df_rhf_hess from gpu4pyscf.lib import logger -from gpu4pyscf.lib.utils import to_cpu +from gpu4pyscf.lib.utils import to_cpu, to_gpu def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None, max_memory=4000, verbose=None): @@ -119,9 +119,12 @@ def __init__(self, mf): def to_cpu(self): from pyscf.df.hessian.rks import Hessian + # to_cpu returns an rhf.Hessian object obj = to_cpu(self) return obj.view(Hessian) + to_gpu = to_gpu + partial_hess_elec = partial_hess_elec make_h1 = make_h1 diff --git a/gpu4pyscf/dft/gen_grid.py b/gpu4pyscf/dft/gen_grid.py index 34c55711..feaafbd8 100644 --- a/gpu4pyscf/dft/gen_grid.py +++ b/gpu4pyscf/dft/gen_grid.py @@ -38,7 +38,7 @@ from cupyx.scipy.spatial.distance import cdist from gpu4pyscf.dft import radi from gpu4pyscf.lib.cupy_helper import load_library -from gpu4pyscf.lib.utils import to_cpu +from gpu4pyscf.lib.utils import to_cpu, to_gpu libdft = lib.load_library('libdft') libgdft = load_library('libgdft') @@ -553,6 +553,7 @@ class Grids(lib.StreamObject): cutoff = CUTOFF to_cpu = to_cpu + to_gpu = to_gpu def __init__(self, mol): self.mol = mol diff --git a/gpu4pyscf/dft/gks.py b/gpu4pyscf/dft/gks.py index 4b79a992..b432b036 100644 --- a/gpu4pyscf/dft/gks.py +++ b/gpu4pyscf/dft/gks.py @@ -18,10 +18,11 @@ from pyscf.dft import gks from gpu4pyscf.dft import numint from gpu4pyscf.scf.ghf import get_jk, _eigh -from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu +from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu, to_gpu class GKS(gks.GKS): to_cpu = to_cpu + to_gpu = to_gpu def __init__(self, mol, xc='LDA,VWN'): super().__init__(mol, xc) diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py index 996a8096..6a21fb5f 100644 --- a/gpu4pyscf/dft/numint.py +++ b/gpu4pyscf/dft/numint.py @@ -25,7 +25,7 @@ from pyscf.dft import numint from pyscf.gto.eval_gto import NBINS, CUTOFF, make_screen_index from gpu4pyscf.scf.hf import basis_seg_contraction -from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu +from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu, to_gpu from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem, load_library, add_sparse, release_gpu_stack from gpu4pyscf.dft import xc_deriv, xc_alias, libxc from gpu4pyscf import __config__ @@ -94,7 +94,7 @@ def eval_ao(ni, mol, coords, deriv=0, shls_slice=None, mol._env.ctypes.data_as(ctypes.c_void_p)) if err != 0: raise RuntimeError('CUDA Error') - + if deriv == 0: ao = ao[0] return ao @@ -106,7 +106,7 @@ def eval_rho(mol, ao, dm, non0tab=None, xctype='LDA', hermi=0, _, ngrids = ao.shape else: _, ngrids = ao[0].shape - + dm = cupy.asarray(dm) if xctype in ('LDA', 'HF'): c0 = dm.dot(ao) @@ -219,7 +219,7 @@ def eval_rho3(mol, ao, c0, mo1, non0tab=None, xctype='LDA', _, ngrids = ao[0].shape shls_slice = (0, mol.nbas) ao_loc = None #mol.ao_loc_nr() - + cpos1= mo1 if xctype == 'LDA' or xctype == 'HF': c_0 = _dot_ao_dm(mol, ao, cpos1, non0tab, shls_slice, ao_loc) @@ -271,7 +271,7 @@ def eval_rho3(mol, ao, c0, mo1, non0tab=None, xctype='LDA', def _vv10nlc(rho, coords, vvrho, vvweight, vvcoords, nlc_pars): thresh=1e-8 - + #output exc=cupy.zeros(rho[0,:].size) vxc=cupy.zeros([2,rho[0,:].size]) @@ -319,11 +319,11 @@ def _vv10nlc(rho, coords, vvrho, vvweight, vvcoords, nlc_pars): dKdR=(1./6.)*K vvcoords = cupy.asarray(vvcoords, order='F') coords = cupy.asarray(coords, order='F') - + F = cupy.empty_like(R) U = cupy.empty_like(R) W = cupy.empty_like(R) - + #for i in range(R.size): # DX=vvcoords[:,0]-coords[i,0] # DY=vvcoords[:,1]-coords[i,1] @@ -338,7 +338,7 @@ def _vv10nlc(rho, coords, vvrho, vvweight, vvcoords, nlc_pars): # U=numpy.sum(T) # W=numpy.sum(T*R2) # F*=-1.5 - + stream = cupy.cuda.get_current_stream() err = libgdft.VXC_vv10nlc(ctypes.cast(stream.ptr, ctypes.c_void_p), ctypes.cast(F.data.ptr, ctypes.c_void_p), @@ -356,7 +356,7 @@ def _vv10nlc(rho, coords, vvrho, vvweight, vvcoords, nlc_pars): if err != 0: raise RuntimeError('CUDA Error') - + #exc is multiplied by Rho later exc[threshind] = Beta+0.5*F vxc[0,threshind] = Beta+F+1.5*(U*dKdR+W*dW0dR) @@ -405,7 +405,7 @@ def _make_rho(ao_value, dm, xctype=None): opt.l_bas_offsets) else: pair2shls_full, pairs_locs_full = pair2shls, pairs_locs - + release_gpu_stack() if xctype == 'LDA': ao_deriv = 0 @@ -489,17 +489,17 @@ def _make_rho(ao_value, dm, xctype=None): excsum[i] += cupy.dot(den, exc)[0] t1 = log.timer_debug1('integration', *t1) ao = None - + vmat = contract('pi,npq->niq', coeff, vmat) vmat = contract('qj,niq->nij', coeff, vmat) if xctype != 'LDA': #transpose_sum(vmat) vmat = vmat + vmat.transpose([0,2,1]) - + if FREE_CUPY_CACHE: dms = None cupy.get_default_memory_pool().free_all_blocks() - + if len(dm_shape) == 2: nelec = nelec[0] excsum = excsum[0] @@ -625,7 +625,7 @@ def get_rho(ni, mol, dm, grids, max_memory=2000): #logger.debug1(mol, 'Available GPU mem %f Mb, block_size %d', mem_avail/1e6, block_size) if block_size < ALIGNED: raise RuntimeError('Not enough GPU memory') - + ngrids = grids.weights.size rho = cupy.empty(ngrids) for p0, p1 in lib.prange(0, ngrids, block_size): @@ -714,10 +714,10 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= if FREE_CUPY_CACHE: dms = None cupy.get_default_memory_pool().free_all_blocks() - + if len(dm_shape) == 2: vmat = vmat[0] - + return cupy.asarray(vmat) @patch_cpu_kernel(numint.nr_rks_fxc_st) @@ -756,7 +756,7 @@ def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= nset = len(dma) vmata = cupy.zeros((nset, nao, nao)) vmatb = cupy.zeros((nset, nao, nao)) - + with opt.gdft_envs_cache(): mem_avail = cupy.cuda.runtime.memGetInfo()[0] if xctype == 'LDA': @@ -910,12 +910,12 @@ def cache_xc_kernel(ni, mol, grids, xc_code, mo_coeff, mo_occ, spin=0, if opt is None: ni.build(mol, grids.coords) opt = ni.gdftopt - + coeff = cupy.asarray(opt.coeff) ngrids = grids.weights.size comp = (ao_deriv+1)*(ao_deriv+2)*(ao_deriv+3)//6 nao = coeff.shape[0] - + def make_rdm1(mo_coeff, mo_occ): orbo = coeff.dot(mo_coeff[:,mo_occ>0]) dm = (orbo*mo_occ[mo_occ>0]).dot(orbo.T) @@ -927,7 +927,7 @@ def make_rdm1(mo_coeff, mo_occ): logger.debug1(mol, 'Available GPU mem %f Mb, block_size %d', mem_avail/1e6, block_size) if block_size < ALIGNED: raise RuntimeError('Not enough GPU memory') - + if spin == 0: dm = make_rdm1(mo_coeff, mo_occ) rho = [] @@ -951,7 +951,7 @@ def make_rdm1(mo_coeff, mo_occ): if FREE_CUPY_CACHE: dm = dma = dmb = None cupy.get_default_memory_pool().free_all_blocks() - + vxc, fxc = ni.eval_xc_eff(xc_code, rho, deriv=2, xctype=xctype)[1:3] return rho, vxc, fxc @@ -980,11 +980,11 @@ def eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None, verbose=None inp['rho'] = rho[0] inp['sigma'] = rho[1]*rho[1] + rho[2]*rho[2] + rho[3]*rho[3] inp['tau'] = rho[-1] # can be 4 (without laplacian) or 5 (with laplacian) - - do_vxc = True - do_fxc = deriv > 1 + + do_vxc = True + do_fxc = deriv > 1 do_kxc = deriv > 2 - + vxc_labels = ["vrho", "vsigma", "vlapl", "vtau"] fxc_labels = ["v2rho2", "v2rhosigma", "v2sigma2", "v2lapl2", "v2tau2", \ "v2rholapl", "v2rhotau", "v2lapltau", "v2sigmalapl", "v2sigmatau"] @@ -1004,7 +1004,7 @@ def eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None, verbose=None else: ret_full[label] = xc_res[label] * w vxc = None - fxc = None + fxc = None kxc = None exc = ret_full["zk"] @@ -1027,7 +1027,7 @@ def _init_xcfuns(xc_code): xc_names = [('HYB_GGA_XC_B3LYP',1)] else: xc_names = dft.libxc.parse_xc(xc_upper)[1:][0] - + xcfuns = [] for xc, w in xc_names: xcfun = libxc.XCfun(xc, 'unpolarized') @@ -1063,7 +1063,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, if opt is None: ni.build(mol, grids.coords) opt = ni.gdftopt - + mol = opt.mol with opt.gdft_envs_cache(): for ip0, ip1 in lib.prange(0, ngrids, blksize): @@ -1075,6 +1075,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, class NumInt(numint.NumInt): to_cpu = to_cpu + to_gpu = to_gpu device = 'gpu' @@ -1218,7 +1219,7 @@ def _dot_ao_ao_sparse(bra, ket, wv, nbins, screen_index, ao_loc, bas_pair2shls.ctypes.data_as(ctypes.c_void_p), screen_index.ctypes.data_as(ctypes.c_void_p), ao_loc.ctypes.data_as(ctypes.c_void_p)) - + if err != 0: raise RuntimeError('CUDA Error') return out @@ -1247,7 +1248,7 @@ def _scale_ao(ao, wv, out=None): return contract('nip,np->ip', ao, wv) nvar, nao, ngrids = ao.shape assert wv.shape == (nvar, ngrids) - + wv = cupy.asarray(wv, order='C') if out is None: out = cupy.empty((nao, ngrids), order='C') @@ -1274,7 +1275,7 @@ class _GDFTOpt: def __init__(self, mol): self.envs_cache = ctypes.POINTER(_GDFTEnvsCache)() self._mol = mol - + def build(self, mol=None): if mol is None: mol = self._mol @@ -1323,7 +1324,7 @@ def build(self, mol=None): if inv_idx_padding: inv_idx = np.append(inv_idx, inv_idx_padding) pmol._bas = np.vstack([pmol._bas, pmol._bas[bas_to_pad]]) - + ao_loc = pmol.ao_loc_nr() nao = ao_loc[-1] sorted_idx = np.argsort(inv_idx) diff --git a/gpu4pyscf/dft/rks.py b/gpu4pyscf/dft/rks.py index 69e83d24..ea995601 100644 --- a/gpu4pyscf/dft/rks.py +++ b/gpu4pyscf/dft/rks.py @@ -27,7 +27,7 @@ from gpu4pyscf.scf import diis from gpu4pyscf.lib import logger from gpu4pyscf.dft import numint, gen_grid -from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu +from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu, to_gpu from gpu4pyscf.lib.cupy_helper import load_library, tag_array libcupy_helper = load_library('libcupy_helper') @@ -205,6 +205,7 @@ def _get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1): class RKS(rks.RKS, scf.hf.RHF): to_cpu = to_cpu + to_gpu = to_gpu def __init__(self, mol, xc='LDA,VWN', disp=None): super().__init__(mol, xc) diff --git a/gpu4pyscf/dft/roks.py b/gpu4pyscf/dft/roks.py index 495eca39..cb83d395 100644 --- a/gpu4pyscf/dft/roks.py +++ b/gpu4pyscf/dft/roks.py @@ -18,10 +18,11 @@ from pyscf.dft import roks from gpu4pyscf.dft import numint from gpu4pyscf.scf.hf import _get_jk, _eigh -from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu +from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu, to_gpu class ROKS(roks.ROKS): to_cpu = to_cpu + to_gpu = to_gpu def __init__(self, mol, xc='LDA,VWN'): super().__init__(mol, xc) diff --git a/gpu4pyscf/dft/uks.py b/gpu4pyscf/dft/uks.py index d5c22843..1f7775be 100644 --- a/gpu4pyscf/dft/uks.py +++ b/gpu4pyscf/dft/uks.py @@ -18,10 +18,11 @@ from pyscf.dft import uks from gpu4pyscf.dft import numint from gpu4pyscf.scf.hf import _get_jk, _eigh -from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu +from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu, to_gpu class UKS(uks.UKS): to_cpu = to_cpu + to_gpu = to_gpu def __init__(self, mol, xc='LDA,VWN'): super().__init__(mol, xc) diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py index af1190de..65b35162 100644 --- a/gpu4pyscf/grad/rhf.py +++ b/gpu4pyscf/grad/rhf.py @@ -23,7 +23,7 @@ from pyscf.grad import rhf from gpu4pyscf.lib.cupy_helper import load_library from gpu4pyscf.scf.hf import _VHFOpt -from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu +from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu, to_gpu from gpu4pyscf.lib.cupy_helper import tag_array from gpu4pyscf.df import int3c2e #TODO: move int3c2e to out of df @@ -486,14 +486,14 @@ def _grad_elec(mf_grad, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None) if mo_occ is None: mo_occ = mf.mo_occ if mo_coeff is None: mo_coeff = mf.mo_coeff log = logger.Logger(mf_grad.stdout, mf_grad.verbose) - + mo_energy = cupy.asarray(mo_energy) mo_occ = cupy.asarray(mo_occ) mo_coeff = cupy.asarray(mo_coeff) dm0 = mf.make_rdm1(mo_coeff, mo_occ) dme0 = mf_grad.make_rdm1e(mo_energy, mo_coeff, mo_occ) - + # CPU tasks are executed on background def calculate_h1e(h1_gpu, s1_gpu): # (\nabla i | hcore | j) - (\nabla i | j) @@ -501,7 +501,7 @@ def calculate_h1e(h1_gpu, s1_gpu): s1_cpu = mf_grad.get_ovlp(mol) h1_gpu[:] = cupy.asarray(h1_cpu) s1_gpu[:] = cupy.asarray(s1_cpu) - return + return h1 = cupy.empty([3, dm0.shape[0], dm0.shape[1]]) s1 = cupy.empty([3, dm0.shape[0], dm0.shape[1]]) @@ -542,13 +542,14 @@ def calculate_h1e(h1_gpu, s1_gpu): if log.verbose >= logger.DEBUG: log.timer_debug1('gradients of electronic part', *t0) - + # net force should be zero de -= cupy.sum(de, axis=0)/len(atmlst) return de.get() class Gradients(rhf.Gradients): to_cpu = to_cpu + to_gpu = to_gpu device = 'gpu' grad_elec = patch_cpu_kernel(rhf.Gradients.grad_elec)(_grad_elec) diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index db22b716..5c2eb19a 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -38,7 +38,7 @@ from gpu4pyscf.scf import cphf from gpu4pyscf.lib.cupy_helper import contract, tag_array from gpu4pyscf.lib import logger -from gpu4pyscf.lib.utils import to_cpu +from gpu4pyscf.lib.utils import to_cpu, to_gpu def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, mo1=None, mo_e1=None, h1ao=None, @@ -474,6 +474,7 @@ class Hessian(rhf_hess.Hessian): '''Non-relativistic restricted Hartree-Fock hessian''' to_cpu = to_cpu + to_gpu = to_gpu def __init__(self, scf_method): self.verbose = scf_method.verbose diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py index ab5cfec6..ae0f3148 100644 --- a/gpu4pyscf/hessian/rks.py +++ b/gpu4pyscf/hessian/rks.py @@ -614,9 +614,12 @@ def __init__(self, mf): def to_cpu(self): from pyscf.hessian.rks import Hessian + # to_cpu returns an rhf.Hessian object obj = to_cpu(self) return obj.view(Hessian) + to_gpu = to_gpu + def get_dispersion(self): if self.base.disp[:2].upper() == 'D3': from pyscf import lib diff --git a/gpu4pyscf/lib/utils.py b/gpu4pyscf/lib/utils.py index 982a824d..b5ec3659 100644 --- a/gpu4pyscf/lib/utils.py +++ b/gpu4pyscf/lib/utils.py @@ -49,3 +49,8 @@ def to_cpu(method): elif hasattr(val, 'to_cpu'): setattr(method, key, val.to_cpu()) return method + +def identity(x): + return x + +to_gpu = identity diff --git a/gpu4pyscf/scf/ghf.py b/gpu4pyscf/scf/ghf.py index 4ce11c2c..03edafe1 100644 --- a/gpu4pyscf/scf/ghf.py +++ b/gpu4pyscf/scf/ghf.py @@ -18,7 +18,7 @@ from pyscf.scf import ghf from gpu4pyscf.scf.hf import get_jk as _get_jk_nr from gpu4pyscf.scf.hf import _eigh -from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu +from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu, to_gpu def get_jk(mol=None, dm=None, hermi=0, with_j=True, with_k=True, omega=None): @@ -38,6 +38,7 @@ def jkbuild(mol, dm, hermi, with_j, with_k, omega=None): class GHF(ghf.GHF): to_cpu = to_cpu + to_gpu = to_gpu device = 'gpu' diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py index 61956ead..4d387d2d 100644 --- a/gpu4pyscf/scf/hf.py +++ b/gpu4pyscf/scf/hf.py @@ -28,7 +28,7 @@ from pyscf.lib import logger from pyscf.scf import hf, jk, _vhf from gpu4pyscf import lib -from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu +from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu, to_gpu from gpu4pyscf.lib.cupy_helper import eigh, load_library, tag_array from gpu4pyscf.scf import diis @@ -538,6 +538,7 @@ def _quad_moment(mf, mol=None, dm=None, unit='Debye-Ang'): class RHF(hf.RHF): to_cpu = to_cpu + to_gpu = to_gpu screen_tol = 1e-14 device = 'gpu' diff --git a/gpu4pyscf/scf/rohf.py b/gpu4pyscf/scf/rohf.py index b833efbc..5ac284e8 100644 --- a/gpu4pyscf/scf/rohf.py +++ b/gpu4pyscf/scf/rohf.py @@ -17,11 +17,12 @@ from pyscf.scf import rohf from gpu4pyscf.scf.hf import _get_jk, _eigh -from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu +from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu, to_gpu class ROHF(rohf.ROHF): to_cpu = to_cpu + to_gpu = to_gpu device = 'gpu' get_jk = patch_cpu_kernel(rohf.ROHF.get_jk)(_get_jk) diff --git a/gpu4pyscf/scf/uhf.py b/gpu4pyscf/scf/uhf.py index 956654ac..633b163b 100644 --- a/gpu4pyscf/scf/uhf.py +++ b/gpu4pyscf/scf/uhf.py @@ -17,10 +17,11 @@ from pyscf.scf import uhf from gpu4pyscf.scf.hf import _get_jk, _eigh -from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu +from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu, to_gpu class UHF(uhf.UHF): to_cpu = to_cpu + to_gpu = to_gpu device = 'gpu' get_jk = patch_cpu_kernel(uhf.UHF.get_jk)(_get_jk) From 6fd6da3f46b2770bac2397219267c5c330f05b8b Mon Sep 17 00:00:00 2001 From: Qiming Sun Date: Sun, 8 Oct 2023 16:55:56 -0700 Subject: [PATCH 3/4] Add linter --- .flake8 | 29 +++ .github/workflows/lint.yml | 18 ++ gpu4pyscf/__init__.py | 2 +- gpu4pyscf/df/df.py | 32 ++- gpu4pyscf/df/df_jk.py | 9 +- gpu4pyscf/df/grad/rhf.py | 20 +- gpu4pyscf/df/grad/rks.py | 14 +- gpu4pyscf/df/hessian/rhf.py | 7 +- gpu4pyscf/df/hessian/rks.py | 5 +- gpu4pyscf/df/int3c2e.py | 270 +++++++++++++------------- gpu4pyscf/df/patch_pyscf.py | 5 +- gpu4pyscf/df/tests/test_df_ecp.py | 6 +- gpu4pyscf/df/tests/test_df_grad.py | 5 +- gpu4pyscf/df/tests/test_df_scf.py | 1 - gpu4pyscf/df/tests/test_geomopt.py | 1 - gpu4pyscf/dft/gen_grid.py | 19 +- gpu4pyscf/dft/gks.py | 21 +- gpu4pyscf/dft/libxc.py | 2 +- gpu4pyscf/dft/numint.py | 34 ++-- gpu4pyscf/dft/patch_pyscf.py | 9 - gpu4pyscf/dft/rks.py | 21 +- gpu4pyscf/dft/roks.py | 17 +- gpu4pyscf/dft/tests/test_ao_values.py | 2 +- gpu4pyscf/dft/tests/test_dft.py | 1 - gpu4pyscf/dft/tests/test_dft_ecp.py | 1 - gpu4pyscf/dft/tests/test_numint.py | 35 ++-- gpu4pyscf/dft/uks.py | 17 +- gpu4pyscf/grad/patch_pyscf.py | 3 +- gpu4pyscf/grad/rhf.py | 19 +- gpu4pyscf/grad/rks.py | 13 +- gpu4pyscf/hessian/rhf.py | 5 +- gpu4pyscf/hessian/rks.py | 6 +- gpu4pyscf/lib/cupy_helper.py | 19 +- gpu4pyscf/lib/cutensor.py | 7 +- gpu4pyscf/lib/logger.py | 3 +- gpu4pyscf/lib/utils.py | 7 + gpu4pyscf/patch_pyscf.py | 8 +- gpu4pyscf/scf/ghf.py | 44 ++--- gpu4pyscf/scf/hf.py | 63 +++--- gpu4pyscf/scf/int4c2e.py | 21 +- gpu4pyscf/scf/patch_pyscf.py | 11 +- gpu4pyscf/scf/rohf.py | 11 +- gpu4pyscf/scf/tests/test_rhf.py | 85 ++++---- gpu4pyscf/scf/uhf.py | 11 +- 44 files changed, 435 insertions(+), 504 deletions(-) create mode 100644 .flake8 create mode 100644 .github/workflows/lint.yml diff --git a/.flake8 b/.flake8 new file mode 100644 index 00000000..394ff69b --- /dev/null +++ b/.flake8 @@ -0,0 +1,29 @@ +[flake8] +# https://flake8.pycqa.org/en/2.5.5/warnings.html#error-codes +ignore = + # Indentation: + E126, E127, E128, E129, + # Whitespaces: + E201, E202, E203, E211, E221, E222, E225, E226, E228, E231, E241, E251, E271, + # Comments: + E261, E262, E265, E266, + # Blank lines: + E301, E302, E303, E305, E306, + # Imports: + E401, E402, + # Other: + E701, E731, E741, E275, + F401, C901, W391, W503, W504, W291, W292, W293 + +exclude = test, tests, .git, __pycache__, build, dist, __init__.py .eggs, *.egg +max-line-length = 160 + +per-file-ignores = + pyscf/dft/libxc.py: E122,E501 + pyscf/dft/xcfun.py: E122,E501 + pyscf/grad/sacasscf.py: E501 + pyscf/lo/ibo.py: E501 + pyscf/pbc/cc/kccsd_t.py: E501 + pyscf/pbc/mpicc/kccsd_rhf.py: E501 + pyscf/pbc/mpicc/kintermediates_rhf.py: E501 + pyscf/pbc/tools/pywannier90.py: E501 diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 00000000..70a0e536 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,18 @@ +name: Lint + +on: [push, pull_request] + +jobs: + flake: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.9 + - name: Install flake8 + run: pip install "flake8>=3.7.0" + - name: Static analysis + run: flake8 --config .flake8 gpu4pyscf diff --git a/gpu4pyscf/__init__.py b/gpu4pyscf/__init__.py index 45869b62..ef7eb44d 100644 --- a/gpu4pyscf/__init__.py +++ b/gpu4pyscf/__init__.py @@ -1 +1 @@ -__version__ = '0.5.2' +__version__ = '0.6.0' diff --git a/gpu4pyscf/df/df.py b/gpu4pyscf/df/df.py index 2315984f..e24f7406 100644 --- a/gpu4pyscf/df/df.py +++ b/gpu4pyscf/df/df.py @@ -18,13 +18,13 @@ import cupy import ctypes import numpy as np -from pyscf import lib, __config__ +from pyscf import lib from pyscf.df import df, addons -from gpu4pyscf.lib.cupy_helper import * +from gpu4pyscf.lib.cupy_helper import ( + cholesky, tag_array, get_avail_mem, cart2sph, solve_triangular) from gpu4pyscf.df import int3c2e, df_jk from gpu4pyscf.lib import logger from gpu4pyscf import __config__ -from gpu4pyscf.lib.utils import to_cpu, to_gpu from cupyx import scipy MIN_BLK_SIZE = getattr(__config__, 'min_ao_blksize', 128) @@ -32,7 +32,7 @@ LINEAR_DEP_TOL = 1e-7 class DF(df.DF): - device = 'gpu' + from gpu4pyscf.lib.utils import to_gpu, device def __init__(self, mol, auxbasis=None): super().__init__(mol, auxbasis) @@ -44,11 +44,10 @@ def __init__(self, mol, auxbasis=None): self._cderi = None def to_cpu(self): + from gpu4pyscf.lib.utils import to_cpu obj = to_cpu(self) return obj.reset() - to_gpu = to_gpu - def build(self, direct_scf_tol=1e-14, omega=None): mol = self.mol auxmol = self.auxmol @@ -80,13 +79,13 @@ def build(self, direct_scf_tol=1e-14, omega=None): t0 = log.timer_debug1('2c2e', *t0) intopt = int3c2e.VHFOpt(mol, auxmol, 'int2e') intopt.build(direct_scf_tol, diag_block_with_triu=False, aosym=True, group_size=256) - t1 = log.timer_debug1('prepare intopt', *t0) + log.timer_debug1('prepare intopt', *t0) self.j2c = j2c.copy() j2c = j2c[cupy.ix_(intopt.sph_aux_idx, intopt.sph_aux_idx)] try: self.cd_low = cholesky(j2c) self.cd_low = tag_array(self.cd_low, tag='cd') - except: + except Exception: w, v = cupy.linalg.eigh(j2c) idx = w > LINEAR_DEP_TOL self.cd_low = (v[:,idx] / cupy.sqrt(w[idx])) @@ -175,12 +174,12 @@ def reset(self, mol=None): ''' reset object for scanner ''' - if mol is not None: - self.mol = mol - self.auxmol = None - self._cderi = None - self._rsh_df = {} + super().reset(mol) self.intopt = None + self.nao = None + self.naux = None + self.cd_low = None + self._cderi = None return self def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False): @@ -188,11 +187,9 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False): Returns: 2D array of (naux,nao*(nao+1)/2) in C-contiguous ''' - nao = mol.nao naoaux, naux = cd_low.shape npair = len(intopt.cderi_row) log = logger.new_logger(mol, mol.verbose) - t0 = (logger.process_clock(), logger.perf_counter()) nq = len(intopt.log_qs) # if the matrix exceeds the limit, store CDERI in CPU memory @@ -201,7 +198,7 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False): if naux * npair * 8 < 0.4 * avail_mem: try: cderi = cupy.empty([naux, npair], order='C') - except: + except Exception: use_gpu_memory = False else: use_gpu_memory = False @@ -224,7 +221,8 @@ def cholesky_eri_gpu(intopt, mol, auxmol, cd_low, omega=None, sr_only=False): lj = intopt.angular[cpj] i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1] j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1] - ni = i1 - i0; nj = j1 - j0 + ni = i1 - i0 + nj = j1 - j0 if sr_only: # TODO: in-place implementation or short-range kernel ints_slices = cupy.zeros([naoaux, nj, ni], order='C') diff --git a/gpu4pyscf/df/df_jk.py b/gpu4pyscf/df/df_jk.py index 84e370b2..e6d40994 100644 --- a/gpu4pyscf/df/df_jk.py +++ b/gpu4pyscf/df/df_jk.py @@ -24,7 +24,6 @@ from pyscf.scf import dhf from pyscf.df import df_jk, addons from gpu4pyscf.lib.cupy_helper import contract, solve_triangular, take_last2d, transpose_sum, load_library, get_avail_mem -from gpu4pyscf.lib.utils import to_cpu, to_gpu from gpu4pyscf.dft import rks, numint from gpu4pyscf.scf import hf from gpu4pyscf.df import df, int3c2e @@ -122,8 +121,7 @@ class DensityFitting(df_jk._DFHF, mf_class): See also the documents of class %s for other SCF attributes. ''' % mf_class - to_cpu = to_cpu - to_gpu = to_gpu + from gpu4pyscf.lib.utils import to_cpu, to_gpu, device def __init__(self, mf, dfobj, only_dfj): self.__dict__.update(mf.__dict__) @@ -265,8 +263,9 @@ def get_jk(dfobj, dms_tag, hermi=1, with_j=True, with_k=True, direct_scf_tol=1e- log.warn('CDERI not found, build...') dfobj.build(direct_scf_tol=direct_scf_tol, omega=omega) - nao, naux = dfobj.nao, dfobj.naux - vj = None; vk = None + assert nao == dfobj.nao + vj = None + vk = None ao_idx = dfobj.intopt.sph_ao_idx dms = take_last2d(dms, ao_idx) diff --git a/gpu4pyscf/df/grad/rhf.py b/gpu4pyscf/df/grad/rhf.py index 164762d7..c0d39b9a 100644 --- a/gpu4pyscf/df/grad/rhf.py +++ b/gpu4pyscf/df/grad/rhf.py @@ -19,11 +19,9 @@ from pyscf.df.grad import rhf from pyscf.lib import logger from pyscf import lib, scf, gto -from gpu4pyscf.scf.hf import _get_jk from gpu4pyscf.df import int3c2e -from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu, to_gpu from gpu4pyscf.lib.cupy_helper import print_mem_info, solve_triangular, tag_array, unpack_tril, contract, load_library -from gpu4pyscf.grad.rhf import _grad_elec +from gpu4pyscf.grad.rhf import grad_elec from gpu4pyscf import __config__ libcupy_helper = load_library('libcupy_helper') @@ -31,7 +29,7 @@ MIN_BLK_SIZE = getattr(__config__, 'min_ao_blksize', 128) ALIGNED = getattr(__config__, 'ao_aligned', 64) -def _get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega=None): +def get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omega=None): if mol is None: mol = mf_grad.mol #TODO: dm has to be the SCF density matrix in this version. dm should be # extended to any 1-particle density matrix @@ -52,7 +50,7 @@ def _get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omeg auxmol = with_df.auxmol intopt = with_df.intopt - nao, naux = mol.nao, with_df.naux + naux = with_df.naux log = logger.new_logger(mol, mol.verbose) t0 = (logger.process_clock(), logger.perf_counter()) @@ -134,8 +132,8 @@ def _get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omeg block_size = with_df.get_blksize(nao=nao_cart) intopt.clear() # rebuild with aosym - intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, \ - group_size_aux=block_size)#, group_size=block_size) + intopt.build(mf.direct_scf_tol, diag_block_with_triu=True, aosym=False, + group_size_aux=block_size)#, group_size=block_size) # sph2cart for ao cart2sph = intopt.cart2sph @@ -225,12 +223,10 @@ def _get_jk(mf_grad, mol=None, dm0=None, hermi=0, with_j=True, with_k=True, omeg class Gradients(rhf.Gradients): - to_cpu = to_cpu - to_gpu = to_gpu + from gpu4pyscf.lib.utils import to_cpu, to_gpu, device - device = 'gpu' - get_jk = patch_cpu_kernel(rhf.Gradients.get_jk)(_get_jk) - grad_elec = patch_cpu_kernel(rhf.Gradients.grad_elec)(_grad_elec) + get_jk = get_jk + grad_elec = grad_elec def get_j(self, mol=None, dm=None, hermi=0): vj, _, vjaux, _ = self.get_jk(mol, dm, with_k=False) diff --git a/gpu4pyscf/df/grad/rks.py b/gpu4pyscf/df/grad/rks.py index 1765ac86..1ccf6ca6 100644 --- a/gpu4pyscf/df/grad/rks.py +++ b/gpu4pyscf/df/grad/rks.py @@ -19,11 +19,11 @@ from pyscf import lib from pyscf.df.grad import rks from gpu4pyscf.grad import rks as rks_grad -from gpu4pyscf.df.grad.rhf import _get_jk, _grad_elec +from gpu4pyscf.df.grad.rhf import get_jk, grad_elec from gpu4pyscf.lib.cupy_helper import contract, tag_array from gpu4pyscf.lib import logger -def _get_veff(ks_grad, mol=None, dm=None): +def get_veff(ks_grad, mol=None, dm=None): '''Coulomb + XC functional ''' @@ -117,13 +117,11 @@ def _get_veff(ks_grad, mol=None, dm=None): return vxc class Gradients(rks.Gradients): - to_cpu = to_cpu - to_gpu = to_gpu + from gpu4pyscf.lib.utils import to_cpu, to_gpu, device - device = 'gpu' - get_jk = patch_cpu_kernel(rks.Gradients.get_jk)(_get_jk) - get_veff = patch_cpu_kernel(rks.Gradients.get_veff)(_get_veff) - grad_elec = patch_cpu_kernel(rks.Gradients.grad_elec)(_grad_elec) + get_jk = get_jk + get_veff = get_veff + grad_elec = grad_elec def get_j(self, mol=None, dm=None, hermi=0, omega=None): vj, _, vjaux, _ = self.get_jk(mol, dm, with_k=False, omega=omega) diff --git a/gpu4pyscf/df/hessian/rhf.py b/gpu4pyscf/df/hessian/rhf.py index f19718c9..335f01ca 100644 --- a/gpu4pyscf/df/hessian/rhf.py +++ b/gpu4pyscf/df/hessian/rhf.py @@ -41,7 +41,6 @@ from gpu4pyscf.lib.cupy_helper import contract, tag_array, release_gpu_stack from gpu4pyscf.df import int3c2e from gpu4pyscf.lib import logger -from gpu4pyscf.lib.utils import to_cpu, to_gpu def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None, max_memory=4000, verbose=None): @@ -389,10 +388,8 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, aoslices = mol.aoslice_by_atom() auxslices = auxmol.aoslice_by_atom() - naux = auxmol.nao nao, nmo = mo_coeff.shape mocc = mo_coeff[:,mo_occ>0] - nocc = mocc.shape[1] dm0 = cupy.dot(mocc, mocc.T) * 2 if omega and omega > 1e-10: @@ -407,7 +404,6 @@ def _gen_jk(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, sph_ao_idx = intopt.sph_ao_idx sph_aux_idx = intopt.sph_aux_idx rev_ao_idx = np.argsort(intopt.sph_ao_idx) - rev_aux_idx = np.argsort(intopt.sph_aux_idx) mocc = mocc[sph_ao_idx, :] mo_coeff = mo_coeff[sph_ao_idx,:] @@ -511,8 +507,7 @@ def _ao2mo(mat): class Hessian(rhf_hess.Hessian): '''Non-relativistic restricted Hartree-Fock hessian''' - to_cpu = to_cpu - to_gpu = to_gpu + from gpu4pyscf.lib.utils import to_cpu, to_gpu, device def __init__(self, mf): self.auxbasis_response = 1 diff --git a/gpu4pyscf/df/hessian/rks.py b/gpu4pyscf/df/hessian/rks.py index 3b4bf62c..a90b93e7 100644 --- a/gpu4pyscf/df/hessian/rks.py +++ b/gpu4pyscf/df/hessian/rks.py @@ -33,7 +33,6 @@ from gpu4pyscf.hessian import rks as rks_hess from gpu4pyscf.df.hessian import rhf as df_rhf_hess from gpu4pyscf.lib import logger -from gpu4pyscf.lib.utils import to_cpu, to_gpu def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None, max_memory=4000, verbose=None): @@ -112,19 +111,19 @@ def make_h1(hessobj, mo_coeff, mo_occ, chkfile=None, atmlst=None, verbose=None): class Hessian(rks_hess.Hessian): '''Non-relativistic RKS hessian''' + from gpu4pyscf.lib.utils import to_gpu, device def __init__(self, mf): self.auxbasis_response = 1 rks_hess.Hessian.__init__(self, mf) def to_cpu(self): + from gpu4pyscf.lib.utils import to_cpu from pyscf.df.hessian.rks import Hessian # to_cpu returns an rhf.Hessian object obj = to_cpu(self) return obj.view(Hessian) - to_gpu = to_gpu - partial_hess_elec = partial_hess_elec make_h1 = make_h1 diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index ecc15e96..9a8d5984 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -43,7 +43,7 @@ def basis_seg_contraction(mol, allow_replica=False): bas_templates = {} _bas = [] _env = mol._env.copy() - + aoslices = mol.aoslice_by_atom() for ia, (ib0, ib1) in enumerate(aoslices[:,:2]): key = tuple(mol._bas[ib0:ib1,gto.PTR_EXP]) @@ -60,7 +60,7 @@ def basis_seg_contraction(mol, allow_replica=False): if nctr == 1: bas_of_ia.append(shell) continue - + # Only basis with nctr > 1 needs to be decontracted nprim = shell[gto.NPRIM_OF] pcoeff = shell[gto.PTR_COEFF] @@ -99,19 +99,19 @@ def make_fake_mol(): fakemol = gto.mole.Mole() fakemol._atm = np.zeros((1,gto.ATM_SLOTS), dtype=np.int32) fakemol._atm[0][[0,1,2,3]] = np.array([2,20,1,23]) - + ptr = gto.mole.PTR_ENV_START fakemol._bas = np.zeros((1,gto.BAS_SLOTS), dtype=np.int32) fakemol._bas[0,gto.NPRIM_OF ] = 1 fakemol._bas[0,gto.NCTR_OF ] = 1 fakemol._bas[0,gto.PTR_EXP ] = ptr+4 fakemol._bas[0,gto.PTR_COEFF] = ptr+5 - + fakemol._env = np.zeros(ptr+6) ptr_coeff = fakemol._bas[0,gto.PTR_COEFF] ptr_exp = fakemol._bas[0,gto.PTR_EXP] ''' - due to the common factor of normalization + due to the common factor of normalization https://github.com/sunqm/libcint/blob/be610546b935049d0cf65c1099244d45b2ff4e5e/src/g1e.c ''' fakemol._env[ptr_coeff] = 1.0/0.282094791773878143 @@ -142,7 +142,7 @@ def __init__(self, mol, auxmol, intor, prescreen='CVHFnoscreen', self.sorted_auxmol = None self.sorted_mol = None - + self.cart_ao_idx = None self.sph_ao_idx = None self.cart_aux_idx = None @@ -153,7 +153,7 @@ def __init__(self, mol, auxmol, intor, prescreen='CVHFnoscreen', self.sph_ao_loc = [] self.sph_aux_loc = [] - self.cart2sph = None + self.cart2sph = None self.aux_cart2sph = None self.angular = None @@ -175,9 +175,9 @@ def __del__(self): self.clear() except AttributeError: pass - - def build(self, cutoff=1e-14, group_size=None, \ - group_size_aux=None, diag_block_with_triu=False, aosym=False): + + def build(self, cutoff=1e-14, group_size=None, + group_size_aux=None, diag_block_with_triu=False, aosym=False): ''' int3c2e is based on int2e with (ao,ao|aux,1) a tot_mol is created with concatenating [mol, fake_mol, aux_mol] @@ -188,11 +188,11 @@ def build(self, cutoff=1e-14, group_size=None, \ if group_size is not None : uniq_l_ctr, l_ctr_counts = _split_l_ctr_groups(uniq_l_ctr, l_ctr_counts, group_size) self.sorted_mol = sorted_mol - + # sort fake mol fake_mol = make_fake_mol() _, _, fake_uniq_l_ctr, fake_l_ctr_counts = sort_mol(fake_mol) - + # sort auxiliary mol sorted_auxmol, sorted_aux_idx, aux_uniq_l_ctr, aux_l_ctr_counts = sort_mol(self.auxmol) if group_size_aux is not None: @@ -200,12 +200,12 @@ def build(self, cutoff=1e-14, group_size=None, \ self.sorted_auxmol = sorted_auxmol tmp_mol = gto.mole.conc_mol(fake_mol, sorted_auxmol) tot_mol = gto.mole.conc_mol(sorted_mol, tmp_mol) - + # Initialize vhfopt after reordering mol._bas _vhf.VHFOpt.__init__(self, sorted_mol, self._intor, self._prescreen, self._qcondname, self._dmcondname) self.direct_scf_tol = cutoff - + # TODO: is it more accurate to filter with overlap_cond (or exp_cond)? q_cond = self.get_q_cond() cput1 = logger.timer_debug1(sorted_mol, 'Initialize q_cond', *cput0) @@ -237,7 +237,7 @@ def build(self, cutoff=1e-14, group_size=None, \ self.cart2sph = block_c2s_diag(ncart, nsph, self.angular, l_ctr_counts) inv_idx = np.argsort(self.sph_ao_idx, kind='stable').astype(np.int32) self.coeff = self.cart2sph[:, inv_idx] - + # pairing auxiliary basis with fake basis set fake_l_ctr_offsets = np.append(0, np.cumsum(fake_l_ctr_counts)) fake_l_ctr_offsets += l_ctr_offsets[-1] @@ -255,7 +255,7 @@ def build(self, cutoff=1e-14, group_size=None, \ naux = sph_aux_loc[-1] ao_idx = np.array_split(np.arange(naux), sph_aux_loc[1:-1]) self.sph_aux_idx = np.hstack([ao_idx[i] for i in sorted_aux_idx]) - + # cartesian aux index naux = cart_aux_loc[-1] ao_idx = np.array_split(np.arange(naux), cart_aux_loc[1:-1]) @@ -282,23 +282,23 @@ def build(self, cutoff=1e-14, group_size=None, \ aux_pair2bra.append(np.arange(p0,p1)) aux_pair2ket.append(fake_l_ctr_offsets[0] * np.ones(p1-p0)) aux_log_qs.append(np.ones(p1-p0)) - + self.aux_log_qs = aux_log_qs.copy() pair2bra += aux_pair2bra pair2ket += aux_pair2ket uniq_l_ctr = np.concatenate([uniq_l_ctr, fake_uniq_l_ctr, aux_uniq_l_ctr]) l_ctr_offsets = np.concatenate([ - l_ctr_offsets, + l_ctr_offsets, fake_l_ctr_offsets[1:], - aux_l_ctr_offsets[1:]]) + aux_l_ctr_offsets[1:]]) bas_pair2shls = np.hstack(pair2bra + pair2ket).astype(np.int32).reshape(2,-1) bas_pairs_locs = np.append(0, np.cumsum([x.size for x in pair2bra])).astype(np.int32) log_qs = log_qs + aux_log_qs ao_loc = tot_mol.ao_loc_nr(cart=True) ncptype = len(log_qs) - + self.bpcache = ctypes.POINTER(BasisProdCache)() if diag_block_with_triu: scale_shellpair_diag = 1. @@ -319,8 +319,8 @@ def build(self, cutoff=1e-14, group_size=None, \ self.cp_idx, self.cp_jdx = np.tril_indices(ncptype) else: nl = int(round(np.sqrt(ncptype))) - self.cp_idx, self.cp_jdx = np.unravel_index(np.arange(ncptype), (nl, nl)) - + self.cp_idx, self.cp_jdx = np.unravel_index(np.arange(ncptype), (nl, nl)) + def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None): intopt = VHFOpt(mol, auxmol, 'int2e') intopt.build(thred, diag_block_with_triu=True, aosym=True, group_size_aux=64) @@ -331,7 +331,6 @@ def get_int3c2e_wjk(mol, auxmol, dm0_tag, thred=1e-12, omega=None): row, col = np.tril_indices(nao) wj = cupy.zeros([naux]) wk = cupy.zeros([naux,nao,nocc]) - nq = len(intopt.log_qs) for cp_kl_id, _ in enumerate(intopt.aux_log_qs): k0 = intopt.sph_aux_loc[cp_kl_id] k1 = intopt.sph_aux_loc[cp_kl_id+1] @@ -363,9 +362,9 @@ def get_int3c2e_ip_jk(intopt, cp_aux_id, ip_type, rhoj, rhok, dm, omega=None): cp_kl_id = cp_aux_id + len(intopt.log_qs) log_q_kl = intopt.aux_log_qs[cp_aux_id] - + k0, k1 = intopt.cart_aux_loc[cp_aux_id], intopt.cart_aux_loc[cp_aux_id+1] - ao_offsets = np.array([0,0,nao+1+k0,nao], dtype=np.int32) + ao_offsets = np.array([0,0,nao+1+k0,nao], dtype=np.int32) nk = k1 - k0 vj_ptr = vk_ptr = lib.c_null_ptr() @@ -407,9 +406,9 @@ def get_int3c2e_ip_jk(intopt, cp_aux_id, ip_type, rhoj, rhok, dm, omega=None): ctypes.c_int(ncp_ij), ctypes.c_int(cp_kl_id), ctypes.c_double(omega)) - if err != 0: + if err != 0: raise RuntimeError(f'GINT_getjk_int2e_ip failed, err={err}') - + return vj, vk @@ -427,17 +426,16 @@ def loop_int3c2e_general(intopt, ip_type='', omega=None, stream=None): if ip_type == 'ip1ip2': order = 2 if ip_type == 'ipvip1': order = 2 if ip_type == 'ipip2': order = 2 - + if omega is None: omega = 0.0 if stream is None: stream = cupy.cuda.get_current_stream() - - nao_sph = len(intopt.sph_ao_idx) + nao = intopt.mol.nao naux = intopt.auxmol.nao norb = nao + naux + 1 comp = 3**order - + nbins = 1 for aux_id, log_q_kl in enumerate(intopt.aux_log_qs): cp_kl_id = aux_id + len(intopt.log_qs) @@ -452,7 +450,9 @@ def loop_int3c2e_general(intopt, ip_type='', omega=None, stream=None): i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1] j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1] k0, k1 = intopt.cart_aux_loc[aux_id], intopt.cart_aux_loc[aux_id+1] - ni = i1 - i0; nj = j1 - j0; nk = k1 - k0 + ni = i1 - i0 + nj = j1 - j0 + nk = k1 - k0 bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) bins_locs_kl = np.array([0, len(log_q_kl)], dtype=np.int32) @@ -464,9 +464,9 @@ def loop_int3c2e_general(intopt, ip_type='', omega=None, stream=None): err = fn( ctypes.cast(stream.ptr, ctypes.c_void_p), intopt.bpcache, - ctypes.cast(int3c_blk.data.ptr, ctypes.c_void_p), - ctypes.c_int(norb), - strides.ctypes.data_as(ctypes.c_void_p), + ctypes.cast(int3c_blk.data.ptr, ctypes.c_void_p), + ctypes.c_int(norb), + strides.ctypes.data_as(ctypes.c_void_p), ao_offsets.ctypes.data_as(ctypes.c_void_p), bins_locs_ij.ctypes.data_as(ctypes.c_void_p), bins_locs_kl.ctypes.data_as(ctypes.c_void_p), @@ -476,7 +476,7 @@ def loop_int3c2e_general(intopt, ip_type='', omega=None, stream=None): ctypes.c_double(omega)) if err != 0: raise RuntimeError(f'GINT_fill_int3c2e general failed, err={err}') - + int3c_blk = cart2sph(int3c_blk, axis=1, ang=lk) int3c_blk = cart2sph(int3c_blk, axis=2, ang=lj) int3c_blk = cart2sph(int3c_blk, axis=3, ang=li) @@ -484,7 +484,7 @@ def loop_int3c2e_general(intopt, ip_type='', omega=None, stream=None): i0, i1 = intopt.sph_ao_loc[cpi], intopt.sph_ao_loc[cpi+1] j0, j1 = intopt.sph_ao_loc[cpj], intopt.sph_ao_loc[cpj+1] k0, k1 = intopt.sph_aux_loc[aux_id], intopt.sph_aux_loc[aux_id+1] - + yield i0,i1,j0,j1,k0,k1,int3c_blk def loop_aux_jk(intopt, ip_type='', omega=None, stream=None): @@ -511,7 +511,7 @@ def loop_aux_jk(intopt, ip_type='', omega=None, stream=None): norb = nao + naux + 1 comp = 3**order - + nbins = 1 for aux_id, log_q_kl in enumerate(intopt.aux_log_qs): cp_kl_id = aux_id + len(intopt.log_qs) @@ -528,7 +528,9 @@ def loop_aux_jk(intopt, ip_type='', omega=None, stream=None): i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1] j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1] k0, k1 = intopt.cart_aux_loc[aux_id], intopt.cart_aux_loc[aux_id+1] - ni = i1 - i0; nj = j1 - j0; nk = k1 - k0 + ni = i1 - i0 + nj = j1 - j0 + nk = k1 - k0 bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) bins_locs_kl = np.array([0, len(log_q_kl)], dtype=np.int32) @@ -540,9 +542,9 @@ def loop_aux_jk(intopt, ip_type='', omega=None, stream=None): err = fn( ctypes.cast(stream.ptr, ctypes.c_void_p), intopt.bpcache, - ctypes.cast(int3c_blk.data.ptr, ctypes.c_void_p), - ctypes.c_int(norb), - strides.ctypes.data_as(ctypes.c_void_p), + ctypes.cast(int3c_blk.data.ptr, ctypes.c_void_p), + ctypes.c_int(norb), + strides.ctypes.data_as(ctypes.c_void_p), ao_offsets.ctypes.data_as(ctypes.c_void_p), bins_locs_ij.ctypes.data_as(ctypes.c_void_p), bins_locs_kl.ctypes.data_as(ctypes.c_void_p), @@ -552,7 +554,7 @@ def loop_aux_jk(intopt, ip_type='', omega=None, stream=None): ctypes.c_double(omega)) if err != 0: raise RuntimeError(f'GINT_fill_int3c2e general failed, err={err}') - + int3c_blk = cart2sph(int3c_blk, axis=1, ang=lk) int3c_blk = cart2sph(int3c_blk, axis=2, ang=lj) int3c_blk = cart2sph(int3c_blk, axis=3, ang=li) @@ -585,7 +587,7 @@ def get_j_int3c2e_pass1(intopt, dm0): get rhoj pass1 for int3c2e ''' n_dm = 1 - + naux = len(intopt.cart_aux_idx) rhoj = cupy.zeros([naux]) coeff = intopt.coeff @@ -596,7 +598,7 @@ def get_j_int3c2e_pass1(intopt, dm0): bins_locs_ij = np.append(0, np.cumsum(num_cp_ij)).astype(np.int32) bins_locs_kl = np.append(0, np.cumsum(num_cp_kl)).astype(np.int32) - + ncp_ij = len(intopt.log_qs) ncp_kl = len(intopt.aux_log_qs) norb = dm_cart.shape[0] @@ -604,7 +606,7 @@ def get_j_int3c2e_pass1(intopt, dm0): intopt.bpcache, ctypes.cast(dm_cart.data.ptr, ctypes.c_void_p), ctypes.cast(rhoj.data.ptr, ctypes.c_void_p), - ctypes.c_int(norb), + ctypes.c_int(norb), ctypes.c_int(naux), ctypes.c_int(n_dm), bins_locs_ij.ctypes.data_as(ctypes.c_void_p), @@ -613,7 +615,7 @@ def get_j_int3c2e_pass1(intopt, dm0): ctypes.c_int(ncp_kl)) if err != 0: raise RuntimeError('CUDA error in get_j_pass1') - + aux_coeff = intopt.aux_coeff rhoj = cupy.dot(rhoj, aux_coeff) return rhoj @@ -632,7 +634,7 @@ def get_j_int3c2e_pass2(intopt, rhoj): bins_locs_ij = np.append(0, np.cumsum(num_cp_ij)).astype(np.int32) bins_locs_kl = np.append(0, np.cumsum(num_cp_kl)).astype(np.int32) - + ncp_ij = len(intopt.log_qs) ncp_kl = len(intopt.aux_log_qs) @@ -643,14 +645,14 @@ def get_j_int3c2e_pass2(intopt, rhoj): intopt.bpcache, ctypes.cast(vj.data.ptr, ctypes.c_void_p), ctypes.cast(rhoj.data.ptr, ctypes.c_void_p), - ctypes.c_int(norb), + ctypes.c_int(norb), ctypes.c_int(naux), ctypes.c_int(n_dm), bins_locs_ij.ctypes.data_as(ctypes.c_void_p), bins_locs_kl.ctypes.data_as(ctypes.c_void_p), - ctypes.c_int(ncp_ij), + ctypes.c_int(ncp_ij), ctypes.c_int(ncp_kl)) - + if err != 0: raise RuntimeError('CUDA error in get_j_pass2') coeff = intopt.coeff @@ -707,15 +709,15 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_k=True, omeg vk1_buf = cupy.zeros([3,nao_sph,nao_sph]) vj1 = cupy.zeros([natom,3,nao_sph,nocc]) vk1 = cupy.zeros([natom,3,nao_sph,nocc]) - + for aux_id, int3c_blk in loop_aux_jk(intopt, ip_type='ip1', omega=omega): k0, k1 = intopt.sph_aux_loc[aux_id], intopt.sph_aux_loc[aux_id+1] vj1_buf += cupy.einsum('xpji,p->xij', int3c_blk, rhoj[k0:k1]) - + if with_k: rhok0_slice = cupy.einsum('pio,Jo->piJ', rhok[k0:k1], orbo) * 2 vk1_buf += cupy.einsum('xpji,plj->xil', int3c_blk, rhok0_slice) - + rhoj0 = cupy.einsum('xpji,ij->xpi', int3c_blk, dm0_tag) vj1_ao = cupy.einsum('pjo,xpi->xijo', rhok[k0:k1], rhoj0) vj1 += 2.0*cupy.einsum('xiko,ia->axko', vj1_ao, ao2atom) @@ -723,7 +725,7 @@ def get_int3c2e_ip1_vjk(intopt, rhoj, rhok, dm0_tag, aoslices, with_k=True, omeg int3c_ip1_occ = cupy.einsum('xpji,jo->xpio', int3c_blk, orbo) vk1_ao = cupy.einsum('xpio,pki->xiko', int3c_ip1_occ, rhok0_slice) vk1 += cupy.einsum('xiko,ia->axko', vk1_ao, ao2atom) - + rhok0 = cupy.einsum('pli,lo->poi', rhok0_slice, orbo) vk1_ao = cupy.einsum('xpji,poi->xijo', int3c_blk, rhok0) vk1 += cupy.einsum('xiko,ia->axko', vk1_ao, ao2atom) @@ -736,7 +738,6 @@ def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, with_k=True, ome aux2atom = get_aux2atom(intopt, auxslices) natom = len(auxslices) nao_sph = len(intopt.sph_ao_idx) - naux_sph = len(intopt.sph_aux_idx) orbo = cupy.asarray(dm0_tag.occ_coeff, order='C') nocc = orbo.shape[1] vj1 = cupy.zeros([natom,3,nao_sph,nocc]) @@ -753,7 +754,7 @@ def get_int3c2e_ip2_vjk(intopt, rhoj, rhok, dm0_tag, auxslices, with_k=True, ome if with_k: rhok0_slice = cupy.einsum('pio,jo->pij', rhok[k0:k1], orbo) vk1_tmp = -cupy.einsum('xpjo,pij->xpio', wk2_P__, rhok0_slice) * 2 - + rhok0_oo = cupy.einsum('pio,ir->pro', rhok[k0:k1], orbo) vk1_tmp -= cupy.einsum('xpio,pro->xpir', wk2_P__, rhok0_oo) * 2 @@ -875,18 +876,18 @@ def get_int3c2e_ip_slice(intopt, cp_aux_id, ip_type, out=None, omega=None, strea nao = intopt.mol.nao naux = intopt.auxmol.nao - norb = nao + naux + 1 + norb = nao + naux + 1 nbins = 1 cp_kl_id = cp_aux_id + len(intopt.log_qs) log_q_kl = intopt.aux_log_qs[cp_aux_id] - + bins_locs_kl = np.array([0, len(log_q_kl)], dtype=np.int32) k0, k1 = intopt.cart_aux_loc[cp_aux_id], intopt.cart_aux_loc[cp_aux_id+1] - + nk = k1 - k0 - ao_offsets = np.array([0,0,nao+1+k0,nao], dtype=np.int32) + ao_offsets = np.array([0,0,nao+1+k0,nao], dtype=np.int32) if out is None: int3c_blk = cupy.zeros([3, nk, nao, nao], order='C', dtype=np.float64) strides = np.array([1, nao, nao*nao, nao*nao*nk], dtype=np.int32) @@ -900,23 +901,23 @@ def get_int3c2e_ip_slice(intopt, cp_aux_id, ip_type, out=None, omega=None, strea err = libgint.GINTfill_int3c2e_ip( ctypes.cast(stream.ptr, ctypes.c_void_p), intopt.bpcache, - ctypes.cast(int3c_blk.data.ptr, ctypes.c_void_p), - ctypes.c_int(norb), - strides.ctypes.data_as(ctypes.c_void_p), + ctypes.cast(int3c_blk.data.ptr, ctypes.c_void_p), + ctypes.c_int(norb), + strides.ctypes.data_as(ctypes.c_void_p), ao_offsets.ctypes.data_as(ctypes.c_void_p), bins_locs_ij.ctypes.data_as(ctypes.c_void_p), bins_locs_kl.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nbins), - ctypes.c_int(cp_ij_id), + ctypes.c_int(cp_ij_id), ctypes.c_int(cp_kl_id), ctypes.c_int(ip_type), ctypes.c_double(omega)) - if err != 0: + if err != 0: raise RuntimeError(f'GINT_fill_int2e_ip failed, err={err}') - + return int3c_blk - + def get_int3c2e_ip(mol, auxmol=None, ip_type=1, auxbasis='weigend+etb', direct_scf_tol=1e-13, omega=None, stream=None): ''' Generate full int3c2e_ip tensor on GPU @@ -926,15 +927,15 @@ def get_int3c2e_ip(mol, auxmol=None, ip_type=1, auxbasis='weigend+etb', direct_s fn = getattr(libgint, 'GINTfill_int3c2e_' + ip_type) if omega is None: omega = 0.0 if stream is None: stream = cupy.cuda.get_current_stream() - if auxmol is None: + if auxmol is None: auxmol = df.addons.make_auxmol(mol, auxbasis) - + nao_sph = mol.nao naux_sph = auxmol.nao intopt = VHFOpt(mol, auxmol, 'int2e') intopt.build(direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE) - + nao = intopt.mol.nao naux = intopt.auxmol.nao norb = nao + naux + 1 @@ -946,13 +947,15 @@ def get_int3c2e_ip(mol, auxmol=None, ip_type=1, auxbasis='weigend+etb', direct_s cpj = intopt.cp_jdx[cp_ij_id] li = intopt.angular[cpi] lj = intopt.angular[cpj] - + for aux_id, log_q_kl in enumerate(intopt.aux_log_qs): cp_kl_id = aux_id + len(intopt.log_qs) i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1] j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1] k0, k1 = intopt.cart_aux_loc[aux_id], intopt.cart_aux_loc[aux_id+1] - ni = i1 - i0; nj = j1 - j0; nk = k1 - k0 + ni = i1 - i0 + nj = j1 - j0 + nk = k1 - k0 lk = intopt.aux_angular[aux_id] bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) @@ -965,9 +968,9 @@ def get_int3c2e_ip(mol, auxmol=None, ip_type=1, auxbasis='weigend+etb', direct_s err = fn( ctypes.cast(stream.ptr, ctypes.c_void_p), intopt.bpcache, - ctypes.cast(int3c_blk.data.ptr, ctypes.c_void_p), - ctypes.c_int(norb), - strides.ctypes.data_as(ctypes.c_void_p), + ctypes.cast(int3c_blk.data.ptr, ctypes.c_void_p), + ctypes.c_int(norb), + strides.ctypes.data_as(ctypes.c_void_p), ao_offsets.ctypes.data_as(ctypes.c_void_p), bins_locs_ij.ctypes.data_as(ctypes.c_void_p), bins_locs_kl.ctypes.data_as(ctypes.c_void_p), @@ -976,6 +979,9 @@ def get_int3c2e_ip(mol, auxmol=None, ip_type=1, auxbasis='weigend+etb', direct_s ctypes.c_int(cp_kl_id), ctypes.c_double(omega)) + if err != 0: + raise RuntimeError("int3c2e_ip failed\n") + int3c_blk = cart2sph(int3c_blk, axis=1, ang=lk) int3c_blk = cart2sph(int3c_blk, axis=2, ang=lj) int3c_blk = cart2sph(int3c_blk, axis=3, ang=li) @@ -988,10 +994,10 @@ def get_int3c2e_ip(mol, auxmol=None, ip_type=1, auxbasis='weigend+etb', direct_s ao_idx = np.argsort(intopt.sph_ao_idx) aux_idx = np.argsort(intopt.sph_aux_idx) int3c = int3c[cupy.ix_(np.arange(3), aux_idx, ao_idx, ao_idx)] - + return int3c.transpose([0,3,2,1]) - + def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', direct_scf_tol=1e-13, omega=None, stream=None): ''' Generate full int3c2e type tensor on GPU @@ -1007,15 +1013,15 @@ def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', di if omega is None: omega = 0.0 if stream is None: stream = cupy.cuda.get_current_stream() - if auxmol is None: + if auxmol is None: auxmol = df.addons.make_auxmol(mol, auxbasis) - + nao_sph = mol.nao naux_sph = auxmol.nao intopt = VHFOpt(mol, auxmol, 'int2e') intopt.build(direct_scf_tol, diag_block_with_triu=True, aosym=False, group_size=BLKSIZE, group_size_aux=BLKSIZE) - + nao = intopt.mol.nao naux = intopt.auxmol.nao norb = nao + naux + 1 @@ -1028,13 +1034,15 @@ def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', di cpj = intopt.cp_jdx[cp_ij_id] li = intopt.angular[cpi] lj = intopt.angular[cpj] - + for aux_id, log_q_kl in enumerate(intopt.aux_log_qs): cp_kl_id = aux_id + len(intopt.log_qs) i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1] j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1] k0, k1 = intopt.cart_aux_loc[aux_id], intopt.cart_aux_loc[aux_id+1] - ni = i1 - i0; nj = j1 - j0; nk = k1 - k0 + ni = i1 - i0 + nj = j1 - j0 + nk = k1 - k0 lk = intopt.aux_angular[aux_id] bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) @@ -1047,9 +1055,9 @@ def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', di err = fn( ctypes.cast(stream.ptr, ctypes.c_void_p), intopt.bpcache, - ctypes.cast(int3c_blk.data.ptr, ctypes.c_void_p), - ctypes.c_int(norb), - strides.ctypes.data_as(ctypes.c_void_p), + ctypes.cast(int3c_blk.data.ptr, ctypes.c_void_p), + ctypes.c_int(norb), + strides.ctypes.data_as(ctypes.c_void_p), ao_offsets.ctypes.data_as(ctypes.c_void_p), bins_locs_ij.ctypes.data_as(ctypes.c_void_p), bins_locs_kl.ctypes.data_as(ctypes.c_void_p), @@ -1057,21 +1065,21 @@ def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', di ctypes.c_int(cp_ij_id), ctypes.c_int(cp_kl_id), ctypes.c_double(omega)) - + int3c_blk = cart2sph(int3c_blk, axis=1, ang=lk) int3c_blk = cart2sph(int3c_blk, axis=2, ang=lj) int3c_blk = cart2sph(int3c_blk, axis=3, ang=li) - + i0, i1 = intopt.sph_ao_loc[cpi], intopt.sph_ao_loc[cpi+1] j0, j1 = intopt.sph_ao_loc[cpj], intopt.sph_ao_loc[cpj+1] k0, k1 = intopt.sph_aux_loc[aux_id], intopt.sph_aux_loc[aux_id+1] - + int3c[:, k0:k1, j0:j1, i0:i1] = int3c_blk ao_idx = np.argsort(intopt.sph_ao_idx) aux_idx = np.argsort(intopt.sph_aux_idx) int3c = int3c[cupy.ix_(np.arange(comp), aux_idx, ao_idx, ao_idx)] - + return int3c.transpose([0,3,2,1]) def get_dh1e(mol, dm0): @@ -1100,13 +1108,13 @@ def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, aosym=None, out=None, omega=N if omega is None: omega = 0.0 nao = intopt.nao naux = intopt.nao - + norb = nao + naux + 1 - + cpi = intopt.cp_idx[cp_ij_id] cpj = intopt.cp_jdx[cp_ij_id] cp_kl_id = cp_aux_id + len(intopt.log_qs) - + log_q_ij = intopt.log_qs[cp_ij_id] log_q_kl = intopt.aux_log_qs[cp_aux_id] @@ -1117,10 +1125,12 @@ def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, aosym=None, out=None, omega=N i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1] j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1] k0, k1 = intopt.cart_aux_loc[cp_aux_id], intopt.cart_aux_loc[cp_aux_id+1] - - ni = i1 - i0; nj = j1 - j0; nk = k1 - k0 + + ni = i1 - i0 + nj = j1 - j0 + nk = k1 - k0 lk = intopt.aux_angular[cp_aux_id] - + ao_offsets = np.array([i0,j0,nao+1+k0,nao], dtype=np.int32) ''' # if possible, write the data into the given allocated space @@ -1138,26 +1148,26 @@ def get_int3c2e_slice(intopt, cp_ij_id, cp_aux_id, aosym=None, out=None, omega=N err = libgint.GINTfill_int3c2e( ctypes.cast(stream.ptr, ctypes.c_void_p), intopt.bpcache, - ctypes.cast(int3c_blk.data.ptr, ctypes.c_void_p), - ctypes.c_int(norb), - strides.ctypes.data_as(ctypes.c_void_p), + ctypes.cast(int3c_blk.data.ptr, ctypes.c_void_p), + ctypes.c_int(norb), + strides.ctypes.data_as(ctypes.c_void_p), ao_offsets.ctypes.data_as(ctypes.c_void_p), bins_locs_ij.ctypes.data_as(ctypes.c_void_p), bins_locs_kl.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nbins), - ctypes.c_int(cp_ij_id), + ctypes.c_int(cp_ij_id), ctypes.c_int(cp_kl_id), ctypes.c_double(omega)) - - if err != 0: + + if err != 0: raise RuntimeError('GINT_fill_int2e failed') - + # move this operation to j2c? - if lk > 1: + if lk > 1: int3c_blk = cart2sph(int3c_blk, axis=0, ang=lk, out=out) - + return int3c_blk - + def get_int3c2e(mol, auxmol=None, auxbasis='weigend+etb', direct_scf_tol=1e-13, aosym=True, omega=None): ''' Generate full int3c2e tensor on GPU @@ -1170,7 +1180,7 @@ def get_int3c2e(mol, auxmol=None, auxbasis='weigend+etb', direct_scf_tol=1e-13, naux_sph = auxmol.nao intopt = VHFOpt(mol, auxmol, 'int2e') intopt.build(direct_scf_tol, diag_block_with_triu=True, aosym=aosym, group_size=BLKSIZE, group_size_aux=BLKSIZE) - + int3c = cupy.zeros([naux_sph, nao_sph, nao_sph], order='C') for cp_ij_id, _ in enumerate(intopt.log_qs): cpi = intopt.cp_idx[cp_ij_id] @@ -1212,7 +1222,7 @@ def get_int2c2e_sorted(mol, auxmol, intopt=None, direct_scf_tol=1e-13, aosym=Non naux = intopt.sorted_auxmol.nao norb = nao + naux + 1 rows, cols = np.tril_indices(naux) - + int2c = cupy.zeros([naux, naux], order='F') ao_offsets = np.array([nao+1, nao, nao+1, nao], dtype=np.int32) strides = np.array([1, naux, naux, naux*naux], dtype=np.int32) @@ -1226,9 +1236,9 @@ def get_int2c2e_sorted(mol, auxmol, intopt=None, direct_scf_tol=1e-13, aosym=Non err = libgint.GINTfill_int2e( ctypes.cast(stream.ptr, ctypes.c_void_p), intopt.bpcache, - ctypes.cast(int2c.data.ptr, ctypes.c_void_p), + ctypes.cast(int2c.data.ptr, ctypes.c_void_p), ctypes.c_int(norb), - strides.ctypes.data_as(ctypes.c_void_p), + strides.ctypes.data_as(ctypes.c_void_p), ao_offsets.ctypes.data_as(ctypes.c_void_p), bins_locs_k.ctypes.data_as(ctypes.c_void_p), bins_locs_l.ctypes.data_as(ctypes.c_void_p), @@ -1239,11 +1249,11 @@ def get_int2c2e_sorted(mol, auxmol, intopt=None, direct_scf_tol=1e-13, aosym=Non if err != 0: raise RuntimeError("int2c2e failed\n") - + int2c[rows, cols] = int2c[cols, rows] coeff = intopt.aux_cart2sph int2c = coeff.T @ int2c @ coeff - + return int2c def get_int2c2e_ip_sorted(mol, auxmol, intopt=None, direct_scf_tol=1e-13, intor=None, aosym=None, stream=None): @@ -1260,7 +1270,7 @@ def get_int2c2e_ip_sorted(mol, auxmol, intopt=None, direct_scf_tol=1e-13, intor= naux = intopt.sorted_auxmol.nao norb = nao + naux + 1 rows, cols = np.tril_indices(naux) - + int2c = cupy.zeros([naux, naux], order='F') ao_offsets = np.array([nao+1, nao, nao+1, nao], dtype=np.int32) strides = np.array([1, naux, naux, naux*naux], dtype=np.int32) @@ -1274,28 +1284,28 @@ def get_int2c2e_ip_sorted(mol, auxmol, intopt=None, direct_scf_tol=1e-13, intor= err = libgint.GINTfill_int2e( ctypes.cast(stream.ptr, ctypes.c_void_p), intopt.bpcache, - ctypes.cast(int2c.data.ptr, ctypes.c_void_p), + ctypes.cast(int2c.data.ptr, ctypes.c_void_p), ctypes.c_int(norb), - strides.ctypes.data_as(ctypes.c_void_p), + strides.ctypes.data_as(ctypes.c_void_p), ao_offsets.ctypes.data_as(ctypes.c_void_p), bins_locs_k.ctypes.data_as(ctypes.c_void_p), bins_locs_l.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(nbins), - ctypes.c_int(cp_k_id), + ctypes.c_int(cp_k_id), ctypes.c_int(cp_l_id)) if err != 0: raise RuntimeError("int2c2e failed\n") - + int2c[rows, cols] = int2c[cols, rows] coeff = intopt.aux_cart2sph int2c = coeff.T @ int2c @ coeff - + return int2c def get_int2c2e(mol, auxmol, direct_scf_tol=1e-13): - ''' - Generate int2c2e on GPU + ''' + Generate int2c2e on GPU ''' intopt = VHFOpt(mol, auxmol, 'int2e') intopt.build(direct_scf_tol, diag_block_with_triu=True, aosym=True) @@ -1308,26 +1318,26 @@ def sort_mol(mol0, cart=True): ''' # Sort basis according to angular momentum and contraction patterns so # as to group the basis functions to blocks in GPU kernel. - ''' + ''' mol = copy.copy(mol0) l_ctrs = mol._bas[:,[gto.ANG_OF, gto.NPRIM_OF]] - + uniq_l_ctr, _, inv_idx, l_ctr_counts = np.unique( l_ctrs, return_index=True, return_inverse=True, return_counts=True, axis=0) - + if mol.verbose >= logger.DEBUG: logger.debug1(mol, 'Number of shells for each [l, nctr] group') for l_ctr, n in zip(uniq_l_ctr, l_ctr_counts): logger.debug(mol, ' %s : %s', l_ctr, n) - + sorted_idx = np.argsort(inv_idx, kind='stable').astype(np.int32) - + # Sort basis inplace mol._bas = mol._bas[sorted_idx] return mol, sorted_idx, uniq_l_ctr, l_ctr_counts -def get_pairing(p_offsets, q_offsets, q_cond, \ - cutoff=1e-14, diag_block_with_triu=True, aosym=True): +def get_pairing(p_offsets, q_offsets, q_cond, + cutoff=1e-14, diag_block_with_triu=True, aosym=True): ''' pair shells and return pairing indices ''' @@ -1359,13 +1369,13 @@ def get_pairing(p_offsets, q_offsets, q_cond, \ if not diag_block_with_triu: # Drop the shell pairs in the upper triangle for diagonal blocks mask &= ishs >= jshs - + ishs = ishs[mask] jshs = jshs[mask] ishs += p0 jshs += p0 if len(ishs) == 0 and len(jshs) == 0: continue - + pair2bra.append(ishs) pair2ket.append(jshs) @@ -1413,7 +1423,7 @@ def get_ao_pairs(pair2bra, pair2ket, ao_loc): bra_ctr.append(np.array([], dtype=np.int64)) ket_ctr.append(np.array([], dtype=np.int64)) continue - + i = bra_shl[0] j = ket_shl[0] indices = np.mgrid[:ao_loc[i+1]-ao_loc[i], :ao_loc[j+1]-ao_loc[j]] diff --git a/gpu4pyscf/df/patch_pyscf.py b/gpu4pyscf/df/patch_pyscf.py index 28d77ea7..0e032eb9 100644 --- a/gpu4pyscf/df/patch_pyscf.py +++ b/gpu4pyscf/df/patch_pyscf.py @@ -21,10 +21,9 @@ from gpu4pyscf.df.df_jk import _density_fit, _get_jk from pyscf import df -from gpu4pyscf.lib.utils import patch_cpu_kernel # The device attribute is patched to the pyscf base SCF module. It will be # seen by all subclasses. -df.df_jk.density_fit = patch_cpu_kernel(df.df_jk.density_fit)(_density_fit) -df.df_jk.get_jk = patch_cpu_kernel(df.df_jk.get_jk)(_get_jk) +df.df_jk.density_fit = _density_fit +df.df_jk.get_jk = _get_jk diff --git a/gpu4pyscf/df/tests/test_df_ecp.py b/gpu4pyscf/df/tests/test_df_ecp.py index 5a3370e2..e13e2ef3 100644 --- a/gpu4pyscf/df/tests/test_df_ecp.py +++ b/gpu4pyscf/df/tests/test_df_ecp.py @@ -52,13 +52,11 @@ def tearDownModule(): def run_dft(xc): mf = rks.RKS(mol, xc=xc).density_fit(auxbasis=auxbasis) mf.grids.level = grids_level - mf.device = 'gpu' e_dft = mf.kernel() return e_dft def _check_grad(grid_response=False, tol=1e-5): mf = rks.RKS(mol, xc=xc).density_fit(auxbasis=auxbasis) - mf.device = 'gpu' mf.grids.level = grids_level mf.conv_tol = 1e-12 e_tot = mf.kernel() @@ -81,9 +79,8 @@ def _check_grad(grid_response=False, tol=1e-5): mol.set_geom_(coords, unit='Bohr') mol.build() e0 = f_scanner(mol) - + mf = rks.RKS(mol, xc=xc).density_fit(auxbasis=auxbasis) - mf.device = 'gpu' mf.grids.level = grids_level coords[i,j] -= 2.0 * eps @@ -92,7 +89,6 @@ def _check_grad(grid_response=False, tol=1e-5): e1 = f_scanner(mol) mf = rks.RKS(mol, xc=xc).density_fit(auxbasis=auxbasis) - mf.device = 'gpu' mf.grids.level = grids_level coords[i,j] += eps diff --git a/gpu4pyscf/df/tests/test_df_grad.py b/gpu4pyscf/df/tests/test_df_grad.py index 6daec726..4e70018f 100644 --- a/gpu4pyscf/df/tests/test_df_grad.py +++ b/gpu4pyscf/df/tests/test_df_grad.py @@ -56,7 +56,6 @@ def tearDownModule(): def _check_grad(grid_response=False, xc=xc0, disp=disp0, tol=1e-6): mf = rks.RKS(mol, xc=xc, disp=disp).density_fit(auxbasis=auxbasis0) - mf.device = 'gpu' mf.grids.level = grids_level mf.conv_tol = 1e-12 e_tot = mf.kernel() @@ -79,9 +78,8 @@ def _check_grad(grid_response=False, xc=xc0, disp=disp0, tol=1e-6): mol.set_geom_(coords, unit='Bohr') mol.build() e0 = f_scanner(mol) - + mf = rks.RKS(mol, xc=xc, disp=disp).density_fit(auxbasis=auxbasis0) - mf.device = 'gpu' mf.grids.level = grids_level coords[i,j] -= 2.0 * eps @@ -90,7 +88,6 @@ def _check_grad(grid_response=False, xc=xc0, disp=disp0, tol=1e-6): e1 = f_scanner(mol) mf = rks.RKS(mol, xc=xc, disp=disp).density_fit(auxbasis=auxbasis0) - mf.device = 'gpu' mf.grids.level = grids_level coords[i,j] += eps diff --git a/gpu4pyscf/df/tests/test_df_scf.py b/gpu4pyscf/df/tests/test_df_scf.py index cb81c19b..ba959b69 100644 --- a/gpu4pyscf/df/tests/test_df_scf.py +++ b/gpu4pyscf/df/tests/test_df_scf.py @@ -46,7 +46,6 @@ def tearDownModule(): def run_dft(xc): mf = rks.RKS(mol, xc=xc).density_fit(auxbasis='def2-tzvpp-jkfit') mf.grids.level = grids_level - mf.device = 'gpu' e_dft = mf.kernel() return e_dft diff --git a/gpu4pyscf/df/tests/test_geomopt.py b/gpu4pyscf/df/tests/test_geomopt.py index c205975c..4960278f 100644 --- a/gpu4pyscf/df/tests/test_geomopt.py +++ b/gpu4pyscf/df/tests/test_geomopt.py @@ -52,7 +52,6 @@ def tearDownModule(): class KnownValues(unittest.TestCase): def test_geomopt(self): mf = rks.RKS(mol, xc=xc, disp=disp).density_fit() - mf.device = 'gpu' mf.grids.level = grids_level e_tot = mf.kernel() mol_eq = optimize(mf, maxsteps=20) diff --git a/gpu4pyscf/dft/gen_grid.py b/gpu4pyscf/dft/gen_grid.py index feaafbd8..09358fca 100644 --- a/gpu4pyscf/dft/gen_grid.py +++ b/gpu4pyscf/dft/gen_grid.py @@ -38,7 +38,6 @@ from cupyx.scipy.spatial.distance import cdist from gpu4pyscf.dft import radi from gpu4pyscf.lib.cupy_helper import load_library -from gpu4pyscf.lib.utils import to_cpu, to_gpu libdft = lib.load_library('libdft') libgdft = load_library('libgdft') @@ -362,7 +361,6 @@ def gen_grid_partition0(coords): ''' def gen_grid_partition(coords): grid_dist = cupy.linalg.norm(coords[None,:,:] - atm_coords[:,None,:], axis=-1) - ngrid = coords.shape[0] r12 = grid_dist[:,None,:] - grid_dist[None,:,:] rinv = 1.0/atm_dist cupy.fill_diagonal(rinv, 0.0) @@ -530,13 +528,15 @@ class Grids(lib.StreamObject): Eg, grids.atom_grid = {'H': (20,110)} will generate 20 radial grids and 110 angular grids for H atom. - Examples: + Examples: - >>> mol = gto.M(atom='H 0 0 0; H 0 0 1.1') - >>> grids = dft.gen_grid.Grids(mol) - >>> grids.level = 4 - >>> grids.build() - ''' + >>> mol = gto.M(atom='H 0 0 0; H 0 0 1.1') + >>> grids = dft.gen_grid.Grids(mol) + >>> grids.level = 4 + >>> grids.build() + ''' + + from gpu4pyscf.lib.utils import to_cpu, to_gpu, device atomic_radii = _load_conf(radi, 'dft_gen_grid_Grids_atomic_radii', radi.BRAGG_RADII) @@ -552,9 +552,6 @@ class Grids(lib.StreamObject): alignment = ALIGNMENT_UNIT cutoff = CUTOFF - to_cpu = to_cpu - to_gpu = to_gpu - def __init__(self, mol): self.mol = mol self.stdout = mol.stdout diff --git a/gpu4pyscf/dft/gks.py b/gpu4pyscf/dft/gks.py index b432b036..53992f0e 100644 --- a/gpu4pyscf/dft/gks.py +++ b/gpu4pyscf/dft/gks.py @@ -17,27 +17,14 @@ from pyscf.dft import gks from gpu4pyscf.dft import numint -from gpu4pyscf.scf.ghf import get_jk, _eigh -from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu, to_gpu +from gpu4pyscf.scf.ghf import GHF class GKS(gks.GKS): - to_cpu = to_cpu - to_gpu = to_gpu + from gpu4pyscf.lib.utils import to_cpu, to_gpu, device def __init__(self, mol, xc='LDA,VWN'): super().__init__(mol, xc) self._numint = numint.NumInt() - @property - def device(self): - return self._numint.device - @device.setter - def device(self, value): - self._numint.device = value - - @patch_cpu_kernel(gks.GKS.get_jk) - def get_jk(self, mol=None, dm=None, hermi=0, with_j=True, with_k=True, - omega=None): - return get_jk(mol, dm, hermi, with_j, with_k, omega) - - _eigh = patch_cpu_kernel(gks.GKS._eigh)(_eigh) + get_jk = GHF.get_jk + _eigh = GHF._eigh diff --git a/gpu4pyscf/dft/libxc.py b/gpu4pyscf/dft/libxc.py index ba8df6c6..4c6ebc90 100644 --- a/gpu4pyscf/dft/libxc.py +++ b/gpu4pyscf/dft/libxc.py @@ -56,7 +56,7 @@ class _xcfun(ctypes.Structure): libxc.xc_func_end.argtypes = (_xc_func_p, ) libxc.xc_func_free.argtypes = (_xc_func_p, ) -class XCfun(): +class XCfun: def __init__(self, xc, spin): assert spin == 'unpolarized' self._spin = 1 diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py index 6a21fb5f..389f8cd9 100644 --- a/gpu4pyscf/dft/numint.py +++ b/gpu4pyscf/dft/numint.py @@ -25,7 +25,6 @@ from pyscf.dft import numint from pyscf.gto.eval_gto import NBINS, CUTOFF, make_screen_index from gpu4pyscf.scf.hf import basis_seg_contraction -from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu, to_gpu from gpu4pyscf.lib.cupy_helper import contract, get_avail_mem, load_library, add_sparse, release_gpu_stack from gpu4pyscf.dft import xc_deriv, xc_alias, libxc from gpu4pyscf import __config__ @@ -259,7 +258,7 @@ def eval_rho3(mol, ao, c0, mo1, non0tab=None, xctype='LDA', if ao.shape[0] > 4: XX, YY, ZZ = 4, 7, 9 ao2 = ao[XX] + ao[YY] + ao[ZZ] - c1 = _dot_ao_dm(mol, ao2, cpos, non0tab, shls_slice, ao_loc) + c1 = _dot_ao_dm(mol, ao2, cpos1, non0tab, shls_slice, ao_loc) #:rho[4] = numpy.einsum('pi,pi->p', c0, c1) rho[4] = _contract_rho(c0, c1) rho[4] += rho[5] @@ -363,7 +362,6 @@ def _vv10nlc(rho, coords, vvrho, vvweight, vvcoords, nlc_pars): vxc[1,threshind] = 1.5*W*dW0dG return exc,vxc -@patch_cpu_kernel(numint.nr_rks) def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, max_memory=2000, verbose=None): log = logger.new_logger(mol, verbose) @@ -374,7 +372,7 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, opt = ni.gdftopt mo_coeff = getattr(dms, 'mo_coeff', None) - mo_occ = getattr(dms,'mo_occ', None) + mo_occ = getattr(dms,'mo_occ', None) mol = opt.mol coeff = cupy.asarray(opt.coeff) @@ -427,7 +425,7 @@ def _make_rho(ao_value, dm, xctype=None): if USE_SPARSITY == 0: vmat[i] += ao.dot(_scale_ao(ao, wv).T) elif USE_SPARSITY == 1: - _dot_ao_ao_sparse(ao, ao, wv, nbins, sindex, ao_loc, \ + _dot_ao_ao_sparse(ao, ao, wv, nbins, sindex, ao_loc, pair2shls_full, pairs_locs_full, vmat[i]) elif USE_SPARSITY == 2: mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[1]) @@ -446,7 +444,7 @@ def _make_rho(ao_value, dm, xctype=None): vmat[i] += ao[0].dot(_scale_ao(ao, wv).T) elif USE_SPARSITY == 1: aow = _scale_ao(ao, wv) - _dot_ao_ao_sparse(ao[0], aow, None, nbins, sindex, ao_loc, \ + _dot_ao_ao_sparse(ao[0], aow, None, nbins, sindex, ao_loc, pair2shls_full, pairs_locs_full, vmat[i]) elif USE_SPARSITY == 2: mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[0,2]) @@ -468,9 +466,9 @@ def _make_rho(ao_value, dm, xctype=None): vmat[i] += ao[0].dot(aow.T) vmat[i] += _tau_dot(ao, ao, wv[4]) elif USE_SPARSITY == 1: - _dot_ao_ao_sparse(ao[0], aow, None, nbins, sindex, ao_loc, \ + _dot_ao_ao_sparse(ao[0], aow, None, nbins, sindex, ao_loc, pair2shls_full, pairs_locs_full, vmat[i]) - _tau_dot_sparse(ao, ao, wv[4], nbins, sindex, ao_loc, \ + _tau_dot_sparse(ao, ao, wv[4], nbins, sindex, ao_loc, pair2shls_full, pairs_locs_full, vmat[i]) else: mask = cupy.any(cupy.abs(ao) > AO_THRESHOLD, axis=[0,2]) @@ -506,7 +504,6 @@ def _make_rho(ao_value, dm, xctype=None): vmat = vmat[0] return nelec, excsum, vmat#np.asarray(vmat) -@patch_cpu_kernel(numint.nr_uks) def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, max_memory=2000, verbose=None): log = logger.new_logger(mol, verbose) @@ -547,7 +544,6 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, ngrids = grids.weights.size for p0, p1 in lib.prange(0, ngrids, block_size): - coords = grids.coords[p0:p1, :] ao = eval_ao(ni, opt.mol, grids.coords[p0:p1], ao_deriv) weight = grids.weights[p0:p1] for i in range(nset): @@ -609,7 +605,6 @@ def nr_uks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, return nelec, excsum, vmat -@patch_cpu_kernel(numint.get_rho) def get_rho(ni, mol, dm, grids, max_memory=2000): opt = getattr(ni, 'gdftopt', None) if opt is None: @@ -637,7 +632,6 @@ def get_rho(ni, mol, dm, grids, max_memory=2000): cupy.get_default_memory_pool().free_all_blocks() return rho -@patch_cpu_kernel(numint.nr_rks_fxc) def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=0, rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None): if fxc is None: @@ -657,7 +651,6 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= if with_mocc: mo1 = cupy.einsum('nio,pi->npo', dms.mo1, coeff) * 2.0**0.5 occ_coeff = cupy.einsum('io,pi->po', dms.occ_coeff, coeff) * 2.0**0.5 - mo_occ = dms.mo_occ dms = contract('nij,qj->niq', dms, coeff) dms = contract('pi,niq->npq', coeff, dms) nset = len(dms) @@ -667,7 +660,8 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= ao_deriv = 0 else: ao_deriv = 1 - p0 = 0; p1 = 0 + p0 = 0 + p1 = 0 t0 = (logger.process_clock(), logger.perf_counter()) for ao, mask, weights, coords in ni.block_loop(opt.mol, grids, nao, ao_deriv): p0, p1 = p1, p1+len(weights) @@ -720,7 +714,6 @@ def nr_rks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi= return cupy.asarray(vmat) -@patch_cpu_kernel(numint.nr_rks_fxc_st) def nr_rks_fxc_st(ni, mol, grids, xc_code, dm0=None, dms_alpha=None, relativity=0, singlet=True, rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None): @@ -733,7 +726,6 @@ def nr_rks_fxc_st(ni, mol, grids, xc_code, dm0=None, dms_alpha=None, return nr_rks_fxc(ni, mol, grids, xc_code, dm0, dms_alpha, hermi=0, fxc=fxc) -@patch_cpu_kernel(numint.nr_uks_fxc) def nr_uks_fxc(ni, mol, grids, xc_code, dm0=None, dms=None, relativity=0, hermi=0, rho0=None, vxc=None, fxc=None, max_memory=2000, verbose=None): if fxc is None: @@ -890,10 +882,9 @@ def nr_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, vmat = vmat + vmat.T vmat = contract('pi,pq->iq', coeff, vmat) vmat = contract('qj,iq->ij', coeff, vmat) - t1 = log.timer_debug1('eval vv10', *t0) + log.timer_debug1('eval vv10', *t0) return nelec, excsum, vmat -@patch_cpu_kernel(numint.cache_xc_kernel) def cache_xc_kernel(ni, mol, grids, xc_code, mo_coeff, mo_occ, spin=0, max_memory=2000): xctype = ni._xc_type(xc_code) @@ -986,7 +977,7 @@ def eval_xc_eff(ni, xc_code, rho, deriv=1, omega=None, xctype=None, verbose=None do_kxc = deriv > 2 vxc_labels = ["vrho", "vsigma", "vlapl", "vtau"] - fxc_labels = ["v2rho2", "v2rhosigma", "v2sigma2", "v2lapl2", "v2tau2", \ + fxc_labels = ["v2rho2", "v2rhosigma", "v2sigma2", "v2lapl2", "v2tau2", "v2rholapl", "v2rhotau", "v2lapltau", "v2sigmalapl", "v2sigmatau"] kxc_labels = ["v3rho3", "v3rho2sigma", "v3rhosigma2", "v3sigma3", "v3rho2lapl", "v3rho2tau", @@ -1074,10 +1065,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000, yield ao, sindex, weight, coords class NumInt(numint.NumInt): - to_cpu = to_cpu - to_gpu = to_gpu - - device = 'gpu' + from gpu4pyscf.lib.utils import to_cpu, to_gpu, device def __init__(self, xc='LDA'): super().__init__() diff --git a/gpu4pyscf/dft/patch_pyscf.py b/gpu4pyscf/dft/patch_pyscf.py index 3b3676fe..75e7a27c 100644 --- a/gpu4pyscf/dft/patch_pyscf.py +++ b/gpu4pyscf/dft/patch_pyscf.py @@ -22,10 +22,8 @@ from pyscf.dft.rks import KohnShamDFT from pyscf.dft.numint import NumInt from gpu4pyscf.dft import numint as gpu_numint -from gpu4pyscf.lib.utils import patch_cpu_kernel print(f'{NumInt} monkey-patched') -NumInt.device = 'gpu' NumInt.get_rho = gpu_numint.get_rho NumInt.nr_rks = gpu_numint.nr_rks NumInt.nr_uks = gpu_numint.nr_uks @@ -33,10 +31,3 @@ NumInt.nr_uks_fxc = gpu_numint.nr_uks_fxc NumInt.nr_rks_fxc_st = gpu_numint.nr_rks_fxc_st NumInt.cache_xc_kernel = gpu_numint.cache_xc_kernel - -print(f'{KohnShamDFT} monkey-patched') -def _get_device(obj): - return getattr(obj._numint, 'device', 'cpu') -def _set_device(obj, value): - obj._numint.device = value -KohnShamDFT.device = property(_get_device, _set_device) diff --git a/gpu4pyscf/dft/rks.py b/gpu4pyscf/dft/rks.py index ea995601..868c4b2d 100644 --- a/gpu4pyscf/dft/rks.py +++ b/gpu4pyscf/dft/rks.py @@ -23,11 +23,10 @@ from pyscf.scf import hf as pyscf_hf from pyscf.dft import rks -from gpu4pyscf import scf from gpu4pyscf.scf import diis from gpu4pyscf.lib import logger from gpu4pyscf.dft import numint, gen_grid -from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu, to_gpu +from gpu4pyscf.scf.hf import RHF from gpu4pyscf.lib.cupy_helper import load_library, tag_array libcupy_helper = load_library('libcupy_helper') @@ -100,7 +99,7 @@ def initialize_grids(ks, mol=None, dm=None): return ks -def _get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1): +def get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1): '''Coulomb + XC functionals .. note:: This function will modify the input ks object. @@ -203,9 +202,8 @@ def _get_veff(ks, mol=None, dm=None, dm_last=0, vhf_last=0, hermi=1): vxc = tag_array(vxc, ecoul=ecoul, exc=exc, vj=vj, vk=vk) return vxc -class RKS(rks.RKS, scf.hf.RHF): - to_cpu = to_cpu - to_gpu = to_gpu +class RKS(rks.RKS): + from gpu4pyscf.lib.utils import to_cpu, to_gpu, device def __init__(self, mol, xc='LDA,VWN', disp=None): super().__init__(mol, xc) @@ -214,13 +212,6 @@ def __init__(self, mol, xc='LDA,VWN', disp=None): self.screen_tol = 1e-14 self.grids = gen_grid.Grids(mol) - @property - def device(self): - return self._numint.device - @device.setter - def device(self, value): - self._numint.device = value - def get_dispersion(self): if self.disp is None: return 0.0 @@ -269,4 +260,6 @@ def energy_tot(self, dm, h1e, vhf=None): self.scf_summary['nuc'] = nuc.real return e_tot - get_veff = patch_cpu_kernel(rks.RKS.get_veff)(_get_veff) + get_jk = RHF.get_jk + get_veff = get_veff + _eigh = RHF._eigh diff --git a/gpu4pyscf/dft/roks.py b/gpu4pyscf/dft/roks.py index cb83d395..6ec07008 100644 --- a/gpu4pyscf/dft/roks.py +++ b/gpu4pyscf/dft/roks.py @@ -17,23 +17,14 @@ from pyscf.dft import roks from gpu4pyscf.dft import numint -from gpu4pyscf.scf.hf import _get_jk, _eigh -from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu, to_gpu +from gpu4pyscf.scf.rohf import ROHF class ROKS(roks.ROKS): - to_cpu = to_cpu - to_gpu = to_gpu + from gpu4pyscf.lib.utils import to_cpu, to_gpu, device def __init__(self, mol, xc='LDA,VWN'): super().__init__(mol, xc) self._numint = numint.NumInt() - @property - def device(self): - return self._numint.device - @device.setter - def device(self, value): - self._numint.device = value - - get_jk = patch_cpu_kernel(roks.ROKS.get_jk)(_get_jk) - _eigh = patch_cpu_kernel(roks.ROKS._eigh)(_eigh) + get_jk = ROHF.get_jk + _eigh = ROHF._eigh diff --git a/gpu4pyscf/dft/tests/test_ao_values.py b/gpu4pyscf/dft/tests/test_ao_values.py index 02aeff94..11c2c0de 100644 --- a/gpu4pyscf/dft/tests/test_ao_values.py +++ b/gpu4pyscf/dft/tests/test_ao_values.py @@ -71,7 +71,7 @@ def test_ao_sph_deriv2(self): ao_cpu = cupy.asarray(ao) ni = NumInt(xc='LDA') ao_gpu = numint.eval_ao(ni, mol_sph, coords, deriv=2) - idx = cupy.argwhere(cupy.abs(ao_gpu - ao_cpu) > 1e-10) + #idx = cupy.argwhere(cupy.abs(ao_gpu - ao_cpu) > 1e-10) assert cupy.linalg.norm(ao_cpu - ao_gpu) < 1e-8 def test_ao_sph_deriv3(self): diff --git a/gpu4pyscf/dft/tests/test_dft.py b/gpu4pyscf/dft/tests/test_dft.py index ce4ea0a4..0a6585e5 100644 --- a/gpu4pyscf/dft/tests/test_dft.py +++ b/gpu4pyscf/dft/tests/test_dft.py @@ -40,7 +40,6 @@ def run_dft(xc): mf = rks.RKS(mol, xc=xc) mf.grids.level = grids_level mf.nlcgrids.level = nlcgrids_level - mf.device = 'gpu' e_dft = mf.kernel() return e_dft diff --git a/gpu4pyscf/dft/tests/test_dft_ecp.py b/gpu4pyscf/dft/tests/test_dft_ecp.py index 57f1d2f3..fcfab5c2 100644 --- a/gpu4pyscf/dft/tests/test_dft_ecp.py +++ b/gpu4pyscf/dft/tests/test_dft_ecp.py @@ -43,7 +43,6 @@ def tearDownModule(): def run_dft(xc): mf = rks.RKS(mol, xc=xc) mf.grids.level = grids_level - mf.device = 'gpu' e_dft = mf.kernel() return e_dft diff --git a/gpu4pyscf/dft/tests/test_numint.py b/gpu4pyscf/dft/tests/test_numint.py index be3ffe20..0862976e 100644 --- a/gpu4pyscf/dft/tests/test_numint.py +++ b/gpu4pyscf/dft/tests/test_numint.py @@ -62,19 +62,17 @@ def tearDownModule(): del mol class KnownValues(unittest.TestCase): - + def _check_vxc(self, method, xc): ni = NumInt(xc=xc) fn = getattr(ni, method) - ni.device = 'gpu' n, e, v = fn(mol, grids_gpu, xc, dm0, hermi=1) v = [i.get() for i in v] - + ni_pyscf = pyscf_numint() - ni_pyscf.device = 'cpu' fn = getattr(ni_pyscf, method) nref, eref, vref = fn(mol, grids_cpu, xc, dm0, hermi=1) - + cupy.allclose(e, eref) cupy.allclose(n, nref) cupy.allclose(v, vref) @@ -86,7 +84,6 @@ def _check_rks_fxc(self, xc, hermi=1): t1 = dm spin = 0 ni_pyscf = pyscf_numint() - ni_pyscf.device = 'cpu' rho, vxc, fxc = ni_pyscf.cache_xc_kernel(mol, grids_cpu, xc, mo_coeff, mo_occ, spin) vref = ni_pyscf.nr_rks_fxc( mol, grids_cpu, xc, dm0=dm0, dms=t1, rho0=rho, vxc=vxc, fxc=fxc, hermi=hermi) @@ -95,7 +92,6 @@ def _check_rks_fxc(self, xc, hermi=1): vxc0 = vxc.copy() fxc0 = fxc.copy() ni = NumInt() - ni.device = 'gpu' rho, vxc, fxc = ni.cache_xc_kernel(mol, grids_gpu, xc, cupy.asarray(mo_coeff), cupy.asarray(mo_occ), spin) v = ni.nr_rks_fxc(mol, grids_gpu, xc, dms=t1, fxc=fxc, hermi=hermi) if xc == MGGA_M06: @@ -110,14 +106,12 @@ def _check_rks_fxc(self, xc, hermi=1): def _check_rks_fxc_st(self, xc, fpref): ni = NumInt() spin = 1 - ni.device = 'gpu' rho, vxc, fxc = ni.cache_xc_kernel( mol, grids_gpu, xc, [mo_coeff]*2, [mo_occ*.5]*2, spin) v = ni.nr_rks_fxc_st(mol, grids_gpu, xc, dms_alpha=dm, fxc=fxc) self.assertAlmostEqual(lib.fp(v), fpref, 12) - + ni_pyscf = pyscf_numint() - ni_pyscf.device = 'cpu' rho, vxc, fxc = ni_pyscf.cache_xc_kernel( mol, grids_cpu, xc, [mo_coeff]*2, [mo_occ*.5]*2, spin) vref = ni_pyscf.nr_rks_fxc_st( @@ -131,29 +125,28 @@ def _check_uks_fxc(self, xc, fpref, hermi=1): t1 = dm ni = NumInt() spin = 1 - ni.device = 'gpu' rho, vxc, fxc = ni.cache_xc_kernel( mol, grids_gpu, xc, [mo_coeff]*2, [mo_occ, 1-mo_occ], spin) v = ni.nr_uks_fxc(mol, grids_gpu, xc, dms=t1, fxc=fxc, hermi=hermi) self.assertAlmostEqual(lib.fp(v), fpref, 12) - - ni.device = 'cpu' + + ni = ni.to_cpu() dm0 = mo_coeff.dot(mo_coeff.T) rho, vxc, fxc = ni.cache_xc_kernel( mol, grids_cpu, xc, [mo_coeff]*2, [mo_occ, 1-mo_occ], spin) vref = ni.nr_uks_fxc( mol, grids_cpu, xc, dm0=dm0, dms=t1, rho0=rho, vxc=vxc, fxc=fxc, hermi=hermi) self.assertAlmostEqual(abs(v - vref).max(), 0, 12) - + def test_rks_lda(self): self._check_vxc('nr_rks', LDA) def test_rks_gga(self): self._check_vxc('nr_rks', GGA_PBE) - + def test_rks_mgga(self): self._check_vxc('nr_rks', MGGA_M06) - + # Not implemented yet ''' def test_uks_lda(self): @@ -165,13 +158,13 @@ def test_uks_gga(self): def test_uks_mgga(self): self._check_vxc('nr_uks', 'm06', 83.5606316500255) ''' - + def test_rks_fxc_lda(self): self._check_rks_fxc(LDA, hermi=1) - + def test_rks_fxc_gga(self): self._check_rks_fxc(GGA_PBE, hermi=1) - + def test_rks_fxc_mgga(self): self._check_rks_fxc(MGGA_M06, hermi=1) @@ -187,13 +180,13 @@ def test_uks_fxc_gga(self): def test_uks_fxc_mgga(self): self._check_uks_fxc('m06', 0.7005336565753997, hermi=1) self._check_uks_fxc('m06', 0.35026682828770006, hermi=0) - + def test_rks_fxc_st_lda(self): self._check_rks_fxc_st('lda', -0.06358425564270553) def test_rks_fxc_st_gga(self): self._check_rks_fxc_st('pbe', -0.006650911990898234) - + def test_rks_fxc_st_mgga(self): self._check_rks_fxc_st('m06', 1.2456987899337242) ''' diff --git a/gpu4pyscf/dft/uks.py b/gpu4pyscf/dft/uks.py index 1f7775be..395d9e9f 100644 --- a/gpu4pyscf/dft/uks.py +++ b/gpu4pyscf/dft/uks.py @@ -17,23 +17,14 @@ from pyscf.dft import uks from gpu4pyscf.dft import numint -from gpu4pyscf.scf.hf import _get_jk, _eigh -from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu, to_gpu +from gpu4pyscf.scf.hf import UHF class UKS(uks.UKS): - to_cpu = to_cpu - to_gpu = to_gpu + from gpu4pyscf.lib.utils import to_cpu, to_gpu, device def __init__(self, mol, xc='LDA,VWN'): super().__init__(mol, xc) self._numint = numint.NumInt() - @property - def device(self): - return self._numint.device - @device.setter - def device(self, value): - self._numint.device = value - - get_jk = patch_cpu_kernel(uks.UKS.get_jk)(_get_jk) - _eigh = patch_cpu_kernel(uks.UKS._eigh)(_eigh) + get_jk = UHF.get_jk + _eigh = UHF._eigh diff --git a/gpu4pyscf/grad/patch_pyscf.py b/gpu4pyscf/grad/patch_pyscf.py index 39f09c2c..40d83bba 100644 --- a/gpu4pyscf/grad/patch_pyscf.py +++ b/gpu4pyscf/grad/patch_pyscf.py @@ -20,7 +20,6 @@ ''' from gpu4pyscf.grad.rhf import _get_jk -from gpu4pyscf.lib.utils import patch_cpu_kernel import pyscf.grad.rhf as RHF -RHF.Gradients.get_jk = patch_cpu_kernel(RHF.Gradients.get_jk)(_get_jk) \ No newline at end of file +RHF.Gradients.get_jk = _get_jk diff --git a/gpu4pyscf/grad/rhf.py b/gpu4pyscf/grad/rhf.py index 65b35162..79fd4976 100644 --- a/gpu4pyscf/grad/rhf.py +++ b/gpu4pyscf/grad/rhf.py @@ -23,7 +23,6 @@ from pyscf.grad import rhf from gpu4pyscf.lib.cupy_helper import load_library from gpu4pyscf.scf.hf import _VHFOpt -from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu, to_gpu from gpu4pyscf.lib.cupy_helper import tag_array from gpu4pyscf.df import int3c2e #TODO: move int3c2e to out of df @@ -452,7 +451,7 @@ def get_dh1e_ecp(mol, dm): dh1e_ecp[ia] = cupy.einsum('xij,ij->x', ecp, dm) return 2.0 * dh1e_ecp -def _grad_nuc(mf_grad, atmlst=None): +def grad_nuc(mf_grad, atmlst=None): ''' Derivatives of nuclear repulsion energy wrt nuclear coordinates ''' @@ -469,7 +468,7 @@ def _grad_nuc(mf_grad, atmlst=None): gs = gs[atmlst] return gs -def _grad_elec(mf_grad, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None): +def grad_elec(mf_grad, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None): ''' Electronic part of RHF/RKS gradients Args: @@ -548,11 +547,9 @@ def calculate_h1e(h1_gpu, s1_gpu): return de.get() class Gradients(rhf.Gradients): - to_cpu = to_cpu - to_gpu = to_gpu - - device = 'gpu' - grad_elec = patch_cpu_kernel(rhf.Gradients.grad_elec)(_grad_elec) - grad_nuc = patch_cpu_kernel(rhf.Gradients.grad_nuc)(_grad_nuc) - get_veff = patch_cpu_kernel(rhf.Gradients.get_veff)(_get_veff) - get_jk = patch_cpu_kernel(rhf.Gradients.get_jk)(_get_jk) + from gpu4pyscf.lib.utils import to_cpu, to_gpu, device + + grad_elec = grad_elec + grad_nuc = grad_nuc + get_veff = get_veff + get_jk = _get_jk diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py index 32654709..041a4f5d 100644 --- a/gpu4pyscf/grad/rks.py +++ b/gpu4pyscf/grad/rks.py @@ -136,7 +136,6 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, max_memory=2000, verbose=None): - log = logger.new_logger(mol, verbose) xctype = ni._xc_type(xc_code) opt = getattr(ni, 'gdftopt', None) if opt is None: @@ -162,7 +161,7 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, ao_deriv = 2 vvrho = [] for ao, mask, weight, coords \ - in ni.block_loop(mol, grids, nao, ao_deriv, max_memory=max_memory): + in ni.block_loop(mol, grids, nao, ao_deriv, max_memory=max_memory): rho = numint.eval_rho2(opt.mol, ao[:4], mo_coeff, mo_occ, None, xctype) vvrho.append(rho) rho = cupy.hstack(vvrho) @@ -247,9 +246,9 @@ def get_vxc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, dms = cupy.asarray(dms) dms = [cupy.einsum('pi,ij,qj->pq', coeff, dm, coeff) for dm in dms.reshape(-1,nao0,nao0)] - nset = len(dms) + #nset = len(dms) #make_rho, nset, nao = ni._gen_rho_evaluator(mol, dms.get(), hermi, False, grids) - ao_loc = mol.ao_loc_nr() + #ao_loc = mol.ao_loc_nr() excsum = 0 vmat = cupy.zeros((3,nao,nao)) @@ -272,8 +271,8 @@ def get_vxc_full_response(ni, mol, grids, xc_code, dms, relativity=0, hermi=1, ngrids = weight.size for p0, p1 in lib.prange(0,ngrids,block_size): ao = numint.eval_ao(ni, opt.mol, coords[p0:p1, :], ao_deriv) - rho = numint.eval_rho(opt.mol, ao, dms[0], \ - xctype=xctype, hermi=1, with_lapl=False) + rho = numint.eval_rho(opt.mol, ao, dms[0], + xctype=xctype, hermi=1, with_lapl=False) vxc = ni.eval_xc_eff(xc_code, rho, 1, xctype=xctype)[1] if xctype == 'LDA': @@ -420,4 +419,4 @@ def get_du(ia, ib): # JCP 98, 5612 (1993); (B10) w1 -= pbecke[ia] * z**2 * dpbecke.sum(axis=1) w1 *= vol w0 = vol * pbecke[ia] * z - yield coords, w0, w1 \ No newline at end of file + yield coords, w0, w1 diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py index 5c2eb19a..6f642889 100644 --- a/gpu4pyscf/hessian/rhf.py +++ b/gpu4pyscf/hessian/rhf.py @@ -38,7 +38,6 @@ from gpu4pyscf.scf import cphf from gpu4pyscf.lib.cupy_helper import contract, tag_array from gpu4pyscf.lib import logger -from gpu4pyscf.lib.utils import to_cpu, to_gpu def hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, mo1=None, mo_e1=None, h1ao=None, @@ -373,7 +372,6 @@ def gen_vind(mf, mo_coeff, mo_occ): def fx(mo1): mo1 = cupy.asarray(mo1) mo1 = mo1.reshape(-1,nmo,nocc) - nset = len(mo1) mo1_mo = cupy.einsum('npo,ip->nio', mo1, mo_coeff) dm1 = cupy.einsum('nio,jo->nij', 2.0*mo1_mo, mocc) dm1 = dm1 + dm1.transpose(0,2,1) @@ -473,8 +471,7 @@ def h_op(x): class Hessian(rhf_hess.Hessian): '''Non-relativistic restricted Hartree-Fock hessian''' - to_cpu = to_cpu - to_gpu = to_gpu + from gpu4pyscf.lib.utils import to_cpu, to_gpu, device def __init__(self, scf_method): self.verbose = scf_method.verbose diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py index ae0f3148..19041963 100644 --- a/gpu4pyscf/hessian/rks.py +++ b/gpu4pyscf/hessian/rks.py @@ -32,7 +32,6 @@ # import pyscf.grad.rks to activate nuc_grad_method method from gpu4pyscf.grad import rks # noqa -from gpu4pyscf.lib.utils import to_cpu def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, @@ -606,6 +605,8 @@ def _get_vxc_deriv1(hessobj, mo_coeff, mo_occ, max_memory): class Hessian(rhf_hess.Hessian): '''Non-relativistic RKS hessian''' + from gpu4pyscf.lib.utils import to_gpu, device + def __init__(self, mf): rhf_hess.Hessian.__init__(self, mf) self.grids = None @@ -613,13 +614,12 @@ def __init__(self, mf): self._keys = self._keys.union(['grids']) def to_cpu(self): + from gpu4pyscf.lib.utils import to_cpu from pyscf.hessian.rks import Hessian # to_cpu returns an rhf.Hessian object obj = to_cpu(self) return obj.view(Hessian) - to_gpu = to_gpu - def get_dispersion(self): if self.base.disp[:2].upper() == 'D3': from pyscf import lib diff --git a/gpu4pyscf/lib/cupy_helper.py b/gpu4pyscf/lib/cupy_helper.py index d02b90cb..0976a57e 100644 --- a/gpu4pyscf/lib/cupy_helper.py +++ b/gpu4pyscf/lib/cupy_helper.py @@ -65,8 +65,8 @@ def print_mem_info(): total_mem = mempool.total_bytes() used_mem = mempool.used_bytes() mem_limit = mempool.get_limit() - stack_size_per_thread = cupy.cuda.runtime.deviceGetLimit(0x00) - mem_stack = stack_size_per_thread + #stack_size_per_thread = cupy.cuda.runtime.deviceGetLimit(0x00) + #mem_stack = stack_size_per_thread GB = 1024 * 1024 * 1024 print(f'mem_avail: {mem_avail/GB:.3f} GB, total_mem: {total_mem/GB:.3f} GB, used_mem: {used_mem/GB:.3f} GB,mem_limt: {mem_limit/GB:.3f} GB') @@ -169,12 +169,13 @@ def eigh(h, s): ''' n = h.shape[0] w = cupy.zeros(n) - A = h.copy(); B = s.copy() + A = h.copy() + B = s.copy() cusolver_handle = device.get_cusolver_handle() err = libcupy_helper.eigh( ctypes.cast(cusolver_handle, ctypes.c_void_p), - ctypes.cast(A.data.ptr, ctypes.c_void_p), - ctypes.cast(B.data.ptr, ctypes.c_void_p), + ctypes.cast(A.data.ptr, ctypes.c_void_p), + ctypes.cast(B.data.ptr, ctypes.c_void_p), ctypes.cast(w.data.ptr, ctypes.c_void_p), ctypes.c_int(n)) if err != 0: @@ -300,7 +301,7 @@ def take_last2d(a, indices, out=None): reorder the last 2 dimensions with 'indices', the first n-2 indices do not change shape in the last 2 dimensions have to be the same ''' - assert a.flags.c_contiguous == True + assert a.flags.c_contiguous assert a.shape[-1] == a.shape[-2] nao = a.shape[-1] if a.ndim == 2: @@ -327,7 +328,7 @@ def transpose_sum(a): ''' transpose (0,2,1) ''' - assert a.flags.c_contiguous == True + assert a.flags.c_contiguous assert a.ndim == 3 n = a.shape[-1] count = a.shape[0] @@ -402,9 +403,9 @@ def contract323(a, b, alpha=1.0, beta=0.0, out=None): else: c = cupy.zeros([a.shape[0], b.shape[1], a.shape[2]], order='C') - if a.flags['OWNDATA'] == False: + if not a.flags['OWNDATA']: a = a.copy(order='C') - if b.flags['OWNDATA'] == False: + if not b.flags['OWNDATA']: a = a.copy(order='C') desc_a = cutensor.create_tensor_descriptor(a) diff --git a/gpu4pyscf/lib/cutensor.py b/gpu4pyscf/lib/cutensor.py index e2901c2b..0a9e68fe 100644 --- a/gpu4pyscf/lib/cutensor.py +++ b/gpu4pyscf/lib/cutensor.py @@ -57,8 +57,8 @@ def get_handle(): return handle return _handles[dev] -def create_contraction_descriptor(handle, - a, desc_a, mode_a, +def create_contraction_descriptor(handle, + a, desc_a, mode_a, b, desc_b, mode_b, c, desc_c, mode_c): alignment_req_A = cutensor_lib.getAlignmentRequirement(handle, a.data.ptr, desc_a) @@ -114,7 +114,7 @@ def contraction(pattern, a, b, alpha, beta, out=None): ws_size = cutensor_lib.contractionGetWorkspaceSize(handle, desc, find, cutensor_lib.WORKSPACE_RECOMMENDED) try: ws = cupy.empty(ws_size, dtype=np.int8) - except: + except Exception: ws_size = cutensor_lib.contractionGetWorkspaceSize(handle, desc, find, cutensor_lib.WORKSPACE_MIN) ws = cupy.empty(ws_size, dtype=np.int8) @@ -127,4 +127,3 @@ def contraction(pattern, a, b, alpha, beta, out=None): beta.ctypes.data, c.data.ptr, out.data.ptr, ws.data.ptr, ws_size) return out - \ No newline at end of file diff --git a/gpu4pyscf/lib/logger.py b/gpu4pyscf/lib/logger.py index abfa72d7..58c3f45f 100644 --- a/gpu4pyscf/lib/logger.py +++ b/gpu4pyscf/lib/logger.py @@ -17,7 +17,6 @@ import time import cupy from pyscf import lib -from gpu4pyscf.lib.utils import patch_cpu_kernel from pyscf.lib import parameters as param import pyscf.__config__ @@ -54,7 +53,7 @@ def _timer_debug1(rec, msg, cpu0=None, wall0=None, sync=True): class Logger(lib.logger.Logger): def __init__(self, stdout=sys.stdout, verbose=NOTE): super().__init__(stdout=stdout, verbose=verbose) - timer_debug1 = patch_cpu_kernel(lib.logger.timer_debug1)(_timer_debug1) + timer_debug1 = _timer_debug1 def new_logger(rec=None, verbose=None): '''Create and return a :class:`Logger` object diff --git a/gpu4pyscf/lib/utils.py b/gpu4pyscf/lib/utils.py index b5ec3659..1b6eb5b2 100644 --- a/gpu4pyscf/lib/utils.py +++ b/gpu4pyscf/lib/utils.py @@ -54,3 +54,10 @@ def identity(x): return x to_gpu = identity + +@property +def device(obj): + if 'gpu4pyscf' in obj.__class__.__module__: + return 'gpu' + else: + return 'cpu' diff --git a/gpu4pyscf/patch_pyscf.py b/gpu4pyscf/patch_pyscf.py index 7ab45f91..5b53188b 100644 --- a/gpu4pyscf/patch_pyscf.py +++ b/gpu4pyscf/patch_pyscf.py @@ -19,9 +19,9 @@ Aggresively patch all PySCF modules if applicable This patch may break some pyscf modules. ''' -from gpu4pyscf.df import patch_pyscf -from gpu4pyscf.scf import patch_pyscf -from gpu4pyscf.dft import patch_pyscf -from gpu4pyscf.grad import patch_pyscf +from gpu4pyscf.df import patch_pyscf # noqa: F811 +from gpu4pyscf.scf import patch_pyscf # noqa: F811 +from gpu4pyscf.dft import patch_pyscf # noqa: F811 +from gpu4pyscf.grad import patch_pyscf # noqa: F811 del patch_pyscf diff --git a/gpu4pyscf/scf/ghf.py b/gpu4pyscf/scf/ghf.py index 03edafe1..a4ddcebf 100644 --- a/gpu4pyscf/scf/ghf.py +++ b/gpu4pyscf/scf/ghf.py @@ -15,36 +15,28 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . +import cupy from pyscf.scf import ghf -from gpu4pyscf.scf.hf import get_jk as _get_jk_nr -from gpu4pyscf.scf.hf import _eigh -from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu, to_gpu - -def get_jk(mol=None, dm=None, hermi=0, with_j=True, with_k=True, - omega=None): - if mol is None: mol = self.mol - if dm is None: dm = self.make_rdm1() - nao = mol.nao - dm = numpy.asarray(dm) - - def jkbuild(mol, dm, hermi, with_j, with_k, omega=None): - return _get_jk_nr(mf, mol, dm, hermi, with_j, with_k, omega) - - if nao == dm.shape[-1]: - vj, vk = jkbuild(mol, dm, hermi, with_j, with_k, omega) - else: # GHF density matrix, shape (2N,2N) - vj, vk = ghf.get_jk(mol, dm, hermi, with_j, with_k, jkbuild, omega) - return vj, vk +from gpu4pyscf.scf.hf import _get_jk as _get_jk_nr +from gpu4pyscf.scf.hf import eigh class GHF(ghf.GHF): - to_cpu = to_cpu - to_gpu = to_gpu + from gpu4pyscf.lib.utils import to_cpu, to_gpu, device - device = 'gpu' - - @patch_cpu_kernel(ghf.GHF.get_jk) def get_jk(self, mol=None, dm=None, hermi=0, with_j=True, with_k=True, omega=None): - return get_jk(mol, dm, hermi, with_j, with_k, omega) + if mol is None: mol = self.mol + if dm is None: dm = self.make_rdm1() + nao = mol.nao + dm = cupy.asarray(dm) + + def jkbuild(mol, dm, hermi, with_j, with_k, omega=None): + return _get_jk_nr(self, mol, dm, hermi, with_j, with_k, omega) + + if nao == dm.shape[-1]: + vj, vk = jkbuild(mol, dm, hermi, with_j, with_k, omega) + else: # GHF density matrix, shape (2N,2N) + vj, vk = ghf.get_jk(mol, dm, hermi, with_j, with_k, jkbuild, omega) + return vj, vk - _eigh = patch_cpu_kernel(ghf.GHF._eigh)(_eigh) + _eigh = staticmethod(eigh) diff --git a/gpu4pyscf/scf/hf.py b/gpu4pyscf/scf/hf.py index 4d387d2d..833919ec 100644 --- a/gpu4pyscf/scf/hf.py +++ b/gpu4pyscf/scf/hf.py @@ -28,7 +28,6 @@ from pyscf.lib import logger from pyscf.scf import hf, jk, _vhf from gpu4pyscf import lib -from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu, to_gpu from gpu4pyscf.lib.cupy_helper import eigh, load_library, tag_array from gpu4pyscf.scf import diis @@ -120,7 +119,6 @@ def get_jk(mol, dm, hermi=1, vhfopt=None, with_j=True, with_k=True, omega=None, continue for cp_kl_id, log_q_kl in enumerate(log_qs[:cp_ij_id+1]): - t0 = time.perf_counter() cpk = cp_idx[cp_kl_id] cpl = cp_jdx[cp_kl_id] lk = vhfopt.uniq_l_ctr[cpk,0] @@ -279,10 +277,7 @@ def _get_jk(mf, mol=None, dm=None, hermi=1, with_j=True, with_k=True, log.timer('vj and vk on gpu', *cput0) return vj, vk -def _eigh(mf, h, s): - return eigh(h, s) - -def _make_rdm1(mf, mo_coeff=None, mo_occ=None, **kwargs): +def make_rdm1(mf, mo_coeff=None, mo_occ=None, **kwargs): if mo_occ is None: mo_occ = mf.mo_occ if mo_coeff is None: mo_coeff = mf.mo_coeff is_occ = mo_occ > 0 @@ -291,43 +286,42 @@ def _make_rdm1(mf, mo_coeff=None, mo_occ=None, **kwargs): occ_coeff = mo_coeff[:, mo_occ>1.0] return tag_array(dm, occ_coeff=occ_coeff, mo_occ=mo_occ, mo_coeff=mo_coeff) -def _get_occ(mf, mo_energy=None, mo_coeff=None): +def get_occ(mf, mo_energy=None, mo_coeff=None): if mo_energy is None: mo_energy = mf.mo_energy e_idx = cupy.argsort(mo_energy) - e_sort = mo_energy[e_idx] nmo = mo_energy.size mo_occ = cupy.zeros(nmo) nocc = mf.mol.nelectron // 2 mo_occ[e_idx[:nocc]] = 2 return mo_occ -def _get_veff(mf, mol=None, dm=None, dm_last=None, vhf_last=None, hermi=1, vhfopt=None): +def get_veff(mf, mol=None, dm=None, dm_last=None, vhf_last=None, hermi=1, vhfopt=None): if dm_last is None: - vj, vk = _get_jk(mf, mol, cupy.asarray(dm), hermi) + vj, vk = mf.get_jk(mol, cupy.asarray(dm), hermi) return vj - vk * .5 else: ddm = cupy.asarray(dm) - cupy.asarray(dm_last) - vj, vk = _get_jk(mf, mol, ddm, hermi) + vj, vk = mf.get_jk(mol, ddm, hermi) return vj - vk * .5 + cupy.asarray(vhf_last) -def _get_grad(mo_coeff, mo_occ, fock_ao): +def get_grad(mo_coeff, mo_occ, fock_ao): occidx = mo_occ > 0 viridx = ~occidx g = reduce(cupy.dot, (mo_coeff[:,viridx].conj().T, fock_ao, mo_coeff[:,occidx])) * 2 return g.ravel() -def _damping(s, d, f, factor): +def damping(s, d, f, factor): dm_vir = cupy.eye(s.shape[0]) - cupy.dot(s, d) f0 = reduce(cupy.dot, (dm_vir, f, d, s)) f0 = (f0+f0.conj().T) * (factor/(factor+1.)) return f - f0 -def _level_shift(s, d, f, factor): +def level_shift(s, d, f, factor): dm_vir = s - reduce(cupy.dot, (s, d, s)) return f + dm_vir * factor -def _get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None, +def get_fock(mf, h1e=None, s1e=None, vhf=None, dm=None, cycle=-1, diis=None, diis_start_cycle=None, level_shift_factor=None, damp_factor=None): if h1e is None: h1e = mf.get_hcore() if vhf is None: vhf = mf.get_veff(mf.mol, dm) @@ -405,19 +399,24 @@ def _kernel(mf, conv_tol=1e-10, conv_tol_grad=None, dm_last = dm last_hf_e = e_tot - f = mf.get_fock(h1e, s1e, vhf, dm, cycle, mf_diis); t1 = log.timer_debug1('DIIS', *t0) - mo_energy, mo_coeff = eigh(f, s1e); t1 = log.timer_debug1('eig', *t1) + f = mf.get_fock(h1e, s1e, vhf, dm, cycle, mf_diis) + t1 = log.timer_debug1('DIIS', *t0) + mo_energy, mo_coeff = mf.eig(f, s1e) + t1 = log.timer_debug1('eig', *t1) mo_occ = mf.get_occ(mo_energy, mo_coeff) - dm = _make_rdm1(mf, mo_coeff, mo_occ); t1 = log.timer_debug1('dm', *t1) - vhf = mf.get_veff(mol, dm, dm_last, vhf); t1 = log.timer_debug1('veff', *t1) - e_tot = mf.energy_tot(dm, h1e, vhf); t1 = log.timer_debug1('energy', *t1) + dm = mf.make_rdm1(mo_coeff, mo_occ) + t1 = log.timer_debug1('dm', *t1) + vhf = mf.get_veff(mol, dm, dm_last, vhf) + t1 = log.timer_debug1('veff', *t1) + e_tot = mf.energy_tot(dm, h1e, vhf) + t1 = log.timer_debug1('energy', *t1) norm_ddm = cupy.linalg.norm(dm-dm_last) t1 = log.timer_debug1('total', *t0) logger.info(mf, 'cycle= %d E= %.15g delta_E= %4.3g |ddm|= %4.3g', cycle+1, e_tot, e_tot-last_hf_e, norm_ddm) e_diff = abs(e_tot-last_hf_e) - norm_gorb = cupy.linalg.norm(_get_grad(mo_coeff, mo_occ, f)) + norm_gorb = cupy.linalg.norm(mf.get_grad(mo_coeff, mo_occ, f)) if(e_diff < conv_tol and norm_gorb < conv_tol_grad): scf_conv = True break @@ -537,19 +536,17 @@ def _quad_moment(mf, mol=None, dm=None, unit='Debye-Ang'): class RHF(hf.RHF): - to_cpu = to_cpu - to_gpu = to_gpu + from gpu4pyscf.lib.utils import to_cpu, to_gpu, device screen_tol = 1e-14 - device = 'gpu' DIIS = diis.SCF_DIIS - get_jk = patch_cpu_kernel(hf.RHF.get_jk)(_get_jk) - _eigh = patch_cpu_kernel(hf.RHF._eigh)(_eigh) - make_rdm1 = patch_cpu_kernel(hf.RHF.make_rdm1)(_make_rdm1) - get_fock = patch_cpu_kernel(hf.RHF.get_fock)(_get_fock) - get_occ = patch_cpu_kernel(hf.RHF.get_occ)(_get_occ) - get_veff = patch_cpu_kernel(hf.RHF.get_veff)(_get_veff) - get_grad = patch_cpu_kernel(hf.RHF.get_grad)(_get_grad) + get_jk = _get_jk + _eigh = staticmethod(eigh) + make_rdm1 = make_rdm1 + get_fock = get_fock + get_occ = get_occ + get_veff = get_veff + get_grad = staticmethod(get_grad) gen_response = _gen_rhf_response quad_moment = _quad_moment @@ -579,7 +576,7 @@ def scf(self, dm0=None, **kwargs): kernel = pyscf_lib.alias(scf, alias_name='kernel') class _VHFOpt(_vhf.VHFOpt): - to_cpu = to_cpu + from gpu4pyscf.lib.utils import to_cpu, to_gpu, device def __init__(self, mol, intor, prescreen='CVHFnoscreen', qcondname='CVHFsetnr_direct_scf', dmcondname=None): @@ -875,5 +872,3 @@ def _split_l_ctr_groups(uniq_l_ctr, l_ctr_counts, group_size): uniq_l_ctr = np.vstack(_l_ctrs) l_ctr_counts = np.hstack(_l_ctr_counts) return uniq_l_ctr, l_ctr_counts - -del patch_cpu_kernel diff --git a/gpu4pyscf/scf/int4c2e.py b/gpu4pyscf/scf/int4c2e.py index c31de424..7620ac18 100644 --- a/gpu4pyscf/scf/int4c2e.py +++ b/gpu4pyscf/scf/int4c2e.py @@ -44,7 +44,6 @@ def loop_int3c2e_general(intopt, ip_type='', omega=None, stream=None): if omega is None: omega = 0.0 if stream is None: stream = cupy.cuda.get_current_stream() - nao_sph = len(intopt.sph_ao_idx) nao = intopt.mol.nao naux = intopt.auxmol.nao norb = nao + naux + 1 @@ -65,7 +64,9 @@ def loop_int3c2e_general(intopt, ip_type='', omega=None, stream=None): i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1] j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1] k0, k1 = intopt.cart_aux_loc[aux_id], intopt.cart_aux_loc[aux_id+1] - ni = i1 - i0; nj = j1 - j0; nk = k1 - k0 + ni = i1 - i0 + nj = j1 - j0 + nk = k1 - k0 bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) bins_locs_kl = np.array([0, len(log_q_kl)], dtype=np.int32) @@ -107,17 +108,19 @@ def get_int3c2e_ip(mol, auxmol=None, ip_type=1, auxbasis='weigend+etb', direct_s ip_type == 1: int3c2e_ip1 ip_type == 2: int3c2e_ip2 ''' + from gpu4pyscf.scf.hf import _VHFOpt fn = getattr(libgint, 'GINTfill_int3c2e_' + ip_type) if omega is None: omega = 0.0 if stream is None: stream = cupy.cuda.get_current_stream() if auxmol is None: - auxmol = df.addons.make_auxmol(mol, auxbasis) + from pyscf.df.addons import make_auxmol + auxmol = make_auxmol(mol, auxbasis) nao_sph = mol.nao naux_sph = auxmol.nao - intopt = VHFOpt(mol, auxmol, 'int2e') - intopt.build(direct_scf_tol, diag_block_with_triu=True, aosym=False) + intopt = _VHFOpt(mol, auxmol, 'int2e') + intopt.build(direct_scf_tol, diag_block_with_triu=True) nao = intopt.mol.nao naux = intopt.auxmol.nao @@ -136,7 +139,9 @@ def get_int3c2e_ip(mol, auxmol=None, ip_type=1, auxbasis='weigend+etb', direct_s i0, i1 = intopt.cart_ao_loc[cpi], intopt.cart_ao_loc[cpi+1] j0, j1 = intopt.cart_ao_loc[cpj], intopt.cart_ao_loc[cpj+1] k0, k1 = intopt.cart_aux_loc[aux_id], intopt.cart_aux_loc[aux_id+1] - ni = i1 - i0; nj = j1 - j0; nk = k1 - k0 + ni = i1 - i0 + nj = j1 - j0 + nk = k1 - k0 lk = intopt.aux_angular[aux_id] bins_locs_ij = np.array([0, len(log_q_ij)], dtype=np.int32) @@ -159,6 +164,8 @@ def get_int3c2e_ip(mol, auxmol=None, ip_type=1, auxbasis='weigend+etb', direct_s ctypes.c_int(cp_ij_id), ctypes.c_int(cp_kl_id), ctypes.c_double(omega)) + if err != 0: + raise RuntimeError("int3c2e_ip failed\n") int3c_blk = cart2sph(int3c_blk, axis=1, ang=lk) int3c_blk = cart2sph(int3c_blk, axis=2, ang=lj) @@ -307,4 +314,4 @@ def get_int4c2e_jk(mol, dm, vhfopt=None, direct_scf_tol=1e-13, with_k=True, omeg vj = vj + vj.T vj *= 2.0 vk = vk + vk.T - return vj, vk \ No newline at end of file + return vj, vk diff --git a/gpu4pyscf/scf/patch_pyscf.py b/gpu4pyscf/scf/patch_pyscf.py index e1b1bbbc..67d71821 100644 --- a/gpu4pyscf/scf/patch_pyscf.py +++ b/gpu4pyscf/scf/patch_pyscf.py @@ -19,14 +19,9 @@ Patch pyscf SCF modules to make all subclass of SCF class support GPU mode. ''' -from gpu4pyscf.scf.hf import _get_jk, _eigh +from gpu4pyscf.scf.hf import _get_jk, eigh from pyscf.scf.hf import SCF -from gpu4pyscf.lib.utils import patch_cpu_kernel - -# The device attribute is patched to the pyscf base SCF module. It will be -# seen by all subclasses. -SCF.device = 'gpu' print(f'{SCF} monkey-patched') -SCF.get_jk = patch_cpu_kernel(SCF.get_jk)(_get_jk) -SCF._eigh = patch_cpu_kernel(SCF._eigh)(_eigh) +SCF.get_jk = _get_jk +SCF._eigh = staticmethod(eigh) diff --git a/gpu4pyscf/scf/rohf.py b/gpu4pyscf/scf/rohf.py index 5ac284e8..ccb68946 100644 --- a/gpu4pyscf/scf/rohf.py +++ b/gpu4pyscf/scf/rohf.py @@ -16,14 +16,11 @@ # along with this program. If not, see . from pyscf.scf import rohf -from gpu4pyscf.scf.hf import _get_jk, _eigh -from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu, to_gpu +from gpu4pyscf.scf.hf import _get_jk, eigh class ROHF(rohf.ROHF): - to_cpu = to_cpu - to_gpu = to_gpu + from gpu4pyscf.lib.utils import to_cpu, to_gpu, device - device = 'gpu' - get_jk = patch_cpu_kernel(rohf.ROHF.get_jk)(_get_jk) - _eigh = patch_cpu_kernel(rohf.ROHF._eigh)(_eigh) + get_jk = _get_jk + _eigh = staticmethod(eigh) diff --git a/gpu4pyscf/scf/tests/test_rhf.py b/gpu4pyscf/scf/tests/test_rhf.py index f792fe56..8b53d717 100644 --- a/gpu4pyscf/scf/tests/test_rhf.py +++ b/gpu4pyscf/scf/tests/test_rhf.py @@ -20,6 +20,7 @@ import cupy import pyscf from pyscf import lib +from gpu4pyscf import scf mol = pyscf.M( atom=''' @@ -79,13 +80,12 @@ def test_get_jk(self): nao = mol.nao dm = np.random.random((nao,nao)) dm = dm + dm.T - mf = mol.RHF() - mf.device = 'gpu' + mf = scf.RHF(mol) vj, vk = mf.get_jk(mol, dm) self.assertAlmostEqual(lib.fp(vj), -498.6834601181653 , 7) self.assertAlmostEqual(lib.fp(vk), -13.552287262014744, 7) - mf.device = 'cpu' - refj, refk = mf.get_jk(mol, dm) + mf1 = mf.to_cpu() + refj, refk = mf1.get_jk(mol, dm) self.assertAlmostEqual(abs(vj - refj).max(), 0, 7) self.assertAlmostEqual(abs(vk - refk).max(), 0, 7) with lib.temporary_env(mol, cart=True): @@ -93,14 +93,13 @@ def test_get_jk(self): nao = mol.nao dm = np.random.random((nao,nao)) dm = dm + dm.T - mf = mol.RHF() - mf.device = 'gpu' + mf = scf.RHF(mol) vj, vk = mf.get_jk(mol, dm) self.assertAlmostEqual(lib.fp(vj), -3530.1507509846288, 7) self.assertAlmostEqual(lib.fp(vk), -845.7403732632113 , 7) - mf.device = 'cpu' - refj, refk = mf.get_jk(mol, dm) + mf1 = mf.to_cpu() + refj, refk = mf1.get_jk(mol, dm) self.assertAlmostEqual(abs(vj - refj).max(), 0, 7) self.assertAlmostEqual(abs(vk - refk).max(), 0, 7) @@ -109,13 +108,12 @@ def test_get_j(self): nao = mol.nao dm = np.random.random((nao,nao)) dm = dm + dm.T - mf = mol.RHF() - mf.device = 'gpu' + mf = scf.RHF(mol) vj = mf.get_j(mol, dm) self.assertAlmostEqual(lib.fp(vj), -498.6834601181653 , 7) - mf.device = 'cpu' - refj = mf.get_j(mol, dm) + mf1 = mf.to_cpu() + refj = mf1.get_j(mol, dm) self.assertAlmostEqual(abs(vj - refj).max(), 0, 7) with lib.temporary_env(mol, cart=True): @@ -123,13 +121,12 @@ def test_get_j(self): nao = mol.nao dm = np.random.random((nao,nao)) dm = dm + dm.T - mf = mol.RHF() - mf.device = 'gpu' + mf = scf.RHF(mol) vj = mf.get_j(mol, dm) self.assertAlmostEqual(lib.fp(vj), -3530.1507509846288, 7) - mf.device = 'cpu' - refj = mf.get_j(mol, dm) + mf1 = mf.to_cpu() + refj = mf1.get_j(mol, dm) self.assertAlmostEqual(abs(vj - refj).max(), 0, 7) def test_get_k(self): @@ -137,13 +134,12 @@ def test_get_k(self): nao = mol.nao dm = np.random.random((nao,nao)) dm = dm + dm.T - mf = mol.RHF() - mf.device = 'gpu' + mf = scf.RHF(mol) vk = mf.get_k(mol, dm) self.assertAlmostEqual(lib.fp(vk), -13.552287262014744, 7) - mf.device = 'cpu' - refk = mf.get_k(mol, dm) + mf1 = mf.to_cpu() + refk = mf1.get_k(mol, dm) self.assertAlmostEqual(abs(vk - refk).max(), 0, 7) with lib.temporary_env(mol, cart=True): @@ -151,13 +147,12 @@ def test_get_k(self): nao = mol.nao dm = np.random.random((nao,nao)) dm = dm + dm.T - mf = mol.RHF() - mf.device = 'gpu' + mf = scf.RHF(mol) vk = mf.get_k(mol, dm) self.assertAlmostEqual(lib.fp(vk), -845.7403732632113 , 7) - mf.device = 'cpu' - refk = mf.get_k(mol, dm) + mf1 = mf.to_cpu() + refk = mf1.get_k(mol, dm) self.assertAlmostEqual(abs(vk - refk).max(), 0, 7) def test_get_jk1(self): @@ -166,14 +161,13 @@ def test_get_jk1(self): nao = mol1.nao dm = np.random.random((2,nao,nao)) dm = dm + dm.transpose(0,2,1) - mf = mol1.RHF() - mf.device = 'gpu' + mf = scf.RHF(mol1) vj, vk = mf.get_jk(mol1, dm, hermi=1) self.assertAlmostEqual(lib.fp(vj), 179.14526555375858, 7) self.assertAlmostEqual(lib.fp(vk), -34.851182918643005, 7) - mf.device = 'cpu' - refj, refk = mf.get_jk(mol1, dm, hermi=1) + mf1 = mf.to_cpu() + refj, refk = mf1.get_jk(mol1, dm, hermi=1) self.assertAlmostEqual(abs(vj - refj).max(), 0, 8) self.assertAlmostEqual(abs(vk - refk).max(), 0, 8) @@ -182,14 +176,13 @@ def test_get_jk1_hermi0(self): np.random.seed(1) nao = mol1.nao dm = np.random.random((2,nao,nao)) - mf = mol1.RHF() - mf.device = 'gpu' + mf = scf.RHF(mol1) vj, vk = mf.get_jk(mol1, cupy.asarray(dm), hermi=0) self.assertAlmostEqual(lib.fp(vj.get()), 89.57263277687994, 7) self.assertAlmostEqual(lib.fp(vk.get()),-26.36969769724246, 7) - mf.device = 'cpu' - refj, refk = mf.get_jk(mol1, dm, hermi=0) + mf1 = mf.to_cpu() + refj, refk = mf1.get_jk(mol1, dm, hermi=0) self.assertAlmostEqual(abs(vj.get() - refj).max(), 0, 8) self.assertAlmostEqual(abs(vk.get() - refk).max(), 0, 8) @@ -199,13 +192,12 @@ def test_get_j1(self): nao = mol1.nao dm = np.random.random((2,nao,nao)) dm = dm + dm.transpose(0,2,1) - mf = mol1.RHF() - mf.device = 'gpu' + mf = scf.RHF(mol1) vj = mf.get_j(mol1, dm, hermi=1) self.assertAlmostEqual(lib.fp(vj), 179.14526555375858, 7) - mf.device = 'cpu' - refj = mf.get_j(mol1, dm, hermi=1) + mf1 = mf.to_cpu() + refj = mf1.get_j(mol1, dm, hermi=1) self.assertAlmostEqual(abs(vj - refj).max(), 0, 7) @unittest.skip('hermi=0') @@ -213,13 +205,12 @@ def test_get_j1_hermi0(self): np.random.seed(1) nao = mol1.nao dm = np.random.random((2,nao,nao)) - mf = mol1.RHF() - mf.device = 'gpu' + mf = scf.RHF(mol1) vj = mf.get_j(mol1, dm, hermi=0).get() self.assertAlmostEqual(lib.fp(vj), 89.57263277687994, 7) - mf.device = 'cpu' - refj = mf.get_j(mol1, dm, hermi=0) + mf1 = mf.to_cpu() + refj = mf1.get_j(mol1, dm, hermi=0) self.assertAlmostEqual(abs(vj - refj).max(), 0, 7) def test_get_k1(self): @@ -228,13 +219,12 @@ def test_get_k1(self): nao = mol1.nao dm = np.random.random((2,nao,nao)) dm = dm + dm.transpose(0,2,1) - mf = mol1.RHF() - mf.device = 'gpu' + mf = scf.RHF(mol1) vk = mf.get_k(mol1, dm, hermi=1) self.assertAlmostEqual(lib.fp(vk), -34.851182918643005, 7) - mf.device = 'cpu' - refk = mf.get_k(mol1, dm, hermi=1) + mf1 = mf.to_cpu() + refk = mf1.get_k(mol1, dm, hermi=1) self.assertAlmostEqual(abs(vk - refk).max(), 0, 7) @unittest.skip('hermi=0') @@ -242,13 +232,12 @@ def test_get_k1_hermi0(self): np.random.seed(1) nao = mol1.nao dm = np.random.random((2,nao,nao)) - mf = mol1.RHF() - mf.device = 'gpu' + mf = scf.RHF(mol1) vk = mf.get_k(mol1, dm, hermi=0).get() self.assertAlmostEqual(lib.fp(vk),-26.36969769724246, 7) - mf.device = 'cpu' - refk = mf.get_k(mol1, dm, hermi=0) + mf1 = mf.to_cpu() + refk = mf1.get_k(mol1, dm, hermi=0) self.assertAlmostEqual(abs(vk - refk).max(), 0, 7) ''' # end to end test diff --git a/gpu4pyscf/scf/uhf.py b/gpu4pyscf/scf/uhf.py index 633b163b..9566cf0b 100644 --- a/gpu4pyscf/scf/uhf.py +++ b/gpu4pyscf/scf/uhf.py @@ -16,13 +16,10 @@ # along with this program. If not, see . from pyscf.scf import uhf -from gpu4pyscf.scf.hf import _get_jk, _eigh -from gpu4pyscf.lib.utils import patch_cpu_kernel, to_cpu, to_gpu +from gpu4pyscf.scf.hf import _get_jk, eigh class UHF(uhf.UHF): - to_cpu = to_cpu - to_gpu = to_gpu + from gpu4pyscf.lib.utils import to_cpu, to_gpu, device - device = 'gpu' - get_jk = patch_cpu_kernel(uhf.UHF.get_jk)(_get_jk) - _eigh = patch_cpu_kernel(uhf.UHF._eigh)(_eigh) + get_jk = _get_jk + _eigh = staticmethod(eigh) From bcdadc7f2d09135ea80ed64e99f59e5b9fccf569 Mon Sep 17 00:00:00 2001 From: Qiming Sun Date: Sun, 8 Oct 2023 17:01:07 -0700 Subject: [PATCH 4/4] Modify to_gpu --- gpu4pyscf/df/int3c2e.py | 2 ++ gpu4pyscf/lib/utils.py | 6 ++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gpu4pyscf/df/int3c2e.py b/gpu4pyscf/df/int3c2e.py index 9a8d5984..c3998ad8 100644 --- a/gpu4pyscf/df/int3c2e.py +++ b/gpu4pyscf/df/int3c2e.py @@ -1065,6 +1065,8 @@ def get_int3c2e_general(mol, auxmol=None, ip_type='', auxbasis='weigend+etb', di ctypes.c_int(cp_ij_id), ctypes.c_int(cp_kl_id), ctypes.c_double(omega)) + if err != 0: + raise RuntimeError("int3c2e failed\n") int3c_blk = cart2sph(int3c_blk, axis=1, ang=lk) int3c_blk = cart2sph(int3c_blk, axis=2, ang=lj) diff --git a/gpu4pyscf/lib/utils.py b/gpu4pyscf/lib/utils.py index 1b6eb5b2..bbe3d2df 100644 --- a/gpu4pyscf/lib/utils.py +++ b/gpu4pyscf/lib/utils.py @@ -50,10 +50,8 @@ def to_cpu(method): setattr(method, key, val.to_cpu()) return method -def identity(x): - return x - -to_gpu = identity +def to_gpu(method, device=None): + return method @property def device(obj):