diff --git a/benchmarks/df/dft_driver.py b/benchmarks/df/dft_driver.py
index 7e0d8d41..2e6a288b 100644
--- a/benchmarks/df/dft_driver.py
+++ b/benchmarks/df/dft_driver.py
@@ -37,12 +37,12 @@
     # Warmup
     for i in range(3):
         warmup(atom='../molecules/organic/020_Vitamin_C.xyz')
-    '''
+
     # Generate benchmark data for different xc
     config = config_template.copy()
     for xc in ['LDA', 'PBE', 'B3LYP', 'M06']:
         config['xc'] = xc
-        config['output_dir'] = './organic/xc/' + xc 
+        config['output_dir'] = './organic/xc/' + xc
         config['basis'] = 'def2-tzvpp'
         config['verbose'] = 4
         for mol_name in config['molecules']:
@@ -61,7 +61,7 @@
         if mol_name in ["095_Azadirachtin.xyz","113_Taxol.xyz","168_Valinomycin.xyz"]:
             continue
         run_dft(mol_name, config)
-    
+
     # Generate benchmark data for different basis
     config = config_template.copy()
     for bas in ['sto-3g', '6-31g', 'def2-svp', 'def2-tzvpp', 'def2-tzvpd']:
@@ -72,7 +72,7 @@
             if mol_name in ["095_Azadirachtin.xyz", "113_Taxol.xyz","168_Valinomycin.xyz"]:
                 continue
             run_dft(mol_name, config)
-    '''
+
     # Generate benchmark data for different solvent
     config = config_template.copy()
     for mol_name in config['molecules']:
@@ -81,7 +81,7 @@
         config['xc'] = 'b3lyp'
         config['basis'] = 'def2-tzvpp'
         config['with_solvent'] = True
-        
+
         solvent_method = "CPCM"
         config['solvent']['method'] = solvent_method
         config['output_dir'] = './organic/solvent/' + solvent_method
@@ -90,4 +90,5 @@
         solvent_method = "IEFPCM"
         config['solvent']['method'] = solvent_method
         config['output_dir'] = './organic/solvent/' + solvent_method
-        run_dft(mol_name, config)            
+        run_dft(mol_name, config)
+
diff --git a/gpu4pyscf/df/grad/rks.py b/gpu4pyscf/df/grad/rks.py
index b0d7269e..debecf10 100644
--- a/gpu4pyscf/df/grad/rks.py
+++ b/gpu4pyscf/df/grad/rks.py
@@ -43,7 +43,7 @@ def get_veff(ks_grad, mol=None, dm=None):
         grids.build(with_non0tab=False)
 
     nlcgrids = None
-    if mf.nlc or ni.libxc.is_nlc(mf.xc):
+    if mf.do_nlc():
         if ks_grad.nlcgrids is not None:
             nlcgrids = ks_grad.nlcgrids
         else:
@@ -51,8 +51,6 @@ def get_veff(ks_grad, mol=None, dm=None):
         if nlcgrids.coords is None:
             nlcgrids.build(with_non0tab=False)
 
-    if mf.nlc != '':
-        raise NotImplementedError
     #enabling range-separated hybrids
     omega, alpha, hyb = ni.rsh_and_hybrid_coeff(mf.xc, spin=mol.spin)
 
@@ -63,13 +61,13 @@ def get_veff(ks_grad, mol=None, dm=None):
                 ni, mol, grids, mf.xc, dm,
                 max_memory=max_memory, verbose=ks_grad.verbose)
         #logger.debug1(ks_grad, 'sum(grids response) %s', exc.sum(axis=0))
-        if mf.nlc or ni.libxc.is_nlc(mf.xc):
+        if mf.do_nlc():
             raise NotImplementedError
     else:
         exc, vxc = rks_grad.get_vxc(
                 ni, mol, grids, mf.xc, dm,
                 max_memory=max_memory, verbose=ks_grad.verbose)
-        if mf.nlc or ni.libxc.is_nlc(mf.xc):
+        if mf.do_nlc():
             if ni.libxc.is_nlc(mf.xc):
                 xc = mf.xc
             else:
diff --git a/gpu4pyscf/df/grad/uks.py b/gpu4pyscf/df/grad/uks.py
index 1029f519..63586833 100644
--- a/gpu4pyscf/df/grad/uks.py
+++ b/gpu4pyscf/df/grad/uks.py
@@ -33,7 +33,7 @@ def get_veff(ks_grad, mol=None, dm=None):
     if mol is None: mol = ks_grad.mol
     if dm is None: dm = ks_grad.base.make_rdm1()
     t0 = (logger.process_clock(), logger.perf_counter())
-    
+
     mf = ks_grad.base
     ni = mf._numint
     if ks_grad.grids is not None:
@@ -45,7 +45,7 @@ def get_veff(ks_grad, mol=None, dm=None):
         grids.build(sort_grids=True)
 
     nlcgrids = None
-    if mf.nlc or ni.libxc.is_nlc(mf.xc):
+    if mf.do_nlc():
         if ks_grad.nlcgrids is not None:
             nlcgrids = ks_grad.nlcgrids
         else:
@@ -60,12 +60,12 @@ def get_veff(ks_grad, mol=None, dm=None):
         exc, vxc_tmp = uks_grad.get_vxc_full_response(ni, mol, grids, mf.xc, dm,
                                          max_memory=max_memory,
                                          verbose=ks_grad.verbose)
-        if mf.nlc or ni.libxc.is_nlc(mf.xc):
+        if mf.do_nlc():
             raise NotImplementedError
     else:
         exc, vxc_tmp = uks_grad.get_vxc(ni, mol, grids, mf.xc, dm,
                            max_memory=max_memory, verbose=ks_grad.verbose)
-        if mf.nlc or ni.libxc.is_nlc(mf.xc):
+        if mf.do_nlc():
             if ni.libxc.is_nlc(mf.xc):
                 xc = mf.xc
             else:
@@ -131,7 +131,7 @@ def get_veff(ks_grad, mol=None, dm=None):
         e1_aux = None
 
     vxc = tag_array(vxc, aux=e1_aux)
-    
+
     return vxc
 
 
diff --git a/gpu4pyscf/df/tests/test_df_hessian.py b/gpu4pyscf/df/tests/test_df_hessian.py
index 24f8edaa..9a7bae2e 100644
--- a/gpu4pyscf/df/tests/test_df_hessian.py
+++ b/gpu4pyscf/df/tests/test_df_hessian.py
@@ -271,6 +271,24 @@ def test_hessian_uks_D4(self):
         h = hobj.kernel()
         _check_dft_hessian(mf, h, ix=0,iy=0)
 
+    def test_hessian_rks_wb97m_d3bj(self):
+        print('----------- testing DFRKS, wb97m-d3bj --------')
+        mf = _make_rks(mol_sph, 'wb97m-d3bj')
+        mf.conv_tol_cpscf = 1e-7
+        hobj = mf.Hessian()
+        hobj.set(auxbasis_response=2)
+        h = hobj.kernel()
+        _check_dft_hessian(mf, h, ix=0,iy=0)
+
+    def test_hessian_uks_wb97m_d3bj(self):
+        print('------------- testing DFUKS, wb97m-d3bj ---------')
+        mf = _make_uks(mol_sph, 'wb97m-d3bj')
+        mf.conv_tol_cpscf = 1e-7
+        hobj = mf.Hessian()
+        hobj.set(auxbasis_response=2)
+        h = hobj.kernel()
+        _check_dft_hessian(mf, h, ix=0,iy=0)
+
     def test_hessian_cart(self):
         print('-----testing DF Hessian (cartesian)----')
         mf = _make_rks(mol_cart, 'b3lyp')
diff --git a/gpu4pyscf/df/tests/test_df_rks.py b/gpu4pyscf/df/tests/test_df_rks.py
index 3881bcef..4cd40701 100644
--- a/gpu4pyscf/df/tests/test_df_rks.py
+++ b/gpu4pyscf/df/tests/test_df_rks.py
@@ -136,9 +136,16 @@ def test_to_gpu(self):
     def test_rks_cart(self):
         print('-------- B3LYP (CART) -------------')
         e_tot = run_dft('B3LYP', mol_cart)
-        e_qchem = -76.46723795965626 # data from PySCF
-        print(f'diff from pyscf {e_tot - e_qchem}')
-        assert np.abs(e_tot - e_qchem) < 1e-5
+        e_ref = -76.46723795965626 # data from PySCF
+        print(f'diff from PySCF {e_tot - e_ref}')
+        assert np.abs(e_tot - e_ref) < 1e-5
+
+    def test_rks_wb97m_d3bj(self):
+        print('-------- wB97m-d3bj -------------')
+        e_tot = run_dft('wb97m-d3bj', mol_sph)
+        e_ref = -76.47679432135077
+        print(f'diff from PySCF {e_tot - e_ref}')
+        assert np.abs(e_tot - e_ref) < 1e-5
 
 if __name__ == "__main__":
     print("Full Tests for restricted Kohn-Sham")
diff --git a/gpu4pyscf/df/tests/test_df_rks_grad.py b/gpu4pyscf/df/tests/test_df_rks_grad.py
index 76fa3740..74e27639 100644
--- a/gpu4pyscf/df/tests/test_df_rks_grad.py
+++ b/gpu4pyscf/df/tests/test_df_rks_grad.py
@@ -58,7 +58,8 @@ def tearDownModule():
     del mol_sph, mol_cart
 
 def _check_grad(mol, grid_response=False, xc=xc0, disp=disp0, tol=1e-6):
-    mf = rks.RKS(mol, xc=xc, disp=disp).density_fit(auxbasis=auxbasis0)
+    mf = rks.RKS(mol, xc=xc).density_fit(auxbasis=auxbasis0)
+    mf.disp = disp
     mf.grids.level = grids_level
     mf.nlcgrids.level = nlcgrids_level
     mf.conv_tol = 1e-10
@@ -145,6 +146,10 @@ def test_grad_d4(self):
         print('------ B3LYP with d4 --------')
         _check_grad(mol_cart, xc='B3LYP', disp='d4', tol=1e-6)
 
+    def test_grad_wb97m_d3bj(self):
+        print('------ wB97m-d3bj --------')
+        _check_grad(mol_sph, xc='wb97m-d3bj', tol=1e-6)
+
     def test_to_cpu(self):
         mf = rks.RKS(mol_sph, xc='b3lyp').density_fit()
         mf.kernel()
diff --git a/gpu4pyscf/df/tests/test_df_uks.py b/gpu4pyscf/df/tests/test_df_uks.py
index 919e25ab..7cc45ef2 100644
--- a/gpu4pyscf/df/tests/test_df_uks.py
+++ b/gpu4pyscf/df/tests/test_df_uks.py
@@ -141,6 +141,13 @@ def test_uks_cart(self):
         print(f'diff from pyscf {e_tot - e_pyscf}')
         assert np.abs(e_tot - e_pyscf) < 1e-5
 
+    def test_uks_wb97m_d3bj(self):
+        print('-------- wB97m-d3bj -------------')
+        e_tot = run_dft(mol_sph, 'wb97m-d3bj')
+        e_ref = -76.00969751095374
+        print(f'diff from pyscf {e_tot - e_ref}')
+        assert np.abs(e_tot - e_ref) < 1e-5
+
 if __name__ == "__main__":
     print("Full Tests for unrestricted Kohn-Sham")
     unittest.main()
diff --git a/gpu4pyscf/df/tests/test_df_uks_grad.py b/gpu4pyscf/df/tests/test_df_uks_grad.py
index 3a998f41..845fa340 100644
--- a/gpu4pyscf/df/tests/test_df_uks_grad.py
+++ b/gpu4pyscf/df/tests/test_df_uks_grad.py
@@ -59,12 +59,12 @@ def tearDownModule():
     del mol_sph, mol_cart
 
 def _check_grad(mol, grid_response=True, xc=xc0, disp=disp0, tol=1e-5):
-    mf = uks.UKS(mol, xc=xc, disp=disp).density_fit(auxbasis=auxbasis0)
+    mf = uks.UKS(mol, xc=xc).density_fit(auxbasis=auxbasis0)
+    mf.disp = disp
     mf.grids.level = grids_level
     mf.nlcgrids.level = nlcgrids_level
     mf.conv_tol = 1e-10
     mf.verbose = 1
-    mf.disp = disp
     mf.kernel()
 
     g = mf.nuc_grad_method()
@@ -150,6 +150,10 @@ def test_grad_cart(self):
         print('------ Cart testing--------')
         _check_grad(mol_cart, xc='B3LYP', disp=None, tol=1e-5)
 
+    def test_grad_wb97m_d3bj(self):
+        print('------ wB97m-d3bj --------')
+        _check_grad(mol_sph, xc='wb97m-d3bj', tol=1e-5)
+
     def test_to_cpu(self):
         mf = uks.UKS(mol_sph, xc='b3lyp').density_fit()
         mf.kernel()
diff --git a/gpu4pyscf/df/tests/test_geomopt.py b/gpu4pyscf/df/tests/test_geomopt.py
index f9ea444e..440f8604 100644
--- a/gpu4pyscf/df/tests/test_geomopt.py
+++ b/gpu4pyscf/df/tests/test_geomopt.py
@@ -54,7 +54,8 @@ def tearDownModule():
 
 class KnownValues(unittest.TestCase):
     def test_rks_geomopt(self):
-        mf = rks.RKS(mol, xc=xc, disp=disp).density_fit()
+        mf = rks.RKS(mol, xc=xc).density_fit()
+        mf.disp = disp
         mf.grids.level = grids_level
         mf.kernel()
         mol_eq = optimize(mf, maxsteps=20)
diff --git a/gpu4pyscf/dft/numint.py b/gpu4pyscf/dft/numint.py
index 88127b51..3179d62a 100644
--- a/gpu4pyscf/dft/numint.py
+++ b/gpu4pyscf/dft/numint.py
@@ -517,10 +517,10 @@ def nr_rks(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
         if xctype == 'MGGA':
             wv[i][[0,4]] *= .5
     t0 = log.timer_debug1('eval vxc', *t0)
-    
+
     if USE_SPARSITY != 2:
         raise NotImplementedError(f'USE_SPARSITY = {USE_SPARSITY} is not implemented')
-    
+
     t1 = t0
     p0 = p1 = 0
     for ao_mask, idx, weight, _ in ni.block_loop(_sorted_mol, grids, nao, ao_deriv):
@@ -936,7 +936,7 @@ def get_rho(ni, mol, dm, grids, max_memory=2000, verbose=None):
     blksize = min(blksize, MIN_BLK_SIZE)
     GB = 1024*1024*1024
     log.debug(f'GPU Memory {mem_avail/GB:.1f} GB available, block size {blksize}')
-    
+
     ngrids = grids.weights.size
     rho = cupy.empty(ngrids)
     with opt.gdft_envs_cache():
@@ -1519,7 +1519,7 @@ def _block_loop(ni, mol, grids, nao=None, deriv=0, max_memory=2000,
     ngrids = grids.coords.shape[0]
     comp = (deriv+1)*(deriv+2)*(deriv+3)//6
     log = logger.new_logger(ni, ni.verbose)
-    
+
     if blksize is None:
         #cupy.get_default_memory_pool().free_all_blocks()
         mem_avail = get_avail_mem()
diff --git a/gpu4pyscf/dft/rks.py b/gpu4pyscf/dft/rks.py
index 4679fdda..c2d5ba8f 100644
--- a/gpu4pyscf/dft/rks.py
+++ b/gpu4pyscf/dft/rks.py
@@ -47,7 +47,7 @@ def prune_small_rho_grids_(ks, mol, dm, grids):
     if abs(n-mol.nelectron) < gen_grid.NELEC_ERROR_TOL*n:
         rho *= grids.weights
         idx = cupy.abs(rho) > threshold / grids.weights.size
-        
+
         grids.coords  = cupy.asarray(grids.coords [idx], order='C')
         grids.weights = cupy.asarray(grids.weights[idx], order='C')
         logger.debug(grids, 'Drop grids %d', rho.size - grids.weights.size)
@@ -272,12 +272,10 @@ class RKS(KohnShamDFT, hf.RHF):
 
     to_gpu = utils.to_gpu
     device = utils.device
-    _keys = {'disp'}
 
-    def __init__(self, mol, xc='LDA,VWN', disp=None):
+    def __init__(self, mol, xc='LDA,VWN'):
         hf.RHF.__init__(self, mol)
         KohnShamDFT.__init__(self, xc)
-        self.disp = disp
 
     def dump_flags(self, verbose=None):
         hf.RHF.dump_flags(self, verbose)
diff --git a/gpu4pyscf/dft/tests/test_rks.py b/gpu4pyscf/dft/tests/test_rks.py
index 07ef39e9..14aae215 100644
--- a/gpu4pyscf/dft/tests/test_rks.py
+++ b/gpu4pyscf/dft/tests/test_rks.py
@@ -155,6 +155,7 @@ def test_rks_b3lyp_d4(self):
         e_ref = -76.4669590803
         print('| CPU - GPU |:', e_tot - e_ref)
         assert np.abs(e_tot - e_ref) < 1e-5 #-76.4728129216)
+
 if __name__ == "__main__":
     print("Full Tests for dft")
     unittest.main()
diff --git a/gpu4pyscf/dft/tests/test_uks.py b/gpu4pyscf/dft/tests/test_uks.py
index 0b2139a1..d569d4fc 100644
--- a/gpu4pyscf/dft/tests/test_uks.py
+++ b/gpu4pyscf/dft/tests/test_uks.py
@@ -113,6 +113,14 @@ def test_uks_d4(self):
         e_ref = -75.9988910961
         print('diff:', e_tot - e_ref)
         assert np.abs(e_tot - e_ref) < 1e-5#-76.00306439862237)
+
+    def test_uks_wb97m_d3bj(self):
+        print('-------- wB97m-d3bj ----------------')
+        e_tot = run_dft('wb97m-d3bj')
+        e_ref = -76.009645802806       # From Psi4
+        print('| CPU - GPU |:', e_tot - e_ref)
+        assert np.abs(e_tot - e_ref) < 1e-5 #-76.4728129216)
+
 if __name__ == "__main__":
     print("Full Tests for dft")
     unittest.main()
diff --git a/gpu4pyscf/grad/rks.py b/gpu4pyscf/grad/rks.py
index 6c18eca7..cea04ba5 100644
--- a/gpu4pyscf/grad/rks.py
+++ b/gpu4pyscf/grad/rks.py
@@ -188,7 +188,7 @@ def get_nlc_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
 
     mo_occ = cupy.asarray(dms.mo_occ)
     mo_coeff = cupy.asarray(dms.mo_coeff)
-    
+
     mol = None
     _sorted_mol = opt._sorted_mol
     coeff = cupy.asarray(opt.coeff)
diff --git a/gpu4pyscf/grad/uks.py b/gpu4pyscf/grad/uks.py
index c563a095..328afe40 100644
--- a/gpu4pyscf/grad/uks.py
+++ b/gpu4pyscf/grad/uks.py
@@ -195,7 +195,7 @@ def get_vxc(ni, mol, grids, xc_code, dms, relativity=0, hermi=1,
     exc = None
     if nset == 1:
         vmat = vmat[0]
-        
+
     # - sign because nabla_X = -nabla_x
     return exc, -cupy.array(vmat)
 
diff --git a/gpu4pyscf/hessian/rhf.py b/gpu4pyscf/hessian/rhf.py
index bbb49df8..4a56e07a 100644
--- a/gpu4pyscf/hessian/rhf.py
+++ b/gpu4pyscf/hessian/rhf.py
@@ -331,13 +331,13 @@ def _ao2mo(mat):
         tmp = contract('xij,jo->xio', mat, mocc)
         return contract('xik,ip->xpk', tmp, mo_coeff)
     cupy.get_default_memory_pool().free_all_blocks()
-    
+
     avail_mem = get_avail_mem()
     blksize = int(avail_mem*0.4) // (8*3*nao*nao*4) // ALIGNED * ALIGNED
     blksize = min(32, blksize)
     log.debug(f'GPU memory {avail_mem/GB:.1f} GB available')
     log.debug(f'{blksize} atoms in each block CPHF equation')
-    
+
     # sort atoms to improve the convergence
     sorted_idx = sort_atoms(mol)
     atom_groups = []
@@ -359,14 +359,14 @@ def _ao2mo(mat):
             s1ao[:,:,p0:p1] += s1a[:,p0:p1].transpose(0,2,1)
             s1vo.append(_ao2mo(s1ao))
             h1vo.append(h1mo[ia])
-            
+
         log.info(f'Solving CPHF equation for atoms {len(group)}/{mol.natm}')
         h1vo = cupy.vstack(h1vo)
         s1vo = cupy.vstack(s1vo)
         tol = mf.conv_tol_cpscf
-        mo1, e1 = cphf.solve(fx, mo_energy, mo_occ, h1vo, s1vo, 
+        mo1, e1 = cphf.solve(fx, mo_energy, mo_occ, h1vo, s1vo,
                              level_shift=level_shift, tol=tol, verbose=verbose)
-        
+
         mo1 = mo1.reshape(-1,3,nao,nocc)
         e1 = e1.reshape(-1,3,nocc,nocc)
 
@@ -537,7 +537,7 @@ def kernel(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None, atmlst=None):
         atmlst = hessobj.atmlst
     else:
         hessobj.atmlst = atmlst
-    
+
     if hessobj.verbose >= logger.INFO:
         hessobj.dump_flags()
 
diff --git a/gpu4pyscf/hessian/rks.py b/gpu4pyscf/hessian/rks.py
index 252dc652..af5ee3ac 100644
--- a/gpu4pyscf/hessian/rks.py
+++ b/gpu4pyscf/hessian/rks.py
@@ -51,7 +51,7 @@ def partial_hess_elec(hessobj, mo_energy=None, mo_coeff=None, mo_occ=None,
     mocc = mo_coeff[:,mo_occ>0]
     dm0 = cupy.dot(mocc, mocc.T) * 2
 
-    if mf.nlc != '':
+    if mf.do_nlc():
         raise NotImplementedError
     #enabling range-separated hybrids
     omega, alpha, beta = mf._numint.rsh_coeff(mf.xc)
@@ -209,7 +209,7 @@ def _get_vxc_diag(hessobj, mo_coeff, mo_occ, max_memory):
     coeff = cupy.asarray(opt.coeff)
     mo_coeff = coeff @ mo_coeff
     nao = mo_coeff.shape[0]
-    
+
     vmat = cupy.zeros((6,nao,nao))
     if xctype == 'LDA':
         ao_deriv = 2