Skip to content

Commit

Permalink
Merge pull request #91 from mikovtun/UKS_device
Browse files Browse the repository at this point in the history
Device UKS/GKS Implementation
  • Loading branch information
wavefunction91 authored Jul 30, 2024
2 parents 1a47c11 + 448bec8 commit 2e489d4
Show file tree
Hide file tree
Showing 69 changed files with 2,136 additions and 869 deletions.
2 changes: 1 addition & 1 deletion include/gauxc/exceptions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class magma_exception;
class cutlass_exception;
#endif

/// C++ Excpetion for genertic GauXC errors
/// C++ Exception for generic GauXC errors
class generic_gauxc_exception : public std::exception {

std::string file_;
Expand Down
2 changes: 1 addition & 1 deletion src/xc_integrator/integrator_util/exx_screening.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ void exx_ek_screening(
device_data.reset_allocations();
device_data.allocate_static_data_exx_ek_screening( ntasks, nbf, nshells,
shpairs.npairs(), basis_map.max_l() );
device_data.send_static_data_density_basis( P_abs, ldp, basis );
device_data.send_static_data_density_basis( P_abs, ldp, nullptr, 0, nullptr, 0, nullptr, 0, basis );
device_data.send_static_data_exx_ek_screening( V_shell_max, ldv, basis_map,
shpairs );

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,5 +63,6 @@ void syr2k( device_blas_handle handle,
int M, int K, T ALPHA,
const T* A, int LDA, const T* B, int LDB,
T BETA, T* C, int LDC );

}

17 changes: 10 additions & 7 deletions src/xc_integrator/local_work_driver/device/common/uvvars.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,23 @@
*/
#pragma once
#include "device/xc_device_task.hpp"
#include "device/xc_device_data.hpp"
#include "device/device_queue.hpp"

namespace GauXC {

void eval_uvvars_lda( size_t ntasks, int32_t nbe_max, int32_t npts_max,

void eval_uvars_lda( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme,
XCDeviceTask* device_tasks, device_queue queue );

void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbe_max,
int32_t npts_max, XCDeviceTask* device_tasks, const double* denx,
const double* deny, const double* denz, double* gamma, device_queue queue );
void eval_uvars_gga( size_t ntasks, int32_t npts_max, integrator_ks_scheme ks_scheme,
XCDeviceTask* device_tasks, device_queue queue );

void eval_uvvars_mgga( size_t ntasks, size_t npts_total, int32_t nbe_max,
int32_t npts_max, XCDeviceTask* device_tasks, const double* denx,
const double* deny, const double* denz, double* gamma, bool do_lapl,
void eval_uvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max,
int32_t npts_max, bool do_lapl, XCDeviceTask* device_tasks,
device_queue queue );

void eval_vvar( size_t ntasks, int32_t nbf_max, int32_t npts_max, bool do_grad, density_id den_select,
XCDeviceTask* device_tasks, device_queue queue );

}
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
* See LICENSE.txt for details
*/
#include "device/xc_device_task.hpp"
#include "device/xc_device_data.hpp"
#include "device/device_queue.hpp"

namespace GauXC {
Expand All @@ -14,12 +15,16 @@ void zmat_lda_vxc( size_t ntasks,
int32_t max_nbf,
int32_t max_npts,
XCDeviceTask* tasks_device,
integrator_ks_scheme s,
density_id sel,
device_queue queue );

void zmat_gga_vxc( size_t ntasks,
int32_t max_nbf,
int32_t max_npts,
XCDeviceTask* tasks_device,
integrator_ks_scheme s,
density_id sel,
device_queue queue );

void zmat_mgga_vxc( size_t ntasks,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,88 @@ void eval_collocation_shell_to_task_hessian(
}


uint32_t max_threads_shell_to_task_collocation_laplacian( int32_t l, bool pure ) {
if( pure ) {
switch(l) {
case 0: return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_0 );\
$for( L in range(1, L_max + 1) )
case $(L): return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_spherical_laplacian_$(L) );
$endfor
default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)");
}
} else {
switch(l) {\
$for( L in range(L_max + 1) )
case $(L): return util::cuda_kernel_max_threads_per_block( collocation_device_shell_to_task_kernel_cartesian_laplacian_$(L) );\
$endfor
default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)");
}
}
return 0;
}





template <typename... Args>
void dispatch_shell_to_task_collocation_laplacian( cudaStream_t stream, int32_t l,
bool pure, uint32_t ntask_average, uint32_t nshells, Args&&... args ) {

dim3 threads = max_threads_shell_to_task_collocation(l,pure);
int nwarp_per_block = threads.x / cuda::warp_size;
int n_task_blocks = util::div_ceil( ntask_average, nwarp_per_block );
dim3 block(n_task_blocks, 1, nshells);

if( pure ) {
switch(l) {
case 0:
collocation_device_shell_to_task_kernel_cartesian_laplacian_0<<<block,threads,0,stream>>>( nshells, std::forward<Args>(args)... );
break;
$for( L in range(1, L_max + 1) )
case $(L):
collocation_device_shell_to_task_kernel_spherical_laplacian_$(L)<<<block,threads,0,stream>>>( nshells, std::forward<Args>(args)... );
break;\
$endfor
default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)");
}
} else {
switch(l) {\
$for( L in range(0, L_max + 1) )
case $(L):
collocation_device_shell_to_task_kernel_cartesian_laplacian_$(L)<<<block,threads,0,stream>>>( nshells, std::forward<Args>(args)... );
break;\
$endfor
default: GAUXC_GENERIC_EXCEPTION("CUDA L_MAX = $(L_max)");
}
}

}



void eval_collocation_shell_to_task_laplacian(
uint32_t max_l,
AngularMomentumShellToTaskBatch* l_batched_shell_to_task,
XCDeviceTask* device_tasks,
device_queue queue
) {

cudaStream_t stream = queue.queue_as<util::cuda_stream>() ;

for( auto l = 0u; l <= max_l; ++l ) {
auto pure = l_batched_shell_to_task[l].pure;
auto shell_to_task_device = l_batched_shell_to_task[l].shell_to_task_device;
auto nshells = l_batched_shell_to_task[l].nshells_in_batch;
auto ntask_average = std::max(1ul, l_batched_shell_to_task[l].ntask_average);
dispatch_shell_to_task_collocation_laplacian( stream, l, pure,
ntask_average, nshells, shell_to_task_device, device_tasks );
auto stat = cudaGetLastError();
GAUXC_CUDA_ERROR("LAP", stat);
}


}



Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@
#include "collocation/collocation_shell_to_task_kernels_cartesian_l$(L)_hessian.hpp"\
$endfor

$for( L in range(L_max + 1))
#include "collocation/collocation_shell_to_task_kernels_cartesian_l$(L)_laplacian.hpp"\
$endfor

$for( L in range(L_max + 1))
#include "collocation/collocation_shell_to_task_kernels_spherical_l$(L).hpp"\
$endfor
Expand All @@ -30,3 +34,7 @@
$for( L in range(L_max + 1))
#include "collocation/collocation_shell_to_task_kernels_spherical_l$(L)_hessian.hpp"\
$endfor

$for( L in range(L_max + 1))
#include "collocation/collocation_shell_to_task_kernels_spherical_l$(L)_laplacian.hpp"\
$endfor
Original file line number Diff line number Diff line change
Expand Up @@ -181,5 +181,6 @@ void syr2k( device_blas_handle generic_handle,

}


}

Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,9 @@ __global__ __launch_bounds__(512,1) void increment_exc_grad_gga_kernel(
const auto* __restrict__ vrho = task->vrho;
const auto* __restrict__ vgamma = task->vgamma;

const auto* __restrict__ den_x = task->ddenx;
const auto* __restrict__ den_y = task->ddeny;
const auto* __restrict__ den_z = task->ddenz;
const auto* __restrict__ den_x = task->dden_sx;
const auto* __restrict__ den_y = task->dden_sy;
const auto* __restrict__ den_z = task->dden_sz;

#pragma unroll 1
for( uint32_t ipt = threadIdx.x % cuda::warp_size;
Expand Down
Loading

0 comments on commit 2e489d4

Please sign in to comment.