HIP compilation #135

Open · wants to merge 26 commits into base: master

Changes from 8 commits

Commits (26)
ddf0071  HIP compilation (ryanstocks00, Jul 18, 2024)
6070afd  Add hip version of mgga kernels (ryanstocks00, Jul 18, 2024)
bb4559f  Removed commented print (ryanstocks00, Jul 18, 2024)
6d2d5f0  Copy paste cleanup (ryanstocks00, Jul 18, 2024)
6b46eb6  More missing HIP functions (ryanstocks00, Jul 18, 2024)
c81132f  Missing HIP kernel (ryanstocks00, Jul 18, 2024)
c2e3cc2  Removed register from hip (ryanstocks00, Jul 18, 2024)
e9bf3a2  Reduced shared mem req (ryanstocks00, Jul 18, 2024)
e9c616d  HIP discovery fixes (ajaypanyala, Jul 20, 2024)
b781db3  update readme [skip ci] (ajaypanyala, Jul 20, 2024)
08ae311  update ExchEXX hash (ajaypanyala, Jul 28, 2024)
6c80aff  Merge remote-tracking branch 'upstream/master' (ryanstocks00, Sep 21, 2024)
5b2273e  Fixed HIP compilation (ryanstocks00, Sep 21, 2024)
9d9145d  hipblas.h -> hipblas/hipblas.h (ryanstocks00, Sep 21, 2024)
1ad6fd4  Renamed SM_BLOCK_Y for cuda compilation (ryanstocks00, Sep 21, 2024)
af72d05  Move a bunch of cuda -> hip (ryanstocks00, Sep 22, 2024)
7c26939  Allow passing additional flags to obara saika host compilation (ryanstocks00, Sep 26, 2024)
2bb4783  Moved obara saika compile flags override (ryanstocks00, Sep 26, 2024)
ecf6eac  Compiling HIP on NVIDIA (ryanstocks00, Sep 30, 2024)
23c78e9  Pseudofunctional HIP on NVIDIA (ryanstocks00, Oct 8, 2024)
031fb0a  Fixed mem access violation (ryanstocks00, Oct 8, 2024)
eeff105  Copy zmat from cuda (ryanstocks00, Oct 8, 2024)
2089af6  Small refactor of cuda vvar kernel to support any grid/block dims (ryanstocks00, Oct 9, 2024)
d4675df  Revert SM block size changes (ryanstocks00, Oct 9, 2024)
bfd8803  More forceful double instead of double2 (ryanstocks00, Oct 9, 2024)
f0b1a51  AMD compilation (ryanstocks00, Oct 14, 2024)
@@ -306,7 +306,7 @@ void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbf_max,
{
dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 );
dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
-             std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),
+             std::min(uint64_t(GGA_KERNEL_SM_BLOCK_Y), util::div_ceil( npts_max, GGA_KERNEL_SM_BLOCK_Y )),
ryanstocks00 (Author) commented on Jul 18, 2024:
I think the nbf_max usage here was potentially a bug? I have replaced it with npts_max.

Owner replied:
It's not a bug, but it is potentially not optimal on some hardware. What have you been testing on? In principle, these parameters should be tuned; these were just the ones found to perform best on V100/A100.
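
One way the block extent could be kept tunable per architecture, sketched here for illustration only (the -D override mechanism shown is hypothetical and not part of this PR):

// Sketch: allow the SM block extent to be overridden at compile time,
// e.g. -DGGA_KERNEL_SM_BLOCK_Y=32, while keeping the V100/A100-tuned default.
#ifndef GGA_KERNEL_SM_BLOCK_Y
#define GGA_KERNEL_SM_BLOCK_Y 16
#endif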

ryanstocks00 (Author) replied:
Is it guaranteed that npts_max is greater than nbf_max? My understanding from a perusal of the function is that the y axis iterates over the points rather than the basis functions, so the nbf_max could have been problematic. I'm still working on testing it all out, so I certainly haven't got as far as performance tuning yet.

Owner replied:
It's not guaranteed, although it usually is. I agree, what's there is likely a typo, but the kernel is hardened to take any block/grid dimension and still give the right results (i.e. whether or not loops get executed is based on the number of warps in the thread block).

I'll check whether this kneecaps the performance of this kernel in prod. If there's no change or it's better, I'll accept it for being "correct"; if it's worse, I'll come back with a hand-wavy/tin-foil-hat reason for why that's the case :).
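
As a sketch of the grid-stride pattern being described (illustrative names only, not code from this PR): the x dimension strides over basis functions and the y dimension over grid points, so an undersized grid simply takes more loop trips, and the bounds check skips out-of-range work, which is why any blocks/threads configuration gives correct results.

// Illustrative grid-stride skeleton; not part of this PR.
__global__ void grid_stride_sketch( int npts, int nbf ) {
  for( int ib = blockIdx.x * blockDim.x; ib < nbf; ib += blockDim.x * gridDim.x )
  for( int ip = blockIdx.y * blockDim.y; ip < npts; ip += blockDim.y * gridDim.y ) {
    const int i = ib + threadIdx.y;  // basis-function index for this thread
    const int p = ip + threadIdx.x;  // grid-point index for this thread
    if( i < nbf and p < npts ) {
      // ... accumulate the (i, p) contribution ...
    }
  }
}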

ntasks );
eval_uvars_gga_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
}
@@ -330,7 +330,7 @@ void eval_uvvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max,
{
dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 );
dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
-             std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),
+             std::min(uint64_t(GGA_KERNEL_SM_BLOCK_Y), util::div_ceil( npts_max, GGA_KERNEL_SM_BLOCK_Y )),
ntasks );
eval_uvars_gga_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
if(do_lapl)
136 changes: 136 additions & 0 deletions src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip
@@ -193,12 +193,148 @@ void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbf_max,

}

#define GGA_KERNEL_SM_BLOCK_Y 16

template <bool need_lapl>
__global__ void eval_uvars_mgga_kernel( size_t ntasks,
XCDeviceTask* tasks_device ) {

constexpr auto warp_size = hip::warp_size;
//constexpr auto max_warps_per_thread_block = hip::max_warps_per_thread_block;

const int batch_idx = blockIdx.z;
if( batch_idx >= ntasks ) return;

auto& task = tasks_device[ batch_idx ];

const auto npts = task.npts;
const auto nbf = task.bfn_screening.nbe;

auto* tau_eval_device = task.tau;
decltype(tau_eval_device) lapl_eval_device = nullptr;
if constexpr (need_lapl) {
lapl_eval_device = task.denlapl;
}

//const auto* basis_eval_device = task.bf;
const auto* dbasis_x_eval_device = task.dbfx;
const auto* dbasis_y_eval_device = task.dbfy;
const auto* dbasis_z_eval_device = task.dbfz;
decltype(dbasis_x_eval_device) basis_lapl_eval_device = nullptr;
if constexpr (need_lapl) {
basis_lapl_eval_device = task.d2bflapl;
}

//const auto* den_basis_prod_device = task.zmat;
const auto* den_basis_dx_prod_device = task.xmat_x;
const auto* den_basis_dy_prod_device = task.xmat_y;
const auto* den_basis_dz_prod_device = task.xmat_z;
decltype(den_basis_dx_prod_device) den_basis_prod_device = nullptr;
if constexpr (need_lapl) {
den_basis_prod_device = task.zmat;
}

__shared__ double den_shared[3+!!need_lapl][warp_size][GGA_KERNEL_SM_BLOCK_Y+1];

for ( int bid_x = blockIdx.x * blockDim.x;
bid_x < nbf;
bid_x += blockDim.x * gridDim.x ) {

for ( int bid_y = blockIdx.y * GGA_KERNEL_SM_BLOCK_Y;
bid_y < npts;
bid_y += GGA_KERNEL_SM_BLOCK_Y * gridDim.y ) {

for (int sm_y = threadIdx.y; sm_y < GGA_KERNEL_SM_BLOCK_Y; sm_y += blockDim.y) {
den_shared[0][threadIdx.x][sm_y] = 0.;
den_shared[1][threadIdx.x][sm_y] = 0.;
den_shared[2][threadIdx.x][sm_y] = 0.;
if constexpr (need_lapl)
den_shared[3][threadIdx.x][sm_y] = 0.;

if (bid_y + threadIdx.x < npts and bid_x + sm_y < nbf) {
const double* db_x_col = den_basis_dx_prod_device + (bid_x + sm_y)*npts;
const double* db_y_col = den_basis_dy_prod_device + (bid_x + sm_y)*npts;
const double* db_z_col = den_basis_dz_prod_device + (bid_x + sm_y)*npts;

const double* bf_x_col = dbasis_x_eval_device + (bid_x + sm_y)*npts;
const double* bf_y_col = dbasis_y_eval_device + (bid_x + sm_y)*npts;
const double* bf_z_col = dbasis_z_eval_device + (bid_x + sm_y)*npts;


den_shared[0][threadIdx.x][sm_y] = bf_x_col[ bid_y + threadIdx.x ] * db_x_col[ bid_y + threadIdx.x ];
den_shared[1][threadIdx.x][sm_y] = bf_y_col[ bid_y + threadIdx.x ] * db_y_col[ bid_y + threadIdx.x ];
den_shared[2][threadIdx.x][sm_y] = bf_z_col[ bid_y + threadIdx.x ] * db_z_col[ bid_y + threadIdx.x ];


if constexpr (need_lapl) {
const double* db_col = den_basis_prod_device + (bid_x + sm_y)*npts;
const double* bf_l_col = basis_lapl_eval_device + (bid_x + sm_y)*npts;
den_shared[3][threadIdx.x][sm_y] = bf_l_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ];
}
}
}
__syncthreads();


for (int sm_y = threadIdx.y; sm_y < GGA_KERNEL_SM_BLOCK_Y; sm_y += blockDim.y) {
const int tid_y = bid_y + sm_y;

double tx_reg = den_shared[0][sm_y][threadIdx.x];
double ty_reg = den_shared[1][sm_y][threadIdx.x];
double tz_reg = den_shared[2][sm_y][threadIdx.x];
// Warp blocks are stored col major
double tau_reg = 0.0;
tau_reg = 0.5 * hip::warp_reduce_sum<warp_size>( tx_reg );
tau_reg += 0.5 * hip::warp_reduce_sum<warp_size>( ty_reg );
tau_reg += 0.5 * hip::warp_reduce_sum<warp_size>( tz_reg );

double lapl_reg = 0.0;
if constexpr (need_lapl) {
lapl_reg = den_shared[3][sm_y][threadIdx.x];
lapl_reg = hip::warp_reduce_sum<warp_size>(lapl_reg);
lapl_reg = 2. * lapl_reg + 4. * tau_reg;
}

if( threadIdx.x == 0 and tid_y < npts ) {
atomicAdd( tau_eval_device + tid_y, tau_reg );
if constexpr (need_lapl) {
atomicAdd( lapl_eval_device + tid_y, lapl_reg );
}
}
}
__syncthreads();
}
}
}


void eval_uvvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max,
int32_t npts_max, XCDeviceTask* device_tasks, const double* denx,
const double* deny, const double* denz, double* gamma, bool do_lapl,
device_queue queue ) {

hipStream_t stream = queue.queue_as<util::hip_stream>();

// U Variables
{
dim3 threads( hip::warp_size, hip::max_warps_per_thread_block / 2, 1 );
dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
std::min(uint64_t(GGA_KERNEL_SM_BLOCK_Y), util::div_ceil( npts_max, GGA_KERNEL_SM_BLOCK_Y )),
ntasks );
eval_uvars_gga_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
if(do_lapl)
eval_uvars_mgga_kernel<true><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
else
eval_uvars_mgga_kernel<false><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
}

// V variables (GAMMA)
dim3 threads( hip::max_threads_per_thread_block );
dim3 blocks( util::div_ceil( npts_total, threads.x ) );
eval_vvars_gga_kernel<<< blocks, threads, 0, stream >>>(
npts_total, denx, deny, denz, gamma
);
}


}
130 changes: 130 additions & 0 deletions src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip
@@ -143,7 +143,137 @@ void zmat_gga_vxc( size_t ntasks,
}


template <bool need_lapl>
__global__ void zmat_mgga_vxc_kernel( size_t ntasks,
XCDeviceTask* tasks_device ) {

const int batch_idx = blockIdx.z;
if( batch_idx >= ntasks ) return;

auto& task = tasks_device[ batch_idx ];
const auto npts = task.npts;
const auto nbf = task.bfn_screening.nbe;
const auto* vrho_device = task.vrho;
const auto* vgamma_device = task.vgamma;
const double* vlapl_device = need_lapl ? task.vlapl : nullptr;
const auto* den_x_eval_device = task.ddenx;
const auto* den_y_eval_device = task.ddeny;
const auto* den_z_eval_device = task.ddenz;

const auto* basis_eval_device = task.bf;
const auto* dbasis_x_eval_device = task.dbfx;
const auto* dbasis_y_eval_device = task.dbfy;
const auto* dbasis_z_eval_device = task.dbfz;
const double* d2basis_lapl_eval_device =
need_lapl ? task.d2bflapl : nullptr;


auto* z_matrix_device = task.zmat;

const int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
const int tid_y = blockIdx.y * blockDim.y + threadIdx.y;

if( tid_x < npts and tid_y < nbf ) {

const size_t ibfoff = tid_y * npts + tid_x;
const double fact_1 = 0.5 * vrho_device[tid_x] ;
const double fact_2 = 2.0 * vgamma_device[tid_x];

const double dx = den_x_eval_device[ tid_x ] * dbasis_x_eval_device[ ibfoff ];
const double dy = den_y_eval_device[ tid_x ] * dbasis_y_eval_device[ ibfoff ];
const double dz = den_z_eval_device[ tid_x ] * dbasis_z_eval_device[ ibfoff ];

double val =
fact_1 * basis_eval_device[ ibfoff ] + fact_2 * ( dx + dy + dz );

if constexpr (need_lapl) {
val += vlapl_device[tid_x] * d2basis_lapl_eval_device[ibfoff];
}

z_matrix_device[ ibfoff ] = val;
}
}

void zmat_mgga_vxc( size_t ntasks,
int32_t max_nbf,
int32_t max_npts,
XCDeviceTask* tasks_device,
bool do_lapl,
device_queue queue ) {

hipStream_t stream = queue.queue_as<util::hip_stream>() ;


dim3 threads(hip::warp_size,hip::max_warps_per_thread_block,1);
dim3 blocks( util::div_ceil( max_npts, threads.x ),
util::div_ceil( max_nbf, threads.y ),
ntasks );

if(do_lapl)
zmat_mgga_vxc_kernel<true><<< blocks, threads, 0, stream >>>( ntasks, tasks_device );
else
zmat_mgga_vxc_kernel<false><<< blocks, threads, 0, stream >>>( ntasks, tasks_device );

}


template <bool need_lapl>
__global__ void mmat_mgga_vxc_kernel( size_t ntasks,
XCDeviceTask* tasks_device ) {

const int batch_idx = blockIdx.z;
if( batch_idx >= ntasks ) return;

auto& task = tasks_device[ batch_idx ];
const auto npts = task.npts;
const auto nbf = task.bfn_screening.nbe;
const auto* vtau_device = task.vtau;
const double* vlapl_device = need_lapl ? task.vlapl : nullptr;

const auto* dbasis_x_eval_device = task.dbfx;
const auto* dbasis_y_eval_device = task.dbfy;
const auto* dbasis_z_eval_device = task.dbfz;

auto* mmat_x = task.xmat_x;
auto* mmat_y = task.xmat_y;
auto* mmat_z = task.xmat_z;

const int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
const int tid_y = blockIdx.y * blockDim.y + threadIdx.y;

if( tid_x < npts and tid_y < nbf ) {

const size_t ibfoff = tid_y * npts + tid_x;
const double fact_1 = 0.25 * vtau_device[tid_x] +
(need_lapl ? vlapl_device[tid_x] : 0.0);

mmat_x[ ibfoff ] = fact_1 * dbasis_x_eval_device[ ibfoff ];
mmat_y[ ibfoff ] = fact_1 * dbasis_y_eval_device[ ibfoff ];
mmat_z[ ibfoff ] = fact_1 * dbasis_z_eval_device[ ibfoff ];
}
}

void mmat_mgga_vxc( size_t ntasks,
int32_t max_nbf,
int32_t max_npts,
XCDeviceTask* tasks_device,
bool do_lapl,
device_queue queue ) {

hipStream_t stream = queue.queue_as<util::hip_stream>() ;


dim3 threads(hip::warp_size,hip::max_warps_per_thread_block,1);
dim3 blocks( util::div_ceil( max_npts, threads.x ),
util::div_ceil( max_nbf, threads.y ),
ntasks );

if(do_lapl)
mmat_mgga_vxc_kernel<true><<< blocks, threads, 0, stream >>>( ntasks, tasks_device );
else
mmat_mgga_vxc_kernel<false><<< blocks, threads, 0, stream >>>( ntasks, tasks_device );

}


}
@@ -27,4 +27,14 @@ void eval_kern_exc_vxc_gga( const functional_type& func, size_t npts,

}

void eval_kern_exc_vxc_mgga( const functional_type& func, size_t npts,
const double* rho, const double* gamma, const double* tau, const double* lapl,
double* eps, double* vrho, double* vgamma, double* vtau, double* vlapl,
device_queue queue ) {

hipStream_t stream = queue.queue_as<util::hip_stream>();
func.eval_exc_vxc_device( npts, rho, gamma, lapl, tau, eps, vrho, vgamma, vlapl, vtau, stream );

}

}
4 changes: 2 additions & 2 deletions src/xc_integrator/local_work_driver/device/scheme1_base.cxx
@@ -441,9 +441,9 @@ void AoSScheme1Base::eval_collocation_hessian( XCDeviceData* _data ) {
eval_collocation_shell_to_task_hessian( max_l,
data->l_batched_shell_to_task.data(), aos_stack.device_tasks,
data->device_backend_->queue() );
-#endif

data->device_backend_->check_error("collocation hess" __FILE__ ": " + std::to_string(__LINE__));
+#endif
}

void AoSScheme1Base::eval_collocation_laplacian( XCDeviceData* _data ) {
@@ -461,9 +461,9 @@ void AoSScheme1Base::eval_collocation_laplacian( XCDeviceData* _data ) {
eval_collocation_shell_to_task_laplacian( max_l,
data->l_batched_shell_to_task.data(), aos_stack.device_tasks,
data->device_backend_->queue() );
-#endif

data->device_backend_->check_error("collocation lapl" __FILE__ ": " + std::to_string(__LINE__));
+#endif
}

