Reduced shared memory requirement

wavefunction91 · Jul 18, 2024 · e9bf3a2
1 parent c2e3cc2
commit e9bf3a2
Show file tree

Hide file tree

Showing 2 changed files with 4 additions and 4 deletions.
diff --git a/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu b/src/xc_integrator/local_work_driver/device/cuda/kernels/uvvars.cu
@@ -306,7 +306,7 @@ void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbf_max,
   {
   dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 );
   dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
-               std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),
+               std::min(uint64_t(GGA_KERNEL_SM_BLOCK_Y), util::div_ceil( npts_max, GGA_KERNEL_SM_BLOCK_Y )),
                ntasks );
   eval_uvars_gga_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
   }
@@ -330,7 +330,7 @@ void eval_uvvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max,
   {
   dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 );
   dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
-               std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),
+               std::min(uint64_t(GGA_KERNEL_SM_BLOCK_Y), util::div_ceil( npts_max, GGA_KERNEL_SM_BLOCK_Y )),
                ntasks );
   eval_uvars_gga_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
   if(do_lapl)

diff --git a/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip b/src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip
@@ -193,7 +193,7 @@ void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbf_max,
 
 }
 
-#define GGA_KERNEL_SM_BLOCK_Y 32
+#define GGA_KERNEL_SM_BLOCK_Y 16
 
 template <bool need_lapl>
 __global__ void eval_uvars_mgga_kernel( size_t           ntasks,
@@ -319,7 +319,7 @@ void eval_uvvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max,
   {
   dim3 threads( hip::warp_size, hip::max_warps_per_thread_block / 2, 1 );
   dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
-               std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),
+               std::min(uint64_t(GGA_KERNEL_SM_BLOCK_Y), util::div_ceil( npts_max, GGA_KERNEL_SM_BLOCK_Y )),
                ntasks );
   eval_uvars_gga_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
   if(do_lapl)