HIP compilation #135

Open · wants to merge 26 commits into base: master

Changes from 8 commits

Commits (26)
ddf0071  HIP compilation (ryanstocks00, Jul 18, 2024)
6070afd  Add hip version of mgga kernels (ryanstocks00, Jul 18, 2024)
bb4559f  Removed commented print (ryanstocks00, Jul 18, 2024)
6d2d5f0  Copy paste cleanup (ryanstocks00, Jul 18, 2024)
6b46eb6  More missing HIP functions (ryanstocks00, Jul 18, 2024)
c81132f  Missing HIP kernel (ryanstocks00, Jul 18, 2024)
c2e3cc2  Removed register from hip (ryanstocks00, Jul 18, 2024)
e9bf3a2  Reduced shared mem req (ryanstocks00, Jul 18, 2024)
e9c616d  HIP discovery fixes (ajaypanyala, Jul 20, 2024)
b781db3  update readme [skip ci] (ajaypanyala, Jul 20, 2024)
08ae311  update ExchEXX hash (ajaypanyala, Jul 28, 2024)
6c80aff  Merge remote-tracking branch 'upstream/master' (ryanstocks00, Sep 21, 2024)
5b2273e  Fixed HIP compilation (ryanstocks00, Sep 21, 2024)
9d9145d  hipblas.h -> hipblas/hipblas.h (ryanstocks00, Sep 21, 2024)
1ad6fd4  Renamed SM_BLOCK_Y for cuda compilation (ryanstocks00, Sep 21, 2024)
af72d05  Move a bunch of cuda -> hip (ryanstocks00, Sep 22, 2024)
7c26939  Allow passing additional flags to obara saika host compilation (ryanstocks00, Sep 26, 2024)
2bb4783  Moved obara saika compile flags override (ryanstocks00, Sep 26, 2024)
ecf6eac  Compiling HIP on NVIDIA (ryanstocks00, Sep 30, 2024)
23c78e9  Pseudofunctional HIP on NVIDIA (ryanstocks00, Oct 8, 2024)
031fb0a  Fixed mem access violation (ryanstocks00, Oct 8, 2024)
eeff105  Copy zmat from cuda (ryanstocks00, Oct 8, 2024)
2089af6  Small refactor of cuda vvar kernel to support any grid/block dims (ryanstocks00, Oct 9, 2024)
d4675df  Revert SM block size changes (ryanstocks00, Oct 9, 2024)
bfd8803  More forceful double instead of double2 (ryanstocks00, Oct 9, 2024)
f0b1a51  AMD compilation (ryanstocks00, Oct 14, 2024)
@@ -306,7 +306,7 @@ void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbf_max,
{
dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 );
dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
-             std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),
+             std::min(uint64_t(GGA_KERNEL_SM_BLOCK_Y), util::div_ceil( npts_max, GGA_KERNEL_SM_BLOCK_Y )),
ryanstocks00 (Author) commented on Jul 18, 2024:
I think the nbf_max usage here was potentially a bug? I have replaced it with npts_max.

Owner replied:
It's not a bug, but it is potentially not optimal on some hardware. What have you been testing on? In principle, these parameters should be tuned; these were just the ones found to perform best on V100/A100.
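
One way the block extent could be kept tunable per architecture, sketched here for illustration only (the -D override mechanism shown is hypothetical and not part of this PR):

// Sketch: allow the SM block extent to be overridden at compile time,
// e.g. -DGGA_KERNEL_SM_BLOCK_Y=32, while keeping the V100/A100-tuned default.
#ifndef GGA_KERNEL_SM_BLOCK_Y
#define GGA_KERNEL_SM_BLOCK_Y 16
#endif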

ryanstocks00 (Author) replied:
Is it guaranteed that npts_max is greater than nbf_max? My understanding from a perusal of the function is that the y axis iterates over the points rather than the basis functions, so the nbf_max could have been problematic. I'm still working on testing it all out, so I certainly haven't got as far as performance tuning yet.

Owner replied:
It's not guaranteed, although it usually is. I agree, what's there is likely a typo, but the kernel is hardened to take any block/grid dimension and still give the right results (i.e. whether or not loops get executed is based on the number of warps in the thread block).

I'll check whether this kneecaps the performance of this kernel in prod. If there's no change or it's better, I'll accept it for being "correct"; if it's worse, I'll come back with a hand-wavy/tin-foil-hat reason for why that's the case :).
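
As a sketch of the grid-stride pattern being described (illustrative names only, not code from this PR): the x dimension strides over basis functions and the y dimension over grid points, so an undersized grid simply takes more loop trips, and the bounds check skips out-of-range work, which is why any blocks/threads configuration gives correct results.

// Illustrative grid-stride skeleton; not part of this PR.
__global__ void grid_stride_sketch( int npts, int nbf ) {
  for( int ib = blockIdx.x * blockDim.x; ib < nbf; ib += blockDim.x * gridDim.x )
  for( int ip = blockIdx.y * blockDim.y; ip < npts; ip += blockDim.y * gridDim.y ) {
    const int i = ib + threadIdx.y;  // basis-function index for this thread
    const int p = ip + threadIdx.x;  // grid-point index for this thread
    if( i < nbf and p < npts ) {
      // ... accumulate the (i, p) contribution ...
    }
  }
}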

ntasks );
eval_uvars_gga_kernel<<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
}
@@ -330,7 +330,7 @@ void eval_uvvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max,
{
dim3 threads( cuda::warp_size, cuda::max_warps_per_thread_block / 2, 1 );
dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
-             std::min(uint64_t(16), util::div_ceil( nbf_max, 16 )),
+             std::min(uint64_t(GGA_KERNEL_SM_BLOCK_Y), util::div_ceil( npts_max, GGA_KERNEL_SM_BLOCK_Y )),
ntasks );
eval_uvars_gga_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
if(do_lapl)
136 changes: 136 additions & 0 deletions src/xc_integrator/local_work_driver/device/hip/kernels/uvvars.hip
@@ -193,12 +193,148 @@ void eval_uvvars_gga( size_t ntasks, size_t npts_total, int32_t nbf_max,

}

#define GGA_KERNEL_SM_BLOCK_Y 16

template <bool need_lapl>
__global__ void eval_uvars_mgga_kernel( size_t ntasks,
XCDeviceTask* tasks_device ) {

constexpr auto warp_size = hip::warp_size;
//constexpr auto max_warps_per_thread_block = hip::max_warps_per_thread_block;

const int batch_idx = blockIdx.z;
if( batch_idx >= ntasks ) return;

auto& task = tasks_device[ batch_idx ];

const auto npts = task.npts;
const auto nbf = task.bfn_screening.nbe;

auto* tau_eval_device = task.tau;
decltype(tau_eval_device) lapl_eval_device = nullptr;
if constexpr (need_lapl) {
lapl_eval_device = task.denlapl;
}

//const auto* basis_eval_device = task.bf;
const auto* dbasis_x_eval_device = task.dbfx;
const auto* dbasis_y_eval_device = task.dbfy;
const auto* dbasis_z_eval_device = task.dbfz;
decltype(dbasis_x_eval_device) basis_lapl_eval_device = nullptr;
if constexpr (need_lapl) {
basis_lapl_eval_device = task.d2bflapl;
}

//const auto* den_basis_prod_device = task.zmat;
const auto* den_basis_dx_prod_device = task.xmat_x;
const auto* den_basis_dy_prod_device = task.xmat_y;
const auto* den_basis_dz_prod_device = task.xmat_z;
decltype(den_basis_dx_prod_device) den_basis_prod_device = nullptr;
if constexpr (need_lapl) {
den_basis_prod_device = task.zmat;
}

__shared__ double den_shared[3+!!need_lapl][warp_size][GGA_KERNEL_SM_BLOCK_Y+1];

for ( int bid_x = blockIdx.x * blockDim.x;
bid_x < nbf;
bid_x += blockDim.x * gridDim.x ) {

for ( int bid_y = blockIdx.y * GGA_KERNEL_SM_BLOCK_Y;
bid_y < npts;
bid_y += GGA_KERNEL_SM_BLOCK_Y * gridDim.y ) {

for (int sm_y = threadIdx.y; sm_y < GGA_KERNEL_SM_BLOCK_Y; sm_y += blockDim.y) {
den_shared[0][threadIdx.x][sm_y] = 0.;
den_shared[1][threadIdx.x][sm_y] = 0.;
den_shared[2][threadIdx.x][sm_y] = 0.;
if constexpr (need_lapl)
den_shared[3][threadIdx.x][sm_y] = 0.;

if (bid_y + threadIdx.x < npts and bid_x + sm_y < nbf) {
const double* db_x_col = den_basis_dx_prod_device + (bid_x + sm_y)*npts;
const double* db_y_col = den_basis_dy_prod_device + (bid_x + sm_y)*npts;
const double* db_z_col = den_basis_dz_prod_device + (bid_x + sm_y)*npts;

const double* bf_x_col = dbasis_x_eval_device + (bid_x + sm_y)*npts;
const double* bf_y_col = dbasis_y_eval_device + (bid_x + sm_y)*npts;
const double* bf_z_col = dbasis_z_eval_device + (bid_x + sm_y)*npts;


den_shared[0][threadIdx.x][sm_y] = bf_x_col[ bid_y + threadIdx.x ] * db_x_col[ bid_y + threadIdx.x ];
den_shared[1][threadIdx.x][sm_y] = bf_y_col[ bid_y + threadIdx.x ] * db_y_col[ bid_y + threadIdx.x ];
den_shared[2][threadIdx.x][sm_y] = bf_z_col[ bid_y + threadIdx.x ] * db_z_col[ bid_y + threadIdx.x ];


if constexpr (need_lapl) {
const double* db_col = den_basis_prod_device + (bid_x + sm_y)*npts;
const double* bf_l_col = basis_lapl_eval_device + (bid_x + sm_y)*npts;
den_shared[3][threadIdx.x][sm_y] = bf_l_col[ bid_y + threadIdx.x ] * db_col[ bid_y + threadIdx.x ];
}
}
}
__syncthreads();


for (int sm_y = threadIdx.y; sm_y < GGA_KERNEL_SM_BLOCK_Y; sm_y += blockDim.y) {
const int tid_y = bid_y + sm_y;

double tx_reg = den_shared[0][sm_y][threadIdx.x];
double ty_reg = den_shared[1][sm_y][threadIdx.x];
double tz_reg = den_shared[2][sm_y][threadIdx.x];
// Warp blocks are stored col major
double tau_reg = 0.0;
tau_reg = 0.5 * hip::warp_reduce_sum<warp_size>( tx_reg );
tau_reg += 0.5 * hip::warp_reduce_sum<warp_size>( ty_reg );
tau_reg += 0.5 * hip::warp_reduce_sum<warp_size>( tz_reg );

double lapl_reg = 0.0;
if constexpr (need_lapl) {
lapl_reg = den_shared[3][sm_y][threadIdx.x];
lapl_reg = hip::warp_reduce_sum<warp_size>(lapl_reg);
lapl_reg = 2. * lapl_reg + 4. * tau_reg;
}

if( threadIdx.x == 0 and tid_y < npts ) {
atomicAdd( tau_eval_device + tid_y, tau_reg );
if constexpr (need_lapl) {
atomicAdd( lapl_eval_device + tid_y, lapl_reg );
}
}
}
__syncthreads();
}
}
}


void eval_uvvars_mgga( size_t ntasks, size_t npts_total, int32_t nbf_max,
int32_t npts_max, XCDeviceTask* device_tasks, const double* denx,
const double* deny, const double* denz, double* gamma, bool do_lapl,
device_queue queue ) {

hipStream_t stream = queue.queue_as<util::hip_stream>();

// U Variables
{
dim3 threads( hip::warp_size, hip::max_warps_per_thread_block / 2, 1 );
dim3 blocks( std::min(uint64_t(4), util::div_ceil( nbf_max, 4 )),
std::min(uint64_t(GGA_KERNEL_SM_BLOCK_Y), util::div_ceil( npts_max, GGA_KERNEL_SM_BLOCK_Y )),
ntasks );
eval_uvars_gga_kernel <<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
if(do_lapl)
eval_uvars_mgga_kernel<true><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
else
eval_uvars_mgga_kernel<false><<< blocks, threads, 0, stream >>>( ntasks, device_tasks );
}

// V variables (GAMMA)
dim3 threads( hip::max_threads_per_thread_block );
dim3 blocks( util::div_ceil( npts_total, threads.x ) );
eval_vvars_gga_kernel<<< blocks, threads, 0, stream >>>(
npts_total, denx, deny, denz, gamma
);
}


}
130 changes: 130 additions & 0 deletions src/xc_integrator/local_work_driver/device/hip/kernels/zmat_vxc.hip
@@ -143,7 +143,137 @@ void zmat_gga_vxc( size_t ntasks,
}


template <bool need_lapl>
__global__ void zmat_mgga_vxc_kernel( size_t ntasks,
XCDeviceTask* tasks_device ) {

const int batch_idx = blockIdx.z;
if( batch_idx >= ntasks ) return;

auto& task = tasks_device[ batch_idx ];
const auto npts = task.npts;
const auto nbf = task.bfn_screening.nbe;
const auto* vrho_device = task.vrho;
const auto* vgamma_device = task.vgamma;
const double* vlapl_device = need_lapl ? task.vlapl : nullptr;
const auto* den_x_eval_device = task.ddenx;
const auto* den_y_eval_device = task.ddeny;
const auto* den_z_eval_device = task.ddenz;

const auto* basis_eval_device = task.bf;
const auto* dbasis_x_eval_device = task.dbfx;
const auto* dbasis_y_eval_device = task.dbfy;
const auto* dbasis_z_eval_device = task.dbfz;
const double* d2basis_lapl_eval_device =
need_lapl ? task.d2bflapl : nullptr;


auto* z_matrix_device = task.zmat;

const int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
const int tid_y = blockIdx.y * blockDim.y + threadIdx.y;

if( tid_x < npts and tid_y < nbf ) {

const size_t ibfoff = tid_y * npts + tid_x;
const double fact_1 = 0.5 * vrho_device[tid_x] ;
const double fact_2 = 2.0 * vgamma_device[tid_x];

const double dx = den_x_eval_device[ tid_x ] * dbasis_x_eval_device[ ibfoff ];
const double dy = den_y_eval_device[ tid_x ] * dbasis_y_eval_device[ ibfoff ];
const double dz = den_z_eval_device[ tid_x ] * dbasis_z_eval_device[ ibfoff ];

double val =
fact_1 * basis_eval_device[ ibfoff ] + fact_2 * ( dx + dy + dz );

if constexpr (need_lapl) {
val += vlapl_device[tid_x] * d2basis_lapl_eval_device[ibfoff];
}

z_matrix_device[ ibfoff ] = val;
}
}

void zmat_mgga_vxc( size_t ntasks,
int32_t max_nbf,
int32_t max_npts,
XCDeviceTask* tasks_device,
bool do_lapl,
device_queue queue ) {

hipStream_t stream = queue.queue_as<util::hip_stream>() ;


dim3 threads(hip::warp_size,hip::max_warps_per_thread_block,1);
dim3 blocks( util::div_ceil( max_npts, threads.x ),
util::div_ceil( max_nbf, threads.y ),
ntasks );

if(do_lapl)
zmat_mgga_vxc_kernel<true><<< blocks, threads, 0, stream >>>( ntasks, tasks_device );
else
zmat_mgga_vxc_kernel<false><<< blocks, threads, 0, stream >>>( ntasks, tasks_device );

}


template <bool need_lapl>
__global__ void mmat_mgga_vxc_kernel( size_t ntasks,
XCDeviceTask* tasks_device ) {

const int batch_idx = blockIdx.z;
if( batch_idx >= ntasks ) return;

auto& task = tasks_device[ batch_idx ];
const auto npts = task.npts;
const auto nbf = task.bfn_screening.nbe;
const auto* vtau_device = task.vtau;
const double* vlapl_device = need_lapl ? task.vlapl : nullptr;

const auto* dbasis_x_eval_device = task.dbfx;
const auto* dbasis_y_eval_device = task.dbfy;
const auto* dbasis_z_eval_device = task.dbfz;

auto* mmat_x = task.xmat_x;
auto* mmat_y = task.xmat_y;
auto* mmat_z = task.xmat_z;

const int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
const int tid_y = blockIdx.y * blockDim.y + threadIdx.y;

if( tid_x < npts and tid_y < nbf ) {

const size_t ibfoff = tid_y * npts + tid_x;
const double fact_1 = 0.25 * vtau_device[tid_x] +
(need_lapl ? vlapl_device[tid_x] : 0.0);

mmat_x[ ibfoff ] = fact_1 * dbasis_x_eval_device[ ibfoff ];
mmat_y[ ibfoff ] = fact_1 * dbasis_y_eval_device[ ibfoff ];
mmat_z[ ibfoff ] = fact_1 * dbasis_z_eval_device[ ibfoff ];
}
}

void mmat_mgga_vxc( size_t ntasks,
int32_t max_nbf,
int32_t max_npts,
XCDeviceTask* tasks_device,
bool do_lapl,
device_queue queue ) {

hipStream_t stream = queue.queue_as<util::hip_stream>() ;


dim3 threads(hip::warp_size,hip::max_warps_per_thread_block,1);
dim3 blocks( util::div_ceil( max_npts, threads.x ),
util::div_ceil( max_nbf, threads.y ),
ntasks );

if(do_lapl)
mmat_mgga_vxc_kernel<true><<< blocks, threads, 0, stream >>>( ntasks, tasks_device );
else
mmat_mgga_vxc_kernel<false><<< blocks, threads, 0, stream >>>( ntasks, tasks_device );

}


}
@@ -27,4 +27,14 @@ void eval_kern_exc_vxc_gga( const functional_type& func, size_t npts,

}

void eval_kern_exc_vxc_mgga( const functional_type& func, size_t npts,
const double* rho, const double* gamma, const double* tau, const double* lapl,
double* eps, double* vrho, double* vgamma, double* vtau, double* vlapl,
device_queue queue ) {

hipStream_t stream = queue.queue_as<util::hip_stream>();
func.eval_exc_vxc_device( npts, rho, gamma, lapl, tau, eps, vrho, vgamma, vlapl, vtau, stream );

}

}
4 changes: 2 additions & 2 deletions src/xc_integrator/local_work_driver/device/scheme1_base.cxx
@@ -441,9 +441,9 @@ void AoSScheme1Base::eval_collocation_hessian( XCDeviceData* _data ) {
eval_collocation_shell_to_task_hessian( max_l,
data->l_batched_shell_to_task.data(), aos_stack.device_tasks,
data->device_backend_->queue() );
-#endif

data->device_backend_->check_error("collocation hess" __FILE__ ": " + std::to_string(__LINE__));
+#endif
}

void AoSScheme1Base::eval_collocation_laplacian( XCDeviceData* _data ) {
@@ -461,9 +461,9 @@ void AoSScheme1Base::eval_collocation_laplacian( XCDeviceData* _data ) {
eval_collocation_shell_to_task_laplacian( max_l,
data->l_batched_shell_to_task.data(), aos_stack.device_tasks,
data->device_backend_->queue() );
-#endif

data->device_backend_->check_error("collocation lapl" __FILE__ ": " + std::to_string(__LINE__));
+#endif
}

