From a374af3bd3b8fa0c7961d0adf17b88e458d31ba7 Mon Sep 17 00:00:00 2001
From: alserene
Date: Fri, 8 Nov 2024 17:12:16 +1100
Subject: [PATCH] First draft of SpinTemp kernel.

---
 src/py21cmfast/src/SpinTemperatureBox.c  |  75 ++++---
 src/py21cmfast/src/SpinTemperatureBox.cu | 249 +++++++++++++++++++++++
 src/py21cmfast/src/SpinTemperatureBox.h  |   8 +
 3 files changed, 299 insertions(+), 33 deletions(-)
 create mode 100644 src/py21cmfast/src/SpinTemperatureBox.cu

diff --git a/src/py21cmfast/src/SpinTemperatureBox.c b/src/py21cmfast/src/SpinTemperatureBox.c
index 0affacb3..7ffd6fef 100644
--- a/src/py21cmfast/src/SpinTemperatureBox.c
+++ b/src/py21cmfast/src/SpinTemperatureBox.c
@@ -935,42 +935,51 @@ void calculate_sfrd_from_grid(int R_ct, float *dens_R_grid, float *Mcrit_R_grid,
         }
     }
 
-    #pragma omp parallel num_threads(user_params_global->N_THREADS)
-    {
-        unsigned long long int box_ct;
-        double curr_dens;
-        double curr_mcrit = 0.;
-        double fcoll, dfcoll;
-        double fcoll_MINI=0;
-
-        #pragma omp for reduction(+:ave_sfrd_buf,ave_sfrd_buf_mini)
-        for (box_ct=0; box_ct<HII_TOT_NUM_PIXELS; box_ct++){
-            curr_dens = dens_R_grid[box_ct]*zpp_growth[R_ct];
-            if(flag_options_global->USE_MINI_HALOS)
-                curr_mcrit = Mcrit_R_grid[box_ct];
-
-            if(flag_options_global->USE_MASS_DEPENDENT_ZETA){
-                fcoll = EvaluateSFRD_Conditional(curr_dens,zpp_growth[R_ct],M_min_R[R_ct],M_max_R[R_ct],M_max_R[R_ct],sigma_max[R_ct],
-                                                 Mcrit_atom_interp_table[R_ct],Mlim_Fstar_g);
-                sfrd_grid[box_ct] = (1.+curr_dens)*fcoll;
-
-                if (flag_options_global->USE_MINI_HALOS){
-                    fcoll_MINI = EvaluateSFRD_Conditional_MINI(curr_dens,curr_mcrit,zpp_growth[R_ct],M_min_R[R_ct],M_max_R[R_ct],M_max_R[R_ct],
-                                                               sigma_max[R_ct],Mcrit_atom_interp_table[R_ct],Mlim_Fstar_MINI_g);
-                    sfrd_grid_mini[box_ct] = (1.+curr_dens)*fcoll_MINI;
-                }
-            }
-            else{
-                fcoll = EvaluateFcoll_delta(curr_dens,zpp_growth[R_ct],sigma_min[R_ct],sigma_max[R_ct]);
-                dfcoll = EvaluatedFcolldz(curr_dens,zpp_for_evolve_list[R_ct],sigma_min[R_ct],sigma_max[R_ct]);
-                sfrd_grid[box_ct] = (1.+curr_dens)*dfcoll;
-            }
-            ave_sfrd_buf += fcoll;
-            ave_sfrd_buf_mini += fcoll_MINI;
-        }
-    }
-    *ave_sfrd = ave_sfrd_buf/HII_TOT_NUM_PIXELS;
-    *ave_sfrd_mini = ave_sfrd_buf_mini/HII_TOT_NUM_PIXELS;
+    // ------------------------------------------------------------------------------------------------
+    // Run the GPU reduction when the option combination supports it; the
+    // leading `true` is a stand-in for a proper user-facing GPU flag.
+    if (true && flag_options_global->USE_MASS_DEPENDENT_ZETA && user_params_global->USE_INTERPOLATION_TABLES && !flag_options_global->USE_MINI_HALOS) {
+        calculate_sfrd_from_grid_gpu(&SFRD_conditional_table, dens_R_grid, zpp_growth, R_ct, sfrd_grid, &ave_sfrd_buf, HII_TOT_NUM_PIXELS);
+    } else {
+        // Otherwise, run the OpenMP (CPU) reduction
+        #pragma omp parallel num_threads(user_params_global->N_THREADS)
+        {
+            unsigned long long int box_ct;
+            double curr_dens;
+            double curr_mcrit = 0.;
+            double fcoll, dfcoll;
+            double fcoll_MINI=0;
+
+            #pragma omp for reduction(+:ave_sfrd_buf,ave_sfrd_buf_mini)
+            for (box_ct=0; box_ct<HII_TOT_NUM_PIXELS; box_ct++){
+                curr_dens = dens_R_grid[box_ct]*zpp_growth[R_ct];
+                if(flag_options_global->USE_MINI_HALOS)
+                    curr_mcrit = Mcrit_R_grid[box_ct];
+
+                if(flag_options_global->USE_MASS_DEPENDENT_ZETA){
+                    fcoll = EvaluateSFRD_Conditional(curr_dens,zpp_growth[R_ct],M_min_R[R_ct],M_max_R[R_ct],M_max_R[R_ct],sigma_max[R_ct],
+                                                     Mcrit_atom_interp_table[R_ct],Mlim_Fstar_g);
+                    sfrd_grid[box_ct] = (1.+curr_dens)*fcoll;
+
+                    if (flag_options_global->USE_MINI_HALOS){
+                        fcoll_MINI = EvaluateSFRD_Conditional_MINI(curr_dens,curr_mcrit,zpp_growth[R_ct],M_min_R[R_ct],M_max_R[R_ct],M_max_R[R_ct],
+                                                                   sigma_max[R_ct],Mcrit_atom_interp_table[R_ct],Mlim_Fstar_MINI_g);
+                        sfrd_grid_mini[box_ct] = (1.+curr_dens)*fcoll_MINI;
+                    }
+                }
+                else{
+                    fcoll = EvaluateFcoll_delta(curr_dens,zpp_growth[R_ct],sigma_min[R_ct],sigma_max[R_ct]);
+                    dfcoll = EvaluatedFcolldz(curr_dens,zpp_for_evolve_list[R_ct],sigma_min[R_ct],sigma_max[R_ct]);
+                    sfrd_grid[box_ct] = (1.+curr_dens)*dfcoll;
+                }
+                ave_sfrd_buf += fcoll;
+                ave_sfrd_buf_mini += fcoll_MINI;
+            }
+        }
+    }
+    // ------------------------------------------------------------------------------------------------
+
+    *ave_sfrd = ave_sfrd_buf / HII_TOT_NUM_PIXELS;
+    *ave_sfrd_mini = ave_sfrd_buf_mini / HII_TOT_NUM_PIXELS;
 
     //These functions check for allocation
     free_conditional_tables();
diff --git a/src/py21cmfast/src/SpinTemperatureBox.cu b/src/py21cmfast/src/SpinTemperatureBox.cu
new file mode 100644
index 00000000..ebcc2cd5
--- /dev/null
+++ b/src/py21cmfast/src/SpinTemperatureBox.cu
@@ -0,0 +1,249 @@
+// Most of the following includes can likely be pruned once the draft settles.
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+// We use thrust for the final reduction
+#include <thrust/device_ptr.h>
+#include <thrust/reduce.h>
+#include <thrust/functional.h> // thrust::plus
+
+#include "cexcept.h"
+#include "exceptions.h"
+#include "logger.h"
+#include "Constants.h"
+#include "indexing.h"
+#include "InputParameters.h"
+#include "OutputStructs.h"
+#include "heating_helper_progs.h"
+#include "elec_interp.h"
+#include "interp_tables.h"
+#include "debugging.h"
+#include "cosmology.h"
+#include "hmf.h"
+#include "dft.h"
+#include "filtering.h"
+#include "thermochem.h"
+
+#include "SpinTemperatureBox.h"
+
+// Device-side twin of EvaluateRGTable1D_f: linear interpolation on a
+// regularly gridded table.
+__device__ inline double EvaluateRGTable1D_f_gpu(double x, RGTable1D_f *table) {
+
+    double x_min = table->x_min;
+    double x_width = table->x_width;
+
+    // Index of the table bin immediately below x
+    int idx = (int)floor((x - x_min) / x_width);
+
+    double table_val = x_min + x_width * (double)idx;
+    double interp_point = (x - table_val) / x_width;
+
+    return table->y_arr[idx] * (1 - interp_point) + table->y_arr[idx + 1] * interp_point;
+}
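+
+// Worked example of the lookup above, with made-up values for intuition
+// (not taken from a real table): for x_min = -1.0, x_width = 0.5,
+// y_arr = {0, 1, 4} and a query at x = -0.6, we get idx = floor(0.8) = 0,
+// table_val = -1.0 and interp_point = 0.8, so the lookup returns
+// 0*(1 - 0.8) + 1*0.8 = 0.8.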
+
+template <unsigned int threadsPerBlock>
+__device__ void warp_reduce(volatile double *sdata, unsigned int tid) {
+    // Final fan-in within a single warp: below 32 active threads no
+    // __syncthreads() is needed on pre-Volta GPUs (warps execute in
+    // lockstep, and `volatile` prevents stale register caching).
+    // NOTE: Volta and newer do not guarantee implicit warp synchrony,
+    // so a __syncwarp()-based version would be safer there.
+    if (threadsPerBlock >= 64) sdata[tid] += sdata[tid + 32];
+    if (threadsPerBlock >= 32) sdata[tid] += sdata[tid + 16];
+    if (threadsPerBlock >= 16) sdata[tid] += sdata[tid + 8];
+    if (threadsPerBlock >= 8) sdata[tid] += sdata[tid + 4];
+    if (threadsPerBlock >= 4) sdata[tid] += sdata[tid + 2];
+    if (threadsPerBlock >= 2) sdata[tid] += sdata[tid + 1];
+}
+
+template <unsigned int threadsPerBlock>
+__global__ void compute_and_reduce(
+        RGTable1D_f *SFRD_conditional_table, // interpolation table (device-resident copy)
+        float *dens_R_grid,                  // input density grid
+        double zpp_growth_R_ct,              // growth factor at this filter radius
+        float *sfrd_grid,                    // star formation rate density grid to be updated
+        double *ave_sfrd_buf,                // output buffer of per-block partial sums (one entry per block)
+        unsigned int num_pixels              // length of the input grids
+    ) {
+
+    // An array to store intermediate summations,
+    // shared between all threads in the block
+    extern __shared__ double sdata[];
+
+    unsigned int tid = threadIdx.x;                            // thread within current block
+    unsigned int i = blockIdx.x * (threadsPerBlock * 2) + tid; // index into the input data
+    unsigned int gridSize = threadsPerBlock * 2 * gridDim.x;   // grid stride: each thread folds two elements per pass
+
+    sdata[tid] = 0;
+
+    // Grid-stride loop: accumulate pairs of collapse fractions
+    // and update the star formation rate density grid.
+    double curr_dens_i, curr_dens_j;
+    double fcoll_i, fcoll_j;
+
+    while (i < num_pixels) {
+        // Current density = density-grid value * redshift-scaled growth factor
+        curr_dens_i = dens_R_grid[i] * zpp_growth_R_ct;
+
+        // Fraction of mass collapsed into stars/other structure
+        // (the table stores log values, hence the exp)
+        fcoll_i = exp(EvaluateRGTable1D_f_gpu(curr_dens_i, SFRD_conditional_table));
+
+        // Update the shared buffer and the star formation rate density grid
+        sdata[tid] += fcoll_i;
+        sfrd_grid[i] = (1. + curr_dens_i) * fcoll_i;
+
+        // Guard the paired element: num_pixels need not be a multiple of 2*threadsPerBlock
+        if (i + threadsPerBlock < num_pixels) {
+            curr_dens_j = dens_R_grid[i + threadsPerBlock] * zpp_growth_R_ct;
+            fcoll_j = exp(EvaluateRGTable1D_f_gpu(curr_dens_j, SFRD_conditional_table));
+            sdata[tid] += fcoll_j;
+            sfrd_grid[i + threadsPerBlock] = (1. + curr_dens_j) * fcoll_j;
+        }
+
+        i += gridSize;
+    }
+    __syncthreads();
+
+    // Tree reduction in shared memory: halve the active threads and sync each step
+    if (threadsPerBlock >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
+    if (threadsPerBlock >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
+    if (threadsPerBlock >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); }
+
+    // Final warp-level reduction in a separate device function
+    if (tid < 32) warp_reduce<threadsPerBlock>(sdata, tid);
+
+    // The first thread of each block writes out the block total
+    if (tid == 0) ave_sfrd_buf[blockIdx.x] = sdata[0];
+}
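+
+// Illustration of the fold pattern above with a hypothetical 8-element
+// shared buffer: {1,2,3,4,5,6,7,8} -> (+4) {6,8,10,12} -> (+2) {16,20}
+// -> (+1) {36} = 1+2+...+8; thread 0 then writes the 36 into
+// ave_sfrd_buf[blockIdx.x].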
+
+void calculate_sfrd_from_grid_gpu(
+        RGTable1D_f *SFRD_conditional_table, // host-side interpolation table
+        float *dens_R_grid,                  // input density grid
+        double *zpp_growth,                  // growth factors per filter step
+        int R_ct,                            // current filter-radius index
+        float *sfrd_grid,                    // star formation rate density grid to be updated
+        double *ave_sfrd_buf,                // final output (caller divides by HII_TOT_NUM_PIXELS)
+        unsigned int num_pixels              // length of the input grids
+) {
+    // Growth factor for this filter radius
+    double zpp_growth_R_ct = zpp_growth[R_ct];
+
+    // The kernel only needs a few fields of the SFRD_conditional_table struct,
+    // so copy just those to the device: allocate y_arr in device memory, then
+    // upload a shadow struct whose y_arr pointer refers to that device buffer.
+    // (x_min and x_width could alternatively be passed as kernel arguments.)
+    float *d_y_arr;
+    cudaMalloc(&d_y_arr, sizeof(float) * SFRD_conditional_table->n_bin);
+    cudaMemcpy(d_y_arr, SFRD_conditional_table->y_arr, sizeof(float) * SFRD_conditional_table->n_bin, cudaMemcpyHostToDevice);
+
+    RGTable1D_f h_table = *SFRD_conditional_table; // copies x_min, x_width, n_bin
+    h_table.y_arr = d_y_arr;
+
+    RGTable1D_f *d_table;
+    cudaMalloc(&d_table, sizeof(RGTable1D_f));
+    cudaMemcpy(d_table, &h_table, sizeof(RGTable1D_f), cudaMemcpyHostToDevice);
+
+    // Allocate & populate device memory for the grid inputs/outputs.
+    float *d_dens_R_grid, *d_sfrd_grid;
+    cudaMalloc(&d_dens_R_grid, sizeof(float) * num_pixels);
+    cudaMalloc(&d_sfrd_grid, sizeof(float) * num_pixels);
+    cudaMemcpy(d_dens_R_grid, dens_R_grid, sizeof(float) * num_pixels, cudaMemcpyHostToDevice);
+    cudaMemcpy(d_sfrd_grid, sfrd_grid, sizeof(float) * num_pixels, cudaMemcpyHostToDevice);
+
+    cudaError_t err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        LOG_ERROR("CUDA error: %s", cudaGetErrorString(err));
+        Throw(CUDAError);
+    }
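+
+    // The check above recurs after each CUDA call; one possible refactor
+    // (a sketch only, not used in this draft; CHECK_CUDA is a hypothetical name):
+    //
+    //   #define CHECK_CUDA(call) do { cudaError_t e_ = (call); \
+    //       if (e_ != cudaSuccess) { \
+    //           LOG_ERROR("CUDA error: %s", cudaGetErrorString(e_)); \
+    //           Throw(CUDAError); \
+    //       } } while (0)
+    //
+    //   CHECK_CUDA(cudaMalloc(&d_dens_R_grid, sizeof(float) * num_pixels));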
+
+    // Get max threads/block for device 0
+    int maxThreadsPerBlock;
+    cudaDeviceGetAttribute(&maxThreadsPerBlock, cudaDevAttrMaxThreadsPerBlock, 0);
+
+    // Set threads/block based on the device maximum; the warp reduction
+    // needs at least one full warp, so 32 is the floor.
+    int threadsPerBlock;
+    if (maxThreadsPerBlock >= 512) {
+        threadsPerBlock = 512;
+    } else if (maxThreadsPerBlock >= 256) {
+        threadsPerBlock = 256;
+    } else if (maxThreadsPerBlock >= 128) {
+        threadsPerBlock = 128;
+    } else if (maxThreadsPerBlock >= 64) {
+        threadsPerBlock = 64;
+    } else {
+        threadsPerBlock = 32;
+    }
+
+    // Each thread folds two elements per grid stride, so one block covers
+    // 2 * threadsPerBlock pixels and produces one partial sum.
+    int numBlocks = (num_pixels + threadsPerBlock * 2 - 1) / (threadsPerBlock * 2); // e.g. ~91M pixels at 256 threads/block -> ~178000 blocks
+    int smemSize = threadsPerBlock * sizeof(double); // shared memory per block
+
+    // Allocate device memory for the partial-sum buffer and zero it
+    double *d_ave_sfrd_buf;
+    unsigned int buffer_length = numBlocks; // one entry per block
+    cudaMalloc(&d_ave_sfrd_buf, sizeof(double) * buffer_length);
+    cudaMemset(d_ave_sfrd_buf, 0, sizeof(double) * buffer_length); // fill with byte = 0
+
+    // Invoke the kernel; the block size must be a compile-time template
+    // parameter, hence the switch.
+    switch (threadsPerBlock) {
+        case 512:
+            compute_and_reduce<512><<< numBlocks, threadsPerBlock, smemSize >>>(d_table, d_dens_R_grid, zpp_growth_R_ct, d_sfrd_grid, d_ave_sfrd_buf, num_pixels);
+            break;
+        case 256:
+            compute_and_reduce<256><<< numBlocks, threadsPerBlock, smemSize >>>(d_table, d_dens_R_grid, zpp_growth_R_ct, d_sfrd_grid, d_ave_sfrd_buf, num_pixels);
+            break;
+        case 128:
+            compute_and_reduce<128><<< numBlocks, threadsPerBlock, smemSize >>>(d_table, d_dens_R_grid, zpp_growth_R_ct, d_sfrd_grid, d_ave_sfrd_buf, num_pixels);
+            break;
+        case 64:
+            compute_and_reduce<64><<< numBlocks, threadsPerBlock, smemSize >>>(d_table, d_dens_R_grid, zpp_growth_R_ct, d_sfrd_grid, d_ave_sfrd_buf, num_pixels);
+            break;
+        case 32:
+            compute_and_reduce<32><<< numBlocks, threadsPerBlock, smemSize >>>(d_table, d_dens_R_grid, zpp_growth_R_ct, d_sfrd_grid, d_ave_sfrd_buf, num_pixels);
+            break;
+        default:
+            // LOG_WARNING("Unsupported block size; defaulting to 256.");
+            compute_and_reduce<256><<< numBlocks, 256, 256 * sizeof(double) >>>(d_table, d_dens_R_grid, zpp_growth_R_ct, d_sfrd_grid, d_ave_sfrd_buf, num_pixels);
+    }
+
+    // Only use during development: a full device sync stalls the host.
+    err = cudaDeviceSynchronize();
+    CATCH_CUDA_ERROR(err);
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        LOG_ERROR("Kernel launch error: %s", cudaGetErrorString(err));
+        Throw(CUDAError);
+    }
+
+    // Use thrust to reduce the per-block partial sums to a single value:
+    // wrapping the raw device pointer in a thrust::device_ptr makes the
+    // reduction run on the device.
+    thrust::device_ptr<double> d_ave_sfrd_buf_ptr(d_ave_sfrd_buf);
+    *ave_sfrd_buf = thrust::reduce(d_ave_sfrd_buf_ptr, d_ave_sfrd_buf_ptr + buffer_length, 0., thrust::plus<double>());
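+
+    // Equivalent alternative (sketch): thrust can reduce the raw device
+    // pointer directly via an execution policy, skipping the wrapper:
+    //
+    //   #include <thrust/execution_policy.h>
+    //   *ave_sfrd_buf = thrust::reduce(thrust::device, d_ave_sfrd_buf,
+    //                                  d_ave_sfrd_buf + buffer_length,
+    //                                  0., thrust::plus<double>());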
+
+    // Copy the updated SFRD grid back from device to host.
+    err = cudaMemcpy(sfrd_grid, d_sfrd_grid, sizeof(float) * num_pixels, cudaMemcpyDeviceToHost);
+    if (err != cudaSuccess) {
+        LOG_ERROR("CUDA error: %s", cudaGetErrorString(err));
+        Throw(CUDAError);
+    }
+
+    // Deallocate device memory.
+    cudaFree(d_y_arr);
+    cudaFree(d_table);
+    cudaFree(d_dens_R_grid);
+    cudaFree(d_sfrd_grid);
+    cudaFree(d_ave_sfrd_buf);
+
+    err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        LOG_ERROR("CUDA error: %s", cudaGetErrorString(err));
+        Throw(CUDAError);
+    }
+}
diff --git a/src/py21cmfast/src/SpinTemperatureBox.h b/src/py21cmfast/src/SpinTemperatureBox.h
index 2ace07e9..e90b87ac 100644
--- a/src/py21cmfast/src/SpinTemperatureBox.h
+++ b/src/py21cmfast/src/SpinTemperatureBox.h
@@ -1,6 +1,8 @@
 #ifndef _SPINTEMP_H
 #define _SPINTEMP_H
 
+#include "interpolation.h" // RGTable1D_f (adjust if the table structs live elsewhere)
+
 #include "InputParameters.h"
 #include "OutputStructs.h"
 
@@ -17,6 +19,12 @@ int UpdateXraySourceBox(UserParams *user_params, CosmoParams *cosmo_params,
                         AstroParams *astro_params, FlagOptions *flag_options, HaloBox *halobox,
                         double R_inner, double R_outer, int R_ct, XraySourceBox *source_box);
 
+void calculate_sfrd_from_grid(int R_ct, float *dens_R_grid, float *Mcrit_R_grid, float *sfrd_grid,
+                              float *sfrd_grid_mini, double *ave_sfrd, double *ave_sfrd_mini);
+
+void calculate_sfrd_from_grid_gpu(RGTable1D_f *SFRD_conditional_table, float *dens_R_grid, double *zpp_growth,
+                                  int R_ct, float *sfrd_grid, double *ave_sfrd_buf, unsigned int num_pixels);
+
 #ifdef __cplusplus
 }
 #endif