Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Re-engineer reduce functions on device #606

Draft
wants to merge 2 commits into
base: process-cons-gpu-dev
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 53 additions & 9 deletions src/LinAlg/VectorCudaKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,33 @@ __global__ void add_linear_damping_term_cu(int n, double* data, const double* ix
}

/** @brief y[i] = 1.0 if x[i] is positive and id[i] = 1.0, otherwise y[i] = 0 */
__global__ void is_posive_w_pattern_cu(int n, double* data, const double* vd, const double* id)
__global__ void is_posive_w_pattern_cu(int n, int* data, const double* vd, const double* id)
{
extern __shared__ int shared_sum[];
const int num_threads = blockDim.x * gridDim.x;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
int sum = 0;
for (int i = tid; i < n; i += num_threads) {
sum += (id[i] == 1.0 && vd[i] > 0.0) ? 1 : 0;
}
shared_sum[threadIdx.x] = sum;
__syncthreads();

for(int s = blockDim.x / 2; s > 0; s >>= 1) {
if(threadIdx.x < s) {
shared_sum[threadIdx.x] += shared_sum[threadIdx.x + s];
}
__syncthreads();
}

// Store the result for this block in global memory
if(threadIdx.x == 0) {
data[blockIdx.x] = shared_sum[0];
}
}

/** @brief y[i] = 1.0 if x[i] is positive and id[i] = 1.0, otherwise y[i] = 0 */
__global__ void is_posive_w_pattern_cu_back(int n, double* data, const double* vd, const double* id)
{
const int num_threads = blockDim.x * gridDim.x;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;
Expand Down Expand Up @@ -914,13 +940,30 @@ void add_linear_damping_term_kernel(int n_local,
}

/** @brief Checks if selected elements of `this` are positive */
void is_posive_w_pattern_kernel(int n_local,
double* yd,
int is_posive_w_pattern_kernel(int n_local,
// double* yd,
const double* xd,
const double* id)
{
int num_blocks = (n_local+block_size-1)/block_size;
is_posive_w_pattern_cu<<<num_blocks,block_size>>>(n_local, yd, xd, id);
// is_posive_w_pattern_cu<<<num_blocks,block_size>>>(n_local, yd, xd, id);

int *h_retval = (int*)malloc(num_blocks*sizeof(int));
int *d_retval;
cudaMalloc(&d_retval,num_blocks*sizeof(int));

is_posive_w_pattern_cu<<<num_blocks,block_size, block_size*sizeof(int)>>>(n_local, d_retval, xd, id);
cudaDeviceSynchronize();
cudaMemcpy(h_retval, d_retval, num_blocks*sizeof(int), cudaMemcpyDeviceToHost);

int sum_result = 0;
for(int i=0;i<num_blocks;i++) {
sum_result += h_retval[i];
}

cudaFree(d_retval);
free(h_retval);
return sum_result;
}

/// set value with pattern
Expand Down Expand Up @@ -1197,12 +1240,13 @@ double min_local_kernel(int n, double* d1)
int all_positive_w_pattern_kernel(int n, const double* d1, const double* id)
{
// TODO: how to avoid this temp vec?
thrust::device_vector<double> v_temp(n);
double* dv_ptr = thrust::raw_pointer_cast(v_temp.data());
// thrust::device_vector<double> v_temp(n);
// double* dv_ptr = thrust::raw_pointer_cast(v_temp.data());
// is_posive_w_pattern_kernel(n, dv_ptr, d1, id);
// return thrust::reduce(thrust::device, v_temp.begin(), v_temp.end(), (int)0, thrust::plus<int>());

hiop::cuda::is_posive_w_pattern_kernel(n, dv_ptr, d1, id);

return thrust::reduce(thrust::device, v_temp.begin(), v_temp.end(), (int)0, thrust::plus<int>());
int irev = hiop::cuda::is_posive_w_pattern_kernel(n, d1, id);
return irev;
}

/** @brief compute min(d1) for selected elements*/
Expand Down
4 changes: 2 additions & 2 deletions src/LinAlg/VectorCudaKernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,8 @@ void add_linear_damping_term_kernel(int n_local,
double ct);

/** @brief y[i] = 1.0 if x[i] is positive and id[i] = 1.0, otherwise y[i] = 0 */
void is_posive_w_pattern_kernel(int n_local,
double* yd,
int is_posive_w_pattern_kernel(int n_local,
// double* yd,
const double* xd,
const double* id);

Expand Down