Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Process variable bounds on device #604

Draft
wants to merge 5 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 164 additions & 1 deletion src/LinAlg/VectorCudaKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -640,6 +640,89 @@ __global__ void copyToStartingAt_w_pattern_cu(int n_src,
}
}

/** @brief process variable bounds */
__global__ void process_bounds_cu(int n,
const double* xl,
const double* xu,
double* ixl,
double* ixu,
int* bnds_low,
int* bnds_upp,
int* bnds_lu,
int* fixed_vars,
double fixed_var_tol)
{
const int num_threads = blockDim.x * gridDim.x;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;

for (int i = tid; i < n; i += num_threads) {
// preemptive loop to reduce number of iterations?
bnds_low[i] = 0;
bnds_upp[i] = 0;
bnds_lu[i] = 0;
fixed_vars[i] = 0;

if(xl[i] > -1e20) {
ixl[i] = 1.;
bnds_low[i] = 1;
if(xu[i] < 1e20) {
bnds_lu[i] = 1;
}
} else {
ixl[i] = 0.;
}

if(xu[i] < 1e20) {
ixu[i] = 1.;
bnds_upp[i] = 1;
} else {
ixu[i] = 0.;
}

if(xu[i] < 1e20 &&
fabs(xl[i]-xu[i]) <= fixed_var_tol*fmax(1.,std::fabs(xu[i]))) {
fixed_vars[i] = 1;
}
}
}

/** @brief relax variable bounds */
__global__ void relax_bounds_cu(int n,
double* xla,
double* xua,
double fixed_var_tol,
double fixed_var_perturb)
{
const int num_threads = blockDim.x * gridDim.x;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;

for (int i = tid; i < n; i += num_threads) {
double xuabs = fabs(xua[i]);
if(fabs(xua[i]-xla[i]) <= fixed_var_tol*fmax(1.,xuabs)) {
xua[i] += fixed_var_perturb * std::fmax(1.,xuabs);
xla[i] -= fixed_var_perturb * std::fmax(1.,xuabs);
}
}
}

/** @brief set d_ptr[i] = 1 if d1[i] == d2[i], otherwirse 0 */
__global__ void set_if_match_cu(int n,
int* d_ptr,
const double* d1,
const double* d2)
{
const int num_threads = blockDim.x * gridDim.x;
const int tid = blockIdx.x * blockDim.x + threadIdx.x;

for (int i = tid; i < n; i += num_threads) {
if(d1[i]==d2[i]) {
d_ptr[i] = 1;
} else {
d_ptr[i] = 0;
}
}
}

namespace hiop
{
namespace cuda
Expand Down Expand Up @@ -1112,7 +1195,6 @@ double min_w_pattern_kernel(int n, const double* d1, const double* id, double ma

thrust::device_ptr<double> ret_dev_ptr = thrust::min_element(thrust::device, dv_ptr, dv_ptr+n);

// TODO: how to return double from device to host?
double *ret_host = new double[1];
double *ret_ptr = thrust::raw_pointer_cast(ret_dev_ptr);
cudaError_t cuerr = cudaMemcpy(ret_host, ret_ptr, (1)*sizeof(double), cudaMemcpyDeviceToHost);
Expand Down Expand Up @@ -1276,7 +1358,88 @@ void copyToStartingAt_w_pattern_kernel(int n_src,
dd);
}

int num_match_local_kernel(int n, const double* d1, const double* d2)
{
int num_blocks = (n+block_size-1)/block_size;

// TODO: how to avoid this temp vec?
thrust::device_ptr<int> dv_ptr = thrust::device_malloc(n*sizeof(int));
int* d_ptr = thrust::raw_pointer_cast(dv_ptr);

// set d_ptr[i] = 1 if d1[i] == d2[i], otherwirse 0
set_if_match_cu<<<num_blocks,block_size>>>(n, d_ptr, d1, d2);

int rval = thrust::reduce(thrust::device, d_ptr, d_ptr+n, 0, thrust::plus<int>());

thrust::device_free(dv_ptr);

return rval;
}

/** @brief process variable bounds */
void process_bounds_local_kernel(int n_local,
const double* xl,
const double* xu,
double* ixl,
double* ixu,
int& n_bnds_low,
int& n_bnds_upp,
int& n_bnds_lu,
int& n_fixed_vars,
double fixed_var_tol)
{
int num_blocks = (n_local+block_size-1)/block_size;

thrust::device_ptr<int> bnds_low_d_ptr = thrust::device_malloc(n_local*sizeof(int));
int* bnds_low_r_ptr = thrust::raw_pointer_cast(bnds_low_d_ptr);

thrust::device_ptr<int> bnds_upp_d_ptr = thrust::device_malloc(n_local*sizeof(int));
int* bnds_upp_r_ptr = thrust::raw_pointer_cast(bnds_upp_d_ptr);

thrust::device_ptr<int> bnds_lu_d_ptr = thrust::device_malloc(n_local*sizeof(int));
int* bnds_lu_r_ptr = thrust::raw_pointer_cast(bnds_lu_d_ptr);

thrust::device_ptr<int> n_fixed_vars_d_ptr = thrust::device_malloc(n_local*sizeof(int));
int* n_fixed_vars_r_ptr = thrust::raw_pointer_cast(n_fixed_vars_d_ptr);

// set values
process_bounds_cu<<<num_blocks,block_size>>>(n_local,
xl,
xu,
ixl,
ixu,
bnds_low_r_ptr,
bnds_upp_r_ptr,
bnds_lu_r_ptr,
n_fixed_vars_r_ptr,
fixed_var_tol);

// compute sum
n_bnds_low = thrust::reduce(thrust::device, bnds_low_d_ptr, bnds_low_d_ptr+n_local, 0.0, thrust::plus<int>());
n_bnds_upp = thrust::reduce(thrust::device, bnds_upp_d_ptr, bnds_upp_d_ptr+n_local, 0.0, thrust::plus<int>());
n_bnds_lu = thrust::reduce(thrust::device, bnds_lu_d_ptr, bnds_lu_d_ptr+n_local, 0.0, thrust::plus<int>());
n_fixed_vars = thrust::reduce(thrust::device, n_fixed_vars_d_ptr, n_fixed_vars_d_ptr+n_local, 0.0, thrust::plus<int>());

thrust::device_free(bnds_low_d_ptr);
thrust::device_free(bnds_upp_d_ptr);
thrust::device_free(bnds_lu_d_ptr);
thrust::device_free(n_fixed_vars_d_ptr);
}

/** @brief relax variable bounds */
void relax_bounds_kernel(int n_local,
double* xl,
double* xu,
double fixed_var_tol,
double fixed_var_perturb)
{
int num_blocks = (n_local+block_size-1)/block_size;
relax_bounds_cu<<<num_blocks,block_size>>>(n_local,
xl,
xu,
fixed_var_tol,
fixed_var_perturb);
}

/// for hiopVectorIntCuda
/**
Expand Down
22 changes: 22 additions & 0 deletions src/LinAlg/VectorCudaKernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,28 @@ void copyToStartingAt_w_pattern_kernel(int n_src,
double *vd,
const double* dd);

/// @brief return the numbers of identical elements between two vectors
int num_match_local_kernel(int n, const double* d1, const double* d2);

/** @brief process variable bounds */
void process_bounds_local_kernel(int n_local,
const double* xl,
const double* xu,
double* ixl,
double* ixu,
int& n_bnds_low,
int& n_bnds_upp,
int& n_bnds_lu,
int& n_fixed_vars,
double fixed_var_tol);

/** @brief relax variable bounds */
void relax_bounds_kernel(int n_local,
double* xl,
double* xu,
double fixed_var_tol,
double fixed_var_perturb);

/// for hiopVectorIntCuda
/**
* @brief Set the vector entries to be a linear space of starting at i0 containing evenly
Expand Down
Loading