diff --git a/tools/cgeist/.gitignore b/tools/cgeist/.gitignore index 46636c589917..66e9f7f994dd 100644 --- a/tools/cgeist/.gitignore +++ b/tools/cgeist/.gitignore @@ -1 +1,2 @@ *.time *.exec1 *.out1 +*.execm diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/2mm/2mm.cu b/tools/cgeist/Test/CUDA/polybench-cuda/2mm/2mm.cu new file mode 100644 index 000000000000..a07d15fe7948 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/2mm/2mm.cu @@ -0,0 +1,169 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 10 10 +// clang-format on +/** + * 2mm.c: This file is part of the PolyBench/C 3.2 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +__global__ void kernel_A_mul_B(int ni, int nj, int nk, int nl, double alpha, + double beta, double *tmp, double *A, double *B, + double *C, double *D) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k; + double dot = 0.0; + + if (i < ni && j < nj) { + for (k = 0; k < nk; k++) + dot += alpha * A[i * nk + k] * B[k * nj + j]; + tmp[i * nj + j] = dot; + } +} + +__global__ void kernel_D_plus_tmp_mul_C(int ni, int nj, int nk, int nl, + double alpha, double beta, double *tmp, + double *A, double *B, double *C, + double *D) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int l = blockDim.y * blockIdx.y + threadIdx.y; + int j; + double dot = 0.0; + + if (i < ni && l < nl) { + // D[i * nj + l] *= beta; + dot = D[i * nj + l] * beta; + + for (j = 0; j < nj; j++) + // D[i * nl + l] += tmp[i * nj + j] * C[j * nl + l]; + dot += tmp[i * nj + j] * C[j * nl + l]; + D[i * nl + l] = dot; + } +} + +short num_blocks(short num, short factor) { + return (num + factor - 1) / factor; +} + +static void kernel(int ni, int nj, int nk, int nl, double alpha, double beta, + double *tmp, double *A, double *B, double *C, double *D) { + + unsigned threadsPerBlock = 256; + dim3 block(threadsPerBlock / 32, 32, 1); + + { + dim3 grid(num_blocks(ni, block.x), num_blocks(nj, block.y), 1); + kernel_A_mul_B<<>>(ni, nj, nk, nl, alpha, beta, tmp, A, B, C, + D); + } + + { + dim3 grid(num_blocks(ni, block.x), num_blocks(nl, block.y), 1); + kernel_D_plus_tmp_mul_C<<>>(ni, nj, nk, nl, alpha, beta, tmp, + A, B, C, D); + } +} + +static void print_array(int ni, int nl, double *D) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < nl; j++) { + fprintf(stderr, "%0.2lf ", D[i * ni + j]); + if ((i * ni + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +static void init_array(int ni, int nj, int nk, int nl, double *A, double *B, + double *C, double *D, double *tmp) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < nk; j++) + A[i * ni + j] = ((double)i * j) / ni; + for (i = 0; i < nk; i++) + for (j = 0; j < nj; j++) + B[i * nk + j] = ((double)i * (j + 1)) / nj; + for (i = 0; i < nl; i++) + for (j = 0; j < nj; j++) + C[i * nl + j] = ((double)i * (j + 3)) / nl; + for (i = 0; i < ni; i++) + for (j = 0; j < nl; j++) + D[i * ni + j] = ((double)i * (j + 2)) / nk; + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) + tmp[i * ni + j] = 0; +} + +int main(int argc, char **argv) { + int dump_code = atoi(argv[1]); + long ni = atoi(argv[2]); + long nj = atoi(argv[3]); + long nk = atoi(argv[4]); + long nl = atoi(argv[5]); + + double alpha = 32412; + double beta = 2123; + double *A = (double *)malloc(ni * nk * sizeof(double)); + double *B = (double *)malloc(nk * nj 
* sizeof(double)); + double *C = (double *)malloc(nl * nj * sizeof(double)); + double *D = (double *)malloc(ni * nl * sizeof(double)); + double *tmp = (double *)malloc(ni * nj * sizeof(double)); + + init_array(ni, nj, nk, nl, A, B, C, D, tmp); + + double *dev_A; + double *dev_B; + double *dev_C; + double *dev_D; + double *dev_tmp; + double *dev_alpha; + double *dev_beta; + cudaMalloc(&dev_A, ni * nk * sizeof(double)); + cudaMalloc(&dev_B, nk * nj * sizeof(double)); + cudaMalloc(&dev_C, nl * nj * sizeof(double)); + cudaMalloc(&dev_D, ni * nl * sizeof(double)); + cudaMalloc(&dev_tmp, ni * nj * sizeof(double)); + cudaMemcpy(dev_A, A, ni * nk * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_B, B, nk * nj * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_C, C, nl * nj * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_D, D, ni * nl * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_tmp, tmp, ni * nj * sizeof(double), cudaMemcpyHostToDevice); + + kernel(ni, nj, nk, nl, alpha, beta, dev_tmp, dev_A, dev_B, dev_C, dev_D); + + cudaMemcpy(D, dev_D, ni * nl * sizeof(double), cudaMemcpyDeviceToHost); + cudaFree((void *)dev_A); + cudaFree((void *)dev_B); + cudaFree((void *)dev_C); + cudaFree((void *)dev_D); + cudaFree((void *)dev_tmp); + cudaFree((void *)dev_alpha); + cudaFree((void *)dev_beta); + + if (dump_code == 1) + print_array(ni, nk, D); + + free((void *)tmp); + ; + free((void *)A); + ; + free((void *)B); + ; + free((void *)C); + ; + free((void *)D); + ; + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/3mm/3mm.cu b/tools/cgeist/Test/CUDA/polybench-cuda/3mm/3mm.cu new file mode 100644 index 000000000000..56db77171210 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/3mm/3mm.cu @@ -0,0 +1,161 @@ +// clang-format off +// COM: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm +// RUN: true +// clang-format on +/** + * 3mm.c: This file is part of the PolyBench/C 3.2 test suite. 
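// Illustration only, not part of the test sources above: the launch
// configuration used throughout these ports splits threadsPerBlock = 256 into
// a (256 / 32) x 32 = 8 x 32 thread block, and num_blocks() is a ceiling
// division, so the grid always covers the whole iteration space and the
// `if (i < ni && j < nj)` guards in each kernel mask off the excess threads.
// For the 2mm RUN line above (ni = nj = nk = nl = 10) this works out to:
//
//   dim3 block(256 / 32, 32, 1);         // 8 x 32 = 256 threads per block
//   dim3 grid(num_blocks(10, block.x),   // ceil(10 / 8)  = 2
//             num_blocks(10, block.y),   // ceil(10 / 32) = 1
//             1);                        // 2 x 1 blocks cover the 10 x 10 space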
+ * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +__global__ void kernel_A_mul_B(int ni, int nj, int nk, double *C, double *A, + double *B) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + double dot = 0.0; + + if (i < ni && j < nj) { + for (int k = 0; k < nk; k++) + // C[i * nj + j] += A[i * nk + k] * B[k * nj + j]; + dot += A[i * nk + k] * B[k * nj + j]; + C[i * nj + j] = dot; + } +} + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +static void init_array(int ni, int nj, int nk, int nl, int nm, double *A, + double *B, double *C, double *D, double *E, double *F, + double *G) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < nk; j++) + A[i * ni + j] = ((double)i * j) / ni; + for (i = 0; i < nk; i++) + for (j = 0; j < nj; j++) + B[i * nk + j] = ((double)i * (j + 1)) / nj; + for (i = 0; i < nj; i++) + for (j = 0; j < nm; j++) + C[i * nj + j] = ((double)i * (j + 3)) / nl; + for (i = 0; i < nm; i++) + for (j = 0; j < nl; j++) + D[i * nm + j] = ((double)i * (j + 2)) / nk; + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) + E[i * ni + j] = 0; + for (i = 0; i < nj; i++) + for (j = 0; j < nl; j++) + F[i * nj + j] = 0; + for (i = 0; i < ni; i++) + for (j = 0; j < nl; j++) + G[i * ni + j] = 0; +} + +static void print_array(int ni, int nl, double *G) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < nl; j++) { + fprintf(stderr, "%0.2lf ", G[i * ni + j]); + if ((i * ni + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +static void kernel(int ni, int nj, int nk, int nl, int nm, double *E, double *A, + double *B, double *F, double *C, double *D, double *G) { + unsigned threadsPerBlock = 256; + dim3 block(threadsPerBlock / 32, 32, 1); + + { + dim3 grid(num_blocks(ni, block.x), num_blocks(nj, block.y), 1); + kernel_A_mul_B<<>>(ni, nj, nk, E, A, B); + } + + { + dim3 grid(num_blocks(nj, block.x), num_blocks(nl, block.y), 1); + kernel_A_mul_B<<>>(nj, nl, nm, F, C, D); + } + + { + dim3 grid(num_blocks(ni, block.x), num_blocks(nl, block.y), 1); + kernel_A_mul_B<<>>(ni, nl, nj, G, E, F); + } +} + +int main(int argc, char **argv) { + + int dump_code = atoi(argv[1]); + int ni = atoi(argv[2]); + int nj = atoi(argv[3]); + int nk = atoi(argv[4]); + int nl = atoi(argv[5]); + int nm = atoi(argv[6]); + + double *A = (double *)malloc(ni * nk * sizeof(double)); + double *B = (double *)malloc(nk * nj * sizeof(double)); + double *C = (double *)malloc(nj * nm * sizeof(double)); + double *D = (double *)malloc(nm * nl * sizeof(double)); + double *E = (double *)malloc(ni * nj * sizeof(double)); + double *F = (double *)malloc(nj * nl * sizeof(double)); + double *G = (double *)malloc(ni * nl * sizeof(double)); + + init_array(ni, nj, nk, nl, nm, A, B, C, D, E, F, G); + + double *dev_A; + double *dev_B; + double *dev_C; + double *dev_D; + double *dev_E; + double *dev_F; + double *dev_G; + cudaMalloc(&dev_A, ni * nk * sizeof(double)); + cudaMalloc(&dev_B, nk * nj * sizeof(double)); + cudaMalloc(&dev_C, nl * nj * sizeof(double)); + cudaMalloc(&dev_D, ni * nl * sizeof(double)); + cudaMalloc(&dev_E, ni * nj * sizeof(double)); + cudaMalloc(&dev_F, nj * nl * sizeof(double)); + cudaMalloc(&dev_G, ni * nl * sizeof(double)); + cudaMemcpy(dev_A, A, ni * nk * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_B, B, nk * nj * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_C, C, nl 
* nj * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_D, D, ni * nl * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_E, E, ni * nj * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_F, F, nj * nl * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_G, G, ni * nl * sizeof(double), cudaMemcpyHostToDevice); + + kernel(ni, nj, nk, nl, nm, dev_E, dev_A, dev_B, dev_F, dev_C, dev_D, dev_G); + + cudaMemcpy(G, dev_G, ni * nl * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(ni, nl, G); + + free((void *)E); + ; + free((void *)A); + ; + free((void *)B); + ; + free((void *)F); + ; + free((void *)C); + ; + free((void *)D); + ; + free((void *)G); + ; + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/AUTHORS b/tools/cgeist/Test/CUDA/polybench-cuda/AUTHORS new file mode 100644 index 000000000000..e1758fb080ce --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/AUTHORS @@ -0,0 +1,14 @@ +* * * * * * * * * * * * * +* Authors of PolyBench * +* * * * * * * * * * * * * + + +* Louis-Noel Pouchet + Who provided packaging and harmonization of all test files, + the PolyBench infrastructure and machinery, and several + reference C files. + +* Uday Bondugula + Who provided many of the original reference C files, including + Fortran to C translation. + diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/README b/tools/cgeist/Test/CUDA/polybench-cuda/README new file mode 100644 index 000000000000..d4978228b7e8 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/README @@ -0,0 +1,217 @@ +* * * * * * * * * * +* PolyBenchs 3.1 * +* * * * * * * * * * + +Copyright (c) 2011 the Ohio State University. +Contact: Louis-Noel Pouchet + + +------------- +* New in 3.1: +------------- + +- fixed a typo in polybench.h, causing compilation problems for 3D arrays. +- set by default heap arrays, stack arrays are now optional. + + +------------- +* New in 3.0: +------------- + +- multiple dataset sizes are predefined. Each file comes now with a .h + header fiile defining the dataset. +- support of heap-allocated arrays. It uses a single malloc for the + entire array region, the data allocated is cast into a C99 + multidimensional array. +- One benchmark is out: gauss_filter +- One benchmark is in: floyd-warshall +- PAPI support has been greatly improved; it also can report the + counters on a specific core to be set by the user. + + + +---------------- +* Mailing lists: +---------------- + +** polybench-announces@lists.sourceforge.net: +--------------------------------------------- + +Announces about releases of PolyBench. + +** polybench-discussion@lists.sourceforge.net: +---------------------------------------------- + +General discussions reg. PolyBench. 
+ + + +----------------------- +* Available benchmarks: +----------------------- + +::linear-algebra:: +linear-algebra/kernels: +linear-algebra/kernels/2mm/2mm.c +linear-algebra/kernels/3mm/3mm.c +linear-algebra/kernels/atax/atax.c +linear-algebra/kernels/bicg/bicg.c +linear-algebra/kernels/cholesky/cholesky.c +linear-algebra/kernels/doitgen/doitgen.c +linear-algebra/kernels/gemm/gemm.c +linear-algebra/kernels/gemver/gemver.c +linear-algebra/kernels/gesummv/gesummv.c +linear-algebra/kernels/mvt/mvt.c +linear-algebra/kernels/symm/symm.c +linear-algebra/kernels/syr2k/syr2k.c +linear-algebra/kernels/syrk/syrk.c +linear-algebra/kernels/trisolv/trisolv.c +linear-algebra/kernels/trmm/trmm.c + +linear-algebra/solvers: +linear-algebra/solvers/durbin/durbin.c +linear-algebra/solvers/dynprog/dynprog.c +linear-algebra/solvers/gramschmidt/gramschmidt.c +linear-algebra/solvers/lu/lu.c +linear-algebra/solvers/ludcmp/ludcmp.c + +::datamining:: +datamining/correlation/correlation.c +datamining/covariance/covariance.c + +::medley:: +medley/floyd-warshall/floyd-warshall.c +medley/reg_detect/reg_detect.c + +::stencils:: +stencils/adi/adi.c +stencils/fdtd-2d/fdtd-2d.c +stencils/fdtd-apml/fdtd-apml.c +stencils/jacobi-1d-imper/jacobi-1d-imper.c +stencils/jacobi-2d-imper/jacobi-2d-imper.c +stencils/seidel-2d/seidel-2d.c + + + +------------------------------ +* Sample compilation commands: +------------------------------ + + +** To compile a benchmark without any monitoring: +------------------------------------------------- + +$> gcc -I utilities -I linear-algebra/kernels/atax utilities/polybench.c linear-algebra/kernels/atax/atax.c -o atax_base + + +** To compile a benchmark with execution time reporting: +-------------------------------------------------------- + +$> gcc -O3 -I utilities -I linear-algebra/kernels/atax utilities/polybench.c linear-algebra/kernels/atax/atax.c -DPOLYBENCH_TIME -o atax_time + + +** To generate the reference output of a benchmark: +--------------------------------------------------- + +$> gcc -O0 -I utilities -I linear-algebra/kernels/atax utilities/polybench.c linear-algebra/kernels/atax/atax.c -DPOLYBENCH_DUMP_ARRAYS -o atax_ref +$> ./atax_ref 2>atax_ref.out + + + + +------------------------- +* Some available options: +------------------------- + +They are all passed as macro definitions during compilation time (e.g, +-Dname_of_the_option). + +- POLYBENCH_TIME: output execution time (gettimeofday) [default: off] + +- POLYBENCH_NO_FLUSH_CACHE: don't flush the cache before calling the + timer [default: flush the cache] + +- POLYBENCH_LINUX_FIFO_SCHEDULER: use FIFO real-time scheduler for the + kernel execution, the program must be run as root, under linux only, + and compiled with -lc [default: off] + +- POLYBENCH_CACHE_SIZE_KB: cache size to flush, in kB [default: 33MB] + +- POLYBENCH_STACK_ARRAYS: use stack allocation instead of malloc [default: off] + +- POLYBENCH_DUMP_ARRAYS: dump all live-out arrays on stderr [default: off] + +- POLYBENCH_CYCLE_ACCURATE_TIMER: Use Time Stamp Counter to monitor + the execution time of the kernel [default: off] + +- POLYBENCH_PAPI: turn on papi timing (see below). 
+ +- MINI_DATASET, SMALL_DATASET, STANDARD_DATASET, LARGE_DATASET, + EXTRALARGE_DATASET: set the dataset size to be used + [default: STANDARD_DATASET] + + + +--------------- +* PAPI support: +--------------- + +** To compile a benchmark with PAPI support: +-------------------------------------------- + +$> gcc -O3 -I utilities -I linear-algebra/kernels/atax utilities/polybench.c linear-algebra/kernels/atax/atax.c -DPOLYBENCH_PAPI -lpapi -o atax_papi + + +** To specify which counter(s) to monitor: +------------------------------------------ + +Edit utilities/papi_counters.list, and add 1 line per event to +monitor. Each line (including the last one) must finish with a ',' and +both native and standard events are supported. + +The whole kernel is run one time per counter (no multiplexing) and +there is no sampling being used for the counter value. + + + +------------------------------ +* Accurate performance timing: +------------------------------ + +With kernels that have an execution time in the orders of a few tens +of milliseconds, it is critical to validate any performance number by +repeating several times the experiment. A companion script is +available to perform reasonable performance measurement of a PolyBench. + +$> gcc -O3 -I utilities -I linear-algebra/kernels/atax utilities/polybench.c linear-algebra/kernels/atax/atax.c -DPOLYBENCH_TIME -o atax_time +$> ./utilities/time_benchmark.sh ./atax_time + +This script will run five times the benchmark (that must be a +PolyBench compiled with -DPOLYBENCH_TIME), eliminate the two extremal +times, and check that the deviation of the three remaining does not +exceed a given thresold, set to 5%. + +It is also possible to use POLYBENCH_CYCLE_ACCURATE_TIMER to use the +Time Stamp Counter instead of gettimeofday() to monitor the number of +elapsed cycles. + + + + +---------------------------------------- +* Generating macro-free benchmark suite: +---------------------------------------- + +(from the root of the archive:) +$> PARGS="-I utilities -DPOLYBENCH_TIME"; +$> for i in `cat utilities/benchmark_list`; do create_cpped_version.sh $i "$PARGS"; done + +This create for each benchmark file 'xxx.c' a new file +'xxx.preproc.c'. The PARGS variable in the above example can be set to +the desired configuration, for instance to create a full C99 version +(parametric arrays): + +$> PARGS="-I utilities -DPOLYBENCH_USE_C99_PROTO"; +$> for i in `cat utilities/benchmark_list`; do ./utilities/create_cpped_version.sh "$i" "$PARGS"; done + + diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/adi/adi.cu b/tools/cgeist/Test/CUDA/polybench-cuda/adi/adi.cu new file mode 100644 index 000000000000..cbb195dd973e --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/adi/adi.cu @@ -0,0 +1,177 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 5 +// clang-format on +/** + * adi.c: This file is part of the PolyBench 3.0 test suite. 
+ * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +#define N 2048 +#define TSTEPS 50 + +__global__ void kernel_column_sweep(int tsteps, int n, double *u, double *v, + double *p, double *q, double a, double b, + double c, double d, double e, double f) { + int i = blockDim.x * blockIdx.x + threadIdx.x + 1; + + if (i < n - 1) { + v[0 * n + i] = 1; + p[i * n + 0] = 0; + q[i * n + 0] = v[0 * n + i]; + for (int j = 1; j < n - 1; j++) { + p[i * n + j] = -c / (a * p[i * n + j - 1] + b); + q[i * n + j] = (-d * u[j * n + i - 1] + (1 + 2 * d) * u[j * n + i] - + f * u[j * n + i + 1] - a * q[i * n + j - 1]) / + (a * p[i * n + j - 1] + b); + } + + v[(n - 1) * n + i] = 1; + for (int j = n - 2; j >= 1; j--) + v[j * n + i] = p[i * n + j] * v[(j + 1) * n + i] + q[i * n + j]; + } +} + +__global__ void kernel_row_sweep(int tsteps, int n, double *u, double *v, + double *p, double *q, double a, double b, + double c, double d, double e, double f) { + int i = blockDim.x * blockIdx.x + threadIdx.x + 1; + + if (i < n - 1) { + u[i * n + 0] = 1; + p[i + n + 0] = 0; + q[i * n + 0] = u[i * n + 0]; + for (int j = 1; j < n - 1; j++) { + p[i * n + j] = -f / (d * p[i * n + j - 1] + e); + q[i * n + j] = (-a * v[(i - 1) * n + j] + (1 + 2 * a) * v[i * n + j] - + c * v[(i + 1) * n + j] - d * q[i * n + j - 1]) / + (d * p[i * n + j - 1] + e); + } + u[i * n + n - 1] = 1; + for (int j = n - 2; j >= 1; j--) + u[i * n + j] = p[i * n + j] * u[i * n + j + 1] + q[i * n + j]; + } +} + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +static void kernel(int tsteps, int n, double *u, double *v, double *p, + double *q) { + unsigned threadsPerBlock = 256; + + double DX = 1 / (double)n; + double DY = 1 / (double)n; + double DT = 1 / (double)tsteps; + double B1 = 2; + double B2 = 1; + double mul1 = B1 * DT / DX / DX; + double mul2 = B2 * DT / DY / DY; + + double a = -mul1 / 2; + double b = 1 + mul1; + double c = a; + double d = -mul2 / 2; + double e = 1 + mul2; + double f = d; + + for (int t = 1; t <= tsteps; t++) { + // Column Sweep + kernel_column_sweep<<>>(tsteps, n, u, v, p, q, a, b, c, d, + e, f); + + // Row Sweep + kernel_row_sweep<<>>( + tsteps, n, u, v, p, q, a, b, c, d, e, f); + } +} + +/* Array initialization. */ +static void init_array(int n, double *u, double *v, double *p, double *q) { + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) { + u[i * n + j] = (double)(i + n - j) / n; + v[i * n + j] = 0; + p[i * n + j] = 0; + q[i * n + j] = 0; + } +} + +/* DCE code. Must scan the entire live-out data. + Can be used also to check the correctness of the output. */ +static void print_array(int n, double *u) + +{ + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) { + fprintf(stderr, "%0.2lf ", u[i * n + j]); + if ((i * n + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + /* Retrieve problem size. */ + int n = atoi(argv[2]); + int tsteps = atoi(argv[3]); + int dump_code = atoi(argv[1]); + + /* Variable declaration/allocation. */ + + double *u = (double *)malloc(n * n * sizeof(double)); + double *v = (double *)malloc(n * n * sizeof(double)); + double *p = (double *)malloc(n * n * sizeof(double)); + double *q = (double *)malloc(n * n * sizeof(double)); + + /* Initialize array(s). 
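// Illustration only, not part of the test sources: each ADI sweep above runs
// one thread per column (or row), and each thread performs a Thomas-algorithm
// style tridiagonal solve: a forward pass builds the p/q recurrences, then a
// backward pass substitutes. A hypothetical sequential sketch of the column
// sweep for a single column i, using the same recurrences as
// kernel_column_sweep (useful for checking against the GPU result):
static void ref_column_sweep(int n, int i, double *u, double *v, double *p,
                             double *q, double a, double b, double c, double d,
                             double f) {
  v[0 * n + i] = 1;
  p[i * n + 0] = 0;
  q[i * n + 0] = v[0 * n + i];
  for (int j = 1; j < n - 1; j++) {   // forward elimination along the column
    p[i * n + j] = -c / (a * p[i * n + j - 1] + b);
    q[i * n + j] = (-d * u[j * n + i - 1] + (1 + 2 * d) * u[j * n + i] -
                    f * u[j * n + i + 1] - a * q[i * n + j - 1]) /
                   (a * p[i * n + j - 1] + b);
  }
  v[(n - 1) * n + i] = 1;
  for (int j = n - 2; j >= 1; j--)    // back substitution
    v[j * n + i] = p[i * n + j] * v[(j + 1) * n + i] + q[i * n + j];
}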
*/ + init_array(n, u, v, p, q); + + double *dev_u; + double *dev_v; + double *dev_p; + double *dev_q; + cudaMalloc(&dev_u, n * n * sizeof(double)); + cudaMalloc(&dev_v, n * n * sizeof(double)); + cudaMalloc(&dev_p, n * n * sizeof(double)); + cudaMalloc(&dev_q, n * n * sizeof(double)); + cudaMemcpy(dev_u, u, n * n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_v, v, n * n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_p, p, n * n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_q, q, n * n * sizeof(double), cudaMemcpyHostToDevice); + + /* Run kernel. */ + kernel(tsteps, n, dev_u, dev_v, dev_p, dev_q); + + cudaMemcpy(u, dev_u, n * n * sizeof(double), cudaMemcpyDeviceToHost); + cudaFree((void *)dev_u); + cudaFree((void *)dev_v); + cudaFree((void *)dev_p); + cudaFree((void *)dev_q); + + /* Prevent dead-code elimination. All live-out data must be printed + by the function call in argument. */ + if (dump_code == 1) + print_array(n, u); + + /* Be clean. */ + free((void *)u); + free((void *)v); + free((void *)p); + free((void *)q); + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/atax/atax.cu b/tools/cgeist/Test/CUDA/polybench-cuda/atax/atax.cu new file mode 100644 index 000000000000..190adf49603b --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/atax/atax.cu @@ -0,0 +1,125 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 +// clang-format on +/** + * atax.c: This file is part of the PolyBench 3.0 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +#define RUN 100 + +__global__ void kernel3(int m, int n, double *A, double *x, double *y, + double *tmp) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < m) { + double dot = 0.0; + for (int j = 0; j < n; j++) { + dot += A[i * n + j] * x[j]; + } + tmp[i] = dot; + } +} + +__global__ void kernel4(int m, int n, double *A, double *x, double *y, + double *tmp) { + int j = blockDim.x * blockIdx.x + threadIdx.x; + if (j < n) { + double dot = 0; + y[j] = 0; + for (int i = 0; i < m; i++) + dot += A[i * n + j] * tmp[i]; + y[j] = dot; + } +} + +static int num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +/* Array initialization. */ +static void init_array(int nx, int ny, double *A, double *x, double *tmp, + double *y) { + int i, j; + + for (i = 0; i < ny; i++) { + x[i] = i * M_PI; + } + for (i = 0; i < nx; i++) + tmp[i] = 0; + for (i = 0; i < nx; i++) + for (j = 0; j < ny; j++) + A[i * ny + j] = ((double)i * (j + 1)) / nx; +} + +/* DCE code. Must scan the entire live-out data. + Can be used also to check the correctness of the output. */ +static void print_array(int nx, double *y) + +{ + int i; + + for (i = 0; i < nx; i++) { + fprintf(stderr, "%0.2lf ", y[i]); + if (i % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + /* Retrieve problem size. */ + int nx = atoi(argv[2]); + int ny = atoi(argv[3]); + int dump_code = atoi(argv[1]); + + /* Variable declaration/allocation. */ + double *A = (double *)malloc(nx * ny * sizeof(double)); + double *x = (double *)malloc(ny * sizeof(double)); + double *y = (double *)malloc(ny * sizeof(double)); + double *tmp = (double *)malloc(nx * sizeof(double)); + /* Initialize array(s). 
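// Illustration only, not part of the test sources: kernel3 and kernel4 above
// compute y = A^T (A x) in two passes; kernel4 walks a column of the
// row-major A (stride n between consecutive terms), which is what the
// transposed product forces. A hypothetical sequential reference for checking
// the result:
static void ref_atax(int nx, int ny, const double *A, const double *x,
                     double *tmp, double *y) {
  for (int i = 0; i < nx; i++) {
    tmp[i] = 0.0;
    for (int j = 0; j < ny; j++)
      tmp[i] += A[i * ny + j] * x[j];   // tmp = A * x
  }
  for (int j = 0; j < ny; j++) {
    y[j] = 0.0;
    for (int i = 0; i < nx; i++)
      y[j] += A[i * ny + j] * tmp[i];   // y = A^T * tmp
  }
}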
*/ + init_array(nx, ny, A, x, tmp, y); + + double *dev_A; + double *dev_x; + double *dev_y; + double *dev_tmp; + cudaMalloc(&dev_A, nx * ny * sizeof(double)); + cudaMalloc(&dev_x, ny * sizeof(double)); + cudaMalloc(&dev_y, ny * sizeof(double)); + cudaMalloc(&dev_tmp, nx * sizeof(double)); + + cudaMemcpy(dev_A, A, nx * ny * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_x, x, ny * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_y, y, ny * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_tmp, tmp, nx * sizeof(double), cudaMemcpyHostToDevice); + + const int threadsPerBlock = 256; + kernel3<<>>( + nx, ny, dev_A, dev_x, dev_y, dev_tmp); + kernel4<<>>( + nx, ny, dev_A, dev_x, dev_y, dev_tmp); + + cudaMemcpy(y, dev_y, ny * sizeof(double), cudaMemcpyDeviceToHost); + /* Prevent dead-code elimination. All live-out data must be printed + by the function call in argument. */ + if (dump_code == 1) { + print_array(nx, y); + } + /* Be clean. */ + free((void *)A); + free((void *)x); + free((void *)y); + free((void *)tmp); + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/bicg/bicg.cu b/tools/cgeist/Test/CUDA/polybench-cuda/bicg/bicg.cu new file mode 100644 index 000000000000..69bcb968d78f --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/bicg/bicg.cu @@ -0,0 +1,139 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 +// clang-format on +/** + * bicg.c: This file is part of the PolyBench 3.0 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +// #define NX 40000 +#define RUN 100 + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_q(int m, int n, double *A, double s[], double q[], + double p[], double r[]) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < n) { + double dot = 0; + q[i] = 0; + for (int j = 0; j < m; j++) + dot += A[i * m + j] * p[j]; + q[i] += dot; + } +} + +__global__ void kernel_s(int m, int n, double *A, double s[], double q[], + double p[], double r[]) { + int j = blockDim.x * blockIdx.x + threadIdx.x; + + if (j < m) { + s[j] = 0; + double dot = 0; + for (int i = 0; i < n; i++) + dot += r[i] * A[i * m + j]; + s[j] = dot; + } +} + +/* Array initialization. */ +static void init_array(int nx, int ny, double *A, double *r, double *p) { + int i, j; + + for (i = 0; i < ny; i++) + p[i] = i * M_PI; + for (i = 0; i < nx; i++) { + r[i] = i * M_PI; + for (j = 0; j < ny; j++) + A[i * ny + j] = ((double)i * (j + 1)) / nx; + } +} + +/* DCE code. Must scan the entire live-out data. + Can be used also to check the correctness of the output. */ +static void print_array(int nx, int ny, double *s, double *q) + +{ + int i; + + for (i = 0; i < ny; i++) { + fprintf(stderr, "%0.2lf ", s[i]); + if (i % 20 == 0) + fprintf(stderr, "\n"); + } + for (i = 0; i < nx; i++) { + fprintf(stderr, "%0.2lf ", q[i]); + if (i % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +static void kernel(int m, int n, double *A, double s[], double q[], double p[], + double r[]) { + + const unsigned threadsPerBlock = 256; + kernel_q<<>>(m, n, A, s, q, + p, r); + kernel_s<<>>(m, n, A, s, q, + p, r); +} + +int main(int argc, char **argv) { + /* Retrieve problem size. */ + int m = atoi(argv[2]); + int n = atoi(argv[3]); + int dump_code = atoi(argv[1]); + + // for(int i = 0; i < RUN; i++) { + /* Variable declaration/allocation. 
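// Illustration only, not part of the test sources: kernel_q and kernel_s above
// are the two halves of the BiCG kernel, q = A * p and s = A^T * r, each
// parallelized over one output vector. A hypothetical sequential reference
// using the same row-major indexing:
static void ref_bicg(int m, int n, const double *A, double *s, double *q,
                     const double *p, const double *r) {
  for (int i = 0; i < n; i++) {
    q[i] = 0.0;
    for (int j = 0; j < m; j++)
      q[i] += A[i * m + j] * p[j];      // q = A * p
  }
  for (int j = 0; j < m; j++) {
    s[j] = 0.0;
    for (int i = 0; i < n; i++)
      s[j] += r[i] * A[i * m + j];      // s = A^T * r
  }
}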
*/ + double *A = (double *)malloc(m * n * sizeof(double)); + double *s = (double *)malloc(n * sizeof(double)); + double *q = (double *)malloc(m * sizeof(double)); + double *p = (double *)malloc(n * sizeof(double)); + double *r = (double *)malloc(m * sizeof(double)); + /* Initialize array(s). */ + init_array(m, n, A, r, p); + double *dev_A; + double *dev_s; + double *dev_q; + double *dev_p; + double *dev_r; + cudaMalloc(&dev_A, m * n * sizeof(double)); + cudaMalloc(&dev_s, n * sizeof(double)); + cudaMalloc(&dev_q, m * sizeof(double)); + cudaMalloc(&dev_p, n * sizeof(double)); + cudaMalloc(&dev_r, m * sizeof(double)); + cudaMemcpy(dev_A, A, m * n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_s, s, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_q, q, m * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_p, p, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_r, r, m * sizeof(double), cudaMemcpyHostToDevice); + kernel(m, n, dev_A, dev_s, dev_q, dev_p, dev_r); + cudaMemcpy(s, dev_s, n * sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(q, dev_q, m * sizeof(double), cudaMemcpyDeviceToHost); + /* Prevent dead-code elimination. All live-out data must be printed + by the function call in argument. */ + if (dump_code == 1) + print_array(m, n, s, q); + /* Be clean. */ + free((void *)A); + free((void *)s); + free((void *)q); + free((void *)p); + free((void *)r); + // } + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/cholesky/cholesky.cu b/tools/cgeist/Test/CUDA/polybench-cuda/cholesky/cholesky.cu new file mode 100644 index 000000000000..459512998754 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/cholesky/cholesky.cu @@ -0,0 +1,108 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 5 +// clang-format on +/** + * cholesky.c: This file is part of the PolyBench 3.0 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +/* Array initialization. */ +static void init_array(int n, double *A) { + int i, j; + + for (i = 0; i < n; i++) { + for (j = 0; j < n; j++) + A[i * n + j] = 1.0 / n; + } +} + +/* DCE code. Must scan the entire live-out data. + Can be used also to check the correctness of the output. 
*/ +static void print_array(int n, double *A) + +{ + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) { + fprintf(stderr, "%0.2lf ", A[i * n + j]); + if ((i * n + j) % 20 == 0) + fprintf(stderr, "\n"); + } +} + +__global__ void kernel0(int n, int j, double *A) { + A[j * n + j] = std::sqrt(A[j * n + j]); +} + +__global__ void kernel1(int n, int j, double *A) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < n && i > j) + A[i * n + j] /= A[j * n + j]; +} + +__global__ void kernel2(int n, int j, double *A) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int k = blockDim.y * blockIdx.y + threadIdx.y; + + if (j < n && j < i && i < n && j < k && k <= i) + A[i * n + k] -= A[i * n + j] * A[k * n + j]; +} + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +static void kernel_polly(int n, double *dev_A) { + const unsigned int threadsPerBlock = 256; + + for (int iter = 0; iter < n; iter++) { + kernel0<<<1, 1>>>(n, iter, dev_A); + + kernel1<<>>(n, iter, + dev_A); + + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(n, block.x), num_blocks(n, block.y), 1); + kernel2<<>>(n, iter, dev_A); + } +} + +int main(int argc, char **argv) { + /* Retrieve problem size. */ + int n = atoi(argv[2]); + int dump_code = atoi(argv[1]); + + /* Variable declaration/allocation. */ + double *A = (double *)malloc(n * n * sizeof(double)); + + /* Initialize array(s). */ + init_array(n, A); + + double *dev_A; + cudaMalloc(&dev_A, n * n * sizeof(double)); + cudaMemcpy(dev_A, A, n * n * sizeof(double), cudaMemcpyHostToDevice); + + kernel_polly(n, dev_A); + + cudaMemcpy(A, dev_A, n * n * sizeof(double), cudaMemcpyDeviceToHost); + + /* Prevent dead-code elimination. All live-out data must be printed + by the function call in argument. */ + if (dump_code == 1) + print_array(n, A); + + /* Be clean. */ + free((void *)A); + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/correlation/correlation.cu b/tools/cgeist/Test/CUDA/polybench-cuda/correlation/correlation.cu new file mode 100644 index 000000000000..4767bca2769b --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/correlation/correlation.cu @@ -0,0 +1,183 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 +// clang-format on +/** + * correlation.c: This file is part of the PolyBench/C 3.2 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +#define EPS 0.1 + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_mean(int m, int n, double *data, double *corr, + double mean[], double stddev[]) { + int j = blockDim.x * blockIdx.x + threadIdx.x; + + if (j < m) { + mean[j] = 0.0; + for (int i = 0; i < n; i++) + mean[j] += data[i * m + j]; + mean[j] /= n; + } +} + +__global__ void kernel_stddev(int m, int n, double *data, double *corr, + double mean[], double stddev[]) { + int j = blockDim.x * blockIdx.x + threadIdx.x; + + if (j < m) { + stddev[j] = 0.0; + for (int i = 0; i < n; i++) + stddev[j] += (data[i * m + j] - mean[j]) * (data[i * m + j] - mean[j]); + stddev[j] /= n; + stddev[j] = sqrt(stddev[j]); + /* The following in an inelegant but usual way to handle + near-zero std. dev. values, which below would cause a zero- + divide. 
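// Illustration only, not part of the test sources: kernel_polly above performs
// an in-place, right-looking Cholesky factorization, one outer iteration per
// column: kernel0 takes the square root of the diagonal entry, kernel1 scales
// the column below it, and kernel2 updates the trailing lower triangle. A
// hypothetical sequential equivalent (lower triangle only, sqrt from the math
// header) for checking the factor:
static void ref_cholesky(int n, double *A) {
  for (int j = 0; j < n; j++) {
    A[j * n + j] = sqrt(A[j * n + j]);       // kernel0
    for (int i = j + 1; i < n; i++)
      A[i * n + j] /= A[j * n + j];          // kernel1
    for (int i = j + 1; i < n; i++)          // kernel2
      for (int k = j + 1; k <= i; k++)
        A[i * n + k] -= A[i * n + j] * A[k * n + j];
  }
}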
*/ + if (stddev[j] <= EPS) + stddev[j] = 1.0; + } +} + +__global__ void kernel_reduce(int m, int n, double *data, double *corr, + double mean[], double stddev[]) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < n && j < m) { + data[i * m + j] -= mean[j]; + data[i * m + j] /= std::sqrt((double)n) * stddev[j]; + } +} + +__global__ void kernel_diag(int m, int n, double *data, double *corr, + double mean[], double stddev[]) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < m) { + corr[i * m + i] = 1.0; + } +} + +__global__ void kernel_corr(int m, int n, double *data, double *corr, + double mean[], double stddev[]) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y + i + 1; + + if (i < m - 1 && j < m) { + corr[i * m + j] = 0.0; + for (int k = 0; k < n; k++) + corr[i * m + j] += (data[k * m + i] * data[k * m + j]); + corr[j * m + i] = corr[i * m + j]; + } +} + +__global__ void kernel_tail(int m, int n, double *data, double *corr, + double mean[], double stddev[]) { + corr[(m - 1) * m + m - 1] = 1.0; +} + +static void kernel(int m, int n, double *data, double *corr, double mean[], + double stddev[]) { + const unsigned threadsPerBlock = 256; + + kernel_mean<<>>( + m, n, data, corr, mean, stddev); + kernel_stddev<<>>( + m, n, data, corr, mean, stddev); + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(n, block.x), num_blocks(m, block.y), 1); + kernel_reduce<<>>(m, n, data, corr, mean, stddev); + } + + kernel_diag<<>>( + m, n, data, corr, mean, stddev); + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(m - 1, block.x), num_blocks(m - 1, block.y), 1); + kernel_corr<<>>(m, n, data, corr, mean, stddev); + } + + kernel_tail<<<1, 1>>>(m, n, data, corr, mean, stddev); +} + +static void init_array(int m, int n, double *data) { + int i, j; + + for (i = 0; i < m; i++) + for (j = 0; j < n; j++) + data[i * n + j] = ((double)i * j) / 1000; +} + +static void print_array(int m, double *corr) + +{ + int i, j; + + for (i = 0; i < m; i++) + for (j = 0; j < m; j++) { + fprintf(stderr, "%0.2lf ", corr[i * m + j]); + if ((i * m + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + + int dump_code = atoi(argv[1]); + int n = atoi(argv[2]); + int m = atoi(argv[3]); + + double *data = (double *)malloc(n * m * sizeof(double)); + double *mean = (double *)malloc(m * sizeof(double)); + double *stddev = (double *)malloc(m * sizeof(double)); + double *corr = (double *)malloc(m * m * sizeof(double)); + + init_array(m, n, data); + + double *dev_data; + double *dev_mean; + double *dev_stddev; + double *dev_corr; + cudaMalloc(&dev_data, n * m * sizeof(double)); + cudaMalloc(&dev_mean, m * sizeof(double)); + cudaMalloc(&dev_stddev, m * sizeof(double)); + cudaMalloc(&dev_corr, m * m * sizeof(double)); + cudaMemcpy(dev_data, data, n * m * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_mean, mean, m * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_stddev, stddev, m * m * sizeof(double), + cudaMemcpyHostToDevice); + cudaMemcpy(dev_corr, corr, m * m * sizeof(double), cudaMemcpyHostToDevice); + + kernel(m, n, dev_data, dev_corr, dev_mean, dev_stddev); + cudaMemcpy(corr, dev_corr, m * m * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(m, corr); + + free((void *)data); + ; + free((void *)corr); + ; + free((void *)mean); + ; + free((void *)stddev); + ; + + return 0; 
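// Illustration only, not part of the test sources: the kernels above implement
// the usual correlation-matrix pipeline: per-column means, per-column standard
// deviations guarded by EPS against a near-zero divide, normalization of data
// by sqrt(n) * stddev, then the strict upper triangle of corr mirrored into the
// lower triangle with the diagonal fixed at 1. A hypothetical sequential
// reference with the same ordering:
static void ref_correlation(int m, int n, double *data, double *corr,
                            double *mean, double *stddev) {
  for (int j = 0; j < m; j++) {
    mean[j] = 0.0;
    for (int i = 0; i < n; i++)
      mean[j] += data[i * m + j];
    mean[j] /= n;
  }
  for (int j = 0; j < m; j++) {
    stddev[j] = 0.0;
    for (int i = 0; i < n; i++)
      stddev[j] += (data[i * m + j] - mean[j]) * (data[i * m + j] - mean[j]);
    stddev[j] = sqrt(stddev[j] / n);
    if (stddev[j] <= EPS)                 // avoid dividing by a near-zero value
      stddev[j] = 1.0;
  }
  for (int i = 0; i < n; i++)
    for (int j = 0; j < m; j++)
      data[i * m + j] = (data[i * m + j] - mean[j]) / (sqrt((double)n) * stddev[j]);
  for (int i = 0; i < m; i++) {
    corr[i * m + i] = 1.0;
    for (int j = i + 1; j < m; j++) {
      corr[i * m + j] = 0.0;
      for (int k = 0; k < n; k++)
        corr[i * m + j] += data[k * m + i] * data[k * m + j];
      corr[j * m + i] = corr[i * m + j];
    }
  }
}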
+} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/covariance/covariance.cu b/tools/cgeist/Test/CUDA/polybench-cuda/covariance/covariance.cu new file mode 100644 index 000000000000..fed93c517f02 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/covariance/covariance.cu @@ -0,0 +1,131 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 +// clang-format on +/** + * covariance.c: This file is part of the PolyBench/C 3.2 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +static void init_array(int m, int n, double *data) { + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < m; j++) + data[i * m + j] = ((double)i * j) / 1000; +} + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_mean(int m, int n, double data[], double cov[], + double mean[]) { + int j = blockDim.x * blockIdx.x + threadIdx.x; + + if (j < m) { + mean[j] = 0.0; + for (int i = 0; i < n; i++) + mean[j] += data[i * m + j]; + mean[j] /= n; + } +} + +__global__ void kernel_reduce(int m, int n, double data[], double cov[], + double mean[]) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < n && j < m) { + data[i * m + j] -= mean[j]; + } +} + +__global__ void kernel_cov(int m, int n, double data[], double cov[], + double mean[]) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y + i; + + if (i < m && j < m) { + cov[i * m + j] = 0.0; + for (int k = 0; k < n; k++) + cov[i * m + j] += data[k * m + i] * data[k * m + j]; + cov[i * m + j] /= (n - 1.0); + cov[j * m + i] = cov[i * m + j]; + } +} + +static void kernel(int m, int n, double data[], double cov[], double mean[]) { + const unsigned threadsPerBlock = 256; + + kernel_mean<<>>(m, n, data, + cov, mean); + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(n, block.x), num_blocks(m, block.y), 1); + kernel_reduce<<>>(m, n, data, cov, mean); + } + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(m - 1, block.x), num_blocks(m - 1, block.y), 1); + kernel_cov<<>>(m, n, data, cov, mean); + } +} + +static void print_array(int m, double *cov) + +{ + int i, j; + + for (i = 0; i < m; i++) + for (j = 0; j < m; j++) { + fprintf(stderr, "%0.2lf ", cov[i * m + j]); + if ((i * m + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + + int dump_code = atoi(argv[1]); + int n = atoi(argv[2]); + int m = atoi(argv[3]); + + double *data = (double *)malloc(n * m * sizeof(double)); + double *mean = (double *)malloc(m * sizeof(double)); + double *cov = (double *)malloc(m * m * sizeof(double)); + + init_array(m, n, data); + + double *dev_data; + double *dev_mean; + double *dev_cov; + cudaMalloc(&dev_data, n * m * sizeof(double)); + cudaMalloc(&dev_mean, m * sizeof(double)); + cudaMalloc(&dev_cov, m * m * sizeof(double)); + cudaMemcpy(dev_data, data, n * m * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_mean, mean, m * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_cov, cov, m * m * sizeof(double), cudaMemcpyHostToDevice); + + kernel(m, n, dev_data, dev_cov, dev_mean); + cudaMemcpy(cov, dev_cov, m * m * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(m, cov); + + free((void *)data); + free((void *)cov); + free((void 
*)mean); + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/doitgen/doitgen.cu b/tools/cgeist/Test/CUDA/polybench-cuda/doitgen/doitgen.cu new file mode 100644 index 000000000000..7cfa24415dd3 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/doitgen/doitgen.cu @@ -0,0 +1,111 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 10 +// clang-format on +/** + * doitgen.c: This file is part of the PolyBench/C 3.2 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_sum(int nr, int nq, int np, double *A, double *C4, + double *sum) { + int r = blockDim.x * blockIdx.x + threadIdx.x; + int q = blockDim.y * blockIdx.y + threadIdx.y; + int p = blockDim.z * blockIdx.z + threadIdx.z; + + if (r < nr && q < nq && p < np) { + double dot = 0.0; + sum[(r * nq + q) * np + p] = 0; + for (int s = 0; s < np; s++) + dot += A[(r * nq + q) * np + s] * C4[s * np + p]; + sum[(r * nq + q) * np + p] = dot; + } +} + +static void kernel(int nr, int nq, int np, double *A, double *C4, double *sum) { + + const unsigned threadsPerBlock = 256; + + dim3 block(1, threadsPerBlock / 32, 32); + dim3 grid(num_blocks(nr, block.x), num_blocks(nq, block.y), + num_blocks(np, block.z)); + kernel_sum<<>>(nr, nq, np, A, C4, sum); +} + +static void init_array(int nr, int nq, int np, double *A, double *C4) { + int i, j, k; + + for (i = 0; i < nr; i++) + for (j = 0; j < nq; j++) + for (k = 0; k < np; k++) + A[i * np * nq + j * nq + k] = ((double)i * j + k) / np; + for (i = 0; i < np; i++) + for (j = 0; j < np; j++) + C4[i * np + j] = ((double)i * j) / np; +} + +static void print_array(int nr, int nq, int np, double *A) { + int i, j, k; + + for (i = 0; i < nr; i++) + for (j = 0; j < nq; j++) + for (k = 0; k < np; k++) { + fprintf(stderr, "%0.2lf ", A[i * nq * np + j * nq + k]); + if (i % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + + int dump_code = atoi(argv[1]); + int nr = atoi(argv[2]); + int nq = atoi(argv[3]); + int np = atoi(argv[4]); + + double *A = (double *)malloc(nr * nq * np * sizeof(double)); + double *sum = (double *)malloc(nr * nq * np * sizeof(double)); + double *C4 = (double *)malloc(np * np * sizeof(double)); + + init_array(nr, nq, np, A, C4); + + double *dev_A; + double *dev_sum; + double *dev_C4; + cudaMalloc(&dev_A, nr * nq * np * sizeof(double)); + cudaMalloc(&dev_sum, nr * nq * np * sizeof(double)); + cudaMalloc(&dev_C4, np * np * sizeof(double)); + cudaMemcpy(dev_A, A, nr * nq * np * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_C4, C4, np * np * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_sum, sum, nr * nq * np * sizeof(double), + cudaMemcpyHostToDevice); + + kernel(nr, nq, np, dev_A, dev_C4, dev_sum); + + cudaMemcpy(sum, dev_sum, nr * nq * np * sizeof(double), + cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(nr, nq, np, sum); + + free((void *)A); + ; + free((void *)sum); + ; + free((void *)C4); + ; + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/fdtd-2d/fdtd-2d.cu b/tools/cgeist/Test/CUDA/polybench-cuda/fdtd-2d/fdtd-2d.cu new file mode 100644 index 000000000000..e3d50d078a88 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/fdtd-2d/fdtd-2d.cu @@ -0,0 +1,165 @@ +// clang-format off +// RUN: cgeist 
%s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 5 10 10 +// clang-format on +/** + * fdtd-2d.c: This file is part of the PolyBench 3.0 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +#define TMAX 100 +#define NX 4000 +#define NY 4000 + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_splat(int tmax, int nx, int ny, double *ex, double *ey, + double *hz, double fict[], int t) { + int j = blockDim.x * blockIdx.x + threadIdx.x; + + if (j < ny) + ey[0 * ny + j] = fict[t]; +} + +__global__ void kernel_ey(int tmax, int nx, int ny, double *ex, double *ey, + double *hz, double fict[], int t) { + int i = blockDim.x * blockIdx.x + threadIdx.x + 1; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < nx && j < ny) + ey[i * ny + j] -= (double)(0.5) * (hz[i * ny + j] - hz[(i - 1) * ny + j]); +} + +__global__ void kernel_ex(int tmax, int nx, int ny, double *ex, double *ey, + double *hz, double fict[], int t) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y + 1; + + if (i < nx && j < ny) + ex[i * ny + j] -= (double)(0.5) * (hz[i * ny + j] - hz[i * ny + j - 1]); +} + +__global__ void kernel_hz(int tmax, int nx, int ny, double *ex, double *ey, + double *hz, double fict[], int t) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < nx - 1 && j < ny - 1) + hz[i * ny + j] -= (double)(0.7) * (ex[i * ny + j + 1] - ex[i * ny + j] + + ey[(i + 1) * ny + j] - ey[i * ny + j]); +} + +static void kernel(int tmax, int nx, int ny, double *ex, double *ey, double *hz, + double *fict) { + const unsigned threadsPerBlock = 256; + + for (int t = 0; t < tmax; t++) { + kernel_splat<<>>( + tmax, nx, ny, ex, ey, hz, fict, t); + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(nx - 1, block.x), num_blocks(ny, block.y), 1); + kernel_ey<<>>(tmax, nx, ny, ex, ey, hz, fict, t); + } + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(nx, block.x), num_blocks(ny - 1, block.y), 1); + kernel_ex<<>>(tmax, nx, ny, ex, ey, hz, fict, t); + } + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(nx - 1, block.x), num_blocks(ny - 1, block.y), 1); + kernel_hz<<>>(tmax, nx, ny, ex, ey, hz, fict, t); + } + } +} + +/* Array initialization. */ +static void init_array(int nx, int ny, double *ex, double *ey, double *hz, + double *_fict_) { + int i, j; + + for (i = 0; i < ny; i++) + _fict_[i] = (double)i; + for (i = 0; i < nx; i++) + for (j = 0; j < ny; j++) { + ex[i * ny + j] = ((double)i * (j + 1)) / nx; + ey[i * ny + j] = ((double)i * (j + 2)) / ny; + hz[i * ny + j] = ((double)i * (j + 3)) / nx; + } +} + +/* DCE code. Must scan the entire live-out data. + Can be used also to check the correctness of the output. */ +static void print_array(int nx, int ny, double *ex, double *ey, double *hz) { + int i, j; + + for (i = 0; i < nx; i++) + for (j = 0; j < ny; j++) { + fprintf(stderr, "%0.2lf ", ex[i * ny + j]); + fprintf(stderr, "%0.2lf ", ey[i * ny + j]); + fprintf(stderr, "%0.2lf ", hz[i * ny + j]); + if ((i * nx + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + /* Retrieve problem size. 
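// Illustration only, not part of the test sources: each iteration of the time
// loop above advances the 2-D FDTD fields by one step with four kernel
// launches: the first row of ey is driven by fict[t], ey and ex are then
// updated from the current hz, and hz is finally updated from the freshly
// written ex and ey. A hypothetical sequential sketch of one time step:
static void ref_fdtd_step(int nx, int ny, double *ex, double *ey, double *hz,
                          const double *fict, int t) {
  for (int j = 0; j < ny; j++)
    ey[0 * ny + j] = fict[t];
  for (int i = 1; i < nx; i++)
    for (int j = 0; j < ny; j++)
      ey[i * ny + j] -= 0.5 * (hz[i * ny + j] - hz[(i - 1) * ny + j]);
  for (int i = 0; i < nx; i++)
    for (int j = 1; j < ny; j++)
      ex[i * ny + j] -= 0.5 * (hz[i * ny + j] - hz[i * ny + j - 1]);
  for (int i = 0; i < nx - 1; i++)
    for (int j = 0; j < ny - 1; j++)
      hz[i * ny + j] -= 0.7 * (ex[i * ny + j + 1] - ex[i * ny + j] +
                               ey[(i + 1) * ny + j] - ey[i * ny + j]);
}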
*/ + int tmax = atoi(argv[2]); + int nx = atoi(argv[3]); + int ny = atoi(argv[4]); + int dump_code = atoi(argv[1]); + + /* Variable declaration/allocation. */ + double *ex = (double *)malloc(nx * ny * sizeof(double)); + double *ey = (double *)malloc(nx * ny * sizeof(double)); + double *hz = (double *)malloc(nx * ny * sizeof(double)); + double *_fict_ = (double *)malloc(ny * sizeof(double)); + + /* Initialize array(s). */ + init_array(nx, ny, ex, ey, hz, _fict_); + double *dev_ex; + double *dev_ey; + double *dev_hz; + double *dev_fict; + cudaMalloc(&dev_ex, nx * ny * sizeof(double)); + cudaMalloc(&dev_ey, nx * ny * sizeof(double)); + cudaMalloc(&dev_hz, nx * ny * sizeof(double)); + cudaMalloc(&dev_fict, ny * sizeof(double)); + cudaMemcpy(dev_ex, ex, nx * ny * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_ey, ey, nx * ny * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_hz, hz, nx * ny * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_fict, _fict_, ny * sizeof(double), cudaMemcpyHostToDevice); + /* Run kernel. */ + kernel(tmax, nx, ny, dev_ex, dev_ey, dev_hz, dev_fict); + cudaMemcpy(ex, dev_ex, nx * ny * sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(ey, dev_ey, nx * ny * sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(hz, dev_hz, nx * ny * sizeof(double), cudaMemcpyDeviceToHost); + + /* Prevent dead-code elimination. All live-out data must be printed + by the function call in argument. */ + if (dump_code == 1) + print_array(nx, ny, ex, ey, hz); + + /* Be clean. */ + free((void *)ex); + free((void *)ey); + free((void *)hz); + free((void *)_fict_); + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/gemm/gemm.cu b/tools/cgeist/Test/CUDA/polybench-cuda/gemm/gemm.cu new file mode 100644 index 000000000000..04a534687bc0 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/gemm/gemm.cu @@ -0,0 +1,108 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 10 +// clang-format on +/** + * gemm.c: This file is part of the PolyBench/C 3.2 test suite. 
+ * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +__global__ void kernel_dev(int ni, int nj, int nk, double alpha, double beta, + double *C, double *A, double *B) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k; + double dot = 0.0; + + if (i < ni && j < nj) { + dot = C[i * nj + j] * beta; + for (k = 0; k < nk; k++) + dot += alpha * A[i * nk + k] * B[k * nj + j]; + C[i * nj + j] = dot; + } +} + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +static void kernel(int ni, int nj, int nk, double alpha, double beta, double *C, + double *A, double *B) { + + unsigned threadsPerBlock = 256; + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(ni, block.x), num_blocks(nj, block.y), 1); + kernel_dev<<>>(ni, nj, nk, alpha, beta, C, A, B); +} + +static void init_array(int ni, int nj, int nk, double *C, double *A, + double *B) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) + C[i * nj + j] = ((double)i * j) / ni; + for (i = 0; i < ni; i++) + for (j = 0; j < nk; j++) + A[i * nk + j] = ((double)i * j) / ni; + for (i = 0; i < nk; i++) + for (j = 0; j < nj; j++) + B[i * nj + j] = ((double)i * j) / ni; +} + +static void print_array(int ni, int nj, double *C) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) { + fprintf(stderr, "%0.2lf ", C[i * nj + j]); + if ((i * ni + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + + int dump_code = atoi(argv[1]); + int ni = atoi(argv[2]); + int nj = atoi(argv[3]); + int nk = atoi(argv[4]); + + double *A = (double *)malloc(ni * nk * sizeof(double)); + double *B = (double *)malloc(nk * nj * sizeof(double)); + double *C = (double *)malloc(ni * nj * sizeof(double)); + + double alpha = 32412; + double beta = 2123; + + init_array(ni, nj, nk, C, A, B); + + double *dev_A; + double *dev_B; + double *dev_C; + cudaMalloc(&dev_A, ni * nk * sizeof(double)); + cudaMalloc(&dev_B, nk * nj * sizeof(double)); + cudaMalloc(&dev_C, ni * nj * sizeof(double)); + cudaMemcpy(dev_A, A, ni * nk * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_B, B, nk * nj * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_C, C, ni * nj * sizeof(double), cudaMemcpyHostToDevice); + + kernel(ni, nj, nk, alpha, beta, dev_C, dev_A, dev_B); + cudaMemcpy(C, dev_C, ni * nj * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(ni, nj, C); + + free((void *)C); + free((void *)A); + free((void *)B); + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/gemver/gemver.cu b/tools/cgeist/Test/CUDA/polybench-cuda/gemver/gemver.cu new file mode 100644 index 000000000000..264f168e7185 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/gemver/gemver.cu @@ -0,0 +1,190 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 +// clang-format on +/** + * gemver.c: This file is part of the PolyBench 3.0 test suite. 
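// Illustration only, not part of the test sources: kernel_dev above computes
// C = alpha * A * B + beta * C with one (i, j) element per thread, keeping the
// running sum in the register `dot` and writing C[i * nj + j] exactly once. A
// hypothetical sequential reference for checking the result:
static void ref_gemm(int ni, int nj, int nk, double alpha, double beta,
                     double *C, const double *A, const double *B) {
  for (int i = 0; i < ni; i++)
    for (int j = 0; j < nj; j++) {
      double dot = C[i * nj + j] * beta;
      for (int k = 0; k < nk; k++)
        dot += alpha * A[i * nk + k] * B[k * nj + j];
      C[i * nj + j] = dot;
    }
}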
+ * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +#define RUN 50 + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_A(int n, double alpha, double beta, double *A, + double *u1, double *v1, double *u2, double *v2, + double *w, double *x, double *y, double *z) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < n && j < n) + A[i * n + j] += u1[i] * v1[j] + u2[i] * v2[j]; +} + +__global__ void kernel_x(int n, double alpha, double beta, double *A, + double *u1, double *v1, double *u2, double *v2, + double *w, double *x, double *y, double *z) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < n) { + for (int j = 0; j < n; j++) + x[i] += beta * A[j * n + i] * y[j]; + } +} + +__global__ void kernel_y(int n, double alpha, double beta, double *A, + double *u1, double *v1, double *u2, double *v2, + double *w, double *x, double *y, double *z) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < n) + x[i] += z[i]; +} + +__global__ void kernel_w(int n, double alpha, double beta, double *A, + double *u1, double *v1, double *u2, double *v2, + double *w, double *x, double *y, double *z) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < n) { + for (int j = 0; j < n; j++) + w[i] += alpha * A[i * n + j] * x[j]; + } +} + +static void kernel(int n, double alpha, double beta, double *A, double *u1, + double *v1, double *u2, double *v2, double *w, double *x, + double *y, double *z) { + + const unsigned threadsPerBlock = 256; + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(n, block.x), num_blocks(n, block.y), 1); + kernel_A<<>>(n, alpha, beta, A, u1, v1, u2, v2, w, x, y, z); + } + + kernel_x<<>>( + n, alpha, beta, A, u1, v1, u2, v2, w, x, y, z); + kernel_y<<>>( + n, alpha, beta, A, u1, v1, u2, v2, w, x, y, z); + kernel_w<<>>( + n, alpha, beta, A, u1, v1, u2, v2, w, x, y, z); +} + +/* Array initialization. */ +static void init_array(int n, double *A, double *u1, double *v1, double *u2, + double *v2, double *w, double *x, double *y, double *z) { + int i, j; + + for (i = 0; i < n; i++) { + u1[i] = i; + u2[i] = (i + 1) / n / 2.0; + v1[i] = (i + 1) / n / 4.0; + v2[i] = (i + 1) / n / 6.0; + y[i] = (i + 1) / n / 8.0; + z[i] = (i + 1) / n / 9.0; + x[i] = 0.0; + w[i] = 0.0; + for (j = 0; j < n; j++) + A[i * n + j] = ((double)i * j) / n; + } +} + +/* DCE code. Must scan the entire live-out data. + Can be used also to check the correctness of the output. */ +static void print_array(int n, double *w) { + int i; + + for (i = 0; i < n; i++) { + fprintf(stderr, "%0.2lf ", w[i]); + if (i % 20 == 0) + fprintf(stderr, "\n"); + } +} + +int main(int argc, char **argv) { + /* Retrieve problem size. */ + int n = atoi(argv[2]); + int dump_code = atoi(argv[1]); + + /* Variable declaration/allocation. */ + double *A = (double *)malloc(n * n * sizeof(double)); + double *u1 = (double *)malloc(n * sizeof(double)); + double *v1 = (double *)malloc(n * sizeof(double)); + double *u2 = (double *)malloc(n * sizeof(double)); + double *v2 = (double *)malloc(n * sizeof(double)); + double *w = (double *)malloc(n * sizeof(double)); + double *x = (double *)malloc(n * sizeof(double)); + double *y = (double *)malloc(n * sizeof(double)); + double *z = (double *)malloc(n * sizeof(double)); + + double alpha = 43532; + double beta = 12313; + /* Initialize array(s). 
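// Illustration only, not part of the test sources: the four kernels above
// implement the GEMVER sequence: a rank-two update A += u1*v1^T + u2*v2^T,
// then x += beta * A^T * y, x += z, and finally w += alpha * A * x. A
// hypothetical sequential reference with the same row-major indexing:
static void ref_gemver(int n, double alpha, double beta, double *A,
                       const double *u1, const double *v1, const double *u2,
                       const double *v2, double *w, double *x,
                       const double *y, const double *z) {
  for (int i = 0; i < n; i++)
    for (int j = 0; j < n; j++)
      A[i * n + j] += u1[i] * v1[j] + u2[i] * v2[j];
  for (int i = 0; i < n; i++)
    for (int j = 0; j < n; j++)
      x[i] += beta * A[j * n + i] * y[j];
  for (int i = 0; i < n; i++)
    x[i] += z[i];
  for (int i = 0; i < n; i++)
    for (int j = 0; j < n; j++)
      w[i] += alpha * A[i * n + j] * x[j];
}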
*/ + init_array(n, A, u1, v1, u2, v2, w, x, y, z); + + double *dev_A; + double *dev_u1; + double *dev_v1; + double *dev_u2; + double *dev_v2; + double *dev_w; + double *dev_x; + double *dev_y; + double *dev_z; + double *dev_alpha; + double *dev_beta; + cudaMalloc(&dev_A, n * n * sizeof(double)); + cudaMalloc(&dev_u1, n * sizeof(double)); + cudaMalloc(&dev_v1, n * sizeof(double)); + cudaMalloc(&dev_u2, n * sizeof(double)); + cudaMalloc(&dev_v2, n * sizeof(double)); + cudaMalloc(&dev_w, n * sizeof(double)); + cudaMalloc(&dev_x, n * sizeof(double)); + cudaMalloc(&dev_y, n * sizeof(double)); + cudaMalloc(&dev_z, n * sizeof(double)); + cudaMalloc(&dev_alpha, sizeof(double)); + cudaMalloc(&dev_beta, sizeof(double)); + cudaMemcpy(dev_A, A, n * n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_u1, u1, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_v1, v1, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_u2, u2, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_v2, v2, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_w, w, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_x, x, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_y, y, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_z, z, n * sizeof(double), cudaMemcpyHostToDevice); + + /* Run kernel. */ + kernel(n, alpha, beta, dev_A, dev_u1, dev_v1, dev_u2, dev_v2, dev_w, dev_x, + dev_y, dev_z); + cudaMemcpy(w, dev_w, n * sizeof(double), cudaMemcpyDeviceToHost); + + /* Prevent dead-code elimination. All live-out data must be printed + by the function call in argument. */ + if (dump_code == 1) + print_array(n, w); + + /* Be clean. */ + free((void *)A); + free((void *)u1); + free((void *)v1); + free((void *)u2); + free((void *)v2); + free((void *)w); + free((void *)x); + free((void *)y); + free((void *)z); + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/gesummv/gesummv.cu b/tools/cgeist/Test/CUDA/polybench-cuda/gesummv/gesummv.cu new file mode 100644 index 000000000000..ac4e1a09442e --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/gesummv/gesummv.cu @@ -0,0 +1,124 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 +// clang-format on +/** + * gesummv.c: This file is part of the PolyBench 3.0 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +#define RUN 100 + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_y(int n, double alpha, double beta, double *A, double *B, + double tmp[], double x[], double y[]) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < n) { + tmp[i] = 0; + y[i] = 0; + for (int j = 0; j < n; j++) { + tmp[i] += A[i * n + j] * x[j]; + y[i] += B[i * n + j] * x[j]; + } + y[i] = alpha * tmp[i] + beta * y[i]; + } +} + +/* Array initialization. */ +static void init_array(int n, double *A, double *B, double *x) { + int i, j; + + for (i = 0; i < n; i++) { + x[i] = ((double)i) / n; + for (j = 0; j < n; j++) { + A[i * n + j] = ((double)i * j) / n; + B[i * n + j] = ((double)i * j) / n; + } + } +} + +/* DCE code. Must scan the entire live-out data. + Can be used also to check the correctness of the output. 
*/ +static void print_array(int n, double *y) + +{ + int i; + + for (i = 0; i < n; i++) { + fprintf(stderr, "%0.2lf ", y[i]); + if (i % 20 == 0) + fprintf(stderr, "\n"); + } +} + +int main(int argc, char **argv) { + /* Retrieve problem size. */ + int n = atoi(argv[2]); + int dump_code = atoi(argv[1]); + + /* Variable declaration/allocation. */ + double *A = (double *)malloc(n * n * sizeof(double)); + double *B = (double *)malloc(n * n * sizeof(double)); + double *tmp = (double *)malloc(n * sizeof(double)); + double *x = (double *)malloc(n * sizeof(double)); + double *y = (double *)malloc(n * sizeof(double)); + + //__builtin_assume(n>0); + //__builtin_assume(n<0x7FFFFFFE); + /* Initialize array(s). */ + init_array(n, A, B, x); + double alpha = 43532; + double beta = 12313; + + double *dev_A; + double *dev_B; + double *dev_tmp; + double *dev_x; + double *dev_y; + cudaMalloc(&dev_A, n * n * sizeof(double)); + cudaMalloc(&dev_B, n * n * sizeof(double)); + cudaMalloc(&dev_tmp, n * sizeof(double)); + cudaMalloc(&dev_x, n * sizeof(double)); + cudaMalloc(&dev_y, n * sizeof(double)); + cudaMemcpy(dev_A, A, n * n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_B, B, n * n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_tmp, tmp, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_x, x, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_y, y, n * sizeof(double), cudaMemcpyHostToDevice); + /* Run kernel. */ + + const unsigned threadsPerBlock = 256; + kernel_y<<>>( + n, alpha, beta, dev_A, dev_B, dev_tmp, dev_x, dev_y); + cudaMemcpy(y, dev_y, n * sizeof(double), cudaMemcpyDeviceToHost); + + /* Prevent dead-code elimination. All live-out data must be printed + by the function call in argument. */ + if (dump_code == 1) + print_array(n, y); + + /* Be clean. 
*/ + free((void *)A); + free((void *)B); + free((void *)tmp); + free((void *)x); + free((void *)y); + + cudaFree((void *)dev_A); + cudaFree((void *)dev_B); + cudaFree((void *)dev_tmp); + cudaFree((void *)dev_x); + cudaFree((void *)dev_y); + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/heat-3d/heat-3d.cu b/tools/cgeist/Test/CUDA/polybench-cuda/heat-3d/heat-3d.cu new file mode 100644 index 000000000000..11e0fe5a4287 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/heat-3d/heat-3d.cu @@ -0,0 +1,96 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 10 +// clang-format on +#include +#include +#include +#include +#include + +static void init_array(int n, double *A, double *B) { + int i, j, k; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) + for (k = 0; k < n; k++) { + A[(i * n + j) * n + k] = ((double)i + j + k) / n; + B[(i * n + j) * n + k] = ((double)i + j + k + 1) / n; + } +} + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_stencil(int iter, double *A, double *B) { + int i = blockDim.x * blockIdx.x + threadIdx.x + 1; + int j = blockDim.y * blockIdx.y + threadIdx.y + 1; + int k = blockDim.z * blockIdx.z + threadIdx.z + 1; + + if (i < iter - 1 && j < iter - 1 && k < iter - 1) { + B[(i * iter + j) * iter + k] = + (A[((i + 1) * iter + j) * iter + k] - 2 * A[(i * iter + j) * iter + k] + + A[((i - 1) * iter + j) * iter + k]) / + 8 + + (A[(i * iter + (j + 1)) * iter + k] - 2 * A[(i * iter + j) * iter + k] + + A[(i * iter + (j - 1)) * iter + k]) / + 8 + + (A[(i * iter + j) * iter + k + 1] - 2 * A[(i * iter + j) * iter + k] + + A[(i * iter + j) * iter + k - 1]) / + 8 + + A[(i * iter + j) * iter + k]; + } +} + +static void kernel(int tsteps, int iter, double *A, double *B) { + const unsigned int threadsPerBlock = 256; + + for (int t = 1; t <= tsteps; t++) { + dim3 block(1, threadsPerBlock / 32, 32); + dim3 grid(num_blocks(iter - 2, block.x), num_blocks(iter - 2, block.y), + num_blocks(iter - 2, block.z)); + kernel_stencil<<>>(iter, A, B); + kernel_stencil<<>>(iter, B, A); + } +} + +static void print_array(int n, double *A) { + int i, j, k; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) + for (k = 0; k < n; k++) { + fprintf(stderr, "%0.2lf ", A[(i * n + j) * n + k]); + if (((i * n + j) * n + k) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + + int dump_code = atoi(argv[1]); + int tsteps = atoi(argv[2]); + int n = atoi(argv[3]); + + double *A = (double *)malloc(n * n * n * sizeof(double)); + double *B = (double *)malloc(n * n * n * sizeof(double)); + + init_array(n, A, B); + + double *dev_A; + double *dev_B; + cudaMalloc(&dev_A, n * n * n * sizeof(double)); + cudaMalloc(&dev_B, n * n * n * sizeof(double)); + cudaMemcpy(dev_A, A, n * n * n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_B, B, n * n * n * sizeof(double), cudaMemcpyHostToDevice); + + kernel(tsteps, n, dev_A, dev_B); + cudaMemcpy(A, dev_A, n * n * n * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(n, A); + + free((void *)A); + free((void *)B); +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/jacobi-1d-imper/jacobi-1d-imper.cu b/tools/cgeist/Test/CUDA/polybench-cuda/jacobi-1d-imper/jacobi-1d-imper.cu new file mode 100644 index 000000000000..6cdf5544de3c --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/jacobi-1d-imper/jacobi-1d-imper.cu @@ -0,0 +1,98 @@ +// 
clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 5 10 +// clang-format on +/** + * jacobi-1d-imper.c: This file is part of the PolyBench 3.0 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_stencil(int n, double A[], double B[]) { + int i = blockDim.x * blockIdx.x + threadIdx.x + 1; + + if (i < n - 1) { + B[i] = (A[i - 1] + A[i] + A[i + 1]) / 3; + } +} + +static void kernel(int tsteps, int n, double A[], double B[]) { + const unsigned int threadsPerBlock = 256; + + for (int t = 1; t <= tsteps; t++) { + kernel_stencil<<>>(n, A, + B); + kernel_stencil<<>>(n, B, + A); + } +} + +/* Array initialization. */ +static void init_array(int n, double *A, double *B) { + int i; + + for (i = 0; i < n; i++) { + A[i] = ((double)i + 2) / n; + B[i] = ((double)i + 3) / n; + } +} + +/* DCE code. Must scan the entire live-out data. + Can be used also to check the correctness of the output. */ +static void print_array(int n, double *A) + +{ + int i; + + for (i = 0; i < n; i++) { + fprintf(stderr, "%0.2lf ", A[i]); + if (i % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + /* Retrieve problem size. */ + int n = atoi(argv[3]); + int tsteps = atoi(argv[2]); + int dump_code = atoi(argv[1]); + + /* Variable declaration/allocation. */ + double *A = (double *)malloc(n * sizeof(double)); + double *B = (double *)malloc(n * sizeof(double)); + + init_array(n, A, B); + + double *dev_A; + double *dev_B; + cudaMalloc(&dev_A, n * sizeof(double)); + cudaMalloc(&dev_B, n * sizeof(double)); + cudaMemcpy(dev_A, A, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_B, B, n * sizeof(double), cudaMemcpyHostToDevice); + + /* Run kernel. */ + kernel(tsteps, n, dev_A, dev_B); + cudaMemcpy(A, dev_A, n * sizeof(double), cudaMemcpyDeviceToHost); + + /* Prevent dead-code elimination. All live-out data must be printed + by the function call in argument. */ + if (dump_code == 1) + print_array(n, A); + + /* Be clean. */ + free((void *)A); + free((void *)B); + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/jacobi-2d-imper/jacobi-2d-imper.cu b/tools/cgeist/Test/CUDA/polybench-cuda/jacobi-2d-imper/jacobi-2d-imper.cu new file mode 100644 index 000000000000..4ea7bb037d77 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/jacobi-2d-imper/jacobi-2d-imper.cu @@ -0,0 +1,101 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 5 +// clang-format on +/** + * jacobi-2d-imper.c: This file is part of the PolyBench/C 3.2 test suite. 
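+ * (The kernel below applies a 5-point Jacobi averaging stencil to the interior points; each time step launches it twice, ping-ponging between A and B.)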
+ * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_stencil(int n, double *A, double *B) { + int i = blockDim.x * blockIdx.x + threadIdx.x + 1; + int j = blockDim.y * blockIdx.y + threadIdx.y + 1; + + if (i < n - 1 && j < n - 1) { + B[i * n + j] = (A[i * n + j] + A[i * n + j - 1] + A[i * n + 1 + j] + + A[(1 + i) * n + j] + A[(i - 1) * n + j]) / + 5; + } +} + +static void kernel(int tsteps, int n, double *A, double *B) { + const unsigned int threadsPerBlock = 256; + + for (int t = 1; t <= tsteps; t++) { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(n - 2, block.x), num_blocks(n - 2, block.y), 1); + kernel_stencil<<>>(n, A, B); + kernel_stencil<<>>(n, B, A); + } +} + +static void init_array(int n, double *A, double *B) { + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) { + A[i * n + j] = ((double)i * (j + 2) + 2) / n; + B[i * n + j] = ((double)i * (j + 3) + 3) / n; + } +} + +static void print_array(int n, double *A) + +{ + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) { + fprintf(stderr, "%0.2lf ", A[i * n + j]); + if ((i * n + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + + int dump_code = atoi(argv[1]); + int n = atoi(argv[2]); + int tsteps = atoi(argv[3]); + + double *A = (double *)malloc(n * n * sizeof(double)); + double *B = (double *)malloc(n * n * sizeof(double)); + + // __builtin_assume(tsteps>0); + // __builtin_assume(n>2); + init_array(n, A, B); + + double *dev_A; + double *dev_B; + cudaMalloc(&dev_A, n * n * sizeof(double)); + cudaMalloc(&dev_B, n * n * sizeof(double)); + cudaMemcpy(dev_A, A, n * n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_B, B, n * n * sizeof(double), cudaMemcpyHostToDevice); + + kernel(tsteps, n, dev_A, dev_B); + cudaMemcpy(A, dev_A, n * n * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(n, A); + + free((void *)A); + ; + free((void *)B); + ; + cudaFree((void *)dev_A); + cudaFree((void *)dev_B); + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/lu/lu.cu b/tools/cgeist/Test/CUDA/polybench-cuda/lu/lu.cu new file mode 100644 index 000000000000..4406bdb7ffcd --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/lu/lu.cu @@ -0,0 +1,96 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 100 +// clang-format on +/** + * lu.c: This file is part of the PolyBench/C 3.2 test suite. 
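+ * (The kernels below perform an in-place LU factorization without pivoting: for each column k, kernel_div scales the entries below the diagonal by 1/A[k][k], then kernel_A updates the trailing submatrix.)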
+ * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +static void init_array(int n, double *A) { + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) + A[i * n + j] = ((double)(i + 1) * (j + 1)) / n; +} + +static void print_array(int n, double *A) + +{ + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) { + fprintf(stderr, "%0.2lf ", A[i * n + j]); + if ((i * n + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_div(int n, double *A, int k) { + int i = blockDim.x * blockIdx.x + threadIdx.x + k + 1; + + if (i < n) + A[i * n + k] /= A[k * n + k]; +} + +__global__ void kernel_A(int n, double *A, int k) { + int i = blockDim.x * blockIdx.x + threadIdx.x + k + 1; + int j = blockDim.y * blockIdx.y + threadIdx.y + k + 1; + + if (i < n && j < n) + A[i * n + j] -= A[i * n + k] * A[k * n + j]; +} + +static void kernel(int n, double *A) { + const unsigned int threadsPerBlock = 256; + + for (int iter = 0; iter < n - 1; iter++) { + kernel_div<<>>(n, A, iter); + + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(n - (iter + 1), block.x), + num_blocks(n - (iter + 1), block.y), 1); + kernel_A<<>>(n, A, iter); + } +} + +int main(int argc, char **argv) { + + int dump_code = atoi(argv[1]); + int n = atoi(argv[2]); + + double *A = (double *)malloc(n * n * sizeof(double)); + + init_array(n, A); + + double *dev_A; + cudaMalloc(&dev_A, n * n * sizeof(double)); + cudaMemcpy(dev_A, A, n * n * sizeof(double), cudaMemcpyHostToDevice); + + kernel(n, dev_A); + + cudaMemcpy(A, dev_A, n * n * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(n, A); + + free((void *)A); + ; + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/mvt/mvt.cu b/tools/cgeist/Test/CUDA/polybench-cuda/mvt/mvt.cu new file mode 100644 index 000000000000..20cff302c13e --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/mvt/mvt.cu @@ -0,0 +1,141 @@ +// clang-format off +// COM: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm +// RUN: true +// clang-format on +/** + * mvt.c: This file is part of the PolyBench 3.0 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include +#include + +/* Include benchmark-specific header. */ +/* Default data type is double, default size is 4000. */ +#define RUN 200 +#define N 15000 +// #define N 40 + +/* Array initialization. */ +static void init_array(int n, double *x1, double *x2, double *y_1, double *y_2, + double *A) { + int i, j; + + for (i = 0; i < n; i++) { + x1[i] = ((double)i) / n; + x2[i] = ((double)i + 1) / n; + y_1[i] = ((double)i + 3) / n; + y_2[i] = ((double)i + 4) / n; + for (j = 0; j < n; j++) + A[i * n + j] = ((double)i * j) / n; + } +} + +/* DCE code. Must scan the entire live-out data. + Can be used also to check the correctness of the output. 
*/ +static void print_array(int n, double *x1, double *x2) + +{ + int i; + + for (i = 0; i < n; i++) { + fprintf(stderr, "%0.2lf", x1[i]); + fprintf(stderr, "%0.2lf", x2[i]); + if (i % 20 == 0) + fprintf(stderr, "\n"); + } +} + +__global__ void kernel_x1(int n, double *x1, double *x2, double *y_1, + double *y_2, double *A) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j; + if (i < n) { + for (j = 0; j < n; j++) + x1[i] += A[i * n + j] * y_1[j]; + } +} + +__global__ void kernel_x2(int n, double *x1, double *x2, double *y_1, + double *y_2, double *A) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j; + if (i < n) { + for (j = 0; j < n; j++) + x2[i] += A[j * n + i] * y_2[j]; + } +} + +short num_blocks(short num, short factor) { + return (num + factor - 1) / factor; +} + +void kernel(int n, double *x1, double *x2, double *y_1, double *y_2, + double *A) { + short threadsPerBlock = 256; + + kernel_x1<<>>(n, x1, x2, y_1, + y_2, A); + kernel_x2<<>>(n, x1, x2, y_1, + y_2, A); +} + +int main(int argc, char **argv) { + /* Retrieve problem size. */ + int dump_code = atoi(argv[1]); + int n = N; + + /* Variable declaration/allocation. */ + double *A = (double *)malloc(N * N * sizeof(double)); + double *x1 = (double *)malloc(sizeof(double) * n); + double *x2 = (double *)malloc(sizeof(double) * n); + double *y_1 = (double *)malloc(sizeof(double) * n); + double *y_2 = (double *)malloc(sizeof(double) * n); + + /* Initialize array(s). */ + init_array(n, x1, x2, y_1, y_2, A); + + double *dev_A; + double *dev_x1; + double *dev_x2; + double *dev_y_1; + double *dev_y_2; + + cudaMalloc(&dev_A, n * n * sizeof(double)); + cudaMalloc(&dev_x1, n * sizeof(double)); + cudaMalloc(&dev_x2, n * sizeof(double)); + cudaMalloc(&dev_y_1, n * sizeof(double)); + cudaMalloc(&dev_y_2, n * sizeof(double)); + + cudaMemcpy(dev_A, A, n * n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_x1, x1, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_x2, x2, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_y_1, y_1, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_y_2, y_2, n * sizeof(double), cudaMemcpyHostToDevice); + + /* Run kernel. */ + kernel(n, dev_x1, dev_x2, dev_y_1, dev_y_2, dev_A); + + cudaMemcpy(x1, dev_x1, n * sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(x2, dev_x2, n * sizeof(double), cudaMemcpyDeviceToHost); + + /* Prevent dead-code elimination. All live-out data must be printed + by the function call in argument. */ + if (dump_code == 1) + print_array(n, x1, x2); + + /* Be clean. */ + free((void *)A); + free((void *)x1); + free((void *)x2); + free((void *)y_1); + free((void *)y_2); + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/nussinov/nussinov.cu b/tools/cgeist/Test/CUDA/polybench-cuda/nussinov/nussinov.cu new file mode 100644 index 000000000000..53828dde253b --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/nussinov/nussinov.cu @@ -0,0 +1,111 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 +// clang-format on +#include +#include +#include +#include +#include + +#define match(b1, b2) (((b1) + (b2)) == 3 ? 
1 : 0) + +static void init_array(int n, double *table, double *oldtable, double *seq) { + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) { + table[i * n + j] = ((double)i * j) / n; + oldtable[i * n + j] = ((double)i * j) / n; + } + for (i = 0; i < n; i++) + seq[i] = ((double)i) / n; +} + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +// Dynamic programming wavefront +__global__ void kernel_max_score(int n, double *seq, double *table, + double *oldtable, int w) { + int j = blockDim.x * blockIdx.x + threadIdx.x; + int i = ((int)n - 1) + j - w; + + if (0 <= i && i < n && i + 1 <= j && j < n) { + double maximum = table[i * n + j]; + + if (j - 1 >= 0) + maximum = max(maximum, table[i * n + (j - 1)]); + if (i + 1 < n) + maximum = max(maximum, table[(i + 1) * n + j]); + + if (j - 1 >= 0 && i + 1 < n) { + auto upd = table[(i + 1) * n + (j - 1)]; + + /* don't allow adjacent elements to bond */ + if (i < j - 1) + upd += (seq[i] + seq[j] == 3) ? (double)1 : (double)0; + + maximum = max(maximum, upd); + } + + for (int k = i + 1; k < j; k++) + maximum = max(maximum, table[i * n + k] + table[(k + 1) * n + j]); + + // AtomicMax::set_if_larger(table[i * n + j], maximum); + table[i * n + j] = maximum; + } +} + +static void kernel(int n, double *seq, double *table, double *oldtable) { + const unsigned threadsPerBlock = 32; + + for (int w = n; w < 2 * n - 1; ++w) { // wavefronting + kernel_max_score<<>>( + n, seq, table, oldtable, w); + } +} + +static void print_array(int n, double *table) { + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) { + fprintf(stderr, "%0.2lf ", table[i * n + j]); + if ((i * n + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + + int dump_code = atoi(argv[1]); + int n = atoi(argv[2]); + double *table = (double *)malloc(n * n * sizeof(double)); + double *oldtable = (double *)malloc(n * n * sizeof(double)); + double *seq = (double *)malloc(n * sizeof(double)); + + init_array(n, table, oldtable, seq); + + double *dev_table; + double *dev_oldtable; + double *dev_seq; + cudaMalloc(&dev_table, n * n * sizeof(double)); + cudaMalloc(&dev_oldtable, n * n * sizeof(double)); + cudaMalloc(&dev_seq, n * sizeof(double)); + cudaMemcpy(dev_table, table, n * n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_oldtable, oldtable, n * n * sizeof(double), + cudaMemcpyHostToDevice); + cudaMemcpy(dev_seq, seq, n * sizeof(double), cudaMemcpyHostToDevice); + + kernel(n, dev_seq, dev_table, dev_oldtable); + cudaMemcpy(table, dev_table, n * n * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(n, table); + + free((void *)table); + free((void *)oldtable); + free((void *)seq); +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/reduction/reduction.cu b/tools/cgeist/Test/CUDA/polybench-cuda/reduction/reduction.cu new file mode 100644 index 000000000000..01d6e40e28bd --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/reduction/reduction.cu @@ -0,0 +1,89 @@ +// clang-format off +// COM: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 10 +// RUN: true +// clang-format on +// This program performs sum reduction with an optimization +// removing warp divergence +// By: Nick from CoffeeBeforeArch + +#include +#include +#include +#include +#include + +#define SHMEM_SIZE 256 + +__global__ void sumReduction(int *v, int *v_r) { + // Allocate shared memory + __shared__ int partial_sum[SHMEM_SIZE]; + + // Calculate thread ID 
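+ // (e.g. with blockDim.x = 256, blockIdx.x = 2 and threadIdx.x = 3, the thread below loads element 2 * 256 + 3 = 515 of v)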
+ int tid = blockIdx.x * blockDim.x + threadIdx.x; + + // Load elements into shared memory + partial_sum[threadIdx.x] = v[tid]; + __syncthreads(); + + // Increase the stride of the access until we exceed the CTA dimensions + for (int s = 1; s < blockDim.x; s *= 2) { + // Change the indexing to be sequential threads + int index = 2 * s * threadIdx.x; + + // Each thread does work unless the index goes off the block + if (index < blockDim.x) { + partial_sum[index] += partial_sum[index + s]; + } + __syncthreads(); + } + + // Let thread 0 for this block write its result to main memory + // Result is indexed by this block + if (threadIdx.x == 0) { + v_r[blockIdx.x] = partial_sum[0]; + } +} + +void initialize_vector(int *v, int n) { + for (int i = 0; i < n; i++) { + v[i] = 1; // rand() % 10; + } +} + +int main() { + // Vector size + int N = 1 << 16; + int bytes = N * sizeof(int); + + // Original vector and result vector + int *h_v, *h_v_r; + int *d_v, *d_v_r; + + // Allocate host memory + h_v = (int *)malloc(bytes); + h_v_r = (int *)malloc(bytes); + + // Initialize vector + initialize_vector(h_v, N); + + // Allocate device memory + cudaMalloc(&d_v, bytes); + cudaMalloc(&d_v_r, bytes); + + // Copy to device + cudaMemcpy(d_v, h_v, bytes, cudaMemcpyHostToDevice); + + // TB Size + const int TB_SIZE = 256; + + // Grid Size (No padding) + int GRID_SIZE = N / TB_SIZE; + + // Call kernels + sumReduction<<<GRID_SIZE, TB_SIZE>>>(d_v, d_v_r); + + sumReduction<<<1, TB_SIZE>>>(d_v_r, d_v_r); + + // Copy to host + cudaMemcpy(h_v_r, d_v_r, bytes, cudaMemcpyDeviceToHost); + + printf("COMPLETED SUCCESSFULLY %d\n", h_v_r[0]); + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/seidel-2d/seidel-2d.cu b/tools/cgeist/Test/CUDA/polybench-cuda/seidel-2d/seidel-2d.cu new file mode 100644 index 000000000000..a691858ee476 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/seidel-2d/seidel-2d.cu @@ -0,0 +1,91 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 2 +// clang-format on +/** + * seidel-2d.c: This file is part of the PolyBench/C 3.2 test suite.
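+ * (The kernel below performs an in-place 9-point neighborhood average over the interior points; the host loop launches it once per time step.)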
+ * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +static void init_array(int n, double *A) { + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) + A[i * n + j] = ((double)i * (j + 2) + 2) / n; +} + +static void print_array(int n, double *A) + +{ + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) { + fprintf(stderr, "%0.2lf ", A[i * n + j]); + if ((i * n + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_stencil(int tsteps, int n, double *A) { + int i = blockDim.x * blockIdx.x + threadIdx.x + 1; + int j = blockDim.y * blockIdx.y + threadIdx.y + 1; + + if (i < n - 1 && j < n - 1) + A[i * n + j] = + (A[(i - 1) * n + j - 1] + A[(i - 1) * n + j] + A[(i - 1) * n + j + 1] + + A[i * n + j - 1] + A[i * n + j] + A[i * n + j + 1] + + A[(i + 1) * n + j - 1] + A[(i + 1) * n + j] + A[(i + 1) * n + j + 1]) / + 9; +} + +static void kernel(int tsteps, int n, double *A) { + // FIXME: Parallelizing this should give different results + const unsigned int threadsPerBlock = 256; + + for (int t = 1; t <= tsteps; t++) { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(n - 2, block.x), num_blocks(n - 2, block.y), 1); + kernel_stencil<<>>(tsteps, n, A); + } +} + +int main(int argc, char **argv) { + + int dump_code = atoi(argv[1]); + int n = atoi(argv[2]); + int tsteps = atoi(argv[3]); + + double *A = (double *)malloc(n * n * sizeof(double)); + + init_array(n, A); + + double *dev_A; + cudaMalloc(&dev_A, n * n * sizeof(double)); + cudaMemcpy(dev_A, A, n * n * sizeof(double), cudaMemcpyHostToDevice); + + kernel(tsteps, n, dev_A); + + cudaMemcpy(A, dev_A, n * n * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(n, A); + + free((void *)A); + ; + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/symm/symm.cu b/tools/cgeist/Test/CUDA/polybench-cuda/symm/symm.cu new file mode 100644 index 000000000000..d8c462e808ee --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/symm/symm.cu @@ -0,0 +1,147 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 +// clang-format on +/** + * symm.c: This file is part of the PolyBench/C 3.2 test suite. 
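+ * (SYMM-style update: C = alpha*A*B + beta*C with A treated as symmetric; only A's lower triangle, including the diagonal, is referenced by the three kernels below.)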
+ * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +static void init_array(int ni, int nj, double *C, double *A, double *B, + double *tmp) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) { + C[i * nj + j] = ((double)i * j) / ni; + B[i * nj + j] = ((double)i * j) / ni; + tmp[i * nj + j] = 0; + } + for (i = 0; i < nj; i++) + for (j = 0; j < nj; j++) + A[i * nj + j] = ((double)i * j) / ni; +} + +static void print_array(int ni, int nj, double *C) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) { + fprintf(stderr, "%0.2lf ", C[i * nj + j]); + if ((i * ni + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_tmp(int m, int n, double alpha, double beta, double *C, + double *A, double *B, double *tmp) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < m && j < n) { + tmp[i * n + j] = 0; + for (int k = 0; k < i; k++) + tmp[i * n + j] += B[k * n + j] * A[i * n + k]; + } +} + +__global__ void kernel_C(int m, int n, double alpha, double beta, double *C, + double *A, double *B, double *tmp) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < m && j < n) + C[i * n + j] = beta * C[i * n + j] + alpha * B[i * n + j] * A[i * n + i] + + alpha * tmp[i * n + j]; +} + +__global__ void kernel_sum(int m, int n, double alpha, double beta, double *C, + double *A, double *B, double *tmp) { + int k = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (k < m - 1 && j < n) { + for (int i = k + 1; i < m; i++) + C[k * n + j] += alpha * B[i * n + j] * A[i * n + k]; + } +} + +static void kernel(int m, int n, double alpha, double beta, double *C, + double *A, double *B, double *tmp) { + const unsigned int threadsPerBlock = 256; + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(m, block.x), num_blocks(n, block.y), 1); + kernel_tmp<<>>(m, n, alpha, beta, C, A, B, tmp); + } + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(m, block.x), num_blocks(n, block.y), 1); + kernel_C<<>>(m, n, alpha, beta, C, A, B, tmp); + } + + // TODO: Combine both kernels? 
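+ // (kernel_tmp and kernel_C cover the terms from A[i][k] with k <= i via tmp and the diagonal; kernel_sum adds the remaining symmetric terms A[i][k] with i > k to row k of C.)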
+ { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(m - 1, block.x), num_blocks(n, block.y), 1); + kernel_sum<<>>(m, n, alpha, beta, C, A, B, tmp); + } +} + +int main(int argc, char **argv) { + int dump_code = atoi(argv[1]); + int ni = atoi(argv[2]); + int nj = atoi(argv[3]); + + double *A = (double *)malloc(nj * nj * sizeof(double)); + double *B = (double *)malloc(ni * nj * sizeof(double)); + double *C = (double *)malloc(ni * nj * sizeof(double)); + double *tmp = (double *)malloc(ni * nj * sizeof(double)); + + double alpha = 32412; + double beta = 2123; + + init_array(ni, nj, C, A, B, tmp); + + double *dev_A; + double *dev_B; + double *dev_C; + double *dev_tmp; + cudaMalloc(&dev_A, nj * nj * sizeof(double)); + cudaMalloc(&dev_B, ni * nj * sizeof(double)); + cudaMalloc(&dev_C, ni * nj * sizeof(double)); + cudaMalloc(&dev_tmp, ni * nj * sizeof(double)); + cudaMemcpy(dev_A, A, nj * nj * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_B, B, ni * nj * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_C, C, ni * nj * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_tmp, tmp, ni * nj * sizeof(double), cudaMemcpyHostToDevice); + + kernel(ni, nj, alpha, beta, dev_C, dev_A, dev_B, dev_tmp); + cudaMemcpy(C, dev_C, ni * nj * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(ni, nj, C); + + free((void *)C); + ; + free((void *)A); + ; + free((void *)B); + ; + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/syr2k/syr2k.cu b/tools/cgeist/Test/CUDA/polybench-cuda/syr2k/syr2k.cu new file mode 100644 index 000000000000..459997bb7fc5 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/syr2k/syr2k.cu @@ -0,0 +1,122 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 +// clang-format on +/** + * syr2k.c: This file is part of the PolyBench/C 3.2 test suite. 
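+ * (SYR2K update restricted to the lower triangle (j <= i): the first kernel scales C by beta, the second accumulates alpha*(A*B^T + B*A^T).)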
+ * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_beta(int n, int m, double alpha, double beta, double *C, + double *A, double *B) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < n && j <= i) + C[i * n + j] *= beta; +} + +__global__ void kernel_product(int n, int m, double alpha, double beta, + double *C, double *A, double *B) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < n && j <= i) { + for (int k = 0; k < m; k++) + C[i * n + j] += A[j * m + k] * alpha * B[i * m + k] + + B[j * m + k] * alpha * A[i * m + k]; + } +} + +static void kernel(int n, int m, double alpha, double beta, double *C, + double *A, double *B) { + const unsigned int threadsPerBlock = 256; + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(n, block.x), num_blocks(n, block.y), 1); + kernel_beta<<>>(n, m, alpha, beta, C, A, B); + } + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(n, block.x), num_blocks(n, block.y), 1); + kernel_product<<>>(n, m, alpha, beta, C, A, B); + } +} + +static void init_array(int ni, int nj, double *C, double *A, double *B) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) { + A[i * nj + j] = ((double)i * j) / ni; + B[i * nj + j] = ((double)i * j) / ni; + } + for (i = 0; i < ni; i++) + for (j = 0; j < ni; j++) + C[i * ni + j] = ((double)i * j) / ni; +} + +static void print_array(int ni, double *C) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < ni; j++) { + fprintf(stderr, "%0.2lf ", C[i * ni + j]); + if ((i * ni + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + int dump_code = atoi(argv[1]); + int m = atoi(argv[2]); + int n = atoi(argv[3]); + + double *A = (double *)malloc(m * n * sizeof(double)); + double *B = (double *)malloc(m * n * sizeof(double)); + double *C = (double *)malloc(m * m * sizeof(double)); + + double alpha = 32412; + double beta = 2123; + + init_array(m, n, C, A, B); + + double *dev_A; + double *dev_B; + double *dev_C; + cudaMalloc(&dev_A, n * m * sizeof(double)); + cudaMalloc(&dev_B, n * m * sizeof(double)); + cudaMalloc(&dev_C, n * m * sizeof(double)); + cudaMemcpy(dev_A, A, n * m * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_B, B, n * m * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_C, C, n * m * sizeof(double), cudaMemcpyHostToDevice); + + kernel(m, n, alpha, beta, dev_C, dev_A, dev_B); + cudaMemcpy(C, dev_C, n * m * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(m, C); + + free((void *)C); + ; + free((void *)A); + ; + free((void *)B); + ; + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/syrk/syrk.cu b/tools/cgeist/Test/CUDA/polybench-cuda/syrk/syrk.cu new file mode 100644 index 000000000000..4035bee9b1f8 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/syrk/syrk.cu @@ -0,0 +1,115 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 +// clang-format on +/** + * syrk.c: This file is part of the PolyBench/C 3.2 test suite. 
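+ * (SYRK update restricted to the lower triangle (j <= i): the first kernel scales C by beta, the second accumulates alpha*A*A^T.)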
+ * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_beta(int n, int m, double alpha, double beta, double *C, + double *A) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < n && j <= i) + C[i * n + j] *= beta; +} + +__global__ void kernel_product(int n, int m, double alpha, double beta, + double *C, double *A) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < n && j <= i) { + for (int k = 0; k < m; k++) + C[i * n + j] += alpha * A[i * m + k] * A[j * m + k]; + } +} + +static void kernel(int n, int m, double alpha, double beta, double *C, + double *A) { + const unsigned int threadsPerBlock = 256; + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(n, block.x), num_blocks(n, block.y), 1); + kernel_beta<<>>(m, n, alpha, beta, C, A); + } + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(n, block.x), num_blocks(n, block.y), 1); + kernel_product<<>>(m, n, alpha, beta, C, A); + } +} + +static void init_array(int ni, int nj, double *C, double *A) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) + A[i * nj + j] = ((double)i * j) / ni; + for (i = 0; i < ni; i++) + for (j = 0; j < ni; j++) + C[i * ni + j] = ((double)i * j) / ni; +} + +static void print_array(int ni, double *C) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < ni; j++) { + fprintf(stderr, "%0.2lf ", C[i * ni + j]); + if ((i * ni + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + int dump_code = atoi(argv[1]); + int m = atoi(argv[2]); + int n = atoi(argv[3]); + //__builtin_assume(nj>0); + //__builtin_assume(ni>0); + //__builtin_assume(ni<2147483646); + //__builtin_assume(nj<2147483646); + + double *A = (double *)malloc(m * n * sizeof(double)); + double *C = (double *)malloc(m * m * sizeof(double)); + double alpha = 32412; + double beta = 2123; + + init_array(m, n, C, A); + + double *dev_C; + double *dev_A; + cudaMalloc(&dev_A, n * m * sizeof(double)); + cudaMalloc(&dev_C, n * n * sizeof(double)); + cudaMemcpy(dev_A, A, n * m * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_C, C, n * n * sizeof(double), cudaMemcpyHostToDevice); + + kernel(m, n, alpha, beta, dev_C, dev_A); + + cudaMemcpy(C, dev_C, n * n * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(m, C); + + free((void *)C); + free((void *)A); + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/trmm/trmm.cu b/tools/cgeist/Test/CUDA/polybench-cuda/trmm/trmm.cu new file mode 100644 index 000000000000..21263ca8f51e --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/trmm/trmm.cu @@ -0,0 +1,113 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 +// clang-format on +/** + * trmm.c: This file is part of the PolyBench 3.0 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +/* Array initialization. 
*/ +static void init_array(int n, int m, double *alpha, double *A, double *B) { + int i, j; + + *alpha = 32412; + for (i = 0; i < n; i++) + for (j = 0; j < m; j++) { + A[i * m + j] = ((double)i * j) / m; + B[j * n + j] = ((double)i * j) / n; + } +} + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_contract(int n, int m, double alpha, double *B, + double *A) { + int j = blockDim.x * blockIdx.x + threadIdx.x; + + if (j < n) { + for (int i = 0; i < m; i++) + for (int k = i + 1; k < m; k++) + B[i * n + j] += A[k * m + i] * B[k * n + j]; + } +} + +__global__ void kernel_alpha(int n, int m, double alpha, double *B, double *A) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < m && j < n) + B[i * n + j] *= alpha; +} + +static void kernel(int n, int m, double alpha, double *B, double *A) { + const unsigned int threadsPerBlock = 256; + + kernel_contract<<>>( + n, m, alpha, B, A); + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(m, block.x), num_blocks(n, block.y), 1); + kernel_alpha<<>>(n, m, alpha, B, A); + } +} + +/* DCE code. Must scan the entire live-out data. + Can be used also to check the correctness of the output. */ +static void print_array(int m, int n, double *B) { + int i, j; + + for (i = 0; i < m; i++) + for (j = 0; j < n; j++) { + fprintf(stderr, "%0.2lf ", B[i * n + j]); + if ((i * n + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + /* Retrieve problem size. */ + int n = atoi(argv[2]); + int m = atoi(argv[3]); + int dump_code = atoi(argv[1]); + + /* Variable declaration/allocation. */ + double alpha; + double *A = (double *)malloc(n * m * sizeof(double)); + double *B = (double *)malloc(m * n * sizeof(double)); + + /* Initialize array(s). */ + init_array(n, m, &alpha, A, B); + + double *dev_A; + double *dev_B; + cudaMalloc(&dev_A, n * m * sizeof(double)); + cudaMalloc(&dev_B, m * n * sizeof(double)); + cudaMemcpy(dev_A, A, n * m * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_B, B, m * n * sizeof(double), cudaMemcpyHostToDevice); + /* Run kernel. */ + kernel(n, m, alpha, dev_A, dev_B); + cudaMemcpy(B, dev_B, m * n * sizeof(double), cudaMemcpyDeviceToHost); + + /* Prevent dead-code elimination. All live-out data must be printed + by the function call in argument. */ + if (dump_code == 1) + print_array(m, n, B); + + /* Be clean. 
*/ + free((void *)A); + free((void *)B); + + return 0; +} diff --git a/tools/cgeist/Test/lit.cfg b/tools/cgeist/Test/lit.cfg index e32339fce937..a7b7f3685d8e 100644 --- a/tools/cgeist/Test/lit.cfg +++ b/tools/cgeist/Test/lit.cfg @@ -73,3 +73,4 @@ config.substitutions.append(('%stdinclude', '-resource-dir=' + resource_dir + " config.substitutions.append(('%resourcedir', '-resource-dir=' + resource_dir)) config.substitutions.append(('%polyexec', config.test_source_root + '/polybench/utilities/polybench.c -D POLYBENCH_TIME -D POLYBENCH_NO_FLUSH_CACHE -D MINI_DATASET')) config.substitutions.append(('%polyverify', config.test_source_root + '/polybench/utilities/polybench.c -D POLYBENCH_DUMP_ARRAYS -D POLYBENCH_NO_FLUSH_CACHE -D MINI_DATASET')) +config.substitutions.append(('%cudaopts', '-L' + os.path.dirname(config.cudart_static_path) + ' -lstdc++ -ldl -lpthread -lrt -lcudart_static -lcuda --cuda-lower --emit-cuda --std=c++17 --cuda-gpu-arch=sm_80')) diff --git a/tools/cgeist/Test/lit.site.cfg.in b/tools/cgeist/Test/lit.site.cfg.in index d252d7977f74..f3f734d12079 100644 --- a/tools/cgeist/Test/lit.site.cfg.in +++ b/tools/cgeist/Test/lit.site.cfg.in @@ -9,6 +9,7 @@ config.mlir_clang_obj_root = "@MLIR_CLANG_BINARY_DIR@" config.target_triple = "@TARGET_TRIPLE@" config.llvm_obj_root = path(r"@LLVM_BINARY_DIR@") config.polygeist_enable_cuda = "@POLYGEIST_ENABLE_CUDA@" +config.cudart_static_path = "@CUDA_cudart_static_LIBRARY@" config.polygeist_enable_rocm = "@POLYGEIST_ENABLE_ROCM@" # Support substitution of the tools and build_mode with user parameters. diff --git a/tools/cgeist/driver.cc b/tools/cgeist/driver.cc index 45c92f80bff5..2eee90d1a54c 100644 --- a/tools/cgeist/driver.cc +++ b/tools/cgeist/driver.cc @@ -685,20 +685,8 @@ int main(int argc, char **argv) { if (ScalarReplacement) optPM.addPass(mlir::affine::createAffineScalarReplacementPass()); } - if (mlir::failed(pm.run(module.get()))) { - module->dump(); - return 4; - } - if (mlir::failed(mlir::verify(module.get()))) { - module->dump(); - return 5; - } -#define optPM optPM2 -#define pm pm2 { - mlir::PassManager pm(&context); - enablePrinting(pm); mlir::OpPassManager &optPM = pm.nest(); if (DetectReduction) @@ -735,15 +723,9 @@ int main(int argc, char **argv) { optPM2.addPass(mlir::polygeist::createPolygeistCanonicalizePass( canonicalizerConfig, {}, {})); } - if (mlir::failed(pm.run(module.get()))) { - module->dump(); - return 6; - } } if (CudaLower || EmitROCM) { - mlir::PassManager pm(&context); - enablePrinting(pm); mlir::OpPassManager &optPM = pm.nest(); optPM.addPass(mlir::createLowerAffinePass()); optPM.addPass(mlir::polygeist::createPolygeistCanonicalizePass( @@ -807,16 +789,10 @@ int main(int argc, char **argv) { if (ScalarReplacement) noptPM2.addPass(mlir::affine::createAffineScalarReplacementPass()); } - if (mlir::failed(pm.run(module.get()))) { - module->dump(); - return 7; - } } - mlir::PassManager pm(&context); - enablePrinting(pm); - mlir::OpPassManager &optPM = pm.nest(); if (CudaLower) { + mlir::OpPassManager &optPM = pm.nest(); optPM.addPass(mlir::polygeist::createPolygeistCanonicalizePass( canonicalizerConfig, {}, {})); optPM.addPass(mlir::createCSEPass()); @@ -901,17 +877,10 @@ int main(int argc, char **argv) { if (InnerSerialize) pm.addPass(polygeist::createInnerSerializationPass()); addLICM(pm); - - if (mlir::failed(pm.run(module.get()))) { - module->dump(); - return 8; - } } #if POLYGEIST_ENABLE_GPU if (EmitGPU) { - mlir::PassManager pm(&context); - enablePrinting(pm); pm.addPass(mlir::createCSEPass()); if (CudaLower) 
pm.addPass(polygeist::createConvertParallelToGPUPass1( @@ -937,27 +906,21 @@ int main(int argc, char **argv) { pm.addPass(mlir::createCSEPass()); pm.addPass(mlir::polygeist::createPolygeistCanonicalizePass( canonicalizerConfig, {}, {})); - - if (mlir::failed(pm.run(module.get()))) { - module->dump(); - return 12; - } } #endif { - mlir::PassManager pm(&context); - enablePrinting(pm); mlir::OpPassManager &gpuPM = pm.nest(); gpuPM.addPass(polygeist::createFixGPUFuncPass()); pm.addPass(mlir::polygeist::createPolygeistCanonicalizePass( canonicalizerConfig, {}, {})); pm.addPass(polygeist::createLowerAlternativesPass()); pm.addPass(polygeist::createCollectKernelStatisticsPass()); - if (mlir::failed(pm.run(module.get()))) { - module->dump(); - return 12; - } + } + + if (mlir::failed(pm.run(module.get()))) { + module->dump(); + return 12; } // Prune unused gpu module funcs @@ -996,7 +959,7 @@ int main(int argc, char **argv) { pm2.addPass(mlir::polygeist::createPolygeistCanonicalizePass( canonicalizerConfig, {}, {})); } - pm.nest().addPass( + pm2.nest().addPass( polygeist::createPolygeistMem2RegPass()); pm2.addPass(mlir::createCSEPass()); pm2.addPass(mlir::polygeist::createPolygeistCanonicalizePass( @@ -1110,13 +1073,6 @@ int main(int argc, char **argv) { return 10; } } - - } else { - - if (mlir::failed(pm.run(module.get()))) { - module->dump(); - return 11; - } } if (mlir::failed(mlir::verify(module.get()))) { module->dump();