diff --git a/tools/cgeist/.gitignore b/tools/cgeist/.gitignore index 46636c589917..66e9f7f994dd 100644 --- a/tools/cgeist/.gitignore +++ b/tools/cgeist/.gitignore @@ -1 +1,2 @@ *.time *.exec1 *.out1 +*.execm diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/2mm/2mm.cu b/tools/cgeist/Test/CUDA/polybench-cuda/2mm/2mm.cu new file mode 100644 index 000000000000..a07d15fe7948 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/2mm/2mm.cu @@ -0,0 +1,169 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 10 10 +// clang-format on +/** + * 2mm.c: This file is part of the PolyBench/C 3.2 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +__global__ void kernel_A_mul_B(int ni, int nj, int nk, int nl, double alpha, + double beta, double *tmp, double *A, double *B, + double *C, double *D) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k; + double dot = 0.0; + + if (i < ni && j < nj) { + for (k = 0; k < nk; k++) + dot += alpha * A[i * nk + k] * B[k * nj + j]; + tmp[i * nj + j] = dot; + } +} + +__global__ void kernel_D_plus_tmp_mul_C(int ni, int nj, int nk, int nl, + double alpha, double beta, double *tmp, + double *A, double *B, double *C, + double *D) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int l = blockDim.y * blockIdx.y + threadIdx.y; + int j; + double dot = 0.0; + + if (i < ni && l < nl) { + // D[i * nj + l] *= beta; + dot = D[i * nj + l] * beta; + + for (j = 0; j < nj; j++) + // D[i * nl + l] += tmp[i * nj + j] * C[j * nl + l]; + dot += tmp[i * nj + j] * C[j * nl + l]; + D[i * nl + l] = dot; + } +} + +short num_blocks(short num, short factor) { + return (num + factor - 1) / factor; +} + +static void kernel(int ni, int nj, int nk, int nl, double alpha, double beta, + double *tmp, double *A, double *B, double *C, double *D) { + + unsigned threadsPerBlock = 256; + dim3 block(threadsPerBlock / 32, 32, 1); + + { + dim3 grid(num_blocks(ni, block.x), num_blocks(nj, block.y), 1); + kernel_A_mul_B<<>>(ni, nj, nk, nl, alpha, beta, tmp, A, B, C, + D); + } + + { + dim3 grid(num_blocks(ni, block.x), num_blocks(nl, block.y), 1); + kernel_D_plus_tmp_mul_C<<>>(ni, nj, nk, nl, alpha, beta, tmp, + A, B, C, D); + } +} + +static void print_array(int ni, int nl, double *D) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < nl; j++) { + fprintf(stderr, "%0.2lf ", D[i * ni + j]); + if ((i * ni + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +static void init_array(int ni, int nj, int nk, int nl, double *A, double *B, + double *C, double *D, double *tmp) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < nk; j++) + A[i * ni + j] = ((double)i * j) / ni; + for (i = 0; i < nk; i++) + for (j = 0; j < nj; j++) + B[i * nk + j] = ((double)i * (j + 1)) / nj; + for (i = 0; i < nl; i++) + for (j = 0; j < nj; j++) + C[i * nl + j] = ((double)i * (j + 3)) / nl; + for (i = 0; i < ni; i++) + for (j = 0; j < nl; j++) + D[i * ni + j] = ((double)i * (j + 2)) / nk; + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) + tmp[i * ni + j] = 0; +} + +int main(int argc, char **argv) { + int dump_code = atoi(argv[1]); + long ni = atoi(argv[2]); + long nj = atoi(argv[3]); + long nk = atoi(argv[4]); + long nl = atoi(argv[5]); + + double alpha = 32412; + double beta = 2123; + double *A = (double *)malloc(ni * nk * sizeof(double)); + double *B = (double *)malloc(nk * nj 
* sizeof(double)); + double *C = (double *)malloc(nl * nj * sizeof(double)); + double *D = (double *)malloc(ni * nl * sizeof(double)); + double *tmp = (double *)malloc(ni * nj * sizeof(double)); + + init_array(ni, nj, nk, nl, A, B, C, D, tmp); + + double *dev_A; + double *dev_B; + double *dev_C; + double *dev_D; + double *dev_tmp; + double *dev_alpha; + double *dev_beta; + cudaMalloc(&dev_A, ni * nk * sizeof(double)); + cudaMalloc(&dev_B, nk * nj * sizeof(double)); + cudaMalloc(&dev_C, nl * nj * sizeof(double)); + cudaMalloc(&dev_D, ni * nl * sizeof(double)); + cudaMalloc(&dev_tmp, ni * nj * sizeof(double)); + cudaMemcpy(dev_A, A, ni * nk * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_B, B, nk * nj * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_C, C, nl * nj * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_D, D, ni * nl * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_tmp, tmp, ni * nj * sizeof(double), cudaMemcpyHostToDevice); + + kernel(ni, nj, nk, nl, alpha, beta, dev_tmp, dev_A, dev_B, dev_C, dev_D); + + cudaMemcpy(D, dev_D, ni * nl * sizeof(double), cudaMemcpyDeviceToHost); + cudaFree((void *)dev_A); + cudaFree((void *)dev_B); + cudaFree((void *)dev_C); + cudaFree((void *)dev_D); + cudaFree((void *)dev_tmp); + cudaFree((void *)dev_alpha); + cudaFree((void *)dev_beta); + + if (dump_code == 1) + print_array(ni, nk, D); + + free((void *)tmp); + ; + free((void *)A); + ; + free((void *)B); + ; + free((void *)C); + ; + free((void *)D); + ; + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/3mm/3mm.cu b/tools/cgeist/Test/CUDA/polybench-cuda/3mm/3mm.cu new file mode 100644 index 000000000000..56db77171210 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/3mm/3mm.cu @@ -0,0 +1,161 @@ +// clang-format off +// COM: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm +// RUN: true +// clang-format on +/** + * 3mm.c: This file is part of the PolyBench/C 3.2 test suite. 
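// Illustration only, not part of the test sources above: the launch
// configuration used throughout these ports splits threadsPerBlock = 256 into
// a (256 / 32) x 32 = 8 x 32 thread block, and num_blocks() is a ceiling
// division, so the grid always covers the whole iteration space and the
// `if (i < ni && j < nj)` guards in each kernel mask off the excess threads.
// For the 2mm RUN line above (ni = nj = nk = nl = 10) this works out to:
//
//   dim3 block(256 / 32, 32, 1);         // 8 x 32 = 256 threads per block
//   dim3 grid(num_blocks(10, block.x),   // ceil(10 / 8)  = 2
//             num_blocks(10, block.y),   // ceil(10 / 32) = 1
//             1);                        // 2 x 1 blocks cover the 10 x 10 space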
+ * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +__global__ void kernel_A_mul_B(int ni, int nj, int nk, double *C, double *A, + double *B) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + double dot = 0.0; + + if (i < ni && j < nj) { + for (int k = 0; k < nk; k++) + // C[i * nj + j] += A[i * nk + k] * B[k * nj + j]; + dot += A[i * nk + k] * B[k * nj + j]; + C[i * nj + j] = dot; + } +} + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +static void init_array(int ni, int nj, int nk, int nl, int nm, double *A, + double *B, double *C, double *D, double *E, double *F, + double *G) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < nk; j++) + A[i * ni + j] = ((double)i * j) / ni; + for (i = 0; i < nk; i++) + for (j = 0; j < nj; j++) + B[i * nk + j] = ((double)i * (j + 1)) / nj; + for (i = 0; i < nj; i++) + for (j = 0; j < nm; j++) + C[i * nj + j] = ((double)i * (j + 3)) / nl; + for (i = 0; i < nm; i++) + for (j = 0; j < nl; j++) + D[i * nm + j] = ((double)i * (j + 2)) / nk; + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) + E[i * ni + j] = 0; + for (i = 0; i < nj; i++) + for (j = 0; j < nl; j++) + F[i * nj + j] = 0; + for (i = 0; i < ni; i++) + for (j = 0; j < nl; j++) + G[i * ni + j] = 0; +} + +static void print_array(int ni, int nl, double *G) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < nl; j++) { + fprintf(stderr, "%0.2lf ", G[i * ni + j]); + if ((i * ni + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +static void kernel(int ni, int nj, int nk, int nl, int nm, double *E, double *A, + double *B, double *F, double *C, double *D, double *G) { + unsigned threadsPerBlock = 256; + dim3 block(threadsPerBlock / 32, 32, 1); + + { + dim3 grid(num_blocks(ni, block.x), num_blocks(nj, block.y), 1); + kernel_A_mul_B<<>>(ni, nj, nk, E, A, B); + } + + { + dim3 grid(num_blocks(nj, block.x), num_blocks(nl, block.y), 1); + kernel_A_mul_B<<>>(nj, nl, nm, F, C, D); + } + + { + dim3 grid(num_blocks(ni, block.x), num_blocks(nl, block.y), 1); + kernel_A_mul_B<<>>(ni, nl, nj, G, E, F); + } +} + +int main(int argc, char **argv) { + + int dump_code = atoi(argv[1]); + int ni = atoi(argv[2]); + int nj = atoi(argv[3]); + int nk = atoi(argv[4]); + int nl = atoi(argv[5]); + int nm = atoi(argv[6]); + + double *A = (double *)malloc(ni * nk * sizeof(double)); + double *B = (double *)malloc(nk * nj * sizeof(double)); + double *C = (double *)malloc(nj * nm * sizeof(double)); + double *D = (double *)malloc(nm * nl * sizeof(double)); + double *E = (double *)malloc(ni * nj * sizeof(double)); + double *F = (double *)malloc(nj * nl * sizeof(double)); + double *G = (double *)malloc(ni * nl * sizeof(double)); + + init_array(ni, nj, nk, nl, nm, A, B, C, D, E, F, G); + + double *dev_A; + double *dev_B; + double *dev_C; + double *dev_D; + double *dev_E; + double *dev_F; + double *dev_G; + cudaMalloc(&dev_A, ni * nk * sizeof(double)); + cudaMalloc(&dev_B, nk * nj * sizeof(double)); + cudaMalloc(&dev_C, nl * nj * sizeof(double)); + cudaMalloc(&dev_D, ni * nl * sizeof(double)); + cudaMalloc(&dev_E, ni * nj * sizeof(double)); + cudaMalloc(&dev_F, nj * nl * sizeof(double)); + cudaMalloc(&dev_G, ni * nl * sizeof(double)); + cudaMemcpy(dev_A, A, ni * nk * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_B, B, nk * nj * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_C, C, nl 
* nj * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_D, D, ni * nl * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_E, E, ni * nj * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_F, F, nj * nl * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_G, G, ni * nl * sizeof(double), cudaMemcpyHostToDevice); + + kernel(ni, nj, nk, nl, nm, dev_E, dev_A, dev_B, dev_F, dev_C, dev_D, dev_G); + + cudaMemcpy(G, dev_G, ni * nl * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(ni, nl, G); + + free((void *)E); + ; + free((void *)A); + ; + free((void *)B); + ; + free((void *)F); + ; + free((void *)C); + ; + free((void *)D); + ; + free((void *)G); + ; + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/AUTHORS b/tools/cgeist/Test/CUDA/polybench-cuda/AUTHORS new file mode 100644 index 000000000000..e1758fb080ce --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/AUTHORS @@ -0,0 +1,14 @@ +* * * * * * * * * * * * * +* Authors of PolyBench * +* * * * * * * * * * * * * + + +* Louis-Noel Pouchet + Who provided packaging and harmonization of all test files, + the PolyBench infrastructure and machinery, and several + reference C files. + +* Uday Bondugula + Who provided many of the original reference C files, including + Fortran to C translation. + diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/README b/tools/cgeist/Test/CUDA/polybench-cuda/README new file mode 100644 index 000000000000..d4978228b7e8 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/README @@ -0,0 +1,217 @@ +* * * * * * * * * * +* PolyBenchs 3.1 * +* * * * * * * * * * + +Copyright (c) 2011 the Ohio State University. +Contact: Louis-Noel Pouchet + + +------------- +* New in 3.1: +------------- + +- fixed a typo in polybench.h, causing compilation problems for 3D arrays. +- set by default heap arrays, stack arrays are now optional. + + +------------- +* New in 3.0: +------------- + +- multiple dataset sizes are predefined. Each file comes now with a .h + header fiile defining the dataset. +- support of heap-allocated arrays. It uses a single malloc for the + entire array region, the data allocated is cast into a C99 + multidimensional array. +- One benchmark is out: gauss_filter +- One benchmark is in: floyd-warshall +- PAPI support has been greatly improved; it also can report the + counters on a specific core to be set by the user. + + + +---------------- +* Mailing lists: +---------------- + +** polybench-announces@lists.sourceforge.net: +--------------------------------------------- + +Announces about releases of PolyBench. + +** polybench-discussion@lists.sourceforge.net: +---------------------------------------------- + +General discussions reg. PolyBench. 
+ + + +----------------------- +* Available benchmarks: +----------------------- + +::linear-algebra:: +linear-algebra/kernels: +linear-algebra/kernels/2mm/2mm.c +linear-algebra/kernels/3mm/3mm.c +linear-algebra/kernels/atax/atax.c +linear-algebra/kernels/bicg/bicg.c +linear-algebra/kernels/cholesky/cholesky.c +linear-algebra/kernels/doitgen/doitgen.c +linear-algebra/kernels/gemm/gemm.c +linear-algebra/kernels/gemver/gemver.c +linear-algebra/kernels/gesummv/gesummv.c +linear-algebra/kernels/mvt/mvt.c +linear-algebra/kernels/symm/symm.c +linear-algebra/kernels/syr2k/syr2k.c +linear-algebra/kernels/syrk/syrk.c +linear-algebra/kernels/trisolv/trisolv.c +linear-algebra/kernels/trmm/trmm.c + +linear-algebra/solvers: +linear-algebra/solvers/durbin/durbin.c +linear-algebra/solvers/dynprog/dynprog.c +linear-algebra/solvers/gramschmidt/gramschmidt.c +linear-algebra/solvers/lu/lu.c +linear-algebra/solvers/ludcmp/ludcmp.c + +::datamining:: +datamining/correlation/correlation.c +datamining/covariance/covariance.c + +::medley:: +medley/floyd-warshall/floyd-warshall.c +medley/reg_detect/reg_detect.c + +::stencils:: +stencils/adi/adi.c +stencils/fdtd-2d/fdtd-2d.c +stencils/fdtd-apml/fdtd-apml.c +stencils/jacobi-1d-imper/jacobi-1d-imper.c +stencils/jacobi-2d-imper/jacobi-2d-imper.c +stencils/seidel-2d/seidel-2d.c + + + +------------------------------ +* Sample compilation commands: +------------------------------ + + +** To compile a benchmark without any monitoring: +------------------------------------------------- + +$> gcc -I utilities -I linear-algebra/kernels/atax utilities/polybench.c linear-algebra/kernels/atax/atax.c -o atax_base + + +** To compile a benchmark with execution time reporting: +-------------------------------------------------------- + +$> gcc -O3 -I utilities -I linear-algebra/kernels/atax utilities/polybench.c linear-algebra/kernels/atax/atax.c -DPOLYBENCH_TIME -o atax_time + + +** To generate the reference output of a benchmark: +--------------------------------------------------- + +$> gcc -O0 -I utilities -I linear-algebra/kernels/atax utilities/polybench.c linear-algebra/kernels/atax/atax.c -DPOLYBENCH_DUMP_ARRAYS -o atax_ref +$> ./atax_ref 2>atax_ref.out + + + + +------------------------- +* Some available options: +------------------------- + +They are all passed as macro definitions during compilation time (e.g, +-Dname_of_the_option). + +- POLYBENCH_TIME: output execution time (gettimeofday) [default: off] + +- POLYBENCH_NO_FLUSH_CACHE: don't flush the cache before calling the + timer [default: flush the cache] + +- POLYBENCH_LINUX_FIFO_SCHEDULER: use FIFO real-time scheduler for the + kernel execution, the program must be run as root, under linux only, + and compiled with -lc [default: off] + +- POLYBENCH_CACHE_SIZE_KB: cache size to flush, in kB [default: 33MB] + +- POLYBENCH_STACK_ARRAYS: use stack allocation instead of malloc [default: off] + +- POLYBENCH_DUMP_ARRAYS: dump all live-out arrays on stderr [default: off] + +- POLYBENCH_CYCLE_ACCURATE_TIMER: Use Time Stamp Counter to monitor + the execution time of the kernel [default: off] + +- POLYBENCH_PAPI: turn on papi timing (see below). 
+ +- MINI_DATASET, SMALL_DATASET, STANDARD_DATASET, LARGE_DATASET, + EXTRALARGE_DATASET: set the dataset size to be used + [default: STANDARD_DATASET] + + + +--------------- +* PAPI support: +--------------- + +** To compile a benchmark with PAPI support: +-------------------------------------------- + +$> gcc -O3 -I utilities -I linear-algebra/kernels/atax utilities/polybench.c linear-algebra/kernels/atax/atax.c -DPOLYBENCH_PAPI -lpapi -o atax_papi + + +** To specify which counter(s) to monitor: +------------------------------------------ + +Edit utilities/papi_counters.list, and add 1 line per event to +monitor. Each line (including the last one) must finish with a ',' and +both native and standard events are supported. + +The whole kernel is run one time per counter (no multiplexing) and +there is no sampling being used for the counter value. + + + +------------------------------ +* Accurate performance timing: +------------------------------ + +With kernels that have an execution time in the orders of a few tens +of milliseconds, it is critical to validate any performance number by +repeating several times the experiment. A companion script is +available to perform reasonable performance measurement of a PolyBench. + +$> gcc -O3 -I utilities -I linear-algebra/kernels/atax utilities/polybench.c linear-algebra/kernels/atax/atax.c -DPOLYBENCH_TIME -o atax_time +$> ./utilities/time_benchmark.sh ./atax_time + +This script will run five times the benchmark (that must be a +PolyBench compiled with -DPOLYBENCH_TIME), eliminate the two extremal +times, and check that the deviation of the three remaining does not +exceed a given thresold, set to 5%. + +It is also possible to use POLYBENCH_CYCLE_ACCURATE_TIMER to use the +Time Stamp Counter instead of gettimeofday() to monitor the number of +elapsed cycles. + + + + +---------------------------------------- +* Generating macro-free benchmark suite: +---------------------------------------- + +(from the root of the archive:) +$> PARGS="-I utilities -DPOLYBENCH_TIME"; +$> for i in `cat utilities/benchmark_list`; do create_cpped_version.sh $i "$PARGS"; done + +This create for each benchmark file 'xxx.c' a new file +'xxx.preproc.c'. The PARGS variable in the above example can be set to +the desired configuration, for instance to create a full C99 version +(parametric arrays): + +$> PARGS="-I utilities -DPOLYBENCH_USE_C99_PROTO"; +$> for i in `cat utilities/benchmark_list`; do ./utilities/create_cpped_version.sh "$i" "$PARGS"; done + + diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/adi/adi.cu b/tools/cgeist/Test/CUDA/polybench-cuda/adi/adi.cu new file mode 100644 index 000000000000..cbb195dd973e --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/adi/adi.cu @@ -0,0 +1,177 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 5 +// clang-format on +/** + * adi.c: This file is part of the PolyBench 3.0 test suite. 
+ * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +#define N 2048 +#define TSTEPS 50 + +__global__ void kernel_column_sweep(int tsteps, int n, double *u, double *v, + double *p, double *q, double a, double b, + double c, double d, double e, double f) { + int i = blockDim.x * blockIdx.x + threadIdx.x + 1; + + if (i < n - 1) { + v[0 * n + i] = 1; + p[i * n + 0] = 0; + q[i * n + 0] = v[0 * n + i]; + for (int j = 1; j < n - 1; j++) { + p[i * n + j] = -c / (a * p[i * n + j - 1] + b); + q[i * n + j] = (-d * u[j * n + i - 1] + (1 + 2 * d) * u[j * n + i] - + f * u[j * n + i + 1] - a * q[i * n + j - 1]) / + (a * p[i * n + j - 1] + b); + } + + v[(n - 1) * n + i] = 1; + for (int j = n - 2; j >= 1; j--) + v[j * n + i] = p[i * n + j] * v[(j + 1) * n + i] + q[i * n + j]; + } +} + +__global__ void kernel_row_sweep(int tsteps, int n, double *u, double *v, + double *p, double *q, double a, double b, + double c, double d, double e, double f) { + int i = blockDim.x * blockIdx.x + threadIdx.x + 1; + + if (i < n - 1) { + u[i * n + 0] = 1; + p[i + n + 0] = 0; + q[i * n + 0] = u[i * n + 0]; + for (int j = 1; j < n - 1; j++) { + p[i * n + j] = -f / (d * p[i * n + j - 1] + e); + q[i * n + j] = (-a * v[(i - 1) * n + j] + (1 + 2 * a) * v[i * n + j] - + c * v[(i + 1) * n + j] - d * q[i * n + j - 1]) / + (d * p[i * n + j - 1] + e); + } + u[i * n + n - 1] = 1; + for (int j = n - 2; j >= 1; j--) + u[i * n + j] = p[i * n + j] * u[i * n + j + 1] + q[i * n + j]; + } +} + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +static void kernel(int tsteps, int n, double *u, double *v, double *p, + double *q) { + unsigned threadsPerBlock = 256; + + double DX = 1 / (double)n; + double DY = 1 / (double)n; + double DT = 1 / (double)tsteps; + double B1 = 2; + double B2 = 1; + double mul1 = B1 * DT / DX / DX; + double mul2 = B2 * DT / DY / DY; + + double a = -mul1 / 2; + double b = 1 + mul1; + double c = a; + double d = -mul2 / 2; + double e = 1 + mul2; + double f = d; + + for (int t = 1; t <= tsteps; t++) { + // Column Sweep + kernel_column_sweep<<>>(tsteps, n, u, v, p, q, a, b, c, d, + e, f); + + // Row Sweep + kernel_row_sweep<<>>( + tsteps, n, u, v, p, q, a, b, c, d, e, f); + } +} + +/* Array initialization. */ +static void init_array(int n, double *u, double *v, double *p, double *q) { + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) { + u[i * n + j] = (double)(i + n - j) / n; + v[i * n + j] = 0; + p[i * n + j] = 0; + q[i * n + j] = 0; + } +} + +/* DCE code. Must scan the entire live-out data. + Can be used also to check the correctness of the output. */ +static void print_array(int n, double *u) + +{ + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) { + fprintf(stderr, "%0.2lf ", u[i * n + j]); + if ((i * n + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + /* Retrieve problem size. */ + int n = atoi(argv[2]); + int tsteps = atoi(argv[3]); + int dump_code = atoi(argv[1]); + + /* Variable declaration/allocation. */ + + double *u = (double *)malloc(n * n * sizeof(double)); + double *v = (double *)malloc(n * n * sizeof(double)); + double *p = (double *)malloc(n * n * sizeof(double)); + double *q = (double *)malloc(n * n * sizeof(double)); + + /* Initialize array(s). 
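// Illustration only, not part of the test sources: each ADI sweep above runs
// one thread per column (or row), and each thread performs a Thomas-algorithm
// style tridiagonal solve: a forward pass builds the p/q recurrences, then a
// backward pass substitutes. A hypothetical sequential sketch of the column
// sweep for a single column i, using the same recurrences as
// kernel_column_sweep (useful for checking against the GPU result):
static void ref_column_sweep(int n, int i, double *u, double *v, double *p,
                             double *q, double a, double b, double c, double d,
                             double f) {
  v[0 * n + i] = 1;
  p[i * n + 0] = 0;
  q[i * n + 0] = v[0 * n + i];
  for (int j = 1; j < n - 1; j++) {   // forward elimination along the column
    p[i * n + j] = -c / (a * p[i * n + j - 1] + b);
    q[i * n + j] = (-d * u[j * n + i - 1] + (1 + 2 * d) * u[j * n + i] -
                    f * u[j * n + i + 1] - a * q[i * n + j - 1]) /
                   (a * p[i * n + j - 1] + b);
  }
  v[(n - 1) * n + i] = 1;
  for (int j = n - 2; j >= 1; j--)    // back substitution
    v[j * n + i] = p[i * n + j] * v[(j + 1) * n + i] + q[i * n + j];
}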
*/ + init_array(n, u, v, p, q); + + double *dev_u; + double *dev_v; + double *dev_p; + double *dev_q; + cudaMalloc(&dev_u, n * n * sizeof(double)); + cudaMalloc(&dev_v, n * n * sizeof(double)); + cudaMalloc(&dev_p, n * n * sizeof(double)); + cudaMalloc(&dev_q, n * n * sizeof(double)); + cudaMemcpy(dev_u, u, n * n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_v, v, n * n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_p, p, n * n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_q, q, n * n * sizeof(double), cudaMemcpyHostToDevice); + + /* Run kernel. */ + kernel(tsteps, n, dev_u, dev_v, dev_p, dev_q); + + cudaMemcpy(u, dev_u, n * n * sizeof(double), cudaMemcpyDeviceToHost); + cudaFree((void *)dev_u); + cudaFree((void *)dev_v); + cudaFree((void *)dev_p); + cudaFree((void *)dev_q); + + /* Prevent dead-code elimination. All live-out data must be printed + by the function call in argument. */ + if (dump_code == 1) + print_array(n, u); + + /* Be clean. */ + free((void *)u); + free((void *)v); + free((void *)p); + free((void *)q); + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/atax/atax.cu b/tools/cgeist/Test/CUDA/polybench-cuda/atax/atax.cu new file mode 100644 index 000000000000..190adf49603b --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/atax/atax.cu @@ -0,0 +1,125 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 +// clang-format on +/** + * atax.c: This file is part of the PolyBench 3.0 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +#define RUN 100 + +__global__ void kernel3(int m, int n, double *A, double *x, double *y, + double *tmp) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < m) { + double dot = 0.0; + for (int j = 0; j < n; j++) { + dot += A[i * n + j] * x[j]; + } + tmp[i] = dot; + } +} + +__global__ void kernel4(int m, int n, double *A, double *x, double *y, + double *tmp) { + int j = blockDim.x * blockIdx.x + threadIdx.x; + if (j < n) { + double dot = 0; + y[j] = 0; + for (int i = 0; i < m; i++) + dot += A[i * n + j] * tmp[i]; + y[j] = dot; + } +} + +static int num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +/* Array initialization. */ +static void init_array(int nx, int ny, double *A, double *x, double *tmp, + double *y) { + int i, j; + + for (i = 0; i < ny; i++) { + x[i] = i * M_PI; + } + for (i = 0; i < nx; i++) + tmp[i] = 0; + for (i = 0; i < nx; i++) + for (j = 0; j < ny; j++) + A[i * ny + j] = ((double)i * (j + 1)) / nx; +} + +/* DCE code. Must scan the entire live-out data. + Can be used also to check the correctness of the output. */ +static void print_array(int nx, double *y) + +{ + int i; + + for (i = 0; i < nx; i++) { + fprintf(stderr, "%0.2lf ", y[i]); + if (i % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + /* Retrieve problem size. */ + int nx = atoi(argv[2]); + int ny = atoi(argv[3]); + int dump_code = atoi(argv[1]); + + /* Variable declaration/allocation. */ + double *A = (double *)malloc(nx * ny * sizeof(double)); + double *x = (double *)malloc(ny * sizeof(double)); + double *y = (double *)malloc(ny * sizeof(double)); + double *tmp = (double *)malloc(nx * sizeof(double)); + /* Initialize array(s). 
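// Illustration only, not part of the test sources: kernel3 and kernel4 above
// compute y = A^T (A x) in two passes; kernel4 walks a column of the
// row-major A (stride n between consecutive terms), which is what the
// transposed product forces. A hypothetical sequential reference for checking
// the result:
static void ref_atax(int nx, int ny, const double *A, const double *x,
                     double *tmp, double *y) {
  for (int i = 0; i < nx; i++) {
    tmp[i] = 0.0;
    for (int j = 0; j < ny; j++)
      tmp[i] += A[i * ny + j] * x[j];   // tmp = A * x
  }
  for (int j = 0; j < ny; j++) {
    y[j] = 0.0;
    for (int i = 0; i < nx; i++)
      y[j] += A[i * ny + j] * tmp[i];   // y = A^T * tmp
  }
}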
*/ + init_array(nx, ny, A, x, tmp, y); + + double *dev_A; + double *dev_x; + double *dev_y; + double *dev_tmp; + cudaMalloc(&dev_A, nx * ny * sizeof(double)); + cudaMalloc(&dev_x, ny * sizeof(double)); + cudaMalloc(&dev_y, ny * sizeof(double)); + cudaMalloc(&dev_tmp, nx * sizeof(double)); + + cudaMemcpy(dev_A, A, nx * ny * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_x, x, ny * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_y, y, ny * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_tmp, tmp, nx * sizeof(double), cudaMemcpyHostToDevice); + + const int threadsPerBlock = 256; + kernel3<<>>( + nx, ny, dev_A, dev_x, dev_y, dev_tmp); + kernel4<<>>( + nx, ny, dev_A, dev_x, dev_y, dev_tmp); + + cudaMemcpy(y, dev_y, ny * sizeof(double), cudaMemcpyDeviceToHost); + /* Prevent dead-code elimination. All live-out data must be printed + by the function call in argument. */ + if (dump_code == 1) { + print_array(nx, y); + } + /* Be clean. */ + free((void *)A); + free((void *)x); + free((void *)y); + free((void *)tmp); + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/bicg/bicg.cu b/tools/cgeist/Test/CUDA/polybench-cuda/bicg/bicg.cu new file mode 100644 index 000000000000..69bcb968d78f --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/bicg/bicg.cu @@ -0,0 +1,139 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 +// clang-format on +/** + * bicg.c: This file is part of the PolyBench 3.0 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +// #define NX 40000 +#define RUN 100 + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_q(int m, int n, double *A, double s[], double q[], + double p[], double r[]) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < n) { + double dot = 0; + q[i] = 0; + for (int j = 0; j < m; j++) + dot += A[i * m + j] * p[j]; + q[i] += dot; + } +} + +__global__ void kernel_s(int m, int n, double *A, double s[], double q[], + double p[], double r[]) { + int j = blockDim.x * blockIdx.x + threadIdx.x; + + if (j < m) { + s[j] = 0; + double dot = 0; + for (int i = 0; i < n; i++) + dot += r[i] * A[i * m + j]; + s[j] = dot; + } +} + +/* Array initialization. */ +static void init_array(int nx, int ny, double *A, double *r, double *p) { + int i, j; + + for (i = 0; i < ny; i++) + p[i] = i * M_PI; + for (i = 0; i < nx; i++) { + r[i] = i * M_PI; + for (j = 0; j < ny; j++) + A[i * ny + j] = ((double)i * (j + 1)) / nx; + } +} + +/* DCE code. Must scan the entire live-out data. + Can be used also to check the correctness of the output. */ +static void print_array(int nx, int ny, double *s, double *q) + +{ + int i; + + for (i = 0; i < ny; i++) { + fprintf(stderr, "%0.2lf ", s[i]); + if (i % 20 == 0) + fprintf(stderr, "\n"); + } + for (i = 0; i < nx; i++) { + fprintf(stderr, "%0.2lf ", q[i]); + if (i % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +static void kernel(int m, int n, double *A, double s[], double q[], double p[], + double r[]) { + + const unsigned threadsPerBlock = 256; + kernel_q<<>>(m, n, A, s, q, + p, r); + kernel_s<<>>(m, n, A, s, q, + p, r); +} + +int main(int argc, char **argv) { + /* Retrieve problem size. */ + int m = atoi(argv[2]); + int n = atoi(argv[3]); + int dump_code = atoi(argv[1]); + + // for(int i = 0; i < RUN; i++) { + /* Variable declaration/allocation. 
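// Illustration only, not part of the test sources: kernel_q and kernel_s above
// are the two halves of the BiCG kernel, q = A * p and s = A^T * r, each
// parallelized over one output vector. A hypothetical sequential reference
// using the same row-major indexing:
static void ref_bicg(int m, int n, const double *A, double *s, double *q,
                     const double *p, const double *r) {
  for (int i = 0; i < n; i++) {
    q[i] = 0.0;
    for (int j = 0; j < m; j++)
      q[i] += A[i * m + j] * p[j];      // q = A * p
  }
  for (int j = 0; j < m; j++) {
    s[j] = 0.0;
    for (int i = 0; i < n; i++)
      s[j] += r[i] * A[i * m + j];      // s = A^T * r
  }
}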
*/ + double *A = (double *)malloc(m * n * sizeof(double)); + double *s = (double *)malloc(n * sizeof(double)); + double *q = (double *)malloc(m * sizeof(double)); + double *p = (double *)malloc(n * sizeof(double)); + double *r = (double *)malloc(m * sizeof(double)); + /* Initialize array(s). */ + init_array(m, n, A, r, p); + double *dev_A; + double *dev_s; + double *dev_q; + double *dev_p; + double *dev_r; + cudaMalloc(&dev_A, m * n * sizeof(double)); + cudaMalloc(&dev_s, n * sizeof(double)); + cudaMalloc(&dev_q, m * sizeof(double)); + cudaMalloc(&dev_p, n * sizeof(double)); + cudaMalloc(&dev_r, m * sizeof(double)); + cudaMemcpy(dev_A, A, m * n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_s, s, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_q, q, m * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_p, p, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_r, r, m * sizeof(double), cudaMemcpyHostToDevice); + kernel(m, n, dev_A, dev_s, dev_q, dev_p, dev_r); + cudaMemcpy(s, dev_s, n * sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(q, dev_q, m * sizeof(double), cudaMemcpyDeviceToHost); + /* Prevent dead-code elimination. All live-out data must be printed + by the function call in argument. */ + if (dump_code == 1) + print_array(m, n, s, q); + /* Be clean. */ + free((void *)A); + free((void *)s); + free((void *)q); + free((void *)p); + free((void *)r); + // } + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/cholesky/cholesky.cu b/tools/cgeist/Test/CUDA/polybench-cuda/cholesky/cholesky.cu new file mode 100644 index 000000000000..459512998754 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/cholesky/cholesky.cu @@ -0,0 +1,108 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 5 +// clang-format on +/** + * cholesky.c: This file is part of the PolyBench 3.0 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +/* Array initialization. */ +static void init_array(int n, double *A) { + int i, j; + + for (i = 0; i < n; i++) { + for (j = 0; j < n; j++) + A[i * n + j] = 1.0 / n; + } +} + +/* DCE code. Must scan the entire live-out data. + Can be used also to check the correctness of the output. 
*/ +static void print_array(int n, double *A) + +{ + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) { + fprintf(stderr, "%0.2lf ", A[i * n + j]); + if ((i * n + j) % 20 == 0) + fprintf(stderr, "\n"); + } +} + +__global__ void kernel0(int n, int j, double *A) { + A[j * n + j] = std::sqrt(A[j * n + j]); +} + +__global__ void kernel1(int n, int j, double *A) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < n && i > j) + A[i * n + j] /= A[j * n + j]; +} + +__global__ void kernel2(int n, int j, double *A) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int k = blockDim.y * blockIdx.y + threadIdx.y; + + if (j < n && j < i && i < n && j < k && k <= i) + A[i * n + k] -= A[i * n + j] * A[k * n + j]; +} + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +static void kernel_polly(int n, double *dev_A) { + const unsigned int threadsPerBlock = 256; + + for (int iter = 0; iter < n; iter++) { + kernel0<<<1, 1>>>(n, iter, dev_A); + + kernel1<<>>(n, iter, + dev_A); + + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(n, block.x), num_blocks(n, block.y), 1); + kernel2<<>>(n, iter, dev_A); + } +} + +int main(int argc, char **argv) { + /* Retrieve problem size. */ + int n = atoi(argv[2]); + int dump_code = atoi(argv[1]); + + /* Variable declaration/allocation. */ + double *A = (double *)malloc(n * n * sizeof(double)); + + /* Initialize array(s). */ + init_array(n, A); + + double *dev_A; + cudaMalloc(&dev_A, n * n * sizeof(double)); + cudaMemcpy(dev_A, A, n * n * sizeof(double), cudaMemcpyHostToDevice); + + kernel_polly(n, dev_A); + + cudaMemcpy(A, dev_A, n * n * sizeof(double), cudaMemcpyDeviceToHost); + + /* Prevent dead-code elimination. All live-out data must be printed + by the function call in argument. */ + if (dump_code == 1) + print_array(n, A); + + /* Be clean. */ + free((void *)A); + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/correlation/correlation.cu b/tools/cgeist/Test/CUDA/polybench-cuda/correlation/correlation.cu new file mode 100644 index 000000000000..4767bca2769b --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/correlation/correlation.cu @@ -0,0 +1,183 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 +// clang-format on +/** + * correlation.c: This file is part of the PolyBench/C 3.2 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +#define EPS 0.1 + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_mean(int m, int n, double *data, double *corr, + double mean[], double stddev[]) { + int j = blockDim.x * blockIdx.x + threadIdx.x; + + if (j < m) { + mean[j] = 0.0; + for (int i = 0; i < n; i++) + mean[j] += data[i * m + j]; + mean[j] /= n; + } +} + +__global__ void kernel_stddev(int m, int n, double *data, double *corr, + double mean[], double stddev[]) { + int j = blockDim.x * blockIdx.x + threadIdx.x; + + if (j < m) { + stddev[j] = 0.0; + for (int i = 0; i < n; i++) + stddev[j] += (data[i * m + j] - mean[j]) * (data[i * m + j] - mean[j]); + stddev[j] /= n; + stddev[j] = sqrt(stddev[j]); + /* The following in an inelegant but usual way to handle + near-zero std. dev. values, which below would cause a zero- + divide. 
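// Illustration only, not part of the test sources: kernel_polly above performs
// an in-place, right-looking Cholesky factorization, one outer iteration per
// column: kernel0 takes the square root of the diagonal entry, kernel1 scales
// the column below it, and kernel2 updates the trailing lower triangle. A
// hypothetical sequential equivalent (lower triangle only, sqrt from the math
// header) for checking the factor:
static void ref_cholesky(int n, double *A) {
  for (int j = 0; j < n; j++) {
    A[j * n + j] = sqrt(A[j * n + j]);       // kernel0
    for (int i = j + 1; i < n; i++)
      A[i * n + j] /= A[j * n + j];          // kernel1
    for (int i = j + 1; i < n; i++)          // kernel2
      for (int k = j + 1; k <= i; k++)
        A[i * n + k] -= A[i * n + j] * A[k * n + j];
  }
}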
*/ + if (stddev[j] <= EPS) + stddev[j] = 1.0; + } +} + +__global__ void kernel_reduce(int m, int n, double *data, double *corr, + double mean[], double stddev[]) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < n && j < m) { + data[i * m + j] -= mean[j]; + data[i * m + j] /= std::sqrt((double)n) * stddev[j]; + } +} + +__global__ void kernel_diag(int m, int n, double *data, double *corr, + double mean[], double stddev[]) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < m) { + corr[i * m + i] = 1.0; + } +} + +__global__ void kernel_corr(int m, int n, double *data, double *corr, + double mean[], double stddev[]) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y + i + 1; + + if (i < m - 1 && j < m) { + corr[i * m + j] = 0.0; + for (int k = 0; k < n; k++) + corr[i * m + j] += (data[k * m + i] * data[k * m + j]); + corr[j * m + i] = corr[i * m + j]; + } +} + +__global__ void kernel_tail(int m, int n, double *data, double *corr, + double mean[], double stddev[]) { + corr[(m - 1) * m + m - 1] = 1.0; +} + +static void kernel(int m, int n, double *data, double *corr, double mean[], + double stddev[]) { + const unsigned threadsPerBlock = 256; + + kernel_mean<<>>( + m, n, data, corr, mean, stddev); + kernel_stddev<<>>( + m, n, data, corr, mean, stddev); + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(n, block.x), num_blocks(m, block.y), 1); + kernel_reduce<<>>(m, n, data, corr, mean, stddev); + } + + kernel_diag<<>>( + m, n, data, corr, mean, stddev); + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(m - 1, block.x), num_blocks(m - 1, block.y), 1); + kernel_corr<<>>(m, n, data, corr, mean, stddev); + } + + kernel_tail<<<1, 1>>>(m, n, data, corr, mean, stddev); +} + +static void init_array(int m, int n, double *data) { + int i, j; + + for (i = 0; i < m; i++) + for (j = 0; j < n; j++) + data[i * n + j] = ((double)i * j) / 1000; +} + +static void print_array(int m, double *corr) + +{ + int i, j; + + for (i = 0; i < m; i++) + for (j = 0; j < m; j++) { + fprintf(stderr, "%0.2lf ", corr[i * m + j]); + if ((i * m + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + + int dump_code = atoi(argv[1]); + int n = atoi(argv[2]); + int m = atoi(argv[3]); + + double *data = (double *)malloc(n * m * sizeof(double)); + double *mean = (double *)malloc(m * sizeof(double)); + double *stddev = (double *)malloc(m * sizeof(double)); + double *corr = (double *)malloc(m * m * sizeof(double)); + + init_array(m, n, data); + + double *dev_data; + double *dev_mean; + double *dev_stddev; + double *dev_corr; + cudaMalloc(&dev_data, n * m * sizeof(double)); + cudaMalloc(&dev_mean, m * sizeof(double)); + cudaMalloc(&dev_stddev, m * sizeof(double)); + cudaMalloc(&dev_corr, m * m * sizeof(double)); + cudaMemcpy(dev_data, data, n * m * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_mean, mean, m * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_stddev, stddev, m * m * sizeof(double), + cudaMemcpyHostToDevice); + cudaMemcpy(dev_corr, corr, m * m * sizeof(double), cudaMemcpyHostToDevice); + + kernel(m, n, dev_data, dev_corr, dev_mean, dev_stddev); + cudaMemcpy(corr, dev_corr, m * m * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(m, corr); + + free((void *)data); + ; + free((void *)corr); + ; + free((void *)mean); + ; + free((void *)stddev); + ; + + return 0; 
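// Illustration only, not part of the test sources: the kernels above implement
// the usual correlation-matrix pipeline: per-column means, per-column standard
// deviations guarded by EPS against a near-zero divide, normalization of data
// by sqrt(n) * stddev, then the strict upper triangle of corr mirrored into the
// lower triangle with the diagonal fixed at 1. A hypothetical sequential
// reference with the same ordering:
static void ref_correlation(int m, int n, double *data, double *corr,
                            double *mean, double *stddev) {
  for (int j = 0; j < m; j++) {
    mean[j] = 0.0;
    for (int i = 0; i < n; i++)
      mean[j] += data[i * m + j];
    mean[j] /= n;
  }
  for (int j = 0; j < m; j++) {
    stddev[j] = 0.0;
    for (int i = 0; i < n; i++)
      stddev[j] += (data[i * m + j] - mean[j]) * (data[i * m + j] - mean[j]);
    stddev[j] = sqrt(stddev[j] / n);
    if (stddev[j] <= EPS)                 // avoid dividing by a near-zero value
      stddev[j] = 1.0;
  }
  for (int i = 0; i < n; i++)
    for (int j = 0; j < m; j++)
      data[i * m + j] = (data[i * m + j] - mean[j]) / (sqrt((double)n) * stddev[j]);
  for (int i = 0; i < m; i++) {
    corr[i * m + i] = 1.0;
    for (int j = i + 1; j < m; j++) {
      corr[i * m + j] = 0.0;
      for (int k = 0; k < n; k++)
        corr[i * m + j] += data[k * m + i] * data[k * m + j];
      corr[j * m + i] = corr[i * m + j];
    }
  }
}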
+} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/covariance/covariance.cu b/tools/cgeist/Test/CUDA/polybench-cuda/covariance/covariance.cu new file mode 100644 index 000000000000..fed93c517f02 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/covariance/covariance.cu @@ -0,0 +1,131 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 +// clang-format on +/** + * covariance.c: This file is part of the PolyBench/C 3.2 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +static void init_array(int m, int n, double *data) { + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < m; j++) + data[i * m + j] = ((double)i * j) / 1000; +} + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_mean(int m, int n, double data[], double cov[], + double mean[]) { + int j = blockDim.x * blockIdx.x + threadIdx.x; + + if (j < m) { + mean[j] = 0.0; + for (int i = 0; i < n; i++) + mean[j] += data[i * m + j]; + mean[j] /= n; + } +} + +__global__ void kernel_reduce(int m, int n, double data[], double cov[], + double mean[]) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < n && j < m) { + data[i * m + j] -= mean[j]; + } +} + +__global__ void kernel_cov(int m, int n, double data[], double cov[], + double mean[]) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y + i; + + if (i < m && j < m) { + cov[i * m + j] = 0.0; + for (int k = 0; k < n; k++) + cov[i * m + j] += data[k * m + i] * data[k * m + j]; + cov[i * m + j] /= (n - 1.0); + cov[j * m + i] = cov[i * m + j]; + } +} + +static void kernel(int m, int n, double data[], double cov[], double mean[]) { + const unsigned threadsPerBlock = 256; + + kernel_mean<<>>(m, n, data, + cov, mean); + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(n, block.x), num_blocks(m, block.y), 1); + kernel_reduce<<>>(m, n, data, cov, mean); + } + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(m - 1, block.x), num_blocks(m - 1, block.y), 1); + kernel_cov<<>>(m, n, data, cov, mean); + } +} + +static void print_array(int m, double *cov) + +{ + int i, j; + + for (i = 0; i < m; i++) + for (j = 0; j < m; j++) { + fprintf(stderr, "%0.2lf ", cov[i * m + j]); + if ((i * m + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + + int dump_code = atoi(argv[1]); + int n = atoi(argv[2]); + int m = atoi(argv[3]); + + double *data = (double *)malloc(n * m * sizeof(double)); + double *mean = (double *)malloc(m * sizeof(double)); + double *cov = (double *)malloc(m * m * sizeof(double)); + + init_array(m, n, data); + + double *dev_data; + double *dev_mean; + double *dev_cov; + cudaMalloc(&dev_data, n * m * sizeof(double)); + cudaMalloc(&dev_mean, m * sizeof(double)); + cudaMalloc(&dev_cov, m * m * sizeof(double)); + cudaMemcpy(dev_data, data, n * m * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_mean, mean, m * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_cov, cov, m * m * sizeof(double), cudaMemcpyHostToDevice); + + kernel(m, n, dev_data, dev_cov, dev_mean); + cudaMemcpy(cov, dev_cov, m * m * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(m, cov); + + free((void *)data); + free((void *)cov); + free((void 
*)mean); + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/doitgen/doitgen.cu b/tools/cgeist/Test/CUDA/polybench-cuda/doitgen/doitgen.cu new file mode 100644 index 000000000000..7cfa24415dd3 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/doitgen/doitgen.cu @@ -0,0 +1,111 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 10 +// clang-format on +/** + * doitgen.c: This file is part of the PolyBench/C 3.2 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_sum(int nr, int nq, int np, double *A, double *C4, + double *sum) { + int r = blockDim.x * blockIdx.x + threadIdx.x; + int q = blockDim.y * blockIdx.y + threadIdx.y; + int p = blockDim.z * blockIdx.z + threadIdx.z; + + if (r < nr && q < nq && p < np) { + double dot = 0.0; + sum[(r * nq + q) * np + p] = 0; + for (int s = 0; s < np; s++) + dot += A[(r * nq + q) * np + s] * C4[s * np + p]; + sum[(r * nq + q) * np + p] = dot; + } +} + +static void kernel(int nr, int nq, int np, double *A, double *C4, double *sum) { + + const unsigned threadsPerBlock = 256; + + dim3 block(1, threadsPerBlock / 32, 32); + dim3 grid(num_blocks(nr, block.x), num_blocks(nq, block.y), + num_blocks(np, block.z)); + kernel_sum<<>>(nr, nq, np, A, C4, sum); +} + +static void init_array(int nr, int nq, int np, double *A, double *C4) { + int i, j, k; + + for (i = 0; i < nr; i++) + for (j = 0; j < nq; j++) + for (k = 0; k < np; k++) + A[i * np * nq + j * nq + k] = ((double)i * j + k) / np; + for (i = 0; i < np; i++) + for (j = 0; j < np; j++) + C4[i * np + j] = ((double)i * j) / np; +} + +static void print_array(int nr, int nq, int np, double *A) { + int i, j, k; + + for (i = 0; i < nr; i++) + for (j = 0; j < nq; j++) + for (k = 0; k < np; k++) { + fprintf(stderr, "%0.2lf ", A[i * nq * np + j * nq + k]); + if (i % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + + int dump_code = atoi(argv[1]); + int nr = atoi(argv[2]); + int nq = atoi(argv[3]); + int np = atoi(argv[4]); + + double *A = (double *)malloc(nr * nq * np * sizeof(double)); + double *sum = (double *)malloc(nr * nq * np * sizeof(double)); + double *C4 = (double *)malloc(np * np * sizeof(double)); + + init_array(nr, nq, np, A, C4); + + double *dev_A; + double *dev_sum; + double *dev_C4; + cudaMalloc(&dev_A, nr * nq * np * sizeof(double)); + cudaMalloc(&dev_sum, nr * nq * np * sizeof(double)); + cudaMalloc(&dev_C4, np * np * sizeof(double)); + cudaMemcpy(dev_A, A, nr * nq * np * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_C4, C4, np * np * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_sum, sum, nr * nq * np * sizeof(double), + cudaMemcpyHostToDevice); + + kernel(nr, nq, np, dev_A, dev_C4, dev_sum); + + cudaMemcpy(sum, dev_sum, nr * nq * np * sizeof(double), + cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(nr, nq, np, sum); + + free((void *)A); + ; + free((void *)sum); + ; + free((void *)C4); + ; + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/fdtd-2d/fdtd-2d.cu b/tools/cgeist/Test/CUDA/polybench-cuda/fdtd-2d/fdtd-2d.cu new file mode 100644 index 000000000000..e3d50d078a88 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/fdtd-2d/fdtd-2d.cu @@ -0,0 +1,165 @@ +// clang-format off +// RUN: cgeist 
%s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 5 10 10 +// clang-format on +/** + * fdtd-2d.c: This file is part of the PolyBench 3.0 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +#define TMAX 100 +#define NX 4000 +#define NY 4000 + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_splat(int tmax, int nx, int ny, double *ex, double *ey, + double *hz, double fict[], int t) { + int j = blockDim.x * blockIdx.x + threadIdx.x; + + if (j < ny) + ey[0 * ny + j] = fict[t]; +} + +__global__ void kernel_ey(int tmax, int nx, int ny, double *ex, double *ey, + double *hz, double fict[], int t) { + int i = blockDim.x * blockIdx.x + threadIdx.x + 1; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < nx && j < ny) + ey[i * ny + j] -= (double)(0.5) * (hz[i * ny + j] - hz[(i - 1) * ny + j]); +} + +__global__ void kernel_ex(int tmax, int nx, int ny, double *ex, double *ey, + double *hz, double fict[], int t) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y + 1; + + if (i < nx && j < ny) + ex[i * ny + j] -= (double)(0.5) * (hz[i * ny + j] - hz[i * ny + j - 1]); +} + +__global__ void kernel_hz(int tmax, int nx, int ny, double *ex, double *ey, + double *hz, double fict[], int t) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < nx - 1 && j < ny - 1) + hz[i * ny + j] -= (double)(0.7) * (ex[i * ny + j + 1] - ex[i * ny + j] + + ey[(i + 1) * ny + j] - ey[i * ny + j]); +} + +static void kernel(int tmax, int nx, int ny, double *ex, double *ey, double *hz, + double *fict) { + const unsigned threadsPerBlock = 256; + + for (int t = 0; t < tmax; t++) { + kernel_splat<<>>( + tmax, nx, ny, ex, ey, hz, fict, t); + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(nx - 1, block.x), num_blocks(ny, block.y), 1); + kernel_ey<<>>(tmax, nx, ny, ex, ey, hz, fict, t); + } + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(nx, block.x), num_blocks(ny - 1, block.y), 1); + kernel_ex<<>>(tmax, nx, ny, ex, ey, hz, fict, t); + } + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(nx - 1, block.x), num_blocks(ny - 1, block.y), 1); + kernel_hz<<>>(tmax, nx, ny, ex, ey, hz, fict, t); + } + } +} + +/* Array initialization. */ +static void init_array(int nx, int ny, double *ex, double *ey, double *hz, + double *_fict_) { + int i, j; + + for (i = 0; i < ny; i++) + _fict_[i] = (double)i; + for (i = 0; i < nx; i++) + for (j = 0; j < ny; j++) { + ex[i * ny + j] = ((double)i * (j + 1)) / nx; + ey[i * ny + j] = ((double)i * (j + 2)) / ny; + hz[i * ny + j] = ((double)i * (j + 3)) / nx; + } +} + +/* DCE code. Must scan the entire live-out data. + Can be used also to check the correctness of the output. */ +static void print_array(int nx, int ny, double *ex, double *ey, double *hz) { + int i, j; + + for (i = 0; i < nx; i++) + for (j = 0; j < ny; j++) { + fprintf(stderr, "%0.2lf ", ex[i * ny + j]); + fprintf(stderr, "%0.2lf ", ey[i * ny + j]); + fprintf(stderr, "%0.2lf ", hz[i * ny + j]); + if ((i * nx + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + /* Retrieve problem size. 
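// Illustration only, not part of the test sources: each iteration of the time
// loop above advances the 2-D FDTD fields by one step with four kernel
// launches: the first row of ey is driven by fict[t], ey and ex are then
// updated from the current hz, and hz is finally updated from the freshly
// written ex and ey. A hypothetical sequential sketch of one time step:
static void ref_fdtd_step(int nx, int ny, double *ex, double *ey, double *hz,
                          const double *fict, int t) {
  for (int j = 0; j < ny; j++)
    ey[0 * ny + j] = fict[t];
  for (int i = 1; i < nx; i++)
    for (int j = 0; j < ny; j++)
      ey[i * ny + j] -= 0.5 * (hz[i * ny + j] - hz[(i - 1) * ny + j]);
  for (int i = 0; i < nx; i++)
    for (int j = 1; j < ny; j++)
      ex[i * ny + j] -= 0.5 * (hz[i * ny + j] - hz[i * ny + j - 1]);
  for (int i = 0; i < nx - 1; i++)
    for (int j = 0; j < ny - 1; j++)
      hz[i * ny + j] -= 0.7 * (ex[i * ny + j + 1] - ex[i * ny + j] +
                               ey[(i + 1) * ny + j] - ey[i * ny + j]);
}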
*/ + int tmax = atoi(argv[2]); + int nx = atoi(argv[3]); + int ny = atoi(argv[4]); + int dump_code = atoi(argv[1]); + + /* Variable declaration/allocation. */ + double *ex = (double *)malloc(nx * ny * sizeof(double)); + double *ey = (double *)malloc(nx * ny * sizeof(double)); + double *hz = (double *)malloc(nx * ny * sizeof(double)); + double *_fict_ = (double *)malloc(ny * sizeof(double)); + + /* Initialize array(s). */ + init_array(nx, ny, ex, ey, hz, _fict_); + double *dev_ex; + double *dev_ey; + double *dev_hz; + double *dev_fict; + cudaMalloc(&dev_ex, nx * ny * sizeof(double)); + cudaMalloc(&dev_ey, nx * ny * sizeof(double)); + cudaMalloc(&dev_hz, nx * ny * sizeof(double)); + cudaMalloc(&dev_fict, ny * sizeof(double)); + cudaMemcpy(dev_ex, ex, nx * ny * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_ey, ey, nx * ny * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_hz, hz, nx * ny * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_fict, _fict_, ny * sizeof(double), cudaMemcpyHostToDevice); + /* Run kernel. */ + kernel(tmax, nx, ny, dev_ex, dev_ey, dev_hz, dev_fict); + cudaMemcpy(ex, dev_ex, nx * ny * sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(ey, dev_ey, nx * ny * sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(hz, dev_hz, nx * ny * sizeof(double), cudaMemcpyDeviceToHost); + + /* Prevent dead-code elimination. All live-out data must be printed + by the function call in argument. */ + if (dump_code == 1) + print_array(nx, ny, ex, ey, hz); + + /* Be clean. */ + free((void *)ex); + free((void *)ey); + free((void *)hz); + free((void *)_fict_); + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/gemm/gemm.cu b/tools/cgeist/Test/CUDA/polybench-cuda/gemm/gemm.cu new file mode 100644 index 000000000000..04a534687bc0 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/gemm/gemm.cu @@ -0,0 +1,108 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 10 +// clang-format on +/** + * gemm.c: This file is part of the PolyBench/C 3.2 test suite. 
+ * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +__global__ void kernel_dev(int ni, int nj, int nk, double alpha, double beta, + double *C, double *A, double *B) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + int k; + double dot = 0.0; + + if (i < ni && j < nj) { + dot = C[i * nj + j] * beta; + for (k = 0; k < nk; k++) + dot += alpha * A[i * nk + k] * B[k * nj + j]; + C[i * nj + j] = dot; + } +} + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +static void kernel(int ni, int nj, int nk, double alpha, double beta, double *C, + double *A, double *B) { + + unsigned threadsPerBlock = 256; + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(ni, block.x), num_blocks(nj, block.y), 1); + kernel_dev<<>>(ni, nj, nk, alpha, beta, C, A, B); +} + +static void init_array(int ni, int nj, int nk, double *C, double *A, + double *B) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) + C[i * nj + j] = ((double)i * j) / ni; + for (i = 0; i < ni; i++) + for (j = 0; j < nk; j++) + A[i * nk + j] = ((double)i * j) / ni; + for (i = 0; i < nk; i++) + for (j = 0; j < nj; j++) + B[i * nj + j] = ((double)i * j) / ni; +} + +static void print_array(int ni, int nj, double *C) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) { + fprintf(stderr, "%0.2lf ", C[i * nj + j]); + if ((i * ni + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + + int dump_code = atoi(argv[1]); + int ni = atoi(argv[2]); + int nj = atoi(argv[3]); + int nk = atoi(argv[4]); + + double *A = (double *)malloc(ni * nk * sizeof(double)); + double *B = (double *)malloc(nk * nj * sizeof(double)); + double *C = (double *)malloc(ni * nj * sizeof(double)); + + double alpha = 32412; + double beta = 2123; + + init_array(ni, nj, nk, C, A, B); + + double *dev_A; + double *dev_B; + double *dev_C; + cudaMalloc(&dev_A, ni * nk * sizeof(double)); + cudaMalloc(&dev_B, nk * nj * sizeof(double)); + cudaMalloc(&dev_C, ni * nj * sizeof(double)); + cudaMemcpy(dev_A, A, ni * nk * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_B, B, nk * nj * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_C, C, ni * nj * sizeof(double), cudaMemcpyHostToDevice); + + kernel(ni, nj, nk, alpha, beta, dev_C, dev_A, dev_B); + cudaMemcpy(C, dev_C, ni * nj * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(ni, nj, C); + + free((void *)C); + free((void *)A); + free((void *)B); + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/gemver/gemver.cu b/tools/cgeist/Test/CUDA/polybench-cuda/gemver/gemver.cu new file mode 100644 index 000000000000..264f168e7185 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/gemver/gemver.cu @@ -0,0 +1,190 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 +// clang-format on +/** + * gemver.c: This file is part of the PolyBench 3.0 test suite. 
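// Illustration only, not part of the test sources: kernel_dev above computes
// C = alpha * A * B + beta * C with one (i, j) element per thread, keeping the
// running sum in the register `dot` and writing C[i * nj + j] exactly once. A
// hypothetical sequential reference for checking the result:
static void ref_gemm(int ni, int nj, int nk, double alpha, double beta,
                     double *C, const double *A, const double *B) {
  for (int i = 0; i < ni; i++)
    for (int j = 0; j < nj; j++) {
      double dot = C[i * nj + j] * beta;
      for (int k = 0; k < nk; k++)
        dot += alpha * A[i * nk + k] * B[k * nj + j];
      C[i * nj + j] = dot;
    }
}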
+ * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +#define RUN 50 + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_A(int n, double alpha, double beta, double *A, + double *u1, double *v1, double *u2, double *v2, + double *w, double *x, double *y, double *z) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < n && j < n) + A[i * n + j] += u1[i] * v1[j] + u2[i] * v2[j]; +} + +__global__ void kernel_x(int n, double alpha, double beta, double *A, + double *u1, double *v1, double *u2, double *v2, + double *w, double *x, double *y, double *z) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < n) { + for (int j = 0; j < n; j++) + x[i] += beta * A[j * n + i] * y[j]; + } +} + +__global__ void kernel_y(int n, double alpha, double beta, double *A, + double *u1, double *v1, double *u2, double *v2, + double *w, double *x, double *y, double *z) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < n) + x[i] += z[i]; +} + +__global__ void kernel_w(int n, double alpha, double beta, double *A, + double *u1, double *v1, double *u2, double *v2, + double *w, double *x, double *y, double *z) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < n) { + for (int j = 0; j < n; j++) + w[i] += alpha * A[i * n + j] * x[j]; + } +} + +static void kernel(int n, double alpha, double beta, double *A, double *u1, + double *v1, double *u2, double *v2, double *w, double *x, + double *y, double *z) { + + const unsigned threadsPerBlock = 256; + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(n, block.x), num_blocks(n, block.y), 1); + kernel_A<<>>(n, alpha, beta, A, u1, v1, u2, v2, w, x, y, z); + } + + kernel_x<<>>( + n, alpha, beta, A, u1, v1, u2, v2, w, x, y, z); + kernel_y<<>>( + n, alpha, beta, A, u1, v1, u2, v2, w, x, y, z); + kernel_w<<>>( + n, alpha, beta, A, u1, v1, u2, v2, w, x, y, z); +} + +/* Array initialization. */ +static void init_array(int n, double *A, double *u1, double *v1, double *u2, + double *v2, double *w, double *x, double *y, double *z) { + int i, j; + + for (i = 0; i < n; i++) { + u1[i] = i; + u2[i] = (i + 1) / n / 2.0; + v1[i] = (i + 1) / n / 4.0; + v2[i] = (i + 1) / n / 6.0; + y[i] = (i + 1) / n / 8.0; + z[i] = (i + 1) / n / 9.0; + x[i] = 0.0; + w[i] = 0.0; + for (j = 0; j < n; j++) + A[i * n + j] = ((double)i * j) / n; + } +} + +/* DCE code. Must scan the entire live-out data. + Can be used also to check the correctness of the output. */ +static void print_array(int n, double *w) { + int i; + + for (i = 0; i < n; i++) { + fprintf(stderr, "%0.2lf ", w[i]); + if (i % 20 == 0) + fprintf(stderr, "\n"); + } +} + +int main(int argc, char **argv) { + /* Retrieve problem size. */ + int n = atoi(argv[2]); + int dump_code = atoi(argv[1]); + + /* Variable declaration/allocation. */ + double *A = (double *)malloc(n * n * sizeof(double)); + double *u1 = (double *)malloc(n * sizeof(double)); + double *v1 = (double *)malloc(n * sizeof(double)); + double *u2 = (double *)malloc(n * sizeof(double)); + double *v2 = (double *)malloc(n * sizeof(double)); + double *w = (double *)malloc(n * sizeof(double)); + double *x = (double *)malloc(n * sizeof(double)); + double *y = (double *)malloc(n * sizeof(double)); + double *z = (double *)malloc(n * sizeof(double)); + + double alpha = 43532; + double beta = 12313; + /* Initialize array(s). 
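// Illustration only, not part of the test sources: the four kernels above
// implement the GEMVER sequence: a rank-two update A += u1*v1^T + u2*v2^T,
// then x += beta * A^T * y, x += z, and finally w += alpha * A * x. A
// hypothetical sequential reference with the same row-major indexing:
static void ref_gemver(int n, double alpha, double beta, double *A,
                       const double *u1, const double *v1, const double *u2,
                       const double *v2, double *w, double *x,
                       const double *y, const double *z) {
  for (int i = 0; i < n; i++)
    for (int j = 0; j < n; j++)
      A[i * n + j] += u1[i] * v1[j] + u2[i] * v2[j];
  for (int i = 0; i < n; i++)
    for (int j = 0; j < n; j++)
      x[i] += beta * A[j * n + i] * y[j];
  for (int i = 0; i < n; i++)
    x[i] += z[i];
  for (int i = 0; i < n; i++)
    for (int j = 0; j < n; j++)
      w[i] += alpha * A[i * n + j] * x[j];
}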
*/ + init_array(n, A, u1, v1, u2, v2, w, x, y, z); + + double *dev_A; + double *dev_u1; + double *dev_v1; + double *dev_u2; + double *dev_v2; + double *dev_w; + double *dev_x; + double *dev_y; + double *dev_z; + double *dev_alpha; + double *dev_beta; + cudaMalloc(&dev_A, n * n * sizeof(double)); + cudaMalloc(&dev_u1, n * sizeof(double)); + cudaMalloc(&dev_v1, n * sizeof(double)); + cudaMalloc(&dev_u2, n * sizeof(double)); + cudaMalloc(&dev_v2, n * sizeof(double)); + cudaMalloc(&dev_w, n * sizeof(double)); + cudaMalloc(&dev_x, n * sizeof(double)); + cudaMalloc(&dev_y, n * sizeof(double)); + cudaMalloc(&dev_z, n * sizeof(double)); + cudaMalloc(&dev_alpha, sizeof(double)); + cudaMalloc(&dev_beta, sizeof(double)); + cudaMemcpy(dev_A, A, n * n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_u1, u1, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_v1, v1, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_u2, u2, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_v2, v2, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_w, w, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_x, x, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_y, y, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_z, z, n * sizeof(double), cudaMemcpyHostToDevice); + + /* Run kernel. */ + kernel(n, alpha, beta, dev_A, dev_u1, dev_v1, dev_u2, dev_v2, dev_w, dev_x, + dev_y, dev_z); + cudaMemcpy(w, dev_w, n * sizeof(double), cudaMemcpyDeviceToHost); + + /* Prevent dead-code elimination. All live-out data must be printed + by the function call in argument. */ + if (dump_code == 1) + print_array(n, w); + + /* Be clean. */ + free((void *)A); + free((void *)u1); + free((void *)v1); + free((void *)u2); + free((void *)v2); + free((void *)w); + free((void *)x); + free((void *)y); + free((void *)z); + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/gesummv/gesummv.cu b/tools/cgeist/Test/CUDA/polybench-cuda/gesummv/gesummv.cu new file mode 100644 index 000000000000..ac4e1a09442e --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/gesummv/gesummv.cu @@ -0,0 +1,124 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 +// clang-format on +/** + * gesummv.c: This file is part of the PolyBench 3.0 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +#define RUN 100 + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_y(int n, double alpha, double beta, double *A, double *B, + double tmp[], double x[], double y[]) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + + if (i < n) { + tmp[i] = 0; + y[i] = 0; + for (int j = 0; j < n; j++) { + tmp[i] += A[i * n + j] * x[j]; + y[i] += B[i * n + j] * x[j]; + } + y[i] = alpha * tmp[i] + beta * y[i]; + } +} + +/* Array initialization. */ +static void init_array(int n, double *A, double *B, double *x) { + int i, j; + + for (i = 0; i < n; i++) { + x[i] = ((double)i) / n; + for (j = 0; j < n; j++) { + A[i * n + j] = ((double)i * j) / n; + B[i * n + j] = ((double)i * j) / n; + } + } +} + +/* DCE code. Must scan the entire live-out data. + Can be used also to check the correctness of the output. 
*/ +static void print_array(int n, double *y) + +{ + int i; + + for (i = 0; i < n; i++) { + fprintf(stderr, "%0.2lf ", y[i]); + if (i % 20 == 0) + fprintf(stderr, "\n"); + } +} + +int main(int argc, char **argv) { + /* Retrieve problem size. */ + int n = atoi(argv[2]); + int dump_code = atoi(argv[1]); + + /* Variable declaration/allocation. */ + double *A = (double *)malloc(n * n * sizeof(double)); + double *B = (double *)malloc(n * n * sizeof(double)); + double *tmp = (double *)malloc(n * sizeof(double)); + double *x = (double *)malloc(n * sizeof(double)); + double *y = (double *)malloc(n * sizeof(double)); + + //__builtin_assume(n>0); + //__builtin_assume(n<0x7FFFFFFE); + /* Initialize array(s). */ + init_array(n, A, B, x); + double alpha = 43532; + double beta = 12313; + + double *dev_A; + double *dev_B; + double *dev_tmp; + double *dev_x; + double *dev_y; + cudaMalloc(&dev_A, n * n * sizeof(double)); + cudaMalloc(&dev_B, n * n * sizeof(double)); + cudaMalloc(&dev_tmp, n * sizeof(double)); + cudaMalloc(&dev_x, n * sizeof(double)); + cudaMalloc(&dev_y, n * sizeof(double)); + cudaMemcpy(dev_A, A, n * n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_B, B, n * n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_tmp, tmp, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_x, x, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_y, y, n * sizeof(double), cudaMemcpyHostToDevice); + /* Run kernel. */ + + const unsigned threadsPerBlock = 256; + kernel_y<<>>( + n, alpha, beta, dev_A, dev_B, dev_tmp, dev_x, dev_y); + cudaMemcpy(y, dev_y, n * sizeof(double), cudaMemcpyDeviceToHost); + + /* Prevent dead-code elimination. All live-out data must be printed + by the function call in argument. */ + if (dump_code == 1) + print_array(n, y); + + /* Be clean. 
*/ + free((void *)A); + free((void *)B); + free((void *)tmp); + free((void *)x); + free((void *)y); + + cudaFree((void *)dev_A); + cudaFree((void *)dev_B); + cudaFree((void *)dev_tmp); + cudaFree((void *)dev_x); + cudaFree((void *)dev_y); + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/heat-3d/heat-3d.cu b/tools/cgeist/Test/CUDA/polybench-cuda/heat-3d/heat-3d.cu new file mode 100644 index 000000000000..11e0fe5a4287 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/heat-3d/heat-3d.cu @@ -0,0 +1,96 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 10 +// clang-format on +#include +#include +#include +#include +#include + +static void init_array(int n, double *A, double *B) { + int i, j, k; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) + for (k = 0; k < n; k++) { + A[(i * n + j) * n + k] = ((double)i + j + k) / n; + B[(i * n + j) * n + k] = ((double)i + j + k + 1) / n; + } +} + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_stencil(int iter, double *A, double *B) { + int i = blockDim.x * blockIdx.x + threadIdx.x + 1; + int j = blockDim.y * blockIdx.y + threadIdx.y + 1; + int k = blockDim.z * blockIdx.z + threadIdx.z + 1; + + if (i < iter - 1 && j < iter - 1 && k < iter - 1) { + B[(i * iter + j) * iter + k] = + (A[((i + 1) * iter + j) * iter + k] - 2 * A[(i * iter + j) * iter + k] + + A[((i - 1) * iter + j) * iter + k]) / + 8 + + (A[(i * iter + (j + 1)) * iter + k] - 2 * A[(i * iter + j) * iter + k] + + A[(i * iter + (j - 1)) * iter + k]) / + 8 + + (A[(i * iter + j) * iter + k + 1] - 2 * A[(i * iter + j) * iter + k] + + A[(i * iter + j) * iter + k - 1]) / + 8 + + A[(i * iter + j) * iter + k]; + } +} + +static void kernel(int tsteps, int iter, double *A, double *B) { + const unsigned int threadsPerBlock = 256; + + for (int t = 1; t <= tsteps; t++) { + dim3 block(1, threadsPerBlock / 32, 32); + dim3 grid(num_blocks(iter - 2, block.x), num_blocks(iter - 2, block.y), + num_blocks(iter - 2, block.z)); + kernel_stencil<<>>(iter, A, B); + kernel_stencil<<>>(iter, B, A); + } +} + +static void print_array(int n, double *A) { + int i, j, k; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) + for (k = 0; k < n; k++) { + fprintf(stderr, "%0.2lf ", A[(i * n + j) * n + k]); + if (((i * n + j) * n + k) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + + int dump_code = atoi(argv[1]); + int tsteps = atoi(argv[2]); + int n = atoi(argv[3]); + + double *A = (double *)malloc(n * n * n * sizeof(double)); + double *B = (double *)malloc(n * n * n * sizeof(double)); + + init_array(n, A, B); + + double *dev_A; + double *dev_B; + cudaMalloc(&dev_A, n * n * n * sizeof(double)); + cudaMalloc(&dev_B, n * n * n * sizeof(double)); + cudaMemcpy(dev_A, A, n * n * n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_B, B, n * n * n * sizeof(double), cudaMemcpyHostToDevice); + + kernel(tsteps, n, dev_A, dev_B); + cudaMemcpy(A, dev_A, n * n * n * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(n, A); + + free((void *)A); + free((void *)B); +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/jacobi-1d-imper/jacobi-1d-imper.cu b/tools/cgeist/Test/CUDA/polybench-cuda/jacobi-1d-imper/jacobi-1d-imper.cu new file mode 100644 index 000000000000..6cdf5544de3c --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/jacobi-1d-imper/jacobi-1d-imper.cu @@ -0,0 +1,98 @@ +// 
clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 5 10 +// clang-format on +/** + * jacobi-1d-imper.c: This file is part of the PolyBench 3.0 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_stencil(int n, double A[], double B[]) { + int i = blockDim.x * blockIdx.x + threadIdx.x + 1; + + if (i < n - 1) { + B[i] = (A[i - 1] + A[i] + A[i + 1]) / 3; + } +} + +static void kernel(int tsteps, int n, double A[], double B[]) { + const unsigned int threadsPerBlock = 256; + + for (int t = 1; t <= tsteps; t++) { + kernel_stencil<<>>(n, A, + B); + kernel_stencil<<>>(n, B, + A); + } +} + +/* Array initialization. */ +static void init_array(int n, double *A, double *B) { + int i; + + for (i = 0; i < n; i++) { + A[i] = ((double)i + 2) / n; + B[i] = ((double)i + 3) / n; + } +} + +/* DCE code. Must scan the entire live-out data. + Can be used also to check the correctness of the output. */ +static void print_array(int n, double *A) + +{ + int i; + + for (i = 0; i < n; i++) { + fprintf(stderr, "%0.2lf ", A[i]); + if (i % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + /* Retrieve problem size. */ + int n = atoi(argv[3]); + int tsteps = atoi(argv[2]); + int dump_code = atoi(argv[1]); + + /* Variable declaration/allocation. */ + double *A = (double *)malloc(n * sizeof(double)); + double *B = (double *)malloc(n * sizeof(double)); + + init_array(n, A, B); + + double *dev_A; + double *dev_B; + cudaMalloc(&dev_A, n * sizeof(double)); + cudaMalloc(&dev_B, n * sizeof(double)); + cudaMemcpy(dev_A, A, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_B, B, n * sizeof(double), cudaMemcpyHostToDevice); + + /* Run kernel. */ + kernel(tsteps, n, dev_A, dev_B); + cudaMemcpy(A, dev_A, n * sizeof(double), cudaMemcpyDeviceToHost); + + /* Prevent dead-code elimination. All live-out data must be printed + by the function call in argument. */ + if (dump_code == 1) + print_array(n, A); + + /* Be clean. */ + free((void *)A); + free((void *)B); + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/jacobi-2d-imper/jacobi-2d-imper.cu b/tools/cgeist/Test/CUDA/polybench-cuda/jacobi-2d-imper/jacobi-2d-imper.cu new file mode 100644 index 000000000000..4ea7bb037d77 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/jacobi-2d-imper/jacobi-2d-imper.cu @@ -0,0 +1,101 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 5 +// clang-format on +/** + * jacobi-2d-imper.c: This file is part of the PolyBench/C 3.2 test suite. 
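+ * (The kernel below applies a 5-point Jacobi averaging stencil to the interior points; each time step launches it twice, ping-ponging between A and B.)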
+ * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_stencil(int n, double *A, double *B) { + int i = blockDim.x * blockIdx.x + threadIdx.x + 1; + int j = blockDim.y * blockIdx.y + threadIdx.y + 1; + + if (i < n - 1 && j < n - 1) { + B[i * n + j] = (A[i * n + j] + A[i * n + j - 1] + A[i * n + 1 + j] + + A[(1 + i) * n + j] + A[(i - 1) * n + j]) / + 5; + } +} + +static void kernel(int tsteps, int n, double *A, double *B) { + const unsigned int threadsPerBlock = 256; + + for (int t = 1; t <= tsteps; t++) { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(n - 2, block.x), num_blocks(n - 2, block.y), 1); + kernel_stencil<<>>(n, A, B); + kernel_stencil<<>>(n, B, A); + } +} + +static void init_array(int n, double *A, double *B) { + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) { + A[i * n + j] = ((double)i * (j + 2) + 2) / n; + B[i * n + j] = ((double)i * (j + 3) + 3) / n; + } +} + +static void print_array(int n, double *A) + +{ + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) { + fprintf(stderr, "%0.2lf ", A[i * n + j]); + if ((i * n + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + + int dump_code = atoi(argv[1]); + int n = atoi(argv[2]); + int tsteps = atoi(argv[3]); + + double *A = (double *)malloc(n * n * sizeof(double)); + double *B = (double *)malloc(n * n * sizeof(double)); + + // __builtin_assume(tsteps>0); + // __builtin_assume(n>2); + init_array(n, A, B); + + double *dev_A; + double *dev_B; + cudaMalloc(&dev_A, n * n * sizeof(double)); + cudaMalloc(&dev_B, n * n * sizeof(double)); + cudaMemcpy(dev_A, A, n * n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_B, B, n * n * sizeof(double), cudaMemcpyHostToDevice); + + kernel(tsteps, n, dev_A, dev_B); + cudaMemcpy(A, dev_A, n * n * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(n, A); + + free((void *)A); + ; + free((void *)B); + ; + cudaFree((void *)dev_A); + cudaFree((void *)dev_B); + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/lu/lu.cu b/tools/cgeist/Test/CUDA/polybench-cuda/lu/lu.cu new file mode 100644 index 000000000000..4406bdb7ffcd --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/lu/lu.cu @@ -0,0 +1,96 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 100 +// clang-format on +/** + * lu.c: This file is part of the PolyBench/C 3.2 test suite. 
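+ * (The kernels below perform an in-place LU factorization without pivoting: for each column k, kernel_div scales the entries below the diagonal by 1/A[k][k], then kernel_A updates the trailing submatrix.)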
+ * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +static void init_array(int n, double *A) { + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) + A[i * n + j] = ((double)(i + 1) * (j + 1)) / n; +} + +static void print_array(int n, double *A) + +{ + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) { + fprintf(stderr, "%0.2lf ", A[i * n + j]); + if ((i * n + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_div(int n, double *A, int k) { + int i = blockDim.x * blockIdx.x + threadIdx.x + k + 1; + + if (i < n) + A[i * n + k] /= A[k * n + k]; +} + +__global__ void kernel_A(int n, double *A, int k) { + int i = blockDim.x * blockIdx.x + threadIdx.x + k + 1; + int j = blockDim.y * blockIdx.y + threadIdx.y + k + 1; + + if (i < n && j < n) + A[i * n + j] -= A[i * n + k] * A[k * n + j]; +} + +static void kernel(int n, double *A) { + const unsigned int threadsPerBlock = 256; + + for (int iter = 0; iter < n - 1; iter++) { + kernel_div<<>>(n, A, iter); + + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(n - (iter + 1), block.x), + num_blocks(n - (iter + 1), block.y), 1); + kernel_A<<>>(n, A, iter); + } +} + +int main(int argc, char **argv) { + + int dump_code = atoi(argv[1]); + int n = atoi(argv[2]); + + double *A = (double *)malloc(n * n * sizeof(double)); + + init_array(n, A); + + double *dev_A; + cudaMalloc(&dev_A, n * n * sizeof(double)); + cudaMemcpy(dev_A, A, n * n * sizeof(double), cudaMemcpyHostToDevice); + + kernel(n, dev_A); + + cudaMemcpy(A, dev_A, n * n * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(n, A); + + free((void *)A); + ; + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/mvt/mvt.cu b/tools/cgeist/Test/CUDA/polybench-cuda/mvt/mvt.cu new file mode 100644 index 000000000000..20cff302c13e --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/mvt/mvt.cu @@ -0,0 +1,141 @@ +// clang-format off +// COM: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm +// RUN: true +// clang-format on +/** + * mvt.c: This file is part of the PolyBench 3.0 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include +#include + +/* Include benchmark-specific header. */ +/* Default data type is double, default size is 4000. */ +#define RUN 200 +#define N 15000 +// #define N 40 + +/* Array initialization. */ +static void init_array(int n, double *x1, double *x2, double *y_1, double *y_2, + double *A) { + int i, j; + + for (i = 0; i < n; i++) { + x1[i] = ((double)i) / n; + x2[i] = ((double)i + 1) / n; + y_1[i] = ((double)i + 3) / n; + y_2[i] = ((double)i + 4) / n; + for (j = 0; j < n; j++) + A[i * n + j] = ((double)i * j) / n; + } +} + +/* DCE code. Must scan the entire live-out data. + Can be used also to check the correctness of the output. 
*/ +static void print_array(int n, double *x1, double *x2) + +{ + int i; + + for (i = 0; i < n; i++) { + fprintf(stderr, "%0.2lf", x1[i]); + fprintf(stderr, "%0.2lf", x2[i]); + if (i % 20 == 0) + fprintf(stderr, "\n"); + } +} + +__global__ void kernel_x1(int n, double *x1, double *x2, double *y_1, + double *y_2, double *A) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j; + if (i < n) { + for (j = 0; j < n; j++) + x1[i] += A[i * n + j] * y_1[j]; + } +} + +__global__ void kernel_x2(int n, double *x1, double *x2, double *y_1, + double *y_2, double *A) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j; + if (i < n) { + for (j = 0; j < n; j++) + x2[i] += A[j * n + i] * y_2[j]; + } +} + +short num_blocks(short num, short factor) { + return (num + factor - 1) / factor; +} + +void kernel(int n, double *x1, double *x2, double *y_1, double *y_2, + double *A) { + short threadsPerBlock = 256; + + kernel_x1<<>>(n, x1, x2, y_1, + y_2, A); + kernel_x2<<>>(n, x1, x2, y_1, + y_2, A); +} + +int main(int argc, char **argv) { + /* Retrieve problem size. */ + int dump_code = atoi(argv[1]); + int n = N; + + /* Variable declaration/allocation. */ + double *A = (double *)malloc(N * N * sizeof(double)); + double *x1 = (double *)malloc(sizeof(double) * n); + double *x2 = (double *)malloc(sizeof(double) * n); + double *y_1 = (double *)malloc(sizeof(double) * n); + double *y_2 = (double *)malloc(sizeof(double) * n); + + /* Initialize array(s). */ + init_array(n, x1, x2, y_1, y_2, A); + + double *dev_A; + double *dev_x1; + double *dev_x2; + double *dev_y_1; + double *dev_y_2; + + cudaMalloc(&dev_A, n * n * sizeof(double)); + cudaMalloc(&dev_x1, n * sizeof(double)); + cudaMalloc(&dev_x2, n * sizeof(double)); + cudaMalloc(&dev_y_1, n * sizeof(double)); + cudaMalloc(&dev_y_2, n * sizeof(double)); + + cudaMemcpy(dev_A, A, n * n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_x1, x1, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_x2, x2, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_y_1, y_1, n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_y_2, y_2, n * sizeof(double), cudaMemcpyHostToDevice); + + /* Run kernel. */ + kernel(n, dev_x1, dev_x2, dev_y_1, dev_y_2, dev_A); + + cudaMemcpy(x1, dev_x1, n * sizeof(double), cudaMemcpyDeviceToHost); + cudaMemcpy(x2, dev_x2, n * sizeof(double), cudaMemcpyDeviceToHost); + + /* Prevent dead-code elimination. All live-out data must be printed + by the function call in argument. */ + if (dump_code == 1) + print_array(n, x1, x2); + + /* Be clean. */ + free((void *)A); + free((void *)x1); + free((void *)x2); + free((void *)y_1); + free((void *)y_2); + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/nussinov/nussinov.cu b/tools/cgeist/Test/CUDA/polybench-cuda/nussinov/nussinov.cu new file mode 100644 index 000000000000..53828dde253b --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/nussinov/nussinov.cu @@ -0,0 +1,111 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 +// clang-format on +#include +#include +#include +#include +#include + +#define match(b1, b2) (((b1) + (b2)) == 3 ? 
1 : 0) + +static void init_array(int n, double *table, double *oldtable, double *seq) { + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) { + table[i * n + j] = ((double)i * j) / n; + oldtable[i * n + j] = ((double)i * j) / n; + } + for (i = 0; i < n; i++) + seq[i] = ((double)i) / n; +} + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +// Dynamic programming wavefront +__global__ void kernel_max_score(int n, double *seq, double *table, + double *oldtable, int w) { + int j = blockDim.x * blockIdx.x + threadIdx.x; + int i = ((int)n - 1) + j - w; + + if (0 <= i && i < n && i + 1 <= j && j < n) { + double maximum = table[i * n + j]; + + if (j - 1 >= 0) + maximum = max(maximum, table[i * n + (j - 1)]); + if (i + 1 < n) + maximum = max(maximum, table[(i + 1) * n + j]); + + if (j - 1 >= 0 && i + 1 < n) { + auto upd = table[(i + 1) * n + (j - 1)]; + + /* don't allow adjacent elements to bond */ + if (i < j - 1) + upd += (seq[i] + seq[j] == 3) ? (double)1 : (double)0; + + maximum = max(maximum, upd); + } + + for (int k = i + 1; k < j; k++) + maximum = max(maximum, table[i * n + k] + table[(k + 1) * n + j]); + + // AtomicMax::set_if_larger(table[i * n + j], maximum); + table[i * n + j] = maximum; + } +} + +static void kernel(int n, double *seq, double *table, double *oldtable) { + const unsigned threadsPerBlock = 32; + + for (int w = n; w < 2 * n - 1; ++w) { // wavefronting + kernel_max_score<<>>( + n, seq, table, oldtable, w); + } +} + +static void print_array(int n, double *table) { + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) { + fprintf(stderr, "%0.2lf ", table[i * n + j]); + if ((i * n + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + + int dump_code = atoi(argv[1]); + int n = atoi(argv[2]); + double *table = (double *)malloc(n * n * sizeof(double)); + double *oldtable = (double *)malloc(n * n * sizeof(double)); + double *seq = (double *)malloc(n * sizeof(double)); + + init_array(n, table, oldtable, seq); + + double *dev_table; + double *dev_oldtable; + double *dev_seq; + cudaMalloc(&dev_table, n * n * sizeof(double)); + cudaMalloc(&dev_oldtable, n * n * sizeof(double)); + cudaMalloc(&dev_seq, n * sizeof(double)); + cudaMemcpy(dev_table, table, n * n * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_oldtable, oldtable, n * n * sizeof(double), + cudaMemcpyHostToDevice); + cudaMemcpy(dev_seq, seq, n * sizeof(double), cudaMemcpyHostToDevice); + + kernel(n, dev_seq, dev_table, dev_oldtable); + cudaMemcpy(table, dev_table, n * n * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(n, table); + + free((void *)table); + free((void *)oldtable); + free((void *)seq); +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/reduction/reduction.cu b/tools/cgeist/Test/CUDA/polybench-cuda/reduction/reduction.cu new file mode 100644 index 000000000000..01d6e40e28bd --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/reduction/reduction.cu @@ -0,0 +1,89 @@ +// clang-format off +// COM: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 10 +// RUN: true +// clang-format on +// This program performs sum reduction with an optimization +// removing warp divergence +// By: Nick from CoffeeBeforeArch + +#include +#include +#include +#include +#include + +#define SHMEM_SIZE 256 + +__global__ void sumReduction(int *v, int *v_r) { + // Allocate shared memory + __shared__ int partial_sum[SHMEM_SIZE]; + + // Calculate thread ID 
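+ // (e.g. with blockDim.x = 256, blockIdx.x = 2 and threadIdx.x = 3, the thread below loads element 2 * 256 + 3 = 515 of v)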
+ int tid = blockIdx.x * blockDim.x + threadIdx.x; + + // Load elements into shared memory + partial_sum[threadIdx.x] = v[tid]; + __syncthreads(); + + // Increase the stride of the access until we exceed the CTA dimensions + for (int s = 1; s < blockDim.x; s *= 2) { + // Change the indexing to be sequential threads + int index = 2 * s * threadIdx.x; + + // Each thread does work unless the index goes off the block + if (index < blockDim.x) { + partial_sum[index] += partial_sum[index + s]; + } + __syncthreads(); + } + + // Let thread 0 for this block write its result to main memory + // Result is indexed by this block + if (threadIdx.x == 0) { + v_r[blockIdx.x] = partial_sum[0]; + } +} + +void initialize_vector(int *v, int n) { + for (int i = 0; i < n; i++) { + v[i] = 1; // rand() % 10; + } +} + +int main() { + // Vector size + int N = 1 << 16; + int bytes = N * sizeof(int); + + // Original vector and result vector + int *h_v, *h_v_r; + int *d_v, *d_v_r; + + // Allocate host memory + h_v = (int *)malloc(bytes); + h_v_r = (int *)malloc(bytes); + + // Initialize vector + initialize_vector(h_v, N); + + // Allocate device memory + cudaMalloc(&d_v, bytes); + cudaMalloc(&d_v_r, bytes); + + // Copy to device + cudaMemcpy(d_v, h_v, bytes, cudaMemcpyHostToDevice); + + // TB Size + const int TB_SIZE = 256; + + // Grid Size (No padding) + int GRID_SIZE = N / TB_SIZE; + + // Call kernels + sumReduction<<<GRID_SIZE, TB_SIZE>>>(d_v, d_v_r); + + sumReduction<<<1, TB_SIZE>>>(d_v_r, d_v_r); + + // Copy to host + cudaMemcpy(h_v_r, d_v_r, bytes, cudaMemcpyDeviceToHost); + + printf("COMPLETED SUCCESSFULLY %d\n", h_v_r[0]); + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/seidel-2d/seidel-2d.cu b/tools/cgeist/Test/CUDA/polybench-cuda/seidel-2d/seidel-2d.cu new file mode 100644 index 000000000000..a691858ee476 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/seidel-2d/seidel-2d.cu @@ -0,0 +1,91 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 2 +// clang-format on +/** + * seidel-2d.c: This file is part of the PolyBench/C 3.2 test suite.
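+ * (The kernel below performs an in-place 9-point neighborhood average over the interior points; the host loop launches it once per time step.)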
+ * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +static void init_array(int n, double *A) { + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) + A[i * n + j] = ((double)i * (j + 2) + 2) / n; +} + +static void print_array(int n, double *A) + +{ + int i, j; + + for (i = 0; i < n; i++) + for (j = 0; j < n; j++) { + fprintf(stderr, "%0.2lf ", A[i * n + j]); + if ((i * n + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_stencil(int tsteps, int n, double *A) { + int i = blockDim.x * blockIdx.x + threadIdx.x + 1; + int j = blockDim.y * blockIdx.y + threadIdx.y + 1; + + if (i < n - 1 && j < n - 1) + A[i * n + j] = + (A[(i - 1) * n + j - 1] + A[(i - 1) * n + j] + A[(i - 1) * n + j + 1] + + A[i * n + j - 1] + A[i * n + j] + A[i * n + j + 1] + + A[(i + 1) * n + j - 1] + A[(i + 1) * n + j] + A[(i + 1) * n + j + 1]) / + 9; +} + +static void kernel(int tsteps, int n, double *A) { + // FIXME: Parallelizing this should give different results + const unsigned int threadsPerBlock = 256; + + for (int t = 1; t <= tsteps; t++) { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(n - 2, block.x), num_blocks(n - 2, block.y), 1); + kernel_stencil<<>>(tsteps, n, A); + } +} + +int main(int argc, char **argv) { + + int dump_code = atoi(argv[1]); + int n = atoi(argv[2]); + int tsteps = atoi(argv[3]); + + double *A = (double *)malloc(n * n * sizeof(double)); + + init_array(n, A); + + double *dev_A; + cudaMalloc(&dev_A, n * n * sizeof(double)); + cudaMemcpy(dev_A, A, n * n * sizeof(double), cudaMemcpyHostToDevice); + + kernel(tsteps, n, dev_A); + + cudaMemcpy(A, dev_A, n * n * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(n, A); + + free((void *)A); + ; + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/symm/symm.cu b/tools/cgeist/Test/CUDA/polybench-cuda/symm/symm.cu new file mode 100644 index 000000000000..d8c462e808ee --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/symm/symm.cu @@ -0,0 +1,147 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 +// clang-format on +/** + * symm.c: This file is part of the PolyBench/C 3.2 test suite. 
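+ * (SYMM-style update: C = alpha*A*B + beta*C with A treated as symmetric; only A's lower triangle, including the diagonal, is referenced by the three kernels below.)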
+ * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +static void init_array(int ni, int nj, double *C, double *A, double *B, + double *tmp) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) { + C[i * nj + j] = ((double)i * j) / ni; + B[i * nj + j] = ((double)i * j) / ni; + tmp[i * nj + j] = 0; + } + for (i = 0; i < nj; i++) + for (j = 0; j < nj; j++) + A[i * nj + j] = ((double)i * j) / ni; +} + +static void print_array(int ni, int nj, double *C) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) { + fprintf(stderr, "%0.2lf ", C[i * nj + j]); + if ((i * ni + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_tmp(int m, int n, double alpha, double beta, double *C, + double *A, double *B, double *tmp) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < m && j < n) { + tmp[i * n + j] = 0; + for (int k = 0; k < i; k++) + tmp[i * n + j] += B[k * n + j] * A[i * n + k]; + } +} + +__global__ void kernel_C(int m, int n, double alpha, double beta, double *C, + double *A, double *B, double *tmp) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < m && j < n) + C[i * n + j] = beta * C[i * n + j] + alpha * B[i * n + j] * A[i * n + i] + + alpha * tmp[i * n + j]; +} + +__global__ void kernel_sum(int m, int n, double alpha, double beta, double *C, + double *A, double *B, double *tmp) { + int k = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (k < m - 1 && j < n) { + for (int i = k + 1; i < m; i++) + C[k * n + j] += alpha * B[i * n + j] * A[i * n + k]; + } +} + +static void kernel(int m, int n, double alpha, double beta, double *C, + double *A, double *B, double *tmp) { + const unsigned int threadsPerBlock = 256; + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(m, block.x), num_blocks(n, block.y), 1); + kernel_tmp<<>>(m, n, alpha, beta, C, A, B, tmp); + } + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(m, block.x), num_blocks(n, block.y), 1); + kernel_C<<>>(m, n, alpha, beta, C, A, B, tmp); + } + + // TODO: Combine both kernels? 
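+ // (kernel_tmp and kernel_C cover the terms from A[i][k] with k <= i via tmp and the diagonal; kernel_sum adds the remaining symmetric terms A[i][k] with i > k to row k of C.)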
+ { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(m - 1, block.x), num_blocks(n, block.y), 1); + kernel_sum<<>>(m, n, alpha, beta, C, A, B, tmp); + } +} + +int main(int argc, char **argv) { + int dump_code = atoi(argv[1]); + int ni = atoi(argv[2]); + int nj = atoi(argv[3]); + + double *A = (double *)malloc(nj * nj * sizeof(double)); + double *B = (double *)malloc(ni * nj * sizeof(double)); + double *C = (double *)malloc(ni * nj * sizeof(double)); + double *tmp = (double *)malloc(ni * nj * sizeof(double)); + + double alpha = 32412; + double beta = 2123; + + init_array(ni, nj, C, A, B, tmp); + + double *dev_A; + double *dev_B; + double *dev_C; + double *dev_tmp; + cudaMalloc(&dev_A, nj * nj * sizeof(double)); + cudaMalloc(&dev_B, ni * nj * sizeof(double)); + cudaMalloc(&dev_C, ni * nj * sizeof(double)); + cudaMalloc(&dev_tmp, ni * nj * sizeof(double)); + cudaMemcpy(dev_A, A, nj * nj * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_B, B, ni * nj * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_C, C, ni * nj * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_tmp, tmp, ni * nj * sizeof(double), cudaMemcpyHostToDevice); + + kernel(ni, nj, alpha, beta, dev_C, dev_A, dev_B, dev_tmp); + cudaMemcpy(C, dev_C, ni * nj * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(ni, nj, C); + + free((void *)C); + ; + free((void *)A); + ; + free((void *)B); + ; + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/syr2k/syr2k.cu b/tools/cgeist/Test/CUDA/polybench-cuda/syr2k/syr2k.cu new file mode 100644 index 000000000000..459997bb7fc5 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/syr2k/syr2k.cu @@ -0,0 +1,122 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 +// clang-format on +/** + * syr2k.c: This file is part of the PolyBench/C 3.2 test suite. 
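+ * (SYR2K update restricted to the lower triangle (j <= i): the first kernel scales C by beta, the second accumulates alpha*(A*B^T + B*A^T).)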
+ * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_beta(int n, int m, double alpha, double beta, double *C, + double *A, double *B) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < n && j <= i) + C[i * n + j] *= beta; +} + +__global__ void kernel_product(int n, int m, double alpha, double beta, + double *C, double *A, double *B) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < n && j <= i) { + for (int k = 0; k < m; k++) + C[i * n + j] += A[j * m + k] * alpha * B[i * m + k] + + B[j * m + k] * alpha * A[i * m + k]; + } +} + +static void kernel(int n, int m, double alpha, double beta, double *C, + double *A, double *B) { + const unsigned int threadsPerBlock = 256; + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(n, block.x), num_blocks(n, block.y), 1); + kernel_beta<<>>(n, m, alpha, beta, C, A, B); + } + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(n, block.x), num_blocks(n, block.y), 1); + kernel_product<<>>(n, m, alpha, beta, C, A, B); + } +} + +static void init_array(int ni, int nj, double *C, double *A, double *B) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) { + A[i * nj + j] = ((double)i * j) / ni; + B[i * nj + j] = ((double)i * j) / ni; + } + for (i = 0; i < ni; i++) + for (j = 0; j < ni; j++) + C[i * ni + j] = ((double)i * j) / ni; +} + +static void print_array(int ni, double *C) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < ni; j++) { + fprintf(stderr, "%0.2lf ", C[i * ni + j]); + if ((i * ni + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + int dump_code = atoi(argv[1]); + int m = atoi(argv[2]); + int n = atoi(argv[3]); + + double *A = (double *)malloc(m * n * sizeof(double)); + double *B = (double *)malloc(m * n * sizeof(double)); + double *C = (double *)malloc(m * m * sizeof(double)); + + double alpha = 32412; + double beta = 2123; + + init_array(m, n, C, A, B); + + double *dev_A; + double *dev_B; + double *dev_C; + cudaMalloc(&dev_A, n * m * sizeof(double)); + cudaMalloc(&dev_B, n * m * sizeof(double)); + cudaMalloc(&dev_C, n * m * sizeof(double)); + cudaMemcpy(dev_A, A, n * m * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_B, B, n * m * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_C, C, n * m * sizeof(double), cudaMemcpyHostToDevice); + + kernel(m, n, alpha, beta, dev_C, dev_A, dev_B); + cudaMemcpy(C, dev_C, n * m * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(m, C); + + free((void *)C); + ; + free((void *)A); + ; + free((void *)B); + ; + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/syrk/syrk.cu b/tools/cgeist/Test/CUDA/polybench-cuda/syrk/syrk.cu new file mode 100644 index 000000000000..4035bee9b1f8 --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/syrk/syrk.cu @@ -0,0 +1,115 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 +// clang-format on +/** + * syrk.c: This file is part of the PolyBench/C 3.2 test suite. 
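+ * (SYRK update restricted to the lower triangle (j <= i): the first kernel scales C by beta, the second accumulates alpha*A*A^T.)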
+ * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_beta(int n, int m, double alpha, double beta, double *C, + double *A) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < n && j <= i) + C[i * n + j] *= beta; +} + +__global__ void kernel_product(int n, int m, double alpha, double beta, + double *C, double *A) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < n && j <= i) { + for (int k = 0; k < m; k++) + C[i * n + j] += alpha * A[i * m + k] * A[j * m + k]; + } +} + +static void kernel(int n, int m, double alpha, double beta, double *C, + double *A) { + const unsigned int threadsPerBlock = 256; + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(n, block.x), num_blocks(n, block.y), 1); + kernel_beta<<>>(m, n, alpha, beta, C, A); + } + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(n, block.x), num_blocks(n, block.y), 1); + kernel_product<<>>(m, n, alpha, beta, C, A); + } +} + +static void init_array(int ni, int nj, double *C, double *A) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < nj; j++) + A[i * nj + j] = ((double)i * j) / ni; + for (i = 0; i < ni; i++) + for (j = 0; j < ni; j++) + C[i * ni + j] = ((double)i * j) / ni; +} + +static void print_array(int ni, double *C) { + int i, j; + + for (i = 0; i < ni; i++) + for (j = 0; j < ni; j++) { + fprintf(stderr, "%0.2lf ", C[i * ni + j]); + if ((i * ni + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + int dump_code = atoi(argv[1]); + int m = atoi(argv[2]); + int n = atoi(argv[3]); + //__builtin_assume(nj>0); + //__builtin_assume(ni>0); + //__builtin_assume(ni<2147483646); + //__builtin_assume(nj<2147483646); + + double *A = (double *)malloc(m * n * sizeof(double)); + double *C = (double *)malloc(m * m * sizeof(double)); + double alpha = 32412; + double beta = 2123; + + init_array(m, n, C, A); + + double *dev_C; + double *dev_A; + cudaMalloc(&dev_A, n * m * sizeof(double)); + cudaMalloc(&dev_C, n * n * sizeof(double)); + cudaMemcpy(dev_A, A, n * m * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_C, C, n * n * sizeof(double), cudaMemcpyHostToDevice); + + kernel(m, n, alpha, beta, dev_C, dev_A); + + cudaMemcpy(C, dev_C, n * n * sizeof(double), cudaMemcpyDeviceToHost); + + if (dump_code == 1) + print_array(m, C); + + free((void *)C); + free((void *)A); + + return 0; +} diff --git a/tools/cgeist/Test/CUDA/polybench-cuda/trmm/trmm.cu b/tools/cgeist/Test/CUDA/polybench-cuda/trmm/trmm.cu new file mode 100644 index 000000000000..21263ca8f51e --- /dev/null +++ b/tools/cgeist/Test/CUDA/polybench-cuda/trmm/trmm.cu @@ -0,0 +1,113 @@ +// clang-format off +// RUN: cgeist %s %stdinclude %cudaopts -O3 -o %s.execm && %s.execm 1 10 10 +// clang-format on +/** + * trmm.c: This file is part of the PolyBench 3.0 test suite. + * + * + * Contact: Louis-Noel Pouchet + * Web address: http://polybench.sourceforge.net + */ +#include +#include +#include +#include +#include + +/* Array initialization. 
*/ +static void init_array(int n, int m, double *alpha, double *A, double *B) { + int i, j; + + *alpha = 32412; + for (i = 0; i < n; i++) + for (j = 0; j < m; j++) { + A[i * m + j] = ((double)i * j) / m; + B[j * n + j] = ((double)i * j) / n; + } +} + +static unsigned num_blocks(int num, int factor) { + return (num + factor - 1) / factor; +} + +__global__ void kernel_contract(int n, int m, double alpha, double *B, + double *A) { + int j = blockDim.x * blockIdx.x + threadIdx.x; + + if (j < n) { + for (int i = 0; i < m; i++) + for (int k = i + 1; k < m; k++) + B[i * n + j] += A[k * m + i] * B[k * n + j]; + } +} + +__global__ void kernel_alpha(int n, int m, double alpha, double *B, double *A) { + int i = blockDim.x * blockIdx.x + threadIdx.x; + int j = blockDim.y * blockIdx.y + threadIdx.y; + + if (i < m && j < n) + B[i * n + j] *= alpha; +} + +static void kernel(int n, int m, double alpha, double *B, double *A) { + const unsigned int threadsPerBlock = 256; + + kernel_contract<<>>( + n, m, alpha, B, A); + + { + dim3 block(threadsPerBlock / 32, 32, 1); + dim3 grid(num_blocks(m, block.x), num_blocks(n, block.y), 1); + kernel_alpha<<>>(n, m, alpha, B, A); + } +} + +/* DCE code. Must scan the entire live-out data. + Can be used also to check the correctness of the output. */ +static void print_array(int m, int n, double *B) { + int i, j; + + for (i = 0; i < m; i++) + for (j = 0; j < n; j++) { + fprintf(stderr, "%0.2lf ", B[i * n + j]); + if ((i * n + j) % 20 == 0) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); +} + +int main(int argc, char **argv) { + /* Retrieve problem size. */ + int n = atoi(argv[2]); + int m = atoi(argv[3]); + int dump_code = atoi(argv[1]); + + /* Variable declaration/allocation. */ + double alpha; + double *A = (double *)malloc(n * m * sizeof(double)); + double *B = (double *)malloc(m * n * sizeof(double)); + + /* Initialize array(s). */ + init_array(n, m, &alpha, A, B); + + double *dev_A; + double *dev_B; + cudaMalloc(&dev_A, n * m * sizeof(double)); + cudaMalloc(&dev_B, m * n * sizeof(double)); + cudaMemcpy(dev_A, A, n * m * sizeof(double), cudaMemcpyHostToDevice); + cudaMemcpy(dev_B, B, m * n * sizeof(double), cudaMemcpyHostToDevice); + /* Run kernel. */ + kernel(n, m, alpha, dev_A, dev_B); + cudaMemcpy(B, dev_B, m * n * sizeof(double), cudaMemcpyDeviceToHost); + + /* Prevent dead-code elimination. All live-out data must be printed + by the function call in argument. */ + if (dump_code == 1) + print_array(m, n, B); + + /* Be clean. 
*/ + free((void *)A); + free((void *)B); + + return 0; +} diff --git a/tools/cgeist/Test/lit.cfg b/tools/cgeist/Test/lit.cfg index e32339fce937..a7b7f3685d8e 100644 --- a/tools/cgeist/Test/lit.cfg +++ b/tools/cgeist/Test/lit.cfg @@ -73,3 +73,4 @@ config.substitutions.append(('%stdinclude', '-resource-dir=' + resource_dir + " config.substitutions.append(('%resourcedir', '-resource-dir=' + resource_dir)) config.substitutions.append(('%polyexec', config.test_source_root + '/polybench/utilities/polybench.c -D POLYBENCH_TIME -D POLYBENCH_NO_FLUSH_CACHE -D MINI_DATASET')) config.substitutions.append(('%polyverify', config.test_source_root + '/polybench/utilities/polybench.c -D POLYBENCH_DUMP_ARRAYS -D POLYBENCH_NO_FLUSH_CACHE -D MINI_DATASET')) +config.substitutions.append(('%cudaopts', '-L' + os.path.dirname(config.cudart_static_path) + ' -lstdc++ -ldl -lpthread -lrt -lcudart_static -lcuda --cuda-lower --emit-cuda --std=c++17 --cuda-gpu-arch=sm_80')) diff --git a/tools/cgeist/Test/lit.site.cfg.in b/tools/cgeist/Test/lit.site.cfg.in index d252d7977f74..f3f734d12079 100644 --- a/tools/cgeist/Test/lit.site.cfg.in +++ b/tools/cgeist/Test/lit.site.cfg.in @@ -9,6 +9,7 @@ config.mlir_clang_obj_root = "@MLIR_CLANG_BINARY_DIR@" config.target_triple = "@TARGET_TRIPLE@" config.llvm_obj_root = path(r"@LLVM_BINARY_DIR@") config.polygeist_enable_cuda = "@POLYGEIST_ENABLE_CUDA@" +config.cudart_static_path = "@CUDA_cudart_static_LIBRARY@" config.polygeist_enable_rocm = "@POLYGEIST_ENABLE_ROCM@" # Support substitution of the tools and build_mode with user parameters. diff --git a/tools/cgeist/driver.cc b/tools/cgeist/driver.cc index 45c92f80bff5..2eee90d1a54c 100644 --- a/tools/cgeist/driver.cc +++ b/tools/cgeist/driver.cc @@ -685,20 +685,8 @@ int main(int argc, char **argv) { if (ScalarReplacement) optPM.addPass(mlir::affine::createAffineScalarReplacementPass()); } - if (mlir::failed(pm.run(module.get()))) { - module->dump(); - return 4; - } - if (mlir::failed(mlir::verify(module.get()))) { - module->dump(); - return 5; - } -#define optPM optPM2 -#define pm pm2 { - mlir::PassManager pm(&context); - enablePrinting(pm); mlir::OpPassManager &optPM = pm.nest(); if (DetectReduction) @@ -735,15 +723,9 @@ int main(int argc, char **argv) { optPM2.addPass(mlir::polygeist::createPolygeistCanonicalizePass( canonicalizerConfig, {}, {})); } - if (mlir::failed(pm.run(module.get()))) { - module->dump(); - return 6; - } } if (CudaLower || EmitROCM) { - mlir::PassManager pm(&context); - enablePrinting(pm); mlir::OpPassManager &optPM = pm.nest(); optPM.addPass(mlir::createLowerAffinePass()); optPM.addPass(mlir::polygeist::createPolygeistCanonicalizePass( @@ -807,16 +789,10 @@ int main(int argc, char **argv) { if (ScalarReplacement) noptPM2.addPass(mlir::affine::createAffineScalarReplacementPass()); } - if (mlir::failed(pm.run(module.get()))) { - module->dump(); - return 7; - } } - mlir::PassManager pm(&context); - enablePrinting(pm); - mlir::OpPassManager &optPM = pm.nest(); if (CudaLower) { + mlir::OpPassManager &optPM = pm.nest(); optPM.addPass(mlir::polygeist::createPolygeistCanonicalizePass( canonicalizerConfig, {}, {})); optPM.addPass(mlir::createCSEPass()); @@ -901,17 +877,10 @@ int main(int argc, char **argv) { if (InnerSerialize) pm.addPass(polygeist::createInnerSerializationPass()); addLICM(pm); - - if (mlir::failed(pm.run(module.get()))) { - module->dump(); - return 8; - } } #if POLYGEIST_ENABLE_GPU if (EmitGPU) { - mlir::PassManager pm(&context); - enablePrinting(pm); pm.addPass(mlir::createCSEPass()); if (CudaLower) 
pm.addPass(polygeist::createConvertParallelToGPUPass1( @@ -937,27 +906,21 @@ int main(int argc, char **argv) { pm.addPass(mlir::createCSEPass()); pm.addPass(mlir::polygeist::createPolygeistCanonicalizePass( canonicalizerConfig, {}, {})); - - if (mlir::failed(pm.run(module.get()))) { - module->dump(); - return 12; - } } #endif { - mlir::PassManager pm(&context); - enablePrinting(pm); mlir::OpPassManager &gpuPM = pm.nest(); gpuPM.addPass(polygeist::createFixGPUFuncPass()); pm.addPass(mlir::polygeist::createPolygeistCanonicalizePass( canonicalizerConfig, {}, {})); pm.addPass(polygeist::createLowerAlternativesPass()); pm.addPass(polygeist::createCollectKernelStatisticsPass()); - if (mlir::failed(pm.run(module.get()))) { - module->dump(); - return 12; - } + } + + if (mlir::failed(pm.run(module.get()))) { + module->dump(); + return 12; } // Prune unused gpu module funcs @@ -996,7 +959,7 @@ int main(int argc, char **argv) { pm2.addPass(mlir::polygeist::createPolygeistCanonicalizePass( canonicalizerConfig, {}, {})); } - pm.nest().addPass( + pm2.nest().addPass( polygeist::createPolygeistMem2RegPass()); pm2.addPass(mlir::createCSEPass()); pm2.addPass(mlir::polygeist::createPolygeistCanonicalizePass( @@ -1110,13 +1073,6 @@ int main(int argc, char **argv) { return 10; } } - - } else { - - if (mlir::failed(pm.run(module.get()))) { - module->dump(); - return 11; - } } if (mlir::failed(mlir::verify(module.get()))) { module->dump();