From 5d389a2cc2ccc36d9c833237ffa7558d9294c0a9 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 28 Feb 2020 11:11:18 -0500 Subject: [PATCH 001/118] Add (incomplete) Multiple GPU Support --- src/GPUIterator.chpl | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/src/GPUIterator.chpl b/src/GPUIterator.chpl index 954f9a0..b6ff44b 100644 --- a/src/GPUIterator.chpl +++ b/src/GPUIterator.chpl @@ -20,6 +20,7 @@ module GPUIterator { use BlockDist; config param debugGPUIterator = false; + config const nGPUs = 1; // Utility functions inline proc computeSubranges(whole: range(?), @@ -51,7 +52,7 @@ module GPUIterator { r: range(?), CPUrange: range(?), GPUrange: range(?), - GPUWrapper: func(int, int, int, void)) + GPUWrapper: func(int, int, int, int, void)) where tag == iterKind.leader { if (CPUrange.size == 0) { @@ -84,7 +85,7 @@ module GPUIterator { const myIters = GPUrange; if (debugGPUIterator) then writeln("GPU portion: ", myIters); - GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.length); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.length, 0); } } } @@ -94,14 +95,18 @@ module GPUIterator { r: range(?), CPUrange: range(?), GPUrange: range(?), - GPUWrapper: func(int, int, int, void)) + GPUWrapper: func(int, int, int, int, void)) where tag == iterKind.standalone { if (CPUrange.size == 0) { - const myIters = GPUrange; + const numGPUs = nGPUs; if (debugGPUIterator) then - writeln("GPU portion: ", myIters); - GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.length); + writeln("GPU portion: ", GPUrange, " by ", numGPUs, " GPUs"); + coforall tid in 0..#numGPUs { + const myIters = computeChunk(GPUrange, tid, numGPUs); + writeln("GPU", tid, ":", myIters); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length, tid); + } } else if (GPUrange.size == 0) { const numTasks 
= here.maxTaskPar; if (debugGPUIterator) then @@ -126,10 +131,13 @@ module GPUIterator { } // GPU portion { - const myIters = GPUrange; + const numGPUs = nGPUs; if (debugGPUIterator) then - writeln("GPU portion: ", myIters); - GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.length); + writeln("GPU portion: ", GPUrange, " by ", numGPUs, " GPUs"); + coforall tid in 0..#numGPUs { + const myIters = computeChunk(GPUrange, tid, numGPUs); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length, tid); + } } } } @@ -138,7 +146,7 @@ module GPUIterator { iter createTaskAndYield(r: range(?), CPUrange: range(?), GPUrange: range(?), - GPUWrapper: func(int, int, int, void)) { + GPUWrapper: func(int, int, int, int, void)) { halt("This is dummy"); } @@ -170,7 +178,7 @@ module GPUIterator { // follower (block distributed domains) iter GPU(param tag: iterKind, D: domain, - GPUWrapper: func(int, int, int, void), + GPUWrapper: func(int, int, int, int, void), CPUPercent: int = 0, followThis ) @@ -194,7 +202,7 @@ module GPUIterator { // standalone (block distributed domains) iter GPU(param tag: iterKind, D: domain, - GPUWrapper: func(int, int, int, void), + GPUWrapper: func(int, int, int, int, void), CPUPercent: int = 0 ) where tag == iterKind.standalone @@ -221,7 +229,7 @@ module GPUIterator { // serial iterator (block distributed domains) iter GPU(D: domain, - GPUWrapper: func(int, int, int, void), + GPUWrapper: func(int, int, int, int, void), CPUPercent: int = 0 ) where isRectangularDom(D) @@ -238,7 +246,7 @@ module GPUIterator { // leader (range) iter GPU(param tag: iterKind, r: range(?), - GPUWrapper: func(int, int, int, void), + GPUWrapper: func(int, int, int, int, void), CPUPercent: int = 0 ) where tag == iterKind.leader { @@ -255,7 +263,7 @@ module GPUIterator { // follower iter GPU(param tag: iterKind, r:range(?), - GPUWrapper: func(int, int, int, void), + GPUWrapper: func(int, int, int, int, void), 
CPUPercent: int = 0, followThis ) From cc08faddb57b6953e81320cdd9c4bbf2e05c0084 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 28 Feb 2020 12:49:43 -0500 Subject: [PATCH 002/118] Update Multi GPU support --- src/GPUAPI.chpl | 21 +++++++++ src/GPUIterator.chpl | 106 +++++++++++++++++++++++++++++++++---------- 2 files changed, 104 insertions(+), 23 deletions(-) create mode 100644 src/GPUAPI.chpl diff --git a/src/GPUAPI.chpl b/src/GPUAPI.chpl new file mode 100644 index 0000000..750051a --- /dev/null +++ b/src/GPUAPI.chpl @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2019, Rice University + * Copyright (c) 2019, Georgia Institute of Technology + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +module GPUAPI { + extern proc GetDeviceCount(ref count: int); + extern proc SetDevice(device: int); +} \ No newline at end of file diff --git a/src/GPUIterator.chpl b/src/GPUIterator.chpl index b6ff44b..371cb2a 100644 --- a/src/GPUIterator.chpl +++ b/src/GPUIterator.chpl @@ -18,9 +18,16 @@ module GPUIterator { use Time; use BlockDist; + use GPUAPI; config param debugGPUIterator = false; - config const nGPUs = 1; + config const nGPUs = getNumDevices(); + + proc getNumDevices() { + var count: int; + GetDeviceCount(count); + return count; + } // Utility functions inline proc computeSubranges(whole: range(?), @@ -56,10 +63,25 @@ module GPUIterator { where tag == iterKind.leader { if (CPUrange.size == 0) { - const myIters = GPUrange; - if (debugGPUIterator) then - writeln("GPU portion: ", myIters); - GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.length); + select nGPUs { + when 0 { + writeln("Warning: No GPUs found"); + } + when 1 { + const myIters = GPUrange; + if (debugGPUIterator) then + writeln("GPU portion: ", myIters); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.length); + } + otherwise { + coforall tid in 0..#nGPUs { + const myIters = computeChunk(GPUrange, tid, nGPUs); + writeln("GPU", tid, ":", myIters); + SetDevice(tid); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); + } + } + } } else if (GPUrange.size == 0) { const numTasks = here.maxTaskPar; if (debugGPUIterator) then @@ -82,10 +104,25 @@ module GPUIterator { } // GPU portion { - const myIters = GPUrange; - if (debugGPUIterator) then - writeln("GPU portion: ", myIters); - GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.length, 0); + select nGPUs { + when 0 { + writeln("Warning: No GPUs found"); + } + when 1 { + const myIters = GPUrange; + if (debugGPUIterator) then + writeln("GPU portion: ", myIters); + 
GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.length); + } + otherwise { + coforall tid in 0..#nGPUs { + const myIters = computeChunk(GPUrange, tid, nGPUs); + writeln("GPU", tid, ":", myIters); + SetDevice(tid); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); + } + } + } } } } @@ -99,13 +136,24 @@ module GPUIterator { where tag == iterKind.standalone { if (CPUrange.size == 0) { - const numGPUs = nGPUs; - if (debugGPUIterator) then - writeln("GPU portion: ", GPUrange, " by ", numGPUs, " GPUs"); - coforall tid in 0..#numGPUs { - const myIters = computeChunk(GPUrange, tid, numGPUs); - writeln("GPU", tid, ":", myIters); - GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length, tid); + select nGPUs { + when 0 { + writeln("Warning: No GPUs found"); + } + when 1 { + const myIters = GPUrange; + if (debugGPUIterator) then + writeln("GPU portion: ", myIters); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.length); + } + otherwise { + coforall tid in 0..#nGPUs { + const myIters = computeChunk(GPUrange, tid, nGPUs); + writeln("GPU", tid, ":", myIters); + SetDevice(tid); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); + } + } } } else if (GPUrange.size == 0) { const numTasks = here.maxTaskPar; @@ -131,12 +179,24 @@ module GPUIterator { } // GPU portion { - const numGPUs = nGPUs; - if (debugGPUIterator) then - writeln("GPU portion: ", GPUrange, " by ", numGPUs, " GPUs"); - coforall tid in 0..#numGPUs { - const myIters = computeChunk(GPUrange, tid, numGPUs); - GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length, tid); + select nGPUs { + when 0 { + writeln("Warning: No GPUs found"); + } + when 1 { + const myIters = GPUrange; + if (debugGPUIterator) then + writeln("GPU portion: ", myIters); + GPUWrapper(myIters.translate(-r.low).first, 
myIters.translate(-r.low).last, GPUrange.length); + } + otherwise { + coforall tid in 0..#nGPUs { + const myIters = computeChunk(GPUrange, tid, nGPUs); + writeln("GPU", tid, ":", myIters); + SetDevice(tid); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); + } + } } } } @@ -310,4 +370,4 @@ module GPUIterator { for i in r do yield i; } -} \ No newline at end of file +} From 3f361398c212b7899004ea5bfa1e56fb9d899d69 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 28 Feb 2020 12:56:39 -0500 Subject: [PATCH 003/118] Update Multi GPU support --- src/GPUAPI.cu | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 src/GPUAPI.cu diff --git a/src/GPUAPI.cu b/src/GPUAPI.cu new file mode 100644 index 0000000..0ddbf8b --- /dev/null +++ b/src/GPUAPI.cu @@ -0,0 +1,55 @@ +#include +#include +#include +#include + +#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ ) +#define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ ) + +inline void __cudaSafeCall( cudaError err, const char *file, const int line ) +{ +#ifdef CUDA_ERROR_CHECK + if ( cudaSuccess != err ) + { + fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s\n", + file, line, cudaGetErrorString( err ) ); + exit( -1 ); + } +#endif + + return; +} + +inline void __cudaCheckError( const char *file, const int line ) +{ +#ifdef CUDA_ERROR_CHECK + cudaError err = cudaGetLastError(); + if ( cudaSuccess != err ) + { + fprintf( stderr, "cudaCheckError() failed at %s:%i : %s\n", + file, line, cudaGetErrorString( err ) ); + exit( -1 ); + } + + // More careful checking. However, this will affect performance. + // Comment away if needed. 
+ err = cudaDeviceSynchronize(); + if( cudaSuccess != err ) + { + fprintf( stderr, "cudaCheckError() with sync failed at %s:%i : %s\n", + file, line, cudaGetErrorString( err ) ); + exit( -1 ); + } +#endif +} + +extern "C" { + + void GetDeviceCount(int *count) { + CudaSafeCall(cudaGetDeviceCount(count)); + } + + void SetDevice(int device) { + CudaSafeCall(cudaSetDevice(device)); + } +} From 6daa1dc8f5fc7ecaaae3a62e6b51a48e947a2fc2 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 28 Feb 2020 13:01:59 -0500 Subject: [PATCH 004/118] Update Multiple GPUs Support --- src/GPUIterator.chpl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/GPUIterator.chpl b/src/GPUIterator.chpl index 371cb2a..fc4c640 100644 --- a/src/GPUIterator.chpl +++ b/src/GPUIterator.chpl @@ -59,7 +59,7 @@ module GPUIterator { r: range(?), CPUrange: range(?), GPUrange: range(?), - GPUWrapper: func(int, int, int, int, void)) + GPUWrapper: func(int, int, int, void)) where tag == iterKind.leader { if (CPUrange.size == 0) { @@ -132,7 +132,7 @@ module GPUIterator { r: range(?), CPUrange: range(?), GPUrange: range(?), - GPUWrapper: func(int, int, int, int, void)) + GPUWrapper: func(int, int, int, void)) where tag == iterKind.standalone { if (CPUrange.size == 0) { @@ -144,7 +144,7 @@ module GPUIterator { const myIters = GPUrange; if (debugGPUIterator) then writeln("GPU portion: ", myIters); - GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.length); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); } otherwise { coforall tid in 0..#nGPUs { @@ -206,7 +206,7 @@ module GPUIterator { iter createTaskAndYield(r: range(?), CPUrange: range(?), GPUrange: range(?), - GPUWrapper: func(int, int, int, int, void)) { + GPUWrapper: func(int, int, int, void)) { halt("This is dummy"); } @@ -238,7 +238,7 @@ module GPUIterator { // follower (block distributed domains) iter GPU(param tag: iterKind, D: 
domain, - GPUWrapper: func(int, int, int, int, void), + GPUWrapper: func(int, int, int, void), CPUPercent: int = 0, followThis ) @@ -262,7 +262,7 @@ module GPUIterator { // standalone (block distributed domains) iter GPU(param tag: iterKind, D: domain, - GPUWrapper: func(int, int, int, int, void), + GPUWrapper: func(int, int, int, void), CPUPercent: int = 0 ) where tag == iterKind.standalone @@ -289,7 +289,7 @@ module GPUIterator { // serial iterator (block distributed domains) iter GPU(D: domain, - GPUWrapper: func(int, int, int, int, void), + GPUWrapper: func(int, int, int, void), CPUPercent: int = 0 ) where isRectangularDom(D) @@ -306,7 +306,7 @@ module GPUIterator { // leader (range) iter GPU(param tag: iterKind, r: range(?), - GPUWrapper: func(int, int, int, int, void), + GPUWrapper: func(int, int, int, void), CPUPercent: int = 0 ) where tag == iterKind.leader { @@ -323,7 +323,7 @@ module GPUIterator { // follower iter GPU(param tag: iterKind, r:range(?), - GPUWrapper: func(int, int, int, int, void), + GPUWrapper: func(int, int, int, void), CPUPercent: int = 0, followThis ) From 01e74116a61bd2c0950312f32d4dffebb8ea81e3 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 28 Feb 2020 13:50:49 -0500 Subject: [PATCH 005/118] Update Debug Messages --- src/GPUIterator.chpl | 50 ++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/src/GPUIterator.chpl b/src/GPUIterator.chpl index fc4c640..f5c4045 100644 --- a/src/GPUIterator.chpl +++ b/src/GPUIterator.chpl @@ -70,13 +70,14 @@ module GPUIterator { when 1 { const myIters = GPUrange; if (debugGPUIterator) then - writeln("GPU portion: ", myIters); + writeln("[DEBUG GPUITERATOR] GPU portion: ", myIters, " CPU portion is ZERO"); GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.length); } otherwise { coforall tid in 0..#nGPUs { const myIters = computeChunk(GPUrange, tid, nGPUs); - writeln("GPU", tid, ":", myIters); + if 
(debugGPUIterator) then + writeln("[DEBUG GPUITERATOR] GPU", tid, " portation", ":", myIters, " CPU portion is ZERO"); SetDevice(tid); GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); } @@ -85,7 +86,7 @@ module GPUIterator { } else if (GPUrange.size == 0) { const numTasks = here.maxTaskPar; if (debugGPUIterator) then - writeln("CPU portion: ", CPUrange, " by ", numTasks, " tasks"); + writeln("[DEBUG GPUITERATOR] CPU portion: ", CPUrange, " by ", numTasks, " tasks", " GPU portion is ZERO"); coforall tid in 0..#numTasks { const myIters = computeChunk(CPUrange, tid, numTasks); yield (myIters.translate(-r.low),); @@ -96,7 +97,7 @@ module GPUIterator { { const numTasks = here.maxTaskPar; if (debugGPUIterator) then - writeln("CPU portion: ", CPUrange, " by ", numTasks, " tasks"); + writeln("[DEBUG GPUITERATOR] CPU portion: ", CPUrange, " by ", numTasks, " tasks"); coforall tid in 0..#numTasks { const myIters = computeChunk(CPUrange, tid, numTasks); yield (myIters.translate(-r.low),); @@ -111,13 +112,14 @@ module GPUIterator { when 1 { const myIters = GPUrange; if (debugGPUIterator) then - writeln("GPU portion: ", myIters); + writeln("[DEBUG GPUITERATOR] GPU portion: ", myIters); GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.length); } otherwise { coforall tid in 0..#nGPUs { const myIters = computeChunk(GPUrange, tid, nGPUs); - writeln("GPU", tid, ":", myIters); + if (debugGPUIterator) then + writeln("[DEBUG GPUITERATOR] GPU", tid, " portation", ":", myIters); SetDevice(tid); GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); } @@ -143,13 +145,14 @@ module GPUIterator { when 1 { const myIters = GPUrange; if (debugGPUIterator) then - writeln("GPU portion: ", myIters); + writeln("[DEBUG GPUITERATOR] GPU portion: ", myIters, " CPU portion is ZERO"); GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); } otherwise 
{ coforall tid in 0..#nGPUs { const myIters = computeChunk(GPUrange, tid, nGPUs); - writeln("GPU", tid, ":", myIters); + if (debugGPUIterator) then + writeln("[DEBUG GPUITERATOR] GPU", tid, " portation", ":", myIters, " CPU portion is ZERO"); SetDevice(tid); GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); } @@ -158,7 +161,7 @@ module GPUIterator { } else if (GPUrange.size == 0) { const numTasks = here.maxTaskPar; if (debugGPUIterator) then - writeln("CPU portion: ", CPUrange, " by ", numTasks, " tasks"); + writeln("[DEBUG GPUITERATOR] CPU portion: ", CPUrange, " by ", numTasks, " tasks", " GPU portion is ZERO"); coforall tid in 0..#numTasks { const myIters = computeChunk(CPUrange, tid, numTasks); for i in myIters do @@ -170,7 +173,7 @@ module GPUIterator { { const numTasks = here.maxTaskPar; if (debugGPUIterator) then - writeln("CPU portion: ", CPUrange, " by ", numTasks, " tasks"); + writeln("[DEBUG GPUITERATOR] CPU portion: ", CPUrange, " by ", numTasks, " tasks"); coforall tid in 0..#numTasks { const myIters = computeChunk(CPUrange, tid, numTasks); for i in myIters do @@ -186,13 +189,14 @@ module GPUIterator { when 1 { const myIters = GPUrange; if (debugGPUIterator) then - writeln("GPU portion: ", myIters); + writeln("[DEBUG GPUITERATOR] GPU portion: ", myIters); GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.length); } otherwise { coforall tid in 0..#nGPUs { const myIters = computeChunk(GPUrange, tid, nGPUs); - writeln("GPU", tid, ":", myIters); + if (debugGPUIterator) then + writeln("[DEBUG GPUITERATOR] GPU", tid, " portation", ":", myIters); SetDevice(tid); GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); } @@ -221,7 +225,7 @@ module GPUIterator { && D.dist.type <= Block { if (debugGPUIterator) { - writeln("GPUIterator (leader, block distributed)"); + writeln("[DEBUG GPUITERATOR] GPUIterator (leader, block distributed)"); } coforall 
loc in D.targetLocales() do on loc { @@ -250,8 +254,8 @@ module GPUIterator { const lowBasedIters = followThis(1).translate(D.low); if (debugGPUIterator) { - writeln("GPUIterator (follower, block distributed)"); - writeln("Follower received ", followThis, " as work chunk; shifting to ", + writeln("[DEBUG GPUITERATOR] GPUIterator (follower, block distributed)"); + writeln("[DEBUG GPUITERATOR] Follower received ", followThis, " as work chunk; shifting to ", lowBasedIters); } @@ -270,13 +274,13 @@ module GPUIterator { && D.dist.type <= Block { if (debugGPUIterator) { - writeln("GPUIterator (standalone distributed)"); + writeln("[DEBUG GPUITERATOR] GPUIterator (standalone distributed)"); } // for each locale coforall loc in D.targetLocales() do on loc { for subdom in D.localSubdomains() { - if (debugGPUIterator) then writeln(here, " (", here.name, ") is responsible for ", subdom); + if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR]", here, " (", here.name, ") is responsible for ", subdom); const r = subdom.dim(1); const portions = computeSubranges(r, CPUPercent); @@ -296,7 +300,7 @@ module GPUIterator { && D.dist.type <= Block { if (debugGPUIterator) { - writeln("GPUIterator (serial distributed)"); + writeln("[DEBUG GPUITERATOR] GPUIterator (serial distributed)"); } for i in D { yield i; @@ -312,7 +316,7 @@ module GPUIterator { where tag == iterKind.leader { if (debugGPUIterator) then - writeln("In GPUIterator (leader range)"); + writeln("[DEBUG GPUITERATOR] In GPUIterator (leader range)"); const portions = computeSubranges(r, CPUPercent); for i in createTaskAndYield(tag, r, portions(1), portions(2), GPUWrapper) { @@ -333,8 +337,8 @@ module GPUIterator { const lowBasedIters = followThis(1).translate(r.low); if (debugGPUIterator) { - writeln("GPUIterator (follower)"); - writeln("Follower received ", followThis, " as work chunk; shifting to ", + writeln("[DEBUG GPUITERATOR] GPUIterator (follower)"); + writeln("[DEBUG GPUITERATOR] Follower received ", followThis, " 
as work chunk; shifting to ", lowBasedIters); } @@ -351,7 +355,7 @@ module GPUIterator { where tag == iterKind.standalone { if (debugGPUIterator) then - writeln("In GPUIterator (standalone)"); + writeln("[DEBUG GPUITERATOR] In GPUIterator (standalone)"); const portions = computeSubranges(r, CPUPercent); for i in createTaskAndYield(tag, r, portions(1), portions(2), GPUWrapper) { @@ -365,7 +369,7 @@ module GPUIterator { CPUPercent: int = 0 ) { if (debugGPUIterator) then - writeln("In GPUIterator (serial)"); + writeln("[DEBUG GPUITERATOR] In GPUIterator (serial)"); for i in r do yield i; From 5f47a1d47b639854833680a43b32243f0364409d Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 28 Feb 2020 14:02:47 -0500 Subject: [PATCH 006/118] Update Debug Messages --- src/GPUIterator.chpl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/GPUIterator.chpl b/src/GPUIterator.chpl index f5c4045..0902207 100644 --- a/src/GPUIterator.chpl +++ b/src/GPUIterator.chpl @@ -77,7 +77,7 @@ module GPUIterator { coforall tid in 0..#nGPUs { const myIters = computeChunk(GPUrange, tid, nGPUs); if (debugGPUIterator) then - writeln("[DEBUG GPUITERATOR] GPU", tid, " portation", ":", myIters, " CPU portion is ZERO"); + writeln("[DEBUG GPUITERATOR] GPU", tid, " portion", ":", myIters, " CPU portion is ZERO"); SetDevice(tid); GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); } @@ -119,7 +119,7 @@ module GPUIterator { coforall tid in 0..#nGPUs { const myIters = computeChunk(GPUrange, tid, nGPUs); if (debugGPUIterator) then - writeln("[DEBUG GPUITERATOR] GPU", tid, " portation", ":", myIters); + writeln("[DEBUG GPUITERATOR] GPU", tid, " portion", ":", myIters); SetDevice(tid); GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); } @@ -152,7 +152,7 @@ module GPUIterator { coforall tid in 0..#nGPUs { const myIters = computeChunk(GPUrange, tid, nGPUs); if (debugGPUIterator) then - 
writeln("[DEBUG GPUITERATOR] GPU", tid, " portation", ":", myIters, " CPU portion is ZERO"); + writeln("[DEBUG GPUITERATOR] GPU", tid, " portion", ":", myIters, " CPU portion is ZERO"); SetDevice(tid); GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); } @@ -196,7 +196,7 @@ module GPUIterator { coforall tid in 0..#nGPUs { const myIters = computeChunk(GPUrange, tid, nGPUs); if (debugGPUIterator) then - writeln("[DEBUG GPUITERATOR] GPU", tid, " portation", ":", myIters); + writeln("[DEBUG GPUITERATOR] GPU", tid, " portion", ":", myIters); SetDevice(tid); GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); } From c7f1615f539465385b58e28a4742a12ba4a1442b Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 28 Feb 2020 14:51:44 -0500 Subject: [PATCH 007/118] Remove redundant spaces --- src/GPUAPI.chpl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/GPUAPI.chpl b/src/GPUAPI.chpl index 750051a..74b4a0d 100644 --- a/src/GPUAPI.chpl +++ b/src/GPUAPI.chpl @@ -17,5 +17,7 @@ module GPUAPI { extern proc GetDeviceCount(ref count: int); - extern proc SetDevice(device: int); + extern proc SetDevice(device: int); + + } \ No newline at end of file From 2ccbeca284d6837488227f76474c25b93fa615c2 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Mon, 2 Mar 2020 13:00:00 -0500 Subject: [PATCH 008/118] Add GPUAPI Wrapper --- src/GPUAPI.chpl | 6 ++++++ src/GPUAPI.cu | 53 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 58 insertions(+), 1 deletion(-) diff --git a/src/GPUAPI.chpl b/src/GPUAPI.chpl index 74b4a0d..535d342 100644 --- a/src/GPUAPI.chpl +++ b/src/GPUAPI.chpl @@ -19,5 +19,11 @@ module GPUAPI { extern proc GetDeviceCount(ref count: int); extern proc SetDevice(device: int); + extern proc ProfilerStart(); + extern proc ProfilerStop(); + + extern proc Malloc(ref devPtr: c_void_ptr, size: size_t); + extern proc Memcpy(dst: c_void_ptr, src: c_void_ptr, 
count: size_t, kind: int); + extern proc Launch(arg1: c_void_ptr, arg2: c_void_ptr, size: size_t); } \ No newline at end of file diff --git a/src/GPUAPI.cu b/src/GPUAPI.cu index 0ddbf8b..547cf40 100644 --- a/src/GPUAPI.cu +++ b/src/GPUAPI.cu @@ -2,6 +2,7 @@ #include #include #include +#include #define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ ) #define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ ) @@ -43,13 +44,63 @@ inline void __cudaCheckError( const char *file, const int line ) #endif } +template +static __global__ void driver_kernel(functor_type functor, unsigned niters) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < niters) { + functor(tid); + } +} + +template +inline void call_gpu_functor(unsigned niters, unsigned tile_size, + cudaStream_t stream, functor_type functor) { + //functor_type *actual = (functor_type *)functor; + + const unsigned block_size = tile_size; + const unsigned nblocks = (niters + block_size - 1) / block_size; + + driver_kernel<<>>(functor, niters); +} + extern "C" { void GetDeviceCount(int *count) { CudaSafeCall(cudaGetDeviceCount(count)); } - + void SetDevice(int device) { CudaSafeCall(cudaSetDevice(device)); } + + void ProfilerStart() { + CudaSafeCall(cudaProfilerStart()); + } + + void ProfilerStop() { + CudaSafeCall(cudaProfilerStop()); + } + + void Malloc(void** devPtr, size_t size) { + CudaSafeCall(cudaMalloc(devPtr, size)); + printf("in malloc ptr: %p\n", *devPtr); + } + + void Memcpy(void* dst, void* src, size_t count, int kind) { + switch (kind) { + case 0: + CudaSafeCall(cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice)); + break; + case 1: + CudaSafeCall(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost)); + break; + default: + printf("Warning\n"); + } + } + + void Launch(int *dA, int *dB, int N) { + call_gpu_functor(N, N, NULL, [=] __device__ (int i) { dA[i] = dB[i]; }); + cudaDeviceSynchronize(); + } } From f4e8bef324f9c5397348225452870c93131b8be5 Mon Sep 17 00:00:00 2001 
From: Akihiro Hayashi Date: Mon, 2 Mar 2020 16:03:12 -0500 Subject: [PATCH 009/118] Update GPUAPI --- src/GPUAPI.chpl | 1 + src/GPUAPI.cu | 11 ++++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/GPUAPI.chpl b/src/GPUAPI.chpl index 535d342..660386e 100644 --- a/src/GPUAPI.chpl +++ b/src/GPUAPI.chpl @@ -17,6 +17,7 @@ module GPUAPI { extern proc GetDeviceCount(ref count: int); + extern proc GetDevice(ref device: int); extern proc SetDevice(device: int); extern proc ProfilerStart(); diff --git a/src/GPUAPI.cu b/src/GPUAPI.cu index 547cf40..e279955 100644 --- a/src/GPUAPI.cu +++ b/src/GPUAPI.cu @@ -59,7 +59,6 @@ inline void call_gpu_functor(unsigned niters, unsigned tile_size, const unsigned block_size = tile_size; const unsigned nblocks = (niters + block_size - 1) / block_size; - driver_kernel<<>>(functor, niters); } @@ -69,6 +68,10 @@ extern "C" { CudaSafeCall(cudaGetDeviceCount(count)); } + void GetDevice(int *device) { + CudaSafeCall(cudaGetDevice(device)); + } + void SetDevice(int device) { CudaSafeCall(cudaSetDevice(device)); } @@ -99,8 +102,10 @@ extern "C" { } } - void Launch(int *dA, int *dB, int N) { - call_gpu_functor(N, N, NULL, [=] __device__ (int i) { dA[i] = dB[i]; }); + void Launch(float *dA, float *dB, int N) { + printf("Launching kernel\n"); + call_gpu_functor(N, 1024, NULL, [=] __device__ (int i) { dA[i] = dB[i]; }); cudaDeviceSynchronize(); + CudaCheckError(); } } From f7efe7b77dbf5484a77eb118c39751f1cef18f87 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Mon, 2 Mar 2020 16:07:12 -0500 Subject: [PATCH 010/118] Add vector copy with explicit API --- apps/Makefile | 11 +- apps/vector_copy/vc.hybrid.dist.explicit.chpl | 125 ++++++++++++++++++ 2 files changed, 133 insertions(+), 3 deletions(-) create mode 100644 apps/vector_copy/vc.hybrid.dist.explicit.chpl diff --git a/apps/Makefile b/apps/Makefile index 5c81537..ce77b10 100644 --- a/apps/Makefile +++ b/apps/Makefile @@ -1,9 +1,10 @@ # Flags for the Chapel compiler 
-CHPLFLAGS=--fast +CHPLFLAGS=--fast -sverbose -sdebugGPUIterator CHPLMODULE=../../src # For CUDA -CUDALIBSFLAGS=-L/usr/local/cuda/lib64 -lcudart -lcuda +#CUDALIBSFLAGS=-L/usr/local/cuda/lib64 -lcudart -lcuda +CUDALIBSFLAGS=-L/sw/summit/cuda/10.1.168/lib64 -lcudart -lcuda #CUDALIBSFLAGS=-L/opt/apps/software/Compiler/GCC/6.4.0/CUDA/8.0.61/lib -lcudart -lcublas NVCCFLAGS=-O3 -arch sm_37 -std=c++11 @@ -30,7 +31,7 @@ baseline: $(TARGET).baseline.chpl .PHONY: blas blas: - chpl $(CHPLFLAGS) $(TARGET).blas.chpl + chpl $(CHPLFLAGS) $(TARGET).blas.chpl .PHONY: cudagpu cudagpu: $(TARGET).o $(TARGET).gpu.chpl @@ -44,6 +45,10 @@ cudahybrid: $(TARGET).o $(TARGET).hybrid.chpl cudahybrid.dist: $(TARGET).o $(TARGET).hybrid.dist.chpl chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).o $(TARGET).hybrid.dist.chpl $(CUDALIBSFLAGS) +.PHONY: cudahybrid.dist.explicit +cudahybrid.dist.explicit: $(TARGET).o $(TARGET).hybrid.dist.chpl + chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).o $(CHPLMODULE)/GPUAPI.o $(TARGET).hybrid.dist.explicit.chpl $(CUDALIBSFLAGS) + .PHONY: openclgpu oclgpu: $(TARGET).opencl.o $(TARGET).gpu.chpl chpl $(CHPLFLAGS) $(TARGET).opencl.o $(TARGET).gpu.chpl --ldflags $(OCLLIBSFLAGS) diff --git a/apps/vector_copy/vc.hybrid.dist.explicit.chpl b/apps/vector_copy/vc.hybrid.dist.explicit.chpl new file mode 100644 index 0000000..44f335e --- /dev/null +++ b/apps/vector_copy/vc.hybrid.dist.explicit.chpl @@ -0,0 +1,125 @@ +use Time; + +//////////////////////////////////////////////////////////////////////////////// +/// GPUIterator +//////////////////////////////////////////////////////////////////////////////// +use GPUIterator; + +//////////////////////////////////////////////////////////////////////////////// +/// Runtime Options +//////////////////////////////////////////////////////////////////////////////// +config const n = 32: int; +config const CPUratio = 0: int; +config const numTrials = 1: int; +config const output = 0: int; +config param verbose = false; + 
+//////////////////////////////////////////////////////////////////////////////// +/// Global Arrays +//////////////////////////////////////////////////////////////////////////////// +// For now, these arrays are global so the arrays can be seen from CUDAWrapper +// TODO: Explore the possiblity of declaring the arrays and CUDAWrapper +// in the main proc (e.g., by using lambdas) +var D: domain(1) dmapped Block(boundingBox = {1..n}) = {1..n}; +var A: [D] real(32); +var B: [D] real(32); + +//////////////////////////////////////////////////////////////////////////////// +/// C Interoperability +//////////////////////////////////////////////////////////////////////////////// +extern proc vcCUDA(A: [] real(32), B: [] real(32), lo: int, hi: int, N: int); + +// CUDAWrapper is called from GPUIterator +// to invoke a specific CUDA program (using C interoperability) +proc CUDAWrapper(lo: int, hi: int, N: int) { + if (verbose) { + var device, count: int; + GetDevice(device); + GetDeviceCount(count); + writeln("In CUDAWrapper(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, "), GPU", device, " of ", count, " @", here); + } + ref lA = A.localSlice(lo .. hi); + ref lB = B.localSlice(lo .. 
hi); + writeln("localSlice Size:", lA.size); + ProfilerStart(); + var dA: c_void_ptr; + var dB: c_void_ptr; + var size: size_t = (lA.size * 4): size_t; + Malloc(dA, size); + Malloc(dB, size); + Memcpy(dB, c_ptrTo(lB), size, 0); + Launch(dA, dB, size); + Memcpy(c_ptrTo(lA), dA, size, 1); + ProfilerStop(); + + //vcCUDA(lA, lB, 0, hi-lo, N); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Utility Functions +//////////////////////////////////////////////////////////////////////////////// +proc printResults(execTimes) { + const totalTime = + reduce execTimes, + avgTime = totalTime / numTrials, + minTime = min reduce execTimes; + writeln("Execution time:"); + writeln(" tot = ", totalTime); + writeln(" avg = ", avgTime); + writeln(" min = ", minTime); +} + +proc printLocaleInfo() { + for loc in Locales { + writeln(loc, " info: "); + const numSublocs = loc.getChildCount(); + if (numSublocs != 0) { + for sublocID in 0..#numSublocs { + const subloc = loc.getChild(sublocID); + writeln("\t Subloc: ", sublocID); + writeln("\t Name: ", subloc); + writeln("\t maxTaskPar: ", subloc.maxTaskPar); + } + } else { + writeln("\t Name: ", loc); + writeln("\t maxTaskPar: ", loc.maxTaskPar); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Chapel main +//////////////////////////////////////////////////////////////////////////////// +proc main() { + writeln("Vector Copy: CPU/GPU Execution (using GPUIterator)"); + writeln("Size: ", n); + writeln("CPU ratio: ", CPUratio); + writeln("nTrials: ", numTrials); + writeln("output: ", output); + + printLocaleInfo(); + + var execTimes: [1..numTrials] real; + for trial in 1..numTrials { + for i in 1..n { + A(i) = 0: real(32); + B(i) = i: real(32); + } + + const startTime = getCurrentTime(); + forall i in GPU(D, CUDAWrapper, CPUratio) { + A(i) = B(i); + } + execTimes(trial) = getCurrentTime() - startTime; + if (output) { + writeln(A); + } + for i in 1..n { + 
if (A(i) != B(i)) { + writeln("Verification Error"); + exit(); + } + } + } + writeln("Verified"); + printResults(execTimes); +} From f4bfc1689d5435458febb9b1b7ac018824efeb55 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Tue, 5 May 2020 21:53:48 -0400 Subject: [PATCH 011/118] Update GPUIterator and API --- src/GPUAPI.chpl | 9 ++++++--- src/GPUAPI.cu | 27 +-------------------------- src/GPUIterator.chpl | 10 +++++----- 3 files changed, 12 insertions(+), 34 deletions(-) diff --git a/src/GPUAPI.chpl b/src/GPUAPI.chpl index 660386e..9a9a6c5 100644 --- a/src/GPUAPI.chpl +++ b/src/GPUAPI.chpl @@ -16,9 +16,11 @@ */ module GPUAPI { - extern proc GetDeviceCount(ref count: int); - extern proc GetDevice(ref device: int); - extern proc SetDevice(device: int); + use SysCTypes; + + extern proc GetDeviceCount(ref count: int(32)); + extern proc GetDevice(ref device: int(32)); + extern proc SetDevice(device: int(32)); extern proc ProfilerStart(); extern proc ProfilerStop(); @@ -26,5 +28,6 @@ module GPUAPI { extern proc Malloc(ref devPtr: c_void_ptr, size: size_t); extern proc Memcpy(dst: c_void_ptr, src: c_void_ptr, count: size_t, kind: int); extern proc Launch(arg1: c_void_ptr, arg2: c_void_ptr, size: size_t); + extern proc LaunchStream(arg1: c_void_ptr, arg2: c_void_ptr, arg3: c_void_ptr, alpha: real(32), size: size_t); } \ No newline at end of file diff --git a/src/GPUAPI.cu b/src/GPUAPI.cu index e279955..791a8cc 100644 --- a/src/GPUAPI.cu +++ b/src/GPUAPI.cu @@ -4,6 +4,7 @@ #include #include +#define CUDA_ERROR_CHECK #define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ ) #define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ ) @@ -44,24 +45,6 @@ inline void __cudaCheckError( const char *file, const int line ) #endif } -template -static __global__ void driver_kernel(functor_type functor, unsigned niters) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - if (tid < niters) { - functor(tid); - } -} - -template -inline void 
call_gpu_functor(unsigned niters, unsigned tile_size, - cudaStream_t stream, functor_type functor) { - //functor_type *actual = (functor_type *)functor; - - const unsigned block_size = tile_size; - const unsigned nblocks = (niters + block_size - 1) / block_size; - driver_kernel<<>>(functor, niters); -} - extern "C" { void GetDeviceCount(int *count) { @@ -86,7 +69,6 @@ extern "C" { void Malloc(void** devPtr, size_t size) { CudaSafeCall(cudaMalloc(devPtr, size)); - printf("in malloc ptr: %p\n", *devPtr); } void Memcpy(void* dst, void* src, size_t count, int kind) { @@ -101,11 +83,4 @@ extern "C" { printf("Warning\n"); } } - - void Launch(float *dA, float *dB, int N) { - printf("Launching kernel\n"); - call_gpu_functor(N, 1024, NULL, [=] __device__ (int i) { dA[i] = dB[i]; }); - cudaDeviceSynchronize(); - CudaCheckError(); - } } diff --git a/src/GPUIterator.chpl b/src/GPUIterator.chpl index 0902207..fd5bd75 100644 --- a/src/GPUIterator.chpl +++ b/src/GPUIterator.chpl @@ -24,7 +24,7 @@ module GPUIterator { config const nGPUs = getNumDevices(); proc getNumDevices() { - var count: int; + var count: int(32); GetDeviceCount(count); return count; } @@ -78,7 +78,7 @@ module GPUIterator { const myIters = computeChunk(GPUrange, tid, nGPUs); if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] GPU", tid, " portion", ":", myIters, " CPU portion is ZERO"); - SetDevice(tid); + SetDevice(tid:int(32)); GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); } } @@ -120,7 +120,7 @@ module GPUIterator { const myIters = computeChunk(GPUrange, tid, nGPUs); if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] GPU", tid, " portion", ":", myIters); - SetDevice(tid); + SetDevice(tid:int(32)); GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); } } @@ -153,7 +153,7 @@ module GPUIterator { const myIters = computeChunk(GPUrange, tid, nGPUs); if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] GPU", tid, " 
portion", ":", myIters, " CPU portion is ZERO"); - SetDevice(tid); + SetDevice(tid:int(32)); GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); } } @@ -197,7 +197,7 @@ module GPUIterator { const myIters = computeChunk(GPUrange, tid, nGPUs); if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] GPU", tid, " portion", ":", myIters); - SetDevice(tid); + SetDevice(tid:int(32)); GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); } } From c165e5b394feede425fceebad2f53cbc1a48c120 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Tue, 5 May 2020 22:46:18 -0400 Subject: [PATCH 012/118] Update explicit version of VC --- apps/vector_copy/vc.hybrid.dist.explicit.chpl | 9 +++-- apps/vector_copy/vc.kernel.cu | 38 +++++++++++++++++++ apps/vector_copy/vc.kernel.h | 1 + 3 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 apps/vector_copy/vc.kernel.cu create mode 100644 apps/vector_copy/vc.kernel.h diff --git a/apps/vector_copy/vc.hybrid.dist.explicit.chpl b/apps/vector_copy/vc.hybrid.dist.explicit.chpl index 44f335e..7733fe7 100644 --- a/apps/vector_copy/vc.hybrid.dist.explicit.chpl +++ b/apps/vector_copy/vc.hybrid.dist.explicit.chpl @@ -4,6 +4,9 @@ use Time; /// GPUIterator //////////////////////////////////////////////////////////////////////////////// use GPUIterator; +use GPUAPI; +use BlockDist; +use SysCTypes; //////////////////////////////////////////////////////////////////////////////// /// Runtime Options @@ -27,13 +30,13 @@ var B: [D] real(32); //////////////////////////////////////////////////////////////////////////////// /// C Interoperability //////////////////////////////////////////////////////////////////////////////// -extern proc vcCUDA(A: [] real(32), B: [] real(32), lo: int, hi: int, N: int); +extern proc LaunchVC(A: c_void_ptr, B: c_void_ptr, N: size_t); // CUDAWrapper is called from GPUIterator // to invoke a specific CUDA program (using C 
interoperability) proc CUDAWrapper(lo: int, hi: int, N: int) { if (verbose) { - var device, count: int; + var device, count: int(32); GetDevice(device); GetDeviceCount(count); writeln("In CUDAWrapper(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, "), GPU", device, " of ", count, " @", here); @@ -48,7 +51,7 @@ proc CUDAWrapper(lo: int, hi: int, N: int) { Malloc(dA, size); Malloc(dB, size); Memcpy(dB, c_ptrTo(lB), size, 0); - Launch(dA, dB, size); + LaunchVC(dA, dB, size); Memcpy(c_ptrTo(lA), dA, size, 1); ProfilerStop(); diff --git a/apps/vector_copy/vc.kernel.cu b/apps/vector_copy/vc.kernel.cu new file mode 100644 index 0000000..faf21b1 --- /dev/null +++ b/apps/vector_copy/vc.kernel.cu @@ -0,0 +1,38 @@ +#ifndef USE_LAMBDA +__global__ void vc(float *dA, float *dB, int N) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + if (id < N) { + dA[id] = dB[id]; + } +} +#else +template +static __global__ void driver_kernel(functor_type functor, unsigned niters) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < niters) { + functor(tid); + } +} + +template +inline void call_gpu_functor(unsigned niters, unsigned tile_size, + cudaStream_t stream, functor_type functor) { + //functor_type *actual = (functor_type *)functor; + + const unsigned block_size = tile_size; + const unsigned nblocks = (niters + block_size - 1) / block_size; + driver_kernel<<>>(functor, niters); +} +#endif + +extern "C" { +#ifndef USE_LAMBDA + void LaunchVC(float* dA, float *dB, int N) { + vc<<>>(dA, dB, N); + } +#else + void LaunchVC(float *dA, float *dB, int N) { + call_gpu_functor(N, 1024, NULL, [=] __device__ (int i) { dA[i] = dB[i]; }); + } +#endif +} diff --git a/apps/vector_copy/vc.kernel.h b/apps/vector_copy/vc.kernel.h new file mode 100644 index 0000000..eae6b4a --- /dev/null +++ b/apps/vector_copy/vc.kernel.h @@ -0,0 +1 @@ +void LaunchVC(float *dA, float *dB, int N); From 805fd50b0dcaab4e1d51ce181cc0f8d7d2935686 Mon Sep 17 00:00:00 2001 From: 
Akihiro Hayashi Date: Tue, 5 May 2020 22:46:34 -0400 Subject: [PATCH 013/118] Update Makefile --- apps/Makefile | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/apps/Makefile b/apps/Makefile index ce77b10..1dc84d4 100644 --- a/apps/Makefile +++ b/apps/Makefile @@ -1,12 +1,13 @@ # Flags for the Chapel compiler CHPLFLAGS=--fast -sverbose -sdebugGPUIterator CHPLMODULE=../../src +GPUAPIFLAGS=$(CHPLMODULE)/GPUAPI.h GPUAPI.o # For CUDA -#CUDALIBSFLAGS=-L/usr/local/cuda/lib64 -lcudart -lcuda -CUDALIBSFLAGS=-L/sw/summit/cuda/10.1.168/lib64 -lcudart -lcuda +CUDALIBSFLAGS=-L/usr/local/cuda/lib64 -lcudart -lcuda +#CUDALIBSFLAGS=-L/sw/summit/cuda/10.1.168/lib64 -lcudart -lcuda #CUDALIBSFLAGS=-L/opt/apps/software/Compiler/GCC/6.4.0/CUDA/8.0.61/lib -lcudart -lcublas -NVCCFLAGS=-O3 -arch sm_37 -std=c++11 +NVCCFLAGS=-O3 -arch sm_37 -std=c++11 --extended-lambda # For OpenCL OCLLIBSFLAGS=-framework OpenCL @@ -22,6 +23,9 @@ all: baseline cudagpu cudahybrid cudahybrid.dist $(TARGET).o: $(TARGET).cu nvcc $(NVCCFLAGS) -c $^ +GPUAPI.o: $(CHPLMODULE)/GPUAPI.cu + nvcc $(NVCCFLAGS) -c $^ + $(TARGET).opencl.o: $(TARGET).opencl.c gcc -O3 -Wall $(OCLFLAGS) -c $^ @@ -39,15 +43,18 @@ cudagpu: $(TARGET).o $(TARGET).gpu.chpl .PHONY: cudahybrid cudahybrid: $(TARGET).o $(TARGET).hybrid.chpl - chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).o $(TARGET).hybrid.chpl $(CUDALIBSFLAGS) + chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).o $(GPUAPIFLAGS) $(TARGET).hybrid.chpl $(CUDALIBSFLAGS) .PHONY: cudahybrid.dist cudahybrid.dist: $(TARGET).o $(TARGET).hybrid.dist.chpl - chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).o $(TARGET).hybrid.dist.chpl $(CUDALIBSFLAGS) + chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.chpl $(CUDALIBSFLAGS) .PHONY: cudahybrid.dist.explicit -cudahybrid.dist.explicit: $(TARGET).o $(TARGET).hybrid.dist.chpl - chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).o $(CHPLMODULE)/GPUAPI.o $(TARGET).hybrid.dist.explicit.chpl 
$(CUDALIBSFLAGS) +cudahybrid.dist.explicit: $(TARGET).kernel.cu $(TARGET).hybrid.dist.explicit.chpl + nvcc $(NVCCFLAGS) -DUSE_LAMBDA -c $(TARGET).kernel.cu -o $(TARGET).lambda.o + nvcc $(NVCCFLAGS) -c $(TARGET).kernel.cu -o $(TARGET).kernel.o + chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).kernel.h $(TARGET).kernel.o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.explicit.chpl $(CUDALIBSFLAGS) -o $(TARGET).hybrid.dist.explicit.kernel + chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).kernel.h $(TARGET).lambda.o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.explicit.chpl $(CUDALIBSFLAGS) -o $(TARGET).hybrid.dist.explicit.lambda .PHONY: openclgpu oclgpu: $(TARGET).opencl.o $(TARGET).gpu.chpl @@ -71,4 +78,4 @@ hiphybrid: $(TARGET).cu $(TARGET).hybrid.chpl .PHONY: clean clean: - rm -f $(TARGET).baseline $(TARGET).gpu $(TARGET).hybrid $(TARGET).hybrid.dist $(TARGET).o *_real + rm -f $(TARGET).baseline $(TARGET).gpu $(TARGET).hybrid $(TARGET).hybrid.dist $(TARGET).hybrid.dist.explicit.kernel $(TARGET).hybrid.dist.explicit.lambda $(TARGET).o *_real From 0cd94686baae9db76f50c793c0b2747cfe27ecd6 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Wed, 6 May 2020 11:49:10 -0400 Subject: [PATCH 014/118] Add GPUAPI.h --- src/GPUAPI.h | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 src/GPUAPI.h diff --git a/src/GPUAPI.h b/src/GPUAPI.h new file mode 100644 index 0000000..b1e5534 --- /dev/null +++ b/src/GPUAPI.h @@ -0,0 +1,11 @@ +#ifndef _GPU_API_H +#define _GPU_API_H + +void GetDeviceCount(int*); +void GetDevice(int*); +void SetDevice(int); +void ProfilerStart(); +void ProfilerStop(); +void Malloc(void**, size_t); +void Memcpy(void*, void*, size_t, int); +#endif From dec2724742253506b303a658cd47aed3377ce75b Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Wed, 6 May 2020 11:50:40 -0400 Subject: [PATCH 015/118] Update Makefile --- apps/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/Makefile b/apps/Makefile index 1dc84d4..f96c4f7 100644 --- 
a/apps/Makefile +++ b/apps/Makefile @@ -50,7 +50,7 @@ cudahybrid.dist: $(TARGET).o $(TARGET).hybrid.dist.chpl chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.chpl $(CUDALIBSFLAGS) .PHONY: cudahybrid.dist.explicit -cudahybrid.dist.explicit: $(TARGET).kernel.cu $(TARGET).hybrid.dist.explicit.chpl +cudahybrid.dist.explicit: GPUAPI.o $(TARGET).kernel.cu $(TARGET).hybrid.dist.explicit.chpl nvcc $(NVCCFLAGS) -DUSE_LAMBDA -c $(TARGET).kernel.cu -o $(TARGET).lambda.o nvcc $(NVCCFLAGS) -c $(TARGET).kernel.cu -o $(TARGET).kernel.o chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).kernel.h $(TARGET).kernel.o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.explicit.chpl $(CUDALIBSFLAGS) -o $(TARGET).hybrid.dist.explicit.kernel From 0fa8484807827a7e8c20e1c5006f610d52587b07 Mon Sep 17 00:00:00 2001 From: sriraj Date: Mon, 11 May 2020 00:17:23 -0700 Subject: [PATCH 016/118] Add explicit version of stream --- apps/stream/stream.hybrid.dist.explicit.chpl | 136 +++++++++++++++++++ apps/stream/stream.kernel.cu | 14 ++ apps/stream/stream.kernel.h | 1 + 3 files changed, 151 insertions(+) create mode 100644 apps/stream/stream.hybrid.dist.explicit.chpl create mode 100644 apps/stream/stream.kernel.cu create mode 100644 apps/stream/stream.kernel.h diff --git a/apps/stream/stream.hybrid.dist.explicit.chpl b/apps/stream/stream.hybrid.dist.explicit.chpl new file mode 100644 index 0000000..65db44f --- /dev/null +++ b/apps/stream/stream.hybrid.dist.explicit.chpl @@ -0,0 +1,136 @@ +use Time; + +//////////////////////////////////////////////////////////////////////////////// +/// GPUIterator +//////////////////////////////////////////////////////////////////////////////// +use GPUIterator; +use GPUAPI; +use BlockDist; +use SysCTypes; + +//////////////////////////////////////////////////////////////////////////////// +/// Runtime Options +//////////////////////////////////////////////////////////////////////////////// +config const n = 32: int; +config const CPUratio = 0: int; 
+config const numTrials = 1: int; +config const output = 0: int; +config const alpha = 3.0: real(32); +config param verbose = false; + +//////////////////////////////////////////////////////////////////////////////// +/// Global Arrays +//////////////////////////////////////////////////////////////////////////////// +// For now, these arrays are global so the arrays can be seen from CUDAWrapper +// TODO: Explore the possiblity of declaring the arrays and CUDAWrapper +// in the main proc (e.g., by using lambdas) +var D: domain(1) dmapped Block(boundingBox = {1..n}) = {1..n}; +var A: [D] real(32); +var B: [D] real(32); +var C: [D] real(32); + +//////////////////////////////////////////////////////////////////////////////// +/// C Interoperability +//////////////////////////////////////////////////////////////////////////////// +extern proc LaunchStream(A: c_void_ptr, B: c_void_ptr, C: c_void_ptr, alpha: c_float, N: size_t); + +// CUDAWrapper is called from GPUIterator +// to invoke a specific CUDA program (using C interoperability) +proc CUDAWrapper(lo: int, hi: int, N: int) { + if (verbose) { + var device, count: int(32); + GetDevice(device); + GetDeviceCount(count); + writeln("In CUDAWrapper(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, "), GPU", device, " of ", count, " @", here); + } + + ref lA = A.localSlice(lo .. hi); + ref lB = B.localSlice(lo .. hi); + ref lC = C.localSlice(lo .. 
hi); + writeln("localSlice Size:", lA.size); + ProfilerStart(); + var dA: c_void_ptr; + var dB: c_void_ptr; + var dC: c_void_ptr; + var size: size_t = (lA.size * 4): size_t; + Malloc(dA, size); + Malloc(dB, size); + Malloc(dC, size); + Memcpy(dB, c_ptrTo(lB), size, 0); + Memcpy(dC, c_ptrTo(lC), size, 0); + LaunchStream(dA, dB, dC, alpha, size); + Memcpy(c_ptrTo(lA), dA, size, 1); + ProfilerStop(); + + //streamCUDA(lA, lB, lC, alpha, 0, hi-lo, N); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Utility Functions +//////////////////////////////////////////////////////////////////////////////// +proc printResults(execTimes) { + const totalTime = + reduce execTimes, + avgTime = totalTime / numTrials, + minTime = min reduce execTimes; + writeln("Execution time:"); + writeln(" tot = ", totalTime); + writeln(" avg = ", avgTime); + writeln(" min = ", minTime); +} + +proc printLocaleInfo() { + for loc in Locales { + writeln(loc, " info: "); + const numSublocs = loc.getChildCount(); + if (numSublocs != 0) { + for sublocID in 0..#numSublocs { + const subloc = loc.getChild(sublocID); + writeln("\t Subloc: ", sublocID); + writeln("\t Name: ", subloc); + writeln("\t maxTaskPar: ", subloc.maxTaskPar); + } + } else { + writeln("\t Name: ", loc); + writeln("\t maxTaskPar: ", loc.maxTaskPar); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Chapel main +//////////////////////////////////////////////////////////////////////////////// +proc main() { + writeln("Stream: CPU/GPU Execution (using GPUIterator)"); + writeln("Size: ", n); + writeln("CPU ratio: ", CPUratio); + writeln("alpha: ", alpha); + writeln("nTrials: ", numTrials); + writeln("output: ", output); + + printLocaleInfo(); + + var execTimes: [1..numTrials] real; + for trial in 1..numTrials { + for i in 1..n { + B(i) = i: real(32); + C(i) = 2*i: real(32); + } + + const startTime = getCurrentTime(); + forall i in GPU(D, CUDAWrapper, 
CPUratio) { + A(i) = B(i) + alpha * C(i); + } + execTimes(trial) = getCurrentTime() - startTime; + if (output) { + writeln(A); + for i in 1..n { + if(A(i) != B(i) + alpha * C(i)) { + writeln("Verification Error"); + exit(); + } + } + writeln("Verified"); + } + } + printResults(execTimes); +} diff --git a/apps/stream/stream.kernel.cu b/apps/stream/stream.kernel.cu new file mode 100644 index 0000000..7a9f188 --- /dev/null +++ b/apps/stream/stream.kernel.cu @@ -0,0 +1,14 @@ + + +__global__ void stream(float* dA, float* dB, float* dC, float alpha, int N) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + if (id < N) { + dA[id] = dB[id] + alpha*dC[id]; + } +} + +extern "C" { + void LaunchStream(float* dA, float *dB, float* dC, float alpha, int N) { + stream<<>>(dA, dB, dC, alpha, N); + } +} diff --git a/apps/stream/stream.kernel.h b/apps/stream/stream.kernel.h new file mode 100644 index 0000000..4a9a1af --- /dev/null +++ b/apps/stream/stream.kernel.h @@ -0,0 +1 @@ +void LaunchStream(float* A, float* B, float* C, float alpha, int N); From 254deeede20af93a1f8e348ff270bdd23135f331 Mon Sep 17 00:00:00 2001 From: sriraj Date: Mon, 11 May 2020 08:11:57 -0700 Subject: [PATCH 017/118] Update Makefile with GPUAPI.o dependency and CUDA_HOME --- apps/Makefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/apps/Makefile b/apps/Makefile index f96c4f7..08a0761 100644 --- a/apps/Makefile +++ b/apps/Makefile @@ -1,10 +1,10 @@ # Flags for the Chapel compiler -CHPLFLAGS=--fast -sverbose -sdebugGPUIterator +CHPLFLAGS=--fast -sverbose CHPLMODULE=../../src -GPUAPIFLAGS=$(CHPLMODULE)/GPUAPI.h GPUAPI.o +GPUAPIFLAGS=-sdebugGPUIterator $(CHPLMODULE)/GPUAPI.h GPUAPI.o # For CUDA -CUDALIBSFLAGS=-L/usr/local/cuda/lib64 -lcudart -lcuda +CUDALIBSFLAGS=-L$(CUDA_HOME)/lib64 -lcudart -lcuda #CUDALIBSFLAGS=-L/sw/summit/cuda/10.1.168/lib64 -lcudart -lcuda #CUDALIBSFLAGS=-L/opt/apps/software/Compiler/GCC/6.4.0/CUDA/8.0.61/lib -lcudart -lcublas NVCCFLAGS=-O3 -arch sm_37 
-std=c++11 --extended-lambda @@ -42,11 +42,11 @@ cudagpu: $(TARGET).o $(TARGET).gpu.chpl chpl $(CHPLFLAGS) $(TARGET).o $(TARGET).gpu.chpl $(CUDALIBSFLAGS) .PHONY: cudahybrid -cudahybrid: $(TARGET).o $(TARGET).hybrid.chpl +cudahybrid: GPUAPI.o $(TARGET).o $(TARGET).hybrid.chpl chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).o $(GPUAPIFLAGS) $(TARGET).hybrid.chpl $(CUDALIBSFLAGS) .PHONY: cudahybrid.dist -cudahybrid.dist: $(TARGET).o $(TARGET).hybrid.dist.chpl +cudahybrid.dist: GPUAPI.o $(TARGET).o $(TARGET).hybrid.dist.chpl chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.chpl $(CUDALIBSFLAGS) .PHONY: cudahybrid.dist.explicit From e43db1b2b7c9ef626da61dd12926dbbe178cf530 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Mon, 11 May 2020 12:56:18 -0400 Subject: [PATCH 018/118] Create a dedicated header for call_gpu_functor --- apps/Makefile | 2 +- apps/vector_copy/vc.kernel.cu | 18 +----------------- src/lambda.h | 17 +++++++++++++++++ 3 files changed, 19 insertions(+), 18 deletions(-) create mode 100644 src/lambda.h diff --git a/apps/Makefile b/apps/Makefile index 08a0761..100ab65 100644 --- a/apps/Makefile +++ b/apps/Makefile @@ -7,7 +7,7 @@ GPUAPIFLAGS=-sdebugGPUIterator $(CHPLMODULE)/GPUAPI.h GPUAPI.o CUDALIBSFLAGS=-L$(CUDA_HOME)/lib64 -lcudart -lcuda #CUDALIBSFLAGS=-L/sw/summit/cuda/10.1.168/lib64 -lcudart -lcuda #CUDALIBSFLAGS=-L/opt/apps/software/Compiler/GCC/6.4.0/CUDA/8.0.61/lib -lcudart -lcublas -NVCCFLAGS=-O3 -arch sm_37 -std=c++11 --extended-lambda +NVCCFLAGS=-O3 -arch sm_37 -std=c++11 --extended-lambda -I$(CHPLMODULE) # For OpenCL OCLLIBSFLAGS=-framework OpenCL diff --git a/apps/vector_copy/vc.kernel.cu b/apps/vector_copy/vc.kernel.cu index faf21b1..c82e7f0 100644 --- a/apps/vector_copy/vc.kernel.cu +++ b/apps/vector_copy/vc.kernel.cu @@ -6,23 +6,7 @@ __global__ void vc(float *dA, float *dB, int N) { } } #else -template -static __global__ void driver_kernel(functor_type functor, unsigned niters) { - const int tid = 
blockIdx.x * blockDim.x + threadIdx.x; - if (tid < niters) { - functor(tid); - } -} - -template -inline void call_gpu_functor(unsigned niters, unsigned tile_size, - cudaStream_t stream, functor_type functor) { - //functor_type *actual = (functor_type *)functor; - - const unsigned block_size = tile_size; - const unsigned nblocks = (niters + block_size - 1) / block_size; - driver_kernel<<>>(functor, niters); -} +#include "lambda.h" #endif extern "C" { diff --git a/src/lambda.h b/src/lambda.h new file mode 100644 index 0000000..242c40d --- /dev/null +++ b/src/lambda.h @@ -0,0 +1,17 @@ +template +static __global__ void driver_kernel(functor_type functor, unsigned niters) { + const int tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < niters) { + functor(tid); + } +} + +template +inline void call_gpu_functor(unsigned niters, unsigned tile_size, + cudaStream_t stream, functor_type functor) { + //functor_type *actual = (functor_type *)functor; + + const unsigned block_size = tile_size; + const unsigned nblocks = (niters + block_size - 1) / block_size; + driver_kernel<<>>(functor, niters); +} From 0dd5562cd20c5e3ed91f7a7ba782ec87344a79c6 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Mon, 11 May 2020 14:13:43 -0400 Subject: [PATCH 019/118] Add Free --- src/GPUAPI.chpl | 4 +--- src/GPUAPI.cu | 3 +++ src/GPUAPI.h | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/GPUAPI.chpl b/src/GPUAPI.chpl index 9a9a6c5..97ac31b 100644 --- a/src/GPUAPI.chpl +++ b/src/GPUAPI.chpl @@ -27,7 +27,5 @@ module GPUAPI { extern proc Malloc(ref devPtr: c_void_ptr, size: size_t); extern proc Memcpy(dst: c_void_ptr, src: c_void_ptr, count: size_t, kind: int); - extern proc Launch(arg1: c_void_ptr, arg2: c_void_ptr, size: size_t); - extern proc LaunchStream(arg1: c_void_ptr, arg2: c_void_ptr, arg3: c_void_ptr, alpha: real(32), size: size_t); - + extern proc Free(devPtr: c_void_ptr); } \ No newline at end of file diff --git a/src/GPUAPI.cu b/src/GPUAPI.cu index 
791a8cc..b2cedf4 100644 --- a/src/GPUAPI.cu +++ b/src/GPUAPI.cu @@ -83,4 +83,7 @@ extern "C" { printf("Warning\n"); } } + void Free(void* devPtr) { + CudaSafeCall(cudaFree(devPtr)); + } } diff --git a/src/GPUAPI.h b/src/GPUAPI.h index b1e5534..441b0e1 100644 --- a/src/GPUAPI.h +++ b/src/GPUAPI.h @@ -8,4 +8,5 @@ void ProfilerStart(); void ProfilerStop(); void Malloc(void**, size_t); void Memcpy(void*, void*, size_t, int); +void Free(void*); #endif From ef70ae5c9198ea56ed00c08cbeed8bfcfb042f04 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Mon, 11 May 2020 14:15:13 -0400 Subject: [PATCH 020/118] Add Free --- apps/vector_copy/vc.hybrid.dist.explicit.chpl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/vector_copy/vc.hybrid.dist.explicit.chpl b/apps/vector_copy/vc.hybrid.dist.explicit.chpl index 7733fe7..77c898f 100644 --- a/apps/vector_copy/vc.hybrid.dist.explicit.chpl +++ b/apps/vector_copy/vc.hybrid.dist.explicit.chpl @@ -53,6 +53,8 @@ proc CUDAWrapper(lo: int, hi: int, N: int) { Memcpy(dB, c_ptrTo(lB), size, 0); LaunchVC(dA, dB, size); Memcpy(c_ptrTo(lA), dA, size, 1); + Free(dA); + Free(dB); ProfilerStop(); //vcCUDA(lA, lB, 0, hi-lo, N); From 1f883b766f2992e8042a685c7aaad899777d014d Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Mon, 11 May 2020 14:16:33 -0400 Subject: [PATCH 021/118] Add explicit version bs (incomplete) --- apps/blackscholes/bs.hybrid.dist.chpl | 4 +- .../blackscholes/bs.hybrid.dist.explicit.chpl | 209 ++++++++++++++++++ apps/blackscholes/bs.kernel.cu | 163 ++++++++++++++ apps/blackscholes/bs.kernel.h | 1 + 4 files changed, 375 insertions(+), 2 deletions(-) create mode 100644 apps/blackscholes/bs.hybrid.dist.explicit.chpl create mode 100644 apps/blackscholes/bs.kernel.cu create mode 100644 apps/blackscholes/bs.kernel.h diff --git a/apps/blackscholes/bs.hybrid.dist.chpl b/apps/blackscholes/bs.hybrid.dist.chpl index d2f5ef2..f486fb0 100644 --- a/apps/blackscholes/bs.hybrid.dist.chpl +++ b/apps/blackscholes/bs.hybrid.dist.chpl 
@@ -182,8 +182,8 @@ proc main() { } execTimes(trial) = getCurrentTime() - startTime; if (output) { - writeln(call); - writeln(put); + writeln("call: ", call); + writeln("put: ", put); } } printResults(execTimes); diff --git a/apps/blackscholes/bs.hybrid.dist.explicit.chpl b/apps/blackscholes/bs.hybrid.dist.explicit.chpl new file mode 100644 index 0000000..4c11ac4 --- /dev/null +++ b/apps/blackscholes/bs.hybrid.dist.explicit.chpl @@ -0,0 +1,209 @@ +use Time; + +//////////////////////////////////////////////////////////////////////////////// +/// GPUIterator +//////////////////////////////////////////////////////////////////////////////// +use GPUIterator; + +//////////////////////////////////////////////////////////////////////////////// +/// Runtime Options +//////////////////////////////////////////////////////////////////////////////// +config const n = 32: int; +config const CPUratio = 0: int; +config const numTrials = 1: int; +config const output = 0: int; +config param verbose = false; + +//////////////////////////////////////////////////////////////////////////////// +/// Global Arrays +//////////////////////////////////////////////////////////////////////////////// +// For now, these arrays are global so the arrays can be seen from CUDAWrapper +// TODO: Explore the possiblity of declaring the arrays and CUDAWrapper +// in the main proc (e.g., by using lambdas) +var D: domain(1) dmapped Block(boundingBox = {1..n}) = {1..n}; +var rand: [D] real(32); +var put: [D] real(32); +var call: [D] real(32); + +//////////////////////////////////////////////////////////////////////////////// +/// C Interoperability +//////////////////////////////////////////////////////////////////////////////// +extern proc LaunchBS(drand: c_void_ptr, dput: c_void_ptr, dcall: c_void_ptr, N: size_t); + +// CUDAWrapper is called from GPUIterator +// to invoke a specific CUDA program (using C interoperability) +proc CUDAWrapper(lo: int, hi: int, N: int) { + if (verbose) { + writeln("In 
CUDAWrapper(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, ")"); + } + ref lrand = rand.localSlice(lo .. hi); + ref lput = put.localSlice(lo .. hi); + ref lcall = call.localSlice(lo .. hi); + writeln(lrand, lput, lcall); + + ProfilerStart(); + var drand, dput, dcall: c_void_ptr; + var size: size_t = (lrand.size:size_t * c_sizeof(lrand.eltType)) : size_t; + Malloc(drand, size); + Malloc(dput, size); + Malloc(dcall, size); + Memcpy(drand, c_ptrTo(lrand), size, 0); + LaunchBS(drand, dput, dcall, size); + Memcpy(c_ptrTo(lput), dput, size, 1); + Memcpy(c_ptrTo(lcall), dcall, size, 1); + + Free(drand); + Free(dput); + Free(dcall); + + ProfilerStop(); + + // bsCUDA(lrand, lput, lcall, 0, hi-lo, N); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Utility Functions +//////////////////////////////////////////////////////////////////////////////// +proc printResults(execTimes) { + const totalTime = + reduce execTimes, + avgTime = totalTime / numTrials, + minTime = min reduce execTimes; + writeln("Execution time:"); + writeln(" tot = ", totalTime); + writeln(" avg = ", avgTime); + writeln(" min = ", minTime); +} + +proc printLocaleInfo() { + for loc in Locales { + writeln(loc, " info: "); + const numSublocs = loc.getChildCount(); + if (numSublocs != 0) { + for sublocID in 0..#numSublocs { + const subloc = loc.getChild(sublocID); + writeln("\t Subloc: ", sublocID); + writeln("\t Name: ", subloc); + writeln("\t maxTaskPar: ", subloc.maxTaskPar); + } + } else { + writeln("\t Name: ", loc); + writeln("\t maxTaskPar: ", loc.maxTaskPar); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Chapel main +//////////////////////////////////////////////////////////////////////////////// +proc main() { + writeln("BlackScholes: CPU/GPU Execution (using GPUIterator)"); + writeln("Size: ", n); + writeln("nTrials: ", numTrials); + writeln("output: ", output); + + 
printLocaleInfo(); + + const S_LOWER_LIMIT = 10.0: real(32); + const S_UPPER_LIMIT = 100.0: real(32); + const K_LOWER_LIMIT = 10.0: real(32); + const K_UPPER_LIMIT = 100.0: real(32); + const T_LOWER_LIMIT = 1.0: real(32); + const T_UPPER_LIMIT = 10.0: real(32); + const R_LOWER_LIMIT = 0.01: real(32); + const R_UPPER_LIMIT = 0.05: real(32); + const SIGMA_LOWER_LIMIT = 0.01: real(32); + const SIGMA_UPPER_LIMIT = 0.10: real(32); + + var execTimes: [1..numTrials] real; + for trial in 1..numTrials { + for i in 1..n { + rand(i) = (i: real(32) / n): real(32); + } + + const startTime = getCurrentTime(); + forall i in GPU(D, CUDAWrapper, CPUratio) { + var c1 = 0.319381530: real(32); + var c2 = -0.356563782: real(32); + var c3 = 1.781477937: real(32); + var c4 = -1.821255978: real(32); + var c5 = 1.330274429: real(32); + + var zero = 0.0: real(32); + var one = 1.0: real(32); + var two = 2.0: real(32); + var temp4 = 0.2316419: real(32); + + var oneBySqrt2pi = 0.398942280: real(32); + + var inRand = rand(i); + + var S = S_LOWER_LIMIT * inRand + S_UPPER_LIMIT * (1.0 - inRand); + var K = K_LOWER_LIMIT * inRand + K_UPPER_LIMIT * (1.0 - inRand); + var T = T_LOWER_LIMIT * inRand + T_UPPER_LIMIT * (1.0 - inRand); + var R = R_LOWER_LIMIT * inRand + R_UPPER_LIMIT * (1.0 - inRand); + var sigmaVal = SIGMA_LOWER_LIMIT * inRand + SIGMA_UPPER_LIMIT * (1.0 - inRand); + + var sigmaSqrtT = sigmaVal * sqrt(T); + + var d1 = (log(S / K) + (R + sigmaVal * sigmaVal / two) * T) / sigmaSqrtT; + var d2 = d1 - sigmaSqrtT; + + var KexpMinusRT = K * exp(-R * T); + + var phiD1, phiD2: real(32); + + // phiD1 = phi(d1) + var X = d1; + var absX = abs(X); + var t = one / (one + temp4 * absX); + var y = one - oneBySqrt2pi * Math.exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); + if (X < zero) { + phiD1 = one - y; + } else { + phiD1 = y; + } + // phiD2 = phi(d2) + X = d2; + absX = Math.abs(X); + t = one / (one + temp4 * absX); + y = one - oneBySqrt2pi * exp(-X * X / two) * t * (c1 + t * 
(c2 + t * (c3 + t * (c4 + t * c5)))); + if (X < zero) { + phiD2 = one - y; + } else { + phiD2 = y; + } + + call(i) = S * phiD1 - KexpMinusRT * phiD2; + + // phiD1 = phi(-d1); + X = -d1; + absX = Math.abs(X); + t = one / (one + temp4 * absX); + y = one - oneBySqrt2pi * exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); + if (X < zero) { + phiD1 = one - y; + } else { + phiD1 = y; + } + + // phiD2 = phi(-d2); + X = -d2; + absX = Math.abs(X); + t = one / (one + temp4 * absX); + y = one - oneBySqrt2pi * exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); + if (X < zero) { + phiD2 = one - y; + } else { + phiD2 = y; + } + + put(i) = KexpMinusRT * phiD2 - S * phiD1; + } + execTimes(trial) = getCurrentTime() - startTime; + if (output) { + writeln("call: ", call); + writeln("put: ", put); + } + } + printResults(execTimes); +} diff --git a/apps/blackscholes/bs.kernel.cu b/apps/blackscholes/bs.kernel.cu new file mode 100644 index 0000000..7fea017 --- /dev/null +++ b/apps/blackscholes/bs.kernel.cu @@ -0,0 +1,163 @@ +#define S_LOWER_LIMIT 10.0f +#define S_UPPER_LIMIT 100.0f +#define K_LOWER_LIMIT 10.0f +#define K_UPPER_LIMIT 100.0f +#define T_LOWER_LIMIT 1.0f +#define T_UPPER_LIMIT 10.0f +#define R_LOWER_LIMIT 0.01f +#define R_UPPER_LIMIT 0.05f +#define SIGMA_LOWER_LIMIT 0.01f +#define SIGMA_UPPER_LIMIT 0.10f + +#ifndef USE_LAMBDA +__global__ void bs(float *drand, float *dput, float *dcall, int n) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + if (id < n) { + float c1 = 0.319381530f; + float c2 = -0.356563782f; + float c3 = 1.781477937f; + float c4 = -1.821255978f; + float c5 = 1.330274429f; + + float zero = 0.0f; + float one = 1.0f; + float two = 2.0f; + float temp4 = 0.2316419f; + + float oneBySqrt2pi = 0.398942280f; + + float d1, d2; + float phiD1, phiD2; + float sigmaSqrtT; + float KexpMinusRT; + + float inRand; + + inRand = drand[id]; + + float S = S_LOWER_LIMIT * inRand + S_UPPER_LIMIT * (1.0f - inRand); + float K = 
K_LOWER_LIMIT * inRand + K_UPPER_LIMIT * (1.0f - inRand); + float T = T_LOWER_LIMIT * inRand + T_UPPER_LIMIT * (1.0f - inRand); + float R = R_LOWER_LIMIT * inRand + R_UPPER_LIMIT * (1.0f - inRand); + float sigmaVal = SIGMA_LOWER_LIMIT * inRand + SIGMA_UPPER_LIMIT * (1.0f - inRand); + + sigmaSqrtT = sigmaVal * (float)sqrt(T); + + d1 = ((float)log(S / K) + (R + sigmaVal * sigmaVal / two) * T) / sigmaSqrtT; + d2 = d1 - sigmaSqrtT; + + KexpMinusRT = K * (float)exp(-R * T); + + // phiD1 = phi(d1) + float X = d1; + float absX = (float)abs(X); + float t = one / (one + temp4 * absX); + float y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); + phiD1 = (X < zero) ? (one - y) : y; + // phiD2 = phi(d2) + X = d2; + absX = abs(X); + t = one / (one + temp4 * absX); + y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); + phiD2 = (X < zero) ? (one - y) : y; + + dcall[id] = S * phiD1 - KexpMinusRT * phiD2; + + // phiD1 = phi(-d1); + X = -d1; + absX = abs(X); + t = one / (one + temp4 * absX); + y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); + phiD1 = (X < zero) ? (one - y) : y; + + // phiD2 = phi(-d2); + X = -d2; + absX = abs(X); + t = one / (one + temp4 * absX); + y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); + phiD2 = (X < zero) ? 
(one - y) : y; + + dput[id] = KexpMinusRT * phiD2 - S * phiD1; + } +} +#else +#include "lambda.h" +#endif + +extern "C" { +#ifndef USE_LAMBDA + void LaunchBS(float* drand, float *dput, float *dcall, int N) { + bs<<>>(drand, dput, dcall, N); + } +#else + void LaunchBS(float* drand, float *dput, float *dcall, int N) { + call_gpu_functor(N, 1024, NULL, [=] __device__ (int id) { + float c1 = 0.319381530f; + float c2 = -0.356563782f; + float c3 = 1.781477937f; + float c4 = -1.821255978f; + float c5 = 1.330274429f; + + float zero = 0.0f; + float one = 1.0f; + float two = 2.0f; + float temp4 = 0.2316419f; + + float oneBySqrt2pi = 0.398942280f; + + float d1, d2; + float phiD1, phiD2; + float sigmaSqrtT; + float KexpMinusRT; + + float inRand; + + inRand = drand[id]; + + float S = S_LOWER_LIMIT * inRand + S_UPPER_LIMIT * (1.0f - inRand); + float K = K_LOWER_LIMIT * inRand + K_UPPER_LIMIT * (1.0f - inRand); + float T = T_LOWER_LIMIT * inRand + T_UPPER_LIMIT * (1.0f - inRand); + float R = R_LOWER_LIMIT * inRand + R_UPPER_LIMIT * (1.0f - inRand); + float sigmaVal = SIGMA_LOWER_LIMIT * inRand + SIGMA_UPPER_LIMIT * (1.0f - inRand); + + sigmaSqrtT = sigmaVal * (float)sqrt(T); + + d1 = ((float)log(S / K) + (R + sigmaVal * sigmaVal / two) * T) / sigmaSqrtT; + d2 = d1 - sigmaSqrtT; + + KexpMinusRT = K * (float)exp(-R * T); + + // phiD1 = phi(d1) + float X = d1; + float absX = (float)abs(X); + float t = one / (one + temp4 * absX); + float y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); + phiD1 = (X < zero) ? (one - y) : y; + // phiD2 = phi(d2) + X = d2; + absX = abs(X); + t = one / (one + temp4 * absX); + y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); + phiD2 = (X < zero) ? 
(one - y) : y; + + dcall[id] = S * phiD1 - KexpMinusRT * phiD2; + + // phiD1 = phi(-d1); + X = -d1; + absX = abs(X); + t = one / (one + temp4 * absX); + y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); + phiD1 = (X < zero) ? (one - y) : y; + + // phiD2 = phi(-d2); + X = -d2; + absX = abs(X); + t = one / (one + temp4 * absX); + y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); + phiD2 = (X < zero) ? (one - y) : y; + + dput[id] = KexpMinusRT * phiD2 - S * phiD1; + }); +} +#endif +} diff --git a/apps/blackscholes/bs.kernel.h b/apps/blackscholes/bs.kernel.h new file mode 100644 index 0000000..83276a7 --- /dev/null +++ b/apps/blackscholes/bs.kernel.h @@ -0,0 +1 @@ +void LaunchBS(float* drand, float *dput, float *dcall, int N); From 5360478a61bca7a3d7a76daec48d1bbde21356c8 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Mon, 11 May 2020 17:11:25 -0400 Subject: [PATCH 022/118] Update explicit version bs (now work) --- apps/blackscholes/bs.hybrid.dist.explicit.chpl | 9 ++++++--- apps/blackscholes/bs.kernel.cu | 2 +- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/apps/blackscholes/bs.hybrid.dist.explicit.chpl b/apps/blackscholes/bs.hybrid.dist.explicit.chpl index 4c11ac4..25198a3 100644 --- a/apps/blackscholes/bs.hybrid.dist.explicit.chpl +++ b/apps/blackscholes/bs.hybrid.dist.explicit.chpl @@ -39,16 +39,18 @@ proc CUDAWrapper(lo: int, hi: int, N: int) { ref lrand = rand.localSlice(lo .. hi); ref lput = put.localSlice(lo .. hi); ref lcall = call.localSlice(lo .. 
hi); - writeln(lrand, lput, lcall); ProfilerStart(); - var drand, dput, dcall: c_void_ptr; + var drand: c_void_ptr; + var dput: c_void_ptr; + var dcall: c_void_ptr; var size: size_t = (lrand.size:size_t * c_sizeof(lrand.eltType)) : size_t; Malloc(drand, size); Malloc(dput, size); Malloc(dcall, size); Memcpy(drand, c_ptrTo(lrand), size, 0); - LaunchBS(drand, dput, dcall, size); + LaunchBS(drand, dput, dcall, N:size_t); + DeviceSynchronize(); Memcpy(c_ptrTo(lput), dput, size, 1); Memcpy(c_ptrTo(lcall), dcall, size, 1); @@ -202,6 +204,7 @@ proc main() { execTimes(trial) = getCurrentTime() - startTime; if (output) { writeln("call: ", call); + writeln(""); writeln("put: ", put); } } diff --git a/apps/blackscholes/bs.kernel.cu b/apps/blackscholes/bs.kernel.cu index 7fea017..22125c6 100644 --- a/apps/blackscholes/bs.kernel.cu +++ b/apps/blackscholes/bs.kernel.cu @@ -62,7 +62,7 @@ __global__ void bs(float *drand, float *dput, float *dcall, int n) { phiD2 = (X < zero) ? (one - y) : y; dcall[id] = S * phiD1 - KexpMinusRT * phiD2; - + // phiD1 = phi(-d1); X = -d1; absX = abs(X); From ba8398ef627b18bada5484f22b009b4edac4bb3e Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Mon, 11 May 2020 17:13:26 -0400 Subject: [PATCH 023/118] Update vc.hybrid.dist.explicit --- apps/vector_copy/vc.hybrid.dist.explicit.chpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/vector_copy/vc.hybrid.dist.explicit.chpl b/apps/vector_copy/vc.hybrid.dist.explicit.chpl index 77c898f..919d3be 100644 --- a/apps/vector_copy/vc.hybrid.dist.explicit.chpl +++ b/apps/vector_copy/vc.hybrid.dist.explicit.chpl @@ -51,7 +51,7 @@ proc CUDAWrapper(lo: int, hi: int, N: int) { Malloc(dA, size); Malloc(dB, size); Memcpy(dB, c_ptrTo(lB), size, 0); - LaunchVC(dA, dB, size); + LaunchVC(dA, dB, N:size_t); Memcpy(c_ptrTo(lA), dA, size, 1); Free(dA); Free(dB); From b8280d5d55f5a2a5aaed50a05f091e8ef3efcbae Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Mon, 11 May 2020 17:15:19 -0400 Subject: 
[PATCH 024/118] Update stream.hybrid.dist.explicit --- apps/stream/stream.hybrid.dist.explicit.chpl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/stream/stream.hybrid.dist.explicit.chpl b/apps/stream/stream.hybrid.dist.explicit.chpl index 65db44f..3ee5b28 100644 --- a/apps/stream/stream.hybrid.dist.explicit.chpl +++ b/apps/stream/stream.hybrid.dist.explicit.chpl @@ -58,7 +58,7 @@ proc CUDAWrapper(lo: int, hi: int, N: int) { Malloc(dC, size); Memcpy(dB, c_ptrTo(lB), size, 0); Memcpy(dC, c_ptrTo(lC), size, 0); - LaunchStream(dA, dB, dC, alpha, size); + LaunchStream(dA, dB, dC, alpha, N:size_t); Memcpy(c_ptrTo(lA), dA, size, 1); ProfilerStop(); From b35e867593d8c9b0f5fcb773c345496f984bddfb Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Mon, 11 May 2020 17:15:57 -0400 Subject: [PATCH 025/118] Update GPUAPI --- src/GPUAPI.chpl | 2 ++ src/GPUAPI.cu | 15 +++++++++++---- src/GPUAPI.h | 1 + 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/src/GPUAPI.chpl b/src/GPUAPI.chpl index 97ac31b..8c3b932 100644 --- a/src/GPUAPI.chpl +++ b/src/GPUAPI.chpl @@ -25,6 +25,8 @@ module GPUAPI { extern proc ProfilerStart(); extern proc ProfilerStop(); + extern proc DeviceSynchronize(); + extern proc Malloc(ref devPtr: c_void_ptr, size: size_t); extern proc Memcpy(dst: c_void_ptr, src: c_void_ptr, count: size_t, kind: int); extern proc Free(devPtr: c_void_ptr); diff --git a/src/GPUAPI.cu b/src/GPUAPI.cu index b2cedf4..b8b06bd 100644 --- a/src/GPUAPI.cu +++ b/src/GPUAPI.cu @@ -67,6 +67,11 @@ extern "C" { CudaSafeCall(cudaProfilerStop()); } + void DeviceSynchronize() { + CudaCheckError(); + CudaSafeCall(cudaDeviceSynchronize()); + } + void Malloc(void** devPtr, size_t size) { CudaSafeCall(cudaMalloc(devPtr, size)); } @@ -80,10 +85,12 @@ extern "C" { CudaSafeCall(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost)); break; default: - printf("Warning\n"); + printf("Fatal: Wrong Memcpy kind!\n"); + exit(1); } } - void Free(void* devPtr) { - 
CudaSafeCall(cudaFree(devPtr)); - } + + void Free(void* devPtr) { + CudaSafeCall(cudaFree(devPtr)); + } } diff --git a/src/GPUAPI.h b/src/GPUAPI.h index 441b0e1..9a7311c 100644 --- a/src/GPUAPI.h +++ b/src/GPUAPI.h @@ -6,6 +6,7 @@ void GetDevice(int*); void SetDevice(int); void ProfilerStart(); void ProfilerStop(); +void DeviceSynchronize(); void Malloc(void**, size_t); void Memcpy(void*, void*, size_t, int); void Free(void*); From bcdb79cdfccf72abdc68133d21d7b69b6f42ecd7 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Mon, 11 May 2020 17:19:09 -0400 Subject: [PATCH 026/118] Add DeviceSynchronize --- apps/stream/stream.hybrid.dist.explicit.chpl | 3 ++- apps/vector_copy/vc.hybrid.dist.explicit.chpl | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/apps/stream/stream.hybrid.dist.explicit.chpl b/apps/stream/stream.hybrid.dist.explicit.chpl index 3ee5b28..6082e06 100644 --- a/apps/stream/stream.hybrid.dist.explicit.chpl +++ b/apps/stream/stream.hybrid.dist.explicit.chpl @@ -58,7 +58,8 @@ proc CUDAWrapper(lo: int, hi: int, N: int) { Malloc(dC, size); Memcpy(dB, c_ptrTo(lB), size, 0); Memcpy(dC, c_ptrTo(lC), size, 0); - LaunchStream(dA, dB, dC, alpha, N:size_t); + LaunchStream(dA, dB, dC, alpha, N: size_t); + DeviceSynchronize(); Memcpy(c_ptrTo(lA), dA, size, 1); ProfilerStop(); diff --git a/apps/vector_copy/vc.hybrid.dist.explicit.chpl b/apps/vector_copy/vc.hybrid.dist.explicit.chpl index 919d3be..2e73472 100644 --- a/apps/vector_copy/vc.hybrid.dist.explicit.chpl +++ b/apps/vector_copy/vc.hybrid.dist.explicit.chpl @@ -51,7 +51,8 @@ proc CUDAWrapper(lo: int, hi: int, N: int) { Malloc(dA, size); Malloc(dB, size); Memcpy(dB, c_ptrTo(lB), size, 0); - LaunchVC(dA, dB, N:size_t); + LaunchVC(dA, dB, N: size_t); + DeviceSynchronize(); Memcpy(c_ptrTo(lA), dA, size, 1); Free(dA); Free(dB); From 7f416e39cb578b097b247c2d64efc5b3df63d46b Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Tue, 12 May 2020 15:32:59 -0400 Subject: [PATCH 027/118] Add an 
initial version of MID API --- apps/Makefile | 10 +- .../vc.hybrid.dist.explicit.mid.chpl | 127 ++++++++++++++++++ src/GPUAPI.chpl | 76 +++++++++++ 3 files changed, 212 insertions(+), 1 deletion(-) create mode 100644 apps/vector_copy/vc.hybrid.dist.explicit.mid.chpl diff --git a/apps/Makefile b/apps/Makefile index 100ab65..4d60488 100644 --- a/apps/Makefile +++ b/apps/Makefile @@ -1,7 +1,7 @@ # Flags for the Chapel compiler CHPLFLAGS=--fast -sverbose CHPLMODULE=../../src -GPUAPIFLAGS=-sdebugGPUIterator $(CHPLMODULE)/GPUAPI.h GPUAPI.o +GPUAPIFLAGS=-sdebugGPUAPI -sdebugGPUIterator $(CHPLMODULE)/GPUAPI.h GPUAPI.o # For CUDA CUDALIBSFLAGS=-L$(CUDA_HOME)/lib64 -lcudart -lcuda @@ -56,6 +56,14 @@ cudahybrid.dist.explicit: GPUAPI.o $(TARGET).kernel.cu $(TARGET).hybrid.dist.exp chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).kernel.h $(TARGET).kernel.o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.explicit.chpl $(CUDALIBSFLAGS) -o $(TARGET).hybrid.dist.explicit.kernel chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).kernel.h $(TARGET).lambda.o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.explicit.chpl $(CUDALIBSFLAGS) -o $(TARGET).hybrid.dist.explicit.lambda +.PHONY: cudahybrid.dist.explicit.mid +cudahybrid.dist.explicit.mid: GPUAPI.o $(TARGET).kernel.cu $(TARGET).hybrid.dist.explicit.mid.chpl + nvcc $(NVCCFLAGS) -DUSE_LAMBDA -c $(TARGET).kernel.cu -o $(TARGET).lambda.o + nvcc $(NVCCFLAGS) -c $(TARGET).kernel.cu -o $(TARGET).kernel.o + chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).kernel.h $(TARGET).kernel.o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.explicit.mid.chpl $(CUDALIBSFLAGS) -o $(TARGET).hybrid.dist.explicit.mid.kernel + chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).kernel.h $(TARGET).lambda.o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.explicit.mid.chpl $(CUDALIBSFLAGS) -o $(TARGET).hybrid.dist.explicit.mid.lambda + + .PHONY: openclgpu oclgpu: $(TARGET).opencl.o $(TARGET).gpu.chpl chpl $(CHPLFLAGS) $(TARGET).opencl.o $(TARGET).gpu.chpl --ldflags $(OCLLIBSFLAGS) diff --git 
a/apps/vector_copy/vc.hybrid.dist.explicit.mid.chpl b/apps/vector_copy/vc.hybrid.dist.explicit.mid.chpl new file mode 100644 index 0000000..e17cf1f --- /dev/null +++ b/apps/vector_copy/vc.hybrid.dist.explicit.mid.chpl @@ -0,0 +1,127 @@ +use Time; + +//////////////////////////////////////////////////////////////////////////////// +/// GPUIterator +//////////////////////////////////////////////////////////////////////////////// +use GPUIterator; +use GPUAPI; +use BlockDist; +use SysCTypes; + +//////////////////////////////////////////////////////////////////////////////// +/// Runtime Options +//////////////////////////////////////////////////////////////////////////////// +config const n = 32: int; +config const CPUratio = 0: int; +config const numTrials = 1: int; +config const output = 0: int; +config param verbose = false; + +//////////////////////////////////////////////////////////////////////////////// +/// Global Arrays +//////////////////////////////////////////////////////////////////////////////// +// For now, these arrays are global so the arrays can be seen from CUDAWrapper +// TODO: Explore the possiblity of declaring the arrays and CUDAWrapper +// in the main proc (e.g., by using lambdas) +var D: domain(1) dmapped Block(boundingBox = {1..n}) = {1..n}; +var A: [D] real(32); +var B: [D] real(32); + +//////////////////////////////////////////////////////////////////////////////// +/// C Interoperability +//////////////////////////////////////////////////////////////////////////////// +extern proc LaunchVC(A: c_void_ptr, B: c_void_ptr, N: size_t); + +// CUDAWrapper is called from GPUIterator +// to invoke a specific CUDA program (using C interoperability) +proc CUDAWrapper(lo: int, hi: int, N: int) { + if (verbose) { + var device, count: int(32); + GetDevice(device); + GetDeviceCount(count); + writeln("In CUDAWrapper(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, "), GPU", device, " of ", count, " @", here); + } + ref lA = 
A.localSlice(lo .. hi); + ref lB = B.localSlice(lo .. hi); + writeln("localSlice Size:", lA.size); + ProfilerStart(); + var dA = new GPUArray(lA, h2d=false, d2h=true); + var dB = new GPUArray(lB, h2d=true, d2h=false); + toDevice(dA, dB); + LaunchVC(dA.dPtr(), dB.dPtr(), N: size_t); + DeviceSynchronize(); + fromDevice(dA, dB); + free(dA, dB); + ProfilerStop(); + + //vcCUDA(lA, lB, 0, hi-lo, N); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Utility Functions +//////////////////////////////////////////////////////////////////////////////// +proc printResults(execTimes) { + const totalTime = + reduce execTimes, + avgTime = totalTime / numTrials, + minTime = min reduce execTimes; + writeln("Execution time:"); + writeln(" tot = ", totalTime); + writeln(" avg = ", avgTime); + writeln(" min = ", minTime); +} + +proc printLocaleInfo() { + for loc in Locales { + writeln(loc, " info: "); + const numSublocs = loc.getChildCount(); + if (numSublocs != 0) { + for sublocID in 0..#numSublocs { + const subloc = loc.getChild(sublocID); + writeln("\t Subloc: ", sublocID); + writeln("\t Name: ", subloc); + writeln("\t maxTaskPar: ", subloc.maxTaskPar); + } + } else { + writeln("\t Name: ", loc); + writeln("\t maxTaskPar: ", loc.maxTaskPar); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Chapel main +//////////////////////////////////////////////////////////////////////////////// +proc main() { + writeln("Vector Copy: CPU/GPU Execution (using GPUIterator)"); + writeln("Size: ", n); + writeln("CPU ratio: ", CPUratio); + writeln("nTrials: ", numTrials); + writeln("output: ", output); + + printLocaleInfo(); + + var execTimes: [1..numTrials] real; + for trial in 1..numTrials { + for i in 1..n { + A(i) = 0: real(32); + B(i) = i: real(32); + } + + const startTime = getCurrentTime(); + forall i in GPU(D, CUDAWrapper, CPUratio) { + A(i) = B(i); + } + execTimes(trial) = getCurrentTime() - 
startTime; + if (output) { + writeln(A); + } + for i in 1..n { + if (A(i) != B(i)) { + writeln("Verification Error"); + exit(); + } + } + } + writeln("Verified"); + printResults(execTimes); +} diff --git a/src/GPUAPI.chpl b/src/GPUAPI.chpl index 8c3b932..480fd86 100644 --- a/src/GPUAPI.chpl +++ b/src/GPUAPI.chpl @@ -18,6 +18,8 @@ module GPUAPI { use SysCTypes; + config param debugGPUAPI = false; + extern proc GetDeviceCount(ref count: int(32)); extern proc GetDevice(ref device: int(32)); extern proc SetDevice(device: int(32)); @@ -30,4 +32,78 @@ module GPUAPI { extern proc Malloc(ref devPtr: c_void_ptr, size: size_t); extern proc Memcpy(dst: c_void_ptr, src: c_void_ptr, count: size_t, kind: int); extern proc Free(devPtr: c_void_ptr); + + class GPUArray { + var h2d: bool; + var d2h: bool; + var devPtr: c_void_ptr; + var hosPtr: c_void_ptr; + var size: size_t; + var sizeInBytes: size_t; + + proc init(ref arr, h2d, d2h) { + // Properties + this.h2d = h2d; + this.d2h = d2h; + // Low-level info + this.devPtr = nil; + this.hosPtr = c_ptrTo(arr); + // size info + size = arr.size: size_t; + sizeInBytes = (((arr.size: size_t) * c_sizeof(arr.eltType)) : size_t); + this.complete(); + // allocation + Malloc(devPtr, sizeInBytes); + if (debugGPUAPI) { writeln("malloc'ed: ", devPtr, " sizeInBytes: ", sizeInBytes); } + } + + proc toDevice() { + if (this.h2d) { + Memcpy(this.dPtr(), this.hPtr(), this.sizeInBytes, 0); + if (debugGPUAPI) { writeln("h2d : ", this.hPtr(), " -> ", this.dPtr(), " transBytes: ", this.sizeInBytes); } + } else { + if (debugGPUAPI) { writeln("h2d ignored"); } + } + } + + proc fromDevice() { + if (this.d2h) { + Memcpy(this.hPtr(), this.dPtr(), this.sizeInBytes, 1); + if (debugGPUAPI) { writeln("d2h : ", this.dPtr(), " -> ", this.hPtr(), " transBytes: ", this.sizeInBytes); } + } else { + if (debugGPUAPI) { writeln("d2h ignored"); } + } + } + + proc free() { + Free(this.dPtr()); + if (debugGPUAPI) { writeln("free : ", this.dPtr()); } + } + + proc dPtr(): 
c_void_ptr { + return devPtr; + } + + proc hPtr(): c_void_ptr { + return hosPtr; + } + } + + proc toDevice(args: GPUArray ...?n) { + for ga in args { + ga.toDevice(); + } + } + + proc fromDevice(args: GPUArray ...?n) { + for ga in args { + ga.fromDevice(); + } + } + + proc free(args: GPUArray ...?n) { + for ga in args { + ga.free(); + } + } } \ No newline at end of file From 1e20acbf4e695400e142d5c198475d9c0fcde376 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Wed, 13 May 2020 00:40:40 -0400 Subject: [PATCH 028/118] Add distributed version of LR --- apps/Makefile | 4 +- apps/logisticregression/lr.h | 2 + apps/logisticregression/lr.hybrid.dist.chpl | 156 ++++++++++++++++++++ 3 files changed, 160 insertions(+), 2 deletions(-) create mode 100644 apps/logisticregression/lr.h create mode 100644 apps/logisticregression/lr.hybrid.dist.chpl diff --git a/apps/Makefile b/apps/Makefile index 4d60488..177d15d 100644 --- a/apps/Makefile +++ b/apps/Makefile @@ -4,7 +4,7 @@ CHPLMODULE=../../src GPUAPIFLAGS=-sdebugGPUAPI -sdebugGPUIterator $(CHPLMODULE)/GPUAPI.h GPUAPI.o # For CUDA -CUDALIBSFLAGS=-L$(CUDA_HOME)/lib64 -lcudart -lcuda +CUDALIBSFLAGS=-L$(CUDA_HOME)/lib64 -lcudart -lcuda -lcublas #CUDALIBSFLAGS=-L/sw/summit/cuda/10.1.168/lib64 -lcudart -lcuda #CUDALIBSFLAGS=-L/opt/apps/software/Compiler/GCC/6.4.0/CUDA/8.0.61/lib -lcudart -lcublas NVCCFLAGS=-O3 -arch sm_37 -std=c++11 --extended-lambda -I$(CHPLMODULE) @@ -47,7 +47,7 @@ cudahybrid: GPUAPI.o $(TARGET).o $(TARGET).hybrid.chpl .PHONY: cudahybrid.dist cudahybrid.dist: GPUAPI.o $(TARGET).o $(TARGET).hybrid.dist.chpl - chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.chpl $(CUDALIBSFLAGS) + chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).h $(TARGET).o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.chpl $(CUDALIBSFLAGS) .PHONY: cudahybrid.dist.explicit cudahybrid.dist.explicit: GPUAPI.o $(TARGET).kernel.cu $(TARGET).hybrid.dist.explicit.chpl diff --git a/apps/logisticregression/lr.h 
b/apps/logisticregression/lr.h new file mode 100644 index 0000000..7c1d28a --- /dev/null +++ b/apps/logisticregression/lr.h @@ -0,0 +1,2 @@ +void lrCUDA1(float *W, float *Wcurr, int start, int end, int GPUN); +void lrCUDA2(float* X, float *Y, float *W, float *Wcurr, float alpha, int nSamples, int nFeatures, int start, int end, int GPUN); diff --git a/apps/logisticregression/lr.hybrid.dist.chpl b/apps/logisticregression/lr.hybrid.dist.chpl new file mode 100644 index 0000000..c505319 --- /dev/null +++ b/apps/logisticregression/lr.hybrid.dist.chpl @@ -0,0 +1,156 @@ +use Time; +use ReplicatedDist; +//////////////////////////////////////////////////////////////////////////////// +/// GPUIterator +//////////////////////////////////////////////////////////////////////////////// +use GPUIterator; + +//////////////////////////////////////////////////////////////////////////////// +/// Runtime Options +//////////////////////////////////////////////////////////////////////////////// +config const nFeatures = 32: int; +config const nSamples = 32: int; +config const nIters = 32: int; +config const CPUPercent1 = 0: int; +config const CPUPercent2 = 0: int; +config const numTrials = 1: int; +config const output = 0: int; +config param verbose = false; + +//////////////////////////////////////////////////////////////////////////////// +/// Global Arrays +//////////////////////////////////////////////////////////////////////////////// +// For now, these arrays are global so the arrays can be seen from CUDAWrapper +// TODO: Explore the possiblity of declaring the arrays and CUDAWrapper +// in the main proc (e.g., by using lambdas) +const Space1 = {1..nSamples, 1..nFeatures}; +const ReplicatedSpace1 = Space1 dmapped Replicated(); +var X: [ReplicatedSpace1] real(32); + +const Space2 = {1..nSamples}; +const ReplicatedSpace2 = Space2 dmapped Replicated(); +var Y: [ReplicatedSpace2] real(32); + +const Space3 = {1..nFeatures}; +const ReplicatedSpace3 = Space3 dmapped Replicated(); +var 
Wcurr: [ReplicatedSpace3] real(32); + +var D: domain(1) dmapped Block(boundingBox = {1..nFeatures}) = {1..nFeatures}; +var W: [D] real(32); +var alpha = 0.1 : real(32); + +//////////////////////////////////////////////////////////////////////////////// +/// C Interoperability +//////////////////////////////////////////////////////////////////////////////// +extern proc lrCUDA1(W: [] real(32), Wcurr: [] real(32), lo: int, hi: int, N: int); +extern proc lrCUDA2(X: [] real(32), Y: [] real(32), W: [] real(32), Wcurr: [] real(32), alpha: real(32), nSamples: int, nFeatures: int, lo: int, hi: int, N: int); + +// CUDAWrapper is called from GPUIterator +// to invoke a specific CUDA program (using C interoperability) +proc CUDAWrapper1(lo: int, hi: int, N: int) { + if (verbose) { + writeln("In CUDAWrapper1(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, ")"); + } + lrCUDA1(W, Wcurr, lo, hi, N); +} + +proc CUDAWrapper2(lo: int, hi: int, N: int) { + if (verbose) { + writeln("In CUDAWrapper2(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, ")"); + } + ref lW = W.localSlice(lo .. 
hi); + lrCUDA2(X, Y, lW, Wcurr, alpha, nSamples, nFeatures, 0, hi-lo, N); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Utility Functions +//////////////////////////////////////////////////////////////////////////////// +proc printResults(execTimes) { + const totalTime = + reduce execTimes, + avgTime = totalTime / numTrials, + minTime = min reduce execTimes; + writeln("Execution time:"); + writeln(" tot = ", totalTime); + writeln(" avg = ", avgTime); + writeln(" min = ", minTime); +} + +proc printLocaleInfo() { + for loc in Locales { + writeln(loc, " info: "); + const numSublocs = loc.getChildCount(); + if (numSublocs != 0) { + for sublocID in 0..#numSublocs { + const subloc = loc.getChild(sublocID); + writeln("\t Subloc: ", sublocID); + writeln("\t Name: ", subloc); + writeln("\t maxTaskPar: ", subloc.maxTaskPar); + } + } else { + writeln("\t Name: ", loc); + writeln("\t maxTaskPar: ", loc.maxTaskPar); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Chapel main +//////////////////////////////////////////////////////////////////////////////// +proc main() { + writeln("Logistic Regression: CPU/GPU Execution (using GPUIterator)"); + writeln("nSamples :", nSamples, " nFeatures :", nFeatures); + writeln("CPU Percent1: ", CPUPercent1); + writeln("CPU Percent2: ", CPUPercent2); + writeln("nTrials: ", numTrials); + writeln("output: ", output); + + printLocaleInfo(); + + var execTimes: [1..numTrials] real; + for trial in 1..numTrials { + forall i in D { + W(i) = 0: real(32); + } + coforall loc in Locales do on loc { + for i in 1..nSamples { + Y(i) = (i % 2): real(32); + for j in 1..nFeatures { + if (j != 0) { + X(i, j) = (i % 2): real(32); + } else { + X(i, j) = 1; + } + } + } + } + + const startTime = getCurrentTime(); + for ite in 1..nIters { + coforall loc in Locales { + on loc { + Wcurr = W; + } + } + const start = getCurrentTime(); + forall i in GPU(D, CUDAWrapper2, 
CPUPercent2) { + //forall i in D { + var err = 0: real(32); + for s in 1..nSamples { + var arg = 0: real(32); + for f in 1..nFeatures { + arg += Wcurr(f) * X(s, f); + } + var hypo = 1 / (1 + exp(-arg)); + err += (hypo - Y(s)) * X(s, i); + } + W(i) = Wcurr(i) - alpha * err; + } + writeln(getCurrentTime() - start, " sec"); + } + execTimes(trial) = getCurrentTime() - startTime; + if (output) { + writeln(W); + } + } + printResults(execTimes); +} From 0fb09a8e832fbd154884d9debd702ef5a10113e7 Mon Sep 17 00:00:00 2001 From: sriraj Date: Wed, 13 May 2020 15:24:23 -0700 Subject: [PATCH 029/118] Add MM-distributed. Update MM-cuda to calculate MM on only GPU portion rather than whole matrix --- apps/mm/mm.cu | 10 +-- apps/mm/mm.h | 3 + apps/mm/mm.hybrid.dist.chpl | 126 ++++++++++++++++++++++++++++++++++++ 3 files changed, 134 insertions(+), 5 deletions(-) create mode 100644 apps/mm/mm.h create mode 100644 apps/mm/mm.hybrid.dist.chpl diff --git a/apps/mm/mm.cu b/apps/mm/mm.cu index 340a226..16e26ad 100644 --- a/apps/mm/mm.cu +++ b/apps/mm/mm.cu @@ -127,9 +127,9 @@ extern "C" { #ifdef PROF CudaSafeCall(cudaEventRecord(startCudaMallocEvent)); #endif - CudaSafeCall(cudaMalloc(&dA, sizeof(float) * N)); + CudaSafeCall(cudaMalloc(&dA, sizeof(float) * GPUN)); CudaSafeCall(cudaMalloc(&dB, sizeof(float) * N)); - CudaSafeCall(cudaMalloc(&dC, sizeof(float) * N)); + CudaSafeCall(cudaMalloc(&dC, sizeof(float) * GPUN)); #ifdef PROF CudaSafeCall(cudaEventRecord(endCudaMallocEvent)); CudaSafeCall(cudaEventSynchronize(endCudaMallocEvent)); @@ -138,7 +138,7 @@ extern "C" { #ifdef PROF CudaSafeCall(cudaEventRecord(startCudaMemcpyH2DEvent)); #endif - CudaSafeCall(cudaMemcpy(dA, A, sizeof(float) * N, cudaMemcpyHostToDevice)); + CudaSafeCall(cudaMemcpy(dA, A+start, sizeof(float) * GPUN, cudaMemcpyHostToDevice)); CudaSafeCall(cudaMemcpy(dB, B, sizeof(float) * N, cudaMemcpyHostToDevice)); #ifdef PROF CudaSafeCall(cudaEventRecord(endCudaMemcpyH2DEvent)); @@ -149,7 +149,7 @@ extern "C" { 
CudaSafeCall(cudaEventRecord(startCudaKernelEvent)); #endif if (!tiled) { - mm<<>>(dA, dB, dC, ceil(sqrt(N)), N, N); + mm<<>>(dA, dB, dC, ceil(sqrt(N)), N, GPUN); } else if (tiled == 1){ dim3 block(32,32); dim3 grid(ceil(sqrt(N)/32), ceil(sqrt(N)/32)); @@ -183,7 +183,7 @@ extern "C" { #ifdef PROF CudaSafeCall(cudaEventRecord(startCudaMemcpyD2HEvent)); #endif - CudaSafeCall(cudaMemcpy(C + start, dC + start, sizeof(float) * GPUN, cudaMemcpyDeviceToHost)); + CudaSafeCall(cudaMemcpy(C + start, dC, sizeof(float) * GPUN, cudaMemcpyDeviceToHost)); #ifdef PROF CudaSafeCall(cudaEventRecord(endCudaMemcpyD2HEvent)); CudaSafeCall(cudaEventSynchronize(endCudaMemcpyD2HEvent)); diff --git a/apps/mm/mm.h b/apps/mm/mm.h new file mode 100644 index 0000000..35fb4f5 --- /dev/null +++ b/apps/mm/mm.h @@ -0,0 +1,3 @@ + +void mmCUDA(float* A, float *B, float *C, int N, int start, int end, int GPUN, int tiled); + diff --git a/apps/mm/mm.hybrid.dist.chpl b/apps/mm/mm.hybrid.dist.chpl new file mode 100644 index 0000000..76bc917 --- /dev/null +++ b/apps/mm/mm.hybrid.dist.chpl @@ -0,0 +1,126 @@ +use Time; +use ReplicatedDist; +//////////////////////////////////////////////////////////////////////////////// +/// GPUIterator +//////////////////////////////////////////////////////////////////////////////// +use GPUIterator; + +//////////////////////////////////////////////////////////////////////////////// +/// Runtime Options +//////////////////////////////////////////////////////////////////////////////// +config const n = 32: int; +config const CPUPercent = 0: int; +config const numTrials = 1: int; +config const tiled = 0; +config const output = 0: int; +config param verbose = false; + +//////////////////////////////////////////////////////////////////////////////// +/// Global Arrays +//////////////////////////////////////////////////////////////////////////////// +// For now, these arrays are global so the arrays can be seen from CUDAWrapper +// TODO: Explore the possiblity of declaring the 
arrays and CUDAWrapper +// in the main proc (e.g., by using lambdas) +const S = {1..n, 1..n}; +const RS = S dmapped Replicated(); +var D: domain(1) dmapped Block(boundingBox = {1..n*n}) = {1..n*n}; + +var A: [D] real(32); +var B: [RS] real(32); +var C: [D] real(32); + +//////////////////////////////////////////////////////////////////////////////// +/// C Interoperability +//////////////////////////////////////////////////////////////////////////////// +extern proc mmCUDA(A: [] real(32), B: [] real(32), C: [] real(32), N:int, lo: int, hi: int, GPUN: int, tiled: int); + +// CUDAWrapper is called from GPUIterator +// to invoke a specific CUDA program (using C interoperability) +proc CUDAWrapper(lo: int, hi: int, N: int) { + if (verbose) { + writeln("In CUDAWrapper(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, ")"); + } + //if(tiled) { + // assert(N/n>=32 && (N/n)%32==0, "should use multiples of 32 rows in GPU when tiled"); + //} + assert(N%n == 0, "should offload full rows to GPU"); + ref lA = A.localSlice(lo .. hi); + ref lC = C.localSlice(lo .. 
hi); + mmCUDA(lA, B, lC, n*n, 0, hi-lo, N, tiled); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Utility Functions +//////////////////////////////////////////////////////////////////////////////// +proc printResults(execTimes) { + const totalTime = + reduce execTimes, + avgTime = totalTime / numTrials, + minTime = min reduce execTimes; + writeln("Execution time:"); + writeln(" tot = ", totalTime); + writeln(" avg = ", avgTime); + writeln(" min = ", minTime); +} + +proc printLocaleInfo() { + for loc in Locales { + writeln(loc, " info: "); + const numSublocs = loc.getChildCount(); + if (numSublocs != 0) { + for sublocID in 0..#numSublocs { + const subloc = loc.getChild(sublocID); + writeln("\t Subloc: ", sublocID); + writeln("\t Name: ", subloc); + writeln("\t maxTaskPar: ", subloc.maxTaskPar); + } + } else { + writeln("\t Name: ", loc); + writeln("\t maxTaskPar: ", loc.maxTaskPar); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Chapel main +//////////////////////////////////////////////////////////////////////////////// +proc main() { + writeln("Matrix Multiplication: CPU/GPU Execution (using GPUIterator)"); + writeln("Size: ", n, "x", n); + writeln("CPU ratio: ", CPUPercent); + writeln("nTrials: ", numTrials); + writeln("tiled: ", tiled); + writeln("output: ", output); + + printLocaleInfo(); + + var execTimes: [1..numTrials] real; + for trial in 1..numTrials { + coforall loc in Locales do on loc { + for i in 1..n { + for j in 1..n { + var e: int = (i-1)*n+(j-1)+1; + A(e) = (i*1.0/1000): real(32); + B(i, j) = (i*1.0/1000): real(32); + C(e) = 0: real(32); + } + } + } + + const startTime = getCurrentTime(); + // TODO: Consider using a 2D iterator + forall e in GPU(D, CUDAWrapper, CPUPercent) { + var i: int = (e - 1) / n + 1; + var j: int = (e - 1) % n + 1; + var sum: real(32) = C(e); + for k in 1..n { + sum += A((i-1)*n+k) * B(k, j); + } + C(e) = sum; + } + 
execTimes(trial) = getCurrentTime() - startTime; + if (output) { + writeln(reshape(C, {1..n, 1..n})); + } + } + printResults(execTimes); +} From 1eeddceec155b0d528e5dd4a8d72e5b89ba5efd0 Mon Sep 17 00:00:00 2001 From: sriraj Date: Wed, 13 May 2020 17:34:54 -0700 Subject: [PATCH 030/118] Update MM blas to use only GPU partition of matrix rather than whole matrix --- apps/mm/mm.cu | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/apps/mm/mm.cu b/apps/mm/mm.cu index 16e26ad..1f9a8e2 100644 --- a/apps/mm/mm.cu +++ b/apps/mm/mm.cu @@ -167,7 +167,11 @@ extern "C" { long long end = getCurrentTime(); printf("cuBLAS prep: %lf msec\n", (float)(end-start)/1000); #endif - cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, sqrt(N), sqrt(N), sqrt(N), &alpha, dA, lda, dB, ldb, &beta, dC, ldc); + cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, sqrt(N), GPUN/sqrt(N), sqrt(N), &alpha, dB, ldb, dA, lda, &beta, dC, ldc); + + //http://peterwittek.com/cublas-matrix-c-style.html + //C:mxn = A:mxk X B:kxn + //stat=cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,n,m,k,&a1,d_b,n,d_a,k,&bet,d_c,n); #ifdef PROF long long end2 = getCurrentTime(); printf("cuBLAS finish: %lf msec\n", (float)(end2-start)/1000); From 2e43dcae561f6bb0d4a0eb49520b3f3304c29635 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Wed, 13 May 2020 21:47:53 -0400 Subject: [PATCH 031/118] Update distributed version of LR --- apps/logisticregression/lr.baseline.chpl | 40 ++++++++++++------ apps/logisticregression/lr.cu | 10 ++--- apps/logisticregression/lr.hybrid.dist.chpl | 46 ++++++++++++++------- 3 files changed, 65 insertions(+), 31 deletions(-) diff --git a/apps/logisticregression/lr.baseline.chpl b/apps/logisticregression/lr.baseline.chpl index 0ccaa66..6c35459 100644 --- a/apps/logisticregression/lr.baseline.chpl +++ b/apps/logisticregression/lr.baseline.chpl @@ -66,19 +66,35 @@ proc main() { var execTimes: [1..numTrials] real; for trial in 1..numTrials { - for i in 1..nFeatures { - W(i) = 0: real(32); - } 
- for i in 1..nSamples { - Y(i) = (i % 2): real(32); - for j in 1..nFeatures { - if (j != 0) { - X(i, j) = (i % 2): real(32); - } else { - X(i, j) = 1; - } + if (false) { + for i in 1..nFeatures { + W(i) = 0: real(32); } - } + for i in 1..nSamples { + Y(i) = (i % 2): real(32); + for j in 1..nFeatures { + if (j != 0) { + X(i, j) = (i % 2): real(32); + } else { + X(i, j) = 1; + } + } + } + } else { + forall i in 1..nFeatures { + W(i) = 0: real(32); + } + for i in 1..nSamples { + Y(i) = i: real(32); + for j in 1..nFeatures { + if (j != 0) { + X(i, j) = j: real(32); + } else { + X(i, j) = j : real(32); + } + } + } + } const startTime = getCurrentTime(); for ite in 1..nIters { diff --git a/apps/logisticregression/lr.cu b/apps/logisticregression/lr.cu index ecb76f9..3cef4a7 100644 --- a/apps/logisticregression/lr.cu +++ b/apps/logisticregression/lr.cu @@ -26,7 +26,7 @@ __global__ void kernel1(float *dW, float *dWcurr, int N) { } } -__global__ void kernel2(float *dW, float *dWcurr, float *dX, float *dY, float alpha, int nSamples, int nFeatures, int N) { +__global__ void kernel2(float *dW, float *dWcurr, float *dX, float *dY, float alpha, int nSamples, int nFeatures, int start, int N) { int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < N) { float err = 0.0; @@ -36,9 +36,9 @@ __global__ void kernel2(float *dW, float *dWcurr, float *dX, float *dY, float al arg += dWcurr[f] * dX[s * (nFeatures) + f]; } float hypo = 1 / (1 + exp(-arg)); - err += (hypo - dY[s]) * dX[s * (nFeatures) + id]; + err += (hypo - dY[s]) * dX[s * (nFeatures) + start + id]; } - dW[id] = dWcurr[id] - alpha * err; + dW[id] = dWcurr[start + id] - alpha * err; } } @@ -84,9 +84,9 @@ extern "C" { CudaSafeCall(cudaMemcpy(dY, Y, sizeof(float) * nSamples, cudaMemcpyHostToDevice)); CudaSafeCall(cudaMemcpy(dWcurr, Wcurr, sizeof(float) * nFeatures, cudaMemcpyHostToDevice)); - kernel2<<>>(dW, dWcurr, dX, dY, alpha, nSamples, nFeatures, GPUN); + kernel2<<>>(dW, dWcurr, dX, dY, alpha, nSamples, nFeatures, 
start-1, GPUN); CudaSafeCall(cudaDeviceSynchronize()); - CudaSafeCall(cudaMemcpy(W + start, dW, sizeof(float) * GPUN, cudaMemcpyDeviceToHost)); + CudaSafeCall(cudaMemcpy(W, dW, sizeof(float) * GPUN, cudaMemcpyDeviceToHost)); CudaSafeCall(cudaFree(dX)); CudaSafeCall(cudaFree(dY)); diff --git a/apps/logisticregression/lr.hybrid.dist.chpl b/apps/logisticregression/lr.hybrid.dist.chpl index c505319..ede9c57 100644 --- a/apps/logisticregression/lr.hybrid.dist.chpl +++ b/apps/logisticregression/lr.hybrid.dist.chpl @@ -55,11 +55,11 @@ proc CUDAWrapper1(lo: int, hi: int, N: int) { } proc CUDAWrapper2(lo: int, hi: int, N: int) { - if (verbose) { + //if (verbose) { writeln("In CUDAWrapper2(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, ")"); - } + //} ref lW = W.localSlice(lo .. hi); - lrCUDA2(X, Y, lW, Wcurr, alpha, nSamples, nFeatures, 0, hi-lo, N); + lrCUDA2(X, Y, lW, Wcurr, alpha, nSamples, nFeatures, lo, hi, N); } //////////////////////////////////////////////////////////////////////////////// @@ -108,20 +108,38 @@ proc main() { var execTimes: [1..numTrials] real; for trial in 1..numTrials { - forall i in D { - W(i) = 0: real(32); - } - coforall loc in Locales do on loc { - for i in 1..nSamples { - Y(i) = (i % 2): real(32); - for j in 1..nFeatures { - if (j != 0) { - X(i, j) = (i % 2): real(32); - } else { - X(i, j) = 1; + if (false) { + forall i in D { + W(i) = 0: real(32); + } + coforall loc in Locales do on loc { + for i in 1..nSamples { + Y(i) = (i % 2): real(32); + for j in 1..nFeatures { + if (j != 0) { + X(i, j) = (i % 2): real(32); + } else { + X(i, j) = 1; + } + } } } + } else { + forall i in D { + W(i) = 0: real(32); } + coforall loc in Locales do on loc { + for i in 1..nSamples { + Y(i) = i: real(32); + for j in 1..nFeatures { + if (j != 0) { + X(i, j) = j: real(32); + } else { + X(i, j) = j : real(32); + } + } + } + } } const startTime = getCurrentTime(); From cadc7cdce58abb35d5b5733e0214f9df959f240c Mon Sep 17 00:00:00 2001 
From: Akihiro Hayashi Date: Wed, 13 May 2020 23:19:42 -0400 Subject: [PATCH 032/118] Update LOW-MID version of LR --- .../lr.hybrid.dist.explicit.chpl | 202 ++++++++++++++++++ apps/logisticregression/lr.kernel.cu | 42 ++++ apps/logisticregression/lr.kernel.h | 1 + 3 files changed, 245 insertions(+) create mode 100644 apps/logisticregression/lr.hybrid.dist.explicit.chpl create mode 100644 apps/logisticregression/lr.kernel.cu create mode 100644 apps/logisticregression/lr.kernel.h diff --git a/apps/logisticregression/lr.hybrid.dist.explicit.chpl b/apps/logisticregression/lr.hybrid.dist.explicit.chpl new file mode 100644 index 0000000..b837ef2 --- /dev/null +++ b/apps/logisticregression/lr.hybrid.dist.explicit.chpl @@ -0,0 +1,202 @@ +use Time; +use ReplicatedDist; +//////////////////////////////////////////////////////////////////////////////// +/// GPUIterator +//////////////////////////////////////////////////////////////////////////////// +use GPUIterator; +use GPUAPI; +use BlockDist; +use SysCTypes; + +//////////////////////////////////////////////////////////////////////////////// +/// Runtime Options +//////////////////////////////////////////////////////////////////////////////// +config const nFeatures = 32: int; +config const nSamples = 32: int; +config const nIters = 32: int; +config const CPUPercent1 = 0: int; +config const CPUPercent2 = 0: int; +config const numTrials = 1: int; +config const output = 0: int; +config param verbose = false; + +//////////////////////////////////////////////////////////////////////////////// +/// Global Arrays +//////////////////////////////////////////////////////////////////////////////// +// For now, these arrays are global so the arrays can be seen from CUDAWrapper +// TODO: Explore the possiblity of declaring the arrays and CUDAWrapper +// in the main proc (e.g., by using lambdas) +const Space1 = {1..nSamples, 1..nFeatures}; +const ReplicatedSpace1 = Space1 dmapped Replicated(); +var X: [ReplicatedSpace1] real(32); + 
+const Space2 = {1..nSamples}; +const ReplicatedSpace2 = Space2 dmapped Replicated(); +var Y: [ReplicatedSpace2] real(32); + +const Space3 = {1..nFeatures}; +const ReplicatedSpace3 = Space3 dmapped Replicated(); +var Wcurr: [ReplicatedSpace3] real(32); + +var D: domain(1) dmapped Block(boundingBox = {1..nFeatures}) = {1..nFeatures}; +var W: [D] real(32); +var alpha = 0.1 : real(32); + +//////////////////////////////////////////////////////////////////////////////// +/// C Interoperability +//////////////////////////////////////////////////////////////////////////////// +extern proc LaunchLR(X: c_void_ptr, Y: c_void_ptr, W: c_void_ptr, Wcurr: c_void_ptr, alpha: real(32), nSamples: int, nFeatures: int, lo: int, hi: int, N: int); + +// CUDAWrapper is called from GPUIterator +// to invoke a specific CUDA program (using C interoperability) +proc CUDAWrapper1(lo: int, hi: int, N: int) { + if (verbose) { + writeln("In CUDAWrapper1(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, ")"); + } + lrCUDA1(W, Wcurr, lo, hi, N); +} + +proc CUDAWrapper2(lo: int, hi: int, N: int) { + if (verbose) { + writeln("In CUDAWrapper2(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, ")"); + } + ref lW = W.localSlice(lo .. 
hi); + ProfilerStart(); + var dX: c_void_ptr; + var dY: c_void_ptr; + var dWcurr: c_void_ptr; + var dW: c_void_ptr; + writeln("X.size: ", X.size, " Y.size: ", Y.size); + Malloc(dX, X.size:size_t * c_sizeof(X.eltType)); + Malloc(dY, Y.size:size_t * c_sizeof(Y.eltType)); + Malloc(dWcurr, Wcurr.size:size_t * c_sizeof(Wcurr.eltType)); + Malloc(dW, lW.size:size_t * c_sizeof(lW.eltType)); + + Memcpy(dX, c_ptrTo(X), X.size:size_t * c_sizeof(X.eltType), 0); + Memcpy(dY, c_ptrTo(Y), Y.size:size_t * c_sizeof(Y.eltType), 0); + Memcpy(dWcurr, c_ptrTo(Wcurr), Wcurr.size:size_t * c_sizeof(Wcurr.eltType), 0); + + LaunchLR(dX, dY, dW, dWcurr, alpha, nSamples, nFeatures, lo, hi, N); + DeviceSynchronize(); + + Memcpy(c_ptrTo(lW), dW, lW.size:size_t * c_sizeof(lW.eltType), 1); + + Free(dX); + Free(dY); + Free(dW); + Free(dWcurr); + + ProfilerStop(); + //lrCUDA2(X, Y, lW, Wcurr, alpha, nSamples, nFeatures, lo, hi, N); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Utility Functions +//////////////////////////////////////////////////////////////////////////////// +proc printResults(execTimes) { + const totalTime = + reduce execTimes, + avgTime = totalTime / numTrials, + minTime = min reduce execTimes; + writeln("Execution time:"); + writeln(" tot = ", totalTime); + writeln(" avg = ", avgTime); + writeln(" min = ", minTime); +} + +proc printLocaleInfo() { + for loc in Locales { + writeln(loc, " info: "); + const numSublocs = loc.getChildCount(); + if (numSublocs != 0) { + for sublocID in 0..#numSublocs { + const subloc = loc.getChild(sublocID); + writeln("\t Subloc: ", sublocID); + writeln("\t Name: ", subloc); + writeln("\t maxTaskPar: ", subloc.maxTaskPar); + } + } else { + writeln("\t Name: ", loc); + writeln("\t maxTaskPar: ", loc.maxTaskPar); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Chapel main +//////////////////////////////////////////////////////////////////////////////// 
+proc main() { + writeln("Logistic Regression: CPU/GPU Execution (using GPUIterator)"); + writeln("nSamples :", nSamples, " nFeatures :", nFeatures); + writeln("CPU Percent1: ", CPUPercent1); + writeln("CPU Percent2: ", CPUPercent2); + writeln("nTrials: ", numTrials); + writeln("output: ", output); + + printLocaleInfo(); + + var execTimes: [1..numTrials] real; + for trial in 1..numTrials { + if (false) { + forall i in D { + W(i) = 0: real(32); + } + coforall loc in Locales do on loc { + for i in 1..nSamples { + Y(i) = (i % 2): real(32); + for j in 1..nFeatures { + if (j != 0) { + X(i, j) = (i % 2): real(32); + } else { + X(i, j) = 1; + } + } + } + } + } else { + forall i in D { + W(i) = 0: real(32); + } + coforall loc in Locales do on loc { + for i in 1..nSamples { + Y(i) = i: real(32); + for j in 1..nFeatures { + if (j != 0) { + X(i, j) = j: real(32); + } else { + X(i, j) = j : real(32); + } + } + } + } + } + + const startTime = getCurrentTime(); + for ite in 1..nIters { + coforall loc in Locales { + on loc { + Wcurr = W; + } + } + const start = getCurrentTime(); + forall i in GPU(D, CUDAWrapper2, CPUPercent2) { + //forall i in D { + var err = 0: real(32); + for s in 1..nSamples { + var arg = 0: real(32); + for f in 1..nFeatures { + arg += Wcurr(f) * X(s, f); + } + var hypo = 1 / (1 + exp(-arg)); + err += (hypo - Y(s)) * X(s, i); + } + W(i) = Wcurr(i) - alpha * err; + } + writeln(getCurrentTime() - start, " sec"); + } + execTimes(trial) = getCurrentTime() - startTime; + if (output) { + writeln(W); + } + } + printResults(execTimes); +} diff --git a/apps/logisticregression/lr.kernel.cu b/apps/logisticregression/lr.kernel.cu new file mode 100644 index 0000000..7386d6b --- /dev/null +++ b/apps/logisticregression/lr.kernel.cu @@ -0,0 +1,42 @@ +#ifndef USE_LAMBDA +__global__ void kernel2(float *dW, float *dWcurr, float *dX, float *dY, float alpha, int nSamples, int nFeatures, int start, int N) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + if (id < N) { + float 
err = 0.0; + for (int s = 0; s < nSamples; s++) { + float arg = 0.0; + for (int f = 0; f < nFeatures; f++) { + arg += dWcurr[f] * dX[s * (nFeatures) + f]; + } + float hypo = 1 / (1 + exp(-arg)); + err += (hypo - dY[s]) * dX[s * (nFeatures) + start + id]; + } + dW[id] = dWcurr[start + id] - alpha * err; + } +} +#else +#include "lambda.h" +#endif + +extern "C" { +#ifndef USE_LAMBDA + void LaunchLR(float* dX, float *dY, float *dW, float *dWcurr, float alpha, int nSamples, int nFeatures, int start, int end, int GPUN) { + kernel2<<>>(dW, dWcurr, dX, dY, alpha, nSamples, nFeatures, start-1, GPUN); + } +#else + void LaunchLR(float* dX, float *dY, float *dW, float *dWcurr, float alpha, int nSamples, int nFeatures, int start, int end, int GPUN) { + call_gpu_functor(GPUN, 1024, NULL, [=] __device__ (int id) { + float err = 0.0; + for (int s = 0; s < nSamples; s++) { + float arg = 0.0; + for (int f = 0; f < nFeatures; f++) { + arg += dWcurr[f] * dX[s * (nFeatures) + f]; + } + float hypo = 1 / (1 + exp(-arg)); + err += (hypo - dY[s]) * dX[s * (nFeatures) + (start - 1) + id]; + } + dW[id] = dWcurr[(start - 1) + id] - alpha * err; + }); + } +#endif +} diff --git a/apps/logisticregression/lr.kernel.h b/apps/logisticregression/lr.kernel.h new file mode 100644 index 0000000..2dadf99 --- /dev/null +++ b/apps/logisticregression/lr.kernel.h @@ -0,0 +1 @@ +void LaunchLR(float* X, float *Y, float *W, float *Wcurr, float alpha, int nSamples, int nFeatures, int start, int end, int GPUN); From 2d6aad763a03b8387bbd364d6b31f5f73b56552f Mon Sep 17 00:00:00 2001 From: sriraj Date: Wed, 13 May 2020 23:55:08 -0700 Subject: [PATCH 033/118] Add LOW-MID version of MM and sample cublas gemm matrix multiplication --- apps/mm/m.cu | 176 +++++++++++++++++++++++++++ apps/mm/mm.hybrid.dist.explicit.chpl | 153 +++++++++++++++++++++++ apps/mm/mm.kernel.cu | 48 ++++++++ apps/mm/mm.kernel.h | 2 + 4 files changed, 379 insertions(+) create mode 100644 apps/mm/m.cu create mode 100644 
apps/mm/mm.hybrid.dist.explicit.chpl create mode 100644 apps/mm/mm.kernel.cu create mode 100644 apps/mm/mm.kernel.h diff --git a/apps/mm/m.cu b/apps/mm/m.cu new file mode 100644 index 0000000..c28fdd6 --- /dev/null +++ b/apps/mm/m.cu @@ -0,0 +1,176 @@ + +//https://github.com/ernestyalumni/CompPhys/blob/master/moreCUDA/CUBLAS/036sgemm.c + +#include +#include +#include +#include "cublas_v2.h" +#define IDX2C(i,j,ld) (((j)*(ld))+( i )) +#define m 2 // a - mxk matrix +#define n 4 // b - kxn matrix +#define k 3 // c - mxn matrix +int main(void) { + cudaError_t cudaStat; // cudaMalloc status + cublasStatus_t stat; // CUBLAS functions status + cublasHandle_t handle; // CUBLAS context + int i,j; // i-row index,j-column index + float* a; // mxk matrix a on the host + float* b; // kxn matrix b on the host + float* c; // mxn matrix c on the host + a=(float*)malloc(m*k*sizeof(float)); // host memory for a + b=(float*)malloc(k*n*sizeof(float)); // host memory for b + c=(float*)malloc(m*n*sizeof(float)); // host memory for c + // define an mxk matrix a column by column + int ind=11; // a: + for (j=0;j d_a + stat = cublasSetMatrix(k,n,sizeof(*b),b,k,d_b,k); // b -> d_b + stat = cublasSetMatrix(m,n,sizeof(*c),c,m,d_c,m); // c -> d_c + float a1=1.0f; // a1=1 + float bet=0.0f; // bet=1 + // matrix-matrix multiplication: d_c = a1*d_a*d_b + bet*d_c + // d_a -mxk matrix, d_b - kxn matrix, d_c -mxn matrix; + // a1,bet - scalars + + //c with some sort of transposed way + stat=cublasSgemm(handle,CUBLAS_OP_T,CUBLAS_OP_T,m,n,k,&a1,d_a,k,d_b,n,&bet,d_c,m); + + stat=cublasGetMatrix(m,n,sizeof(*c),d_c,m,c,m); // cp d_c -> c + printf("c after Sgemm :\n"); + co=0; + for(i=0;i c + printf("c after Sgemm :\n"); + co=0; + for(i=0;i=32 && (N/n)%32==0, "should use multiples of 32 rows in GPU when tiled"); + //} + assert(N%n == 0, "should offload full rows to GPU"); + ref lA = A.localSlice(lo .. hi); + ref lC = C.localSlice(lo .. 
hi); + assert(lA.size == lC.size); + + ProfilerStart(); + var dA: c_void_ptr; + var dB: c_void_ptr; + var dC: c_void_ptr; + + writeln("lA.size: ", lA.size, " B.size: ", B.size); + Malloc(dA, lA.size:size_t * c_sizeof(lA.eltType)); + Malloc(dB, B.size:size_t * c_sizeof(B.eltType)); + Malloc(dC, lC.size:size_t * c_sizeof(lC.eltType)); + + Memcpy(dA, c_ptrTo(lA), lA.size:size_t * c_sizeof(lA.eltType), 0); + Memcpy(dB, c_ptrTo(B), B.size:size_t * c_sizeof(B.eltType), 0); + + LaunchMM(dA, dB, dC, n*n, 0, hi-lo, N, tiled); + DeviceSynchronize(); + Memcpy(c_ptrTo(lC), dC, lC.size:size_t * c_sizeof(lC.eltType), 1); + + Free(dA); + Free(dB); + Free(dC); + ProfilerStop(); + + //mmCUDA(lA, B, lC, n*n, 0, hi-lo, N, tiled); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Utility Functions +//////////////////////////////////////////////////////////////////////////////// +proc printResults(execTimes) { + const totalTime = + reduce execTimes, + avgTime = totalTime / numTrials, + minTime = min reduce execTimes; + writeln("Execution time:"); + writeln(" tot = ", totalTime); + writeln(" avg = ", avgTime); + writeln(" min = ", minTime); +} + +proc printLocaleInfo() { + for loc in Locales { + writeln(loc, " info: "); + const numSublocs = loc.getChildCount(); + if (numSublocs != 0) { + for sublocID in 0..#numSublocs { + const subloc = loc.getChild(sublocID); + writeln("\t Subloc: ", sublocID); + writeln("\t Name: ", subloc); + writeln("\t maxTaskPar: ", subloc.maxTaskPar); + } + } else { + writeln("\t Name: ", loc); + writeln("\t maxTaskPar: ", loc.maxTaskPar); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Chapel main +//////////////////////////////////////////////////////////////////////////////// +proc main() { + writeln("Matrix Multiplication: CPU/GPU Execution (using GPUIterator)"); + writeln("Size: ", n, "x", n); + writeln("CPU ratio: ", CPUPercent); + writeln("nTrials: ", numTrials); 
+ writeln("tiled: ", tiled); + writeln("output: ", output); + + printLocaleInfo(); + + var execTimes: [1..numTrials] real; + for trial in 1..numTrials { + coforall loc in Locales do on loc { + for i in 1..n { + for j in 1..n { + var e: int = (i-1)*n+(j-1)+1; + A(e) = (i*1.0/1000): real(32); + B(i, j) = (i*1.0/1000): real(32); + C(e) = 0: real(32); + } + } + } + + const startTime = getCurrentTime(); + // TODO: Consider using a 2D iterator + forall e in GPU(D, CUDAWrapper, CPUPercent) { + var i: int = (e - 1) / n + 1; + var j: int = (e - 1) % n + 1; + var sum: real(32) = C(e); + for k in 1..n { + sum += A((i-1)*n+k) * B(k, j); + } + C(e) = sum; + } + execTimes(trial) = getCurrentTime() - startTime; + if (output) { + writeln(reshape(C, {1..n, 1..n})); + } + } + printResults(execTimes); +} diff --git a/apps/mm/mm.kernel.cu b/apps/mm/mm.kernel.cu new file mode 100644 index 0000000..849f49e --- /dev/null +++ b/apps/mm/mm.kernel.cu @@ -0,0 +1,48 @@ +#include +#include + +__global__ void mm(float *dA, float *dB, float *dC, int DIM, int N, int GPUN) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + if (id <= GPUN) { + int i = id / DIM; + int j = id % DIM; + float sum = 0.0f; + for (int k = 0; k < DIM; k++) { + sum += dA[i*DIM+k] * dB[k*DIM+j]; + } + dC[id] += sum; + } +} + +extern "C" { + +void LaunchMM(float *A, float *B, float *C, int N, int low, int hi, int GPUN, int tiled) { + if (GPUN > 0) { + assert(hi - low + 1 == GPUN); +#ifdef VERBOSE + printf("In mmCUDA\n"); + printf("\t GPUN: %d\n", GPUN); + printf("\t range: %d..%d\n", start, end); +#endif + if (!tiled) { + mm<<>>(A, B, C, ceil(sqrt(N)), N, GPUN); + } + else if(tiled == 1) { + printf("Tile not imlemented\n"); + assert(false); + } + else { + printf("Using cublas\n"); + cublasHandle_t handle; + + cublasCreate(&handle); + float alpha = 1.0F; + float beta = 0.0F; + int lda = sqrt(N), ldb = sqrt(N), ldc = sqrt(N); + + cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, sqrt(N), GPUN/sqrt(N), sqrt(N), &alpha, B, ldb, A, 
lda, &beta, C, ldc); + } + } +} + +} diff --git a/apps/mm/mm.kernel.h b/apps/mm/mm.kernel.h new file mode 100644 index 0000000..2982cd7 --- /dev/null +++ b/apps/mm/mm.kernel.h @@ -0,0 +1,2 @@ + +void LaunchMM(float *A, float *B, float *C, int N, int low, int hi, int GPUN, int tiled); From dd520a1763366d786926bab602cc722c802910a5 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 14 May 2020 12:58:26 -0400 Subject: [PATCH 034/118] Rename explicit versions of VC, Update Makefile, Update GPUAPI.chpl (h2d=true, d2h=true, by default) --- apps/Makefile | 23 ++++++++----------- ...plicit.chpl => vc.hybrid.dist.lowmid.chpl} | 12 ++++------ ...licit.mid.chpl => vc.hybrid.dist.mid.chpl} | 15 +++++------- src/GPUAPI.chpl | 6 ++--- 4 files changed, 22 insertions(+), 34 deletions(-) rename apps/vector_copy/{vc.hybrid.dist.explicit.chpl => vc.hybrid.dist.lowmid.chpl} (95%) rename apps/vector_copy/{vc.hybrid.dist.explicit.mid.chpl => vc.hybrid.dist.mid.chpl} (93%) diff --git a/apps/Makefile b/apps/Makefile index 177d15d..214d1c0 100644 --- a/apps/Makefile +++ b/apps/Makefile @@ -49,20 +49,15 @@ cudahybrid: GPUAPI.o $(TARGET).o $(TARGET).hybrid.chpl cudahybrid.dist: GPUAPI.o $(TARGET).o $(TARGET).hybrid.dist.chpl chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).h $(TARGET).o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.chpl $(CUDALIBSFLAGS) -.PHONY: cudahybrid.dist.explicit -cudahybrid.dist.explicit: GPUAPI.o $(TARGET).kernel.cu $(TARGET).hybrid.dist.explicit.chpl - nvcc $(NVCCFLAGS) -DUSE_LAMBDA -c $(TARGET).kernel.cu -o $(TARGET).lambda.o - nvcc $(NVCCFLAGS) -c $(TARGET).kernel.cu -o $(TARGET).kernel.o - chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).kernel.h $(TARGET).kernel.o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.explicit.chpl $(CUDALIBSFLAGS) -o $(TARGET).hybrid.dist.explicit.kernel - chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).kernel.h $(TARGET).lambda.o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.explicit.chpl $(CUDALIBSFLAGS) -o $(TARGET).hybrid.dist.explicit.lambda - -.PHONY: 
cudahybrid.dist.explicit.mid -cudahybrid.dist.explicit.mid: GPUAPI.o $(TARGET).kernel.cu $(TARGET).hybrid.dist.explicit.mid.chpl - nvcc $(NVCCFLAGS) -DUSE_LAMBDA -c $(TARGET).kernel.cu -o $(TARGET).lambda.o - nvcc $(NVCCFLAGS) -c $(TARGET).kernel.cu -o $(TARGET).kernel.o - chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).kernel.h $(TARGET).kernel.o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.explicit.mid.chpl $(CUDALIBSFLAGS) -o $(TARGET).hybrid.dist.explicit.mid.kernel - chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).kernel.h $(TARGET).lambda.o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.explicit.mid.chpl $(CUDALIBSFLAGS) -o $(TARGET).hybrid.dist.explicit.mid.lambda +.PHONY: cudahybrid.dist.lowmid +cudahybrid.dist.lowmid: GPUAPI.o $(TARGET).kernel.cu $(TARGET).hybrid.dist.lowmid.chpl + nvcc $(NVCCFLAGS) -c $(TARGET).kernel.cu -o $(TARGET).kernel.o + chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).kernel.h $(TARGET).kernel.o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.lowmid.chpl $(CUDALIBSFLAGS) -o $(TARGET).hybrid.dist.lowmid +.PHONY: cudahybrid.dist.mid +cudahybrid.dist.mid: GPUAPI.o $(TARGET).kernel.cu $(TARGET).hybrid.dist.mid.chpl + nvcc $(NVCCFLAGS) -c $(TARGET).kernel.cu -o $(TARGET).kernel.o + chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).kernel.h $(TARGET).kernel.o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.mid.chpl $(CUDALIBSFLAGS) -o $(TARGET).hybrid.dist.mid .PHONY: openclgpu oclgpu: $(TARGET).opencl.o $(TARGET).gpu.chpl @@ -86,4 +81,4 @@ hiphybrid: $(TARGET).cu $(TARGET).hybrid.chpl .PHONY: clean clean: - rm -f $(TARGET).baseline $(TARGET).gpu $(TARGET).hybrid $(TARGET).hybrid.dist $(TARGET).hybrid.dist.explicit.kernel $(TARGET).hybrid.dist.explicit.lambda $(TARGET).o *_real + rm -f $(TARGET).baseline $(TARGET).gpu $(TARGET).hybrid $(TARGET).hybrid.dist $(TARGET).hybrid.dist.lowmid $(TARGET).hybrid.dist.mid $(TARGET).o *_real diff --git a/apps/vector_copy/vc.hybrid.dist.explicit.chpl b/apps/vector_copy/vc.hybrid.dist.lowmid.chpl similarity index 95% rename from 
apps/vector_copy/vc.hybrid.dist.explicit.chpl rename to apps/vector_copy/vc.hybrid.dist.lowmid.chpl index 2e73472..1a2c5e1 100644 --- a/apps/vector_copy/vc.hybrid.dist.explicit.chpl +++ b/apps/vector_copy/vc.hybrid.dist.lowmid.chpl @@ -43,11 +43,9 @@ proc CUDAWrapper(lo: int, hi: int, N: int) { } ref lA = A.localSlice(lo .. hi); ref lB = B.localSlice(lo .. hi); - writeln("localSlice Size:", lA.size); - ProfilerStart(); - var dA: c_void_ptr; - var dB: c_void_ptr; - var size: size_t = (lA.size * 4): size_t; + if (verbose) { ProfilerStart(); } + var dA, dB: c_void_ptr; + var size: size_t = (lA.size:size_t * c_sizeof(lA.eltType)); Malloc(dA, size); Malloc(dB, size); Memcpy(dB, c_ptrTo(lB), size, 0); @@ -56,9 +54,7 @@ proc CUDAWrapper(lo: int, hi: int, N: int) { Memcpy(c_ptrTo(lA), dA, size, 1); Free(dA); Free(dB); - ProfilerStop(); - - //vcCUDA(lA, lB, 0, hi-lo, N); + if (verbose) { ProfilerStop(); } } //////////////////////////////////////////////////////////////////////////////// diff --git a/apps/vector_copy/vc.hybrid.dist.explicit.mid.chpl b/apps/vector_copy/vc.hybrid.dist.mid.chpl similarity index 93% rename from apps/vector_copy/vc.hybrid.dist.explicit.mid.chpl rename to apps/vector_copy/vc.hybrid.dist.mid.chpl index e17cf1f..582e19e 100644 --- a/apps/vector_copy/vc.hybrid.dist.explicit.mid.chpl +++ b/apps/vector_copy/vc.hybrid.dist.mid.chpl @@ -43,18 +43,15 @@ proc CUDAWrapper(lo: int, hi: int, N: int) { } ref lA = A.localSlice(lo .. hi); ref lB = B.localSlice(lo .. 
hi); - writeln("localSlice Size:", lA.size); - ProfilerStart(); - var dA = new GPUArray(lA, h2d=false, d2h=true); - var dB = new GPUArray(lB, h2d=true, d2h=false); - toDevice(dA, dB); + if (verbose) { ProfilerStart(); } + var dA = new GPUArray(lA); + var dB = new GPUArray(lB); + toDevice(dB); LaunchVC(dA.dPtr(), dB.dPtr(), N: size_t); DeviceSynchronize(); - fromDevice(dA, dB); + fromDevice(dA); free(dA, dB); - ProfilerStop(); - - //vcCUDA(lA, lB, 0, hi-lo, N); + if (verbose) { ProfilerStop(); } } //////////////////////////////////////////////////////////////////////////////// diff --git a/src/GPUAPI.chpl b/src/GPUAPI.chpl index 480fd86..93f29ba 100644 --- a/src/GPUAPI.chpl +++ b/src/GPUAPI.chpl @@ -41,10 +41,10 @@ module GPUAPI { var size: size_t; var sizeInBytes: size_t; - proc init(ref arr, h2d, d2h) { + proc init(ref arr) { // Properties - this.h2d = h2d; - this.d2h = d2h; + this.h2d = true; + this.d2h = true; // Low-level info this.devPtr = nil; this.hosPtr = c_ptrTo(arr); From b54329ef09d9955070586beeb22523051adeebcf Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 14 May 2020 13:27:01 -0400 Subject: [PATCH 035/118] Update vc.hybrid.dist.mid.chpl --- apps/vector_copy/vc.hybrid.dist.mid.chpl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/vector_copy/vc.hybrid.dist.mid.chpl b/apps/vector_copy/vc.hybrid.dist.mid.chpl index 582e19e..23a041e 100644 --- a/apps/vector_copy/vc.hybrid.dist.mid.chpl +++ b/apps/vector_copy/vc.hybrid.dist.mid.chpl @@ -46,10 +46,10 @@ proc CUDAWrapper(lo: int, hi: int, N: int) { if (verbose) { ProfilerStart(); } var dA = new GPUArray(lA); var dB = new GPUArray(lB); - toDevice(dB); + dB.toDevice(); LaunchVC(dA.dPtr(), dB.dPtr(), N: size_t); DeviceSynchronize(); - fromDevice(dA); + dA.fromDevice(); free(dA, dB); if (verbose) { ProfilerStop(); } } From 0b3ef62d67cc22b26bf73bc3b3302d5b1297256e Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 14 May 2020 15:44:58 -0400 Subject: [PATCH 036/118] Add 
MID version of bS --- ...plicit.chpl => bs.hybrid.dist.lowmid.chpl} | 15 +- apps/blackscholes/bs.hybrid.dist.mid.chpl | 200 ++++++++++++++++++ 2 files changed, 204 insertions(+), 11 deletions(-) rename apps/blackscholes/{bs.hybrid.dist.explicit.chpl => bs.hybrid.dist.lowmid.chpl} (97%) create mode 100644 apps/blackscholes/bs.hybrid.dist.mid.chpl diff --git a/apps/blackscholes/bs.hybrid.dist.explicit.chpl b/apps/blackscholes/bs.hybrid.dist.lowmid.chpl similarity index 97% rename from apps/blackscholes/bs.hybrid.dist.explicit.chpl rename to apps/blackscholes/bs.hybrid.dist.lowmid.chpl index 25198a3..2528fa3 100644 --- a/apps/blackscholes/bs.hybrid.dist.explicit.chpl +++ b/apps/blackscholes/bs.hybrid.dist.lowmid.chpl @@ -39,12 +39,9 @@ proc CUDAWrapper(lo: int, hi: int, N: int) { ref lrand = rand.localSlice(lo .. hi); ref lput = put.localSlice(lo .. hi); ref lcall = call.localSlice(lo .. hi); - - ProfilerStart(); - var drand: c_void_ptr; - var dput: c_void_ptr; - var dcall: c_void_ptr; - var size: size_t = (lrand.size:size_t * c_sizeof(lrand.eltType)) : size_t; + if (verbose) { ProfilerStart(); } + var drand, dput, dcall: c_void_ptr; + var size: size_t = (lrand.size:size_t * c_sizeof(lrand.eltType)); Malloc(drand, size); Malloc(dput, size); Malloc(dcall, size); @@ -53,14 +50,10 @@ proc CUDAWrapper(lo: int, hi: int, N: int) { DeviceSynchronize(); Memcpy(c_ptrTo(lput), dput, size, 1); Memcpy(c_ptrTo(lcall), dcall, size, 1); - Free(drand); Free(dput); Free(dcall); - - ProfilerStop(); - - // bsCUDA(lrand, lput, lcall, 0, hi-lo, N); + if (verbose) { ProfilerStop(); } } //////////////////////////////////////////////////////////////////////////////// diff --git a/apps/blackscholes/bs.hybrid.dist.mid.chpl b/apps/blackscholes/bs.hybrid.dist.mid.chpl new file mode 100644 index 0000000..b42e6e4 --- /dev/null +++ b/apps/blackscholes/bs.hybrid.dist.mid.chpl @@ -0,0 +1,200 @@ +use Time; + +//////////////////////////////////////////////////////////////////////////////// +/// 
GPUIterator +//////////////////////////////////////////////////////////////////////////////// +use GPUIterator; + +//////////////////////////////////////////////////////////////////////////////// +/// Runtime Options +//////////////////////////////////////////////////////////////////////////////// +config const n = 32: int; +config const CPUratio = 0: int; +config const numTrials = 1: int; +config const output = 0: int; +config param verbose = false; + +//////////////////////////////////////////////////////////////////////////////// +/// Global Arrays +//////////////////////////////////////////////////////////////////////////////// +// For now, these arrays are global so the arrays can be seen from CUDAWrapper +// TODO: Explore the possiblity of declaring the arrays and CUDAWrapper +// in the main proc (e.g., by using lambdas) +var D: domain(1) dmapped Block(boundingBox = {1..n}) = {1..n}; +var rand: [D] real(32); +var put: [D] real(32); +var call: [D] real(32); + +//////////////////////////////////////////////////////////////////////////////// +/// C Interoperability +//////////////////////////////////////////////////////////////////////////////// +extern proc LaunchBS(drand: c_void_ptr, dput: c_void_ptr, dcall: c_void_ptr, N: size_t); + +// CUDAWrapper is called from GPUIterator +// to invoke a specific CUDA program (using C interoperability) +proc CUDAWrapper(lo: int, hi: int, N: int) { + if (verbose) { + writeln("In CUDAWrapper(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, ")"); + } + ref lrand = rand.localSlice(lo .. hi); + ref lput = put.localSlice(lo .. hi); + ref lcall = call.localSlice(lo .. 
hi); + if (verbose) { ProfilerStart(); } + var drand = new GPUArray(lrand); + var dput = new GPUArray(lput); + var dcall = new GPUArray(lcall); + toDevice(drand); + LaunchBS(drand.dPtr(), dput.dPtr(), dcall.dPtr(), N:size_t); + DeviceSynchronize(); + fromDevice(dput, dcall); + free(drand, dput, dcall); + if (verbose) { ProfilerStop(); } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Utility Functions +//////////////////////////////////////////////////////////////////////////////// +proc printResults(execTimes) { + const totalTime = + reduce execTimes, + avgTime = totalTime / numTrials, + minTime = min reduce execTimes; + writeln("Execution time:"); + writeln(" tot = ", totalTime); + writeln(" avg = ", avgTime); + writeln(" min = ", minTime); +} + +proc printLocaleInfo() { + for loc in Locales { + writeln(loc, " info: "); + const numSublocs = loc.getChildCount(); + if (numSublocs != 0) { + for sublocID in 0..#numSublocs { + const subloc = loc.getChild(sublocID); + writeln("\t Subloc: ", sublocID); + writeln("\t Name: ", subloc); + writeln("\t maxTaskPar: ", subloc.maxTaskPar); + } + } else { + writeln("\t Name: ", loc); + writeln("\t maxTaskPar: ", loc.maxTaskPar); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Chapel main +//////////////////////////////////////////////////////////////////////////////// +proc main() { + writeln("BlackScholes: CPU/GPU Execution (using GPUIterator)"); + writeln("Size: ", n); + writeln("nTrials: ", numTrials); + writeln("output: ", output); + + printLocaleInfo(); + + const S_LOWER_LIMIT = 10.0: real(32); + const S_UPPER_LIMIT = 100.0: real(32); + const K_LOWER_LIMIT = 10.0: real(32); + const K_UPPER_LIMIT = 100.0: real(32); + const T_LOWER_LIMIT = 1.0: real(32); + const T_UPPER_LIMIT = 10.0: real(32); + const R_LOWER_LIMIT = 0.01: real(32); + const R_UPPER_LIMIT = 0.05: real(32); + const SIGMA_LOWER_LIMIT = 0.01: real(32); + const 
SIGMA_UPPER_LIMIT = 0.10: real(32); + + var execTimes: [1..numTrials] real; + for trial in 1..numTrials { + for i in 1..n { + rand(i) = (i: real(32) / n): real(32); + } + + const startTime = getCurrentTime(); + forall i in GPU(D, CUDAWrapper, CPUratio) { + var c1 = 0.319381530: real(32); + var c2 = -0.356563782: real(32); + var c3 = 1.781477937: real(32); + var c4 = -1.821255978: real(32); + var c5 = 1.330274429: real(32); + + var zero = 0.0: real(32); + var one = 1.0: real(32); + var two = 2.0: real(32); + var temp4 = 0.2316419: real(32); + + var oneBySqrt2pi = 0.398942280: real(32); + + var inRand = rand(i); + + var S = S_LOWER_LIMIT * inRand + S_UPPER_LIMIT * (1.0 - inRand); + var K = K_LOWER_LIMIT * inRand + K_UPPER_LIMIT * (1.0 - inRand); + var T = T_LOWER_LIMIT * inRand + T_UPPER_LIMIT * (1.0 - inRand); + var R = R_LOWER_LIMIT * inRand + R_UPPER_LIMIT * (1.0 - inRand); + var sigmaVal = SIGMA_LOWER_LIMIT * inRand + SIGMA_UPPER_LIMIT * (1.0 - inRand); + + var sigmaSqrtT = sigmaVal * sqrt(T); + + var d1 = (log(S / K) + (R + sigmaVal * sigmaVal / two) * T) / sigmaSqrtT; + var d2 = d1 - sigmaSqrtT; + + var KexpMinusRT = K * exp(-R * T); + + var phiD1, phiD2: real(32); + + // phiD1 = phi(d1) + var X = d1; + var absX = abs(X); + var t = one / (one + temp4 * absX); + var y = one - oneBySqrt2pi * Math.exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); + if (X < zero) { + phiD1 = one - y; + } else { + phiD1 = y; + } + // phiD2 = phi(d2) + X = d2; + absX = Math.abs(X); + t = one / (one + temp4 * absX); + y = one - oneBySqrt2pi * exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); + if (X < zero) { + phiD2 = one - y; + } else { + phiD2 = y; + } + + call(i) = S * phiD1 - KexpMinusRT * phiD2; + + // phiD1 = phi(-d1); + X = -d1; + absX = Math.abs(X); + t = one / (one + temp4 * absX); + y = one - oneBySqrt2pi * exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); + if (X < zero) { + phiD1 = one - y; + } else { + 
phiD1 = y; + } + + // phiD2 = phi(-d2); + X = -d2; + absX = Math.abs(X); + t = one / (one + temp4 * absX); + y = one - oneBySqrt2pi * exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); + if (X < zero) { + phiD2 = one - y; + } else { + phiD2 = y; + } + + put(i) = KexpMinusRT * phiD2 - S * phiD1; + } + execTimes(trial) = getCurrentTime() - startTime; + if (output) { + writeln("call: ", call); + writeln(""); + writeln("put: ", put); + } + } + printResults(execTimes); +} From fba467c5a8cd2e2e21181f771b513b8b62b1a577 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 14 May 2020 15:58:39 -0400 Subject: [PATCH 037/118] Add MID version of LR --- ...plicit.chpl => lr.hybrid.dist.lowmid.chpl} | 18 +- .../lr.hybrid.dist.mid.chpl | 186 ++++++++++++++++++ 2 files changed, 190 insertions(+), 14 deletions(-) rename apps/logisticregression/{lr.hybrid.dist.explicit.chpl => lr.hybrid.dist.lowmid.chpl} (95%) create mode 100644 apps/logisticregression/lr.hybrid.dist.mid.chpl diff --git a/apps/logisticregression/lr.hybrid.dist.explicit.chpl b/apps/logisticregression/lr.hybrid.dist.lowmid.chpl similarity index 95% rename from apps/logisticregression/lr.hybrid.dist.explicit.chpl rename to apps/logisticregression/lr.hybrid.dist.lowmid.chpl index b837ef2..78e6437 100644 --- a/apps/logisticregression/lr.hybrid.dist.explicit.chpl +++ b/apps/logisticregression/lr.hybrid.dist.lowmid.chpl @@ -61,33 +61,23 @@ proc CUDAWrapper2(lo: int, hi: int, N: int) { writeln("In CUDAWrapper2(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, ")"); } ref lW = W.localSlice(lo .. 
hi); - ProfilerStart(); - var dX: c_void_ptr; - var dY: c_void_ptr; - var dWcurr: c_void_ptr; - var dW: c_void_ptr; - writeln("X.size: ", X.size, " Y.size: ", Y.size); + if (verbose) { ProfilerStart(); } + var dX, dY, dWcurr, dW: c_void_ptr; Malloc(dX, X.size:size_t * c_sizeof(X.eltType)); Malloc(dY, Y.size:size_t * c_sizeof(Y.eltType)); Malloc(dWcurr, Wcurr.size:size_t * c_sizeof(Wcurr.eltType)); Malloc(dW, lW.size:size_t * c_sizeof(lW.eltType)); - Memcpy(dX, c_ptrTo(X), X.size:size_t * c_sizeof(X.eltType), 0); Memcpy(dY, c_ptrTo(Y), Y.size:size_t * c_sizeof(Y.eltType), 0); Memcpy(dWcurr, c_ptrTo(Wcurr), Wcurr.size:size_t * c_sizeof(Wcurr.eltType), 0); - LaunchLR(dX, dY, dW, dWcurr, alpha, nSamples, nFeatures, lo, hi, N); DeviceSynchronize(); - Memcpy(c_ptrTo(lW), dW, lW.size:size_t * c_sizeof(lW.eltType), 1); - Free(dX); Free(dY); Free(dW); - Free(dWcurr); - - ProfilerStop(); - //lrCUDA2(X, Y, lW, Wcurr, alpha, nSamples, nFeatures, lo, hi, N); + Free(dWcurr); + if (verbose) { ProfilerStop(); } } //////////////////////////////////////////////////////////////////////////////// diff --git a/apps/logisticregression/lr.hybrid.dist.mid.chpl b/apps/logisticregression/lr.hybrid.dist.mid.chpl new file mode 100644 index 0000000..987cace --- /dev/null +++ b/apps/logisticregression/lr.hybrid.dist.mid.chpl @@ -0,0 +1,186 @@ +use Time; +use ReplicatedDist; +//////////////////////////////////////////////////////////////////////////////// +/// GPUIterator +//////////////////////////////////////////////////////////////////////////////// +use GPUIterator; +use GPUAPI; +use BlockDist; +use SysCTypes; + +//////////////////////////////////////////////////////////////////////////////// +/// Runtime Options +//////////////////////////////////////////////////////////////////////////////// +config const nFeatures = 32: int; +config const nSamples = 32: int; +config const nIters = 32: int; +config const CPUPercent1 = 0: int; +config const CPUPercent2 = 0: int; +config const numTrials = 1: 
int; +config const output = 0: int; +config param verbose = false; + +//////////////////////////////////////////////////////////////////////////////// +/// Global Arrays +//////////////////////////////////////////////////////////////////////////////// +// For now, these arrays are global so the arrays can be seen from CUDAWrapper +// TODO: Explore the possiblity of declaring the arrays and CUDAWrapper +// in the main proc (e.g., by using lambdas) +const Space1 = {1..nSamples, 1..nFeatures}; +const ReplicatedSpace1 = Space1 dmapped Replicated(); +var X: [ReplicatedSpace1] real(32); + +const Space2 = {1..nSamples}; +const ReplicatedSpace2 = Space2 dmapped Replicated(); +var Y: [ReplicatedSpace2] real(32); + +const Space3 = {1..nFeatures}; +const ReplicatedSpace3 = Space3 dmapped Replicated(); +var Wcurr: [ReplicatedSpace3] real(32); + +var D: domain(1) dmapped Block(boundingBox = {1..nFeatures}) = {1..nFeatures}; +var W: [D] real(32); +var alpha = 0.1 : real(32); + +//////////////////////////////////////////////////////////////////////////////// +/// C Interoperability +//////////////////////////////////////////////////////////////////////////////// +extern proc LaunchLR(X: c_void_ptr, Y: c_void_ptr, W: c_void_ptr, Wcurr: c_void_ptr, alpha: real(32), nSamples: int, nFeatures: int, lo: int, hi: int, N: int); + +// CUDAWrapper is called from GPUIterator +// to invoke a specific CUDA program (using C interoperability) +proc CUDAWrapper1(lo: int, hi: int, N: int) { + if (verbose) { + writeln("In CUDAWrapper1(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, ")"); + } + lrCUDA1(W, Wcurr, lo, hi, N); +} + +proc CUDAWrapper2(lo: int, hi: int, N: int) { + if (verbose) { + writeln("In CUDAWrapper2(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, ")"); + } + ref lW = W.localSlice(lo .. 
hi); + if (verbose) { ProfilerStart(); } + var dX = new GPUArray(X); + var dY = new GPUArray(Y); + var dWcurr = new GPUArray(Wcurr); + var dW = new GPUArray(lW); + toDevice(dX, dY, dWcurr); + LaunchLR(dX.dPtr(), dY.dPtr(), dW.dPtr(), dWcurr.dPtr(), alpha, nSamples, nFeatures, lo, hi, N); + DeviceSynchronize(); + fromDevice(dW); + free(dX, dY, dW, dWcurr); + if (verbose) { ProfilerStop(); } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Utility Functions +//////////////////////////////////////////////////////////////////////////////// +proc printResults(execTimes) { + const totalTime = + reduce execTimes, + avgTime = totalTime / numTrials, + minTime = min reduce execTimes; + writeln("Execution time:"); + writeln(" tot = ", totalTime); + writeln(" avg = ", avgTime); + writeln(" min = ", minTime); +} + +proc printLocaleInfo() { + for loc in Locales { + writeln(loc, " info: "); + const numSublocs = loc.getChildCount(); + if (numSublocs != 0) { + for sublocID in 0..#numSublocs { + const subloc = loc.getChild(sublocID); + writeln("\t Subloc: ", sublocID); + writeln("\t Name: ", subloc); + writeln("\t maxTaskPar: ", subloc.maxTaskPar); + } + } else { + writeln("\t Name: ", loc); + writeln("\t maxTaskPar: ", loc.maxTaskPar); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Chapel main +//////////////////////////////////////////////////////////////////////////////// +proc main() { + writeln("Logistic Regression: CPU/GPU Execution (using GPUIterator)"); + writeln("nSamples :", nSamples, " nFeatures :", nFeatures); + writeln("CPU Percent1: ", CPUPercent1); + writeln("CPU Percent2: ", CPUPercent2); + writeln("nTrials: ", numTrials); + writeln("output: ", output); + + printLocaleInfo(); + + var execTimes: [1..numTrials] real; + for trial in 1..numTrials { + if (false) { + forall i in D { + W(i) = 0: real(32); + } + coforall loc in Locales do on loc { + for i in 1..nSamples { + Y(i) = 
(i % 2): real(32); + for j in 1..nFeatures { + if (j != 0) { + X(i, j) = (i % 2): real(32); + } else { + X(i, j) = 1; + } + } + } + } + } else { + forall i in D { + W(i) = 0: real(32); + } + coforall loc in Locales do on loc { + for i in 1..nSamples { + Y(i) = i: real(32); + for j in 1..nFeatures { + if (j != 0) { + X(i, j) = j: real(32); + } else { + X(i, j) = j : real(32); + } + } + } + } + } + + const startTime = getCurrentTime(); + for ite in 1..nIters { + coforall loc in Locales { + on loc { + Wcurr = W; + } + } + const start = getCurrentTime(); + forall i in GPU(D, CUDAWrapper2, CPUPercent2) { + //forall i in D { + var err = 0: real(32); + for s in 1..nSamples { + var arg = 0: real(32); + for f in 1..nFeatures { + arg += Wcurr(f) * X(s, f); + } + var hypo = 1 / (1 + exp(-arg)); + err += (hypo - Y(s)) * X(s, i); + } + W(i) = Wcurr(i) - alpha * err; + } + writeln(getCurrentTime() - start, " sec"); + } + execTimes(trial) = getCurrentTime() - startTime; + if (output) { + writeln(W); + } + } + printResults(execTimes); +} From cf65b0ee9f7c14b8190402ae4babfe38c1c52e50 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 14 May 2020 17:31:59 -0400 Subject: [PATCH 038/118] Update LOW, LOWMID, MID version of LR --- apps/logisticregression/lr.hybrid.dist.chpl | 19 ++++--------------- .../lr.hybrid.dist.lowmid.chpl | 18 ++++-------------- .../lr.hybrid.dist.mid.chpl | 18 ++++-------------- 3 files changed, 12 insertions(+), 43 deletions(-) diff --git a/apps/logisticregression/lr.hybrid.dist.chpl b/apps/logisticregression/lr.hybrid.dist.chpl index ede9c57..d324930 100644 --- a/apps/logisticregression/lr.hybrid.dist.chpl +++ b/apps/logisticregression/lr.hybrid.dist.chpl @@ -11,8 +11,7 @@ use GPUIterator; config const nFeatures = 32: int; config const nSamples = 32: int; config const nIters = 32: int; -config const CPUPercent1 = 0: int; -config const CPUPercent2 = 0: int; +config const CPUPercent = 0: int; config const numTrials = 1: int; config const output = 0: int; 
config param verbose = false; @@ -42,19 +41,11 @@ var alpha = 0.1 : real(32); //////////////////////////////////////////////////////////////////////////////// /// C Interoperability //////////////////////////////////////////////////////////////////////////////// -extern proc lrCUDA1(W: [] real(32), Wcurr: [] real(32), lo: int, hi: int, N: int); extern proc lrCUDA2(X: [] real(32), Y: [] real(32), W: [] real(32), Wcurr: [] real(32), alpha: real(32), nSamples: int, nFeatures: int, lo: int, hi: int, N: int); // CUDAWrapper is called from GPUIterator // to invoke a specific CUDA program (using C interoperability) -proc CUDAWrapper1(lo: int, hi: int, N: int) { - if (verbose) { - writeln("In CUDAWrapper1(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, ")"); - } - lrCUDA1(W, Wcurr, lo, hi, N); -} - -proc CUDAWrapper2(lo: int, hi: int, N: int) { +proc CUDAWrapper(lo: int, hi: int, N: int) { //if (verbose) { writeln("In CUDAWrapper2(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, ")"); //} @@ -99,8 +90,7 @@ proc printLocaleInfo() { proc main() { writeln("Logistic Regression: CPU/GPU Execution (using GPUIterator)"); writeln("nSamples :", nSamples, " nFeatures :", nFeatures); - writeln("CPU Percent1: ", CPUPercent1); - writeln("CPU Percent2: ", CPUPercent2); + writeln("CPU Percent: ", CPUPercent); writeln("nTrials: ", numTrials); writeln("output: ", output); @@ -150,8 +140,7 @@ proc main() { } } const start = getCurrentTime(); - forall i in GPU(D, CUDAWrapper2, CPUPercent2) { - //forall i in D { + forall i in GPU(D, CUDAWrapper, CPUPercent) { var err = 0: real(32); for s in 1..nSamples { var arg = 0: real(32); diff --git a/apps/logisticregression/lr.hybrid.dist.lowmid.chpl b/apps/logisticregression/lr.hybrid.dist.lowmid.chpl index 78e6437..261e1e8 100644 --- a/apps/logisticregression/lr.hybrid.dist.lowmid.chpl +++ b/apps/logisticregression/lr.hybrid.dist.lowmid.chpl @@ -14,8 +14,7 @@ use SysCTypes; config const 
nFeatures = 32: int; config const nSamples = 32: int; config const nIters = 32: int; -config const CPUPercent1 = 0: int; -config const CPUPercent2 = 0: int; +config const CPUPercent = 0: int; config const numTrials = 1: int; config const output = 0: int; config param verbose = false; @@ -49,14 +48,7 @@ extern proc LaunchLR(X: c_void_ptr, Y: c_void_ptr, W: c_void_ptr, Wcurr: c_void_ // CUDAWrapper is called from GPUIterator // to invoke a specific CUDA program (using C interoperability) -proc CUDAWrapper1(lo: int, hi: int, N: int) { - if (verbose) { - writeln("In CUDAWrapper1(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, ")"); - } - lrCUDA1(W, Wcurr, lo, hi, N); -} - -proc CUDAWrapper2(lo: int, hi: int, N: int) { +proc CUDAWrapper(lo: int, hi: int, N: int) { if (verbose) { writeln("In CUDAWrapper2(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, ")"); } @@ -117,8 +109,7 @@ proc printLocaleInfo() { proc main() { writeln("Logistic Regression: CPU/GPU Execution (using GPUIterator)"); writeln("nSamples :", nSamples, " nFeatures :", nFeatures); - writeln("CPU Percent1: ", CPUPercent1); - writeln("CPU Percent2: ", CPUPercent2); + writeln("CPU Percent: ", CPUPercent); writeln("nTrials: ", numTrials); writeln("output: ", output); @@ -168,8 +159,7 @@ proc main() { } } const start = getCurrentTime(); - forall i in GPU(D, CUDAWrapper2, CPUPercent2) { - //forall i in D { + forall i in GPU(D, CUDAWrapper, CPUPercent) { var err = 0: real(32); for s in 1..nSamples { var arg = 0: real(32); diff --git a/apps/logisticregression/lr.hybrid.dist.mid.chpl b/apps/logisticregression/lr.hybrid.dist.mid.chpl index 987cace..bee2a64 100644 --- a/apps/logisticregression/lr.hybrid.dist.mid.chpl +++ b/apps/logisticregression/lr.hybrid.dist.mid.chpl @@ -14,8 +14,7 @@ use SysCTypes; config const nFeatures = 32: int; config const nSamples = 32: int; config const nIters = 32: int; -config const CPUPercent1 = 0: int; -config const CPUPercent2 
= 0: int; +config const CPUPercent = 0: int; config const numTrials = 1: int; config const output = 0: int; config param verbose = false; @@ -49,14 +48,7 @@ extern proc LaunchLR(X: c_void_ptr, Y: c_void_ptr, W: c_void_ptr, Wcurr: c_void_ // CUDAWrapper is called from GPUIterator // to invoke a specific CUDA program (using C interoperability) -proc CUDAWrapper1(lo: int, hi: int, N: int) { - if (verbose) { - writeln("In CUDAWrapper1(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, ")"); - } - lrCUDA1(W, Wcurr, lo, hi, N); -} - -proc CUDAWrapper2(lo: int, hi: int, N: int) { +proc CUDAWrapper(lo: int, hi: int, N: int) { if (verbose) { writeln("In CUDAWrapper2(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, ")"); } @@ -111,8 +103,7 @@ proc printLocaleInfo() { proc main() { writeln("Logistic Regression: CPU/GPU Execution (using GPUIterator)"); writeln("nSamples :", nSamples, " nFeatures :", nFeatures); - writeln("CPU Percent1: ", CPUPercent1); - writeln("CPU Percent2: ", CPUPercent2); + writeln("CPU Percent: ", CPUPercent); writeln("nTrials: ", numTrials); writeln("output: ", output); @@ -162,8 +153,7 @@ proc main() { } } const start = getCurrentTime(); - forall i in GPU(D, CUDAWrapper2, CPUPercent2) { - //forall i in D { + forall i in GPU(D, CUDAWrapper, CPUPercent) { var err = 0: real(32); for s in 1..nSamples { var arg = 0: real(32); From 316f5a1ae9bbd4c93662241e7c9ea26c095f9a9a Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 14 May 2020 17:52:37 -0400 Subject: [PATCH 039/118] Update Makefile --- apps/Makefile | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/apps/Makefile b/apps/Makefile index 214d1c0..d1c3908 100644 --- a/apps/Makefile +++ b/apps/Makefile @@ -1,22 +1,25 @@ -# Flags for the Chapel compiler +# Chapel CHPLFLAGS=--fast -sverbose CHPLMODULE=../../src GPUAPIFLAGS=-sdebugGPUAPI -sdebugGPUIterator $(CHPLMODULE)/GPUAPI.h GPUAPI.o -# For CUDA +# 
CUDA +CUDA_HOME?=/usr/local/cuda +CUDA_SM?=sm_70 CUDALIBSFLAGS=-L$(CUDA_HOME)/lib64 -lcudart -lcuda -lcublas -#CUDALIBSFLAGS=-L/sw/summit/cuda/10.1.168/lib64 -lcudart -lcuda -#CUDALIBSFLAGS=-L/opt/apps/software/Compiler/GCC/6.4.0/CUDA/8.0.61/lib -lcudart -lcublas -NVCCFLAGS=-O3 -arch sm_37 -std=c++11 --extended-lambda -I$(CHPLMODULE) +NVCCFLAGS=-O3 -arch $(CUDA_SM) -std=c++11 --extended-lambda -I$(CHPLMODULE) +$(info CUDA_HOME is $(CUDA_HOME)) +$(info CUDA_SM is $(CUDA_SM)) -# For OpenCL -OCLLIBSFLAGS=-framework OpenCL -OCLFLAGS=-framework OpenCL - -# For HIP -ROCM_HOME=/opt/rocm +# ROCM +ROCM_HOME?=/opt/rocm HIP_HOME=$(ROCM_HOME)/hip HIPLIBSFLAGS=-L$(ROCM_HOME)/lib -lhip_hcc +$(info ROCM_HOME is $(ROCM_HOME)) + +# For OpenCL (MacOS) +OCLLIBSFLAGS=-framework OpenCL +OCLFLAGS=-framework OpenCL all: baseline cudagpu cudahybrid cudahybrid.dist @@ -81,4 +84,4 @@ hiphybrid: $(TARGET).cu $(TARGET).hybrid.chpl .PHONY: clean clean: - rm -f $(TARGET).baseline $(TARGET).gpu $(TARGET).hybrid $(TARGET).hybrid.dist $(TARGET).hybrid.dist.lowmid $(TARGET).hybrid.dist.mid $(TARGET).o *_real + rm -f $(TARGET).baseline $(TARGET).gpu $(TARGET).hybrid $(TARGET).hybrid.dist $(TARGET).hybrid.dist.lowmid $(TARGET).hybrid.dist.mid $(TARGET).o GPUAPI.o *_real From 5eb226525e9475684d5c2b8161d05fc7311b63bf Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 15 May 2020 15:12:13 -0400 Subject: [PATCH 040/118] Add vc.h --- apps/vector_copy/vc.h | 1 + 1 file changed, 1 insertion(+) create mode 100644 apps/vector_copy/vc.h diff --git a/apps/vector_copy/vc.h b/apps/vector_copy/vc.h new file mode 100644 index 0000000..9af412d --- /dev/null +++ b/apps/vector_copy/vc.h @@ -0,0 +1 @@ +void vcCUDA(float* A, float *B, int start, int end, int GPUN); From a8d2cf1109ee5e3765d081248379b1833e030071 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 15 May 2020 15:18:19 -0400 Subject: [PATCH 041/118] Add stream.h --- apps/stream/stream.h | 4 ++++ 1 file changed, 4 insertions(+) create mode 
100644 apps/stream/stream.h diff --git a/apps/stream/stream.h b/apps/stream/stream.h new file mode 100644 index 0000000..c3cfe5f --- /dev/null +++ b/apps/stream/stream.h @@ -0,0 +1,4 @@ +//extern "C" { + void LaunchStream(float *dA, float *dB, float *dC, float alpha, int N); + void streamCUDA(float* A, float *B, float *C, float alpha, int start, int end, int GPUN); +//} From a0021fa2822d25a2263f3ba14c40bfcc5fbd7635 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 15 May 2020 15:21:32 -0400 Subject: [PATCH 042/118] Add bs.h --- apps/blackscholes/bs.h | 1 + 1 file changed, 1 insertion(+) create mode 100644 apps/blackscholes/bs.h diff --git a/apps/blackscholes/bs.h b/apps/blackscholes/bs.h new file mode 100644 index 0000000..941e6ba --- /dev/null +++ b/apps/blackscholes/bs.h @@ -0,0 +1 @@ +void bsCUDA(float* rand, float *put, float *call, int start, int end, int GPUN); From 1302d5bd339082241386274a564ba66cc073e4d7 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 15 May 2020 15:55:54 -0400 Subject: [PATCH 043/118] Update VC (parallelize the input loop) --- apps/vector_copy/vc.hybrid.dist.chpl | 2 +- apps/vector_copy/vc.hybrid.dist.lowmid.chpl | 2 +- apps/vector_copy/vc.hybrid.dist.mid.chpl | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/vector_copy/vc.hybrid.dist.chpl b/apps/vector_copy/vc.hybrid.dist.chpl index 2b5199e..3e2fd17 100644 --- a/apps/vector_copy/vc.hybrid.dist.chpl +++ b/apps/vector_copy/vc.hybrid.dist.chpl @@ -85,7 +85,7 @@ proc main() { var execTimes: [1..numTrials] real; for trial in 1..numTrials { - for i in 1..n { + forall i in D { A(i) = 0: real(32); B(i) = i: real(32); } diff --git a/apps/vector_copy/vc.hybrid.dist.lowmid.chpl b/apps/vector_copy/vc.hybrid.dist.lowmid.chpl index 1a2c5e1..db5f864 100644 --- a/apps/vector_copy/vc.hybrid.dist.lowmid.chpl +++ b/apps/vector_copy/vc.hybrid.dist.lowmid.chpl @@ -102,7 +102,7 @@ proc main() { var execTimes: [1..numTrials] real; for trial in 1..numTrials { - for i in 
1..n { + forall i in D { A(i) = 0: real(32); B(i) = i: real(32); } diff --git a/apps/vector_copy/vc.hybrid.dist.mid.chpl b/apps/vector_copy/vc.hybrid.dist.mid.chpl index 23a041e..0eee5cd 100644 --- a/apps/vector_copy/vc.hybrid.dist.mid.chpl +++ b/apps/vector_copy/vc.hybrid.dist.mid.chpl @@ -99,7 +99,7 @@ proc main() { var execTimes: [1..numTrials] real; for trial in 1..numTrials { - for i in 1..n { + forall i in D { A(i) = 0: real(32); B(i) = i: real(32); } From 31e416262f1795bb8eac7fd06d45caaa127a5080 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 15 May 2020 16:01:06 -0400 Subject: [PATCH 044/118] Update VC --- apps/vector_copy/vc.hybrid.dist.chpl | 1 + apps/vector_copy/vc.hybrid.dist.lowmid.chpl | 1 + apps/vector_copy/vc.hybrid.dist.mid.chpl | 1 + 3 files changed, 3 insertions(+) diff --git a/apps/vector_copy/vc.hybrid.dist.chpl b/apps/vector_copy/vc.hybrid.dist.chpl index 3e2fd17..3a803ec 100644 --- a/apps/vector_copy/vc.hybrid.dist.chpl +++ b/apps/vector_copy/vc.hybrid.dist.chpl @@ -78,6 +78,7 @@ proc main() { writeln("Vector Copy: CPU/GPU Execution (using GPUIterator)"); writeln("Size: ", n); writeln("CPU ratio: ", CPUratio); + writeln("nGPUs: ", nGPUs); writeln("nTrials: ", numTrials); writeln("output: ", output); diff --git a/apps/vector_copy/vc.hybrid.dist.lowmid.chpl b/apps/vector_copy/vc.hybrid.dist.lowmid.chpl index db5f864..59ebf9d 100644 --- a/apps/vector_copy/vc.hybrid.dist.lowmid.chpl +++ b/apps/vector_copy/vc.hybrid.dist.lowmid.chpl @@ -95,6 +95,7 @@ proc main() { writeln("Vector Copy: CPU/GPU Execution (using GPUIterator)"); writeln("Size: ", n); writeln("CPU ratio: ", CPUratio); + writeln("nGPUs: ", nGPUs); writeln("nTrials: ", numTrials); writeln("output: ", output); diff --git a/apps/vector_copy/vc.hybrid.dist.mid.chpl b/apps/vector_copy/vc.hybrid.dist.mid.chpl index 0eee5cd..224cfb1 100644 --- a/apps/vector_copy/vc.hybrid.dist.mid.chpl +++ b/apps/vector_copy/vc.hybrid.dist.mid.chpl @@ -92,6 +92,7 @@ proc main() { writeln("Vector 
Copy: CPU/GPU Execution (using GPUIterator)"); writeln("Size: ", n); writeln("CPU ratio: ", CPUratio); + writeln("nGPUs: ", nGPUs); writeln("nTrials: ", numTrials); writeln("output: ", output); From bfe4f7a4d9f03549381554c2f63e1672e9caf4d6 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 15 May 2020 16:20:31 -0400 Subject: [PATCH 045/118] Update BS, LR, MM, and STREAM --- apps/blackscholes/bs.hybrid.dist.chpl | 4 +++- apps/blackscholes/bs.hybrid.dist.lowmid.chpl | 4 +++- apps/blackscholes/bs.hybrid.dist.mid.chpl | 4 +++- apps/logisticregression/lr.hybrid.dist.chpl | 5 +++-- apps/logisticregression/lr.hybrid.dist.lowmid.chpl | 5 +++-- apps/logisticregression/lr.hybrid.dist.mid.chpl | 5 +++-- apps/mm/mm.hybrid.dist.chpl | 5 +++-- apps/stream/stream.hybrid.dist.chpl | 3 ++- 8 files changed, 23 insertions(+), 12 deletions(-) diff --git a/apps/blackscholes/bs.hybrid.dist.chpl b/apps/blackscholes/bs.hybrid.dist.chpl index f486fb0..da89c30 100644 --- a/apps/blackscholes/bs.hybrid.dist.chpl +++ b/apps/blackscholes/bs.hybrid.dist.chpl @@ -79,6 +79,8 @@ proc printLocaleInfo() { proc main() { writeln("BlackScholes: CPU/GPU Execution (using GPUIterator)"); writeln("Size: ", n); + writeln("CPU ratio: ", CPUratio); + writeln("nGPUs: ", nGPUs); writeln("nTrials: ", numTrials); writeln("output: ", output); @@ -97,7 +99,7 @@ proc main() { var execTimes: [1..numTrials] real; for trial in 1..numTrials { - for i in 1..n { + forall i in D { rand(i) = (i: real(32) / n): real(32); } diff --git a/apps/blackscholes/bs.hybrid.dist.lowmid.chpl b/apps/blackscholes/bs.hybrid.dist.lowmid.chpl index 2528fa3..10d3ec5 100644 --- a/apps/blackscholes/bs.hybrid.dist.lowmid.chpl +++ b/apps/blackscholes/bs.hybrid.dist.lowmid.chpl @@ -93,6 +93,8 @@ proc printLocaleInfo() { proc main() { writeln("BlackScholes: CPU/GPU Execution (using GPUIterator)"); writeln("Size: ", n); + writeln("CPU ratio: ", CPUratio); + writeln("nGPUs: ", nGPUs); writeln("nTrials: ", numTrials); writeln("output: ", 
output); @@ -111,7 +113,7 @@ proc main() { var execTimes: [1..numTrials] real; for trial in 1..numTrials { - for i in 1..n { + forall i in D { rand(i) = (i: real(32) / n): real(32); } diff --git a/apps/blackscholes/bs.hybrid.dist.mid.chpl b/apps/blackscholes/bs.hybrid.dist.mid.chpl index b42e6e4..84c32fe 100644 --- a/apps/blackscholes/bs.hybrid.dist.mid.chpl +++ b/apps/blackscholes/bs.hybrid.dist.mid.chpl @@ -88,6 +88,8 @@ proc printLocaleInfo() { proc main() { writeln("BlackScholes: CPU/GPU Execution (using GPUIterator)"); writeln("Size: ", n); + writeln("CPU ratio: ", CPUratio); + writeln("nGPUs: ", nGPUs); writeln("nTrials: ", numTrials); writeln("output: ", output); @@ -106,7 +108,7 @@ proc main() { var execTimes: [1..numTrials] real; for trial in 1..numTrials { - for i in 1..n { + forall i in D { rand(i) = (i: real(32) / n): real(32); } diff --git a/apps/logisticregression/lr.hybrid.dist.chpl b/apps/logisticregression/lr.hybrid.dist.chpl index d324930..e6ac152 100644 --- a/apps/logisticregression/lr.hybrid.dist.chpl +++ b/apps/logisticregression/lr.hybrid.dist.chpl @@ -91,6 +91,7 @@ proc main() { writeln("Logistic Regression: CPU/GPU Execution (using GPUIterator)"); writeln("nSamples :", nSamples, " nFeatures :", nFeatures); writeln("CPU Percent: ", CPUPercent); + writeln("nGPUs: ", nGPUs); writeln("nTrials: ", numTrials); writeln("output: ", output); @@ -119,9 +120,9 @@ proc main() { W(i) = 0: real(32); } coforall loc in Locales do on loc { - for i in 1..nSamples { + forall i in 1..nSamples { Y(i) = i: real(32); - for j in 1..nFeatures { + forall j in 1..nFeatures { if (j != 0) { X(i, j) = j: real(32); } else { diff --git a/apps/logisticregression/lr.hybrid.dist.lowmid.chpl b/apps/logisticregression/lr.hybrid.dist.lowmid.chpl index 261e1e8..45ab20f 100644 --- a/apps/logisticregression/lr.hybrid.dist.lowmid.chpl +++ b/apps/logisticregression/lr.hybrid.dist.lowmid.chpl @@ -110,6 +110,7 @@ proc main() { writeln("Logistic Regression: CPU/GPU Execution (using 
GPUIterator)"); writeln("nSamples :", nSamples, " nFeatures :", nFeatures); writeln("CPU Percent: ", CPUPercent); + writeln("nGPUs: ", nGPUs); writeln("nTrials: ", numTrials); writeln("output: ", output); @@ -138,9 +139,9 @@ proc main() { W(i) = 0: real(32); } coforall loc in Locales do on loc { - for i in 1..nSamples { + forall i in 1..nSamples { Y(i) = i: real(32); - for j in 1..nFeatures { + forall j in 1..nFeatures { if (j != 0) { X(i, j) = j: real(32); } else { diff --git a/apps/logisticregression/lr.hybrid.dist.mid.chpl b/apps/logisticregression/lr.hybrid.dist.mid.chpl index bee2a64..1aa6b21 100644 --- a/apps/logisticregression/lr.hybrid.dist.mid.chpl +++ b/apps/logisticregression/lr.hybrid.dist.mid.chpl @@ -104,6 +104,7 @@ proc main() { writeln("Logistic Regression: CPU/GPU Execution (using GPUIterator)"); writeln("nSamples :", nSamples, " nFeatures :", nFeatures); writeln("CPU Percent: ", CPUPercent); + writeln("nGPUs: ", nGPUs); writeln("nTrials: ", numTrials); writeln("output: ", output); @@ -132,9 +133,9 @@ proc main() { W(i) = 0: real(32); } coforall loc in Locales do on loc { - for i in 1..nSamples { + forall i in 1..nSamples { Y(i) = i: real(32); - for j in 1..nFeatures { + forall j in 1..nFeatures { if (j != 0) { X(i, j) = j: real(32); } else { diff --git a/apps/mm/mm.hybrid.dist.chpl b/apps/mm/mm.hybrid.dist.chpl index 76bc917..b32ef46 100644 --- a/apps/mm/mm.hybrid.dist.chpl +++ b/apps/mm/mm.hybrid.dist.chpl @@ -87,6 +87,7 @@ proc main() { writeln("Matrix Multiplication: CPU/GPU Execution (using GPUIterator)"); writeln("Size: ", n, "x", n); writeln("CPU ratio: ", CPUPercent); + writeln("nGPUs: ", nGPUs); writeln("nTrials: ", numTrials); writeln("tiled: ", tiled); writeln("output: ", output); @@ -96,8 +97,8 @@ proc main() { var execTimes: [1..numTrials] real; for trial in 1..numTrials { coforall loc in Locales do on loc { - for i in 1..n { - for j in 1..n { + forall i in 1..n { + forall j in 1..n { var e: int = (i-1)*n+(j-1)+1; A(e) = (i*1.0/1000): 
real(32); B(i, j) = (i*1.0/1000): real(32); diff --git a/apps/stream/stream.hybrid.dist.chpl b/apps/stream/stream.hybrid.dist.chpl index db06f06..3067790 100644 --- a/apps/stream/stream.hybrid.dist.chpl +++ b/apps/stream/stream.hybrid.dist.chpl @@ -81,6 +81,7 @@ proc main() { writeln("Stream: CPU/GPU Execution (using GPUIterator)"); writeln("Size: ", n); writeln("CPU ratio: ", CPUratio); + writeln("nGPUs: ", nGPUs); writeln("alpha: ", alpha); writeln("nTrials: ", numTrials); writeln("output: ", output); @@ -89,7 +90,7 @@ proc main() { var execTimes: [1..numTrials] real; for trial in 1..numTrials { - for i in 1..n { + forall i in D { B(i) = i: real(32); C(i) = 2*i: real(32); } From 6fff14a781fecb81573bda1d91fb00af294e47d9 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 15 May 2020 16:34:04 -0400 Subject: [PATCH 046/118] Update MM --- apps/mm/mm.hybrid.dist.chpl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/mm/mm.hybrid.dist.chpl b/apps/mm/mm.hybrid.dist.chpl index b32ef46..2ee668c 100644 --- a/apps/mm/mm.hybrid.dist.chpl +++ b/apps/mm/mm.hybrid.dist.chpl @@ -9,7 +9,7 @@ use GPUIterator; /// Runtime Options //////////////////////////////////////////////////////////////////////////////// config const n = 32: int; -config const CPUPercent = 0: int; +config const CPUratio = 0: int; config const numTrials = 1: int; config const tiled = 0; config const output = 0: int; @@ -86,7 +86,7 @@ proc printLocaleInfo() { proc main() { writeln("Matrix Multiplication: CPU/GPU Execution (using GPUIterator)"); writeln("Size: ", n, "x", n); - writeln("CPU ratio: ", CPUPercent); + writeln("CPU ratio: ", CPUratio); writeln("nGPUs: ", nGPUs); writeln("nTrials: ", numTrials); writeln("tiled: ", tiled); @@ -109,7 +109,7 @@ proc main() { const startTime = getCurrentTime(); // TODO: Consider using a 2D iterator - forall e in GPU(D, CUDAWrapper, CPUPercent) { + forall e in GPU(D, CUDAWrapper, CPUratio) { var i: int = (e - 1) / n + 1; var j: int = (e - 1) 
% n + 1; var sum: real(32) = C(e); From 94f795102543c23a8f7085192db796836a9e73b0 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 15 May 2020 16:44:26 -0400 Subject: [PATCH 047/118] Update LR --- apps/logisticregression/lr.hybrid.dist.chpl | 6 +++--- apps/logisticregression/lr.hybrid.dist.lowmid.chpl | 6 +++--- apps/logisticregression/lr.hybrid.dist.mid.chpl | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/apps/logisticregression/lr.hybrid.dist.chpl b/apps/logisticregression/lr.hybrid.dist.chpl index e6ac152..fb76205 100644 --- a/apps/logisticregression/lr.hybrid.dist.chpl +++ b/apps/logisticregression/lr.hybrid.dist.chpl @@ -11,7 +11,7 @@ use GPUIterator; config const nFeatures = 32: int; config const nSamples = 32: int; config const nIters = 32: int; -config const CPUPercent = 0: int; +config const CPUratio = 0: int; config const numTrials = 1: int; config const output = 0: int; config param verbose = false; @@ -90,7 +90,7 @@ proc printLocaleInfo() { proc main() { writeln("Logistic Regression: CPU/GPU Execution (using GPUIterator)"); writeln("nSamples :", nSamples, " nFeatures :", nFeatures); - writeln("CPU Percent: ", CPUPercent); + writeln("CPU Percent: ", CPUratio); writeln("nGPUs: ", nGPUs); writeln("nTrials: ", numTrials); writeln("output: ", output); @@ -141,7 +141,7 @@ proc main() { } } const start = getCurrentTime(); - forall i in GPU(D, CUDAWrapper, CPUPercent) { + forall i in GPU(D, CUDAWrapper, CPUratio) { var err = 0: real(32); for s in 1..nSamples { var arg = 0: real(32); diff --git a/apps/logisticregression/lr.hybrid.dist.lowmid.chpl b/apps/logisticregression/lr.hybrid.dist.lowmid.chpl index 45ab20f..5316ca1 100644 --- a/apps/logisticregression/lr.hybrid.dist.lowmid.chpl +++ b/apps/logisticregression/lr.hybrid.dist.lowmid.chpl @@ -14,7 +14,7 @@ use SysCTypes; config const nFeatures = 32: int; config const nSamples = 32: int; config const nIters = 32: int; -config const CPUPercent = 0: int; +config const CPUratio = 0: 
int; config const numTrials = 1: int; config const output = 0: int; config param verbose = false; @@ -109,7 +109,7 @@ proc printLocaleInfo() { proc main() { writeln("Logistic Regression: CPU/GPU Execution (using GPUIterator)"); writeln("nSamples :", nSamples, " nFeatures :", nFeatures); - writeln("CPU Percent: ", CPUPercent); + writeln("CPU Percent: ", CPUratio); writeln("nGPUs: ", nGPUs); writeln("nTrials: ", numTrials); writeln("output: ", output); @@ -160,7 +160,7 @@ proc main() { } } const start = getCurrentTime(); - forall i in GPU(D, CUDAWrapper, CPUPercent) { + forall i in GPU(D, CUDAWrapper, CPUratio) { var err = 0: real(32); for s in 1..nSamples { var arg = 0: real(32); diff --git a/apps/logisticregression/lr.hybrid.dist.mid.chpl b/apps/logisticregression/lr.hybrid.dist.mid.chpl index 1aa6b21..5f2b23c 100644 --- a/apps/logisticregression/lr.hybrid.dist.mid.chpl +++ b/apps/logisticregression/lr.hybrid.dist.mid.chpl @@ -14,7 +14,7 @@ use SysCTypes; config const nFeatures = 32: int; config const nSamples = 32: int; config const nIters = 32: int; -config const CPUPercent = 0: int; +config const CPUratio = 0: int; config const numTrials = 1: int; config const output = 0: int; config param verbose = false; @@ -103,7 +103,7 @@ proc printLocaleInfo() { proc main() { writeln("Logistic Regression: CPU/GPU Execution (using GPUIterator)"); writeln("nSamples :", nSamples, " nFeatures :", nFeatures); - writeln("CPU Percent: ", CPUPercent); + writeln("CPU Percent: ", CPUratio); writeln("nGPUs: ", nGPUs); writeln("nTrials: ", numTrials); writeln("output: ", output); @@ -154,7 +154,7 @@ proc main() { } } const start = getCurrentTime(); - forall i in GPU(D, CUDAWrapper, CPUPercent) { + forall i in GPU(D, CUDAWrapper, CPUratio) { var err = 0: real(32); for s in 1..nSamples { var arg = 0: real(32); From b8be7c7959a6b55152ccf3a2df22aca89461000b Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 15 May 2020 16:52:56 -0400 Subject: [PATCH 048/118] Update LR --- 
apps/logisticregression/lr.hybrid.dist.chpl | 2 +- apps/logisticregression/lr.hybrid.dist.lowmid.chpl | 2 +- apps/logisticregression/lr.hybrid.dist.mid.chpl | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/logisticregression/lr.hybrid.dist.chpl b/apps/logisticregression/lr.hybrid.dist.chpl index fb76205..32f8f92 100644 --- a/apps/logisticregression/lr.hybrid.dist.chpl +++ b/apps/logisticregression/lr.hybrid.dist.chpl @@ -122,7 +122,7 @@ proc main() { coforall loc in Locales do on loc { forall i in 1..nSamples { Y(i) = i: real(32); - forall j in 1..nFeatures { + for j in 1..nFeatures { if (j != 0) { X(i, j) = j: real(32); } else { diff --git a/apps/logisticregression/lr.hybrid.dist.lowmid.chpl b/apps/logisticregression/lr.hybrid.dist.lowmid.chpl index 5316ca1..be7266e 100644 --- a/apps/logisticregression/lr.hybrid.dist.lowmid.chpl +++ b/apps/logisticregression/lr.hybrid.dist.lowmid.chpl @@ -141,7 +141,7 @@ proc main() { coforall loc in Locales do on loc { forall i in 1..nSamples { Y(i) = i: real(32); - forall j in 1..nFeatures { + for j in 1..nFeatures { if (j != 0) { X(i, j) = j: real(32); } else { diff --git a/apps/logisticregression/lr.hybrid.dist.mid.chpl b/apps/logisticregression/lr.hybrid.dist.mid.chpl index 5f2b23c..009b69e 100644 --- a/apps/logisticregression/lr.hybrid.dist.mid.chpl +++ b/apps/logisticregression/lr.hybrid.dist.mid.chpl @@ -135,7 +135,7 @@ proc main() { coforall loc in Locales do on loc { forall i in 1..nSamples { Y(i) = i: real(32); - forall j in 1..nFeatures { + for j in 1..nFeatures { if (j != 0) { X(i, j) = j: real(32); } else { From 6e1aab1693882d337657608406ace8175d6e09aa Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 15 May 2020 17:16:42 -0400 Subject: [PATCH 049/118] Update VC --- apps/vector_copy/vc.hybrid.dist.lowmid.chpl | 14 +++++++------- apps/vector_copy/vc.hybrid.dist.mid.chpl | 15 ++++++++------- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git 
a/apps/vector_copy/vc.hybrid.dist.lowmid.chpl b/apps/vector_copy/vc.hybrid.dist.lowmid.chpl index 59ebf9d..076c148 100644 --- a/apps/vector_copy/vc.hybrid.dist.lowmid.chpl +++ b/apps/vector_copy/vc.hybrid.dist.lowmid.chpl @@ -115,14 +115,14 @@ proc main() { execTimes(trial) = getCurrentTime() - startTime; if (output) { writeln(A); - } - for i in 1..n { - if (A(i) != B(i)) { - writeln("Verification Error"); - exit(); + for i in 1..n { + if (A(i) != B(i)) { + writeln("Verification Error"); + exit(); + } } - } + writeln("Verified"); + } } - writeln("Verified"); printResults(execTimes); } diff --git a/apps/vector_copy/vc.hybrid.dist.mid.chpl b/apps/vector_copy/vc.hybrid.dist.mid.chpl index 224cfb1..14a5577 100644 --- a/apps/vector_copy/vc.hybrid.dist.mid.chpl +++ b/apps/vector_copy/vc.hybrid.dist.mid.chpl @@ -112,14 +112,15 @@ proc main() { execTimes(trial) = getCurrentTime() - startTime; if (output) { writeln(A); - } - for i in 1..n { - if (A(i) != B(i)) { - writeln("Verification Error"); - exit(); + for i in 1..n { + if (A(i) != B(i)) { + writeln("Verification Error"); + exit(); + } } - } + writeln("Verified"); + } } - writeln("Verified"); + printResults(execTimes); } From 88e535217b77a50676c6e6c0e2413d07cc139b57 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 15 May 2020 18:10:58 -0400 Subject: [PATCH 050/118] Update LR --- apps/logisticregression/lr.hybrid.dist.chpl | 4 +++- apps/logisticregression/lr.hybrid.dist.lowmid.chpl | 4 +++- apps/logisticregression/lr.hybrid.dist.mid.chpl | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/apps/logisticregression/lr.hybrid.dist.chpl b/apps/logisticregression/lr.hybrid.dist.chpl index 32f8f92..3142c37 100644 --- a/apps/logisticregression/lr.hybrid.dist.chpl +++ b/apps/logisticregression/lr.hybrid.dist.chpl @@ -98,6 +98,7 @@ proc main() { printLocaleInfo(); var execTimes: [1..numTrials] real; + var execTimes2: [1..numTrials] real; for trial in 1..numTrials { if (false) { forall i in D { @@ -153,7 
+154,7 @@ proc main() { } W(i) = Wcurr(i) - alpha * err; } - writeln(getCurrentTime() - start, " sec"); + execTimes2(trial) = getCurrentTime() - start; } execTimes(trial) = getCurrentTime() - startTime; if (output) { @@ -161,4 +162,5 @@ proc main() { } } printResults(execTimes); + printResults(execTimes2); } diff --git a/apps/logisticregression/lr.hybrid.dist.lowmid.chpl b/apps/logisticregression/lr.hybrid.dist.lowmid.chpl index be7266e..1a2c397 100644 --- a/apps/logisticregression/lr.hybrid.dist.lowmid.chpl +++ b/apps/logisticregression/lr.hybrid.dist.lowmid.chpl @@ -117,6 +117,7 @@ proc main() { printLocaleInfo(); var execTimes: [1..numTrials] real; + var execTimes2: [1..numTrials] real; for trial in 1..numTrials { if (false) { forall i in D { @@ -172,7 +173,7 @@ proc main() { } W(i) = Wcurr(i) - alpha * err; } - writeln(getCurrentTime() - start, " sec"); + execTimes2(trial) = getCurrentTime() - start; } execTimes(trial) = getCurrentTime() - startTime; if (output) { @@ -180,4 +181,5 @@ proc main() { } } printResults(execTimes); + printResults(execTimes2); } diff --git a/apps/logisticregression/lr.hybrid.dist.mid.chpl b/apps/logisticregression/lr.hybrid.dist.mid.chpl index 009b69e..0cc5786 100644 --- a/apps/logisticregression/lr.hybrid.dist.mid.chpl +++ b/apps/logisticregression/lr.hybrid.dist.mid.chpl @@ -111,6 +111,7 @@ proc main() { printLocaleInfo(); var execTimes: [1..numTrials] real; + var execTimes2: [1..numTrials] real; for trial in 1..numTrials { if (false) { forall i in D { @@ -166,7 +167,7 @@ proc main() { } W(i) = Wcurr(i) - alpha * err; } - writeln(getCurrentTime() - start, " sec"); + execTimes2(trial) = getCurrentTime() - start; } execTimes(trial) = getCurrentTime() - startTime; if (output) { @@ -174,4 +175,5 @@ proc main() { } } printResults(execTimes); + printResults(execTimes2); } From 075c7d6b0004f3df15e507ff58b4da27c4db5323 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 15 May 2020 18:29:49 -0400 Subject: [PATCH 051/118] Update vc.cu, 
and Add lsf scripts for vc, bs, and lr --- apps/blackscholes/bs.lsf | 31 +++++++++++++++++++++++++++++++ apps/logisticregression/lr.lsf | 31 +++++++++++++++++++++++++++++++ apps/vector_copy/vc.cu | 10 +++++----- apps/vector_copy/vc.lsf | 31 +++++++++++++++++++++++++++++++ 4 files changed, 98 insertions(+), 5 deletions(-) create mode 100755 apps/blackscholes/bs.lsf create mode 100755 apps/logisticregression/lr.lsf create mode 100755 apps/vector_copy/vc.lsf diff --git a/apps/blackscholes/bs.lsf b/apps/blackscholes/bs.lsf new file mode 100755 index 0000000..95063b9 --- /dev/null +++ b/apps/blackscholes/bs.lsf @@ -0,0 +1,31 @@ +#!/bin/bash +#BSUB -P GEN010sollve +#BSUB -W 1:00 +#BSUB -nnodes 8 +#BSUB -alloc_flags smt1 +##BSUB -alloc_flags gpumps +#BSUB -J BS +#BSUB -o BS.%J +#BSUB -e BS.%J + +cd /ccs/home/ahayashi/chiuw2020/chapel-1.20.0 +source ./chiuw2020.sh + +cd /ccs/home/ahayashi/chiuw2020/chapel-gpu/apps/blackscholes + +N=1073741824 + +for nodes in 1 2 4 8; +do + echo "nTasks: " $nodes + echo "[LOW]" + for ratio in 100 0 ; + do + echo "CPUratio:" $ratio + jsrun -n $nodes -a 1 -c 42 -g 1 -b rs ./bs.hybrid.dist_real -nl $nodes --n=$N --numTrials=10 --CPUratio=$ratio -v + done + echo "[LOWMID]" + jsrun -n $nodes -a 1 -c 42 -g 1 -b rs ./bs.hybrid.dist.lowmid_real -nl $nodes --n=$N --numTrials=10 --CPUratio=0 -v + echo "[MID]" + jsrun -n $nodes -a 1 -c 42 -g 1 -b rs ./bs.hybrid.dist.mid_real -nl $nodes --n=$N --numTrials=10 --CPUratio=0 -v +done diff --git a/apps/logisticregression/lr.lsf b/apps/logisticregression/lr.lsf new file mode 100755 index 0000000..31b3dd5 --- /dev/null +++ b/apps/logisticregression/lr.lsf @@ -0,0 +1,31 @@ +#!/bin/bash +#BSUB -P GEN010sollve +#BSUB -W 1:30 +#BSUB -nnodes 8 +#BSUB -alloc_flags smt1 +##BSUB -alloc_flags gpumps +#BSUB -J LR +#BSUB -o LR.%J +#BSUB -e LR.%J + +cd /ccs/home/ahayashi/chiuw2020/chapel-1.20.0 +source ./chiuw2020.sh + +cd /ccs/home/ahayashi/chiuw2020/chapel-gpu/apps/logisticregression + +OPT=--nFeatures=65536 
--nSamples=1024 --nIters=1 + +for nodes in 1 2 4 8; +do + echo "nTasks: " $nodes + echo "[LOW]" + for ratio in 100 0; + do + echo "CPUratio:" $ratio + jsrun -n $nodes -a 1 -c 42 -g 1 -b rs ./lr.hybrid.dist_real -nl $nodes $OPT --numTrials=10 --CPUratio=$ratio -v + done + echo "[LOWMID]" + jsrun -n $nodes -a 1 -c 42 -g 1 -b rs ./lr.hybrid.dist.lowmid_real -nl $nodes $OPT --numTrials=10 --CPUratio=0 -v + echo "[MID]" + jsrun -n $nodes -a 1 -c 42 -g 1 -b rs ./lr.hybrid.dist.mid_real -nl $nodes $OPT --numTrials=10 --CPUratio=0 -v +done diff --git a/apps/vector_copy/vc.cu b/apps/vector_copy/vc.cu index dbc3a36..826fbf5 100644 --- a/apps/vector_copy/vc.cu +++ b/apps/vector_copy/vc.cu @@ -4,7 +4,7 @@ #include //#define VERBOSE -#define PROF +//#define PROF #define CUDA_ERROR_CHECK #define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ ) @@ -63,7 +63,7 @@ extern "C" { printf("In vcCUDA\n"); printf("\t GPUN: %d\n", GPUN); printf("\t range: %d..%d\n", start, end); -#endif +#endif #ifdef PROF cudaEvent_t startCudaKernelEvent, endCudaKernelEvent; CudaSafeCall(cudaEventCreate(&startCudaKernelEvent)); @@ -75,12 +75,12 @@ extern "C" { CudaSafeCall(cudaMemcpy(dB, B + start, sizeof(float) * GPUN, cudaMemcpyHostToDevice)); #ifdef PROF CudaSafeCall(cudaEventRecord(startCudaKernelEvent)); -#endif +#endif vc<<>>(dA, dB, GPUN); #ifdef PROF CudaSafeCall(cudaEventRecord(endCudaKernelEvent)); CudaSafeCall(cudaEventSynchronize(endCudaKernelEvent)); -#endif +#endif CudaCheckError(); CudaSafeCall(cudaDeviceSynchronize()); CudaSafeCall(cudaMemcpy(A + start, dA, sizeof(float) * GPUN, cudaMemcpyDeviceToHost)); @@ -90,7 +90,7 @@ extern "C" { CudaSafeCall(cudaEventElapsedTime(&msecKernel, startCudaKernelEvent, endCudaKernelEvent)); printf("CUDA kernel: %lf msec\n", msecKernel); #endif - + CudaSafeCall(cudaFree(dA)); CudaSafeCall(cudaFree(dB)); } diff --git a/apps/vector_copy/vc.lsf b/apps/vector_copy/vc.lsf new file mode 100755 index 0000000..cd13b61 --- /dev/null +++ 
b/apps/vector_copy/vc.lsf @@ -0,0 +1,31 @@ +#!/bin/bash +#BSUB -P GEN010sollve +#BSUB -W 0:30 +#BSUB -nnodes 8 +#BSUB -alloc_flags smt1 +##BSUB -alloc_flags gpumps +#BSUB -J VC +#BSUB -o VC.%J +#BSUB -e VC.%J + +cd /ccs/home/ahayashi/chiuw2020/chapel-1.20.0 +source ./chiuw2020.sh + +cd /ccs/home/ahayashi/chiuw2020/chapel-gpu/apps/vector_copy + +N=1073741824 + +for nodes in 1 2 4 8; +do + echo "nTasks: " $nodes + echo "[LOW]" + for ratio in 100 0; + do + echo "CPUratio:" $ratio + jsrun -n $nodes -a 1 -c 42 -g 1 -b rs ./vc.hybrid.dist_real -nl $nodes --n=$N --numTrials=10 --CPUratio=$ratio -v + done + echo "[LOWMID]" + jsrun -n $nodes -a 1 -c 42 -g 1 -b rs ./vc.hybrid.dist.lowmid_real -nl $nodes --n=$N --numTrials=10 --CPUratio=0 -v + echo "[MID]" + jsrun -n $nodes -a 1 -c 42 -g 1 -b rs ./vc.hybrid.dist.mid_real -nl $nodes --n=$N --numTrials=10 --CPUratio=0 -v +done From 5ac7722175d6c6e23669f434e820edf705b5c71f Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 15 May 2020 18:33:38 -0400 Subject: [PATCH 052/118] Update LR --- apps/logisticregression/lr.hybrid.dist.chpl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/logisticregression/lr.hybrid.dist.chpl b/apps/logisticregression/lr.hybrid.dist.chpl index 3142c37..bf96614 100644 --- a/apps/logisticregression/lr.hybrid.dist.chpl +++ b/apps/logisticregression/lr.hybrid.dist.chpl @@ -46,9 +46,9 @@ extern proc lrCUDA2(X: [] real(32), Y: [] real(32), W: [] real(32), Wcurr: [] re // CUDAWrapper is called from GPUIterator // to invoke a specific CUDA program (using C interoperability) proc CUDAWrapper(lo: int, hi: int, N: int) { - //if (verbose) { + if (verbose) { writeln("In CUDAWrapper2(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, ")"); - //} + } ref lW = W.localSlice(lo .. 
hi); lrCUDA2(X, Y, lW, Wcurr, alpha, nSamples, nFeatures, lo, hi, N); } From 2a84c38c140de0fe3afc059e26cf029be2a71814 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 15 May 2020 22:08:16 -0400 Subject: [PATCH 053/118] Update HIP Support --- apps/Makefile | 28 +++++++++++++++++++++++++++- src/GPUAPI.cu | 7 +++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/apps/Makefile b/apps/Makefile index d1c3908..7de5867 100644 --- a/apps/Makefile +++ b/apps/Makefile @@ -29,6 +29,10 @@ $(TARGET).o: $(TARGET).cu GPUAPI.o: $(CHPLMODULE)/GPUAPI.cu nvcc $(NVCCFLAGS) -c $^ +GPUAPI.hip.o: $(CHPLMODULE)/GPUAPI.cu + $(HIP_HOME)/bin/hipify-perl $^ > GPUAPI.hip.cpp + $(HIP_HOME)/bin/hipcc -O3 -Wall -fPIC -c GPUAPI.hip.cpp -fno-gpu-rdc + $(TARGET).opencl.o: $(TARGET).opencl.c gcc -O3 -Wall $(OCLFLAGS) -c $^ @@ -80,7 +84,29 @@ hipgpu: $(TARGET).cu $(TARGET).gpu.chpl hiphybrid: $(TARGET).cu $(TARGET).hybrid.chpl $(HIP_HOME)/bin/hipify-perl $(TARGET).cu > $(TARGET).hip.cpp $(HIP_HOME)/bin/hipcc -O3 -Wall -fPIC -c $(TARGET).hip.cpp -fno-gpu-rdc - chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).hip.o $(TARGET).hybrid.chpl --ldflags $(HIPLIBSFLAGS) + chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).h $(TARGET).hip.o $(TARGET).hybrid.chpl --ldflags $(HIPLIBSFLAGS) + +.PHONY: hiphybrid.dist +hiphybrid.dist: GPUAPI.hip.o $(TARGET).cu $(TARGET).hybrid.dist.chpl + $(HIP_HOME)/bin/hipify-perl $(TARGET).cu > $(TARGET).hip.cpp + $(HIP_HOME)/bin/hipcc -O3 -Wall -fPIC -c $(TARGET).hip.cpp -fno-gpu-rdc + cp GPUAPI.hip.o GPUAPI.o + chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).h $(TARGET).hip.o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.chpl --ldflags $(HIPLIBSFLAGS) + +.PHONY: hiphybrid.dist.lowmid +hiphybrid.dist.lowmid: GPUAPI.hip.o $(TARGET).kernel.cu $(TARGET).hybrid.dist.lowmid.chpl + $(HIP_HOME)/bin/hipify-perl $(TARGET).kernel.cu > $(TARGET).kernel.hip.cpp + $(HIP_HOME)/bin/hipcc -O3 -Wall -fPIC -c $(TARGET).kernel.hip.cpp -fno-gpu-rdc + cp GPUAPI.hip.o GPUAPI.o + chpl -M 
$(CHPLMODULE) $(CHPLFLAGS) $(TARGET).kernel.h $(TARGET).kernel.hip.o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.lowmid.chpl --ldflags $(HIPLIBSFLAGS) + +.PHONY: hiphybrid.dist.mid +hiphybrid.dist.mid: GPUAPI.hip.o $(TARGET).kernel.cu $(TARGET).hybrid.dist.mid.chpl + $(HIP_HOME)/bin/hipify-perl $(TARGET).kernel.cu > $(TARGET).kernel.hip.cpp + $(HIP_HOME)/bin/hipcc -O3 -Wall -fPIC -c $(TARGET).kernel.hip.cpp -fno-gpu-rdc + cp GPUAPI.hip.o GPUAPI.o + chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).kernel.h $(TARGET).kernel.hip.o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.mid.chpl --ldflags $(HIPLIBSFLAGS) + .PHONY: clean clean: diff --git a/src/GPUAPI.cu b/src/GPUAPI.cu index b8b06bd..5fbda7b 100644 --- a/src/GPUAPI.cu +++ b/src/GPUAPI.cu @@ -2,7 +2,10 @@ #include #include #include +#include +#ifdef __NVCC__ #include +#endif #define CUDA_ERROR_CHECK #define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ ) @@ -60,11 +63,15 @@ extern "C" { } void ProfilerStart() { +#ifdef __NVCC__ CudaSafeCall(cudaProfilerStart()); +#endif } void ProfilerStop() { +#ifdef __NVCC__ CudaSafeCall(cudaProfilerStop()); +#endif } void DeviceSynchronize() { From b3da131eb0f507d3656bf75dec2e28097052c722 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Sat, 16 May 2020 01:06:41 -0400 Subject: [PATCH 054/118] Update LR --- apps/logisticregression/lr.hybrid.dist.chpl | 10 ++++++++-- apps/logisticregression/lr.hybrid.dist.lowmid.chpl | 10 ++++++++-- apps/logisticregression/lr.hybrid.dist.mid.chpl | 10 ++++++++-- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/apps/logisticregression/lr.hybrid.dist.chpl b/apps/logisticregression/lr.hybrid.dist.chpl index bf96614..9735a76 100644 --- a/apps/logisticregression/lr.hybrid.dist.chpl +++ b/apps/logisticregression/lr.hybrid.dist.chpl @@ -15,6 +15,7 @@ config const CPUratio = 0: int; config const numTrials = 1: int; config const output = 0: int; config param verbose = false; +config const reduction = false; 
//////////////////////////////////////////////////////////////////////////////// /// Global Arrays @@ -94,6 +95,7 @@ proc main() { writeln("nGPUs: ", nGPUs); writeln("nTrials: ", numTrials); writeln("output: ", output); + writeln("reduction: ", reduction); printLocaleInfo(); @@ -146,8 +148,12 @@ proc main() { var err = 0: real(32); for s in 1..nSamples { var arg = 0: real(32); - for f in 1..nFeatures { - arg += Wcurr(f) * X(s, f); + if (reduction) { + arg = (+ reduce (Wcurr(1..nFeatures) * X(s, 1..nFeatures))); + } else { + for f in 1..nFeatures { + arg += Wcurr(f) * X(s, f); + } } var hypo = 1 / (1 + exp(-arg)); err += (hypo - Y(s)) * X(s, i); diff --git a/apps/logisticregression/lr.hybrid.dist.lowmid.chpl b/apps/logisticregression/lr.hybrid.dist.lowmid.chpl index 1a2c397..e646afa 100644 --- a/apps/logisticregression/lr.hybrid.dist.lowmid.chpl +++ b/apps/logisticregression/lr.hybrid.dist.lowmid.chpl @@ -18,6 +18,7 @@ config const CPUratio = 0: int; config const numTrials = 1: int; config const output = 0: int; config param verbose = false; +config const reduction = false; //////////////////////////////////////////////////////////////////////////////// /// Global Arrays @@ -113,6 +114,7 @@ proc main() { writeln("nGPUs: ", nGPUs); writeln("nTrials: ", numTrials); writeln("output: ", output); + writeln("reduction: ", reduction); printLocaleInfo(); @@ -165,8 +167,12 @@ proc main() { var err = 0: real(32); for s in 1..nSamples { var arg = 0: real(32); - for f in 1..nFeatures { - arg += Wcurr(f) * X(s, f); + if (reduction) { + arg = (+ reduce (Wcurr(1..nFeatures) * X(s, 1..nFeatures))); + } else { + for f in 1..nFeatures { + arg += Wcurr(f) * X(s, f); + } } var hypo = 1 / (1 + exp(-arg)); err += (hypo - Y(s)) * X(s, i); diff --git a/apps/logisticregression/lr.hybrid.dist.mid.chpl b/apps/logisticregression/lr.hybrid.dist.mid.chpl index 0cc5786..6bd96b2 100644 --- a/apps/logisticregression/lr.hybrid.dist.mid.chpl +++ b/apps/logisticregression/lr.hybrid.dist.mid.chpl @@ 
-18,6 +18,7 @@ config const CPUratio = 0: int; config const numTrials = 1: int; config const output = 0: int; config param verbose = false; +config const reduction = false; //////////////////////////////////////////////////////////////////////////////// /// Global Arrays @@ -107,6 +108,7 @@ proc main() { writeln("nGPUs: ", nGPUs); writeln("nTrials: ", numTrials); writeln("output: ", output); + writeln("reduction: ", reduction); printLocaleInfo(); @@ -159,8 +161,12 @@ proc main() { var err = 0: real(32); for s in 1..nSamples { var arg = 0: real(32); - for f in 1..nFeatures { - arg += Wcurr(f) * X(s, f); + if (reduction) { + arg = (+ reduce (Wcurr(1..nFeatures) * X(s, 1..nFeatures))); + } else { + for f in 1..nFeatures { + arg += Wcurr(f) * X(s, f); + } } var hypo = 1 / (1 + exp(-arg)); err += (hypo - Y(s)) * X(s, i); From 599159d6bcd8a70157c7fdd751ce7869424498d4 Mon Sep 17 00:00:00 2001 From: sriraj Date: Fri, 15 May 2020 22:25:42 -0700 Subject: [PATCH 055/118] Add stream mid version and some formatting --- apps/stream/stream.baseline.chpl | 2 +- apps/stream/stream.gpu.chpl | 2 +- apps/stream/stream.hybrid.chpl | 2 +- ...it.chpl => stream.hybrid.dist.lowmid.chpl} | 20 +-- apps/stream/stream.hybrid.dist.mid.chpl | 136 ++++++++++++++++++ 5 files changed, 151 insertions(+), 11 deletions(-) rename apps/stream/{stream.hybrid.dist.explicit.chpl => stream.hybrid.dist.lowmid.chpl} (93%) create mode 100644 apps/stream/stream.hybrid.dist.mid.chpl diff --git a/apps/stream/stream.baseline.chpl b/apps/stream/stream.baseline.chpl index 6cbedd7..a2dd263 100644 --- a/apps/stream/stream.baseline.chpl +++ b/apps/stream/stream.baseline.chpl @@ -64,7 +64,7 @@ proc main() { var execTimes: [1..numTrials] real; for trial in 1..numTrials { - for i in 1..n { + forall i in 1..n { B(i) = i: real(32); C(i) = 2*i: real(32); } diff --git a/apps/stream/stream.gpu.chpl b/apps/stream/stream.gpu.chpl index 3b85486..d5f2003 100644 --- a/apps/stream/stream.gpu.chpl +++ b/apps/stream/stream.gpu.chpl @@ 
-69,7 +69,7 @@ proc main() { var execTimes: [1..numTrials] real; for trial in 1..numTrials { - for i in 1..n { + forall i in 1..n { B(i) = i: real(32); C(i) = 2*i: real(32); } diff --git a/apps/stream/stream.hybrid.chpl b/apps/stream/stream.hybrid.chpl index d059596..8387180 100644 --- a/apps/stream/stream.hybrid.chpl +++ b/apps/stream/stream.hybrid.chpl @@ -85,7 +85,7 @@ proc main() { var execTimes: [1..numTrials] real; for trial in 1..numTrials { - for i in 1..n { + forall i in 1..n { B(i) = i: real(32); C(i) = 2*i: real(32); } diff --git a/apps/stream/stream.hybrid.dist.explicit.chpl b/apps/stream/stream.hybrid.dist.lowmid.chpl similarity index 93% rename from apps/stream/stream.hybrid.dist.explicit.chpl rename to apps/stream/stream.hybrid.dist.lowmid.chpl index 6082e06..05d4319 100644 --- a/apps/stream/stream.hybrid.dist.explicit.chpl +++ b/apps/stream/stream.hybrid.dist.lowmid.chpl @@ -47,21 +47,24 @@ proc CUDAWrapper(lo: int, hi: int, N: int) { ref lA = A.localSlice(lo .. hi); ref lB = B.localSlice(lo .. hi); ref lC = C.localSlice(lo .. 
hi); - writeln("localSlice Size:", lA.size); - ProfilerStart(); - var dA: c_void_ptr; - var dB: c_void_ptr; - var dC: c_void_ptr; - var size: size_t = (lA.size * 4): size_t; + //writeln("localSlice Size:", lA.size); + if (verbose) { ProfilerStart(); } + var dA, dB, dC: c_void_ptr; + var size: size_t = (lA.size:size_t * c_sizeof(lA.eltType)); Malloc(dA, size); Malloc(dB, size); Malloc(dC, size); + Memcpy(dB, c_ptrTo(lB), size, 0); Memcpy(dC, c_ptrTo(lC), size, 0); LaunchStream(dA, dB, dC, alpha, N: size_t); DeviceSynchronize(); Memcpy(c_ptrTo(lA), dA, size, 1); - ProfilerStop(); + + Free(dA); + Free(dB); + Free(dC); + if (verbose) { ProfilerStop(); } //streamCUDA(lA, lB, lC, alpha, 0, hi-lo, N); } @@ -104,6 +107,7 @@ proc main() { writeln("Stream: CPU/GPU Execution (using GPUIterator)"); writeln("Size: ", n); writeln("CPU ratio: ", CPUratio); + writeln("nGPUs: ", nGPUs); writeln("alpha: ", alpha); writeln("nTrials: ", numTrials); writeln("output: ", output); @@ -112,7 +116,7 @@ proc main() { var execTimes: [1..numTrials] real; for trial in 1..numTrials { - for i in 1..n { + forall i in D { B(i) = i: real(32); C(i) = 2*i: real(32); } diff --git a/apps/stream/stream.hybrid.dist.mid.chpl b/apps/stream/stream.hybrid.dist.mid.chpl new file mode 100644 index 0000000..a217c21 --- /dev/null +++ b/apps/stream/stream.hybrid.dist.mid.chpl @@ -0,0 +1,136 @@ +use Time; + +//////////////////////////////////////////////////////////////////////////////// +/// GPUIterator +//////////////////////////////////////////////////////////////////////////////// +use GPUIterator; +use GPUAPI; +use BlockDist; +use SysCTypes; + +//////////////////////////////////////////////////////////////////////////////// +/// Runtime Options +//////////////////////////////////////////////////////////////////////////////// +config const n = 32: int; +config const CPUratio = 0: int; +config const numTrials = 1: int; +config const output = 0: int; +config const alpha = 3.0: real(32); +config param verbose = 
false; + +//////////////////////////////////////////////////////////////////////////////// +/// Global Arrays +//////////////////////////////////////////////////////////////////////////////// +// For now, these arrays are global so the arrays can be seen from CUDAWrapper +// TODO: Explore the possiblity of declaring the arrays and CUDAWrapper +// in the main proc (e.g., by using lambdas) +var D: domain(1) dmapped Block(boundingBox = {1..n}) = {1..n}; +var A: [D] real(32); +var B: [D] real(32); +var C: [D] real(32); + +//////////////////////////////////////////////////////////////////////////////// +/// C Interoperability +//////////////////////////////////////////////////////////////////////////////// +extern proc LaunchStream(A: c_void_ptr, B: c_void_ptr, C: c_void_ptr, alpha: c_float, N: size_t); + +// CUDAWrapper is called from GPUIterator +// to invoke a specific CUDA program (using C interoperability) +proc CUDAWrapper(lo: int, hi: int, N: int) { + if (verbose) { + var device, count: int(32); + GetDevice(device); + GetDeviceCount(count); + writeln("In CUDAWrapper(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, "), GPU", device, " of ", count, " @", here); + } + + ref lA = A.localSlice(lo .. hi); + ref lB = B.localSlice(lo .. hi); + ref lC = C.localSlice(lo .. 
hi); + //writeln("localSlice Size:", lA.size); + if (verbose) { ProfilerStart(); } + var dA = new GPUArray(lA); + var dB = new GPUArray(lB); + var dC = new GPUArray(lC); + + toDevice(dB, dC); + LaunchStream(dA.dPtr(), dB.dPtr(), dC.dPtr(), alpha, N: size_t); + DeviceSynchronize(); + dA.fromDevice(); + + free(dA, dB, dC); + if (verbose) { ProfilerStop(); } + + //streamCUDA(lA, lB, lC, alpha, 0, hi-lo, N); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Utility Functions +//////////////////////////////////////////////////////////////////////////////// +proc printResults(execTimes) { + const totalTime = + reduce execTimes, + avgTime = totalTime / numTrials, + minTime = min reduce execTimes; + writeln("Execution time:"); + writeln(" tot = ", totalTime); + writeln(" avg = ", avgTime); + writeln(" min = ", minTime); +} + +proc printLocaleInfo() { + for loc in Locales { + writeln(loc, " info: "); + const numSublocs = loc.getChildCount(); + if (numSublocs != 0) { + for sublocID in 0..#numSublocs { + const subloc = loc.getChild(sublocID); + writeln("\t Subloc: ", sublocID); + writeln("\t Name: ", subloc); + writeln("\t maxTaskPar: ", subloc.maxTaskPar); + } + } else { + writeln("\t Name: ", loc); + writeln("\t maxTaskPar: ", loc.maxTaskPar); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Chapel main +//////////////////////////////////////////////////////////////////////////////// +proc main() { + writeln("Stream: CPU/GPU Execution (using GPUIterator)"); + writeln("Size: ", n); + writeln("CPU ratio: ", CPUratio); + writeln("nGPUs: ", nGPUs); + writeln("alpha: ", alpha); + writeln("nTrials: ", numTrials); + writeln("output: ", output); + + printLocaleInfo(); + + var execTimes: [1..numTrials] real; + for trial in 1..numTrials { + forall i in D { + B(i) = i: real(32); + C(i) = 2*i: real(32); + } + + const startTime = getCurrentTime(); + forall i in GPU(D, CUDAWrapper, CPUratio) { + 
A(i) = B(i) + alpha * C(i); + } + execTimes(trial) = getCurrentTime() - startTime; + if (output) { + writeln(A); + for i in 1..n { + if(A(i) != B(i) + alpha * C(i)) { + writeln("Verification Error"); + exit(); + } + } + writeln("Verified"); + } + } + printResults(execTimes); +} From 5f107fdadcf5e8c43ef6b85af2b1ee857d146ab3 Mon Sep 17 00:00:00 2001 From: sriraj Date: Fri, 15 May 2020 23:32:34 -0700 Subject: [PATCH 056/118] Add MM mid version and some formatting --- apps/mm/mm.baseline.chpl | 4 +- apps/mm/mm.gpu.chpl | 4 +- apps/mm/mm.hybrid.chpl | 10 +- ...plicit.chpl => mm.hybrid.dist.lowmid.chpl} | 21 ++- apps/mm/mm.hybrid.dist.mid.chpl | 146 ++++++++++++++++++ apps/mm/po.slurm | 6 +- 6 files changed, 168 insertions(+), 23 deletions(-) rename apps/mm/{mm.hybrid.dist.explicit.chpl => mm.hybrid.dist.lowmid.chpl} (92%) create mode 100644 apps/mm/mm.hybrid.dist.mid.chpl diff --git a/apps/mm/mm.baseline.chpl b/apps/mm/mm.baseline.chpl index 8cb7b03..a864a69 100644 --- a/apps/mm/mm.baseline.chpl +++ b/apps/mm/mm.baseline.chpl @@ -62,8 +62,8 @@ proc main() { var execTimes: [1..numTrials] real; for trial in 1..numTrials { - for i in 1..n { - for j in 1..n { + forall i in 1..n { + forall j in 1..n { A(i, j) = (i*1.0/1000): real(32); B(i, j) = (i*1.0/1000): real(32); C(i, j) = 0: real(32); diff --git a/apps/mm/mm.gpu.chpl b/apps/mm/mm.gpu.chpl index 9107ee4..9f9234f 100644 --- a/apps/mm/mm.gpu.chpl +++ b/apps/mm/mm.gpu.chpl @@ -69,8 +69,8 @@ proc main() { var execTimes: [1..numTrials] real; for trial in 1..numTrials { - for i in 1..n { - for j in 1..n { + forall i in 1..n { + forall j in 1..n { A(i, j) = (i*1.0/1000): real(32); B(i, j) = (i*1.0/1000): real(32); C(i, j) = 0: real(32); diff --git a/apps/mm/mm.hybrid.chpl b/apps/mm/mm.hybrid.chpl index 89f0025..a9770ce 100644 --- a/apps/mm/mm.hybrid.chpl +++ b/apps/mm/mm.hybrid.chpl @@ -9,7 +9,7 @@ use GPUIterator; /// Runtime Options //////////////////////////////////////////////////////////////////////////////// config const 
n = 32: int; -config const CPUPercent = 0: int; +config const CPUratio = 0: int; config const numTrials = 1: int; config const tiled = 0; config const output = 0: int; @@ -76,7 +76,7 @@ proc printLocaleInfo() { proc main() { writeln("Matrix Multiplication: CPU/GPU Execution (using GPUIterator)"); writeln("Size: ", n, "x", n); - writeln("CPU ratio: ", CPUPercent); + writeln("CPU ratio: ", CPUratio); writeln("nTrials: ", numTrials); writeln("tiled: ", tiled); writeln("output: ", output); @@ -85,8 +85,8 @@ proc main() { var execTimes: [1..numTrials] real; for trial in 1..numTrials { - for i in 1..n { - for j in 1..n { + forall i in 1..n { + forall j in 1..n { A(i, j) = (i*1.0/1000): real(32); B(i, j) = (i*1.0/1000): real(32); C(i, j) = 0: real(32); @@ -95,7 +95,7 @@ proc main() { const startTime = getCurrentTime(); // TODO: Consider using a 2D iterator - forall e in GPU(1..n*n, CUDAWrapper, CPUPercent) { + forall e in GPU(1..n*n, CUDAWrapper, CPUratio) { var i: int = (e - 1) / n + 1; var j: int = (e - 1) % n + 1; var sum: real(32) = C(i, j); diff --git a/apps/mm/mm.hybrid.dist.explicit.chpl b/apps/mm/mm.hybrid.dist.lowmid.chpl similarity index 92% rename from apps/mm/mm.hybrid.dist.explicit.chpl rename to apps/mm/mm.hybrid.dist.lowmid.chpl index 5ebab1c..ab6db79 100644 --- a/apps/mm/mm.hybrid.dist.explicit.chpl +++ b/apps/mm/mm.hybrid.dist.lowmid.chpl @@ -12,7 +12,7 @@ use SysCTypes; /// Runtime Options //////////////////////////////////////////////////////////////////////////////// config const n = 32: int; -config const CPUPercent = 0: int; +config const CPUratio = 0: int; config const numTrials = 1: int; config const tiled = 0; config const output = 0: int; @@ -51,12 +51,10 @@ proc CUDAWrapper(lo: int, hi: int, N: int) { ref lC = C.localSlice(lo .. 
hi); assert(lA.size == lC.size); - ProfilerStart(); - var dA: c_void_ptr; - var dB: c_void_ptr; - var dC: c_void_ptr; + if (verbose) { ProfilerStart(); } + var dA, dB, dC: c_void_ptr; - writeln("lA.size: ", lA.size, " B.size: ", B.size); + //writeln("lA.size: ", lA.size, " B.size: ", B.size); Malloc(dA, lA.size:size_t * c_sizeof(lA.eltType)); Malloc(dB, B.size:size_t * c_sizeof(B.eltType)); Malloc(dC, lC.size:size_t * c_sizeof(lC.eltType)); @@ -71,7 +69,7 @@ proc CUDAWrapper(lo: int, hi: int, N: int) { Free(dA); Free(dB); Free(dC); - ProfilerStop(); + if (verbose) { ProfilerStop(); } //mmCUDA(lA, B, lC, n*n, 0, hi-lo, N, tiled); } @@ -113,7 +111,8 @@ proc printLocaleInfo() { proc main() { writeln("Matrix Multiplication: CPU/GPU Execution (using GPUIterator)"); writeln("Size: ", n, "x", n); - writeln("CPU ratio: ", CPUPercent); + writeln("CPU ratio: ", CPUratio); + writeln("nGPUs: ", nGPUs); writeln("nTrials: ", numTrials); writeln("tiled: ", tiled); writeln("output: ", output); @@ -123,8 +122,8 @@ proc main() { var execTimes: [1..numTrials] real; for trial in 1..numTrials { coforall loc in Locales do on loc { - for i in 1..n { - for j in 1..n { + forall i in 1..n { + forall j in 1..n { var e: int = (i-1)*n+(j-1)+1; A(e) = (i*1.0/1000): real(32); B(i, j) = (i*1.0/1000): real(32); @@ -135,7 +134,7 @@ proc main() { const startTime = getCurrentTime(); // TODO: Consider using a 2D iterator - forall e in GPU(D, CUDAWrapper, CPUPercent) { + forall e in GPU(D, CUDAWrapper, CPUratio) { var i: int = (e - 1) / n + 1; var j: int = (e - 1) % n + 1; var sum: real(32) = C(e); diff --git a/apps/mm/mm.hybrid.dist.mid.chpl b/apps/mm/mm.hybrid.dist.mid.chpl new file mode 100644 index 0000000..20394ed --- /dev/null +++ b/apps/mm/mm.hybrid.dist.mid.chpl @@ -0,0 +1,146 @@ +use Time; +use ReplicatedDist; +//////////////////////////////////////////////////////////////////////////////// +/// GPUIterator +//////////////////////////////////////////////////////////////////////////////// +use 
GPUIterator; +use GPUAPI; +use BlockDist; +use SysCTypes; + +//////////////////////////////////////////////////////////////////////////////// +/// Runtime Options +//////////////////////////////////////////////////////////////////////////////// +config const n = 32: int; +config const CPUratio = 0: int; +config const numTrials = 1: int; +config const tiled = 0; +config const output = 0: int; +config param verbose = false; + +//////////////////////////////////////////////////////////////////////////////// +/// Global Arrays +//////////////////////////////////////////////////////////////////////////////// +// For now, these arrays are global so the arrays can be seen from CUDAWrapper +// TODO: Explore the possiblity of declaring the arrays and CUDAWrapper +// in the main proc (e.g., by using lambdas) +const S = {1..n, 1..n}; +const RS = S dmapped Replicated(); +var D: domain(1) dmapped Block(boundingBox = {1..n*n}) = {1..n*n}; + +var A: [D] real(32); +var B: [RS] real(32); +var C: [D] real(32); + +//////////////////////////////////////////////////////////////////////////////// +/// C Interoperability +//////////////////////////////////////////////////////////////////////////////// +extern proc LaunchMM(A: c_void_ptr, B: c_void_ptr, C: c_void_ptr, N: int, lo:int, hi:int, GPUN: int, tiled: int); + +// CUDAWrapper is called from GPUIterator +// to invoke a specific CUDA program (using C interoperability) +proc CUDAWrapper(lo: int, hi: int, N: int) { + if (verbose) { + writeln("In CUDAWrapper(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, ")"); + } + //if(tiled) { + // assert(N/n>=32 && (N/n)%32==0, "should use multiples of 32 rows in GPU when tiled"); + //} + assert(N%n == 0, "should offload full rows to GPU"); + ref lA = A.localSlice(lo .. hi); + ref lC = C.localSlice(lo .. 
hi); + assert(lA.size == lC.size); + + if (verbose) { ProfilerStart(); } + var dA = new GPUArray(lA); + var dB = new GPUArray(B); + var dC = new GPUArray(lC); + + //writeln("lA.size: ", lA.size, " B.size: ", B.size); + toDevice(dA, dB); + LaunchMM(dA.dPtr(), dB.dPtr(), dC.dPtr(), n*n, 0, hi-lo, N, tiled); + DeviceSynchronize(); + dC.fromDevice(); + + free(dA, dB, dC); + if (verbose) { ProfilerStop(); } + + //mmCUDA(lA, B, lC, n*n, 0, hi-lo, N, tiled); +} + +//////////////////////////////////////////////////////////////////////////////// +/// Utility Functions +//////////////////////////////////////////////////////////////////////////////// +proc printResults(execTimes) { + const totalTime = + reduce execTimes, + avgTime = totalTime / numTrials, + minTime = min reduce execTimes; + writeln("Execution time:"); + writeln(" tot = ", totalTime); + writeln(" avg = ", avgTime); + writeln(" min = ", minTime); +} + +proc printLocaleInfo() { + for loc in Locales { + writeln(loc, " info: "); + const numSublocs = loc.getChildCount(); + if (numSublocs != 0) { + for sublocID in 0..#numSublocs { + const subloc = loc.getChild(sublocID); + writeln("\t Subloc: ", sublocID); + writeln("\t Name: ", subloc); + writeln("\t maxTaskPar: ", subloc.maxTaskPar); + } + } else { + writeln("\t Name: ", loc); + writeln("\t maxTaskPar: ", loc.maxTaskPar); + } + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// Chapel main +//////////////////////////////////////////////////////////////////////////////// +proc main() { + writeln("Matrix Multiplication: CPU/GPU Execution (using GPUIterator)"); + writeln("Size: ", n, "x", n); + writeln("CPU ratio: ", CPUratio); + writeln("nGPUs: ", nGPUs); + writeln("nTrials: ", numTrials); + writeln("tiled: ", tiled); + writeln("output: ", output); + + printLocaleInfo(); + + var execTimes: [1..numTrials] real; + for trial in 1..numTrials { + coforall loc in Locales do on loc { + forall i in 1..n { + forall j in 1..n { + var 
e: int = (i-1)*n+(j-1)+1; + A(e) = (i*1.0/1000): real(32); + B(i, j) = (i*1.0/1000): real(32); + C(e) = 0: real(32); + } + } + } + + const startTime = getCurrentTime(); + // TODO: Consider using a 2D iterator + forall e in GPU(D, CUDAWrapper, CPUratio) { + var i: int = (e - 1) / n + 1; + var j: int = (e - 1) % n + 1; + var sum: real(32) = C(e); + for k in 1..n { + sum += A((i-1)*n+k) * B(k, j); + } + C(e) = sum; + } + execTimes(trial) = getCurrentTime() - startTime; + if (output) { + writeln(reshape(C, {1..n, 1..n})); + } + } + printResults(execTimes); +} diff --git a/apps/mm/po.slurm b/apps/mm/po.slurm index 4e0faa5..01db0c7 100755 --- a/apps/mm/po.slurm +++ b/apps/mm/po.slurm @@ -22,20 +22,20 @@ do for ratio in 0 25 50 75 100; do - ./mm.hybrid -nl 1 --n=2048 --tiled=0 --numTrials=10 --CPUPercent=$ratio + ./mm.hybrid -nl 1 --n=2048 --tiled=0 --numTrials=10 --CPUratio=$ratio done ./mm.gpu -nl 1 --n=2048 --tiled=1 --numTrials=10 for ratio in 0 25 50 75 100; do - ./mm.hybrid -nl 1 --n=2048 --tiled=1 --numTrials=10 --CPUPercent=$ratio + ./mm.hybrid -nl 1 --n=2048 --tiled=1 --numTrials=10 --CPUratio=$ratio done ./mm.gpu -nl 1 --n=2048 --tiled=2 --numTrials=10 for ratio in 0 25 50 75 100; do - ./mm.hybrid -nl 1 --n=2048 --tiled=2 --numTrials=10 --CPUPercent=$ratio + ./mm.hybrid -nl 1 --n=2048 --tiled=2 --numTrials=10 --CPUratio=$ratio done done From 072325f4532dd2029a109d2fc573e965d623b2dc Mon Sep 17 00:00:00 2001 From: sriraj Date: Fri, 15 May 2020 23:40:10 -0700 Subject: [PATCH 057/118] Change all CPUPercent to CPUratio in apps --- apps/blackscholes/bs.hybrid.chpl | 6 +++--- apps/blackscholes/po.slurm | 2 +- apps/logisticregression/lr.hybrid.chpl | 12 ++++++------ apps/logisticregression/po.slurm | 2 +- apps/stream/po.slurm | 2 +- apps/stream/stream.hybrid.chpl | 6 +++--- apps/vector_copy/po.slurm | 2 +- apps/vector_copy/vc.hybrid.chpl | 6 +++--- 8 files changed, 19 insertions(+), 19 deletions(-) diff --git a/apps/blackscholes/bs.hybrid.chpl 
b/apps/blackscholes/bs.hybrid.chpl index 6034c30..968ea8e 100644 --- a/apps/blackscholes/bs.hybrid.chpl +++ b/apps/blackscholes/bs.hybrid.chpl @@ -9,7 +9,7 @@ use GPUIterator; /// Runtime Options //////////////////////////////////////////////////////////////////////////////// config const n = 32: int; -config const CPUPercent = 0: int; +config const CPUratio = 0: int; config const numTrials = 1: int; config const output = 0: int; config param verbose = false; @@ -75,7 +75,7 @@ proc printLocaleInfo() { proc main() { writeln("BlackScholes: CPU/GPU Execution (using GPUIterator)"); writeln("Size: ", n); - writeln("CPU Percent: ", CPUPercent); + writeln("CPU Percent: ", CPUratio); writeln("nTrials: ", numTrials); writeln("output: ", output); @@ -99,7 +99,7 @@ proc main() { } const startTime = getCurrentTime(); - forall i in GPU(1..n, CUDAWrapper, CPUPercent) { + forall i in GPU(1..n, CUDAWrapper, CPUratio) { var c1 = 0.319381530: real(32); var c2 = -0.356563782: real(32); var c3 = 1.781477937: real(32); diff --git a/apps/blackscholes/po.slurm b/apps/blackscholes/po.slurm index bea60bf..fa23e5e 100755 --- a/apps/blackscholes/po.slurm +++ b/apps/blackscholes/po.slurm @@ -22,6 +22,6 @@ do ./bs.gpu -nl 1 --n=$N --numTrials=10 for ratio in 0 25 50 75 100; do - ./bs.hybrid -nl 1 --n=$N --numTrials=10 --CPUPercent=$ratio + ./bs.hybrid -nl 1 --n=$N --numTrials=10 --CPUratio=$ratio done done diff --git a/apps/logisticregression/lr.hybrid.chpl b/apps/logisticregression/lr.hybrid.chpl index 728bdd6..f2abf98 100644 --- a/apps/logisticregression/lr.hybrid.chpl +++ b/apps/logisticregression/lr.hybrid.chpl @@ -11,8 +11,8 @@ use GPUIterator; config const nFeatures = 32: int; config const nSamples = 32: int; config const nIters = 32: int; -config const CPUPercent1 = 0: int; -config const CPUPercent2 = 0: int; +config const CPUratio1 = 0: int; +config const CPUratio2 = 0: int; config const numTrials = 1: int; config const output = 0: int; config param verbose = false; @@ -88,8 +88,8 @@ 
proc printLocaleInfo() { proc main() { writeln("Logistic Regression: CPU/GPU Execution (using GPUIterator)"); writeln("nSamples :", nSamples, " nFeatures :", nFeatures); - writeln("CPU Percent1: ", CPUPercent1); - writeln("CPU Percent2: ", CPUPercent2); + writeln("CPU Percent1: ", CPUratio1); + writeln("CPU Percent2: ", CPUratio2); writeln("nTrials: ", numTrials); writeln("output: ", output); @@ -113,10 +113,10 @@ proc main() { const startTime = getCurrentTime(); for ite in 1..nIters { - forall i in GPU(1..nFeatures, CUDAWrapper1, CPUPercent1) { + forall i in GPU(1..nFeatures, CUDAWrapper1, CPUratio1) { Wcurr(i) = W(i); } - forall i in GPU(1..nFeatures, CUDAWrapper2, CPUPercent2) { + forall i in GPU(1..nFeatures, CUDAWrapper2, CPUratio2) { var err = 0: real(32); for s in 1..nSamples { var arg = 0: real(32); diff --git a/apps/logisticregression/po.slurm b/apps/logisticregression/po.slurm index ed3363e..72939fc 100755 --- a/apps/logisticregression/po.slurm +++ b/apps/logisticregression/po.slurm @@ -21,6 +21,6 @@ do ./lr.gpu -nl 1 --nFeatures=65536 --nSamples=32 --nIters=1 --numTrials=10 for ratio in 0 25 50 75 100; do - ./lr.hybrid -nl 1 --nFeatures=65536 --nSamples=32 --nIters=1 --numTrials=10 --CPUPercent1=$ratio --CPUPercent2=$ratio + ./lr.hybrid -nl 1 --nFeatures=65536 --nSamples=32 --nIters=1 --numTrials=10 --CPUratio1=$ratio --CPUratio2=$ratio done done diff --git a/apps/stream/po.slurm b/apps/stream/po.slurm index 1b71ec1..b6dcb2a 100755 --- a/apps/stream/po.slurm +++ b/apps/stream/po.slurm @@ -22,6 +22,6 @@ do ./stream.gpu -nl 1 --n=$N --numTrials=10 for ratio in 0 25 50 75 100; do - ./stream.hybrid -nl 1 --n=$N --numTrials=10 --CPUPercent=$ratio + ./stream.hybrid -nl 1 --n=$N --numTrials=10 --CPUratio=$ratio done done diff --git a/apps/stream/stream.hybrid.chpl b/apps/stream/stream.hybrid.chpl index 8387180..e54eb46 100644 --- a/apps/stream/stream.hybrid.chpl +++ b/apps/stream/stream.hybrid.chpl @@ -9,7 +9,7 @@ use GPUIterator; /// Runtime Options 
//////////////////////////////////////////////////////////////////////////////// config const n = 32: int; -config const CPUPercent = 0: int; +config const CPUratio = 0: int; config const numTrials = 1: int; config const output = 0: int; config const alpha = 3.0: real(32); @@ -76,7 +76,7 @@ proc printLocaleInfo() { proc main() { writeln("Stream: CPU/GPU Execution (using GPUIterator)"); writeln("Size: ", n); - writeln("CPU ratio: ", CPUPercent); + writeln("CPU ratio: ", CPUratio); writeln("alpha: ", alpha); writeln("nTrials: ", numTrials); writeln("output: ", output); @@ -91,7 +91,7 @@ proc main() { } const startTime = getCurrentTime(); - forall i in GPU(1..n, CUDAWrapper, CPUPercent) { + forall i in GPU(1..n, CUDAWrapper, CPUratio) { A(i) = B(i) + alpha * C(i); } execTimes(trial) = getCurrentTime() - startTime; diff --git a/apps/vector_copy/po.slurm b/apps/vector_copy/po.slurm index ff9b08f..46f8bee 100755 --- a/apps/vector_copy/po.slurm +++ b/apps/vector_copy/po.slurm @@ -22,6 +22,6 @@ do ./vc.gpu -nl 1 --n=$N --numTrials=10 for ratio in 0 25 50 75 100; do - ./vc.hybrid -nl 1 --n=$N --numTrials=10 --CPUPercent=$ratio + ./vc.hybrid -nl 1 --n=$N --numTrials=10 --CPUratio=$ratio done done diff --git a/apps/vector_copy/vc.hybrid.chpl b/apps/vector_copy/vc.hybrid.chpl index e8f9ffb..898e319 100644 --- a/apps/vector_copy/vc.hybrid.chpl +++ b/apps/vector_copy/vc.hybrid.chpl @@ -9,7 +9,7 @@ use GPUIterator; /// Runtime Options //////////////////////////////////////////////////////////////////////////////// config const n = 32: int; -config const CPUPercent = 0: int; +config const CPUratio = 0: int; config const numTrials = 1: int; config const output = 0: int; config param verbose = false; @@ -74,7 +74,7 @@ proc printLocaleInfo() { proc main() { writeln("Vector Copy: CPU/GPU Execution (using GPUIterator)"); writeln("Size: ", n); - writeln("CPU Percent: ", CPUPercent); + writeln("CPU Percent: ", CPUratio); writeln("nTrials: ", numTrials); writeln("output: ", output); @@ 
-88,7 +88,7 @@ proc main() { } const startTime = getCurrentTime(); - forall i in GPU(1..n, CUDAWrapper, CPUPercent) { + forall i in GPU(1..n, CUDAWrapper, CPUratio) { A(i) = B(i); } execTimes(trial) = getCurrentTime() - startTime; From be71b3a79c264b4eb86f708baf7ee7baeb9052d8 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Sat, 16 May 2020 12:25:55 -0400 Subject: [PATCH 058/118] Add LSF script for mm and stream --- apps/mm/mm.lsf | 31 +++++++++++++++++++++++++++++++ apps/stream/stream.lsf | 31 +++++++++++++++++++++++++++++++ 2 files changed, 62 insertions(+) create mode 100755 apps/mm/mm.lsf create mode 100755 apps/stream/stream.lsf diff --git a/apps/mm/mm.lsf b/apps/mm/mm.lsf new file mode 100755 index 0000000..a0a23ba --- /dev/null +++ b/apps/mm/mm.lsf @@ -0,0 +1,31 @@ +#!/bin/bash +#BSUB -P GEN010sollve +#BSUB -W 1:00 +#BSUB -nnodes 8 +#BSUB -alloc_flags smt1 +##BSUB -alloc_flags gpumps +#BSUB -J MM +#BSUB -o MM.%J +#BSUB -e MM.%J + +cd /ccs/home/ahayashi/chiuw2020/chapel-1.20.0 +source ./chiuw2020.sh + +cd /ccs/home/ahayashi/chiuw2020/chapel-gpu/apps/mm + +N=4096 + +for nodes in 1 2 4 8; +do + echo "nTasks: " $nodes + echo "[LOW]" + for ratio in 100 0; + do + echo "CPUratio:" $ratio + jsrun -n $nodes -a 1 -c 42 -g 1 -b rs ./mm.hybrid.dist_real -nl $nodes --n=$N --numTrials=10 --CPUratio=$ratio -v + done + echo "[LOWMID]" + jsrun -n $nodes -a 1 -c 42 -g 1 -b rs ./mm.hybrid.dist.lowmid_real -nl $nodes --n=$N --numTrials=10 --CPUratio=0 -v + echo "[MID]" + jsrun -n $nodes -a 1 -c 42 -g 1 -b rs ./mm.hybrid.dist.mid_real -nl $nodes --n=$N --numTrials=10 --CPUratio=0 -v +done diff --git a/apps/stream/stream.lsf b/apps/stream/stream.lsf new file mode 100755 index 0000000..b2337a0 --- /dev/null +++ b/apps/stream/stream.lsf @@ -0,0 +1,31 @@ +#!/bin/bash +#BSUB -P GEN010sollve +#BSUB -W 0:30 +#BSUB -nnodes 8 +#BSUB -alloc_flags smt1 +##BSUB -alloc_flags gpumps +#BSUB -J ST +#BSUB -o ST.%J +#BSUB -e ST.%J + +cd /ccs/home/ahayashi/chiuw2020/chapel-1.20.0 +source 
./chiuw2020.sh + +cd /ccs/home/ahayashi/chiuw2020/chapel-gpu/apps/stream + +N=1073741824 + +for nodes in 1 2 4 8; +do + echo "nTasks: " $nodes + echo "[LOW]" + for ratio in 100 0; + do + echo "CPUratio:" $ratio + jsrun -n $nodes -a 1 -c 42 -g 1 -b rs ./stream.hybrid.dist_real -nl $nodes --n=$N --numTrials=10 --CPUratio=$ratio -v + done + echo "[LOWMID]" + jsrun -n $nodes -a 1 -c 42 -g 1 -b rs ./stream.hybrid.dist.lowmid_real -nl $nodes --n=$N --numTrials=10 --CPUratio=0 -v + echo "[MID]" + jsrun -n $nodes -a 1 -c 42 -g 1 -b rs ./stream.hybrid.dist.mid_real -nl $nodes --n=$N --numTrials=10 --CPUratio=0 -v +done From e52ae606c7796176617167eb39fec0a772d5fce4 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Wed, 15 Jul 2020 22:35:25 -0400 Subject: [PATCH 059/118] Add CMakeLists.txt --- CMakeLists.txt | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..843baac --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,50 @@ +cmake_minimum_required(VERSION 3.8) +project(ChapelGPUAPI) + +include(CheckLanguage) +check_language(CUDA) +if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3") + add_library(GPUAPI SHARED ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu) + add_library(GPUAPIStatic STATIC ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu) + set_target_properties(GPUAPI PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.h) +else() + message(FATAL_ERROR "No GPU compiler found") +endif() + +# Installation + +if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + set (CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/install" CACHE PATH "default install path" FORCE) +endif() + +include(GNUInstallDirs) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}) +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}) 
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_BINDIR}) + +# Offer the user the choice of overriding the installation directories +set(INSTALL_LIBDIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Installation directory for libraries") +set(INSTALL_BINDIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Installation directory for executables") +set(INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Installation directory for header files") +if(WIN32 AND NOT CYGWIN) + set(DEF_INSTALL_CMAKEDIR CMake) +else() + set(DEF_INSTALL_CMAKEDIR share/cmake/${PROJECT_NAME}) +endif() +set(INSTALL_CMAKEDIR ${DEF_INSTALL_CMAKEDIR} CACHE PATH "Installation directory for CMake files") + +# Report to user +foreach(p LIB INCLUDE) + file(TO_NATIVE_PATH ${CMAKE_INSTALL_PREFIX}/${INSTALL_${p}DIR} _path ) + message(STATUS "Installing ${p} components to ${_path}") + unset(_path) +endforeach() + +install( + TARGETS GPUAPI GPUAPIStatic + ARCHIVE DESTINATION ${INSTALL_LIBDIR} + LIBRARY DESTINATION ${INSTALL_LIBDIR} + PUBLIC_HEADER DESTINATION ${INSTALL_INCLUDEDIR} +) From 9e47b669cdd7e882a56cf984dfe4053504461a8e Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 16 Jul 2020 02:32:53 -0400 Subject: [PATCH 060/118] Add HIP/OpenCL Support in GPUAPI --- CMakeLists.txt | 87 ++++++++++++++++++--- src/GPUAPI.opencl.c | 183 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 260 insertions(+), 10 deletions(-) create mode 100644 src/GPUAPI.opencl.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 843baac..81d0a16 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,15 +1,62 @@ cmake_minimum_required(VERSION 3.8) project(ChapelGPUAPI) +# FindCUDA include(CheckLanguage) check_language(CUDA) + +# FindHIP +if(NOT DEFINED HIP_PATH) + if(NOT DEFINED ENV{HIP_PATH}) + set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + endif() +endif() +set(CMAKE_MODULE_PATH 
"${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) +find_package(HIP) + +# FindOpenCL +find_package(OpenCL) + +set(GPU_COMPILER_FOUND OFF) + if(CMAKE_CUDA_COMPILER) enable_language(CUDA) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3") - add_library(GPUAPI SHARED ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu) - add_library(GPUAPIStatic STATIC ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3") + add_library(GPUAPICUDA SHARED ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu) + add_library(GPUAPICUDA_static STATIC ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu) set_target_properties(GPUAPI PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.h) -else() + set(GPU_COMPILER_FOUND ON) +endif() + +if(HIP_FOUND) + if(EXISTS "${HIP_ROOT_DIR}/hip/bin/hipify-perl") + message(STATUS "Found HIPIFY: " ${HIP_ROOT_DIR}/hip/bin/hipify-perl) + add_custom_command( + OUTPUT GPUAPI.hip.cc + COMMAND ${HIP_ROOT_DIR}/hip/bin/hipify-perl ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu > GPUAPI.hip.cc + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu + COMMENT "Convering GPUAPI.cu to GPUAPI.hip.cc" + ) + set(CMAKE_CXX_COMPILER "${HIP_ROOT_DIR}/hip/bin/hipcc") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fno-gpu-rdc") + hip_add_library(GPUAPIHIP SHARED GPUAPI.hip.cc) + hip_add_library(GPUAPIHIP_static STATIC GPUAPI.hip.cc) + set_target_properties(GPUAPIHIP PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.h) + set(GPU_COMPILER_FOUND ON) + else () + set(HIP_FOUND OFF) + endif() +endif() + +if(OpenCL_FOUND) + add_library(GPUAPIOPENCL SHARED ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.opencl.c) + add_library(GPUAPIOPENCL_static STATIC ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.opencl.c) +endif() + +if(NOT GPU_COMPILER_FOUND) message(FATAL_ERROR "No GPU compiler found") endif() @@ -42,9 +89,29 @@ foreach(p LIB INCLUDE) unset(_path) endforeach() -install( - TARGETS GPUAPI GPUAPIStatic - ARCHIVE DESTINATION 
${INSTALL_LIBDIR} - LIBRARY DESTINATION ${INSTALL_LIBDIR} - PUBLIC_HEADER DESTINATION ${INSTALL_INCLUDEDIR} -) +if(CMAKE_CUDA_COMPILER) + install( + TARGETS GPUAPICUDA GPUAPICUDA_static + ARCHIVE DESTINATION ${INSTALL_LIBDIR} + LIBRARY DESTINATION ${INSTALL_LIBDIR} + PUBLIC_HEADER DESTINATION ${INSTALL_INCLUDEDIR} + ) +endif() + +if (HIP_FOUND) + install( + TARGETS GPUAPIHIP GPUAPIHIP_static + ARCHIVE DESTINATION ${INSTALL_LIBDIR} + LIBRARY DESTINATION ${INSTALL_LIBDIR} + PUBLIC_HEADER DESTINATION ${INSTALL_INCLUDEDIR} + ) +endif() + +if(OpenCL_FOUND) + install( + TARGETS GPUAPIOPENCL GPUAPIOPENCL_static + ARCHIVE DESTINATION ${INSTALL_LIBDIR} + LIBRARY DESTINATION ${INSTALL_LIBDIR} + PUBLIC_HEADER DESTINATION ${INSTALL_INCLUDEDIR} + ) +endif() diff --git a/src/GPUAPI.opencl.c b/src/GPUAPI.opencl.c new file mode 100644 index 0000000..42f7502 --- /dev/null +++ b/src/GPUAPI.opencl.c @@ -0,0 +1,183 @@ +#include +#include +#include +#include +#ifdef __APPLE__ +#include +#else +#include +#endif + +#define MAX_PLATFORM_ENTRIES 8 +#define MAX_DEVICE_ENTRIES 16 + +#define OPENCL_ERROR_CHECK +#define OpenCLSafeCall( err ) __OpenCLSafeCall( err, __FILE__, __LINE__ ) +#define OpenCLCheckError() __OpenCLCheckError( __FILE__, __LINE__ ) + +#ifdef __cplusplus +extern "C" { +#endif + + const char *openclGetErrorString(cl_int error) + { + switch(error){ + // run-time and JIT compiler errors + case 0: return "CL_SUCCESS"; + case -1: return "CL_DEVICE_NOT_FOUND"; + case -2: return "CL_DEVICE_NOT_AVAILABLE"; + case -3: return "CL_COMPILER_NOT_AVAILABLE"; + case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case -5: return "CL_OUT_OF_RESOURCES"; + case -6: return "CL_OUT_OF_HOST_MEMORY"; + case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case -8: return "CL_MEM_COPY_OVERLAP"; + case -9: return "CL_IMAGE_FORMAT_MISMATCH"; + case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case -11: return "CL_BUILD_PROGRAM_FAILURE"; + case -12: return "CL_MAP_FAILURE"; + case -13: return 
"CL_MISALIGNED_SUB_BUFFER_OFFSET"; + case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; + case -15: return "CL_COMPILE_PROGRAM_FAILURE"; + case -16: return "CL_LINKER_NOT_AVAILABLE"; + case -17: return "CL_LINK_PROGRAM_FAILURE"; + case -18: return "CL_DEVICE_PARTITION_FAILED"; + case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; + + // compile-time errors + case -30: return "CL_INVALID_VALUE"; + case -31: return "CL_INVALID_DEVICE_TYPE"; + case -32: return "CL_INVALID_PLATFORM"; + case -33: return "CL_INVALID_DEVICE"; + case -34: return "CL_INVALID_CONTEXT"; + case -35: return "CL_INVALID_QUEUE_PROPERTIES"; + case -36: return "CL_INVALID_COMMAND_QUEUE"; + case -37: return "CL_INVALID_HOST_PTR"; + case -38: return "CL_INVALID_MEM_OBJECT"; + case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case -40: return "CL_INVALID_IMAGE_SIZE"; + case -41: return "CL_INVALID_SAMPLER"; + case -42: return "CL_INVALID_BINARY"; + case -43: return "CL_INVALID_BUILD_OPTIONS"; + case -44: return "CL_INVALID_PROGRAM"; + case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; + case -46: return "CL_INVALID_KERNEL_NAME"; + case -47: return "CL_INVALID_KERNEL_DEFINITION"; + case -48: return "CL_INVALID_KERNEL"; + case -49: return "CL_INVALID_ARG_INDEX"; + case -50: return "CL_INVALID_ARG_VALUE"; + case -51: return "CL_INVALID_ARG_SIZE"; + case -52: return "CL_INVALID_KERNEL_ARGS"; + case -53: return "CL_INVALID_WORK_DIMENSION"; + case -54: return "CL_INVALID_WORK_GROUP_SIZE"; + case -55: return "CL_INVALID_WORK_ITEM_SIZE"; + case -56: return "CL_INVALID_GLOBAL_OFFSET"; + case -57: return "CL_INVALID_EVENT_WAIT_LIST"; + case -58: return "CL_INVALID_EVENT"; + case -59: return "CL_INVALID_OPERATION"; + case -60: return "CL_INVALID_GL_OBJECT"; + case -61: return "CL_INVALID_BUFFER_SIZE"; + case -62: return "CL_INVALID_MIP_LEVEL"; + case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; + case -64: return "CL_INVALID_PROPERTY"; + case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; + case 
-66: return "CL_INVALID_COMPILER_OPTIONS"; + case -67: return "CL_INVALID_LINKER_OPTIONS"; + case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; + + // extension errors + case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; + case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; + case -1002: return "CL_INVALID_D3D10_DEVICE_KHR"; + case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR"; + case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"; + case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"; + default: return "Unknown OpenCL error"; + } + } + + void __OpenCLSafeCall( cl_int err, const char *file, const int line ) { +#ifdef OPENCL_ERROR_CHECK + if ( CL_SUCCESS != err ) + { + fprintf( stderr, "OpenCLSafeCall() failed at %s:%i : %s\n", + file, line, openclGetErrorString( err ) ); + exit( -1 ); + } +#endif + + return; + } + + void GetDeviceCount(int *count) { + cl_platform_id platforms[MAX_PLATFORM_ENTRIES]; + cl_uint num_platforms; + OpenCLSafeCall(clGetPlatformIDs(MAX_PLATFORM_ENTRIES, platforms, &num_platforms)); + printf("GPUAPI: %d OpenCL platform(s) found\n", num_platforms); + char *env = getenv("CHPL_GPU_PLATFORM_ID"); + int specified_pid = -1; + if (env) { + specified_pid = atoi(env); + printf("GPUAPI: CHPL_GPU_PLATFORM_ID is specified: %d\n", specified_pid); + } else { + specified_pid = 0; + printf("GPUAPI: CHPL_GPU_PLATFORM_ID is NOT specified. 
Set to 0\n"); + } + *count = 0; + for (int i = 0; i < num_platforms; i++) { + char buffer[1024]; + OpenCLSafeCall(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 1024, buffer, NULL)); + printf("GPUAPI: platform[%d].VENDOR = %s\n", i, buffer); + cl_device_id devices[MAX_DEVICE_ENTRIES]; + cl_uint num_devices; + OpenCLSafeCall(clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, MAX_DEVICE_ENTRIES, devices, &num_devices)); + printf("GPUAPI: \t%d OpenCL device(s)\n", num_devices); + if (specified_pid == i) { + *count = num_devices; + } + for (int i = 0; i < num_devices; i++) { + OpenCLSafeCall(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL)); + printf("GPUAPI: \tdevice[%d].NAME = %s\n", i, buffer); + } + } + } + + void GetDevice(int *device) { + + } + + void SetDevice(int device) { + + } + + void ProfilerStart() { + } + + void ProfilerStop() { + } + + void DeviceSynchronize() { + } + + void Malloc(void** devPtr, size_t size) { + } + + void Memcpy(void* dst, void* src, size_t count, int kind) { + switch (kind) { + case 0: + //CudaSafeCall(cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice)); + break; + case 1: + //CudaSafeCall(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost)); + break; + default: + printf("Fatal: Wrong Memcpy kind!\n"); + exit(1); + } + } + + void Free(void* devPtr) { + } +#ifdef __cplusplus +} +#endif From e17bef2653ab13b4975864f317c8d5ebdcc399d2 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 16 Jul 2020 02:36:31 -0400 Subject: [PATCH 061/118] Add the -O3 option to the OpenCL compilation flags --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 81d0a16..fa5dd9e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,6 +52,7 @@ if(HIP_FOUND) endif() if(OpenCL_FOUND) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3") add_library(GPUAPIOPENCL SHARED ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.opencl.c) add_library(GPUAPIOPENCL_static STATIC 
${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.opencl.c) endif() From f75cefc7761cbc6440dd1c96f23950e6e58abeba Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 16 Jul 2020 02:47:58 -0400 Subject: [PATCH 062/118] Update CMakeLists.txt --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index fa5dd9e..2bd3912 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,6 +55,7 @@ if(OpenCL_FOUND) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3") add_library(GPUAPIOPENCL SHARED ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.opencl.c) add_library(GPUAPIOPENCL_static STATIC ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.opencl.c) + set(GPU_COMPILER_FOUND ON) endif() if(NOT GPU_COMPILER_FOUND) From 1111865ea981263071d44aab91618b1fdfa179bb Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 16 Jul 2020 12:58:23 -0400 Subject: [PATCH 063/118] Update CMakeLists.txt --- CMakeLists.txt | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2bd3912..51e449e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ project(ChapelGPUAPI) # FindCUDA include(CheckLanguage) -check_language(CUDA) +check_language(CUDA QUIET) # FindHIP if(NOT DEFINED HIP_PATH) @@ -14,10 +14,10 @@ if(NOT DEFINED HIP_PATH) endif() endif() set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) -find_package(HIP) +find_package(HIP QUIET) # FindOpenCL -find_package(OpenCL) +find_package(OpenCL QUIET) set(GPU_COMPILER_FOUND OFF) @@ -32,6 +32,7 @@ endif() if(HIP_FOUND) if(EXISTS "${HIP_ROOT_DIR}/hip/bin/hipify-perl") + message(STATUS "Found HIP: " ${HIP_VERSION}) message(STATUS "Found HIPIFY: " ${HIP_ROOT_DIR}/hip/bin/hipify-perl) add_custom_command( OUTPUT GPUAPI.hip.cc @@ -47,15 +48,23 @@ if(HIP_FOUND) set_target_properties(GPUAPIHIP PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.h) set(GPU_COMPILER_FOUND ON) else () + message(STATUS "Found HIP, but HIPIFY NOTFOUND") set(HIP_FOUND 
OFF) endif() +else() + message(STATUS "HIP NOTFOUND") endif() if(OpenCL_FOUND) + message(STATUS "Found OpenCL: " ${OpenCL_VERSION_STRING}) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3") add_library(GPUAPIOPENCL SHARED ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.opencl.c) add_library(GPUAPIOPENCL_static STATIC ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.opencl.c) + target_link_libraries(GPUAPIOPENCL OpenCL::OpenCL) + target_link_libraries(GPUAPIOPENCL_static OpenCL::OpenCL) set(GPU_COMPILER_FOUND ON) +else() + message(STATUS "OpenCL Not Found") endif() if(NOT GPU_COMPILER_FOUND) @@ -87,7 +96,7 @@ set(INSTALL_CMAKEDIR ${DEF_INSTALL_CMAKEDIR} CACHE PATH "Installation directory # Report to user foreach(p LIB INCLUDE) file(TO_NATIVE_PATH ${CMAKE_INSTALL_PREFIX}/${INSTALL_${p}DIR} _path ) - message(STATUS "Installing ${p} components to ${_path}") + message(STATUS "${p} components will be installed to ${_path}") unset(_path) endforeach() From dd7e04bab5f722edecc91024d61f60d36c878f19 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 16 Jul 2020 16:04:06 -0400 Subject: [PATCH 064/118] Update CMakeLists.txt --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 51e449e..477be30 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,7 +26,7 @@ if(CMAKE_CUDA_COMPILER) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3") add_library(GPUAPICUDA SHARED ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu) add_library(GPUAPICUDA_static STATIC ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu) - set_target_properties(GPUAPI PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.h) + set_target_properties(GPUAPICUDA PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.h) set(GPU_COMPILER_FOUND ON) endif() From 6fa5a415162a2b673380085b4173d1711efe3798 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 16 Jul 2020 16:46:27 -0400 Subject: [PATCH 065/118] Update CMakeLists.txt --- CMakeLists.txt | 21 +++++++++++++++++++-- 
1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 477be30..c6a6da2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,17 +84,18 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_BINDIR} # Offer the user the choice of overriding the installation directories set(INSTALL_LIBDIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Installation directory for libraries") -set(INSTALL_BINDIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Installation directory for executables") set(INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Installation directory for header files") +set(INSTALL_BINDIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Installation directory for executables") if(WIN32 AND NOT CYGWIN) set(DEF_INSTALL_CMAKEDIR CMake) else() set(DEF_INSTALL_CMAKEDIR share/cmake/${PROJECT_NAME}) endif() set(INSTALL_CMAKEDIR ${DEF_INSTALL_CMAKEDIR} CACHE PATH "Installation directory for CMake files") +set(INSTALL_MODULEDIR modules CACHE PATH "Installation directory for Chapel module files") # Report to user -foreach(p LIB INCLUDE) +foreach(p LIB INCLUDE MODULE) file(TO_NATIVE_PATH ${CMAKE_INSTALL_PREFIX}/${INSTALL_${p}DIR} _path ) message(STATUS "${p} components will be installed to ${_path}") unset(_path) @@ -126,3 +127,19 @@ if(OpenCL_FOUND) PUBLIC_HEADER DESTINATION ${INSTALL_INCLUDEDIR} ) endif() + +if(CMAKE_CUDA_COMPILER OR HIP_FOUND) + install( + FILES + ${CMAKE_CURRENT_SOURCE_DIR}/src/lambda.h + DESTINATION ${INSTALL_INCLUDEDIR} + ) +endif() + +install( + FILES + ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUIterator.chpl + ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.chpl + DESTINATION ${INSTALL_MODULEDIR} + ) + From 400a8c4cadea48f6992b35d52283aacde0d2e60c Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 16 Jul 2020 16:46:57 -0400 Subject: [PATCH 066/118] Update CMakeLists.txt --- CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c6a6da2..3fc73c9 100644 
--- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -139,7 +139,6 @@ endif() install( FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUIterator.chpl - ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.chpl + ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.chpl DESTINATION ${INSTALL_MODULEDIR} ) - From d98d71acb44017486f512eb6d82bec75d437ab76 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Mon, 20 Jul 2020 12:10:11 -0400 Subject: [PATCH 067/118] Update GPUIterator.chpl --- src/GPUIterator.chpl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/GPUIterator.chpl b/src/GPUIterator.chpl index fd5bd75..5f1d60e 100644 --- a/src/GPUIterator.chpl +++ b/src/GPUIterator.chpl @@ -230,9 +230,9 @@ module GPUIterator { coforall loc in D.targetLocales() do on loc { for subdom in D.localSubdomains() { - const r = subdom.dim(1); + const r = subdom.dim(0); const portions = computeSubranges(r, CPUPercent); - for i in createTaskAndYield(tag, 0..0, portions(1), portions(2), GPUWrapper) { + for i in createTaskAndYield(tag, 0..0, portions(0), portions(1), GPUWrapper) { yield i; } } @@ -251,7 +251,7 @@ module GPUIterator { && isRectangularDom(D) && D.dist.type <= Block { - const lowBasedIters = followThis(1).translate(D.low); + const lowBasedIters = followThis(0).translate(D.low); if (debugGPUIterator) { writeln("[DEBUG GPUITERATOR] GPUIterator (follower, block distributed)"); @@ -281,10 +281,10 @@ module GPUIterator { coforall loc in D.targetLocales() do on loc { for subdom in D.localSubdomains() { if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR]", here, " (", here.name, ") is responsible for ", subdom); - const r = subdom.dim(1); + const r = subdom.dim(0); const portions = computeSubranges(r, CPUPercent); - for i in createTaskAndYield(tag, 0..0, portions(1), portions(2), GPUWrapper) { + for i in createTaskAndYield(tag, 0..0, portions(0), portions(1), GPUWrapper) { yield i; } } @@ -319,7 +319,7 @@ module GPUIterator { writeln("[DEBUG GPUITERATOR] In GPUIterator (leader 
range)"); const portions = computeSubranges(r, CPUPercent); - for i in createTaskAndYield(tag, r, portions(1), portions(2), GPUWrapper) { + for i in createTaskAndYield(tag, r, portions(0), portions(1), GPUWrapper) { yield i; } } @@ -334,7 +334,7 @@ module GPUIterator { where tag == iterKind.follower && followThis.size == 1 { - const lowBasedIters = followThis(1).translate(r.low); + const lowBasedIters = followThis(0).translate(r.low); if (debugGPUIterator) { writeln("[DEBUG GPUITERATOR] GPUIterator (follower)"); @@ -358,7 +358,7 @@ module GPUIterator { writeln("[DEBUG GPUITERATOR] In GPUIterator (standalone)"); const portions = computeSubranges(r, CPUPercent); - for i in createTaskAndYield(tag, r, portions(1), portions(2), GPUWrapper) { + for i in createTaskAndYield(tag, r, portions(0), portions(1), GPUWrapper) { yield i; } } From 03af6adf219a0373fb3148ada877c535fdce4737 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Mon, 20 Jul 2020 20:07:11 -0400 Subject: [PATCH 068/118] Update GPUIterator.chpl --- src/GPUIterator.chpl | 50 ++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/src/GPUIterator.chpl b/src/GPUIterator.chpl index 5f1d60e..011d025 100644 --- a/src/GPUIterator.chpl +++ b/src/GPUIterator.chpl @@ -59,7 +59,7 @@ module GPUIterator { r: range(?), CPUrange: range(?), GPUrange: range(?), - GPUWrapper: func(int, int, int, void)) + GPUWrapper) where tag == iterKind.leader { if (CPUrange.size == 0) { @@ -134,7 +134,7 @@ module GPUIterator { r: range(?), CPUrange: range(?), GPUrange: range(?), - GPUWrapper: func(int, int, int, void)) + GPUWrapper) where tag == iterKind.standalone { if (CPUrange.size == 0) { @@ -210,14 +210,14 @@ module GPUIterator { iter createTaskAndYield(r: range(?), CPUrange: range(?), GPUrange: range(?), - GPUWrapper: func(int, int, int, void)) { + GPUWrapper) { halt("This is dummy"); } // leader (block distributed domains) iter GPU(param tag: iterKind, D: domain, - GPUWrapper: 
func(int, int, int, void), + GPUWrapper, CPUPercent: int = 0 ) where tag == iterKind.leader @@ -230,9 +230,9 @@ module GPUIterator { coforall loc in D.targetLocales() do on loc { for subdom in D.localSubdomains() { - const r = subdom.dim(0); - const portions = computeSubranges(r, CPUPercent); - for i in createTaskAndYield(tag, 0..0, portions(0), portions(1), GPUWrapper) { + const (r,) = subdom.dims(); + const (CPURange, GPURange) = computeSubranges(r, CPUPercent); + for i in createTaskAndYield(tag, 0..0, CPURange, GPURange, GPUWrapper) { yield i; } } @@ -242,7 +242,7 @@ module GPUIterator { // follower (block distributed domains) iter GPU(param tag: iterKind, D: domain, - GPUWrapper: func(int, int, int, void), + GPUWrapper, CPUPercent: int = 0, followThis ) @@ -251,7 +251,9 @@ module GPUIterator { && isRectangularDom(D) && D.dist.type <= Block { - const lowBasedIters = followThis(0).translate(D.low); + // index-neutral + const (followInds,) = followThis; + const lowBasedIters = followInds.translate(D.low); if (debugGPUIterator) { writeln("[DEBUG GPUITERATOR] GPUIterator (follower, block distributed)"); @@ -266,7 +268,7 @@ module GPUIterator { // standalone (block distributed domains) iter GPU(param tag: iterKind, D: domain, - GPUWrapper: func(int, int, int, void), + GPUWrapper, CPUPercent: int = 0 ) where tag == iterKind.standalone @@ -281,10 +283,10 @@ module GPUIterator { coforall loc in D.targetLocales() do on loc { for subdom in D.localSubdomains() { if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR]", here, " (", here.name, ") is responsible for ", subdom); - const r = subdom.dim(0); - const portions = computeSubranges(r, CPUPercent); + const (r,) = subdom.dims(); + const (CPURange, GPURange) = computeSubranges(r, CPUPercent); - for i in createTaskAndYield(tag, 0..0, portions(0), portions(1), GPUWrapper) { + for i in createTaskAndYield(tag, 0..0, CPURange, GPURange, GPUWrapper) { yield i; } } @@ -293,7 +295,7 @@ module GPUIterator { // serial iterator 
(block distributed domains) iter GPU(D: domain, - GPUWrapper: func(int, int, int, void), + GPUWrapper, CPUPercent: int = 0 ) where isRectangularDom(D) @@ -310,7 +312,7 @@ module GPUIterator { // leader (range) iter GPU(param tag: iterKind, r: range(?), - GPUWrapper: func(int, int, int, void), + GPUWrapper, CPUPercent: int = 0 ) where tag == iterKind.leader { @@ -318,8 +320,8 @@ module GPUIterator { if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] In GPUIterator (leader range)"); - const portions = computeSubranges(r, CPUPercent); - for i in createTaskAndYield(tag, r, portions(0), portions(1), GPUWrapper) { + const (CPURange, GPURange) = computeSubranges(r, CPUPercent); + for i in createTaskAndYield(tag, r, CPURange, GPURange, GPUWrapper) { yield i; } } @@ -327,14 +329,16 @@ module GPUIterator { // follower iter GPU(param tag: iterKind, r:range(?), - GPUWrapper: func(int, int, int, void), + GPUWrapper, CPUPercent: int = 0, followThis ) where tag == iterKind.follower && followThis.size == 1 { - const lowBasedIters = followThis(0).translate(r.low); + // index-neutral + const (followInds,) = followThis; + const lowBasedIters = followInds.translate(r.low); if (debugGPUIterator) { writeln("[DEBUG GPUITERATOR] GPUIterator (follower)"); @@ -349,7 +353,7 @@ module GPUIterator { // standalone (range) iter GPU(param tag: iterKind, r: range(?), - GPUWrapper: func(int, int, int, void), + GPUWrapper, CPUPercent: int = 0 ) where tag == iterKind.standalone { @@ -357,15 +361,15 @@ module GPUIterator { if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] In GPUIterator (standalone)"); - const portions = computeSubranges(r, CPUPercent); - for i in createTaskAndYield(tag, r, portions(0), portions(1), GPUWrapper) { + const (CPURange, GPURange) = computeSubranges(r, CPUPercent); + for i in createTaskAndYield(tag, r, CPURange, GPURange, GPUWrapper) { yield i; } } // serial iterators (range) iter GPU(r:range(?), - GPUWrapper: func(int, int, int, void), + GPUWrapper, CPUPercent: 
int = 0 ) { if (debugGPUIterator) then From b176bc68a9106e54c3455a43f9faf826ff0e14ac Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Mon, 20 Jul 2020 22:12:35 -0400 Subject: [PATCH 069/118] Update GPUIterator.chpl --- src/GPUIterator.chpl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/GPUIterator.chpl b/src/GPUIterator.chpl index 011d025..d7fbc41 100644 --- a/src/GPUIterator.chpl +++ b/src/GPUIterator.chpl @@ -45,7 +45,7 @@ module GPUIterator { inline proc computeChunk(r: range, myChunk, numChunks) where r.stridable == false { - const numElems = r.length; + const numElems = r.size; const elemsPerChunk = numElems/numChunks; const mylow = r.low + elemsPerChunk*myChunk; if (myChunk != numChunks - 1) { @@ -71,7 +71,7 @@ module GPUIterator { const myIters = GPUrange; if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] GPU portion: ", myIters, " CPU portion is ZERO"); - GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.length); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.size); } otherwise { coforall tid in 0..#nGPUs { @@ -79,7 +79,7 @@ module GPUIterator { if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] GPU", tid, " portion", ":", myIters, " CPU portion is ZERO"); SetDevice(tid:int(32)); - GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.size); } } } @@ -113,7 +113,7 @@ module GPUIterator { const myIters = GPUrange; if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] GPU portion: ", myIters); - GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.length); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.size); } otherwise { coforall tid in 0..#nGPUs { @@ -121,7 +121,7 @@ module GPUIterator { if (debugGPUIterator) then writeln("[DEBUG 
GPUITERATOR] GPU", tid, " portion", ":", myIters); SetDevice(tid:int(32)); - GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.size); } } } @@ -146,7 +146,7 @@ module GPUIterator { const myIters = GPUrange; if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] GPU portion: ", myIters, " CPU portion is ZERO"); - GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.size); } otherwise { coforall tid in 0..#nGPUs { @@ -154,7 +154,7 @@ module GPUIterator { if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] GPU", tid, " portion", ":", myIters, " CPU portion is ZERO"); SetDevice(tid:int(32)); - GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.size); } } } @@ -190,7 +190,7 @@ module GPUIterator { const myIters = GPUrange; if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] GPU portion: ", myIters); - GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.length); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.size); } otherwise { coforall tid in 0..#nGPUs { @@ -198,7 +198,7 @@ module GPUIterator { if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] GPU", tid, " portion", ":", myIters); SetDevice(tid:int(32)); - GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.size); } } } From 53a7f400af2b569410933e3401f7496f120ffc0d Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Mon, 20 Jul 2020 12:10:11 -0400 Subject: [PATCH 070/118] Update GPUIterator.chpl --- src/GPUIterator.chpl | 16 
++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/GPUIterator.chpl b/src/GPUIterator.chpl index fd5bd75..5f1d60e 100644 --- a/src/GPUIterator.chpl +++ b/src/GPUIterator.chpl @@ -230,9 +230,9 @@ module GPUIterator { coforall loc in D.targetLocales() do on loc { for subdom in D.localSubdomains() { - const r = subdom.dim(1); + const r = subdom.dim(0); const portions = computeSubranges(r, CPUPercent); - for i in createTaskAndYield(tag, 0..0, portions(1), portions(2), GPUWrapper) { + for i in createTaskAndYield(tag, 0..0, portions(0), portions(1), GPUWrapper) { yield i; } } @@ -251,7 +251,7 @@ module GPUIterator { && isRectangularDom(D) && D.dist.type <= Block { - const lowBasedIters = followThis(1).translate(D.low); + const lowBasedIters = followThis(0).translate(D.low); if (debugGPUIterator) { writeln("[DEBUG GPUITERATOR] GPUIterator (follower, block distributed)"); @@ -281,10 +281,10 @@ module GPUIterator { coforall loc in D.targetLocales() do on loc { for subdom in D.localSubdomains() { if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR]", here, " (", here.name, ") is responsible for ", subdom); - const r = subdom.dim(1); + const r = subdom.dim(0); const portions = computeSubranges(r, CPUPercent); - for i in createTaskAndYield(tag, 0..0, portions(1), portions(2), GPUWrapper) { + for i in createTaskAndYield(tag, 0..0, portions(0), portions(1), GPUWrapper) { yield i; } } @@ -319,7 +319,7 @@ module GPUIterator { writeln("[DEBUG GPUITERATOR] In GPUIterator (leader range)"); const portions = computeSubranges(r, CPUPercent); - for i in createTaskAndYield(tag, r, portions(1), portions(2), GPUWrapper) { + for i in createTaskAndYield(tag, r, portions(0), portions(1), GPUWrapper) { yield i; } } @@ -334,7 +334,7 @@ module GPUIterator { where tag == iterKind.follower && followThis.size == 1 { - const lowBasedIters = followThis(1).translate(r.low); + const lowBasedIters = followThis(0).translate(r.low); if (debugGPUIterator) { writeln("[DEBUG 
GPUITERATOR] GPUIterator (follower)"); @@ -358,7 +358,7 @@ module GPUIterator { writeln("[DEBUG GPUITERATOR] In GPUIterator (standalone)"); const portions = computeSubranges(r, CPUPercent); - for i in createTaskAndYield(tag, r, portions(1), portions(2), GPUWrapper) { + for i in createTaskAndYield(tag, r, portions(0), portions(1), GPUWrapper) { yield i; } } From fba9c169f2c207f4eb9698859390c5e58c610f7f Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Mon, 20 Jul 2020 20:07:11 -0400 Subject: [PATCH 071/118] Update GPUIterator.chpl --- src/GPUIterator.chpl | 50 ++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/src/GPUIterator.chpl b/src/GPUIterator.chpl index 5f1d60e..011d025 100644 --- a/src/GPUIterator.chpl +++ b/src/GPUIterator.chpl @@ -59,7 +59,7 @@ module GPUIterator { r: range(?), CPUrange: range(?), GPUrange: range(?), - GPUWrapper: func(int, int, int, void)) + GPUWrapper) where tag == iterKind.leader { if (CPUrange.size == 0) { @@ -134,7 +134,7 @@ module GPUIterator { r: range(?), CPUrange: range(?), GPUrange: range(?), - GPUWrapper: func(int, int, int, void)) + GPUWrapper) where tag == iterKind.standalone { if (CPUrange.size == 0) { @@ -210,14 +210,14 @@ module GPUIterator { iter createTaskAndYield(r: range(?), CPUrange: range(?), GPUrange: range(?), - GPUWrapper: func(int, int, int, void)) { + GPUWrapper) { halt("This is dummy"); } // leader (block distributed domains) iter GPU(param tag: iterKind, D: domain, - GPUWrapper: func(int, int, int, void), + GPUWrapper, CPUPercent: int = 0 ) where tag == iterKind.leader @@ -230,9 +230,9 @@ module GPUIterator { coforall loc in D.targetLocales() do on loc { for subdom in D.localSubdomains() { - const r = subdom.dim(0); - const portions = computeSubranges(r, CPUPercent); - for i in createTaskAndYield(tag, 0..0, portions(0), portions(1), GPUWrapper) { + const (r,) = subdom.dims(); + const (CPURange, GPURange) = computeSubranges(r, CPUPercent); + for i in 
createTaskAndYield(tag, 0..0, CPURange, GPURange, GPUWrapper) { yield i; } } @@ -242,7 +242,7 @@ module GPUIterator { // follower (block distributed domains) iter GPU(param tag: iterKind, D: domain, - GPUWrapper: func(int, int, int, void), + GPUWrapper, CPUPercent: int = 0, followThis ) @@ -251,7 +251,9 @@ module GPUIterator { && isRectangularDom(D) && D.dist.type <= Block { - const lowBasedIters = followThis(0).translate(D.low); + // index-neutral + const (followInds,) = followThis; + const lowBasedIters = followInds.translate(D.low); if (debugGPUIterator) { writeln("[DEBUG GPUITERATOR] GPUIterator (follower, block distributed)"); @@ -266,7 +268,7 @@ module GPUIterator { // standalone (block distributed domains) iter GPU(param tag: iterKind, D: domain, - GPUWrapper: func(int, int, int, void), + GPUWrapper, CPUPercent: int = 0 ) where tag == iterKind.standalone @@ -281,10 +283,10 @@ module GPUIterator { coforall loc in D.targetLocales() do on loc { for subdom in D.localSubdomains() { if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR]", here, " (", here.name, ") is responsible for ", subdom); - const r = subdom.dim(0); - const portions = computeSubranges(r, CPUPercent); + const (r,) = subdom.dims(); + const (CPURange, GPURange) = computeSubranges(r, CPUPercent); - for i in createTaskAndYield(tag, 0..0, portions(0), portions(1), GPUWrapper) { + for i in createTaskAndYield(tag, 0..0, CPURange, GPURange, GPUWrapper) { yield i; } } @@ -293,7 +295,7 @@ module GPUIterator { // serial iterator (block distributed domains) iter GPU(D: domain, - GPUWrapper: func(int, int, int, void), + GPUWrapper, CPUPercent: int = 0 ) where isRectangularDom(D) @@ -310,7 +312,7 @@ module GPUIterator { // leader (range) iter GPU(param tag: iterKind, r: range(?), - GPUWrapper: func(int, int, int, void), + GPUWrapper, CPUPercent: int = 0 ) where tag == iterKind.leader { @@ -318,8 +320,8 @@ module GPUIterator { if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] In GPUIterator (leader 
range)"); - const portions = computeSubranges(r, CPUPercent); - for i in createTaskAndYield(tag, r, portions(0), portions(1), GPUWrapper) { + const (CPURange, GPURange) = computeSubranges(r, CPUPercent); + for i in createTaskAndYield(tag, r, CPURange, GPURange, GPUWrapper) { yield i; } } @@ -327,14 +329,16 @@ module GPUIterator { // follower iter GPU(param tag: iterKind, r:range(?), - GPUWrapper: func(int, int, int, void), + GPUWrapper, CPUPercent: int = 0, followThis ) where tag == iterKind.follower && followThis.size == 1 { - const lowBasedIters = followThis(0).translate(r.low); + // index-neutral + const (followInds,) = followThis; + const lowBasedIters = followInds.translate(r.low); if (debugGPUIterator) { writeln("[DEBUG GPUITERATOR] GPUIterator (follower)"); @@ -349,7 +353,7 @@ module GPUIterator { // standalone (range) iter GPU(param tag: iterKind, r: range(?), - GPUWrapper: func(int, int, int, void), + GPUWrapper, CPUPercent: int = 0 ) where tag == iterKind.standalone { @@ -357,15 +361,15 @@ module GPUIterator { if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] In GPUIterator (standalone)"); - const portions = computeSubranges(r, CPUPercent); - for i in createTaskAndYield(tag, r, portions(0), portions(1), GPUWrapper) { + const (CPURange, GPURange) = computeSubranges(r, CPUPercent); + for i in createTaskAndYield(tag, r, CPURange, GPURange, GPUWrapper) { yield i; } } // serial iterators (range) iter GPU(r:range(?), - GPUWrapper: func(int, int, int, void), + GPUWrapper, CPUPercent: int = 0 ) { if (debugGPUIterator) then From 63d10665b67f210deeb4a0d4f2ac6618512da2b3 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Mon, 20 Jul 2020 22:12:35 -0400 Subject: [PATCH 072/118] Update GPUIterator.chpl --- src/GPUIterator.chpl | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/GPUIterator.chpl b/src/GPUIterator.chpl index 011d025..d7fbc41 100644 --- a/src/GPUIterator.chpl +++ b/src/GPUIterator.chpl @@ -45,7 +45,7 @@ module 
GPUIterator { inline proc computeChunk(r: range, myChunk, numChunks) where r.stridable == false { - const numElems = r.length; + const numElems = r.size; const elemsPerChunk = numElems/numChunks; const mylow = r.low + elemsPerChunk*myChunk; if (myChunk != numChunks - 1) { @@ -71,7 +71,7 @@ module GPUIterator { const myIters = GPUrange; if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] GPU portion: ", myIters, " CPU portion is ZERO"); - GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.length); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.size); } otherwise { coforall tid in 0..#nGPUs { @@ -79,7 +79,7 @@ module GPUIterator { if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] GPU", tid, " portion", ":", myIters, " CPU portion is ZERO"); SetDevice(tid:int(32)); - GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.size); } } } @@ -113,7 +113,7 @@ module GPUIterator { const myIters = GPUrange; if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] GPU portion: ", myIters); - GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.length); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.size); } otherwise { coforall tid in 0..#nGPUs { @@ -121,7 +121,7 @@ module GPUIterator { if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] GPU", tid, " portion", ":", myIters); SetDevice(tid:int(32)); - GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.size); } } } @@ -146,7 +146,7 @@ module GPUIterator { const myIters = GPUrange; if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] GPU portion: ", myIters, " CPU portion is ZERO"); - 
GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.size); } otherwise { coforall tid in 0..#nGPUs { @@ -154,7 +154,7 @@ module GPUIterator { if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] GPU", tid, " portion", ":", myIters, " CPU portion is ZERO"); SetDevice(tid:int(32)); - GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.size); } } } @@ -190,7 +190,7 @@ module GPUIterator { const myIters = GPUrange; if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] GPU portion: ", myIters); - GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.length); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, GPUrange.size); } otherwise { coforall tid in 0..#nGPUs { @@ -198,7 +198,7 @@ module GPUIterator { if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] GPU", tid, " portion", ":", myIters); SetDevice(tid:int(32)); - GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.length); + GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.size); } } } From 226b01dc3d046e6193e576d74be9e2c4bbf21cc4 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Mon, 20 Jul 2020 22:49:08 -0400 Subject: [PATCH 073/118] Add BlockDist to each app --- apps/blackscholes/bs.hybrid.dist.chpl | 3 ++- apps/logisticregression/lr.hybrid.dist.chpl | 1 + apps/mm/mm.hybrid.dist.chpl | 3 ++- apps/stream/stream.hybrid.dist.chpl | 3 ++- apps/vector_copy/vc.hybrid.dist.chpl | 1 + 5 files changed, 8 insertions(+), 3 deletions(-) diff --git a/apps/blackscholes/bs.hybrid.dist.chpl b/apps/blackscholes/bs.hybrid.dist.chpl index da89c30..a1f5a49 100644 --- a/apps/blackscholes/bs.hybrid.dist.chpl +++ 
b/apps/blackscholes/bs.hybrid.dist.chpl @@ -4,6 +4,7 @@ use Time; /// GPUIterator //////////////////////////////////////////////////////////////////////////////// use GPUIterator; +use BlockDist; //////////////////////////////////////////////////////////////////////////////// /// Runtime Options @@ -80,7 +81,7 @@ proc main() { writeln("BlackScholes: CPU/GPU Execution (using GPUIterator)"); writeln("Size: ", n); writeln("CPU ratio: ", CPUratio); - writeln("nGPUs: ", nGPUs); + writeln("nGPUs: ", nGPUs); writeln("nTrials: ", numTrials); writeln("output: ", output); diff --git a/apps/logisticregression/lr.hybrid.dist.chpl b/apps/logisticregression/lr.hybrid.dist.chpl index 9735a76..d7ef012 100644 --- a/apps/logisticregression/lr.hybrid.dist.chpl +++ b/apps/logisticregression/lr.hybrid.dist.chpl @@ -4,6 +4,7 @@ use ReplicatedDist; /// GPUIterator //////////////////////////////////////////////////////////////////////////////// use GPUIterator; +use BlockDist; //////////////////////////////////////////////////////////////////////////////// /// Runtime Options diff --git a/apps/mm/mm.hybrid.dist.chpl b/apps/mm/mm.hybrid.dist.chpl index 2ee668c..d18cc04 100644 --- a/apps/mm/mm.hybrid.dist.chpl +++ b/apps/mm/mm.hybrid.dist.chpl @@ -4,6 +4,7 @@ use ReplicatedDist; /// GPUIterator //////////////////////////////////////////////////////////////////////////////// use GPUIterator; +use BlockDist; //////////////////////////////////////////////////////////////////////////////// /// Runtime Options @@ -87,7 +88,7 @@ proc main() { writeln("Matrix Multiplication: CPU/GPU Execution (using GPUIterator)"); writeln("Size: ", n, "x", n); writeln("CPU ratio: ", CPUratio); - writeln("nGPUs: ", nGPUs); + writeln("nGPUs: ", nGPUs); writeln("nTrials: ", numTrials); writeln("tiled: ", tiled); writeln("output: ", output); diff --git a/apps/stream/stream.hybrid.dist.chpl b/apps/stream/stream.hybrid.dist.chpl index 3067790..6c9be10 100644 --- a/apps/stream/stream.hybrid.dist.chpl +++ 
b/apps/stream/stream.hybrid.dist.chpl @@ -4,6 +4,7 @@ use Time; /// GPUIterator //////////////////////////////////////////////////////////////////////////////// use GPUIterator; +use BlockDist; //////////////////////////////////////////////////////////////////////////////// /// Runtime Options @@ -81,7 +82,7 @@ proc main() { writeln("Stream: CPU/GPU Execution (using GPUIterator)"); writeln("Size: ", n); writeln("CPU ratio: ", CPUratio); - writeln("nGPUs: ", nGPUs); + writeln("nGPUs: ", nGPUs); writeln("alpha: ", alpha); writeln("nTrials: ", numTrials); writeln("output: ", output); diff --git a/apps/vector_copy/vc.hybrid.dist.chpl b/apps/vector_copy/vc.hybrid.dist.chpl index 3a803ec..afbdba5 100644 --- a/apps/vector_copy/vc.hybrid.dist.chpl +++ b/apps/vector_copy/vc.hybrid.dist.chpl @@ -4,6 +4,7 @@ use Time; /// GPUIterator //////////////////////////////////////////////////////////////////////////////// use GPUIterator; +use BlockDist; //////////////////////////////////////////////////////////////////////////////// /// Runtime Options From b7bc5281f8f609f29ef0c4ee3a6071c0e7461310 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 23 Jul 2020 01:09:40 -0400 Subject: [PATCH 074/118] Update CMakeLists.txt --- CMakeLists.txt | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3fc73c9..d100f87 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,7 +19,8 @@ find_package(HIP QUIET) # FindOpenCL find_package(OpenCL QUIET) -set(GPU_COMPILER_FOUND OFF) +# +set(GPU_ROOT_DIRS "") if(CMAKE_CUDA_COMPILER) enable_language(CUDA) @@ -27,7 +28,7 @@ if(CMAKE_CUDA_COMPILER) add_library(GPUAPICUDA SHARED ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu) add_library(GPUAPICUDA_static STATIC ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu) set_target_properties(GPUAPICUDA PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.h) - set(GPU_COMPILER_FOUND ON) + list(APPEND GPU_ROOT_DIRS "export 
CUDA_ROOT_DIR=${${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/..") endif() if(HIP_FOUND) @@ -46,7 +47,7 @@ if(HIP_FOUND) hip_add_library(GPUAPIHIP SHARED GPUAPI.hip.cc) hip_add_library(GPUAPIHIP_static STATIC GPUAPI.hip.cc) set_target_properties(GPUAPIHIP PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.h) - set(GPU_COMPILER_FOUND ON) + list(APPEND GPU_ROOT_DIRS "export HIP_ROOT_DIR=${HIP_ROOT_DIR}") else () message(STATUS "Found HIP, but HIPIFY NOTFOUND") set(HIP_FOUND OFF) @@ -61,14 +62,21 @@ if(OpenCL_FOUND) add_library(GPUAPIOPENCL SHARED ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.opencl.c) add_library(GPUAPIOPENCL_static STATIC ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.opencl.c) target_link_libraries(GPUAPIOPENCL OpenCL::OpenCL) - target_link_libraries(GPUAPIOPENCL_static OpenCL::OpenCL) + target_link_libraries(GPUAPIOPENCL_static OpenCL::OpenCL) + list(APPEND GPU_ROOT_DIRS "export OpenCL_INCLUDE_DIRS=${OpenCL_INCLUDE_DIRS}") + list(APPEND GPU_ROOT_DIRS "export OpenCL_LIBRARIES=${OpenCL_LIBRARIES}") set(GPU_COMPILER_FOUND ON) else() message(STATUS "OpenCL Not Found") endif() -if(NOT GPU_COMPILER_FOUND) +if(NOT GPU_ROOT_DIRS) message(FATAL_ERROR "No GPU compiler found") +else() + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/env.sh "#!/bin/bash\n") + foreach (p ${GPU_ROOT_DIRS}) + file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/env.sh ${p}\n) + endforeach() endif() # Installation @@ -142,3 +150,9 @@ install( ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.chpl DESTINATION ${INSTALL_MODULEDIR} ) + +install( + PROGRAMS + ${CMAKE_CURRENT_BINARY_DIR}/env.sh + DESTINATION ${INSTALL_BINDIR} + ) From b3f088397db8d4d1223cf509e92d7313f4f54aec Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Wed, 15 Jul 2020 22:35:25 -0400 Subject: [PATCH 075/118] Add CMakeLists.txt --- CMakeLists.txt | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 
0000000..843baac --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,50 @@ +cmake_minimum_required(VERSION 3.8) +project(ChapelGPUAPI) + +include(CheckLanguage) +check_language(CUDA) +if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3") + add_library(GPUAPI SHARED ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu) + add_library(GPUAPIStatic STATIC ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu) + set_target_properties(GPUAPI PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.h) +else() + message(FATAL_ERROR "No GPU compiler found") +endif() + +# Installation + +if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + set (CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/install" CACHE PATH "default install path" FORCE) +endif() + +include(GNUInstallDirs) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}) +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_BINDIR}) + +# Offer the user the choice of overriding the installation directories +set(INSTALL_LIBDIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Installation directory for libraries") +set(INSTALL_BINDIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Installation directory for executables") +set(INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Installation directory for header files") +if(WIN32 AND NOT CYGWIN) + set(DEF_INSTALL_CMAKEDIR CMake) +else() + set(DEF_INSTALL_CMAKEDIR share/cmake/${PROJECT_NAME}) +endif() +set(INSTALL_CMAKEDIR ${DEF_INSTALL_CMAKEDIR} CACHE PATH "Installation directory for CMake files") + +# Report to user +foreach(p LIB INCLUDE) + file(TO_NATIVE_PATH ${CMAKE_INSTALL_PREFIX}/${INSTALL_${p}DIR} _path ) + message(STATUS "Installing ${p} components to ${_path}") + unset(_path) +endforeach() + +install( + TARGETS GPUAPI GPUAPIStatic + ARCHIVE DESTINATION ${INSTALL_LIBDIR} + LIBRARY DESTINATION ${INSTALL_LIBDIR} + PUBLIC_HEADER 
DESTINATION ${INSTALL_INCLUDEDIR} +) From f3b38b33fc17f8b49c76e4e59598ac9a0c842826 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 16 Jul 2020 02:32:53 -0400 Subject: [PATCH 076/118] Add HIP/OpenCL Support in GPUAPI --- CMakeLists.txt | 87 ++++++++++++++++++--- src/GPUAPI.opencl.c | 183 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 260 insertions(+), 10 deletions(-) create mode 100644 src/GPUAPI.opencl.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 843baac..81d0a16 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,15 +1,62 @@ cmake_minimum_required(VERSION 3.8) project(ChapelGPUAPI) +# FindCUDA include(CheckLanguage) check_language(CUDA) + +# FindHIP +if(NOT DEFINED HIP_PATH) + if(NOT DEFINED ENV{HIP_PATH}) + set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + endif() +endif() +set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) +find_package(HIP) + +# FindOpenCL +find_package(OpenCL) + +set(GPU_COMPILER_FOUND OFF) + if(CMAKE_CUDA_COMPILER) enable_language(CUDA) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3") - add_library(GPUAPI SHARED ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu) - add_library(GPUAPIStatic STATIC ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3") + add_library(GPUAPICUDA SHARED ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu) + add_library(GPUAPICUDA_static STATIC ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu) set_target_properties(GPUAPI PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.h) -else() + set(GPU_COMPILER_FOUND ON) +endif() + +if(HIP_FOUND) + if(EXISTS "${HIP_ROOT_DIR}/hip/bin/hipify-perl") + message(STATUS "Found HIPIFY: " ${HIP_ROOT_DIR}/hip/bin/hipify-perl) + add_custom_command( + OUTPUT GPUAPI.hip.cc + COMMAND ${HIP_ROOT_DIR}/hip/bin/hipify-perl ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu > GPUAPI.hip.cc + 
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu + COMMENT "Convering GPUAPI.cu to GPUAPI.hip.cc" + ) + set(CMAKE_CXX_COMPILER "${HIP_ROOT_DIR}/hip/bin/hipcc") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fno-gpu-rdc") + hip_add_library(GPUAPIHIP SHARED GPUAPI.hip.cc) + hip_add_library(GPUAPIHIP_static STATIC GPUAPI.hip.cc) + set_target_properties(GPUAPIHIP PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.h) + set(GPU_COMPILER_FOUND ON) + else () + set(HIP_FOUND OFF) + endif() +endif() + +if(OpenCL_FOUND) + add_library(GPUAPIOPENCL SHARED ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.opencl.c) + add_library(GPUAPIOPENCL_static STATIC ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.opencl.c) +endif() + +if(NOT GPU_COMPILER_FOUND) message(FATAL_ERROR "No GPU compiler found") endif() @@ -42,9 +89,29 @@ foreach(p LIB INCLUDE) unset(_path) endforeach() -install( - TARGETS GPUAPI GPUAPIStatic - ARCHIVE DESTINATION ${INSTALL_LIBDIR} - LIBRARY DESTINATION ${INSTALL_LIBDIR} - PUBLIC_HEADER DESTINATION ${INSTALL_INCLUDEDIR} -) +if(CMAKE_CUDA_COMPILER) + install( + TARGETS GPUAPICUDA GPUAPICUDA_static + ARCHIVE DESTINATION ${INSTALL_LIBDIR} + LIBRARY DESTINATION ${INSTALL_LIBDIR} + PUBLIC_HEADER DESTINATION ${INSTALL_INCLUDEDIR} + ) +endif() + +if (HIP_FOUND) + install( + TARGETS GPUAPIHIP GPUAPIHIP_static + ARCHIVE DESTINATION ${INSTALL_LIBDIR} + LIBRARY DESTINATION ${INSTALL_LIBDIR} + PUBLIC_HEADER DESTINATION ${INSTALL_INCLUDEDIR} + ) +endif() + +if(OpenCL_FOUND) + install( + TARGETS GPUAPIOPENCL GPUAPIOPENCL_static + ARCHIVE DESTINATION ${INSTALL_LIBDIR} + LIBRARY DESTINATION ${INSTALL_LIBDIR} + PUBLIC_HEADER DESTINATION ${INSTALL_INCLUDEDIR} + ) +endif() diff --git a/src/GPUAPI.opencl.c b/src/GPUAPI.opencl.c new file mode 100644 index 0000000..42f7502 --- /dev/null +++ b/src/GPUAPI.opencl.c @@ -0,0 +1,183 @@ +#include +#include +#include +#include +#ifdef __APPLE__ +#include +#else +#include +#endif + +#define 
MAX_PLATFORM_ENTRIES 8 +#define MAX_DEVICE_ENTRIES 16 + +#define OPENCL_ERROR_CHECK +#define OpenCLSafeCall( err ) __OpenCLSafeCall( err, __FILE__, __LINE__ ) +#define OpenCLCheckError() __OpenCLCheckError( __FILE__, __LINE__ ) + +#ifdef __cplusplus +extern "C" { +#endif + + const char *openclGetErrorString(cl_int error) + { + switch(error){ + // run-time and JIT compiler errors + case 0: return "CL_SUCCESS"; + case -1: return "CL_DEVICE_NOT_FOUND"; + case -2: return "CL_DEVICE_NOT_AVAILABLE"; + case -3: return "CL_COMPILER_NOT_AVAILABLE"; + case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case -5: return "CL_OUT_OF_RESOURCES"; + case -6: return "CL_OUT_OF_HOST_MEMORY"; + case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case -8: return "CL_MEM_COPY_OVERLAP"; + case -9: return "CL_IMAGE_FORMAT_MISMATCH"; + case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case -11: return "CL_BUILD_PROGRAM_FAILURE"; + case -12: return "CL_MAP_FAILURE"; + case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; + case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; + case -15: return "CL_COMPILE_PROGRAM_FAILURE"; + case -16: return "CL_LINKER_NOT_AVAILABLE"; + case -17: return "CL_LINK_PROGRAM_FAILURE"; + case -18: return "CL_DEVICE_PARTITION_FAILED"; + case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; + + // compile-time errors + case -30: return "CL_INVALID_VALUE"; + case -31: return "CL_INVALID_DEVICE_TYPE"; + case -32: return "CL_INVALID_PLATFORM"; + case -33: return "CL_INVALID_DEVICE"; + case -34: return "CL_INVALID_CONTEXT"; + case -35: return "CL_INVALID_QUEUE_PROPERTIES"; + case -36: return "CL_INVALID_COMMAND_QUEUE"; + case -37: return "CL_INVALID_HOST_PTR"; + case -38: return "CL_INVALID_MEM_OBJECT"; + case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case -40: return "CL_INVALID_IMAGE_SIZE"; + case -41: return "CL_INVALID_SAMPLER"; + case -42: return "CL_INVALID_BINARY"; + case -43: return "CL_INVALID_BUILD_OPTIONS"; + case -44: return 
"CL_INVALID_PROGRAM"; + case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; + case -46: return "CL_INVALID_KERNEL_NAME"; + case -47: return "CL_INVALID_KERNEL_DEFINITION"; + case -48: return "CL_INVALID_KERNEL"; + case -49: return "CL_INVALID_ARG_INDEX"; + case -50: return "CL_INVALID_ARG_VALUE"; + case -51: return "CL_INVALID_ARG_SIZE"; + case -52: return "CL_INVALID_KERNEL_ARGS"; + case -53: return "CL_INVALID_WORK_DIMENSION"; + case -54: return "CL_INVALID_WORK_GROUP_SIZE"; + case -55: return "CL_INVALID_WORK_ITEM_SIZE"; + case -56: return "CL_INVALID_GLOBAL_OFFSET"; + case -57: return "CL_INVALID_EVENT_WAIT_LIST"; + case -58: return "CL_INVALID_EVENT"; + case -59: return "CL_INVALID_OPERATION"; + case -60: return "CL_INVALID_GL_OBJECT"; + case -61: return "CL_INVALID_BUFFER_SIZE"; + case -62: return "CL_INVALID_MIP_LEVEL"; + case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; + case -64: return "CL_INVALID_PROPERTY"; + case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; + case -66: return "CL_INVALID_COMPILER_OPTIONS"; + case -67: return "CL_INVALID_LINKER_OPTIONS"; + case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; + + // extension errors + case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; + case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; + case -1002: return "CL_INVALID_D3D10_DEVICE_KHR"; + case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR"; + case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"; + case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"; + default: return "Unknown OpenCL error"; + } + } + + void __OpenCLSafeCall( cl_int err, const char *file, const int line ) { +#ifdef OPENCL_ERROR_CHECK + if ( CL_SUCCESS != err ) + { + fprintf( stderr, "OpenCLSafeCall() failed at %s:%i : %s\n", + file, line, openclGetErrorString( err ) ); + exit( -1 ); + } +#endif + + return; + } + + void GetDeviceCount(int *count) { + cl_platform_id platforms[MAX_PLATFORM_ENTRIES]; + cl_uint num_platforms; + OpenCLSafeCall(clGetPlatformIDs(MAX_PLATFORM_ENTRIES, 
platforms, &num_platforms)); + printf("GPUAPI: %d OpenCL platform(s) found\n", num_platforms); + char *env = getenv("CHPL_GPU_PLATFORM_ID"); + int specified_pid = -1; + if (env) { + specified_pid = atoi(env); + printf("GPUAPI: CHPL_GPU_PLATFORM_ID is specified: %d\n", specified_pid); + } else { + specified_pid = 0; + printf("GPUAPI: CHPL_GPU_PLATFORM_ID is NOT specified. Set to 0\n"); + } + *count = 0; + for (int i = 0; i < num_platforms; i++) { + char buffer[1024]; + OpenCLSafeCall(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 1024, buffer, NULL)); + printf("GPUAPI: platform[%d].VENDOR = %s\n", i, buffer); + cl_device_id devices[MAX_DEVICE_ENTRIES]; + cl_uint num_devices; + OpenCLSafeCall(clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, MAX_DEVICE_ENTRIES, devices, &num_devices)); + printf("GPUAPI: \t%d OpenCL device(s)\n", num_devices); + if (specified_pid == i) { + *count = num_devices; + } + for (int i = 0; i < num_devices; i++) { + OpenCLSafeCall(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL)); + printf("GPUAPI: \tdevice[%d].NAME = %s\n", i, buffer); + } + } + } + + void GetDevice(int *device) { + + } + + void SetDevice(int device) { + + } + + void ProfilerStart() { + } + + void ProfilerStop() { + } + + void DeviceSynchronize() { + } + + void Malloc(void** devPtr, size_t size) { + } + + void Memcpy(void* dst, void* src, size_t count, int kind) { + switch (kind) { + case 0: + //CudaSafeCall(cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice)); + break; + case 1: + //CudaSafeCall(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost)); + break; + default: + printf("Fatal: Wrong Memcpy kind!\n"); + exit(1); + } + } + + void Free(void* devPtr) { + } +#ifdef __cplusplus +} +#endif From 3d89026fa729fae427228c7d0f9dac74205ac8e1 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 16 Jul 2020 02:36:31 -0400 Subject: [PATCH 077/118] Add the -O3 option to the OpenCL compilation flags --- CMakeLists.txt | 1 + 1 file changed, 1 
insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 81d0a16..fa5dd9e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,6 +52,7 @@ if(HIP_FOUND) endif() if(OpenCL_FOUND) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3") add_library(GPUAPIOPENCL SHARED ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.opencl.c) add_library(GPUAPIOPENCL_static STATIC ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.opencl.c) endif() From 3a8879c3912e23e2cdb3a2759ac6cca23d42277f Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 16 Jul 2020 02:47:58 -0400 Subject: [PATCH 078/118] Update CMakeLists.txt --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index fa5dd9e..2bd3912 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,6 +55,7 @@ if(OpenCL_FOUND) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3") add_library(GPUAPIOPENCL SHARED ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.opencl.c) add_library(GPUAPIOPENCL_static STATIC ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.opencl.c) + set(GPU_COMPILER_FOUND ON) endif() if(NOT GPU_COMPILER_FOUND) From 580923e75570fd4928e65ab99de43ef489870f5f Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 16 Jul 2020 12:58:23 -0400 Subject: [PATCH 079/118] Update CMakeLists.txt --- CMakeLists.txt | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2bd3912..51e449e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ project(ChapelGPUAPI) # FindCUDA include(CheckLanguage) -check_language(CUDA) +check_language(CUDA QUIET) # FindHIP if(NOT DEFINED HIP_PATH) @@ -14,10 +14,10 @@ if(NOT DEFINED HIP_PATH) endif() endif() set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) -find_package(HIP) +find_package(HIP QUIET) # FindOpenCL -find_package(OpenCL) +find_package(OpenCL QUIET) set(GPU_COMPILER_FOUND OFF) @@ -32,6 +32,7 @@ endif() if(HIP_FOUND) if(EXISTS "${HIP_ROOT_DIR}/hip/bin/hipify-perl") + message(STATUS "Found HIP: " 
${HIP_VERSION}) message(STATUS "Found HIPIFY: " ${HIP_ROOT_DIR}/hip/bin/hipify-perl) add_custom_command( OUTPUT GPUAPI.hip.cc @@ -47,15 +48,23 @@ if(HIP_FOUND) set_target_properties(GPUAPIHIP PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.h) set(GPU_COMPILER_FOUND ON) else () + message(STATUS "Found HIP, but HIPIFY NOTFOUND") set(HIP_FOUND OFF) endif() +else() + message(STATUS "HIP NOTFOUND") endif() if(OpenCL_FOUND) + message(STATUS "Found OpenCL: " ${OpenCL_VERSION_STRING}) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3") add_library(GPUAPIOPENCL SHARED ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.opencl.c) add_library(GPUAPIOPENCL_static STATIC ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.opencl.c) + target_link_libraries(GPUAPIOPENCL OpenCL::OpenCL) + target_link_libraries(GPUAPIOPENCL_static OpenCL::OpenCL) set(GPU_COMPILER_FOUND ON) +else() + message(STATUS "OpenCL Not Found") endif() if(NOT GPU_COMPILER_FOUND) @@ -87,7 +96,7 @@ set(INSTALL_CMAKEDIR ${DEF_INSTALL_CMAKEDIR} CACHE PATH "Installation directory # Report to user foreach(p LIB INCLUDE) file(TO_NATIVE_PATH ${CMAKE_INSTALL_PREFIX}/${INSTALL_${p}DIR} _path ) - message(STATUS "Installing ${p} components to ${_path}") + message(STATUS "${p} components will be installed to ${_path}") unset(_path) endforeach() From b12a5e2da8caf7a5b8e1b347bb6997a4ec86a649 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 16 Jul 2020 16:04:06 -0400 Subject: [PATCH 080/118] Update CMakeLists.txt --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 51e449e..477be30 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,7 +26,7 @@ if(CMAKE_CUDA_COMPILER) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3") add_library(GPUAPICUDA SHARED ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu) add_library(GPUAPICUDA_static STATIC ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu) - set_target_properties(GPUAPI PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.h) + 
set_target_properties(GPUAPICUDA PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.h) set(GPU_COMPILER_FOUND ON) endif() From 12531cc81f048dfff19534770192dc7cdb256a98 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 16 Jul 2020 16:46:27 -0400 Subject: [PATCH 081/118] Update CMakeLists.txt --- CMakeLists.txt | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 477be30..c6a6da2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,17 +84,18 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_BINDIR} # Offer the user the choice of overriding the installation directories set(INSTALL_LIBDIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Installation directory for libraries") -set(INSTALL_BINDIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Installation directory for executables") set(INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Installation directory for header files") +set(INSTALL_BINDIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Installation directory for executables") if(WIN32 AND NOT CYGWIN) set(DEF_INSTALL_CMAKEDIR CMake) else() set(DEF_INSTALL_CMAKEDIR share/cmake/${PROJECT_NAME}) endif() set(INSTALL_CMAKEDIR ${DEF_INSTALL_CMAKEDIR} CACHE PATH "Installation directory for CMake files") +set(INSTALL_MODULEDIR modules CACHE PATH "Installation directory for Chapel module files") # Report to user -foreach(p LIB INCLUDE) +foreach(p LIB INCLUDE MODULE) file(TO_NATIVE_PATH ${CMAKE_INSTALL_PREFIX}/${INSTALL_${p}DIR} _path ) message(STATUS "${p} components will be installed to ${_path}") unset(_path) @@ -126,3 +127,19 @@ if(OpenCL_FOUND) PUBLIC_HEADER DESTINATION ${INSTALL_INCLUDEDIR} ) endif() + +if(CMAKE_CUDA_COMPILER OR HIP_FOUND) + install( + FILES + ${CMAKE_CURRENT_SOURCE_DIR}/src/lambda.h + DESTINATION ${INSTALL_INCLUDEDIR} + ) +endif() + +install( + FILES + ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUIterator.chpl + ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.chpl + 
DESTINATION ${INSTALL_MODULEDIR} + ) + From 09d72073f7d27def9d1784c373ffa13fa50d03f9 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 16 Jul 2020 16:46:57 -0400 Subject: [PATCH 082/118] Update CMakeLists.txt --- CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c6a6da2..3fc73c9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -139,7 +139,6 @@ endif() install( FILES ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUIterator.chpl - ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.chpl + ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.chpl DESTINATION ${INSTALL_MODULEDIR} ) - From 96ee811407f43ad41977f6ece63eb2d3aa9c2022 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 23 Jul 2020 01:09:40 -0400 Subject: [PATCH 083/118] Update CMakeLists.txt --- CMakeLists.txt | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3fc73c9..d100f87 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,7 +19,8 @@ find_package(HIP QUIET) # FindOpenCL find_package(OpenCL QUIET) -set(GPU_COMPILER_FOUND OFF) +# +set(GPU_ROOT_DIRS "") if(CMAKE_CUDA_COMPILER) enable_language(CUDA) @@ -27,7 +28,7 @@ if(CMAKE_CUDA_COMPILER) add_library(GPUAPICUDA SHARED ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu) add_library(GPUAPICUDA_static STATIC ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu) set_target_properties(GPUAPICUDA PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.h) - set(GPU_COMPILER_FOUND ON) + list(APPEND GPU_ROOT_DIRS "export CUDA_ROOT_DIR=${${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/..") endif() if(HIP_FOUND) @@ -46,7 +47,7 @@ if(HIP_FOUND) hip_add_library(GPUAPIHIP SHARED GPUAPI.hip.cc) hip_add_library(GPUAPIHIP_static STATIC GPUAPI.hip.cc) set_target_properties(GPUAPIHIP PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.h) - set(GPU_COMPILER_FOUND ON) + list(APPEND GPU_ROOT_DIRS "export HIP_ROOT_DIR=${HIP_ROOT_DIR}") else () 
message(STATUS "Found HIP, but HIPIFY NOTFOUND") set(HIP_FOUND OFF) @@ -61,14 +62,21 @@ if(OpenCL_FOUND) add_library(GPUAPIOPENCL SHARED ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.opencl.c) add_library(GPUAPIOPENCL_static STATIC ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.opencl.c) target_link_libraries(GPUAPIOPENCL OpenCL::OpenCL) - target_link_libraries(GPUAPIOPENCL_static OpenCL::OpenCL) + target_link_libraries(GPUAPIOPENCL_static OpenCL::OpenCL) + list(APPEND GPU_ROOT_DIRS "export OpenCL_INCLUDE_DIRS=${OpenCL_INCLUDE_DIRS}") + list(APPEND GPU_ROOT_DIRS "export OpenCL_LIBRARIES=${OpenCL_LIBRARIES}") set(GPU_COMPILER_FOUND ON) else() message(STATUS "OpenCL Not Found") endif() -if(NOT GPU_COMPILER_FOUND) +if(NOT GPU_ROOT_DIRS) message(FATAL_ERROR "No GPU compiler found") +else() + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/env.sh "#!/bin/bash\n") + foreach (p ${GPU_ROOT_DIRS}) + file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/env.sh ${p}\n) + endforeach() endif() # Installation @@ -142,3 +150,9 @@ install( ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.chpl DESTINATION ${INSTALL_MODULEDIR} ) + +install( + PROGRAMS + ${CMAKE_CURRENT_BINARY_DIR}/env.sh + DESTINATION ${INSTALL_BINDIR} + ) From 20dfd04e153681707f05be3ffd342f3016b0118d Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 23 Jul 2020 01:16:48 -0400 Subject: [PATCH 084/118] Update CMakeLists.txt --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d100f87..0166979 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,7 @@ if(CMAKE_CUDA_COMPILER) add_library(GPUAPICUDA SHARED ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu) add_library(GPUAPICUDA_static STATIC ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.cu) set_target_properties(GPUAPICUDA PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.h) - list(APPEND GPU_ROOT_DIRS "export CUDA_ROOT_DIR=${${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/..") + list(APPEND GPU_ROOT_DIRS "export 
CUDA_ROOT_DIR=${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}/..") endif() if(HIP_FOUND) From 713f98bd92af7f56ea007b620e8dfb4001227937 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Tue, 25 Aug 2020 14:27:31 -0400 Subject: [PATCH 085/118] Add an initial workaround for Mason --- example/vc.chpl | 44 ++++++++++++++++++++++++-------------------- src/GPUIterator.chpl | 25 ++++++++++++++++--------- 2 files changed, 40 insertions(+), 29 deletions(-) diff --git a/example/vc.chpl b/example/vc.chpl index 7dcb6ad..3ee94a4 100644 --- a/example/vc.chpl +++ b/example/vc.chpl @@ -2,31 +2,36 @@ use GPUIterator; config const n = 1024; config const CPUPercent = 50; +config param useGPU = false; // for Mason var A: [1..n] real(32); var B: [1..n] real(32); -for i in 1..n { - A(i) = 0: real(32); - B(i) = i: real(32); -} +extern proc vcGPU(A: [] real(32), B: [] real(32), lo: int, hi: int, N: int) where useGPU == true; // This callback function is called after the GPUIterator // has computed a subspace for the GPU part -var GPUCallBack = lambda(lo: int, hi: int, nElems: int) { +proc GPUCallBack(lo: int, hi: int, N: int) { // Note: lo, and hi are 0-origin // so that they can be easily handled by the C side - if (hi-lo+1 != nElems) { + if (hi-lo+1 != N) { exit(); } - // This for loop should be replaced - // with a function call to the GPU part - // Since lo and hi are converted to 0-origin, - // 1 is added to lo and hi in this example - for i in lo..hi { - A(i+1) = B(i+1); + + if (useGPU == false) { + // This for loop should be replaced + // with a function call to the GPU part + // Since lo and hi are converted to 0-origin, + // 1 is added to lo and hi in this example + for i in lo..hi { + A(i+1) = B(i+1); + } + } else { + vcGPU(A, B, lo, hi, N); } -}; +} + +B = 1; // Vector Copy with GPUIterator forall i in GPU(1..n, GPUCallBack, CPUPercent) { @@ -34,10 +39,9 @@ forall i in GPU(1..n, GPUCallBack, CPUPercent) { } // verify -for i in 1..n { - if (A(i) != i) { - halt("Verification Error"); 
- } -} - -writeln("Verified"); \ No newline at end of file +if (A.equals(B)) { + writeln("Verified"); +} else { + writeln("Not Verified"); + exit(); +} \ No newline at end of file diff --git a/src/GPUIterator.chpl b/src/GPUIterator.chpl index d7fbc41..d81e5d0 100644 --- a/src/GPUIterator.chpl +++ b/src/GPUIterator.chpl @@ -21,12 +21,15 @@ module GPUIterator { use GPUAPI; config param debugGPUIterator = false; - config const nGPUs = getNumDevices(); - proc getNumDevices() { - var count: int(32); - GetDeviceCount(count); - return count; + // if true, don't use GetDeviceCount/SetDeivce + config param disableMultiGPUs = false; + config const nGPUs = if (disableMultiGPUs) then 1 else getNumDevices(); + + proc getNumDevices() where disableMultiGPUs == false { + var count: int(32); + GetDeviceCount(count); + return count; } // Utility functions @@ -78,7 +81,8 @@ module GPUIterator { const myIters = computeChunk(GPUrange, tid, nGPUs); if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] GPU", tid, " portion", ":", myIters, " CPU portion is ZERO"); - SetDevice(tid:int(32)); + if (disableMultiGPUs == false) then + SetDevice(tid:int(32)); GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.size); } } @@ -120,7 +124,8 @@ module GPUIterator { const myIters = computeChunk(GPUrange, tid, nGPUs); if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] GPU", tid, " portion", ":", myIters); - SetDevice(tid:int(32)); + if (disableMultiGPUs == false) then + SetDevice(tid:int(32)); GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.size); } } @@ -153,7 +158,8 @@ module GPUIterator { const myIters = computeChunk(GPUrange, tid, nGPUs); if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] GPU", tid, " portion", ":", myIters, " CPU portion is ZERO"); - SetDevice(tid:int(32)); + if (disableMultiGPUs == false) then + SetDevice(tid:int(32)); GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, 
myIters.size); } } @@ -197,7 +203,8 @@ module GPUIterator { const myIters = computeChunk(GPUrange, tid, nGPUs); if (debugGPUIterator) then writeln("[DEBUG GPUITERATOR] GPU", tid, " portion", ":", myIters); - SetDevice(tid:int(32)); + if (disableMultiGPUs == false) then + SetDevice(tid:int(32)); GPUWrapper(myIters.translate(-r.low).first, myIters.translate(-r.low).last, myIters.size); } } From 04d2b8372c055d34ff7d0ecc2f721a8a25345b87 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Tue, 25 Aug 2020 14:28:52 -0400 Subject: [PATCH 086/118] Add an initial version of CMakeLists.txt and Makefile for example --- example/CMakeLists.txt | 12 ++++++++++++ example/Makefile | 13 +++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 example/CMakeLists.txt create mode 100644 example/Makefile diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt new file mode 100644 index 0000000..96030c5 --- /dev/null +++ b/example/CMakeLists.txt @@ -0,0 +1,12 @@ +cmake_minimum_required(VERSION 3.8) +project(VectorCopy) + +# FindCUDA +include(CheckLanguage) +check_language(CUDA QUIET) + +if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3") + add_library(vc.cuda STATIC ${CMAKE_CURRENT_SOURCE_DIR}/vc.cu) +endif() diff --git a/example/Makefile b/example/Makefile new file mode 100644 index 0000000..6ae7a09 --- /dev/null +++ b/example/Makefile @@ -0,0 +1,13 @@ +CHPL_GPU_MODULES=-M $(CHPL_GPU_HOME)/modules $(CHPL_GPU_HOME)/include/GPUAPI.h +CHPL_FLAGS=--fast $(CHPL_GPU_MODULES) -suseGPU=true + +CUDA_LIBS=-L$(CHPL_GPU_HOME)/lib -lGPUAPICUDA_static -L./build -lvc.cuda -L$(CUDA_ROOT_DIR)/lib -lcudart + +all: cuda + +gpu: vc.cu + rm -rf build && mkdir build && cd build && cmake .. 
&& make + +cuda: gpu vc.chpl + chpl $(CHPL_FLAGS) vc.h vc.chpl $(CUDA_LIBS) + From dca5410cd2b08e630c9470487485068f21c6ab58 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Tue, 25 Aug 2020 14:36:29 -0400 Subject: [PATCH 087/118] Update Mason.toml --- Mason.toml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Mason.toml b/Mason.toml index 6d4f29b..c92ba3c 100644 --- a/Mason.toml +++ b/Mason.toml @@ -2,6 +2,12 @@ [brick] name = "GPUIterator" version = "0.1.0" -chplVersion = "1.16.0..1.20.0" +chplVersion = "1.16.0..1.22.0" [dependencies] + + +[examples] +examples = ["vc.chpl"] +[examples.vc] +compopts = "-sdisableMultiGPUs" \ No newline at end of file From db47355123c0db7e195fef89297d5fa95741950e Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Tue, 25 Aug 2020 16:25:41 -0400 Subject: [PATCH 088/118] Add vc.cu vc.h to example --- example/vc.cu | 20 ++++++++++++++++++++ example/vc.h | 4 ++++ 2 files changed, 24 insertions(+) create mode 100644 example/vc.cu create mode 100644 example/vc.h diff --git a/example/vc.cu b/example/vc.cu new file mode 100644 index 0000000..d98c288 --- /dev/null +++ b/example/vc.cu @@ -0,0 +1,20 @@ +__global__ void vc(float *dA, float *dB, int N) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + if (id < N) { + dA[id] = dB[id]; + } +} + +extern "C" { + void vcGPU(float* A, float *B, int start, int end, int GPUN) { + float *dA, *dB; + cudaMalloc(&dA, sizeof(float) * GPUN); + cudaMalloc(&dB, sizeof(float) * GPUN); + cudaMemcpy(dB, B + start, sizeof(float) * GPUN, cudaMemcpyHostToDevice); + vc<<>>(dA, dB, GPUN); + cudaDeviceSynchronize(); + cudaMemcpy(A + start, dA, sizeof(float) * GPUN, cudaMemcpyDeviceToHost); + cudaFree(dA); + cudaFree(dB); + } +} diff --git a/example/vc.h b/example/vc.h new file mode 100644 index 0000000..5d8de74 --- /dev/null +++ b/example/vc.h @@ -0,0 +1,4 @@ +#ifndef _VC_H_ +#define _VC_H_ +void vcGPU(float* A, float *B, int start, int end, int GPUN); +#endif From 
fd40d6de98da69ae4716b9df8df37a5e9eee64a1 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Tue, 25 Aug 2020 17:30:55 -0400 Subject: [PATCH 089/118] Update CMakeLists.txt and Makefile for example --- example/CMakeLists.txt | 46 ++++++++++++++++++++++++++++++++++++++++++ example/Makefile | 19 ++++++++++++++++- 2 files changed, 64 insertions(+), 1 deletion(-) diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 96030c5..7a278b0 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -5,8 +5,54 @@ project(VectorCopy) include(CheckLanguage) check_language(CUDA QUIET) +# FindHIP +if(NOT DEFINED HIP_PATH) + if(NOT DEFINED ENV{HIP_PATH}) + set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + endif() +endif() +set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) +find_package(HIP QUIET) + +# FindOpenCL +find_package(OpenCL QUIET) + if(CMAKE_CUDA_COMPILER) enable_language(CUDA) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3") add_library(vc.cuda STATIC ${CMAKE_CURRENT_SOURCE_DIR}/vc.cu) endif() + +if(HIP_FOUND) + if(EXISTS "${HIP_ROOT_DIR}/hip/bin/hipify-perl") + message(STATUS "Found HIP: " ${HIP_VERSION}) + message(STATUS "Found HIPIFY: " ${HIP_ROOT_DIR}/hip/bin/hipify-perl) + add_custom_command( + OUTPUT vc.hip.cc + COMMAND ${HIP_ROOT_DIR}/hip/bin/hipify-perl ${CMAKE_CURRENT_SOURCE_DIR}/vc.cu > vc.hip.cc + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/vc.cu + COMMENT "Convering vc.cu to vc.hip.cc" + ) + set(CMAKE_CXX_COMPILER "${HIP_ROOT_DIR}/hip/bin/hipcc") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fno-gpu-rdc -fPIC") + hip_add_library(vc.hip STATIC vc.hip.cc) + else () + message(STATUS "Found HIP, but HIPIFY NOTFOUND") + set(HIP_FOUND OFF) + endif() +else() + message(STATUS "HIP NOTFOUND") +endif() + +if(OpenCL_FOUND) + message(STATUS "Found OpenCL: " 
${OpenCL_VERSION_STRING}) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3") + add_library(vc.opencl STATIC ${CMAKE_CURRENT_SOURCE_DIR}/vc.opencl.c) + target_link_libraries(vc.opencl OpenCL::OpenCL) +else() + message(STATUS "OpenCL Not Found") +endif() + diff --git a/example/Makefile b/example/Makefile index 6ae7a09..39cb189 100644 --- a/example/Makefile +++ b/example/Makefile @@ -1,7 +1,18 @@ CHPL_GPU_MODULES=-M $(CHPL_GPU_HOME)/modules $(CHPL_GPU_HOME)/include/GPUAPI.h CHPL_FLAGS=--fast $(CHPL_GPU_MODULES) -suseGPU=true +# CUDA CUDA_LIBS=-L$(CHPL_GPU_HOME)/lib -lGPUAPICUDA_static -L./build -lvc.cuda -L$(CUDA_ROOT_DIR)/lib -lcudart +# HIP +HIP_LIBS=-L$(CHPL_GPU_HOME)/lib -lGPUAPIHIP_static -L./build -lvc.hip -L$(HIP_ROOT_DIR)/lib -lhip_hcc + +# OpenCL +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Darwin) + OPENCL_LIBS=-L$(CHPL_GPU_HOME)/lib -lGPUAPIOPENCL_static -L./build -lvc.opencl -framework OpenCL +else + OPENCL_LIBS=-L$(CHPL_GPU_HOME)/lib -lGPUAPIOPENCL_static -L./build -lvc.opencl -L$(subst libOpenCL.so,,$(OpenCL_LIBRARIES)) -lOpenCL +endif all: cuda @@ -9,5 +20,11 @@ gpu: vc.cu rm -rf build && mkdir build && cd build && cmake .. 
&& make cuda: gpu vc.chpl - chpl $(CHPL_FLAGS) vc.h vc.chpl $(CUDA_LIBS) + chpl $(CHPL_FLAGS) vc.h vc.chpl $(CUDA_LIBS) -o vc.cuda + +hip: gpu vc.chpl + chpl $(CHPL_FLAGS) vc.h vc.chpl $(HIP_LIBS) -o vc.hip + +opencl: gpu vc.chpl + chpl $(CHPL_FLAGS) vc.h vc.chpl $(OPENCL_LIBS) -o vc.opencl From e57dabe803691c3823bc5bfcd21ab68adafad291 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Tue, 25 Aug 2020 17:34:54 -0400 Subject: [PATCH 090/118] Add vc.opencl.c --- example/vc.opencl.c | 162 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 example/vc.opencl.c diff --git a/example/vc.opencl.c b/example/vc.opencl.c new file mode 100644 index 0000000..1959831 --- /dev/null +++ b/example/vc.opencl.c @@ -0,0 +1,162 @@ +#include +#include +#include +#include +#include + +#ifdef __APPLE__ +#include +#else +#include +#endif + +#define MAX_SOURCE_SIZE (0x100000) + +#ifdef __cplusplus +extern "C" { +#endif + + extern char* openclGetErrorString(cl_int); + + void vcGPU(float* A, float *B, int start, int end, int GPUN) { + FILE *fp; + char *source_str; + size_t source_size; + char str[1024]; + + fp = fopen("vc.cl", "r"); + if (!fp) { + fprintf(stderr, "Failed to load kernel.\n"); + exit(1); + } + source_str = (char*)malloc(MAX_SOURCE_SIZE); + source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp); + fclose( fp ); + + // Get platform and device information + cl_platform_id platform_id = NULL; + cl_device_id device_ids[2]; + cl_uint ret_num_devices; + cl_uint ret_num_platforms; + cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); + if (ret != CL_SUCCESS) { + printf("clGetPlatformIDs %s\n", openclGetErrorString(ret)); + } + + ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 2, device_ids, &ret_num_devices); + int did = 0; + char *env = getenv("OCL_DEVICE_NO"); + if (env) { + did = atoi(env); + } + + cl_device_id device_id = device_ids[did]; + size_t sret; + clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(str), 
str, &sret); + printf("clGetDeviceInfo = %ld, GPU %s\n", sret, str); + + // Create an OpenCL context + cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", openclGetErrorString(ret)); + } + + // Create a command queue + cl_command_queue command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", openclGetErrorString(ret)); + } + + // Create memory buffers on the device for each vector + cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, GPUN * sizeof(float), NULL, &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", openclGetErrorString(ret)); + } + + cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, GPUN * sizeof(float), NULL, &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", openclGetErrorString(ret)); + } + + cl_event h2d_event; + ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, GPUN * sizeof(float), B + start, 0, NULL, &h2d_event); + if (ret != CL_SUCCESS) { + printf("%s\n", openclGetErrorString(ret)); + } else { + clWaitForEvents(1, &h2d_event); + } + + // Create a program from the kernel source + cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", openclGetErrorString(ret)); + } + + // Build the program + ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); + if (ret != CL_SUCCESS) { + printf("%s\n", openclGetErrorString(ret)); + } + + // Create the OpenCL kernel + cl_kernel kernel = clCreateKernel(program, "vc", &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", openclGetErrorString(ret)); + } + + // Set the arguments of the kernel + ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj); + if (ret != CL_SUCCESS) { + printf("%s\n", openclGetErrorString(ret)); + } + + ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj); + if (ret 
!= CL_SUCCESS) { + printf("%s\n", openclGetErrorString(ret)); + } + + ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&GPUN); + if (ret != CL_SUCCESS) { + printf("%s\n", openclGetErrorString(ret)); + } + + // Execute the OpenCL kernel on the list + size_t local_item_size = 64; // Divide work items into groups of 64 + size_t global_item_size = local_item_size * ((GPUN + local_item_size -1) / local_item_size); // Process the entire lists + cl_event k_event; + ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, &k_event); + if (ret != CL_SUCCESS) { + printf("%s\n", openclGetErrorString(ret)); + } else { + clWaitForEvents(1, &k_event); + } + + cl_event d2h_event; + ret = clEnqueueReadBuffer(command_queue, a_mem_obj, CL_TRUE, 0, GPUN * sizeof(float), A + start, 0, NULL, &d2h_event); + if (ret != CL_SUCCESS) { + printf("%s\n", openclGetErrorString(ret)); + } + ret = clFinish(command_queue); + if (ret != CL_SUCCESS) { + printf("%s\n", openclGetErrorString(ret)); + } + cl_ulong time_start; + cl_ulong time_end; + + // H2D + clGetEventProfilingInfo(h2d_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); + clGetEventProfilingInfo(h2d_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); + printf("H2D time: %lf seconds \n", (time_end-time_start) / 1000000000.0); + + // Kernel + clGetEventProfilingInfo(k_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); + clGetEventProfilingInfo(k_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); + printf("Kernel time: %lf seconds \n", (time_end-time_start) / 1000000000.0); + // D2H + clGetEventProfilingInfo(d2h_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); + clGetEventProfilingInfo(d2h_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); + printf("D2H time: %lf seconds \n", (time_end-time_start) / 1000000000.0); + } +#ifdef __cplusplus +} +#endif From 
86684555d55ff82ab4e8b9bf2cd22a398c0142f8 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Tue, 25 Aug 2020 18:28:35 -0400 Subject: [PATCH 091/118] Add vc.cl --- example/vc.cl | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 example/vc.cl diff --git a/example/vc.cl b/example/vc.cl new file mode 100644 index 0000000..3cc12b3 --- /dev/null +++ b/example/vc.cl @@ -0,0 +1,6 @@ +__kernel void vc(__global float *A, __global const float *B, int n) { + int id = get_global_id(0); + if (id < n) { + A[id] = B[id]; + } +} From a253c79b5d25be9901e099ce905e25114a6f7b87 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Tue, 25 Aug 2020 18:29:41 -0400 Subject: [PATCH 092/118] Update CMakeLists.txt and Makefile for OS X --- CMakeLists.txt | 1 + example/Makefile | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0166979..0a40541 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -63,6 +63,7 @@ if(OpenCL_FOUND) add_library(GPUAPIOPENCL_static STATIC ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.opencl.c) target_link_libraries(GPUAPIOPENCL OpenCL::OpenCL) target_link_libraries(GPUAPIOPENCL_static OpenCL::OpenCL) + set_target_properties(GPUAPIOPENCL PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.h) list(APPEND GPU_ROOT_DIRS "export OpenCL_INCLUDE_DIRS=${OpenCL_INCLUDE_DIRS}") list(APPEND GPU_ROOT_DIRS "export OpenCL_LIBRARIES=${OpenCL_LIBRARIES}") set(GPU_COMPILER_FOUND ON) diff --git a/example/Makefile b/example/Makefile index 39cb189..9e7392d 100644 --- a/example/Makefile +++ b/example/Makefile @@ -9,7 +9,7 @@ HIP_LIBS=-L$(CHPL_GPU_HOME)/lib -lGPUAPIHIP_static -L./build -lvc.hip -L$(HIP_RO # OpenCL UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) - OPENCL_LIBS=-L$(CHPL_GPU_HOME)/lib -lGPUAPIOPENCL_static -L./build -lvc.opencl -framework OpenCL + OPENCL_LIBS=-L$(CHPL_GPU_HOME)/lib -lGPUAPIOPENCL_static -L./build -lvc.opencl --ldflags '-framework OpenCL' else OPENCL_LIBS=-L$(CHPL_GPU_HOME)/lib 
-lGPUAPIOPENCL_static -L./build -lvc.opencl -L$(subst libOpenCL.so,,$(OpenCL_LIBRARIES)) -lOpenCL endif From c9e14d9964cd3af0e9695659ad45cdd178e325ed Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Tue, 25 Aug 2020 18:32:39 -0400 Subject: [PATCH 093/118] Update Mason.toml --- Mason.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Mason.toml b/Mason.toml index c92ba3c..85874c6 100644 --- a/Mason.toml +++ b/Mason.toml @@ -2,7 +2,7 @@ [brick] name = "GPUIterator" version = "0.1.0" -chplVersion = "1.16.0..1.22.0" +chplVersion = "1.16.0..1.22.1" [dependencies] From 752e7a221ef5862c9795649e55312d739035a760 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Wed, 26 Aug 2020 12:11:23 -0400 Subject: [PATCH 094/118] Update CMakeLists.txt --- CMakeLists.txt | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0a40541..f1a6cc5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,11 +73,6 @@ endif() if(NOT GPU_ROOT_DIRS) message(FATAL_ERROR "No GPU compiler found") -else() - file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/env.sh "#!/bin/bash\n") - foreach (p ${GPU_ROOT_DIRS}) - file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/env.sh ${p}\n) - endforeach() endif() # Installation @@ -110,6 +105,14 @@ foreach(p LIB INCLUDE MODULE) unset(_path) endforeach() +if (GPU_ROOT_DIRS) + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/env.sh "#!/bin/bash\n") + file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/env.sh "export CHPL_GPU_HOME="${CMAKE_INSTALL_PREFIX}\n) + foreach (p ${GPU_ROOT_DIRS}) + file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/env.sh ${p}\n) + endforeach() +endif() + if(CMAKE_CUDA_COMPILER) install( TARGETS GPUAPICUDA GPUAPICUDA_static From 2cc2d15ef84ffb91e398d0044284d94c7dc520de Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Wed, 26 Aug 2020 14:46:20 -0400 Subject: [PATCH 095/118] Update CMakeLists.txt for apps --- apps/BuildGPUCode.cmake | 55 ++++++++++ apps/Makefile | 141 ++++++------------------- 
apps/blackscholes/CMakeLists.txt | 5 + apps/logisticregression/CMakeLists.txt | 5 + apps/mm/CMakeLists.txt | 5 + apps/stream/CMakeLists.txt | 5 + apps/vector_copy/CMakeLists.txt | 5 + 7 files changed, 110 insertions(+), 111 deletions(-) create mode 100644 apps/BuildGPUCode.cmake create mode 100644 apps/blackscholes/CMakeLists.txt create mode 100644 apps/logisticregression/CMakeLists.txt create mode 100644 apps/mm/CMakeLists.txt create mode 100644 apps/stream/CMakeLists.txt create mode 100644 apps/vector_copy/CMakeLists.txt diff --git a/apps/BuildGPUCode.cmake b/apps/BuildGPUCode.cmake new file mode 100644 index 0000000..a12b8e2 --- /dev/null +++ b/apps/BuildGPUCode.cmake @@ -0,0 +1,55 @@ +# FindCUDA +include(CheckLanguage) +check_language(CUDA QUIET) + +# FindHIP +if(NOT DEFINED HIP_PATH) + if(NOT DEFINED ENV{HIP_PATH}) + set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + endif() +endif() +set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) +find_package(HIP QUIET) + +# FindOpenCL +find_package(OpenCL QUIET) + +if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3") + add_library(${APP}.cuda STATIC ${CMAKE_CURRENT_SOURCE_DIR}/${APP}.cu) +endif() + +if(HIP_FOUND) + if(EXISTS "${HIP_ROOT_DIR}/hip/bin/hipify-perl") + message(STATUS "Found HIP: " ${HIP_VERSION}) + message(STATUS "Found HIPIFY: " ${HIP_ROOT_DIR}/hip/bin/hipify-perl) + add_custom_command( + OUTPUT ${APP}.hip.cc + COMMAND ${HIP_ROOT_DIR}/hip/bin/hipify-perl ${CMAKE_CURRENT_SOURCE_DIR}/${APP}.cu > ${APP}.hip.cc + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${APP}.cu + COMMENT "Convering .cu to .hip.cc" + ) + set(CMAKE_CXX_COMPILER "${HIP_ROOT_DIR}/hip/bin/hipcc") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fno-gpu-rdc -fPIC") + hip_add_library(${APP}.hip STATIC ${APP}.hip.cc) + else () + 
message(STATUS "Found HIP, but HIPIFY NOTFOUND") + set(HIP_FOUND OFF) + endif() +else() + message(STATUS "HIP NOTFOUND") +endif() + +if(OpenCL_FOUND) + message(STATUS "Found OpenCL: " ${OpenCL_VERSION_STRING}) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O3") + add_library(${APP}.opencl STATIC ${CMAKE_CURRENT_SOURCE_DIR}/${APP}.opencl.c) + target_link_libraries(${APP}.opencl OpenCL::OpenCL) +else() + message(STATUS "OpenCL Not Found") +endif() + diff --git a/apps/Makefile b/apps/Makefile index 7de5867..a184a1a 100644 --- a/apps/Makefile +++ b/apps/Makefile @@ -1,113 +1,32 @@ -# Chapel -CHPLFLAGS=--fast -sverbose -CHPLMODULE=../../src -GPUAPIFLAGS=-sdebugGPUAPI -sdebugGPUIterator $(CHPLMODULE)/GPUAPI.h GPUAPI.o +CHPL_GPU_MODULES=-M $(CHPL_GPU_HOME)/modules $(CHPL_GPU_HOME)/include/GPUAPI.h +CHPL_FLAGS=--fast $(CHPL_GPU_MODULES) # CUDA -CUDA_HOME?=/usr/local/cuda -CUDA_SM?=sm_70 -CUDALIBSFLAGS=-L$(CUDA_HOME)/lib64 -lcudart -lcuda -lcublas -NVCCFLAGS=-O3 -arch $(CUDA_SM) -std=c++11 --extended-lambda -I$(CHPLMODULE) -$(info CUDA_HOME is $(CUDA_HOME)) -$(info CUDA_SM is $(CUDA_SM)) - -# ROCM -ROCM_HOME?=/opt/rocm -HIP_HOME=$(ROCM_HOME)/hip -HIPLIBSFLAGS=-L$(ROCM_HOME)/lib -lhip_hcc -$(info ROCM_HOME is $(ROCM_HOME)) - -# For OpenCL (MacOS) -OCLLIBSFLAGS=-framework OpenCL -OCLFLAGS=-framework OpenCL - -all: baseline cudagpu cudahybrid cudahybrid.dist - -$(TARGET).o: $(TARGET).cu - nvcc $(NVCCFLAGS) -c $^ - -GPUAPI.o: $(CHPLMODULE)/GPUAPI.cu - nvcc $(NVCCFLAGS) -c $^ - -GPUAPI.hip.o: $(CHPLMODULE)/GPUAPI.cu - $(HIP_HOME)/bin/hipify-perl $^ > GPUAPI.hip.cpp - $(HIP_HOME)/bin/hipcc -O3 -Wall -fPIC -c GPUAPI.hip.cpp -fno-gpu-rdc - -$(TARGET).opencl.o: $(TARGET).opencl.c - gcc -O3 -Wall $(OCLFLAGS) -c $^ - -.PHONY: baseline -baseline: $(TARGET).baseline.chpl - chpl $(CHPLFLAGS) $(TARGET).baseline.chpl - -.PHONY: blas -blas: - chpl $(CHPLFLAGS) $(TARGET).blas.chpl - -.PHONY: cudagpu -cudagpu: $(TARGET).o $(TARGET).gpu.chpl - chpl $(CHPLFLAGS) $(TARGET).o $(TARGET).gpu.chpl 
$(CUDALIBSFLAGS) - -.PHONY: cudahybrid -cudahybrid: GPUAPI.o $(TARGET).o $(TARGET).hybrid.chpl - chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).o $(GPUAPIFLAGS) $(TARGET).hybrid.chpl $(CUDALIBSFLAGS) - -.PHONY: cudahybrid.dist -cudahybrid.dist: GPUAPI.o $(TARGET).o $(TARGET).hybrid.dist.chpl - chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).h $(TARGET).o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.chpl $(CUDALIBSFLAGS) - -.PHONY: cudahybrid.dist.lowmid -cudahybrid.dist.lowmid: GPUAPI.o $(TARGET).kernel.cu $(TARGET).hybrid.dist.lowmid.chpl - nvcc $(NVCCFLAGS) -c $(TARGET).kernel.cu -o $(TARGET).kernel.o - chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).kernel.h $(TARGET).kernel.o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.lowmid.chpl $(CUDALIBSFLAGS) -o $(TARGET).hybrid.dist.lowmid - -.PHONY: cudahybrid.dist.mid -cudahybrid.dist.mid: GPUAPI.o $(TARGET).kernel.cu $(TARGET).hybrid.dist.mid.chpl - nvcc $(NVCCFLAGS) -c $(TARGET).kernel.cu -o $(TARGET).kernel.o - chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).kernel.h $(TARGET).kernel.o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.mid.chpl $(CUDALIBSFLAGS) -o $(TARGET).hybrid.dist.mid - -.PHONY: openclgpu -oclgpu: $(TARGET).opencl.o $(TARGET).gpu.chpl - chpl $(CHPLFLAGS) $(TARGET).opencl.o $(TARGET).gpu.chpl --ldflags $(OCLLIBSFLAGS) - -.PHONY: openclhybrid -oclhybrid: $(TARGET).opencl.o $(TARGET).hybrid.chpl - chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).opencl.o $(TARGET).hybrid.chpl --ldflags $(OCLLIBSFLAGS) - -.PHONY: hipgpu -hipgpu: $(TARGET).cu $(TARGET).gpu.chpl - $(HIP_HOME)/bin/hipify-perl $(TARGET).cu > $(TARGET).hip.cpp - $(HIP_HOME)/bin/hipcc -O3 -Wall -fPIC -c $(TARGET).hip.cpp -fno-gpu-rdc - chpl $(CHPLFLAGS) $(TARGET).hip.o $(TARGET).gpu.chpl --ldflags $(HIPLIBSFLAGS) - -.PHONY: hiphybrid -hiphybrid: $(TARGET).cu $(TARGET).hybrid.chpl - $(HIP_HOME)/bin/hipify-perl $(TARGET).cu > $(TARGET).hip.cpp - $(HIP_HOME)/bin/hipcc -O3 -Wall -fPIC -c $(TARGET).hip.cpp -fno-gpu-rdc - chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).h $(TARGET).hip.o 
$(TARGET).hybrid.chpl --ldflags $(HIPLIBSFLAGS) - -.PHONY: hiphybrid.dist -hiphybrid.dist: GPUAPI.hip.o $(TARGET).cu $(TARGET).hybrid.dist.chpl - $(HIP_HOME)/bin/hipify-perl $(TARGET).cu > $(TARGET).hip.cpp - $(HIP_HOME)/bin/hipcc -O3 -Wall -fPIC -c $(TARGET).hip.cpp -fno-gpu-rdc - cp GPUAPI.hip.o GPUAPI.o - chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).h $(TARGET).hip.o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.chpl --ldflags $(HIPLIBSFLAGS) - -.PHONY: hiphybrid.dist.lowmid -hiphybrid.dist.lowmid: GPUAPI.hip.o $(TARGET).kernel.cu $(TARGET).hybrid.dist.lowmid.chpl - $(HIP_HOME)/bin/hipify-perl $(TARGET).kernel.cu > $(TARGET).kernel.hip.cpp - $(HIP_HOME)/bin/hipcc -O3 -Wall -fPIC -c $(TARGET).kernel.hip.cpp -fno-gpu-rdc - cp GPUAPI.hip.o GPUAPI.o - chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).kernel.h $(TARGET).kernel.hip.o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.lowmid.chpl --ldflags $(HIPLIBSFLAGS) - -.PHONY: hiphybrid.dist.mid -hiphybrid.dist.mid: GPUAPI.hip.o $(TARGET).kernel.cu $(TARGET).hybrid.dist.mid.chpl - $(HIP_HOME)/bin/hipify-perl $(TARGET).kernel.cu > $(TARGET).kernel.hip.cpp - $(HIP_HOME)/bin/hipcc -O3 -Wall -fPIC -c $(TARGET).kernel.hip.cpp -fno-gpu-rdc - cp GPUAPI.hip.o GPUAPI.o - chpl -M $(CHPLMODULE) $(CHPLFLAGS) $(TARGET).kernel.h $(TARGET).kernel.hip.o $(GPUAPIFLAGS) $(TARGET).hybrid.dist.mid.chpl --ldflags $(HIPLIBSFLAGS) - - -.PHONY: clean -clean: - rm -f $(TARGET).baseline $(TARGET).gpu $(TARGET).hybrid $(TARGET).hybrid.dist $(TARGET).hybrid.dist.lowmid $(TARGET).hybrid.dist.mid $(TARGET).o GPUAPI.o *_real +ifeq ($(USE_CUBLAS), yes) + CUBLAS_LIB=-lcublas +endif +CUDA_LIBS=-L$(CHPL_GPU_HOME)/lib -lGPUAPICUDA_static -L./build -l$(TARGET).cuda -L$(CUDA_ROOT_DIR)/lib -lcudart $(CUBLAS_LIB) +# HIP +HIP_LIBS=-L$(CHPL_GPU_HOME)/lib -lGPUAPIHIP_static -L./build -l$(TARGET).hip -L$(HIP_ROOT_DIR)/lib -lhip_hcc + +# OpenCL +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Darwin) + OPENCL_LIBS=-L$(CHPL_GPU_HOME)/lib -lGPUAPIOPENCL_static -L./build 
-l$(TARGET).opencl --ldflags '-framework OpenCL' +else + OPENCL_LIBS=-L$(CHPL_GPU_HOME)/lib -lGPUAPIOPENCL_static -L./build -l$(TARGET).opencl -L$(subst libOpenCL.so,,$(OpenCL_LIBRARIES)) -lOpenCL +endif + +all: cudahybrid.dist + +gpu: $(TARGET).cu + rm -rf build && mkdir build && cd build && cmake .. && make + +cuda%: gpu $(TARGET).%.chpl + chpl $(CHPL_FLAGS) $(TARGET).h $(TARGET).$*.chpl $(CUDA_LIBS) -o $(TARGET).cuda.$* + +hip%: gpu $(TARGET).%.chpl + chpl $(CHPL_FLAGS) $(TARGET).h $(TARGET).$*.chpl $(HIP_LIBS) -o $(TARGET).hip.$* + +opencl%: gpu $(TARGET).%.chpl + chpl $(CHPL_FLAGS) $(TARGET).h $(TARGET).$*.chpl $(OPNCL_LIBS) -o $(TARGET).opencl.$* diff --git a/apps/blackscholes/CMakeLists.txt b/apps/blackscholes/CMakeLists.txt new file mode 100644 index 0000000..d48b295 --- /dev/null +++ b/apps/blackscholes/CMakeLists.txt @@ -0,0 +1,5 @@ +cmake_minimum_required(VERSION 3.8) +project(BlackScholes) + +set(APP bs) +include(../BuildGPUCode.cmake) diff --git a/apps/logisticregression/CMakeLists.txt b/apps/logisticregression/CMakeLists.txt new file mode 100644 index 0000000..963442d --- /dev/null +++ b/apps/logisticregression/CMakeLists.txt @@ -0,0 +1,5 @@ +cmake_minimum_required(VERSION 3.8) +project(LogisticRegression) + +set(APP lr) +include(../BuildGPUCode.cmake) diff --git a/apps/mm/CMakeLists.txt b/apps/mm/CMakeLists.txt new file mode 100644 index 0000000..f1bd13b --- /dev/null +++ b/apps/mm/CMakeLists.txt @@ -0,0 +1,5 @@ +cmake_minimum_required(VERSION 3.8) +project(MatrixMultiplication) + +set(APP mm) +include(../BuildGPUCode.cmake) diff --git a/apps/stream/CMakeLists.txt b/apps/stream/CMakeLists.txt new file mode 100644 index 0000000..577e694 --- /dev/null +++ b/apps/stream/CMakeLists.txt @@ -0,0 +1,5 @@ +cmake_minimum_required(VERSION 3.8) +project(Stream) + +set(APP stream) +include(../BuildGPUCode.cmake) diff --git a/apps/vector_copy/CMakeLists.txt b/apps/vector_copy/CMakeLists.txt new file mode 100644 index 0000000..90d4d01 --- /dev/null +++ 
b/apps/vector_copy/CMakeLists.txt @@ -0,0 +1,5 @@ +cmake_minimum_required(VERSION 3.8) +project(VectorCopy) + +set(APP vc) +include(../BuildGPUCode.cmake) From fb796de05bc6e1803ad927d575f75548665c6c92 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Wed, 26 Aug 2020 16:16:01 -0400 Subject: [PATCH 096/118] Update BuildGPUCode and Makefiles --- apps/BuildGPUCode.cmake | 9 +++++++++ apps/Makefile | 16 ++++++++++++---- apps/mm/Makefile | 1 + 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/apps/BuildGPUCode.cmake b/apps/BuildGPUCode.cmake index a12b8e2..e7eaa51 100644 --- a/apps/BuildGPUCode.cmake +++ b/apps/BuildGPUCode.cmake @@ -20,6 +20,7 @@ if(CMAKE_CUDA_COMPILER) enable_language(CUDA) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -O3") add_library(${APP}.cuda STATIC ${CMAKE_CURRENT_SOURCE_DIR}/${APP}.cu) + add_library(${APP}.kernel.cuda STATIC ${CMAKE_CURRENT_SOURCE_DIR}/${APP}.kernel.cu) endif() if(HIP_FOUND) @@ -33,9 +34,17 @@ if(HIP_FOUND) DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${APP}.cu COMMENT "Convering .cu to .hip.cc" ) + add_custom_command( + OUTPUT ${APP}.kernel.hip.cc + COMMAND ${HIP_ROOT_DIR}/hip/bin/hipify-perl ${CMAKE_CURRENT_SOURCE_DIR}/${APP}.kernel.cu > ${APP}.kernel.hip.cc + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${APP}.kernel.cu + COMMENT "Convering .cu to .hip.cc" + ) set(CMAKE_CXX_COMPILER "${HIP_ROOT_DIR}/hip/bin/hipcc") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fno-gpu-rdc -fPIC") hip_add_library(${APP}.hip STATIC ${APP}.hip.cc) + hip_add_library(${APP}.kernel.hip STATIC ${APP}.kernel.hip.cc) else () message(STATUS "Found HIP, but HIPIFY NOTFOUND") set(HIP_FOUND OFF) diff --git a/apps/Makefile b/apps/Makefile index a184a1a..edf69da 100644 --- a/apps/Makefile +++ b/apps/Makefile @@ -5,9 +5,9 @@ CHPL_FLAGS=--fast $(CHPL_GPU_MODULES) ifeq ($(USE_CUBLAS), yes) CUBLAS_LIB=-lcublas endif -CUDA_LIBS=-L$(CHPL_GPU_HOME)/lib -lGPUAPICUDA_static -L./build -l$(TARGET).cuda -L$(CUDA_ROOT_DIR)/lib 
-lcudart $(CUBLAS_LIB) +CUDA_LIBS=-L$(CHPL_GPU_HOME)/lib -lGPUAPICUDA_static -L$(CUDA_ROOT_DIR)/lib -lcudart $(CUBLAS_LIB) # HIP -HIP_LIBS=-L$(CHPL_GPU_HOME)/lib -lGPUAPIHIP_static -L./build -l$(TARGET).hip -L$(HIP_ROOT_DIR)/lib -lhip_hcc +HIP_LIBS=-L$(CHPL_GPU_HOME)/lib -lGPUAPIHIP_static -L$(HIP_ROOT_DIR)/lib -lhip_hcc # OpenCL UNAME_S := $(shell uname -s) @@ -23,10 +23,18 @@ gpu: $(TARGET).cu rm -rf build && mkdir build && cd build && cmake .. && make cuda%: gpu $(TARGET).%.chpl - chpl $(CHPL_FLAGS) $(TARGET).h $(TARGET).$*.chpl $(CUDA_LIBS) -o $(TARGET).cuda.$* +ifneq (,$(filter $*, hybrid.dist.mid hybrid.dist.lowmid)) + chpl $(CHPL_FLAGS) $(TARGET).h $(TARGET).$*.chpl -L./build -l$(TARGET).cuda $(CUDA_LIBS) -o $(TARGET).cuda.$* +else + chpl $(CHPL_FLAGS) $(TARGET).kernel.h $(TARGET).$*.chpl -L./build -l$(TARGET).kernel.cuda $(CUDA_LIBS) -o $(TARGET).cuda.$* +endif hip%: gpu $(TARGET).%.chpl - chpl $(CHPL_FLAGS) $(TARGET).h $(TARGET).$*.chpl $(HIP_LIBS) -o $(TARGET).hip.$* +ifneq (,$(filter $*, hybrid.dist.mid hybrid.dist.lowmid)) + chpl $(CHPL_FLAGS) $(TARGET).h $(TARGET).$*.chpl -L./build -l$(TARGET).hip $(HIP_LIBS) -o $(TARGET).hip.$* +else + chpl $(CHPL_FLAGS) $(TARGET).kernel.h $(TARGET).$*.chpl -L./build -l$(TARGET).kernel.hip $(HIP_LIBS) -o $(TARGET).hip.$* +endif opencl%: gpu $(TARGET).%.chpl chpl $(CHPL_FLAGS) $(TARGET).h $(TARGET).$*.chpl $(OPNCL_LIBS) -o $(TARGET).opencl.$* diff --git a/apps/mm/Makefile b/apps/mm/Makefile index b14038c..9b7e18c 100644 --- a/apps/mm/Makefile +++ b/apps/mm/Makefile @@ -1,2 +1,3 @@ TARGET=mm +USE_CUBLAS=yes include ../Makefile From 5610c2e44f36cc9c5d0ac707bc237e208b7bd810 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Wed, 26 Aug 2020 16:19:01 -0400 Subject: [PATCH 097/118] CMakeLists.txt --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f1a6cc5..8d57fad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -107,7 +107,7 @@ endforeach() if 
(GPU_ROOT_DIRS) file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/env.sh "#!/bin/bash\n") - file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/env.sh "export CHPL_GPU_HOME="${CMAKE_INSTALL_PREFIX}\n) + file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/env.sh "export CHPL_GPU_HOME=${CMAKE_INSTALL_PREFIX}\n") foreach (p ${GPU_ROOT_DIRS}) file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/env.sh ${p}\n) endforeach() From 19270a7d17be6f150384891ed68b7c3b06b6cc69 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Wed, 26 Aug 2020 16:23:25 -0400 Subject: [PATCH 098/118] Update CMakeLists.txt --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d57fad..777b574 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,7 +43,7 @@ if(HIP_FOUND) COMMENT "Convering GPUAPI.cu to GPUAPI.hip.cc" ) set(CMAKE_CXX_COMPILER "${HIP_ROOT_DIR}/hip/bin/hipcc") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fno-gpu-rdc") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -fno-gpu-rdc -fPIC") hip_add_library(GPUAPIHIP SHARED GPUAPI.hip.cc) hip_add_library(GPUAPIHIP_static STATIC GPUAPI.hip.cc) set_target_properties(GPUAPIHIP PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/src/GPUAPI.h) From 7f9a1bc4b2001106afa7d5dfb0d1798c0f9cbf44 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 27 Aug 2020 13:27:12 -0400 Subject: [PATCH 099/118] Update Makefile --- apps/Makefile | 61 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/apps/Makefile b/apps/Makefile index edf69da..54b3199 100644 --- a/apps/Makefile +++ b/apps/Makefile @@ -17,24 +17,53 @@ else OPENCL_LIBS=-L$(CHPL_GPU_HOME)/lib -lGPUAPIOPENCL_static -L./build -l$(TARGET).opencl -L$(subst libOpenCL.so,,$(OpenCL_LIBRARIES)) -lOpenCL endif -all: cudahybrid.dist +# CPU -gpu: $(TARGET).cu +baseline: $(TARGET).baseline.chpl + chpl --fast $< -o $(TARGET).cpu.$@ + +# CUDA + +build/lib$(TARGET).cuda.a: $(TARGET).cu rm -rf build && mkdir build && cd 
build && cmake .. && make -cuda%: gpu $(TARGET).%.chpl -ifneq (,$(filter $*, hybrid.dist.mid hybrid.dist.lowmid)) - chpl $(CHPL_FLAGS) $(TARGET).h $(TARGET).$*.chpl -L./build -l$(TARGET).cuda $(CUDA_LIBS) -o $(TARGET).cuda.$* -else - chpl $(CHPL_FLAGS) $(TARGET).kernel.h $(TARGET).$*.chpl -L./build -l$(TARGET).kernel.cuda $(CUDA_LIBS) -o $(TARGET).cuda.$* -endif +build/lib$(TARGET).kernel.cuda.a: $(TARGET).kernel.cu + rm -rf build && mkdir build && cd build && cmake .. && make -hip%: gpu $(TARGET).%.chpl -ifneq (,$(filter $*, hybrid.dist.mid hybrid.dist.lowmid)) - chpl $(CHPL_FLAGS) $(TARGET).h $(TARGET).$*.chpl -L./build -l$(TARGET).hip $(HIP_LIBS) -o $(TARGET).hip.$* -else - chpl $(CHPL_FLAGS) $(TARGET).kernel.h $(TARGET).$*.chpl -L./build -l$(TARGET).kernel.hip $(HIP_LIBS) -o $(TARGET).hip.$* -endif +cuda.%.mid: $(TARGET).%.mid.chpl $(TARGET).kernel.h build/lib$(TARGET).kernel.cuda.a + chpl $(CHPL_FLAGS) $(TARGET).kernel.h $(TARGET).$*.mid.chpl -L./build -l$(TARGET).kernel.cuda $(CUDA_LIBS) -o $(TARGET).$@ + +cuda.%.lowmid: $(TARGET).%.lowmid.chpl $(TARGET).kernel.h build/lib$(TARGET).kernel.cuda.a + chpl $(CHPL_FLAGS) $(TARGET).kernel.h $(TARGET).$*.lowmid.chpl -L./build -l$(TARGET).kernel.cuda $(CUDA_LIBS) -o $(TARGET).$@ + +cuda.%: $(TARGET).%.chpl $(TARGET).h build/lib$(TARGET).cuda.a + chpl $(CHPL_FLAGS) $(TARGET).h $(TARGET).$*.chpl -L./build -l$(TARGET).cuda $(CUDA_LIBS) -o $(TARGET).$@ + +# HIP + +build/lib$(TARGET).hip.a: $(TARGET).cu + rm -rf build && mkdir build && cd build && cmake .. && make + +build/lib$(TARGET).kernel.hip.a: $(TARGET).kernel.cu + rm -rf build && mkdir build && cd build && cmake .. 
&& make + +hip.%.mid: $(TARGET).%.mid.chpl $(TARGET).kernel.h build/lib$(TARGET).kernel.hip.a + chpl $(CHPL_FLAGS) $(TARGET).kernel.h $(TARGET).$*.mid.chpl -L./build -l$(TARGET).kernel.hip $(HIP_LIBS) -o $(TARGET).$@ + +hip.%.lowmid: $(TARGET).%.lowmid.chpl $(TARGET).kernel.h build/lib$(TARGET).kernel.hip.a + chpl $(CHPL_FLAGS) $(TARGET).kernel.h $(TARGET).$*.lowmid.chpl -L./build -l$(TARGET).kernel.hip $(HIP_LIBS) -o $(TARGET).$@ + +hip.%: $(TARGET).%.chpl $(TARGET).h build/lib$(TARGET).hip.a + chpl $(CHPL_FLAGS) $(TARGET).h $(TARGET).$*.chpl -L./build -l$(TARGET).hip $(HIP_LIBS) -o $(TARGET).$@ + +# OpenCL (MID and LOW-MID are not supported so far) + +build/lib$(TARGET).opencl.a: $(TARGET).opencl.c + rm -rf build && mkdir build && cd build && cmake .. && make + +opencl.%: $(TARGET).%.chpl $(TARGET).h build/lib$(TARGET).opencl.a + chpl $(CHPL_FLAGS) $(TARGET).h $(TARGET).$*.chpl -L./build -l$(TARGET).opencl $(OPNCL_LIBS) -o $(TARGET).$@ -opencl%: gpu $(TARGET).%.chpl - chpl $(CHPL_FLAGS) $(TARGET).h $(TARGET).$*.chpl $(OPNCL_LIBS) -o $(TARGET).opencl.$* +.PHONY: clean +clean: + rm -rf ./build $(TARGET).cpu.baseline $(TARGET).cuda.* $(TARGET).hip.* $(TARGET).opencl.gpu $(TARGET).opencl.hybrid.* From d2046071a8f7ded7e53539749e2d7c0121f0f6f3 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 27 Aug 2020 13:36:38 -0400 Subject: [PATCH 100/118] Update Makefile --- example/Makefile | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/example/Makefile b/example/Makefile index 9e7392d..a382fe8 100644 --- a/example/Makefile +++ b/example/Makefile @@ -14,17 +14,24 @@ else OPENCL_LIBS=-L$(CHPL_GPU_HOME)/lib -lGPUAPIOPENCL_static -L./build -lvc.opencl -L$(subst libOpenCL.so,,$(OpenCL_LIBRARIES)) -lOpenCL endif -all: cuda +build/libvc.cuda.a: vc.cu + rm -rf build && mkdir build && cd build && cmake .. && make + +build/libvc.hip.a: vc.cu + rm -rf build && mkdir build && cd build && cmake .. 
&& make -gpu: vc.cu +build/libvc.opencl.a: vc.opencl.c rm -rf build && mkdir build && cd build && cmake .. && make -cuda: gpu vc.chpl +cuda: vc.chpl vc.h build/libvc.cuda.a chpl $(CHPL_FLAGS) vc.h vc.chpl $(CUDA_LIBS) -o vc.cuda -hip: gpu vc.chpl +hip: vc.chpl vc.h build/libvc.hip.a chpl $(CHPL_FLAGS) vc.h vc.chpl $(HIP_LIBS) -o vc.hip -opencl: gpu vc.chpl +opencl: vc.chpl build/libvc.opencl.a chpl $(CHPL_FLAGS) vc.h vc.chpl $(OPENCL_LIBS) -o vc.opencl +.PHONY: clean +clean: + rm -rf build vc.cuda vc.hip vc.opencl From 828f49b846a05e67ba630631535bcf13999ce369 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 27 Aug 2020 14:57:29 -0400 Subject: [PATCH 101/118] Update HIP/OpenCL apps --- apps/Makefile | 14 ++++++++---- apps/blackscholes/bs.hybrid.dist.lowmid.chpl | 7 ++++-- apps/blackscholes/bs.hybrid.dist.mid.chpl | 7 ++++-- apps/mm/mm.cu | 24 ++++++++++++-------- apps/mm/mm.kernel.cu | 4 ++++ 5 files changed, 38 insertions(+), 18 deletions(-) diff --git a/apps/Makefile b/apps/Makefile index 54b3199..d797b62 100644 --- a/apps/Makefile +++ b/apps/Makefile @@ -12,9 +12,9 @@ HIP_LIBS=-L$(CHPL_GPU_HOME)/lib -lGPUAPIHIP_static -L$(HIP_ROOT_DIR)/lib -lhip_h # OpenCL UNAME_S := $(shell uname -s) ifeq ($(UNAME_S),Darwin) - OPENCL_LIBS=-L$(CHPL_GPU_HOME)/lib -lGPUAPIOPENCL_static -L./build -l$(TARGET).opencl --ldflags '-framework OpenCL' + OPENCL_LIBS=-L$(CHPL_GPU_HOME)/lib -lGPUAPIOPENCL_static --ldflags '-framework OpenCL' else - OPENCL_LIBS=-L$(CHPL_GPU_HOME)/lib -lGPUAPIOPENCL_static -L./build -l$(TARGET).opencl -L$(subst libOpenCL.so,,$(OpenCL_LIBRARIES)) -lOpenCL + OPENCL_LIBS=-L$(CHPL_GPU_HOME)/lib -lGPUAPIOPENCL_static -L$(subst libOpenCL.so,,$(OpenCL_LIBRARIES)) -lOpenCL endif # CPU @@ -24,6 +24,8 @@ baseline: $(TARGET).baseline.chpl # CUDA +cuda: cuda.gpu cuda.hybrid cuda.hybrid.dist cuda.hybrid.dist.lowmid cuda.hybrid.dist.mid + build/lib$(TARGET).cuda.a: $(TARGET).cu rm -rf build && mkdir build && cd build && cmake .. 
&& make @@ -41,6 +43,8 @@ cuda.%: $(TARGET).%.chpl $(TARGET).h build/lib$(TARGET).cuda.a # HIP +hip: hip.gpu hip.hybrid hip.hybrid.dist hip.hybrid.dist.lowmid hip.hybrid.dist.mid + build/lib$(TARGET).hip.a: $(TARGET).cu rm -rf build && mkdir build && cd build && cmake .. && make @@ -58,12 +62,14 @@ hip.%: $(TARGET).%.chpl $(TARGET).h build/lib$(TARGET).hip.a # OpenCL (MID and LOW-MID are not supported so far) +opencl: opencl.gpu opencl.hybrid opencl.hybrid.dist + build/lib$(TARGET).opencl.a: $(TARGET).opencl.c rm -rf build && mkdir build && cd build && cmake .. && make opencl.%: $(TARGET).%.chpl $(TARGET).h build/lib$(TARGET).opencl.a - chpl $(CHPL_FLAGS) $(TARGET).h $(TARGET).$*.chpl -L./build -l$(TARGET).opencl $(OPNCL_LIBS) -o $(TARGET).$@ + chpl $(CHPL_FLAGS) $(TARGET).h $(TARGET).$*.chpl -L./build -l$(TARGET).opencl $(OPENCL_LIBS) -o $(TARGET).$@ .PHONY: clean clean: - rm -rf ./build $(TARGET).cpu.baseline $(TARGET).cuda.* $(TARGET).hip.* $(TARGET).opencl.gpu $(TARGET).opencl.hybrid.* + rm -rf ./build $(TARGET).cpu.baseline $(TARGET).cuda.* $(TARGET).hip.* $(TARGET).opencl.gpu $(TARGET).opencl.hybrid* diff --git a/apps/blackscholes/bs.hybrid.dist.lowmid.chpl b/apps/blackscholes/bs.hybrid.dist.lowmid.chpl index 10d3ec5..f4103a2 100644 --- a/apps/blackscholes/bs.hybrid.dist.lowmid.chpl +++ b/apps/blackscholes/bs.hybrid.dist.lowmid.chpl @@ -4,6 +4,9 @@ use Time; /// GPUIterator //////////////////////////////////////////////////////////////////////////////// use GPUIterator; +use GPUAPI; +use BlockDist; +use SysCTypes; //////////////////////////////////////////////////////////////////////////////// /// Runtime Options @@ -36,7 +39,7 @@ proc CUDAWrapper(lo: int, hi: int, N: int) { if (verbose) { writeln("In CUDAWrapper(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, ")"); } - ref lrand = rand.localSlice(lo .. hi); + ref lrand = rand.localSlice(lo .. hi); ref lput = put.localSlice(lo .. hi); ref lcall = call.localSlice(lo .. 
hi); if (verbose) { ProfilerStart(); } @@ -94,7 +97,7 @@ proc main() { writeln("BlackScholes: CPU/GPU Execution (using GPUIterator)"); writeln("Size: ", n); writeln("CPU ratio: ", CPUratio); - writeln("nGPUs: ", nGPUs); + writeln("nGPUs: ", nGPUs); writeln("nTrials: ", numTrials); writeln("output: ", output); diff --git a/apps/blackscholes/bs.hybrid.dist.mid.chpl b/apps/blackscholes/bs.hybrid.dist.mid.chpl index 84c32fe..f21b236 100644 --- a/apps/blackscholes/bs.hybrid.dist.mid.chpl +++ b/apps/blackscholes/bs.hybrid.dist.mid.chpl @@ -4,6 +4,9 @@ use Time; /// GPUIterator //////////////////////////////////////////////////////////////////////////////// use GPUIterator; +use GPUAPI; +use BlockDist; +use SysCTypes; //////////////////////////////////////////////////////////////////////////////// /// Runtime Options @@ -36,7 +39,7 @@ proc CUDAWrapper(lo: int, hi: int, N: int) { if (verbose) { writeln("In CUDAWrapper(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, ")"); } - ref lrand = rand.localSlice(lo .. hi); + ref lrand = rand.localSlice(lo .. hi); ref lput = put.localSlice(lo .. hi); ref lcall = call.localSlice(lo .. 
hi); if (verbose) { ProfilerStart(); } @@ -89,7 +92,7 @@ proc main() { writeln("BlackScholes: CPU/GPU Execution (using GPUIterator)"); writeln("Size: ", n); writeln("CPU ratio: ", CPUratio); - writeln("nGPUs: ", nGPUs); + writeln("nGPUs: ", nGPUs); writeln("nTrials: ", numTrials); writeln("output: ", output); diff --git a/apps/mm/mm.cu b/apps/mm/mm.cu index 1f9a8e2..9091dbe 100644 --- a/apps/mm/mm.cu +++ b/apps/mm/mm.cu @@ -2,7 +2,9 @@ #include #include #include +#ifdef __NVCC__ #include +#endif #define VERBOSE //#define PROF @@ -87,12 +89,12 @@ __global__ void mm_tiled(float *dA, float *dB, float *dC, int DIM, int N, int GP sA[threadIdx.y][threadIdx.x] = dA[(it+threadIdx.y)*DIM + kt + threadIdx.x]; sB[threadIdx.y][threadIdx.x] = dB[(kt+threadIdx.y)*DIM + jt + threadIdx.x]; __syncthreads(); - + // two 32x32 small shared (dB[it + 0:31][kt + 0:31], dC[kt+0:31][jt + 0:31]) at this point for (k = kt; k < kt+32; k++) { sum += sA[i-it][k-kt] * sB[k-kt][j-jt]; } - + __syncthreads(); } dC[i*DIM+j] = sum; @@ -108,7 +110,7 @@ extern "C" { printf("In mmCUDA\n"); printf("\t GPUN: %d\n", GPUN); printf("\t range: %d..%d\n", start, end); -#endif +#endif #ifdef PROF cudaEvent_t startCudaMallocEvent, endCudaMallocEvent; cudaEvent_t startCudaMemcpyH2DEvent, endCudaMemcpyH2DEvent; @@ -126,7 +128,7 @@ extern "C" { #ifdef PROF CudaSafeCall(cudaEventRecord(startCudaMallocEvent)); -#endif +#endif CudaSafeCall(cudaMalloc(&dA, sizeof(float) * GPUN)); CudaSafeCall(cudaMalloc(&dB, sizeof(float) * N)); CudaSafeCall(cudaMalloc(&dC, sizeof(float) * GPUN)); @@ -134,7 +136,7 @@ extern "C" { CudaSafeCall(cudaEventRecord(endCudaMallocEvent)); CudaSafeCall(cudaEventSynchronize(endCudaMallocEvent)); #endif - + #ifdef PROF CudaSafeCall(cudaEventRecord(startCudaMemcpyH2DEvent)); #endif @@ -144,7 +146,7 @@ extern "C" { CudaSafeCall(cudaEventRecord(endCudaMemcpyH2DEvent)); CudaSafeCall(cudaEventSynchronize(endCudaMemcpyH2DEvent)); #endif - + #ifdef PROF 
CudaSafeCall(cudaEventRecord(startCudaKernelEvent)); #endif @@ -155,11 +157,12 @@ extern "C" { dim3 grid(ceil(sqrt(N)/32), ceil(sqrt(N)/32)); mm_tiled<<>>(dA, dB, dC, ceil(sqrt(N)), N, N); } else { +#ifdef __NVCC__ cublasHandle_t handle; #ifdef PROF long long start = getCurrentTime(); -#endif - cublasCreate(&handle); +#endif + cublasCreate(&handle); float alpha = 1.0F; float beta = 0.0F; int lda = sqrt(N), ldb = sqrt(N), ldc = sqrt(N); @@ -176,7 +179,8 @@ extern "C" { long long end2 = getCurrentTime(); printf("cuBLAS finish: %lf msec\n", (float)(end2-start)/1000); #endif - } +#endif + } CudaCheckError(); #ifdef PROF CudaSafeCall(cudaEventRecord(endCudaKernelEvent)); @@ -208,7 +212,7 @@ extern "C" { //for (int i = 0; i < GPUN; i++) { // printf("C[%d] = %lf\n", start+i, C[start+i]); //} - + CudaSafeCall(cudaFree(dA)); CudaSafeCall(cudaFree(dB)); CudaSafeCall(cudaFree(dC)); diff --git a/apps/mm/mm.kernel.cu b/apps/mm/mm.kernel.cu index 849f49e..eb71ea2 100644 --- a/apps/mm/mm.kernel.cu +++ b/apps/mm/mm.kernel.cu @@ -1,5 +1,7 @@ #include +#ifdef __NVCC__ #include +#endif __global__ void mm(float *dA, float *dB, float *dC, int DIM, int N, int GPUN) { int id = blockIdx.x * blockDim.x + threadIdx.x; @@ -32,6 +34,7 @@ void LaunchMM(float *A, float *B, float *C, int N, int low, int hi, int GPUN, in assert(false); } else { +#ifdef __NVCC__ printf("Using cublas\n"); cublasHandle_t handle; @@ -41,6 +44,7 @@ void LaunchMM(float *A, float *B, float *C, int N, int low, int hi, int GPUN, in int lda = sqrt(N), ldb = sqrt(N), ldc = sqrt(N); cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, sqrt(N), GPUN/sqrt(N), sqrt(N), &alpha, B, ldb, A, lda, &beta, C, ldc); +#endif } } } From f76706dca0dc8ee2231c0457c83c83bb7b50a42f Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 27 Aug 2020 15:15:06 -0400 Subject: [PATCH 102/118] Update mm.kernel.cu --- apps/mm/mm.kernel.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/apps/mm/mm.kernel.cu b/apps/mm/mm.kernel.cu index 
eb71ea2..60b363b 100644 --- a/apps/mm/mm.kernel.cu +++ b/apps/mm/mm.kernel.cu @@ -1,3 +1,6 @@ +#include +#include +#include #include #ifdef __NVCC__ #include From 4d6d0544dd2c149b40c5b8a73dbb27d10cf26dcc Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 27 Aug 2020 15:55:27 -0400 Subject: [PATCH 103/118] Update non-dist versions of LR --- apps/logisticregression/lr.cu | 118 ++++++++++++------------- apps/logisticregression/lr.gpu.chpl | 42 ++++++--- apps/logisticregression/lr.hybrid.chpl | 42 ++++++--- 3 files changed, 117 insertions(+), 85 deletions(-) diff --git a/apps/logisticregression/lr.cu b/apps/logisticregression/lr.cu index 3cef4a7..38ff793 100644 --- a/apps/logisticregression/lr.cu +++ b/apps/logisticregression/lr.cu @@ -7,14 +7,14 @@ inline void __cudaSafeCall( cudaError err, const char *file, const int line ) { - #ifdef CUDA_ERROR_CHECK +#ifdef CUDA_ERROR_CHECK if ( cudaSuccess != err ) { - fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s\n", - file, line, cudaGetErrorString( err ) ); - exit( -1 ); + fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s\n", + file, line, cudaGetErrorString( err ) ); + exit( -1 ); } - #endif +#endif return; } @@ -22,76 +22,76 @@ inline void __cudaSafeCall( cudaError err, const char *file, const int line ) __global__ void kernel1(float *dW, float *dWcurr, int N) { int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < N) { - dWcurr[id] = dW[id]; + dWcurr[id] = dW[id]; } } __global__ void kernel2(float *dW, float *dWcurr, float *dX, float *dY, float alpha, int nSamples, int nFeatures, int start, int N) { int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < N) { - float err = 0.0; - for (int s = 0; s < nSamples; s++) { - float arg = 0.0; - for (int f = 0; f < nFeatures; f++) { - arg += dWcurr[f] * dX[s * (nFeatures) + f]; - } - float hypo = 1 / (1 + exp(-arg)); - err += (hypo - dY[s]) * dX[s * (nFeatures) + start + id]; - } - dW[id] = dWcurr[start + id] - alpha * err; + float err = 0.0; + for (int s = 0; 
s < nSamples; s++) { + float arg = 0.0; + for (int f = 0; f < nFeatures; f++) { + arg += dWcurr[f] * dX[s * (nFeatures) + f]; + } + float hypo = 1 / (1 + exp(-arg)); + err += (hypo - dY[s]) * dX[s * (nFeatures) + start + id]; + } + dW[id] = dWcurr[start + id] - alpha * err; } } extern "C" { void lrCUDA1(float *W, float *Wcurr, int start, int end, int GPUN) { - float *dW, *dWcurr; - if (GPUN > 0) { - assert(end - start + 1 == GPUN); + float *dW, *dWcurr; + if (GPUN > 0) { + assert(end - start + 1 == GPUN); #ifdef VERBOSE - printf("In lrCUDA1\n"); - printf("\t GPUN: %d\n", GPUN); - printf("\t range: %d..%d\n", start, end); -#endif - CudaSafeCall(cudaMalloc(&dW, sizeof(float) * GPUN)); - CudaSafeCall(cudaMalloc(&dWcurr, sizeof(float) * GPUN)); - - CudaSafeCall(cudaMemcpy(dW, W + start, sizeof(float) * GPUN, cudaMemcpyHostToDevice)); - kernel1<<>>(dW, dWcurr, GPUN); + printf("In lrCUDA1\n"); + printf("\t GPUN: %d\n", GPUN); + printf("\t range: %d..%d\n", start, end); +#endif + CudaSafeCall(cudaMalloc(&dW, sizeof(float) * GPUN)); + CudaSafeCall(cudaMalloc(&dWcurr, sizeof(float) * GPUN)); - CudaSafeCall(cudaDeviceSynchronize()); - CudaSafeCall(cudaMemcpy(Wcurr + start, dWcurr, sizeof(float) * GPUN, cudaMemcpyDeviceToHost)); + CudaSafeCall(cudaMemcpy(dW, W + start, sizeof(float) * GPUN, cudaMemcpyHostToDevice)); + kernel1<<>>(dW, dWcurr, GPUN); - CudaSafeCall(cudaFree(dW)); - CudaSafeCall(cudaFree(dWcurr)); - } + CudaSafeCall(cudaDeviceSynchronize()); + CudaSafeCall(cudaMemcpy(Wcurr + start, dWcurr, sizeof(float) * GPUN, cudaMemcpyDeviceToHost)); + + CudaSafeCall(cudaFree(dW)); + CudaSafeCall(cudaFree(dWcurr)); + } } - + void lrCUDA2(float* X, float *Y, float *W, float *Wcurr, float alpha, int nSamples, int nFeatures, int start, int end, int GPUN) { - float *dX, *dY, *dW, *dWcurr; - if (GPUN > 0) { - assert(end - start + 1 == GPUN); + float *dX, *dY, *dW, *dWcurr; + if (GPUN > 0) { + assert(end - start + 1 == GPUN); #ifdef VERBOSE - printf("In lrCUDA2\n"); - printf("\t 
GPUN: %d\n", GPUN); - printf("\t range: %d..%d\n", start, end); -#endif - CudaSafeCall(cudaMalloc(&dX, sizeof(float) * nSamples * nFeatures)); - CudaSafeCall(cudaMalloc(&dY, sizeof(float) * nSamples)); - CudaSafeCall(cudaMalloc(&dWcurr, sizeof(float) * nFeatures)); - CudaSafeCall(cudaMalloc(&dW, sizeof(float) * GPUN)); - - CudaSafeCall(cudaMemcpy(dX, X, sizeof(float) * nSamples * nFeatures, cudaMemcpyHostToDevice)); - CudaSafeCall(cudaMemcpy(dY, Y, sizeof(float) * nSamples, cudaMemcpyHostToDevice)); - CudaSafeCall(cudaMemcpy(dWcurr, Wcurr, sizeof(float) * nFeatures, cudaMemcpyHostToDevice)); - - kernel2<<>>(dW, dWcurr, dX, dY, alpha, nSamples, nFeatures, start-1, GPUN); - CudaSafeCall(cudaDeviceSynchronize()); - CudaSafeCall(cudaMemcpy(W, dW, sizeof(float) * GPUN, cudaMemcpyDeviceToHost)); - - CudaSafeCall(cudaFree(dX)); - CudaSafeCall(cudaFree(dY)); - CudaSafeCall(cudaFree(dW)); - CudaSafeCall(cudaFree(dWcurr)); - } + printf("In lrCUDA2\n"); + printf("\t GPUN: %d\n", GPUN); + printf("\t range: %d..%d\n", start, end); +#endif + CudaSafeCall(cudaMalloc(&dX, sizeof(float) * nSamples * nFeatures)); + CudaSafeCall(cudaMalloc(&dY, sizeof(float) * nSamples)); + CudaSafeCall(cudaMalloc(&dWcurr, sizeof(float) * nFeatures)); + CudaSafeCall(cudaMalloc(&dW, sizeof(float) * GPUN)); + + CudaSafeCall(cudaMemcpy(dX, X, sizeof(float) * nSamples * nFeatures, cudaMemcpyHostToDevice)); + CudaSafeCall(cudaMemcpy(dY, Y, sizeof(float) * nSamples, cudaMemcpyHostToDevice)); + CudaSafeCall(cudaMemcpy(dWcurr, Wcurr, sizeof(float) * nFeatures, cudaMemcpyHostToDevice)); + + kernel2<<>>(dW, dWcurr, dX, dY, alpha, nSamples, nFeatures, start-1, GPUN); + CudaSafeCall(cudaDeviceSynchronize()); + CudaSafeCall(cudaMemcpy(W, dW, sizeof(float) * GPUN, cudaMemcpyDeviceToHost)); + + CudaSafeCall(cudaFree(dX)); + CudaSafeCall(cudaFree(dY)); + CudaSafeCall(cudaFree(dW)); + CudaSafeCall(cudaFree(dWcurr)); + } } } diff --git a/apps/logisticregression/lr.gpu.chpl b/apps/logisticregression/lr.gpu.chpl index 
924df9a..d18e08e 100644 --- a/apps/logisticregression/lr.gpu.chpl +++ b/apps/logisticregression/lr.gpu.chpl @@ -72,24 +72,40 @@ proc main() { var execTimes: [1..numTrials] real; for trial in 1..numTrials { - for i in 1..nFeatures { - W(i) = 0: real(32); - } - for i in 1..nSamples { - Y(i) = (i % 2): real(32); - for j in 1..nFeatures { - if (j != 0) { - X(i, j) = (i % 2): real(32); - } else { - X(i, j) = 1; - } + if (false) { + for i in 1..nFeatures { + W(i) = 0: real(32); } - } + for i in 1..nSamples { + Y(i) = (i % 2): real(32); + for j in 1..nFeatures { + if (j != 0) { + X(i, j) = (i % 2): real(32); + } else { + X(i, j) = 1; + } + } + } + } else { + forall i in 1..nFeatures { + W(i) = 0: real(32); + } + for i in 1..nSamples { + Y(i) = i: real(32); + for j in 1..nFeatures { + if (j != 0) { + X(i, j) = j: real(32); + } else { + X(i, j) = j : real(32); + } + } + } + } const startTime = getCurrentTime(); for ite in 1..nIters { lrCUDA1(W, Wcurr, 0, nFeatures-1, nFeatures); - lrCUDA2(X, Y, W, Wcurr, alpha, nSamples, nFeatures, 0, nFeatures-1, nFeatures); + lrCUDA2(X, Y, W, Wcurr, alpha, nSamples, nFeatures, 1, nFeatures, nFeatures); } execTimes(trial) = getCurrentTime() - startTime; if (output) { diff --git a/apps/logisticregression/lr.hybrid.chpl b/apps/logisticregression/lr.hybrid.chpl index f2abf98..1090818 100644 --- a/apps/logisticregression/lr.hybrid.chpl +++ b/apps/logisticregression/lr.hybrid.chpl @@ -48,7 +48,7 @@ proc CUDAWrapper2(lo: int, hi: int, N: int) { if (verbose) { writeln("In CUDAWrapper2(), launching the CUDA kernel with a range of ", lo, "..", hi, " (Size: ", N, ")"); } - lrCUDA2(X, Y, W, Wcurr, alpha, nSamples, nFeatures, lo, hi, N); + lrCUDA2(X, Y, W, Wcurr, alpha, nSamples, nFeatures, lo+1, hi+1, N); } //////////////////////////////////////////////////////////////////////////////// @@ -97,19 +97,35 @@ proc main() { var execTimes: [1..numTrials] real; for trial in 1..numTrials { - for i in 1..nFeatures { - W(i) = 0: real(32); - } - for i in 
1..nSamples { - Y(i) = (i % 2): real(32); - for j in 1..nFeatures { - if (j != 0) { - X(i, j) = (i % 2): real(32); - } else { - X(i, j) = 1; - } + if (false) { + for i in 1..nFeatures { + W(i) = 0: real(32); } - } + for i in 1..nSamples { + Y(i) = (i % 2): real(32); + for j in 1..nFeatures { + if (j != 0) { + X(i, j) = (i % 2): real(32); + } else { + X(i, j) = 1; + } + } + } + } else { + forall i in 1..nFeatures { + W(i) = 0: real(32); + } + for i in 1..nSamples { + Y(i) = i: real(32); + for j in 1..nFeatures { + if (j != 0) { + X(i, j) = j: real(32); + } else { + X(i, j) = j : real(32); + } + } + } + } const startTime = getCurrentTime(); for ite in 1..nIters { From fd0a9493e9cc1107c6096aaf6535abc9c68d14cb Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 27 Aug 2020 16:21:43 -0400 Subject: [PATCH 104/118] Fix indent issues --- apps/blackscholes/bs.cu | 210 +++++++++++------------ apps/blackscholes/bs.kernel.cu | 186 ++++++++++---------- apps/logisticregression/lr.cu | 72 ++++---- apps/logisticregression/lr.kernel.cu | 24 +-- apps/mm/mm.cu | 248 +++++++++++++-------------- apps/mm/mm.kernel.cu | 44 ++--- apps/stream/stream.cu | 60 +++---- apps/vector_copy/vc.cu | 86 +++++----- apps/vector_copy/vc.kernel.cu | 4 +- 9 files changed, 467 insertions(+), 467 deletions(-) diff --git a/apps/blackscholes/bs.cu b/apps/blackscholes/bs.cu index 48092e2..48b73ae 100644 --- a/apps/blackscholes/bs.cu +++ b/apps/blackscholes/bs.cu @@ -8,14 +8,14 @@ inline void __cudaSafeCall( cudaError err, const char *file, const int line ) { - #ifdef CUDA_ERROR_CHECK +#ifdef CUDA_ERROR_CHECK if ( cudaSuccess != err ) { - fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s\n", - file, line, cudaGetErrorString( err ) ); - exit( -1 ); + fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s\n", + file, line, cudaGetErrorString( err ) ); + exit( -1 ); } - #endif +#endif return; } @@ -23,22 +23,22 @@ inline void __cudaSafeCall( cudaError err, const char *file, const int line ) inline void 
__cudaCheckError( const char *file, const int line ) { #ifdef CUDA_ERROR_CHECK - cudaError err = cudaGetLastError(); - if ( cudaSuccess != err ) + cudaError err = cudaGetLastError(); + if ( cudaSuccess != err ) { - fprintf( stderr, "cudaCheckError() failed at %s:%i : %s\n", - file, line, cudaGetErrorString( err ) ); - exit( -1 ); + fprintf( stderr, "cudaCheckError() failed at %s:%i : %s\n", + file, line, cudaGetErrorString( err ) ); + exit( -1 ); } - // More careful checking. However, this will affect performance. - // Comment away if needed. - err = cudaDeviceSynchronize(); - if( cudaSuccess != err ) + // More careful checking. However, this will affect performance. + // Comment away if needed. + err = cudaDeviceSynchronize(); + if( cudaSuccess != err ) { - fprintf( stderr, "cudaCheckError() with sync failed at %s:%i : %s\n", - file, line, cudaGetErrorString( err ) ); - exit( -1 ); + fprintf( stderr, "cudaCheckError() with sync failed at %s:%i : %s\n", + file, line, cudaGetErrorString( err ) ); + exit( -1 ); } } #endif @@ -57,99 +57,99 @@ inline void __cudaCheckError( const char *file, const int line ) __global__ void bs(float *drand, float *dput, float *dcall, int n) { int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < n) { - float c1 = 0.319381530f; - float c2 = -0.356563782f; - float c3 = 1.781477937f; - float c4 = -1.821255978f; - float c5 = 1.330274429f; - - float zero = 0.0f; - float one = 1.0f; - float two = 2.0f; - float temp4 = 0.2316419f; - - float oneBySqrt2pi = 0.398942280f; - - float d1, d2; - float phiD1, phiD2; - float sigmaSqrtT; - float KexpMinusRT; - - float inRand; - - inRand = drand[id]; - - float S = S_LOWER_LIMIT * inRand + S_UPPER_LIMIT * (1.0f - inRand); - float K = K_LOWER_LIMIT * inRand + K_UPPER_LIMIT * (1.0f - inRand); - float T = T_LOWER_LIMIT * inRand + T_UPPER_LIMIT * (1.0f - inRand); - float R = R_LOWER_LIMIT * inRand + R_UPPER_LIMIT * (1.0f - inRand); - float sigmaVal = SIGMA_LOWER_LIMIT * inRand + SIGMA_UPPER_LIMIT * (1.0f - 
inRand); - - sigmaSqrtT = sigmaVal * (float)sqrt(T); - - d1 = ((float)log(S / K) + (R + sigmaVal * sigmaVal / two) * T) / sigmaSqrtT; - d2 = d1 - sigmaSqrtT; - - KexpMinusRT = K * (float)exp(-R * T); - - // phiD1 = phi(d1) - float X = d1; - float absX = (float)abs(X); - float t = one / (one + temp4 * absX); - float y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); - phiD1 = (X < zero) ? (one - y) : y; - // phiD2 = phi(d2) - X = d2; - absX = abs(X); - t = one / (one + temp4 * absX); - y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); - phiD2 = (X < zero) ? (one - y) : y; - - dcall[id] = S * phiD1 - KexpMinusRT * phiD2; - - // phiD1 = phi(-d1); - X = -d1; - absX = abs(X); - t = one / (one + temp4 * absX); - y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); - phiD1 = (X < zero) ? (one - y) : y; - - // phiD2 = phi(-d2); - X = -d2; - absX = abs(X); - t = one / (one + temp4 * absX); - y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); - phiD2 = (X < zero) ? 
(one - y) : y; - - dput[id] = KexpMinusRT * phiD2 - S * phiD1; + float c1 = 0.319381530f; + float c2 = -0.356563782f; + float c3 = 1.781477937f; + float c4 = -1.821255978f; + float c5 = 1.330274429f; + + float zero = 0.0f; + float one = 1.0f; + float two = 2.0f; + float temp4 = 0.2316419f; + + float oneBySqrt2pi = 0.398942280f; + + float d1, d2; + float phiD1, phiD2; + float sigmaSqrtT; + float KexpMinusRT; + + float inRand; + + inRand = drand[id]; + + float S = S_LOWER_LIMIT * inRand + S_UPPER_LIMIT * (1.0f - inRand); + float K = K_LOWER_LIMIT * inRand + K_UPPER_LIMIT * (1.0f - inRand); + float T = T_LOWER_LIMIT * inRand + T_UPPER_LIMIT * (1.0f - inRand); + float R = R_LOWER_LIMIT * inRand + R_UPPER_LIMIT * (1.0f - inRand); + float sigmaVal = SIGMA_LOWER_LIMIT * inRand + SIGMA_UPPER_LIMIT * (1.0f - inRand); + + sigmaSqrtT = sigmaVal * (float)sqrt(T); + + d1 = ((float)log(S / K) + (R + sigmaVal * sigmaVal / two) * T) / sigmaSqrtT; + d2 = d1 - sigmaSqrtT; + + KexpMinusRT = K * (float)exp(-R * T); + + // phiD1 = phi(d1) + float X = d1; + float absX = (float)abs(X); + float t = one / (one + temp4 * absX); + float y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); + phiD1 = (X < zero) ? (one - y) : y; + // phiD2 = phi(d2) + X = d2; + absX = abs(X); + t = one / (one + temp4 * absX); + y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); + phiD2 = (X < zero) ? (one - y) : y; + + dcall[id] = S * phiD1 - KexpMinusRT * phiD2; + + // phiD1 = phi(-d1); + X = -d1; + absX = abs(X); + t = one / (one + temp4 * absX); + y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); + phiD1 = (X < zero) ? (one - y) : y; + + // phiD2 = phi(-d2); + X = -d2; + absX = abs(X); + t = one / (one + temp4 * absX); + y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); + phiD2 = (X < zero) ? 
(one - y) : y; + + dput[id] = KexpMinusRT * phiD2 - S * phiD1; } } extern "C" { void bsCUDA(float* rand, float *put, float *call, int start, int end, int GPUN) { - float *drand, *dput, *dcall; + float *drand, *dput, *dcall; - if (GPUN > 0) { - assert(end - start + 1 == GPUN); + if (GPUN > 0) { + assert(end - start + 1 == GPUN); #ifdef VERBOSE - printf("In vcCUDA\n"); - printf("\t GPUN: %d\n", GPUN); - printf("\t range: %d..%d\n", start, end); -#endif - CudaSafeCall(cudaMalloc(&drand, sizeof(float) * GPUN)); - CudaSafeCall(cudaMalloc(&dput, sizeof(float) * GPUN)); - CudaSafeCall(cudaMalloc(&dcall, sizeof(float) * GPUN)); - CudaSafeCall(cudaMemcpy(drand, rand + start, sizeof(float) * GPUN, cudaMemcpyHostToDevice)); - - bs<<>>(drand, dput, dcall, GPUN); - CudaCheckError(); - CudaSafeCall(cudaDeviceSynchronize()); - CudaSafeCall(cudaMemcpy(put + start, dput, sizeof(float) * GPUN, cudaMemcpyDeviceToHost)); - CudaSafeCall(cudaMemcpy(call + start, dcall, sizeof(float) * GPUN, cudaMemcpyDeviceToHost)); - - CudaSafeCall(cudaFree(drand)); - CudaSafeCall(cudaFree(dput)); - CudaSafeCall(cudaFree(dcall)); - } - } + printf("In vcCUDA\n"); + printf("\t GPUN: %d\n", GPUN); + printf("\t range: %d..%d\n", start, end); +#endif + CudaSafeCall(cudaMalloc(&drand, sizeof(float) * GPUN)); + CudaSafeCall(cudaMalloc(&dput, sizeof(float) * GPUN)); + CudaSafeCall(cudaMalloc(&dcall, sizeof(float) * GPUN)); + CudaSafeCall(cudaMemcpy(drand, rand + start, sizeof(float) * GPUN, cudaMemcpyHostToDevice)); + + bs<<>>(drand, dput, dcall, GPUN); + CudaCheckError(); + CudaSafeCall(cudaDeviceSynchronize()); + CudaSafeCall(cudaMemcpy(put + start, dput, sizeof(float) * GPUN, cudaMemcpyDeviceToHost)); + CudaSafeCall(cudaMemcpy(call + start, dcall, sizeof(float) * GPUN, cudaMemcpyDeviceToHost)); + + CudaSafeCall(cudaFree(drand)); + CudaSafeCall(cudaFree(dput)); + CudaSafeCall(cudaFree(dcall)); + } + } } diff --git a/apps/blackscholes/bs.kernel.cu b/apps/blackscholes/bs.kernel.cu index 22125c6..2ca346f 100644 
--- a/apps/blackscholes/bs.kernel.cu +++ b/apps/blackscholes/bs.kernel.cu @@ -13,71 +13,71 @@ __global__ void bs(float *drand, float *dput, float *dcall, int n) { int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < n) { - float c1 = 0.319381530f; - float c2 = -0.356563782f; - float c3 = 1.781477937f; - float c4 = -1.821255978f; - float c5 = 1.330274429f; - - float zero = 0.0f; - float one = 1.0f; - float two = 2.0f; - float temp4 = 0.2316419f; - - float oneBySqrt2pi = 0.398942280f; - - float d1, d2; - float phiD1, phiD2; - float sigmaSqrtT; - float KexpMinusRT; - - float inRand; - - inRand = drand[id]; - - float S = S_LOWER_LIMIT * inRand + S_UPPER_LIMIT * (1.0f - inRand); - float K = K_LOWER_LIMIT * inRand + K_UPPER_LIMIT * (1.0f - inRand); - float T = T_LOWER_LIMIT * inRand + T_UPPER_LIMIT * (1.0f - inRand); - float R = R_LOWER_LIMIT * inRand + R_UPPER_LIMIT * (1.0f - inRand); - float sigmaVal = SIGMA_LOWER_LIMIT * inRand + SIGMA_UPPER_LIMIT * (1.0f - inRand); - - sigmaSqrtT = sigmaVal * (float)sqrt(T); - - d1 = ((float)log(S / K) + (R + sigmaVal * sigmaVal / two) * T) / sigmaSqrtT; - d2 = d1 - sigmaSqrtT; - - KexpMinusRT = K * (float)exp(-R * T); - - // phiD1 = phi(d1) - float X = d1; - float absX = (float)abs(X); - float t = one / (one + temp4 * absX); - float y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); - phiD1 = (X < zero) ? (one - y) : y; - // phiD2 = phi(d2) - X = d2; - absX = abs(X); - t = one / (one + temp4 * absX); - y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); - phiD2 = (X < zero) ? (one - y) : y; - - dcall[id] = S * phiD1 - KexpMinusRT * phiD2; - - // phiD1 = phi(-d1); - X = -d1; - absX = abs(X); - t = one / (one + temp4 * absX); - y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); - phiD1 = (X < zero) ? 
(one - y) : y; - - // phiD2 = phi(-d2); - X = -d2; - absX = abs(X); - t = one / (one + temp4 * absX); - y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); - phiD2 = (X < zero) ? (one - y) : y; - - dput[id] = KexpMinusRT * phiD2 - S * phiD1; + float c1 = 0.319381530f; + float c2 = -0.356563782f; + float c3 = 1.781477937f; + float c4 = -1.821255978f; + float c5 = 1.330274429f; + + float zero = 0.0f; + float one = 1.0f; + float two = 2.0f; + float temp4 = 0.2316419f; + + float oneBySqrt2pi = 0.398942280f; + + float d1, d2; + float phiD1, phiD2; + float sigmaSqrtT; + float KexpMinusRT; + + float inRand; + + inRand = drand[id]; + + float S = S_LOWER_LIMIT * inRand + S_UPPER_LIMIT * (1.0f - inRand); + float K = K_LOWER_LIMIT * inRand + K_UPPER_LIMIT * (1.0f - inRand); + float T = T_LOWER_LIMIT * inRand + T_UPPER_LIMIT * (1.0f - inRand); + float R = R_LOWER_LIMIT * inRand + R_UPPER_LIMIT * (1.0f - inRand); + float sigmaVal = SIGMA_LOWER_LIMIT * inRand + SIGMA_UPPER_LIMIT * (1.0f - inRand); + + sigmaSqrtT = sigmaVal * (float)sqrt(T); + + d1 = ((float)log(S / K) + (R + sigmaVal * sigmaVal / two) * T) / sigmaSqrtT; + d2 = d1 - sigmaSqrtT; + + KexpMinusRT = K * (float)exp(-R * T); + + // phiD1 = phi(d1) + float X = d1; + float absX = (float)abs(X); + float t = one / (one + temp4 * absX); + float y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); + phiD1 = (X < zero) ? (one - y) : y; + // phiD2 = phi(d2) + X = d2; + absX = abs(X); + t = one / (one + temp4 * absX); + y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); + phiD2 = (X < zero) ? (one - y) : y; + + dcall[id] = S * phiD1 - KexpMinusRT * phiD2; + + // phiD1 = phi(-d1); + X = -d1; + absX = abs(X); + t = one / (one + temp4 * absX); + y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); + phiD1 = (X < zero) ? 
(one - y) : y; + + // phiD2 = phi(-d2); + X = -d2; + absX = abs(X); + t = one / (one + temp4 * absX); + y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); + phiD2 = (X < zero) ? (one - y) : y; + + dput[id] = KexpMinusRT * phiD2 - S * phiD1; } } #else @@ -85,79 +85,79 @@ __global__ void bs(float *drand, float *dput, float *dcall, int n) { #endif extern "C" { -#ifndef USE_LAMBDA +#ifndef USE_LAMBDA void LaunchBS(float* drand, float *dput, float *dcall, int N) { bs<<>>(drand, dput, dcall, N); } #else - void LaunchBS(float* drand, float *dput, float *dcall, int N) { + void LaunchBS(float* drand, float *dput, float *dcall, int N) { call_gpu_functor(N, 1024, NULL, [=] __device__ (int id) { float c1 = 0.319381530f; float c2 = -0.356563782f; float c3 = 1.781477937f; float c4 = -1.821255978f; float c5 = 1.330274429f; - + float zero = 0.0f; float one = 1.0f; float two = 2.0f; float temp4 = 0.2316419f; - + float oneBySqrt2pi = 0.398942280f; - + float d1, d2; float phiD1, phiD2; float sigmaSqrtT; float KexpMinusRT; - - float inRand; - + + float inRand; + inRand = drand[id]; - + float S = S_LOWER_LIMIT * inRand + S_UPPER_LIMIT * (1.0f - inRand); float K = K_LOWER_LIMIT * inRand + K_UPPER_LIMIT * (1.0f - inRand); float T = T_LOWER_LIMIT * inRand + T_UPPER_LIMIT * (1.0f - inRand); float R = R_LOWER_LIMIT * inRand + R_UPPER_LIMIT * (1.0f - inRand); float sigmaVal = SIGMA_LOWER_LIMIT * inRand + SIGMA_UPPER_LIMIT * (1.0f - inRand); - + sigmaSqrtT = sigmaVal * (float)sqrt(T); - + d1 = ((float)log(S / K) + (R + sigmaVal * sigmaVal / two) * T) / sigmaSqrtT; d2 = d1 - sigmaSqrtT; - + KexpMinusRT = K * (float)exp(-R * T); - + // phiD1 = phi(d1) float X = d1; float absX = (float)abs(X); - float t = one / (one + temp4 * absX); + float t = one / (one + temp4 * absX); float y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); - phiD1 = (X < zero) ? (one - y) : y; + phiD1 = (X < zero) ? 
(one - y) : y; // phiD2 = phi(d2) X = d2; absX = abs(X); - t = one / (one + temp4 * absX); + t = one / (one + temp4 * absX); y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); - phiD2 = (X < zero) ? (one - y) : y; - + phiD2 = (X < zero) ? (one - y) : y; + dcall[id] = S * phiD1 - KexpMinusRT * phiD2; - + // phiD1 = phi(-d1); X = -d1; absX = abs(X); - t = one / (one + temp4 * absX); + t = one / (one + temp4 * absX); y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); - phiD1 = (X < zero) ? (one - y) : y; - + phiD1 = (X < zero) ? (one - y) : y; + // phiD2 = phi(-d2); X = -d2; absX = abs(X); - t = one / (one + temp4 * absX); + t = one / (one + temp4 * absX); y = one - oneBySqrt2pi * (float)exp(-X * X / two) * t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5)))); - phiD2 = (X < zero) ? (one - y) : y; - - dput[id] = KexpMinusRT * phiD2 - S * phiD1; + phiD2 = (X < zero) ? (one - y) : y; + + dput[id] = KexpMinusRT * phiD2 - S * phiD1; }); -} -#endif + } +#endif } diff --git a/apps/logisticregression/lr.cu b/apps/logisticregression/lr.cu index 38ff793..48cfcef 100644 --- a/apps/logisticregression/lr.cu +++ b/apps/logisticregression/lr.cu @@ -44,54 +44,54 @@ __global__ void kernel2(float *dW, float *dWcurr, float *dX, float *dY, float al extern "C" { void lrCUDA1(float *W, float *Wcurr, int start, int end, int GPUN) { - float *dW, *dWcurr; - if (GPUN > 0) { - assert(end - start + 1 == GPUN); + float *dW, *dWcurr; + if (GPUN > 0) { + assert(end - start + 1 == GPUN); #ifdef VERBOSE - printf("In lrCUDA1\n"); - printf("\t GPUN: %d\n", GPUN); - printf("\t range: %d..%d\n", start, end); + printf("In lrCUDA1\n"); + printf("\t GPUN: %d\n", GPUN); + printf("\t range: %d..%d\n", start, end); #endif - CudaSafeCall(cudaMalloc(&dW, sizeof(float) * GPUN)); - CudaSafeCall(cudaMalloc(&dWcurr, sizeof(float) * GPUN)); + CudaSafeCall(cudaMalloc(&dW, sizeof(float) * GPUN)); + 
CudaSafeCall(cudaMalloc(&dWcurr, sizeof(float) * GPUN)); - CudaSafeCall(cudaMemcpy(dW, W + start, sizeof(float) * GPUN, cudaMemcpyHostToDevice)); - kernel1<<>>(dW, dWcurr, GPUN); + CudaSafeCall(cudaMemcpy(dW, W + start, sizeof(float) * GPUN, cudaMemcpyHostToDevice)); + kernel1<<>>(dW, dWcurr, GPUN); - CudaSafeCall(cudaDeviceSynchronize()); - CudaSafeCall(cudaMemcpy(Wcurr + start, dWcurr, sizeof(float) * GPUN, cudaMemcpyDeviceToHost)); + CudaSafeCall(cudaDeviceSynchronize()); + CudaSafeCall(cudaMemcpy(Wcurr + start, dWcurr, sizeof(float) * GPUN, cudaMemcpyDeviceToHost)); - CudaSafeCall(cudaFree(dW)); - CudaSafeCall(cudaFree(dWcurr)); - } + CudaSafeCall(cudaFree(dW)); + CudaSafeCall(cudaFree(dWcurr)); + } } void lrCUDA2(float* X, float *Y, float *W, float *Wcurr, float alpha, int nSamples, int nFeatures, int start, int end, int GPUN) { - float *dX, *dY, *dW, *dWcurr; - if (GPUN > 0) { - assert(end - start + 1 == GPUN); + float *dX, *dY, *dW, *dWcurr; + if (GPUN > 0) { + assert(end - start + 1 == GPUN); #ifdef VERBOSE - printf("In lrCUDA2\n"); - printf("\t GPUN: %d\n", GPUN); - printf("\t range: %d..%d\n", start, end); + printf("In lrCUDA2\n"); + printf("\t GPUN: %d\n", GPUN); + printf("\t range: %d..%d\n", start, end); #endif - CudaSafeCall(cudaMalloc(&dX, sizeof(float) * nSamples * nFeatures)); - CudaSafeCall(cudaMalloc(&dY, sizeof(float) * nSamples)); - CudaSafeCall(cudaMalloc(&dWcurr, sizeof(float) * nFeatures)); - CudaSafeCall(cudaMalloc(&dW, sizeof(float) * GPUN)); + CudaSafeCall(cudaMalloc(&dX, sizeof(float) * nSamples * nFeatures)); + CudaSafeCall(cudaMalloc(&dY, sizeof(float) * nSamples)); + CudaSafeCall(cudaMalloc(&dWcurr, sizeof(float) * nFeatures)); + CudaSafeCall(cudaMalloc(&dW, sizeof(float) * GPUN)); - CudaSafeCall(cudaMemcpy(dX, X, sizeof(float) * nSamples * nFeatures, cudaMemcpyHostToDevice)); - CudaSafeCall(cudaMemcpy(dY, Y, sizeof(float) * nSamples, cudaMemcpyHostToDevice)); - CudaSafeCall(cudaMemcpy(dWcurr, Wcurr, sizeof(float) * nFeatures, 
cudaMemcpyHostToDevice)); + CudaSafeCall(cudaMemcpy(dX, X, sizeof(float) * nSamples * nFeatures, cudaMemcpyHostToDevice)); + CudaSafeCall(cudaMemcpy(dY, Y, sizeof(float) * nSamples, cudaMemcpyHostToDevice)); + CudaSafeCall(cudaMemcpy(dWcurr, Wcurr, sizeof(float) * nFeatures, cudaMemcpyHostToDevice)); - kernel2<<>>(dW, dWcurr, dX, dY, alpha, nSamples, nFeatures, start-1, GPUN); - CudaSafeCall(cudaDeviceSynchronize()); - CudaSafeCall(cudaMemcpy(W, dW, sizeof(float) * GPUN, cudaMemcpyDeviceToHost)); + kernel2<<>>(dW, dWcurr, dX, dY, alpha, nSamples, nFeatures, start-1, GPUN); + CudaSafeCall(cudaDeviceSynchronize()); + CudaSafeCall(cudaMemcpy(W, dW, sizeof(float) * GPUN, cudaMemcpyDeviceToHost)); - CudaSafeCall(cudaFree(dX)); - CudaSafeCall(cudaFree(dY)); - CudaSafeCall(cudaFree(dW)); - CudaSafeCall(cudaFree(dWcurr)); - } + CudaSafeCall(cudaFree(dX)); + CudaSafeCall(cudaFree(dY)); + CudaSafeCall(cudaFree(dW)); + CudaSafeCall(cudaFree(dWcurr)); + } } } diff --git a/apps/logisticregression/lr.kernel.cu b/apps/logisticregression/lr.kernel.cu index 7386d6b..4277b9d 100644 --- a/apps/logisticregression/lr.kernel.cu +++ b/apps/logisticregression/lr.kernel.cu @@ -2,16 +2,16 @@ __global__ void kernel2(float *dW, float *dWcurr, float *dX, float *dY, float alpha, int nSamples, int nFeatures, int start, int N) { int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < N) { - float err = 0.0; - for (int s = 0; s < nSamples; s++) { - float arg = 0.0; - for (int f = 0; f < nFeatures; f++) { - arg += dWcurr[f] * dX[s * (nFeatures) + f]; - } - float hypo = 1 / (1 + exp(-arg)); - err += (hypo - dY[s]) * dX[s * (nFeatures) + start + id]; - } - dW[id] = dWcurr[start + id] - alpha * err; + float err = 0.0; + for (int s = 0; s < nSamples; s++) { + float arg = 0.0; + for (int f = 0; f < nFeatures; f++) { + arg += dWcurr[f] * dX[s * (nFeatures) + f]; + } + float hypo = 1 / (1 + exp(-arg)); + err += (hypo - dY[s]) * dX[s * (nFeatures) + start + id]; + } + dW[id] = dWcurr[start + id] - alpha * 
err; } } #else @@ -21,7 +21,7 @@ __global__ void kernel2(float *dW, float *dWcurr, float *dX, float *dY, float al extern "C" { #ifndef USE_LAMBDA void LaunchLR(float* dX, float *dY, float *dW, float *dWcurr, float alpha, int nSamples, int nFeatures, int start, int end, int GPUN) { - kernel2<<>>(dW, dWcurr, dX, dY, alpha, nSamples, nFeatures, start-1, GPUN); + kernel2<<>>(dW, dWcurr, dX, dY, alpha, nSamples, nFeatures, start-1, GPUN); } #else void LaunchLR(float* dX, float *dY, float *dW, float *dWcurr, float alpha, int nSamples, int nFeatures, int start, int end, int GPUN) { @@ -38,5 +38,5 @@ extern "C" { dW[id] = dWcurr[(start - 1) + id] - alpha * err; }); } -#endif +#endif } diff --git a/apps/mm/mm.cu b/apps/mm/mm.cu index 9091dbe..bb72a28 100644 --- a/apps/mm/mm.cu +++ b/apps/mm/mm.cu @@ -13,22 +13,22 @@ #define CudaCheckError() __cudaCheckError( __FILE__, __LINE__ ) long long getCurrentTime() { - struct timeval te; - gettimeofday(&te, NULL); // get current time - long long microseconds = te.tv_sec*1000000LL + te.tv_usec; - return microseconds; + struct timeval te; + gettimeofday(&te, NULL); // get current time + long long microseconds = te.tv_sec*1000000LL + te.tv_usec; + return microseconds; } inline void __cudaSafeCall( cudaError err, const char *file, const int line ) { - #ifdef CUDA_ERROR_CHECK +#ifdef CUDA_ERROR_CHECK if ( cudaSuccess != err ) { - fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s\n", - file, line, cudaGetErrorString( err ) ); - exit( -1 ); + fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s\n", + file, line, cudaGetErrorString( err ) ); + exit( -1 ); } - #endif +#endif return; } @@ -36,22 +36,22 @@ inline void __cudaSafeCall( cudaError err, const char *file, const int line ) inline void __cudaCheckError( const char *file, const int line ) { #ifdef CUDA_ERROR_CHECK - cudaError err = cudaGetLastError(); - if ( cudaSuccess != err ) + cudaError err = cudaGetLastError(); + if ( cudaSuccess != err ) { - fprintf( stderr, "cudaCheckError() 
failed at %s:%i : %s\n", - file, line, cudaGetErrorString( err ) ); - exit( -1 ); + fprintf( stderr, "cudaCheckError() failed at %s:%i : %s\n", + file, line, cudaGetErrorString( err ) ); + exit( -1 ); } - // More careful checking. However, this will affect performance. - // Comment away if needed. - err = cudaDeviceSynchronize(); - if( cudaSuccess != err ) + // More careful checking. However, this will affect performance. + // Comment away if needed. + err = cudaDeviceSynchronize(); + if( cudaSuccess != err ) { - fprintf( stderr, "cudaCheckError() with sync failed at %s:%i : %s\n", - file, line, cudaGetErrorString( err ) ); - exit( -1 ); + fprintf( stderr, "cudaCheckError() with sync failed at %s:%i : %s\n", + file, line, cudaGetErrorString( err ) ); + exit( -1 ); } #endif } @@ -59,13 +59,13 @@ inline void __cudaCheckError( const char *file, const int line ) __global__ void mm(float *dA, float *dB, float *dC, int DIM, int N, int GPUN) { int id = blockIdx.x * blockDim.x + threadIdx.x; if (id <= GPUN) { - int i = id / DIM; - int j = id % DIM; - float sum = 0.0f; - for (int k = 0; k < DIM; k++) { - sum += dA[i*DIM+k] * dB[k*DIM+j]; - } - dC[id] += sum; + int i = id / DIM; + int j = id % DIM; + float sum = 0.0f; + for (int k = 0; k < DIM; k++) { + sum += dA[i*DIM+k] * dB[k*DIM+j]; + } + dC[id] += sum; } } @@ -82,140 +82,140 @@ __global__ void mm_tiled(float *dA, float *dB, float *dC, int DIM, int N, int GP j = jt + threadIdx.x; if (i*DIM+j <= GPUN) { - float sum = 0.0f; - // per tile loop - for (kt = 0; kt < DIM; kt += 32) { - // copy to shared memory - sA[threadIdx.y][threadIdx.x] = dA[(it+threadIdx.y)*DIM + kt + threadIdx.x]; - sB[threadIdx.y][threadIdx.x] = dB[(kt+threadIdx.y)*DIM + jt + threadIdx.x]; - __syncthreads(); - - // two 32x32 small shared (dB[it + 0:31][kt + 0:31], dC[kt+0:31][jt + 0:31]) at this point - for (k = kt; k < kt+32; k++) { - sum += sA[i-it][k-kt] * sB[k-kt][j-jt]; - } - - __syncthreads(); - } - dC[i*DIM+j] = sum; + float sum = 0.0f; + // per 
tile loop + for (kt = 0; kt < DIM; kt += 32) { + // copy to shared memory + sA[threadIdx.y][threadIdx.x] = dA[(it+threadIdx.y)*DIM + kt + threadIdx.x]; + sB[threadIdx.y][threadIdx.x] = dB[(kt+threadIdx.y)*DIM + jt + threadIdx.x]; + __syncthreads(); + + // two 32x32 small shared (dB[it + 0:31][kt + 0:31], dC[kt+0:31][jt + 0:31]) at this point + for (k = kt; k < kt+32; k++) { + sum += sA[i-it][k-kt] * sB[k-kt][j-jt]; + } + + __syncthreads(); + } + dC[i*DIM+j] = sum; } } extern "C" { void mmCUDA(float* A, float *B, float *C, int N, int start, int end, int GPUN, int tiled) { - float *dA, *dB, *dC; - if (GPUN > 0) { - assert(end - start + 1 == GPUN); + float *dA, *dB, *dC; + if (GPUN > 0) { + assert(end - start + 1 == GPUN); #ifdef VERBOSE - printf("In mmCUDA\n"); - printf("\t GPUN: %d\n", GPUN); - printf("\t range: %d..%d\n", start, end); + printf("In mmCUDA\n"); + printf("\t GPUN: %d\n", GPUN); + printf("\t range: %d..%d\n", start, end); #endif #ifdef PROF - cudaEvent_t startCudaMallocEvent, endCudaMallocEvent; - cudaEvent_t startCudaMemcpyH2DEvent, endCudaMemcpyH2DEvent; - cudaEvent_t startCudaKernelEvent, endCudaKernelEvent; - cudaEvent_t startCudaMemcpyD2HEvent, endCudaMemcpyD2HEvent; - CudaSafeCall(cudaEventCreate(&startCudaMallocEvent)); - CudaSafeCall(cudaEventCreate(&endCudaMallocEvent)); - CudaSafeCall(cudaEventCreate(&startCudaMemcpyH2DEvent)); - CudaSafeCall(cudaEventCreate(&endCudaMemcpyH2DEvent)); - CudaSafeCall(cudaEventCreate(&startCudaKernelEvent)); - CudaSafeCall(cudaEventCreate(&endCudaKernelEvent)); - CudaSafeCall(cudaEventCreate(&startCudaMemcpyD2HEvent)); - CudaSafeCall(cudaEventCreate(&endCudaMemcpyD2HEvent)); + cudaEvent_t startCudaMallocEvent, endCudaMallocEvent; + cudaEvent_t startCudaMemcpyH2DEvent, endCudaMemcpyH2DEvent; + cudaEvent_t startCudaKernelEvent, endCudaKernelEvent; + cudaEvent_t startCudaMemcpyD2HEvent, endCudaMemcpyD2HEvent; + CudaSafeCall(cudaEventCreate(&startCudaMallocEvent)); + 
CudaSafeCall(cudaEventCreate(&endCudaMallocEvent)); + CudaSafeCall(cudaEventCreate(&startCudaMemcpyH2DEvent)); + CudaSafeCall(cudaEventCreate(&endCudaMemcpyH2DEvent)); + CudaSafeCall(cudaEventCreate(&startCudaKernelEvent)); + CudaSafeCall(cudaEventCreate(&endCudaKernelEvent)); + CudaSafeCall(cudaEventCreate(&startCudaMemcpyD2HEvent)); + CudaSafeCall(cudaEventCreate(&endCudaMemcpyD2HEvent)); #endif #ifdef PROF - CudaSafeCall(cudaEventRecord(startCudaMallocEvent)); + CudaSafeCall(cudaEventRecord(startCudaMallocEvent)); #endif - CudaSafeCall(cudaMalloc(&dA, sizeof(float) * GPUN)); - CudaSafeCall(cudaMalloc(&dB, sizeof(float) * N)); - CudaSafeCall(cudaMalloc(&dC, sizeof(float) * GPUN)); + CudaSafeCall(cudaMalloc(&dA, sizeof(float) * GPUN)); + CudaSafeCall(cudaMalloc(&dB, sizeof(float) * N)); + CudaSafeCall(cudaMalloc(&dC, sizeof(float) * GPUN)); #ifdef PROF - CudaSafeCall(cudaEventRecord(endCudaMallocEvent)); - CudaSafeCall(cudaEventSynchronize(endCudaMallocEvent)); + CudaSafeCall(cudaEventRecord(endCudaMallocEvent)); + CudaSafeCall(cudaEventSynchronize(endCudaMallocEvent)); #endif #ifdef PROF - CudaSafeCall(cudaEventRecord(startCudaMemcpyH2DEvent)); + CudaSafeCall(cudaEventRecord(startCudaMemcpyH2DEvent)); #endif - CudaSafeCall(cudaMemcpy(dA, A+start, sizeof(float) * GPUN, cudaMemcpyHostToDevice)); - CudaSafeCall(cudaMemcpy(dB, B, sizeof(float) * N, cudaMemcpyHostToDevice)); + CudaSafeCall(cudaMemcpy(dA, A+start, sizeof(float) * GPUN, cudaMemcpyHostToDevice)); + CudaSafeCall(cudaMemcpy(dB, B, sizeof(float) * N, cudaMemcpyHostToDevice)); #ifdef PROF - CudaSafeCall(cudaEventRecord(endCudaMemcpyH2DEvent)); - CudaSafeCall(cudaEventSynchronize(endCudaMemcpyH2DEvent)); + CudaSafeCall(cudaEventRecord(endCudaMemcpyH2DEvent)); + CudaSafeCall(cudaEventSynchronize(endCudaMemcpyH2DEvent)); #endif #ifdef PROF - CudaSafeCall(cudaEventRecord(startCudaKernelEvent)); -#endif - if (!tiled) { - mm<<>>(dA, dB, dC, ceil(sqrt(N)), N, GPUN); - } else if (tiled == 1){ - dim3 block(32,32); - 
dim3 grid(ceil(sqrt(N)/32), ceil(sqrt(N)/32)); - mm_tiled<<>>(dA, dB, dC, ceil(sqrt(N)), N, N); - } else { + CudaSafeCall(cudaEventRecord(startCudaKernelEvent)); +#endif + if (!tiled) { + mm<<>>(dA, dB, dC, ceil(sqrt(N)), N, GPUN); + } else if (tiled == 1){ + dim3 block(32,32); + dim3 grid(ceil(sqrt(N)/32), ceil(sqrt(N)/32)); + mm_tiled<<>>(dA, dB, dC, ceil(sqrt(N)), N, N); + } else { #ifdef __NVCC__ - cublasHandle_t handle; + cublasHandle_t handle; #ifdef PROF - long long start = getCurrentTime(); + long long start = getCurrentTime(); #endif - cublasCreate(&handle); - float alpha = 1.0F; - float beta = 0.0F; - int lda = sqrt(N), ldb = sqrt(N), ldc = sqrt(N); + cublasCreate(&handle); + float alpha = 1.0F; + float beta = 0.0F; + int lda = sqrt(N), ldb = sqrt(N), ldc = sqrt(N); #ifdef PROF - long long end = getCurrentTime(); - printf("cuBLAS prep: %lf msec\n", (float)(end-start)/1000); + long long end = getCurrentTime(); + printf("cuBLAS prep: %lf msec\n", (float)(end-start)/1000); #endif - cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, sqrt(N), GPUN/sqrt(N), sqrt(N), &alpha, dB, ldb, dA, lda, &beta, dC, ldc); + cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, sqrt(N), GPUN/sqrt(N), sqrt(N), &alpha, dB, ldb, dA, lda, &beta, dC, ldc); - //http://peterwittek.com/cublas-matrix-c-style.html - //C:mxn = A:mxk X B:kxn - //stat=cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,n,m,k,&a1,d_b,n,d_a,k,&bet,d_c,n); + //http://peterwittek.com/cublas-matrix-c-style.html + //C:mxn = A:mxk X B:kxn + //stat=cublasSgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,n,m,k,&a1,d_b,n,d_a,k,&bet,d_c,n); #ifdef PROF - long long end2 = getCurrentTime(); - printf("cuBLAS finish: %lf msec\n", (float)(end2-start)/1000); + long long end2 = getCurrentTime(); + printf("cuBLAS finish: %lf msec\n", (float)(end2-start)/1000); #endif #endif - } - CudaCheckError(); + } + CudaCheckError(); #ifdef PROF - CudaSafeCall(cudaEventRecord(endCudaKernelEvent)); - CudaSafeCall(cudaEventSynchronize(endCudaKernelEvent)); + 
CudaSafeCall(cudaEventRecord(endCudaKernelEvent)); + CudaSafeCall(cudaEventSynchronize(endCudaKernelEvent)); #endif - CudaSafeCall(cudaDeviceSynchronize()); + CudaSafeCall(cudaDeviceSynchronize()); #ifdef PROF - CudaSafeCall(cudaEventRecord(startCudaMemcpyD2HEvent)); + CudaSafeCall(cudaEventRecord(startCudaMemcpyD2HEvent)); #endif - CudaSafeCall(cudaMemcpy(C + start, dC, sizeof(float) * GPUN, cudaMemcpyDeviceToHost)); + CudaSafeCall(cudaMemcpy(C + start, dC, sizeof(float) * GPUN, cudaMemcpyDeviceToHost)); #ifdef PROF - CudaSafeCall(cudaEventRecord(endCudaMemcpyD2HEvent)); - CudaSafeCall(cudaEventSynchronize(endCudaMemcpyD2HEvent)); + CudaSafeCall(cudaEventRecord(endCudaMemcpyD2HEvent)); + CudaSafeCall(cudaEventSynchronize(endCudaMemcpyD2HEvent)); #endif #ifdef PROF - float msecMalloc, msecH2D, msecKernel, msecD2H; - CudaSafeCall(cudaEventElapsedTime(&msecMalloc, startCudaMallocEvent, endCudaMallocEvent)); - CudaSafeCall(cudaEventElapsedTime(&msecH2D, startCudaMemcpyH2DEvent, endCudaMemcpyH2DEvent)); - CudaSafeCall(cudaEventElapsedTime(&msecKernel, startCudaKernelEvent, endCudaKernelEvent)); - CudaSafeCall(cudaEventElapsedTime(&msecD2H, startCudaMemcpyD2HEvent, endCudaMemcpyD2HEvent)); - printf("CUDA malloc: %lf msec\n", msecMalloc); - printf("CUDA h2d: %lf msec\n", msecH2D); - printf("CUDA kernel: %lf msec\n", msecKernel); - printf("CUDA d2h: %lf msec\n", msecD2H); -#endif - - //for (int i = 0; i < GPUN; i++) { - // printf("C[%d] = %lf\n", start+i, C[start+i]); - //} - - CudaSafeCall(cudaFree(dA)); - CudaSafeCall(cudaFree(dB)); - CudaSafeCall(cudaFree(dC)); - } + float msecMalloc, msecH2D, msecKernel, msecD2H; + CudaSafeCall(cudaEventElapsedTime(&msecMalloc, startCudaMallocEvent, endCudaMallocEvent)); + CudaSafeCall(cudaEventElapsedTime(&msecH2D, startCudaMemcpyH2DEvent, endCudaMemcpyH2DEvent)); + CudaSafeCall(cudaEventElapsedTime(&msecKernel, startCudaKernelEvent, endCudaKernelEvent)); + CudaSafeCall(cudaEventElapsedTime(&msecD2H, startCudaMemcpyD2HEvent, 
endCudaMemcpyD2HEvent)); + printf("CUDA malloc: %lf msec\n", msecMalloc); + printf("CUDA h2d: %lf msec\n", msecH2D); + printf("CUDA kernel: %lf msec\n", msecKernel); + printf("CUDA d2h: %lf msec\n", msecD2H); +#endif + + //for (int i = 0; i < GPUN; i++) { + // printf("C[%d] = %lf\n", start+i, C[start+i]); + //} + + CudaSafeCall(cudaFree(dA)); + CudaSafeCall(cudaFree(dB)); + CudaSafeCall(cudaFree(dC)); + } } } diff --git a/apps/mm/mm.kernel.cu b/apps/mm/mm.kernel.cu index 60b363b..7d0816c 100644 --- a/apps/mm/mm.kernel.cu +++ b/apps/mm/mm.kernel.cu @@ -21,35 +21,35 @@ __global__ void mm(float *dA, float *dB, float *dC, int DIM, int N, int GPUN) { extern "C" { -void LaunchMM(float *A, float *B, float *C, int N, int low, int hi, int GPUN, int tiled) { - if (GPUN > 0) { - assert(hi - low + 1 == GPUN); + void LaunchMM(float *A, float *B, float *C, int N, int low, int hi, int GPUN, int tiled) { + if (GPUN > 0) { + assert(hi - low + 1 == GPUN); #ifdef VERBOSE - printf("In mmCUDA\n"); - printf("\t GPUN: %d\n", GPUN); - printf("\t range: %d..%d\n", start, end); + printf("In mmCUDA\n"); + printf("\t GPUN: %d\n", GPUN); + printf("\t range: %d..%d\n", start, end); #endif - if (!tiled) { - mm<<>>(A, B, C, ceil(sqrt(N)), N, GPUN); - } - else if(tiled == 1) { - printf("Tile not imlemented\n"); - assert(false); - } - else { + if (!tiled) { + mm<<>>(A, B, C, ceil(sqrt(N)), N, GPUN); + } + else if(tiled == 1) { + printf("Tile not imlemented\n"); + assert(false); + } + else { #ifdef __NVCC__ - printf("Using cublas\n"); - cublasHandle_t handle; + printf("Using cublas\n"); + cublasHandle_t handle; - cublasCreate(&handle); - float alpha = 1.0F; - float beta = 0.0F; - int lda = sqrt(N), ldb = sqrt(N), ldc = sqrt(N); + cublasCreate(&handle); + float alpha = 1.0F; + float beta = 0.0F; + int lda = sqrt(N), ldb = sqrt(N), ldc = sqrt(N); - cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, sqrt(N), GPUN/sqrt(N), sqrt(N), &alpha, B, ldb, A, lda, &beta, C, ldc); + cublasSgemm(handle, CUBLAS_OP_N, 
CUBLAS_OP_N, sqrt(N), GPUN/sqrt(N), sqrt(N), &alpha, B, ldb, A, lda, &beta, C, ldc); #endif + } } } -} } diff --git a/apps/stream/stream.cu b/apps/stream/stream.cu index 54381dc..43a5a4f 100644 --- a/apps/stream/stream.cu +++ b/apps/stream/stream.cu @@ -7,14 +7,14 @@ inline void __cudaSafeCall( cudaError err, const char *file, const int line ) { - #ifdef CUDA_ERROR_CHECK +#ifdef CUDA_ERROR_CHECK if ( cudaSuccess != err ) { - fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s\n", - file, line, cudaGetErrorString( err ) ); - exit( -1 ); + fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s\n", + file, line, cudaGetErrorString( err ) ); + exit( -1 ); } - #endif +#endif return; } @@ -22,35 +22,35 @@ inline void __cudaSafeCall( cudaError err, const char *file, const int line ) __global__ void stream(float *dA, float *dB, float *dC, float alpha, int N) { int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < N) { - dA[id] = dB[id] + alpha * dC[id]; + dA[id] = dB[id] + alpha * dC[id]; } } extern "C" { void streamCUDA(float* A, float *B, float *C, float alpha, int start, int end, int GPUN) { - float *dA, *dB, *dC; - if (GPUN > 0) { - assert(end - start + 1 == GPUN); + float *dA, *dB, *dC; + if (GPUN > 0) { + assert(end - start + 1 == GPUN); #ifdef VERBOSE - printf("In streamCUDA\n"); - printf("\t GPUN: %d\n", GPUN); - printf("\t range: %d..%d\n", start, end); -#endif - CudaSafeCall(cudaMalloc(&dA, sizeof(float) * GPUN)); - CudaSafeCall(cudaMalloc(&dB, sizeof(float) * GPUN)); - CudaSafeCall(cudaMalloc(&dC, sizeof(float) * GPUN)); - - CudaSafeCall(cudaMemcpy(dB, B + start, sizeof(float) * GPUN, cudaMemcpyHostToDevice)); - CudaSafeCall(cudaMemcpy(dC, C + start, sizeof(float) * GPUN, cudaMemcpyHostToDevice)); - - stream<<>>(dA, dB, dC, alpha, GPUN); - - CudaSafeCall(cudaDeviceSynchronize()); - CudaSafeCall(cudaMemcpy(A + start, dA, sizeof(float) * GPUN, cudaMemcpyDeviceToHost)); - - CudaSafeCall(cudaFree(dA)); - CudaSafeCall(cudaFree(dB)); - CudaSafeCall(cudaFree(dC)); 
- } - } + printf("In streamCUDA\n"); + printf("\t GPUN: %d\n", GPUN); + printf("\t range: %d..%d\n", start, end); +#endif + CudaSafeCall(cudaMalloc(&dA, sizeof(float) * GPUN)); + CudaSafeCall(cudaMalloc(&dB, sizeof(float) * GPUN)); + CudaSafeCall(cudaMalloc(&dC, sizeof(float) * GPUN)); + + CudaSafeCall(cudaMemcpy(dB, B + start, sizeof(float) * GPUN, cudaMemcpyHostToDevice)); + CudaSafeCall(cudaMemcpy(dC, C + start, sizeof(float) * GPUN, cudaMemcpyHostToDevice)); + + stream<<>>(dA, dB, dC, alpha, GPUN); + + CudaSafeCall(cudaDeviceSynchronize()); + CudaSafeCall(cudaMemcpy(A + start, dA, sizeof(float) * GPUN, cudaMemcpyDeviceToHost)); + + CudaSafeCall(cudaFree(dA)); + CudaSafeCall(cudaFree(dB)); + CudaSafeCall(cudaFree(dC)); + } + } } diff --git a/apps/vector_copy/vc.cu b/apps/vector_copy/vc.cu index 826fbf5..cdf765a 100644 --- a/apps/vector_copy/vc.cu +++ b/apps/vector_copy/vc.cu @@ -12,14 +12,14 @@ inline void __cudaSafeCall( cudaError err, const char *file, const int line ) { - #ifdef CUDA_ERROR_CHECK +#ifdef CUDA_ERROR_CHECK if ( cudaSuccess != err ) { - fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s\n", - file, line, cudaGetErrorString( err ) ); - exit( -1 ); + fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s\n", + file, line, cudaGetErrorString( err ) ); + exit( -1 ); } - #endif +#endif return; } @@ -27,22 +27,22 @@ inline void __cudaSafeCall( cudaError err, const char *file, const int line ) inline void __cudaCheckError( const char *file, const int line ) { #ifdef CUDA_ERROR_CHECK - cudaError err = cudaGetLastError(); - if ( cudaSuccess != err ) + cudaError err = cudaGetLastError(); + if ( cudaSuccess != err ) { - fprintf( stderr, "cudaCheckError() failed at %s:%i : %s\n", - file, line, cudaGetErrorString( err ) ); - exit( -1 ); + fprintf( stderr, "cudaCheckError() failed at %s:%i : %s\n", + file, line, cudaGetErrorString( err ) ); + exit( -1 ); } - // More careful checking. However, this will affect performance. - // Comment away if needed. 
- err = cudaDeviceSynchronize(); - if( cudaSuccess != err ) + // More careful checking. However, this will affect performance. + // Comment away if needed. + err = cudaDeviceSynchronize(); + if( cudaSuccess != err ) { - fprintf( stderr, "cudaCheckError() with sync failed at %s:%i : %s\n", - file, line, cudaGetErrorString( err ) ); - exit( -1 ); + fprintf( stderr, "cudaCheckError() with sync failed at %s:%i : %s\n", + file, line, cudaGetErrorString( err ) ); + exit( -1 ); } #endif } @@ -50,49 +50,49 @@ inline void __cudaCheckError( const char *file, const int line ) __global__ void vc(float *dA, float *dB, int N) { int id = blockIdx.x * blockDim.x + threadIdx.x; if (id < N) { - dA[id] = dB[id]; + dA[id] = dB[id]; } } extern "C" { void vcCUDA(float* A, float *B, int start, int end, int GPUN) { - float *dA, *dB; - if (GPUN > 0) { - assert(end - start + 1 == GPUN); + float *dA, *dB; + if (GPUN > 0) { + assert(end - start + 1 == GPUN); #ifdef VERBOSE - printf("In vcCUDA\n"); - printf("\t GPUN: %d\n", GPUN); - printf("\t range: %d..%d\n", start, end); + printf("In vcCUDA\n"); + printf("\t GPUN: %d\n", GPUN); + printf("\t range: %d..%d\n", start, end); #endif #ifdef PROF - cudaEvent_t startCudaKernelEvent, endCudaKernelEvent; - CudaSafeCall(cudaEventCreate(&startCudaKernelEvent)); - CudaSafeCall(cudaEventCreate(&endCudaKernelEvent)); + cudaEvent_t startCudaKernelEvent, endCudaKernelEvent; + CudaSafeCall(cudaEventCreate(&startCudaKernelEvent)); + CudaSafeCall(cudaEventCreate(&endCudaKernelEvent)); #endif - CudaSafeCall(cudaMalloc(&dA, sizeof(float) * GPUN)); - CudaSafeCall(cudaMalloc(&dB, sizeof(float) * GPUN)); - CudaSafeCall(cudaMemcpy(dB, B + start, sizeof(float) * GPUN, cudaMemcpyHostToDevice)); + CudaSafeCall(cudaMalloc(&dA, sizeof(float) * GPUN)); + CudaSafeCall(cudaMalloc(&dB, sizeof(float) * GPUN)); + CudaSafeCall(cudaMemcpy(dB, B + start, sizeof(float) * GPUN, cudaMemcpyHostToDevice)); #ifdef PROF - CudaSafeCall(cudaEventRecord(startCudaKernelEvent)); + 
CudaSafeCall(cudaEventRecord(startCudaKernelEvent)); #endif - vc<<>>(dA, dB, GPUN); + vc<<>>(dA, dB, GPUN); #ifdef PROF - CudaSafeCall(cudaEventRecord(endCudaKernelEvent)); - CudaSafeCall(cudaEventSynchronize(endCudaKernelEvent)); + CudaSafeCall(cudaEventRecord(endCudaKernelEvent)); + CudaSafeCall(cudaEventSynchronize(endCudaKernelEvent)); #endif - CudaCheckError(); - CudaSafeCall(cudaDeviceSynchronize()); - CudaSafeCall(cudaMemcpy(A + start, dA, sizeof(float) * GPUN, cudaMemcpyDeviceToHost)); + CudaCheckError(); + CudaSafeCall(cudaDeviceSynchronize()); + CudaSafeCall(cudaMemcpy(A + start, dA, sizeof(float) * GPUN, cudaMemcpyDeviceToHost)); #ifdef PROF - float msecKernel; - CudaSafeCall(cudaEventElapsedTime(&msecKernel, startCudaKernelEvent, endCudaKernelEvent)); - printf("CUDA kernel: %lf msec\n", msecKernel); + float msecKernel; + CudaSafeCall(cudaEventElapsedTime(&msecKernel, startCudaKernelEvent, endCudaKernelEvent)); + printf("CUDA kernel: %lf msec\n", msecKernel); #endif - CudaSafeCall(cudaFree(dA)); - CudaSafeCall(cudaFree(dB)); - } + CudaSafeCall(cudaFree(dA)); + CudaSafeCall(cudaFree(dB)); + } } } diff --git a/apps/vector_copy/vc.kernel.cu b/apps/vector_copy/vc.kernel.cu index c82e7f0..1b793bb 100644 --- a/apps/vector_copy/vc.kernel.cu +++ b/apps/vector_copy/vc.kernel.cu @@ -10,7 +10,7 @@ __global__ void vc(float *dA, float *dB, int N) { #endif extern "C" { -#ifndef USE_LAMBDA +#ifndef USE_LAMBDA void LaunchVC(float* dA, float *dB, int N) { vc<<>>(dA, dB, N); } @@ -18,5 +18,5 @@ extern "C" { void LaunchVC(float *dA, float *dB, int N) { call_gpu_functor(N, 1024, NULL, [=] __device__ (int i) { dA[i] = dB[i]; }); } -#endif +#endif } From d875652467a7d274ea8f0591d35847b023038fe6 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 27 Aug 2020 16:46:07 -0400 Subject: [PATCH 105/118] Update MM --- apps/mm/mm.cu | 2 +- apps/mm/mm.kernel.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/mm/mm.cu b/apps/mm/mm.cu index 
bb72a28..d971c95 100644 --- a/apps/mm/mm.cu +++ b/apps/mm/mm.cu @@ -65,7 +65,7 @@ __global__ void mm(float *dA, float *dB, float *dC, int DIM, int N, int GPUN) { for (int k = 0; k < DIM; k++) { sum += dA[i*DIM+k] * dB[k*DIM+j]; } - dC[id] += sum; + dC[id] = sum; } } diff --git a/apps/mm/mm.kernel.cu b/apps/mm/mm.kernel.cu index 7d0816c..d60c6a1 100644 --- a/apps/mm/mm.kernel.cu +++ b/apps/mm/mm.kernel.cu @@ -15,7 +15,7 @@ __global__ void mm(float *dA, float *dB, float *dC, int DIM, int N, int GPUN) { for (int k = 0; k < DIM; k++) { sum += dA[i*DIM+k] * dB[k*DIM+j]; } - dC[id] += sum; + dC[id] = sum; } } From 220040802e2be6a3cac6bd45c77fce61778056e3 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 27 Aug 2020 17:25:51 -0400 Subject: [PATCH 106/118] Update the OpenCL version of MM --- apps/mm/mm.cl | 4 +- apps/mm/mm.opencl.c | 352 ++++++++++++++++++++++---------------------- 2 files changed, 178 insertions(+), 178 deletions(-) diff --git a/apps/mm/mm.cl b/apps/mm/mm.cl index 417e919..2f9dcb4 100644 --- a/apps/mm/mm.cl +++ b/apps/mm/mm.cl @@ -3,10 +3,10 @@ __kernel void mm(__global const float *A, __global const float *B, __global floa if (id <= GPUN) { int i = id / DIM; int j = id % DIM; - int sum = 0; + float sum = 0.0f; for (int k = 0; k < DIM; k++) { sum += A[i*DIM+k] * B[k*DIM+j]; } - C[id] += sum; + C[id] = sum; } } diff --git a/apps/mm/mm.opencl.c b/apps/mm/mm.opencl.c index 72a8e0d..8a192a0 100644 --- a/apps/mm/mm.opencl.c +++ b/apps/mm/mm.opencl.c @@ -17,7 +17,7 @@ const char *getErrorString(cl_int error) { switch(error){ - // run-time and JIT compiler errors + // run-time and JIT compiler errors case 0: return "CL_SUCCESS"; case -1: return "CL_DEVICE_NOT_FOUND"; case -2: return "CL_DEVICE_NOT_AVAILABLE"; @@ -39,7 +39,7 @@ const char *getErrorString(cl_int error) case -18: return "CL_DEVICE_PARTITION_FAILED"; case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; - // compile-time errors + // compile-time errors case -30: return "CL_INVALID_VALUE"; 
case -31: return "CL_INVALID_DEVICE_TYPE"; case -32: return "CL_INVALID_PLATFORM"; @@ -80,7 +80,7 @@ const char *getErrorString(cl_int error) case -67: return "CL_INVALID_LINKER_OPTIONS"; case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; - // extension errors + // extension errors case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; case -1002: return "CL_INVALID_D3D10_DEVICE_KHR"; @@ -95,183 +95,183 @@ const char *getErrorString(cl_int error) extern "C" { #endif void mmCUDA(float* A, float *B, float *C, int N, int start, int end, int GPUN) { - if (GPUN > 0) { - assert(end - start + 1 == GPUN); + if (GPUN > 0) { + assert(end - start + 1 == GPUN); #ifdef VERBOSE - printf("In mmOCL\n"); - printf("\t GPUN: %d\n", GPUN); - printf("\t range: %d..%d\n", start, end); + printf("In mmOCL\n"); + printf("\t GPUN: %d\n", GPUN); + printf("\t range: %d..%d\n", start, end); #endif - FILE *fp; - char *source_str; - size_t source_size; - char str[1024]; - - fp = fopen("mm.cl", "r"); - if (!fp) { - fprintf(stderr, "Failed to load kernel.\n"); - exit(1); - } - source_str = (char*)malloc(MAX_SOURCE_SIZE); - source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp); - fclose( fp ); - - //printf("source: %s\n", source_str); - - // Get platform and device information - cl_platform_id platform_id = NULL; - cl_device_id device_ids[2]; - cl_uint ret_num_devices; - cl_uint ret_num_platforms; - cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); - if (ret != CL_SUCCESS) { - printf("clGetPlatformIDs %s\n", getErrorString(ret)); - } - - ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 2, device_ids, &ret_num_devices); - //printf("device ID: %d, # of devices: %d\n", ret, ret_num_devices); - int did = 0; - char *env = getenv("OCL_DEVICE_NO"); - if (env) { - did = atoi(env); - } - - cl_device_id device_id = device_ids[did]; - clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(str), str, &ret); - printf("GPU %s\n", str); 
- - // Create an OpenCL context - cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - // Create a command queue - cl_command_queue command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - // Create memory buffers on the device for each vector - cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, N * sizeof(float), NULL, &ret); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, N * sizeof(float), NULL, &ret); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, N * sizeof(float), NULL, &ret); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - cl_event h2d_event[2]; - ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, N * sizeof(float), A, 0, NULL, &h2d_event[0]); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, N * sizeof(float), B, 0, NULL, &h2d_event[1]); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - clWaitForEvents(2, h2d_event); - - // Create a program from the kernel source - cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - // Build the program - ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); + FILE *fp; + char *source_str; + size_t source_size; + char str[1024]; + + fp = fopen("mm.cl", "r"); + if (!fp) { + fprintf(stderr, "Failed to load kernel.\n"); + exit(1); + } + source_str = 
(char*)malloc(MAX_SOURCE_SIZE); + source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp); + fclose( fp ); + + //printf("source: %s\n", source_str); + + // Get platform and device information + cl_platform_id platform_id = NULL; + cl_device_id device_ids[2]; + cl_uint ret_num_devices; + cl_uint ret_num_platforms; + cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); + if (ret != CL_SUCCESS) { + printf("clGetPlatformIDs %s\n", getErrorString(ret)); + } + + ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 2, device_ids, &ret_num_devices); + //printf("device ID: %d, # of devices: %d\n", ret, ret_num_devices); + int did = 0; + char *env = getenv("OCL_DEVICE_NO"); + if (env) { + did = atoi(env); + } + + cl_device_id device_id = device_ids[did]; + clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(str), str, &ret); + printf("GPU %s\n", str); + + // Create an OpenCL context + cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + // Create a command queue + cl_command_queue command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + // Create memory buffers on the device for each vector + cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, GPUN * sizeof(float), NULL, &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, N * sizeof(float), NULL, &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, GPUN * sizeof(float), NULL, &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + cl_event h2d_event[2]; + ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, GPUN * sizeof(float), A+start, 0, NULL, &h2d_event[0]); + if (ret != CL_SUCCESS) 
{ + printf("%s\n", getErrorString(ret)); + } + ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, N * sizeof(float), B, 0, NULL, &h2d_event[1]); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + clWaitForEvents(2, h2d_event); + + // Create a program from the kernel source + cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + // Build the program + ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + // Create the OpenCL kernel + cl_kernel kernel = clCreateKernel(program, "mm", &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + // Set the arguments of the kernel + ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + int tmp = ceil(sqrt(N)); + ret = clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&tmp); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + ret = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&N); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + ret = clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&GPUN); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + // Execute the OpenCL kernel on the list + size_t local_item_size = 64; // Divide work items into groups of 64 + size_t global_item_size = local_item_size * ((GPUN + local_item_size -1) / local_item_size); // Process the entire lists + cl_event k_event; + ret = 
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, &k_event); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } else { + clWaitForEvents(1, &k_event); + } + + cl_event d2h_event; + + ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, GPUN * sizeof(float), C + start, 0, NULL, &d2h_event); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + ret = clFinish(command_queue); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + cl_ulong time_start; + cl_ulong time_end; + + // H2D + clGetEventProfilingInfo(h2d_event[0], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); + clGetEventProfilingInfo(h2d_event[0], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); + printf("H2D1 time: %lf seconds \n", (time_end-time_start) / 1000000000.0); + clGetEventProfilingInfo(h2d_event[1], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); + clGetEventProfilingInfo(h2d_event[1], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); + printf("H2D2 time: %lf seconds \n", (time_end-time_start) / 1000000000.0); + // Kernel + clGetEventProfilingInfo(k_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); + clGetEventProfilingInfo(k_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); + printf("Kernel time: %lf seconds \n", (time_end-time_start) / 1000000000.0); + // D2H + clGetEventProfilingInfo(d2h_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); + clGetEventProfilingInfo(d2h_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); + printf("D2H time: %lf seconds \n", (time_end-time_start) / 1000000000.0); } - - // Create the OpenCL kernel - cl_kernel kernel = clCreateKernel(program, "mm", &ret); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - // Set the arguments of the kernel - ret = clSetKernelArg(kernel, 0, 
sizeof(cl_mem), (void *)&a_mem_obj); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - int tmp = ceil(sqrt(N)); - ret = clSetKernelArg(kernel, 3, sizeof(cl_int), (void *)&tmp); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - ret = clSetKernelArg(kernel, 4, sizeof(cl_int), (void *)&N); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - ret = clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&N); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - // Execute the OpenCL kernel on the list - size_t local_item_size = 64; // Divide work items into groups of 64 - size_t global_item_size = local_item_size * ((N + local_item_size -1) / local_item_size); // Process the entire lists - cl_event k_event; - ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, &k_event); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } else { - clWaitForEvents(1, &k_event); - } - - cl_event d2h_event; - - ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, start * sizeof(float), GPUN * sizeof(float), C + start, 0, NULL, &d2h_event); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - ret = clFinish(command_queue); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - cl_ulong time_start; - cl_ulong time_end; - - // H2D - clGetEventProfilingInfo(h2d_event[0], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); - clGetEventProfilingInfo(h2d_event[0], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); - printf("H2D1 time: %lf seconds \n", (time_end-time_start) / 1000000000.0); 
- clGetEventProfilingInfo(h2d_event[1], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); - clGetEventProfilingInfo(h2d_event[1], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); - printf("H2D2 time: %lf seconds \n", (time_end-time_start) / 1000000000.0); - // Kernel - clGetEventProfilingInfo(k_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); - clGetEventProfilingInfo(k_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); - printf("Kernel time: %lf seconds \n", (time_end-time_start) / 1000000000.0); - // D2H - clGetEventProfilingInfo(d2h_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); - clGetEventProfilingInfo(d2h_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); - printf("D2H time: %lf seconds \n", (time_end-time_start) / 1000000000.0); - } } #ifdef __cplusplus } From 99e060bf106dcaabc724491da6641fb1582daeec Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 27 Aug 2020 17:39:57 -0400 Subject: [PATCH 107/118] Update the OpenCL version of LR --- apps/logisticregression/lr.cl | 22 +- apps/logisticregression/lr.opencl.c | 688 ++++++++++++++-------------- 2 files changed, 358 insertions(+), 352 deletions(-) diff --git a/apps/logisticregression/lr.cl b/apps/logisticregression/lr.cl index d650327..b6f3b4b 100644 --- a/apps/logisticregression/lr.cl +++ b/apps/logisticregression/lr.cl @@ -1,22 +1,22 @@ __kernel void lr1(__global float *Wcurr, __global const float *W, int n) { int id = get_global_id(0); if (id < n) { - Wcurr[id] = W[id]; + Wcurr[id] = W[id]; } } -__kernel void lr2(__global float *dW, __global const float *dWcurr, __global const float *dX, __global const float *dY, float alpha, int nSamples, int nFeatures, int N) { +__kernel void lr2(__global float *dW, __global const float *dWcurr, __global const float *dX, __global const float *dY, float alpha, int nSamples, int nFeatures, int start, int N) { int id = get_global_id(0); if (id < N) 
{ - float err = 0.0; - for (int s = 0; s < nSamples; s++) { - float arg = 0.0; - for (int f = 0; f < nFeatures; f++) { - arg += dWcurr[f] * dX[s * (nFeatures) + f]; + float err = 0.0; + for (int s = 0; s < nSamples; s++) { + float arg = 0.0; + for (int f = 0; f < nFeatures; f++) { + arg += dWcurr[f] * dX[s * (nFeatures) + f]; + } + float hypo = 1 / (1 + exp(-arg)); + err += (hypo - dY[s]) * dX[s * (nFeatures) + id]; } - float hypo = 1 / (1 + exp(-arg)); - err += (hypo - dY[s]) * dX[s * (nFeatures) + id]; - } - dW[id] = dWcurr[id] - alpha * err; + dW[id] = dWcurr[start + id] - alpha * err; } } diff --git a/apps/logisticregression/lr.opencl.c b/apps/logisticregression/lr.opencl.c index a1581f3..ecb4a45 100644 --- a/apps/logisticregression/lr.opencl.c +++ b/apps/logisticregression/lr.opencl.c @@ -17,7 +17,7 @@ const char *getErrorString(cl_int error) { switch(error){ - // run-time and JIT compiler errors + // run-time and JIT compiler errors case 0: return "CL_SUCCESS"; case -1: return "CL_DEVICE_NOT_FOUND"; case -2: return "CL_DEVICE_NOT_AVAILABLE"; @@ -39,7 +39,7 @@ const char *getErrorString(cl_int error) case -18: return "CL_DEVICE_PARTITION_FAILED"; case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; - // compile-time errors + // compile-time errors case -30: return "CL_INVALID_VALUE"; case -31: return "CL_INVALID_DEVICE_TYPE"; case -32: return "CL_INVALID_PLATFORM"; @@ -80,7 +80,7 @@ const char *getErrorString(cl_int error) case -67: return "CL_INVALID_LINKER_OPTIONS"; case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; - // extension errors + // extension errors case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; case -1002: return "CL_INVALID_D3D10_DEVICE_KHR"; @@ -95,355 +95,361 @@ const char *getErrorString(cl_int error) extern "C" { #endif void lrCUDA1(float* W, float *Wcurr, int start, int end, int GPUN) { - if (GPUN > 0) { - assert(end - start + 1 == GPUN); + if (GPUN > 0) { + assert(end - start 
+ 1 == GPUN); #ifdef VERBOSE - printf("In vcOCL\n"); - printf("\t GPUN: %d\n", GPUN); - printf("\t range: %d..%d\n", start, end); + printf("In vcOCL\n"); + printf("\t GPUN: %d\n", GPUN); + printf("\t range: %d..%d\n", start, end); #endif - FILE *fp; - char *source_str; - size_t source_size; - char str[1024]; - - fp = fopen("lr.cl", "r"); - if (!fp) { - fprintf(stderr, "Failed to load kernel.\n"); - exit(1); - } - source_str = (char*)malloc(MAX_SOURCE_SIZE); - source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp); - fclose( fp ); - - //printf("source: %s\n", source_str); - - // Get platform and device information - cl_platform_id platform_id = NULL; - cl_device_id device_ids[2]; - cl_uint ret_num_devices; - cl_uint ret_num_platforms; - cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); - if (ret != CL_SUCCESS) { - printf("clGetPlatformIDs %s\n", getErrorString(ret)); - } - - ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 2, device_ids, &ret_num_devices); - //printf("device ID: %d, # of devices: %d\n", ret, ret_num_devices); - int did = 0; - char *env = getenv("OCL_DEVICE_NO"); - if (env) { - did = atoi(env); - } - - cl_device_id device_id = device_ids[did]; - clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(str), str, &ret); - printf("GPU %s\n", str); - - // Create an OpenCL context - cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - // Create a command queue - cl_command_queue command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - // Create memory buffers on the device for each vector - cl_mem w_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, GPUN * sizeof(float), NULL, &ret); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - cl_mem wcurr_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, GPUN * 
sizeof(float), NULL, &ret); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - cl_event h2d_event; - ret = clEnqueueWriteBuffer(command_queue, w_mem_obj, CL_TRUE, 0, GPUN * sizeof(float), W + start, 0, NULL, &h2d_event); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } else { - clWaitForEvents(1, &h2d_event); - } - - // Create a program from the kernel source - cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - // Build the program - ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - // Create the OpenCL kernel - cl_kernel kernel = clCreateKernel(program, "lr1", &ret); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); + FILE *fp; + char *source_str; + size_t source_size; + char str[1024]; + + fp = fopen("lr.cl", "r"); + if (!fp) { + fprintf(stderr, "Failed to load kernel.\n"); + exit(1); + } + source_str = (char*)malloc(MAX_SOURCE_SIZE); + source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp); + fclose( fp ); + + //printf("source: %s\n", source_str); + + // Get platform and device information + cl_platform_id platform_id = NULL; + cl_device_id device_ids[2]; + cl_uint ret_num_devices; + cl_uint ret_num_platforms; + cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); + if (ret != CL_SUCCESS) { + printf("clGetPlatformIDs %s\n", getErrorString(ret)); + } + + ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 2, device_ids, &ret_num_devices); + //printf("device ID: %d, # of devices: %d\n", ret, ret_num_devices); + int did = 0; + char *env = getenv("OCL_DEVICE_NO"); + if (env) { + did = atoi(env); + } + + cl_device_id device_id = device_ids[did]; + clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(str), str, &ret); + printf("GPU %s\n", str); + + // Create an 
OpenCL context + cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + // Create a command queue + cl_command_queue command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + // Create memory buffers on the device for each vector + cl_mem w_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, GPUN * sizeof(float), NULL, &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + cl_mem wcurr_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, GPUN * sizeof(float), NULL, &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + cl_event h2d_event; + ret = clEnqueueWriteBuffer(command_queue, w_mem_obj, CL_TRUE, 0, GPUN * sizeof(float), W + start, 0, NULL, &h2d_event); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } else { + clWaitForEvents(1, &h2d_event); + } + + // Create a program from the kernel source + cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + // Build the program + ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + // Create the OpenCL kernel + cl_kernel kernel = clCreateKernel(program, "lr1", &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + // Set the arguments of the kernel + ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&wcurr_mem_obj); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&w_mem_obj); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&GPUN); + 
if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + // Execute the OpenCL kernel on the list + size_t local_item_size = 64; // Divide work items into groups of 64 + size_t global_item_size = local_item_size * ((GPUN + local_item_size -1) / local_item_size); // Process the entire lists + cl_event k_event; + ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, &k_event); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } else { + clWaitForEvents(1, &k_event); + } + + cl_event d2h_event; + ret = clEnqueueReadBuffer(command_queue, wcurr_mem_obj, CL_TRUE, 0, GPUN * sizeof(float), Wcurr + start, 0, NULL, &d2h_event); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + ret = clFinish(command_queue); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + cl_ulong time_start; + cl_ulong time_end; + + // H2D + clGetEventProfilingInfo(h2d_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); + clGetEventProfilingInfo(h2d_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); + printf("H2D time: %lf seconds \n", (time_end-time_start) / 1000000000.0); + + // Kernel + clGetEventProfilingInfo(k_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); + clGetEventProfilingInfo(k_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); + printf("Kernel time: %lf seconds \n", (time_end-time_start) / 1000000000.0); + // D2H + clGetEventProfilingInfo(d2h_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); + clGetEventProfilingInfo(d2h_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); + printf("D2H time: %lf seconds \n", (time_end-time_start) / 1000000000.0); } - - // Set the arguments of the kernel - ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&wcurr_mem_obj); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - ret = 
clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&w_mem_obj); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - ret = clSetKernelArg(kernel, 2, sizeof(cl_int), (void *)&GPUN); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - // Execute the OpenCL kernel on the list - size_t local_item_size = 64; // Divide work items into groups of 64 - size_t global_item_size = local_item_size * ((GPUN + local_item_size -1) / local_item_size); // Process the entire lists - cl_event k_event; - ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, &k_event); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } else { - clWaitForEvents(1, &k_event); - } - - cl_event d2h_event; - ret = clEnqueueReadBuffer(command_queue, wcurr_mem_obj, CL_TRUE, 0, GPUN * sizeof(float), Wcurr + start, 0, NULL, &d2h_event); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - ret = clFinish(command_queue); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - cl_ulong time_start; - cl_ulong time_end; - - // H2D - clGetEventProfilingInfo(h2d_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); - clGetEventProfilingInfo(h2d_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); - printf("H2D time: %lf seconds \n", (time_end-time_start) / 1000000000.0); - - // Kernel - clGetEventProfilingInfo(k_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); - clGetEventProfilingInfo(k_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); - printf("Kernel time: %lf seconds \n", (time_end-time_start) / 1000000000.0); - // D2H - clGetEventProfilingInfo(d2h_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); - clGetEventProfilingInfo(d2h_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); - printf("D2H time: %lf seconds \n", (time_end-time_start) / 
1000000000.0); - } } void lrCUDA2(float* X, float *Y, float *W, float *Wcurr, float alpha, int nSamples, int nFeatures, int start, int end, int GPUN) { - if (GPUN > 0) { - assert(end - start + 1 == GPUN); + if (GPUN > 0) { + assert(end - start + 1 == GPUN); #ifdef VERBOSE - printf("In vcOCL\n"); - printf("\t GPUN: %d\n", GPUN); - printf("\t range: %d..%d\n", start, end); - alpha = 0.1; + printf("In vcOCL\n"); + printf("\t GPUN: %d\n", GPUN); + printf("\t range: %d..%d\n", start, end); + alpha = 0.1; #endif - FILE *fp; - char *source_str; - size_t source_size; - char str[1024]; - - fp = fopen("lr.cl", "r"); - if (!fp) { - fprintf(stderr, "Failed to load kernel.\n"); - exit(1); - } - source_str = (char*)malloc(MAX_SOURCE_SIZE); - source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp); - fclose( fp ); - - //printf("source: %s\n", source_str); - - // Get platform and device information - cl_platform_id platform_id = NULL; - cl_device_id device_ids[2]; - cl_uint ret_num_devices; - cl_uint ret_num_platforms; - cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); - if (ret != CL_SUCCESS) { - printf("clGetPlatformIDs %s\n", getErrorString(ret)); - } - - ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 2, device_ids, &ret_num_devices); - //printf("device ID: %d, # of devices: %d\n", ret, ret_num_devices); - int did = 0; - char *env = getenv("OCL_DEVICE_NO"); - if (env) { - did = atoi(env); - } - - cl_device_id device_id = device_ids[did]; - clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(str), str, &ret); - printf("GPU %s\n", str); - - // Create an OpenCL context - cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - // Create a command queue - cl_command_queue command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); + FILE *fp; + char *source_str; + 
size_t source_size; + char str[1024]; + + fp = fopen("lr.cl", "r"); + if (!fp) { + fprintf(stderr, "Failed to load kernel.\n"); + exit(1); + } + source_str = (char*)malloc(MAX_SOURCE_SIZE); + source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp); + fclose( fp ); + + //printf("source: %s\n", source_str); + + // Get platform and device information + cl_platform_id platform_id = NULL; + cl_device_id device_ids[2]; + cl_uint ret_num_devices; + cl_uint ret_num_platforms; + cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); + if (ret != CL_SUCCESS) { + printf("clGetPlatformIDs %s\n", getErrorString(ret)); + } + + ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 2, device_ids, &ret_num_devices); + //printf("device ID: %d, # of devices: %d\n", ret, ret_num_devices); + int did = 0; + char *env = getenv("OCL_DEVICE_NO"); + if (env) { + did = atoi(env); + } + + cl_device_id device_id = device_ids[did]; + clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(str), str, &ret); + printf("GPU %s\n", str); + + // Create an OpenCL context + cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + // Create a command queue + cl_command_queue command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + // Create memory buffers on the device for each vector + cl_mem x_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, nSamples * nFeatures * sizeof(float), NULL, &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + cl_mem y_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, nSamples * sizeof(float), NULL, &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + cl_mem wcurr_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, nFeatures * sizeof(float), NULL, &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", 
getErrorString(ret)); + } + + cl_mem w_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, GPUN * sizeof(float), NULL, &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + cl_event h2d_event[3]; + ret = clEnqueueWriteBuffer(command_queue, x_mem_obj, CL_TRUE, 0, nSamples * nFeatures * sizeof(float), X, 0, NULL, &h2d_event[0]); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + ret = clEnqueueWriteBuffer(command_queue, y_mem_obj, CL_TRUE, 0, nSamples * sizeof(float), Y, 0, NULL, &h2d_event[1]); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + ret = clEnqueueWriteBuffer(command_queue, wcurr_mem_obj, CL_TRUE, 0, nFeatures * sizeof(float), Wcurr, 0, NULL, &h2d_event[2]); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + clWaitForEvents(3, h2d_event); + + // Create a program from the kernel source + cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + // Build the program + ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + // Create the OpenCL kernel + cl_kernel kernel = clCreateKernel(program, "lr2", &ret); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + // Set the arguments of the kernel + ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&w_mem_obj); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&wcurr_mem_obj); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&x_mem_obj); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + ret = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&y_mem_obj); + if (ret != CL_SUCCESS) { + 
printf("%s\n", getErrorString(ret)); + } + + ret = clSetKernelArg(kernel, 4, sizeof(cl_float), (void *)&alpha); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + ret = clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&nSamples); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + ret = clSetKernelArg(kernel, 6, sizeof(cl_int), (void *)&nFeatures); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + int tmp = start - 1; + ret = clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&tmp); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + ret = clSetKernelArg(kernel, 8, sizeof(cl_int), (void *)&GPUN); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + + // Execute the OpenCL kernel on the list + size_t local_item_size = 64; // Divide work items into groups of 64 + size_t global_item_size = local_item_size * ((GPUN + local_item_size -1) / local_item_size); // Process the entire lists + cl_event k_event; + ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, &k_event); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } else { + clWaitForEvents(1, &k_event); + } + + cl_event d2h_event; + ret = clEnqueueReadBuffer(command_queue, w_mem_obj, CL_TRUE, 0, GPUN * sizeof(float), W, 0, NULL, &d2h_event); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + ret = clFinish(command_queue); + if (ret != CL_SUCCESS) { + printf("%s\n", getErrorString(ret)); + } + cl_ulong time_start; + cl_ulong time_end; + + // H2D + clGetEventProfilingInfo(h2d_event[0], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); + clGetEventProfilingInfo(h2d_event[0], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); + printf("H2D1 time: %lf seconds \n", (time_end-time_start) / 1000000000.0); + clGetEventProfilingInfo(h2d_event[1], CL_PROFILING_COMMAND_START, sizeof(time_start), 
&time_start, NULL); + clGetEventProfilingInfo(h2d_event[1], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); + printf("H2D2 time: %lf seconds \n", (time_end-time_start) / 1000000000.0); + clGetEventProfilingInfo(h2d_event[2], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); + clGetEventProfilingInfo(h2d_event[2], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); + printf("H2D3 time: %lf seconds \n", (time_end-time_start) / 1000000000.0); + + // Kernel + clGetEventProfilingInfo(k_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); + clGetEventProfilingInfo(k_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); + printf("Kernel time: %lf seconds \n", (time_end-time_start) / 1000000000.0); + // D2H + clGetEventProfilingInfo(d2h_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); + clGetEventProfilingInfo(d2h_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); + printf("D2H time: %lf seconds \n", (time_end-time_start) / 1000000000.0); } - - // Create memory buffers on the device for each vector - cl_mem x_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, nSamples * nFeatures * sizeof(float), NULL, &ret); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - cl_mem y_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, nSamples * sizeof(float), NULL, &ret); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - cl_mem wcurr_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, nFeatures * sizeof(float), NULL, &ret); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - cl_mem w_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, GPUN * sizeof(float), NULL, &ret); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - cl_event h2d_event[3]; - ret = clEnqueueWriteBuffer(command_queue, x_mem_obj, CL_TRUE, 0, nSamples * nFeatures * sizeof(float), X, 0, NULL, &h2d_event[0]); - 
if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - ret = clEnqueueWriteBuffer(command_queue, y_mem_obj, CL_TRUE, 0, nSamples * sizeof(float), Y, 0, NULL, &h2d_event[1]); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - ret = clEnqueueWriteBuffer(command_queue, wcurr_mem_obj, CL_TRUE, 0, nFeatures * sizeof(float), Wcurr, 0, NULL, &h2d_event[2]); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - clWaitForEvents(3, h2d_event); - - // Create a program from the kernel source - cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - // Build the program - ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - // Create the OpenCL kernel - cl_kernel kernel = clCreateKernel(program, "lr2", &ret); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - // Set the arguments of the kernel - ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&w_mem_obj); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&wcurr_mem_obj); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&x_mem_obj); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - ret = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&y_mem_obj); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - ret = clSetKernelArg(kernel, 4, sizeof(cl_float), (void *)&alpha); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - ret = clSetKernelArg(kernel, 5, sizeof(cl_int), (void *)&nSamples); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - ret = clSetKernelArg(kernel, 6, 
sizeof(cl_int), (void *)&nFeatures); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - ret = clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&GPUN); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - - // Execute the OpenCL kernel on the list - size_t local_item_size = 64; // Divide work items into groups of 64 - size_t global_item_size = local_item_size * ((GPUN + local_item_size -1) / local_item_size); // Process the entire lists - cl_event k_event; - ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, &k_event); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } else { - clWaitForEvents(1, &k_event); - } - - cl_event d2h_event; - ret = clEnqueueReadBuffer(command_queue, w_mem_obj, CL_TRUE, 0, GPUN * sizeof(float), W + start, 0, NULL, &d2h_event); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - ret = clFinish(command_queue); - if (ret != CL_SUCCESS) { - printf("%s\n", getErrorString(ret)); - } - cl_ulong time_start; - cl_ulong time_end; - - // H2D - clGetEventProfilingInfo(h2d_event[0], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); - clGetEventProfilingInfo(h2d_event[0], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); - printf("H2D1 time: %lf seconds \n", (time_end-time_start) / 1000000000.0); - clGetEventProfilingInfo(h2d_event[1], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); - clGetEventProfilingInfo(h2d_event[1], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); - printf("H2D2 time: %lf seconds \n", (time_end-time_start) / 1000000000.0); - clGetEventProfilingInfo(h2d_event[2], CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); - clGetEventProfilingInfo(h2d_event[2], CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); - printf("H2D3 time: %lf seconds \n", (time_end-time_start) / 1000000000.0); - - // Kernel - 
clGetEventProfilingInfo(k_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); - clGetEventProfilingInfo(k_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); - printf("Kernel time: %lf seconds \n", (time_end-time_start) / 1000000000.0); - // D2H - clGetEventProfilingInfo(d2h_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); - clGetEventProfilingInfo(d2h_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); - printf("D2H time: %lf seconds \n", (time_end-time_start) / 1000000000.0); - } } #ifdef __cplusplus } From e1372ddd71aa6518460163865383033f99b58b52 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 27 Aug 2020 18:45:12 -0400 Subject: [PATCH 108/118] Fix segfault in the OpenCL variants --- apps/blackscholes/bs.opencl.c | 5 +++-- apps/logisticregression/lr.opencl.c | 10 ++++++---- apps/mm/mm.opencl.c | 5 +++-- apps/stream/stream.opencl.c | 5 +++-- apps/vector_copy/vc.opencl.c | 5 +++-- 5 files changed, 18 insertions(+), 12 deletions(-) diff --git a/apps/blackscholes/bs.opencl.c b/apps/blackscholes/bs.opencl.c index cbe624b..6ea90b9 100644 --- a/apps/blackscholes/bs.opencl.c +++ b/apps/blackscholes/bs.opencl.c @@ -139,8 +139,9 @@ extern "C" { } cl_device_id device_id = device_ids[did]; - clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(str), str, &ret); - printf("GPU %s\n", str); + size_t sret; + clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(str), str, &sret); + printf("clGetDeviceInfo = %ld, GPU %s\n", sret, str); // Create an OpenCL context cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); diff --git a/apps/logisticregression/lr.opencl.c b/apps/logisticregression/lr.opencl.c index ecb4a45..a35c55c 100644 --- a/apps/logisticregression/lr.opencl.c +++ b/apps/logisticregression/lr.opencl.c @@ -138,8 +138,9 @@ extern "C" { } cl_device_id device_id = device_ids[did]; - clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(str), str, &ret); - printf("GPU 
%s\n", str); + size_t sret; + clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(str), str, &sret); + printf("clGetDeviceInfo = %ld, GPU %s\n", sret, str); // Create an OpenCL context cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); @@ -290,8 +291,9 @@ extern "C" { } cl_device_id device_id = device_ids[did]; - clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(str), str, &ret); - printf("GPU %s\n", str); + size_t sret; + clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(str), str, &sret); + printf("clGetDeviceInfo = %ld, GPU %s\n", sret, str); // Create an OpenCL context cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); diff --git a/apps/mm/mm.opencl.c b/apps/mm/mm.opencl.c index 8a192a0..af5f8ed 100644 --- a/apps/mm/mm.opencl.c +++ b/apps/mm/mm.opencl.c @@ -138,8 +138,9 @@ extern "C" { } cl_device_id device_id = device_ids[did]; - clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(str), str, &ret); - printf("GPU %s\n", str); + size_t sret; + clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(str), str, &sret); + printf("clGetDeviceInfo = %ld, GPU %s\n", sret, str); // Create an OpenCL context cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); diff --git a/apps/stream/stream.opencl.c b/apps/stream/stream.opencl.c index 092fab4..1c43c68 100644 --- a/apps/stream/stream.opencl.c +++ b/apps/stream/stream.opencl.c @@ -140,8 +140,9 @@ extern "C" { } cl_device_id device_id = device_ids[did]; - clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(str), str, &ret); - printf("GPU %s\n", str); + size_t sret; + clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(str), str, &sret); + printf("clGetDeviceInfo = %ld, GPU %s\n", sret, str); // Create an OpenCL context cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); diff --git a/apps/vector_copy/vc.opencl.c b/apps/vector_copy/vc.opencl.c index 002ea1f..c62f497 100644 --- a/apps/vector_copy/vc.opencl.c +++ 
b/apps/vector_copy/vc.opencl.c @@ -138,8 +138,9 @@ extern "C" { } cl_device_id device_id = device_ids[did]; - clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(str), str, &ret); - printf("GPU %s\n", str); + size_t sret; + clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(str), str, &sret); + printf("clGetDeviceInfo = %ld, GPU %s\n", sret, str); // Create an OpenCL context cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); From 6e90303f6e8238649da66c8c207f62992cf3232d Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Thu, 27 Aug 2020 19:04:25 -0400 Subject: [PATCH 109/118] Delete trailing spaces --- src/GPUAPI.cu | 2 +- src/GPUAPI.opencl.c | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/GPUAPI.cu b/src/GPUAPI.cu index 5fbda7b..87d47ca 100644 --- a/src/GPUAPI.cu +++ b/src/GPUAPI.cu @@ -96,7 +96,7 @@ extern "C" { exit(1); } } - + void Free(void* devPtr) { CudaSafeCall(cudaFree(devPtr)); } diff --git a/src/GPUAPI.opencl.c b/src/GPUAPI.opencl.c index 42f7502..5e351ad 100644 --- a/src/GPUAPI.opencl.c +++ b/src/GPUAPI.opencl.c @@ -18,7 +18,7 @@ #ifdef __cplusplus extern "C" { #endif - + const char *openclGetErrorString(cl_int error) { switch(error){ @@ -43,7 +43,7 @@ extern "C" { case -17: return "CL_LINK_PROGRAM_FAILURE"; case -18: return "CL_DEVICE_PARTITION_FAILED"; case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; - + // compile-time errors case -30: return "CL_INVALID_VALUE"; case -31: return "CL_INVALID_DEVICE_TYPE"; @@ -105,10 +105,10 @@ extern "C" { exit( -1 ); } #endif - + return; } - + void GetDeviceCount(int *count) { cl_platform_id platforms[MAX_PLATFORM_ENTRIES]; cl_uint num_platforms; @@ -138,10 +138,10 @@ extern "C" { for (int i = 0; i < num_devices; i++) { OpenCLSafeCall(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL)); printf("GPUAPI: \tdevice[%d].NAME = %s\n", i, buffer); - } + } } } - + void GetDevice(int *device) { } @@ -175,7 +175,7 @@ extern "C" { exit(1); 
} } - + void Free(void* devPtr) { } #ifdef __cplusplus From b0256f9b52a64681e4daefdd908603e377542b64 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 28 Aug 2020 14:51:24 -0400 Subject: [PATCH 110/118] Add rst documents --- doc/rst/api/gpuapi.rst | 259 ++++++++++++++++++++++ doc/rst/api/gpuiterator.rst | 35 +++ doc/rst/conf.py | 174 +++++++++++++++ doc/rst/details/gpuapi.rst | 27 +++ doc/rst/details/gpuiterator.rst | 154 +++++++++++++ doc/rst/history/evolution.rst | 16 ++ doc/rst/index.rst | 54 +++++ doc/rst/instructions/build.rst | 81 +++++++ doc/rst/instructions/compile.rst | 95 +++++++++ doc/rst/instructions/guide.rst | 61 ++++++ doc/rst/instructions/low-mid.rst | 57 +++++ doc/rst/instructions/low.rst | 356 +++++++++++++++++++++++++++++++ doc/rst/instructions/mid.rst | 56 +++++ doc/rst/instructions/write.rst | 13 ++ 14 files changed, 1438 insertions(+) create mode 100644 doc/rst/api/gpuapi.rst create mode 100644 doc/rst/api/gpuiterator.rst create mode 100755 doc/rst/conf.py create mode 100644 doc/rst/details/gpuapi.rst create mode 100644 doc/rst/details/gpuiterator.rst create mode 100644 doc/rst/history/evolution.rst create mode 100644 doc/rst/index.rst create mode 100644 doc/rst/instructions/build.rst create mode 100644 doc/rst/instructions/compile.rst create mode 100644 doc/rst/instructions/guide.rst create mode 100644 doc/rst/instructions/low-mid.rst create mode 100644 doc/rst/instructions/low.rst create mode 100644 doc/rst/instructions/mid.rst create mode 100644 doc/rst/instructions/write.rst diff --git a/doc/rst/api/gpuapi.rst b/doc/rst/api/gpuapi.rst new file mode 100644 index 0000000..fe8908b --- /dev/null +++ b/doc/rst/api/gpuapi.rst @@ -0,0 +1,259 @@ +.. default-domain:: chpl + +=============== +GPUAPI +=============== + +MID-level API Reference +######################## + +.. class:: GPUArray + + .. method:: proc init(ref arr) + + Allocates memory on the device. 
The allocation size is automatically computed by this module -i.e., ``(arr.size: size_t) * c_sizeof(arr.eltType)``. + + :arg arr: The reference of the non-distributed Chapel Array that will be mapped onto the device. + + .. code-block:: chapel + :emphasize-lines: 6,21 + + // Example 1: Non-distributed array + var A: [1..n] int; + + proc GPUCallBack(lo: int, hi: int, N: int) { + // n * sizeof(int) will be allocated onto the device + var dA = new GPUArray(A); + ... + } + + // GPUIterator + forall i in GPU(1..n, GPUCallBack) { A(i) = ...; } + + // Example 2: Distributed array + use BlockDist; + var D: domain(1) dmapped Block(boundingBox = {1..n}) = {1..n}; + var A: [D] int; + proc GPUCallBack(lo: int, hi: int, n: int) { + // get the local portion of the distributed array + var localA = A.localSlice(lo...hi); + // n * sizeof(int) will be allocated onto the device + var dA = new GPUArray(localA); + ... + } + + // GPUIterator + forall i in GPU(D, GPUCallBack) { A(i) = ...; } + + .. note:: The allocated memory resides on the `current device`. With the ``GPUIterator``, the current device is automatically set by it. Without it, it is the user's responsibilities to set the current device (e.g., by calling the ``SetDevice`` API below). Otherwise, the default device (usually the first GPU) will be used. + + .. note:: With distributed arrays, it is required to use Chapel array's `localSlice API `_ to get the local portion of the distributed array. With the ``GPUIterator``, the local portion is already computed and given as the first two arguments (``lo`` and ``hi``). + + .. method:: toDevice() + + Transfers the contents of the Chapel array to the device. + + .. code-block:: chapel + :emphasize-lines: 3 + + proc GPUCallBack(lo: int, hi: int, n:int) { + var dA = GPUArray(A); + dA.toDevice(); + } + + .. method:: fromDevice() + + Transfers back the contents of the device array to the Chapel array. + + .. 
code-block:: chapel + :emphasize-lines: 3 + + proc GPUCallBack(lo: int, hi: int, n:int) { + var dA = GPUArray(A); + dA.fromDevice(); + } + + .. method:: free() + + Frees memory on the device. + + .. code-block:: chapel + :emphasize-lines: 3 + + proc GPUCallBack(lo: int, hi: int, n:int) { + var dA = GPUArray(A); + dA.free(); + } + + .. method:: dPtr(): c_void_ptr + + Returns a pointer to the allocated device memory. + + :returns: pointer to the allocated device memory + :rtype: `c_void_ptr` + + .. method:: hPtr(): c_void_ptr + + Returns a pointer to the head of the Chapel array. + + :returns: pointer to the head of the Chapel array + :rtype: `c_void_ptr` + + +.. method:: toDevice(args: GPUArray ...?n) + + Utility function that takes a variable number of ``GPUArray`` and performs the ``toDevice`` operation for each. + +.. method:: fromDevice(args: GPUArray ...?n) + + Utility function that takes a variable number of ``GPUArray`` and performs the ``fromDevice`` operation for each. + +.. method:: free(args: GPUArray ...?n) + + Utility function that takes a variable number of ``GPUArray`` and performs the ``free`` operation for each. + +.. code-block:: chapel + + var dA = GPUArray(A); + var dB = GPUArray(B); + var dC = GPUArray(C); + + toDevice(A, B) + .. + fromDevice(C); + free(A, B, C); + + +LOW-MID-level API Reference +############################ + +.. method:: Malloc(ref devPtr: c_void_ptr, size: size_t) + + Allocates memory on the device. + + :arg devPtr: Pointer to the allocated device array + :type devPtr: `c_voidPtr` + + :arg size: Allocation size in bytes + :type size: `size_t` + + .. code-block:: chapel + :emphasize-lines: 6,21 + + // Example 1: Non-distributed array + var A: [1..n] int; + + proc GPUCallBack(lo: int, hi: int, N: int) { + var dA: c_void_ptr; + Malloc(dA, (A.size: size_t) * c_sizeof(A.eltType)); + ... 
+ } + + // GPUIterator + forall i in GPU(1..n, GPUCallBack) { A(i) = ...; } + + // Example 2: Distributed array + use BlockDist; + var D: domain(1) dmapped Block(boundingBox = {1..n}) = {1..n}; + var A: [D] int; + proc GPUCallBack(lo: int, hi: int, n: int) { + var dA: c_void_ptr; + // get the local portion of the distributed array + var localA = A.localSlice(lo...hi); + Malloc(dA, (localA.size: size_t) * c_sizeof(localA.eltType)); + ... + } + + // GPUIterator + forall i in GPU(D, GPUCallBack) { A(i) = ...; } + + .. note:: ``c_sizeofo(A.eltType)`` returns the size in bytes of the element of the Chapel array ``A``. For more details, please refer to `this `_. + + +.. method:: Memcpy(dst: c_void_ptr, src: c_void_ptr, count: size_t, kind: int) + + Transfers data between the host and the device + + :arg dst: the desination address + :type dst: `c_void_ptr` + + :arg src: the source address + :type src: `c_void_ptr` + + :arg count: size in bytes to be transferred + :type count: `size_t` + + :arg kind: type of transfer (``0``: host-to-device, ``1``: device-to-host) + :type kind: `int` + + .. code-block:: chapel + :emphasize-lines: 7-10 + + // Non-distributed array + var A: [1..n] int; + + proc GPUCallBack(lo: int, hi: int, N: int) { + var dA: c_void_ptr; + Malloc(dA, (A.size: size_t) * c_sizeof(A.eltType)); + // host-to-device + Memcpy(dA, c_ptrTo(A), size, 0); + // device-to-host + Memcpy(c_ptrTo(A), dA, size, 1)); + } + + .. note:: ``c_ptrTo(A)`` returns a pointer to the Chapel rectangular array ``A``. For more details, see `this document `_. + + +.. method:: Free(devPtr: c_void_ptr) + + Frees memory on the device + + :arg devPtr: Device pointer to memory to be freed. + :type devPtr: `c_void_ptr` + +.. method:: GetDeviceCount(ref count: int(32)) + + Returns the number of GPU devices on the current locale. + + :arg count: the number of GPU devices + :type count: `int(32)` + + .. code-block:: chapel + + var nGPUs: int(32); + GetDeviceCount(nGPUs); + writeln(nGPUs); + +.. 
method:: GetDevice(ref id: int(32)) + + Returns the device ID currently being used. + + :arg id: the device ID current being used + :type id: `int(32)` + +.. method:: SetDevice(device: int(32)) + + Sets the device ID to be used. + + :arg id: the device ID to be used. ``id`` must be 1) greater than or equal to zero, and 2) less than the number of GPU devices. + :type id: `int(32)` + +.. method:: ProfilerStart() + + **NVIDIA GPUs Only** Start profiling with ``nvprof`` + +.. method:: ProfilerStop() + + **NVIDIA GPUs Only** Stop profiling with ``nvprof`` + + .. code-block:: chapel + + proc GPUCallBack(lo: int, hi: int, N: int) { + ProfilerStart(); + ... + ProfilerStop(); + } + +.. method:: DeviceSynchronize() + + Waits for the device to finish. diff --git a/doc/rst/api/gpuiterator.rst b/doc/rst/api/gpuiterator.rst new file mode 100644 index 0000000..1821f9a --- /dev/null +++ b/doc/rst/api/gpuiterator.rst @@ -0,0 +1,35 @@ +.. default-domain:: chpl + +=============== +GPUIterator +=============== + +.. iterfunction:: iter GPU(c: range(?), GPUCallBack: func(int, int, int, void), CPUPercent: int = 0) + + :arg c: The range to iterate over. The length of the range must be greater + than zero. + :type c: `range(?)` + + :arg GPUCallBack: The reference to a Chapel function that is invoked after the iterator has computed a subrange for the GPU portion. It must take three integers: ``lo:int, hi:int, n:int``, where ``lo`` and ``hi`` are the lower and the upper bound of the GPU portion respectively, and ``n`` is ``hi-lo+1``. + :type GPUCallBack: `func(int, int, int, void)` + + :arg CPUPercent: The percentage of the iteration space will be executed on the CPU. The default number for it is zero, meaning the whole itreration space goes to the GPU side. + :type CPUPercent: `int` + + :yields: Indices in the CPU portion of the range ``c``. + + +.. iterfunction:: iter GPU(D: domain, GPUCallBack: func(int, int, int, void), CPUPercent: int = 0) + + :arg D: The domain to iterate over. 
The length of the range must be greater + than zero. It must be a rectangular domain. Also, if ``D`` is ``dmapped``, it must be ``BlockDist``. + :type D: `domain` + + :arg GPUCallBack: The reference to a Chapel function that is invoked after the iterator has computed a subrange for the GPU portion. It must take three integers: ``lo:int, hi:int, n:int``, where ``lo`` and ``hi`` are the lower and the upper bound of the GPU portion respectively, and ``n`` is ``hi-lo+1``. + :type GPUCallBack: `func(int, int, int, void)` + + :arg CPUPercent: The percentage of the iteration space will be executed on the CPU. The default number for it is zero, meaning the whole itreration space goes to the GPU side. + :type CPUPercent: `int` + + :yields: Indices in the CPU portion of the range ``D``. + diff --git a/doc/rst/conf.py b/doc/rst/conf.py new file mode 100755 index 0000000..f178b46 --- /dev/null +++ b/doc/rst/conf.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# Chapel-GPU documentation build configuration file, created by +# sphinx-quickstart on Tue Jul 21 22:13:19 2020. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. 
They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + "sphinx_rtd_theme", + "sphinxcontrib.chapeldomain", + "sphinxcontrib.yt", + "sphinx.ext.autosectionlabel", +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'Chapel-GPU' +copyright = '2019, Rice University, 2019-2020, Georgia Institute of Technology' +author = 'Akihiro Hayashi, Sri Raj Paul, Vivek Sarkar' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '' +# The full version, including alpha/beta/rc tags. +release = '0.1' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = [] + +# The name of the Pygments (syntax highlighting) style to use. +#pygments_style = 'sphinx' + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. 
+# +html_theme = 'sphinx_rtd_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# This is required for the alabaster theme +# refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars +html_sidebars = { + '**': [ + 'relations.html', # needs 'show_related': True theme option to display + 'searchbox.html', + ] +} + + +# -- Options for HTMLHelp output ------------------------------------------ + +# Output file base name for HTML help builder. +htmlhelp_basename = 'Chapel-GPUdoc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'Chapel-GPU.tex', 'Chapel-GPU Documentation (DRAFT)', + 'Akihiro Hayashi, Sri Raj Paul, Vivek Sarkar', 'manual'), +] + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). 
+man_pages = [ + (master_doc, 'chapel-gpu', 'Chapel-GPU Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'Chapel-GPU', 'Chapel-GPU Documentation', + author, 'Chapel-GPU', 'One line description of project.', + 'Miscellaneous'), +] + + + diff --git a/doc/rst/details/gpuapi.rst b/doc/rst/details/gpuapi.rst new file mode 100644 index 0000000..9e0f8ce --- /dev/null +++ b/doc/rst/details/gpuapi.rst @@ -0,0 +1,27 @@ +.. default-domain:: chpl + +=========== +GPUAPI +=========== + +Overview +################ + +The GPUAPI module provides Chapel-level GPU API. The use of the API assumes cases where the user would like to 1) write GPU kernels in low-level GPU languages such as CUDA/HIP/OpenCL, or 2) utilize highly-tuned GPU libraries, and would like to stick with Chapel for the other parts (allocation, data transfers). Currently, it provides two tiers of GPU API: + +* `MID-level`: Provides Chapel user-friendly GPU API functions. + + * Example: ``var ga = new GPUArray(A);`` + +* `LOW-MID-level`: Provides wrapper functions for raw GPU API functions + + * Example: ``var ga: c_void_ptr = Malloc(sizeInBytes);`` + + + +Further Readings +################ + +* Exploring a multi-resolution GPU programming model for Chapel. Akihiro Hayashi, Sri Raj Paul, Vivek Sarkar, 7th Annual Chapel Implementers and Users Workshop (CHIUW), May 2020. (co-located with IPDPS2020). + + .. youtube:: Mq_vhXlSHxU diff --git a/doc/rst/details/gpuiterator.rst b/doc/rst/details/gpuiterator.rst new file mode 100644 index 0000000..3555dd2 --- /dev/null +++ b/doc/rst/details/gpuiterator.rst @@ -0,0 +1,154 @@ +.. 
default-domain:: chpl + +=========== +GPUIterator +=========== + +Overview +############ +A primary goal of this module is to provide an appropriate interface between Chapel and accelerator programs such that expert accelerator programmers can explore different variants in a portable way (i.e., CPU-only, GPU-only, X% for CPU + Y% for GPU on a single or multiple CPU+GPU node(s)). To address these challenges, here we introduce a Chapel module, ``GPUIterator``, which facilitates invoking a user-written GPU program from Chapel. Since `Chapel's data-parallel loops `_ (``forall``) fit well with GPU execution, the ``GPUIterator`` is designed to be invoked in a ``forall`` loop. Consider the following ``STREAM`` code: + +.. code-block:: chapel + + forall i in 1..n { + A(i) = B(i) + alpha * C(i); + } + + + +Assuming a GPU version of ``STREAM`` is ready (``streamCUDA`` below), the user can wrap the original iteration space in ``GPU()`` with two additional arguments: ``GPUCallBack`` is a callback function that is invoked after the module has computed a subrange for the GPU portion by using ``CPUPercent``: + +.. code-block:: chapel + :linenos: + + // A GPUIterator version + extern proc streamCUDA(A: [] real(32), B:[] real(32), C:[] real(32), + alpha: real(32), lo: int, hi: int, N: int); + var GPUCallBack = lambda(lo: int, hi: int, N: int) { + // call the GPU program with a range of lo..hi + streamCUDA(A, B, C, alpha, lo, hi, N); + }; + CPUPercent = 50; // CPU 50% + GPU 50% in this case + forall i in GPU(1..n, GPUCallBack, CPUPercent) { + A(i) = B(i) + alpha * C(i); + } + + +It is worth noting that ``GPUIterator`` supports multi-GPUs execution and `multi-locale execution `_. For multi-GPUs execution, the module automatically detects the numbers of GPUs per node (or accept a user-specified number), and invokes the callback function for each GPU, which can be done without any modification to the code above. 
For multi-locale execution, the iterator accepts a `block distributed domain `_, which allows the user to run the code above on multiple CPUs+GPUs nodes with minimal modifications. + +Why GPUIterator? +################## +Chapel offers `the C interoperability feature `_, which allows the user to invoke C/C++ functions from their Chapel programs. In the context of GPU programming in Chapel, the user typically prepares a GPU version of a ``forall`` loop written in CUDA/HIP/OpenCL and invokes it using the interoperability feature. For example, consider the following baseline ``forall`` implementation that performs ``STREAM``: + +.. code-block:: chapel + :linenos: + + // Chapel file + var A: [1..n] real(32); + var B: [1..n] real(32); + var C: [1..n] real(32); + var alpha: real(32) = 3.0; + forall i in 1..n { + A(i) = B(i) + alpha * C(i); + } + +Assuming ``streamCUDA()``, which is a full CUDA/HIP/OpenCL implementation of the ``forall``, is available, here is what the GPU version looks like: + +.. code-block:: chapel + :linenos: + + // Chapel file + // Declare an external C/C++ function which performs STREAM on GPUs + extern proc streamCUDA(A: [] real(32), B:[] real(32), C:[] real(32), + alpha: real(32), lo: int, hi: int, N: int); + + var A: [1..n] real(32); + var B: [1..n] real(32); + var C: [1..n] real(32); + var alpha: real(32); + streamCUDA(A, B, C, alpha, 1, n, n); + + +.. code-block:: c + :linenos: + + // Separate C file + void streamCUDA(float *A, float *B, float *C, + float alpha, int start, int end, int size) { + // A full GPU implementation of STREAM (CUDA/HIP/OpenCL) + // 1. device memory allocations + // 2. host-to-device data transfers + // 3. GPU kernel compilations (if needed) + // 4. GPU kernel invocations + // 5. device-to-host data transfers + // 6. 
clean up + // Note: A[0] and B[0] here corresponds to + // A(1) and B(1) in the Chapel part respectively + } + + +The key difference is that the original ``forall`` loop is replaced with the function call to the native function that includes typical host and device operations including device memory (de)allocations, data transfers, and kernel invocations. + +Unfortunately, the source code is not very portable particularly when the user wants to explore different configurations to get higher performance. One scenario is that, since GPUs are not always faster than CPUs (and vice versa), the user has to be juggling ``forall`` with ``streamCUDA()`` depending on the data size and the complexity of computations (e.g., by commenting in/out each version). + +One intuitive workaround would be to put an ``if`` statement to decide whether to use which version (CPUs or GPUs): + +.. code-block:: chapel + :linenos: + + if (cond) { + forall i in 1..n { // STREAM } + } else { + streamCUDA(...); + } + +However, this raises another problem: it is still not very portable when the user wants to do 1) multi-locale CPU+GPU execution, and 2) advanced workload distributions such as hybrid execution of the CPU and GPU versions. Specifically, WITHOUT the module, the user has to write the following code: + +.. code-block:: chapel + :linenos: + + // WITHOUT the GPUIterator module (no hybrid execution) + // suppose D is a block distributed domain + if (cond) { + forall i in D { ... } + } else { + coforall loc in Locales { + on loc { + coforall GPUID in 0..#nGPUs { + var lo = ...; // needs to be computed manually + var hi = ...; // needs to be computed manually + var localA = A.localSlice(lo..hi); + ... + // GPUID needs to be manually set before streamCUDA() is called + streamCUDA(localA, ...); + } + } + } + } + + +WITH the module, again, the code is much simpler and more portable: + +.. 
code-block:: chapel + :linenos: + + // WITH the GPUIterator module + // suppose D is a block distributed domain + var GPUCallBack = lambda(lo: int, hi: int, N: int) { + // call the GPU program with a range of lo..hi + // lo..hi is automatically computed + // the module internally and automatically sets GPUID + streamCUDA(A.localSlice(lo..hi), ...); + }; + CPUPercent = 50; // CPU 50% + GPU 50% in this case + forall i in GPU(D, GPUCallBack, CPUPercent) { + A(i) = B(i) + alpha * C(i); + } + + + +Further Readings +################ + +* GPUIterator: bridging the gap between Chapel and GPU platforms. Akihiro Hayashi, Sri Raj Paul, Vivek Sarkar, The ACM SIGPLAN 6th Annual Chapel Implementers and Users Workshop (CHIUW), June 2019. (co-located with PLDI2019/ACM FCRC2019) `DOI `_. + diff --git a/doc/rst/history/evolution.rst b/doc/rst/history/evolution.rst new file mode 100644 index 0000000..c5b828f --- /dev/null +++ b/doc/rst/history/evolution.rst @@ -0,0 +1,16 @@ +============================================= +Chapel-GPU Evolution +============================================= + +version 0.2.0, August, 2020 +############################ + +Version 0.2.0 adds the following new features to version 0.1.0: + +- Add the ``GPUAPI`` module, which provides Chapel-level GPU API and reduces the complexity of writing data (de)allocations, and transfers for GPU execution. +- Introduce a ``cmake``-based build system, which facilitates building GPU prorams on different GPU platforms (CUDA, HIP, and OpenCL). + +version 0.1.0, July, 2019 +########################### + +Version 0.1.0 provides an initial version of the ``GPUIterator`` module, which facilitates the invocation of user-provided GPU programs from Chapel programs. diff --git a/doc/rst/index.rst b/doc/rst/index.rst new file mode 100644 index 0000000..113fde3 --- /dev/null +++ b/doc/rst/index.rst @@ -0,0 +1,54 @@ +.. role:: chapel(code) + :language: chapel + +Chapel GPU Documentation +================================ + +.. 
note:: This document is mostly complete, but still under internal review. If you wish to try Chapel-GPU right now, please use `the feature/explicit branch `_. We'll soon release version 0.2.0, which includes all the features described in this document, and update `the master branch `_ accordingly. + +Overview +-------- +This document describes the following two Chapel modules that facilitate GPU programming in Chapel: + +* `GPUIterator`: A Chapel iterator that facilitates invoking user-written GPU programs (e.g., CUDA/HIP/OpenCL) from Chapel programs. It is also designed to easily perform hybrid and/or distributed execution - i.e., CPU-only, GPU-only, X% for CPU + Y% for GPU on a single or multiple CPU+GPU node(s), which helps the user to explore the best configuration. + +* `GPUAPI`: Chapel-level GPU API that allows the user to perform basic operations such as GPU memory (de)allocations, device-to-host/host-to-device transfers, and so on. This module can be used either standalone or with the GPUIterator module. Currently, the following two tiers of API are provided: + + * `MID-level`: Provides Chapel user-friendly GPU API functions. + + * Example: :chapel:`var ga = new GPUArray(A);` + + * `LOW-MID-level`: Provides wrapper functions for raw GPU API functions + + * Example: :chapel:`var ga: c_void_ptr = Malloc(sizeInBytes);` + + +Also, in this document, for categorization purposes, the term `LOW-level` is referred to a GPUIterator only version, where the GPUIterator is only used for invoking raw GPU programs in which there is no Chapel-level abstraction. + + +.. toctree:: + :maxdepth: 2 + :caption: QuickStart Instructions + + instructions/build + instructions/write + +.. toctree:: + :maxdepth: 2 + :caption: Technical Details + + details/gpuiterator + details/gpuapi + +.. toctree:: + :maxdepth: 2 + :caption: API Reference + + api/gpuiterator + api/gpuapi + +.. 
toctree:: +   :maxdepth: 2 +   :caption: History + +   history/evolution diff --git a/doc/rst/instructions/build.rst b/doc/rst/instructions/build.rst new file mode 100644 index 0000000..c70a33f --- /dev/null +++ b/doc/rst/instructions/build.rst @@ -0,0 +1,81 @@ +======================= +Building Chapel-GPU +======================= + +Prerequisites +############## + +* Chapel: 1.22 or below. Detailed instructions for installing Chapel can be found: `here `_. + +* GPU Compilers & Runtimes: GPUIterator and GPUAPI require either of the following GPU programming environments. + +  * NVIDIA CUDA: Tested with 10.2 +  * AMD HIP: Tested with 2.8 +  * OpenCL: Tested with 2.2 and 1.2 + +* Build tools + +  * cmake: 3.8.0 or above + +.. note:: While ``GPUIterator`` works with OpenCL, ``GPUAPI`` with OpenCL is under development. + +Instructions +############## + +1. Clone the repository + +.. code-block:: bash + +   git clone https://github.com/ahayashi/chapel-gpu.git + +2. Build ``libGPUAPI`` library using ``cmake`` + +.. code-block:: bash + +   cd chapel-gpu +   mkdir build +   cd build +   cmake .. +   make +   make install + +This produces the following files: + +.. csv-table:: +   :header: "File", "Type", "Destination", "Description" +   :widths: 20, 20, 20, 50 + +   env.sh, Shell Script, ``bin``, Sets environment variables. +   libGPUAPIX.so, Shared Library, ``lib``, "X is either CUDA, HIP, or OpenCL." +   libGPUAPIX_static.a, Static Library, ``lib``, "A static version of libGPUAPI. Mainly used in this document." +   GPUAPI.h, Header File, ``include``, "A header file that includes the declaration of GPUAPI functions." +   lambda.h, Header File, ``include``, "A header file that facilitates writing device lambdas." +   GPUIterator.chpl, Chapel Source File, ``modules``, "The ``GPUIterator`` module" +   GPUAPI.chpl, Chapel Source File, ``modules``, "The ``GPUAPI`` module" + + +By default, the libraries are installed into :code:`chapel-gpu/install`. 
If you wish to install it into your preferred directory, please type: + +.. code-block:: bash + + cmake -DCMAKE_INSTALL_PREFIX=path/to/your_preferred_directory .. + + +.. note:: + **For CUDA Users**: If CUDA is not found, make sure :code:`nvcc` is in your path or tell :code:`cmake` the path to :code:`nvcc`. For example: :code:`cmake -DCMAKE_CUDA_COMPILER=path_to/nvcc ..` + + **For AMD HIP Users**: Chapel-GPU relies on :code:`hipify-perl` to convert CUDA programs to HIP programs internally. If you are pretty sure HIP is installed on your system, but :code:`cmake` complains :code:`hipify-perl` is not found, consider updating the following line in :code:`CMakeLists.txt`: :code:`if(EXISTS "${HIP_ROOT_DIR}/hip/bin/hipify-perl")` + +3. source ``env.sh`` + +.. code-block:: bash + + cd .. + source ./install/bin/env.sh + +| + This sets 1) ``$CHPL_GPU_HOME``, and 2) environment variables related to CUDA/HIP/OpenCL installation directory, the latter of which can be referred to when the user creates object files for their GPU programs. + +4. Build and run a test program + + See :doc:`Compiling and running ` diff --git a/doc/rst/instructions/compile.rst b/doc/rst/instructions/compile.rst new file mode 100644 index 0000000..c792f9c --- /dev/null +++ b/doc/rst/instructions/compile.rst @@ -0,0 +1,95 @@ +============================================= +Compiling and Running Applications +============================================= + +The repository has several example applications in ``chapel-gpu/example`` and ``chapel-gpu/apps`` directory, most of which have a distributed version: + +.. 
csv-table:: + :header: "Benchmark", "Location", "Description", "Note" + :widths: 20, 20, 20, 20 + + Vector Copy, ``example`` and ``apps/vector_copy``, A simple vector kernel, + STREAM, ``apps/stream``, `A = B + alpha * C`, + BlackScholes, ``apps/blackscholes``, The Black-Scholes Equation, + Logistic Regression, ``apps/logisticregression``, A classification algorithm, + Matrix Multiplication, ``apps/mm``, Matrix-Matrix Multiply, + PageRank, ``apps/mm``, The pagerank algorithm, WIP + N-Queens, WIP, The n-queens problem, WIP + + +.. note:: This section assumes the Chapel-GPU components are already installed in ``$CHPL_GPU_HOME``. If you have not done so, please see :ref:`Building Chapel-GPU`. + +Compiling Applications +######################## + +The example applications in ``chapel-gpu/example`` and ``chapel-gpu/apps`` directory can be built by just doing ``make X``, where ``X`` is either ``cuda``, ``hip``, or ``opencl``. Please be sure to ``source`` the setting script before doing so. + +1. Set environment variables + + .. code-block:: bash + + source $CHPL_GPU_HOME/bin/env.sh + +2. Compile + + - Example 1: ``chapel-gpu/example`` + + .. code-block:: bash + + cd path/to/chapel-gpu/example + make cuda + or + make hip + or + make opencl + + - Example 2: ``chapel-gpu/apps/stream`` + + .. code-block:: bash + + cd path/to/chapel-gpu/apps/stream + make cuda + or + make hip + or + make opencl + + .. note:: A baseline implementation for CPUs can be built by doing ``make baseline``. + +3. Check the generated executables + + For example, ``make cuda`` in ``apps/vector_copy`` generates the following files: + + .. 
csv-table:: + :header: "Name", "Description", "Individual make command" + :widths: 10, 20, 20 + + ``vc.baseline``, A baseline implementation for CPUs., ``make baseline`` + ``vc.cuda.gpu``, A GPU-only implementation w/o the GPUIterator., ``make cuda.gpu`` + ``vc.cuda.hybrid``, The GPUIterator implementation (single-locale)., ``make cuda.hybrid`` + ``vc.cuda.hybrid.dist``, The GPUIterator implementation (multi-locale)., ``make cuda.hybrid.dist`` + ``vc.cuda.hybrid.dist.lowmid``, The LOW-MID implementation (multi-locale)., ``make cuda.hybrid.dist.lowmid`` + ``vc.cuda.hybrid.dist.mid``, The MID implementation (multi-locale)., ``make cuda.hybrid.dist.mid`` + + + .. tip:: If you want to compile a specific variant, please do ``make X.Y``, where ``X`` is either ``cuda``, ``hip``, or ``opencl``, and ``Y`` is either ``gpu``, ``hybrid``, ``hybrid.dist``, ``hybrid.dist.lowmid``, or ``hybrid.dist.mid``. Please also see the third column above. Also, the LOW-MID and MID variants with OpenCL are currently not supported. + + .. note:: The ``Makefile`` internally uses ``cmake`` to generate a static library from a GPU source program (``vc.cu`` in this case) + +Running Applications +##################### + +Once you have compiled a Chapel-GPU program, you can run it from the command-line: + +.. code-block:: bash + + ./vc.cuda.hybrid + +Also, many of the example applications accept the ``--n`` option, which changes input size, the ``--CPUratio`` (or ``--CPUPercent``) option, which controls the percentage of an iteration space that will be executed on CPUs, and the ``--output`` option, which outputs the result arrays. For example: + +.. code-block:: bash + + ./vc.cuda.hybrid --n=256 --CPUratio=50 --output=1 + +For multi-locale execution, please refer to `this document `_. 
+ diff --git a/doc/rst/instructions/guide.rst b/doc/rst/instructions/guide.rst new file mode 100644 index 0000000..1db55ad --- /dev/null +++ b/doc/rst/instructions/guide.rst @@ -0,0 +1,61 @@ +============================================= +Guide to Write GPU programs +============================================= + +General Guidelines +################### + +In general, GPU programs should include typical host and device operations including device memory (de)allocations, data transfers, and kernel invocations. Depending on the abstraction level you choose, some of these operations can be written in a Chapel-user-friendly way: + +.. list-table:: + :widths: 15 15 15 15 + :header-rows: 1 + + * - Level + - MID-level + - LOW-MID-level + - LOW-level + * - Kernel Invocation + - CUDA/HIP + - CUDA/HIP + - CUDA/HIP/OpenCL + * - Memory (de)allocations + - Chapel (MID) + - Chapel (LOW-MID) + - CUDA/HIP/OpenCL + * - Data transfers + - Chapel (MID) + - Chapel (LOW-MID) + - CUDA/HIP/OpenCL + + +.. seealso:: + + * :ref:`Writing MID-level programs` + * :ref:`MID-level API Reference` + * :ref:`Writing LOW-MID-level programs` + * :ref:`LOW-MID-level API Reference` + * :ref:`Writing LOW-level (GPUIterator Only) programs` + +.. note:: LOW/LOW-MID/MID levels can interoperate with each other. + + +Writing GPU program +####################################### + + +The design and implementation of a CUDA/HIP/OpenCL program that is supposed to be called from the callback function is completely up to you. However, please be aware that it can be called multiple times (i.e., the number of GPUs per locale * the number of locales) as the GPUIterator automatically and implicitly handles multiple- GPUs and locales. We'd highly recommend writing your GPU program in a way that is 1) device neutral (no device setting call) and 2) flexibile to change in iteration spaces -i.e., ``start`` and ``end`` (including data allocations and transfers). + +.. Data Transfers +.. *************** + +.. .. 
code-block:: chapel + +.. forall i in GPU(1..n, GPUCallBack) { +.. A(i) = B(i); +.. } + + +.. Write a GPU program that is flexible to adapt to different iteration spaces. + +.. is GPU ID neutral, where [DEFINITION], which improve the portability of your GPU program significantly. diff --git a/doc/rst/instructions/low-mid.rst b/doc/rst/instructions/low-mid.rst new file mode 100644 index 0000000..6c87ac4 --- /dev/null +++ b/doc/rst/instructions/low-mid.rst @@ -0,0 +1,57 @@ +.. default-domain:: chpl + +============================================= +Writing LOW-MID-level programs +============================================= + +LOW-MID-level API +###################### + +The biggest motivation for introducing ``LOW-MID`` and ``MID`` -level GPU API is moving some of low-level GPU operations to the Chapel-level. Consider the following GPU callback function and C function: + +.. code-block:: chapel + :caption: vc.hybrid.chpl + + // lo, hi, and N are automatically computed by the GPUIterator + proc GPUCallBack(lo: int, hi: int, N: int) { + vcCUDA(A, B, lo, hi, N); + } + +.. code-block:: c + :caption: vc.cu + + extern "C" { + void vcCUDA(float* A, float *B, int start, int end, int GPUN) { + float *dA, *dB; + cudaMalloc(&dA, sizeof(float) * GPUN); + cudaMalloc(&dB, sizeof(float) * GPUN); + cudaMemcpy(dB, B + start, sizeof(float) * GPUN, cudaMemcpyHostToDevice); + vc<<>>(dA, dB, GPUN); + cudaDeviceSynchronize(); + cudaMemcpy(A + start, dA, sizeof(float) * GPUN, cudaMemcpyDeviceToHost); + cudaFree(dA); + cudaFree(dB); + } + } + +At the LOW-MID-level, most of the CUDA/HIP/OpenCL-level 1) device memory allocation, 2) device synchronization, and 3) data transfer can be written in Chapel. However, it's worth noting that this level of abstraction only provides thin wrapper functions for the CUDA/HIP/OpenCL-level API functions, which requires you to directly manipulate C types like ``c_void_ptr`` and so on. 
The LOW-MID is helpful particularly when you want to fine-tune the use of GPU API, but still want to stick with Chapel. Here is an example program written with the LOW-MID-level API: + +.. code-block:: chapel + :caption: vc.hybrid.chpl + + proc GPUCallBack(lo: int, hi: int, N: int) { + var dA, dB: c_void_ptr; + var size: size_t = (lA.size:size_t * c_sizeof(lA.eltType)); + Malloc(dA, size); + Malloc(dB, size); + Memcpy(dB, c_ptrTo(lB), size, 0); + LaunchVC(dA, dB, N: size_t); + DeviceSynchronize(); + Memcpy(c_ptrTo(lA), dA, size, 1); + Free(dA); + Free(dB); + } + +.. tip:: The LOW-MID-level API can interoperate with the MID-level API. + +.. seealso:: :ref:`LOW-MID-level API Reference` diff --git a/doc/rst/instructions/low.rst b/doc/rst/instructions/low.rst new file mode 100644 index 0000000..9627740 --- /dev/null +++ b/doc/rst/instructions/low.rst @@ -0,0 +1,356 @@ +.. default-domain:: chpl + +============================================= +Writing LOW-level (GPUIterator Only) programs +============================================= + +Here we provide a step-by-step guide for utilizing the ``GPUIterator`` module using a simple Chapel program (``vector copy``) in single-locale and multiple-locale scenarios. + +Single-locale version +###################### + +In this single-locale scenario, you are supposed to create and edit one Chapel source file and one CUDA source file: ``vc.hybrid.chpl`` and ``vc.cu``. + +1. Import the GPUIterator module + + First, import the module using the ``use`` keyword: + + .. code-block:: chapel + :caption: vc.hybrid.chpl + + use GPUIterator; + +2. Declare Chapel arrays + + Then, create two Chapel arrays, ``A`` and ``B``, which will be used for the copy operation: + + .. code-block:: chapel + :caption: vc.hybrid.chpl + :emphasize-lines: 3,4,5 + + use GPUIterator; + + config const n = 32: int; + var A: [1..n] real(32); + var B: [1..n] real(32); + + + + .. 
tip:: It's wise to define ``n`` as `a configurable constant `_, which can be overridden on the command line (e.g., ``./vc --n=1024``). + +3. Import your GPU program + + a. Write a GPU program + + It is worth noting that the design and implementation of the GPU program is completely your choice. Please also see :ref:`Guide to Write GPU programs`. Here is one working vector copy example with CUDA: + + .. code-block:: c + :caption: vc.cu + + __global__ void vc(float *dA, float *dB, int N) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + if (id < N) { + dA[id] = dB[id]; + } + } + + extern "C" { + void vcCUDA(float* A, float *B, int start, int end, int GPUN) { + float *dA, *dB; + cudaMalloc(&dA, sizeof(float) * GPUN); + cudaMalloc(&dB, sizeof(float) * GPUN); + cudaMemcpy(dB, B + start, sizeof(float) * GPUN, cudaMemcpyHostToDevice); + vc<<>>(dA, dB, GPUN); + cudaDeviceSynchronize(); + cudaMemcpy(A + start, dA, sizeof(float) * GPUN, cudaMemcpyDeviceToHost); + cudaFree(dA); + cudaFree(dB); + } + } + + + + .. note:: For the presentation purposes, any error checking is omitted. A complete program can be found in ``apps`` directory. + + + + b. Declare it as an external function + + Use `Chapel's C interoperability feature `_ to declare ``vcCUDA()`` as an external function. + + .. code-block:: chapel + :caption: vc.hybrid.chpl + :emphasize-lines: 7 + + use GPUIterator; + + config const n = 32: int; + var A: [1..n] real(32); + var B: [1..n] real(32); + + extern proc vcCUDA(A: [] real(32), B: [] real(32), lo: int, hi: int, N: int); + + + .. note:: More details on the C interoperability feature can be found `here `_. + + +4. Write a GPU callback function + + The GPU callback function is supposed to be invoked from the GPUIterator with an automatically computed subrange (``lo`` and ``hi``). 
In this example, we call the external function ``vcCUDA`` with the two global arrays (``A`` and ``B``), the subrange (``lo`` and ``hi``), plus the number of elements (``N = size(lo..hi)``). + + .. code-block:: chapel + :caption: vc.hybrid.chpl + :emphasize-lines: 9,10,11,12 + + use GPUIterator; + + config const n = 32: int; + var A: [1..n] real(32); + var B: [1..n] real(32); + + extern proc vcCUDA(A: [] real(32), B: [] real(32), lo: int, hi: int, N: int); + + // lo, hi, and N are automatically computed by the GPUIterator + proc GPUCallBack(lo: int, hi: int, N: int) { + vcCUDA(A, B, lo, hi, N); + } + +.. _callback: + + It is worth noting that there will be multiple calls to ``GPUCallBack()`` when the number of GPUs is greater than one. Internally, the GPUIterator detects the number of GPUs within a locale, then automatically computes a subrange for each GPU, and creates a separate task that is responsible for each GPU. This design keeps the callback function simple and independent from GPU ID. The table below illustrates how ``GPUCallBack()`` is called when ``n=1024, nLocales=1, nGPUs=2``: + + .. list-table:: n=1024, nLocales=1, nGPUs=2 + :widths: 15 15 15 15 + :header-rows: 1 + + * - + - Locales[0] + - + - + * - + - CPUs + - GPU0 + - GPU1 + * - ``lo..hi`` + - ``1..512`` + - ``512..767`` + - ``768..1024`` + * - ``GPUCallBack(lo,hi,N);`` + - N/A + - ``GPUCallBack(512,767,256);`` + - ``GPUCallBack(768,1024,256);`` + + .. tip:: The number of GPUs can be overridden by giving the `--nGPUs=n` option (two dashes) on the command line + + .. note:: + + 1. Writing GPU ID dependent code in a callback function can be also done using the ``GetDevice`` function of the GPUAPI : + + .. code-block:: chapel + + use GPUAPI; + proc GPUCallBack(lo: int, hi:int, N:int) { + var id; + GetDevice(id); + if (id == 0) { ... } + else if ... + } + + + 2. 
While the use of a lambda function would be more productive and elegant, we'd recommend writing a Chapel function for the callback since the lambda support in Chapel is still early. + + .. code-block:: chapel + + var GPUCallBack = lambda(lo: int, hi:int, N:int) { vcCUDA(A, B, lo, hi, N); }; + forall i in GPU(1..n, GPUCallback) { ... } + + + If the this lambda version does not work, try `this workaround `_: + + .. code-block:: chapel + + + record Lambda { + proc this(lo:int, hi:int, N:int) { vcCUDA(A, B, lo, hi, N); } + } + var GPUCallBack = new Lambda(); + forall i in GPU(1..n, GPUCallback) { ... } + + +5. Invoke the ``GPU()`` iterator in a ``forall`` loop + + When writing a ``forall`` loop, simply wrap the iteration space (``1..n``) in ``GPU()`` and give the callback function (``GPUCallBack``). Here is a complete program with output verification: + + .. code-block:: chapel + :caption: vc.hybrid.chpl + :emphasize-lines: 15-18 + + use GPUIterator; + + config const n = 32: int; + var A: [1..n] real(32); + var B: [1..n] real(32); + + extern proc vcCUDA(A: [] real(32), B: [] real(32), lo: int, hi: int, N: int); + + proc GPUCallBack(lo: int, hi: int, N: int) { + vcCUDA(A, B, lo, hi, N); + } + + B = 1; + + forall i in GPU(1..n, GPUCallBack) { + // CPU Version + A(i) = B(i); + } + + if (A.equals(B)) { + writeln("Verified"); + } else { + writeln("Not Verified"); + } + + +6. Compile and Run + + See :doc:`Compiling and running ` + +Multi-locale version +###################### + +In the multi-locale scenario, you are supposed to update ``vc.hybrid.chpl`` slightly, but you can keep the GPU program (``vc.cu``) unchanged. + +0. Copy ``vc.hybrid.chpl`` to ``vc.hybrid.dist.chpl`` + +1. Add ``BlockDist`` module and replace the range with a block-distributed domain + + .. code-block:: chapel + :caption: vc.hybrid.dist.chpl + :emphasize-lines: 2 + + use GPUIterator + use BlockDist; + + + Then, declare two Chapel arrays with a block-distributed domain ``D``. + + .. 
code-block:: chapel + :caption: vc.hybrid.dist.chpl + :emphasize-lines: 2-4 + + config const n = 32: int; + var D: domain(1) dmapped Block(boundingBox = {1..n}) = {1..n}; + var A: [D] real(32); + var B: [D] real(32); + // var A: [1..n] real(32); /* single locale version */ + // var B: [1..n] real(32); /* single locale version */ + + +2. Update ``GPUCallBack`` + + .. code-block:: chapel + :caption: vc.hybrid.dist.chpl + :emphasize-lines: 3-7 + + // lo, hi, and N are automatically computed by the GPUIterator + proc GPUCallBack(lo: int, hi: int, N: int) { + // the first element of lA is lA(lo), which corresponds to A[0] in the vcCUDA part. + ref lA = A.localSlice(lo..hi); + // the first element of lB is lB(lo), which corresponds to B[0] in the vcCUDA part. + ref lB = B.localSlice(lo..hi); + vcCUDA(lA, lB, 0, hi-lo, N); + //vcCUDA(A, B, lo, hi, N); /* single locale version */ + } + + + While the code looks pretty much similar to the single-locale version, since the two arrays are distributed, the following two additional things need to be done: + + a. Using ``localSlice()`` API + + .. code-block:: chapel + + // for GPU X on locale Y, (locale- and device-neutral) + ref lA = A.localSlice(lo..hi); + + + Similar to the single-locale + multiple GPUs case discussed `above `_, multiple instances of ``GPUCallBack()`` will be invoked for each GPU on different locales. However, you can still write the callback in a way that is locale and GPU ID independent by utilizing Chapel's ``localSlice(d: domain)`` API (`link `_). Essentially, feeding the automatically computed subrange (``lo..hi``) to the API returns a proper slice of a distributed array in a specific instance of ``GPUCallBack()``. + + + b. Updating the arguments to ``vcCUDA()`` + + .. code-block:: chapel + + // call to the external GPU program + vcCUDA(lA, lB, 0, hi-lo, N); + + + Let us first explain how the local reference (say ``lA``) can be accessed in the GPU program (``vcCUDA``). 
To give you a concrete example, suppose ``n=2048, nLocales=2, CPUPercent=50``, in which ``A(1..1024)`` resides on `Locale 0`, and ``A(1025..2048)`` resides on `Locale 1`. The table below summarizes how ``lA`` corresponds to the C array (``A``) in each instance of the callback: + + .. list-table:: n=2048, nLocales=2, nGPUs=2 + :widths: 15 15 15 15 15 15 15 + :header-rows: 1 + + * - + - Locales[0] + - + - + - Locales[1] + - + - + * - + - CPUs + - GPU0 + - GPU1 + - CPUs + - GPU0 + - GPU1 + * - ``lo..hi`` + - ``1..512`` + - ``513..768`` + - ``769..1024`` + - ``1025..1536`` + - ``1537..1792`` + - ``1793..2048`` + * - ``GPUCallBack(lo,hi,N);`` + - N/A + - ``GPUCallBack(513,768,256);`` + - ``GPUCallBack(769,1024,256);`` + - N/A + - ``GPUCallBack(1537,1792,256);`` + - ``GPUCallBack(1793,2048,256);`` + * - ``lA = A.localSlice(lo..hi)`` + - N/A + - ``A.localSlice(513..768);`` + - ``A.localSlice(769..1024);`` + - N/A + - ``A.localSlice(1537..1792);`` + - ``A.localSlice(1793..2048);`` + * - ``A[0]`` in ``vcCUDA`` corresponds to + - N/A + - ``lA(513)`` + - ``lA(769)`` + - N/A + - ``lA(1537)`` + - ``lA(1793)`` + + + Notice that ``A[0]`` in ``vcCUDA(float *A, ...)`` corresponds to the first element of the local slice, which is why the third argument is zero (= ``start``) and thr fourth argument is ``hi-lo`` (= ``end``). + +3. Update ``GPU()`` + + Finally, give the distributed domain (``D``) to ``GPU()``: + + .. code-block:: chapel + :caption: vc.hybrid.dist.chpl + + forall i in GPU(D, GPUCallBack) { + //forall i in GPU(1..n, GPUCallBack) { + // CPU Version + A(i) = B(i); + } + + +4. Compile and Run + + See :doc:`Compiling and running ` + diff --git a/doc/rst/instructions/mid.rst b/doc/rst/instructions/mid.rst new file mode 100644 index 0000000..1329d8b --- /dev/null +++ b/doc/rst/instructions/mid.rst @@ -0,0 +1,56 @@ +.. 
default-domain:: chpl + +============================================= +Writing MID-level programs +============================================= + +MID-level API +###################### + +To reiterate, the biggest motivation for introducing ``LOW-MID`` and ``MID`` -level GPU API is moving some of low-level GPU operations to the Chapel-level. Consider the following GPU callback function and C function: + +.. code-block:: chapel + :caption: vc.hybrid.chpl + + // lo, hi, and N are automatically computed by the GPUIterator + proc GPUCallBack(lo: int, hi: int, N: int) { + vcCUDA(A, B, lo, hi, N); + } + +.. code-block:: c + :caption: vc.cu + + extern "C" { + void vcCUDA(float* A, float *B, int start, int end, int GPUN) { + float *dA, *dB; + cudaMalloc(&dA, sizeof(float) * GPUN); + cudaMalloc(&dB, sizeof(float) * GPUN); + cudaMemcpy(dB, B + start, sizeof(float) * GPUN, cudaMemcpyHostToDevice); + vc<<>>(dA, dB, GPUN); + cudaDeviceSynchronize(); + cudaMemcpy(A + start, dA, sizeof(float) * GPUN, cudaMemcpyDeviceToHost); + cudaFree(dA); + cudaFree(dB); + } + } + +At the MID-level, most of the CUDA/HIP/OpenCL-level 1) device memory allocation, 2) device synchronization, and 3) data transfer can be written in Chapel. Also, unlike the LOW-MID level, the MID-level API is more Chapel programmer-friendly, where you can allocate GPU memory using the ``new`` keyword and no longer need to directly manipulate C types. Here is an example program written with the MID-level API: + + +.. code-block:: chapel + :caption: vc.hybrid.chpl + + proc GPUCallBack(lo: int, hi: int, N: int) { + // n * sizeof(int) will be automatically allocated onto the device + var dA = new GPUArray(A); + var dB = new GPUArray(B); + dB.toDevice(); + LaunchVC(dA.dPtr(), dB.dPtr(), N: size_t); + dA.fromDevice(); + free(dA, dB); + } + +.. tip:: The MID-level API can interoperate with the LOW-MID-level API. + +.. 
seealso:: :ref:`MID-level API Reference` + diff --git a/doc/rst/instructions/write.rst b/doc/rst/instructions/write.rst new file mode 100644 index 0000000..36bb19c --- /dev/null +++ b/doc/rst/instructions/write.rst @@ -0,0 +1,13 @@ +================ +Using Chapel-GPU +================ + +.. toctree:: + :maxdepth: 2 + :caption: Step-by-step Guide + + low + low-mid + mid + compile + guide From 6e2a0c8d12c89339e688dfec1f7d58b5017cd879 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 28 Aug 2020 14:52:16 -0400 Subject: [PATCH 111/118] Add Makefile for rst --- doc/Makefile | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 doc/Makefile diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..ff19d34 --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = /usr/local/opt/sphinx-doc/bin/sphinx-build +SPHINXPROJ = Chapel-GPU +SOURCEDIR = rst +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) From 448c45d0a9d86bebfa16003092d60b1a5856ee81 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 28 Aug 2020 15:26:30 -0400 Subject: [PATCH 112/118] Update docs --- LICENSE | 2 +- Mason.toml | 10 ++++++---- doc/rst/conf.py | 2 +- doc/rst/instructions/build.rst | 3 +++ 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/LICENSE b/LICENSE index 6aa85c4..717b75a 100644 --- a/LICENSE +++ b/LICENSE @@ -1,5 +1,5 @@ Copyright (c) 2019, Rice University -Copyright (c) 2019, Georgia Institute of Technology +Copyright (c) 2019-2020, Georgia Institute of Technology Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/Mason.toml b/Mason.toml index 85874c6..0c8ecfb 100644 --- a/Mason.toml +++ b/Mason.toml @@ -3,11 +3,13 @@ name = "GPUIterator" version = "0.1.0" chplVersion = "1.16.0..1.22.1" +tests = ["GPUIteratorRangeTest.chpl", + "GPUIteratorZipTest.chpl"] + +# Disable the use of low-level GPU API to build and run +# examples and tests on non-GPU platforms +compopts = "-sdisableMultiGPUs" [dependencies] -[examples] -examples = ["vc.chpl"] -[examples.vc] -compopts = "-sdisableMultiGPUs" \ No newline at end of file diff --git a/doc/rst/conf.py b/doc/rst/conf.py index f178b46..6792fc8 100755 --- a/doc/rst/conf.py +++ b/doc/rst/conf.py @@ -62,7 +62,7 @@ # The short X.Y version. version = '' # The full version, including alpha/beta/rc tags. -release = '0.1' +release = '0.2' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/doc/rst/instructions/build.rst b/doc/rst/instructions/build.rst index c70a33f..dd814e7 100644 --- a/doc/rst/instructions/build.rst +++ b/doc/rst/instructions/build.rst @@ -28,6 +28,9 @@ Instructions git clone https://github.com/ahayashi/chapel-gpu.git + .. 
note:: For now, please use the ``feature/explicit`` branch: + ``git fetch`` and then ``git checkout feature/explicit``. + 2. Build ``libGPUAPI`` library using ``cmake`` .. code-block:: bash From 65f9319a674dfcdf7e899cb4c4915891aacb043c Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 28 Aug 2020 15:29:07 -0400 Subject: [PATCH 113/118] Update docs --- doc/rst/instructions/build.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/rst/instructions/build.rst b/doc/rst/instructions/build.rst index dd814e7..98b888a 100644 --- a/doc/rst/instructions/build.rst +++ b/doc/rst/instructions/build.rst @@ -28,8 +28,7 @@ Instructions git clone https://github.com/ahayashi/chapel-gpu.git - .. note:: For now, please use the ``feature/explicit`` branch: - ``git fetch`` and then ``git checkout feature/explicit``. +.. note:: For now, please use the ``feature/explicit`` branch: ``git fetch`` and then ``git checkout feature/explicit``. 2. Build ``libGPUAPI`` library using ``cmake`` From 7fd47d8df624085b5cd14e5309af60dcaf2f1236 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 28 Aug 2020 15:36:09 -0400 Subject: [PATCH 114/118] Update docs --- README.md | 195 ++---------------------------------------------------- 1 file changed, 6 insertions(+), 189 deletions(-) diff --git a/README.md b/README.md index 3fbdda4..99df5ed 100644 --- a/README.md +++ b/README.md @@ -1,194 +1,11 @@ -# GPUIterator +# Chapel-GPU: GPUIterator and GPUAPI module for Chapel -## Summary -A primary goal of this module is to provide an appropriate interface between Chapel and accelerator programs such that expert accelerator programmers can explore different variants in a portable way (i.e., CPU-only, GPU-only, X% for CPU + Y% for GPU on a single or multiple CPU+GPU node(s)). 
To address these challenges, here we introduce a Chapel module, ```GPUIterator```, which can be invoked in a ```forall``` loop like this: -```chapel -// original forall -forall i in 1..n { } - -// The GPUIterator -forall i in GPU(1..n, GPUWrapper, CPUPercent) { } -``` - -## Motivation -Chapel allows expert GPU programmers to develop manually prepared GPU programs that can be callable from a Chapel program. This can be done by invoking CUDA, OpenCL, or other C/C++ based acclerator programs using [the C interoperability feature](https://chapel-lang.org/docs/master/technotes/extern.html). - -To understand this, consider the following baseline ```forall``` implementation that performs vector copy: -```chapel -// Chapel file -var A: [1..n] real(32); -var B: [1..n] real(32); -forall i in 1..n { - A(i) = B(i); -} -``` - -The equivalent Chapel+GPU code is shown below: -```chapel -// Chapel file -extern proc GPUfunc(A: [] real(32), B: [] real(32), - lo: int, hi: int); - -var A: [1..n] real(32); -var B: [1..n] real(32); -GPUfunc(A, B, 1, n); -``` - -```c -// Separate C file -void GPUfunc(float *A, float *B, int start, int end) { - // GPU Implementation (CUDA/OpenCL API) - // 1. device memory allocations - // 2. host-to-device data transfers - // 3. GPU kernel compilations (if needed) - // 4. GPU kernel invocations - // 5. device-to-host data transfers - // 6. clean up - // Note: A[0] and B[0] here corresponds to - // A(1) and B(1) in the Chapel part respectively -} -``` - -The key difference is that the original ```forall``` loop is replaced with the function call to the native function that should include typical host and device operations including device memory allocations, data transfers, and kernel invocations. - -Unfortunately, the source code is not very portable particularly when the user wants to explore different variants to get higher performance. 
Since GPUs are not always faster than CPUs (and vice versa), the user has to be juggling ```forall``` with ```GPUfunc()``` depending on the data size and the complexity of the computation (e.g., by commenting in/out each version). One intuitive workaround is to put an if statement to decide whether to use which version (CPUs or GPUs). However, this raises another problem: it is still not very portable when doing 1) multi-locale CPU+GPU execution, and 2) further advanced workload distributions such as hybrid execution of the CPU and GPU versions, the latter of which could give additional performance improvement for a certain class of applications and platforms. - -One may argue that it is still technically possible to do so at the user-level. For multi-locale GPU execution, we could do like this with appropriate arguments to ```GPUfunc``` - i.e., a local portion of a distributed array, and a subspace of original iteration space ```coforall loc in Locales { on loc { GPUfunc(...); } }```. For hybrid CPU+GPU execution, one could create c tasks and g tasks that take care of a subspace of the original iteration space per locale, where c and g are the numbers of CPUs and GPUs. However, that is what we want to let the ```GPUIterator``` to reduce the complexity of the user-level code. 
- -## How to Use the GPUIterator -Here is an example code of the GPUIterator: - -```chapel -use GPUIterator; - -extern proc GPUfunc(A: [] real(32), B: [] real(32), - lo:int, hi: int, N: int); - -var A: [1..n] real(32); -var B: [1..n] real(32); - -// Users need to prepare a callback function which is -// invoked after the GPUIterator has computed the GPU portion -var GPUWrapper = lambda (lo:int, hi: int, n: int) { - GPUfunc(A, B, lo, hi, n); -}; -var CPUPercent = 50; // CPUPercent is optional -forall i in GPU(1..n, GPUWrapper, CPUPercent) { - // CPU code - A(i) = B(i); -} -``` - -```c -// Separate C file -void GPUfunc(float *A, float *B, int start, int end, int n) { - // GPU Implementation (CUDA/OpenCL API) - // 1. device memory allocations - // 2. host-to-device data transfers - // 3. GPU kernel compilations (if needed) - // 4. GPU kernel invocations - // 5. device-to-host data transfers - // 6. clean up - // Note: A[0] and B[0] here corresponds to - // A(1) and B(1) in the Chapel part respectively -} -``` - -You need to 1) import the GPUIterator module, 2) create a wrapper function (```GPUWrapper```) which is a callback function invoked after the module has created a task for the GPU portion of the iteration space (```lo```, ```hi```, ```n```) and eventually invokes the GPU function (```GPUfunc```), 3) then wrap the iteration space using ```GPU()``` with the wrapper function ```GPUWrapper```. Note that the last argument (```CPUPercent```), the percentage of the iteration space will be executed on the CPU, is optional. The default number for it is zero, meaning the whole itreration space goes to the GPU side. - -It is worth noting that the GPUIterator gives freedom to you of designing ```GPUfunc()```. In addition to the automatically computed numbers (```lo```, ```hi```, and ```n```), you are required to give appropriate arguments so that the GPU part can work properly. We will discuss how to write the GPU part below. 
(Note: ```n``` is actually redundant and prepared for verification purposes because it can be computed by ```hi-lo+1```. ```n``` may be deleted in future releases.) - -~~Also, currently you need to use [our Chapel compiler](https://github.com/ahayashi/chapel/tree/gpu-iterator) that includes the GPU locale model tailored for this module. Define```CHPL_LOCALE_MODEL=gpu``` when compiling a Chapel program with ```GPUIterator```.~~ You do not need to set ```CHPL_LOCALE_MODEL=gpu``` unless you are working with ```release/beta``` branch. - -## How to Compile Your Chapel Programs with the GPUIterator -Here we explain how to compile a Chapel program with the GPUIterator. In the following, ```$CHPL_GPU_HOME``` represents the top directory of this repository and we take the vector copy code in ```$CHPL_GPU_HOME/apps/vector_copy``` as an example. - -### Create an object file with a GPU compiler -First, create an object file for the GPU program by compiling it with the ```-c``` option: -```bash -$ cd $CHPL_GPU_HOME/apps/vector_copy -// CUDA -$ nvcc -O3 -arch sm_60 -std=c++11 -c vc.cu -o vc.gpu.o -// OpenCL -$ gcc -O3 -c -std=c++11 -c vc.opencl.c -o vc.gpu.o -``` - -### Create an executable with the Chapel compiler -Then, compile the Chapel program (```vc.hybrid.chpl```) with the object file. You will also required to give the path to the GPUIterator module with the ```-M``` option unless it is in the module search path [$CHPL_MODULE_PATH](https://chapel-lang.org/docs/master/technotes/module_search.html): -```bash -// CUDA -$ chpl --fast -M $CHPL_GPU_HOME/chapel-gpu/src vc.hybrid.chpl vc.gpu.o -lcudart -lcuda -// OpenCL -$ chpl --fast -M $CHPL_GPU_HOME/chapel-gpu/src vc.hybrid.chpl vc.gpu.o -lOpenCL -``` - -Depending on your setting, it may be required to give the ```-L``` option to let the Chapel compiler know the location of CUDA/OpenCL libraries (e.g., ```-L/usr/local/cuda/lib64```). - -### Run -Now you are ready to run the application. 
Since the CPU/GPU percentage is defined as [a configurable constant](https://chapel-lang.org/docs/master/users-guide/base/configs.html), you can explore different variants easily (CPU-only, GPU-only, X% for CPU + Y% for GPU). - -```bash -// CPU:0%, GPU:100% -$ ./vc.hybrid -// CPU:50%, GPU:50% -$ ./vc.hybrid --CPUPercent=50 -// CPU:100%, GPU:0% -$ ./vc.hybrid --CPUPercent=100 -``` - -## Guide to Write GPU programs with the GPUIterator -In summary, GPU programs for the GPUIterator should include typical host and device operations including device memory allocations, data transfers, and kernel invocations, which is pretty much the same as typical CUDA/OpenCL programs with the exception that 1) input/output data are passed from/to the Chapel part, and 2) the GPU portion of the original iteration space are provided. Here is a complete example of the GPU part for the vector copy program: - -```c -// Separate .cu file - -// CUDA kernel for Vector Copy -__global__ void vc(float *dA, float *dB, int N) { - int id = blockIdx.x * blockDim.x + threadIdx.x; - if (id < N) { - dA[id] = dB[id]; - } -} - -void GPUfunc(float *A, float *B, int start, int end, int n) { - // GPU Implementation (CUDA/OpenCL) - // Note: A[0] and B[0] here corresponds to - // A(1) and B(1) in the Chapel part respectively - assert(end - start + 1 == n) - if (n > 0) { - // device memory allocation - cudaMalloc(&dA, sizeof(float) * n)); - cudaMalloc(&dB, sizeof(float) * n)); - - // Optimization 1: only transferring the array B because A will be updated on the device - cudaMemcpy(dB, B + start, sizeof(float) * n, cudaMemcpyHostToDevice); - - // kernel invocation - vc<<>>(dA, dB, n); - - // wait for the completion of the kernel invocation - cudaDeviceSynchronize(); - - // Optimization 2: only transferring back the array A because B is not updated on the device - cudaMemcpy(A + start, dA, sizeof(float) * n, cudaMemcpyDeviceToHost)); - - // device memory deallocation - cudaFree(dA); - cudaFree(dB); - } - -} -``` 
- -Again, the code above is pretty much the same as typical CUDA programs. Thanks to the C interoperability feature, the Chapel arrays A and B can be treated as just C pointers, and they can be directly passed to CUDA API functions. Thus, all you have to do is to make sure that 1) the GPU kernel works only for the given subspace of the original iteration space, and 2) pass all the required data to the ```GPUfunc``` from the Chapel side. - -Please note that the above example does data transfer optimizations where there is no host-to-device transfer of ```A``` or device-to-host transfer of ```B```. Also, it only allocates and transfers a subarray of ```A``` and ```B``` and invokes the kernel with the subspace since they are safe to do so in this example. However, please be careful about how to optimize your GPU program because the legality of doing so depends on a kernel. For more details, please see the applications in the ```app``` directory. - -### Using Other Accelerators -Techinically, the GPUIterator can invoke other accelerators' programs (e.g., FPGAs) as long as programmers prepare them in accordance with the instruction above. No modication to the Chapel part is required. +## Documents +Detailed description of the modules can be found [here](https://ahayashi.github.io/chapel-gpu/index.html). ## License -The GPUIterator module is developed and released under the terms of the Apache 2.0 license. See the LICENSE file in this directory for details. +The GPUIterator and GPUAPI modules are developed and released under the terms of the Apache 2.0 license. See the LICENSE file in this directory for details. ## Further Readings -"[GPUIterator: Bridging the Gap between Chapel and GPU Platforms](https://cpb-us-e1.wpmucdn.com/blogs.rice.edu/dist/1/2385/files/2019/06/CHIUW19-Chapel-GPUIterator.pdf)", Akihiro Hayashi, Sri Raj Paul, Vivek Sarkar, The ACM SIGPLAN 6th Annual Chapel Implementers and Users Workshop (CHIUW), June 2019. (co-located with PLDI2019/ACM FCRC2019). 
[slides](https://www.slideshare.net/ahayashi10/gpuiterator-bridging-the-gap-between-chapel-and-gpu-platforms). +- "[GPUIterator: Bridging the Gap between Chapel and GPU Platforms](https://cpb-us-e1.wpmucdn.com/blogs.rice.edu/dist/1/2385/files/2019/06/CHIUW19-Chapel-GPUIterator.pdf)", Akihiro Hayashi, Sri Raj Paul, Vivek Sarkar, The ACM SIGPLAN 6th Annual Chapel Implementers and Users Workshop (CHIUW), June 2019. (co-located with PLDI2019/ACM FCRC2019). [slides](https://www.slideshare.net/ahayashi10/gpuiterator-bridging-the-gap-between-chapel-and-gpu-platforms). +- "[Exploring a multi-resolution GPU programming model for Chapel](https://ieeexplore.ieee.org/document/9150427)" Akihiro Hayashi, Sri Raj Paul, Vivek Sarkar, The 7th Annual Chapel Implementers and Users Workshop (CHIUW), May 2020. \ No newline at end of file From 6edd008ea8c2e2857737d96ea50588c9342f8ecd Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Fri, 28 Aug 2020 15:36:48 -0400 Subject: [PATCH 115/118] Update docs --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 99df5ed..a076dd5 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,11 @@ -# Chapel-GPU: GPUIterator and GPUAPI module for Chapel +## Chapel-GPU: GPUIterator and GPUAPI module for Chapel -## Documents +### Documents Detailed description of the modules can be found [here](https://ahayashi.github.io/chapel-gpu/index.html). -## License +### License The GPUIterator and GPUAPI modules are developed and released under the terms of the Apache 2.0 license. See the LICENSE file in this directory for details. -## Further Readings +### Further Readings - "[GPUIterator: Bridging the Gap between Chapel and GPU Platforms](https://cpb-us-e1.wpmucdn.com/blogs.rice.edu/dist/1/2385/files/2019/06/CHIUW19-Chapel-GPUIterator.pdf)", Akihiro Hayashi, Sri Raj Paul, Vivek Sarkar, The ACM SIGPLAN 6th Annual Chapel Implementers and Users Workshop (CHIUW), June 2019. 
(co-located with PLDI2019/ACM FCRC2019). [slides](https://www.slideshare.net/ahayashi10/gpuiterator-bridging-the-gap-between-chapel-and-gpu-platforms). - "[Exploring a multi-resolution GPU programming model for Chapel](https://ieeexplore.ieee.org/document/9150427)" Akihiro Hayashi, Sri Raj Paul, Vivek Sarkar, The 7th Annual Chapel Implementers and Users Workshop (CHIUW), May 2020. \ No newline at end of file From 9c97f28caee09a8c62e96666b760efabd23220aa Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Mon, 14 Sep 2020 09:55:08 +0900 Subject: [PATCH 116/118] Support domain(1) and add tests --- Mason.toml | 4 +- example/vc.opencl.c | 7 +- src/GPUAPI.opencl.c | 294 +++++++++--------- src/GPUIterator.chpl | 72 +++++ test/GPUIteratorDomainTest.chpl | 41 +++ test/GPUIteratorDomainZipTest.chpl | 41 +++ ...Test.chpl => GPUIteratorRangeZipTest.chpl} | 0 7 files changed, 315 insertions(+), 144 deletions(-) create mode 100644 test/GPUIteratorDomainTest.chpl create mode 100644 test/GPUIteratorDomainZipTest.chpl rename test/{GPUIteratorZipTest.chpl => GPUIteratorRangeZipTest.chpl} (100%) diff --git a/Mason.toml b/Mason.toml index 0c8ecfb..5aea905 100644 --- a/Mason.toml +++ b/Mason.toml @@ -4,7 +4,9 @@ name = "GPUIterator" version = "0.1.0" chplVersion = "1.16.0..1.22.1" tests = ["GPUIteratorRangeTest.chpl", - "GPUIteratorZipTest.chpl"] + "GPUIteratorDomainTest.chpl", + "GPUIteratorRangeZipTest.chpl", + "GPUIteratorDomainZipTest.chpl"] # Disable the use of low-level GPU API to build and run # examples and tests on non-GPU platforms diff --git a/example/vc.opencl.c b/example/vc.opencl.c index 1959831..a747552 100644 --- a/example/vc.opencl.c +++ b/example/vc.opencl.c @@ -10,6 +10,8 @@ #include #endif +#undef PROF + #define MAX_SOURCE_SIZE (0x100000) #ifdef __cplusplus @@ -53,8 +55,9 @@ extern "C" { cl_device_id device_id = device_ids[did]; size_t sret; clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(str), str, &sret); +#ifdef PROF printf("clGetDeviceInfo = %ld, GPU %s\n", 
sret, str); - +#endif // Create an OpenCL context cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); if (ret != CL_SUCCESS) { @@ -140,6 +143,7 @@ extern "C" { if (ret != CL_SUCCESS) { printf("%s\n", openclGetErrorString(ret)); } +#if PROF cl_ulong time_start; cl_ulong time_end; @@ -156,6 +160,7 @@ extern "C" { clGetEventProfilingInfo(d2h_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); clGetEventProfilingInfo(d2h_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); printf("D2H time: %lf seconds \n", (time_end-time_start) / 1000000000.0); +#endif } #ifdef __cplusplus } diff --git a/src/GPUAPI.opencl.c b/src/GPUAPI.opencl.c index 5e351ad..3a8dbc5 100644 --- a/src/GPUAPI.opencl.c +++ b/src/GPUAPI.opencl.c @@ -15,169 +15,179 @@ #define OpenCLSafeCall( err ) __OpenCLSafeCall( err, __FILE__, __LINE__ ) #define OpenCLCheckError() __OpenCLCheckError( __FILE__, __LINE__ ) +#undef DEBUG + #ifdef __cplusplus extern "C" { #endif - const char *openclGetErrorString(cl_int error) - { - switch(error){ - // run-time and JIT compiler errors - case 0: return "CL_SUCCESS"; - case -1: return "CL_DEVICE_NOT_FOUND"; - case -2: return "CL_DEVICE_NOT_AVAILABLE"; - case -3: return "CL_COMPILER_NOT_AVAILABLE"; - case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; - case -5: return "CL_OUT_OF_RESOURCES"; - case -6: return "CL_OUT_OF_HOST_MEMORY"; - case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE"; - case -8: return "CL_MEM_COPY_OVERLAP"; - case -9: return "CL_IMAGE_FORMAT_MISMATCH"; - case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; - case -11: return "CL_BUILD_PROGRAM_FAILURE"; - case -12: return "CL_MAP_FAILURE"; - case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; - case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; - case -15: return "CL_COMPILE_PROGRAM_FAILURE"; - case -16: return "CL_LINKER_NOT_AVAILABLE"; - case -17: return "CL_LINK_PROGRAM_FAILURE"; - case -18: return "CL_DEVICE_PARTITION_FAILED"; - 
case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; - - // compile-time errors - case -30: return "CL_INVALID_VALUE"; - case -31: return "CL_INVALID_DEVICE_TYPE"; - case -32: return "CL_INVALID_PLATFORM"; - case -33: return "CL_INVALID_DEVICE"; - case -34: return "CL_INVALID_CONTEXT"; - case -35: return "CL_INVALID_QUEUE_PROPERTIES"; - case -36: return "CL_INVALID_COMMAND_QUEUE"; - case -37: return "CL_INVALID_HOST_PTR"; - case -38: return "CL_INVALID_MEM_OBJECT"; - case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; - case -40: return "CL_INVALID_IMAGE_SIZE"; - case -41: return "CL_INVALID_SAMPLER"; - case -42: return "CL_INVALID_BINARY"; - case -43: return "CL_INVALID_BUILD_OPTIONS"; - case -44: return "CL_INVALID_PROGRAM"; - case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; - case -46: return "CL_INVALID_KERNEL_NAME"; - case -47: return "CL_INVALID_KERNEL_DEFINITION"; - case -48: return "CL_INVALID_KERNEL"; - case -49: return "CL_INVALID_ARG_INDEX"; - case -50: return "CL_INVALID_ARG_VALUE"; - case -51: return "CL_INVALID_ARG_SIZE"; - case -52: return "CL_INVALID_KERNEL_ARGS"; - case -53: return "CL_INVALID_WORK_DIMENSION"; - case -54: return "CL_INVALID_WORK_GROUP_SIZE"; - case -55: return "CL_INVALID_WORK_ITEM_SIZE"; - case -56: return "CL_INVALID_GLOBAL_OFFSET"; - case -57: return "CL_INVALID_EVENT_WAIT_LIST"; - case -58: return "CL_INVALID_EVENT"; - case -59: return "CL_INVALID_OPERATION"; - case -60: return "CL_INVALID_GL_OBJECT"; - case -61: return "CL_INVALID_BUFFER_SIZE"; - case -62: return "CL_INVALID_MIP_LEVEL"; - case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; - case -64: return "CL_INVALID_PROPERTY"; - case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; - case -66: return "CL_INVALID_COMPILER_OPTIONS"; - case -67: return "CL_INVALID_LINKER_OPTIONS"; - case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; - - // extension errors - case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; - case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; - case 
-1002: return "CL_INVALID_D3D10_DEVICE_KHR"; - case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR"; - case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"; - case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"; - default: return "Unknown OpenCL error"; + const char *openclGetErrorString(cl_int error) + { + switch(error){ + // run-time and JIT compiler errors + case 0: return "CL_SUCCESS"; + case -1: return "CL_DEVICE_NOT_FOUND"; + case -2: return "CL_DEVICE_NOT_AVAILABLE"; + case -3: return "CL_COMPILER_NOT_AVAILABLE"; + case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case -5: return "CL_OUT_OF_RESOURCES"; + case -6: return "CL_OUT_OF_HOST_MEMORY"; + case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case -8: return "CL_MEM_COPY_OVERLAP"; + case -9: return "CL_IMAGE_FORMAT_MISMATCH"; + case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case -11: return "CL_BUILD_PROGRAM_FAILURE"; + case -12: return "CL_MAP_FAILURE"; + case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; + case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; + case -15: return "CL_COMPILE_PROGRAM_FAILURE"; + case -16: return "CL_LINKER_NOT_AVAILABLE"; + case -17: return "CL_LINK_PROGRAM_FAILURE"; + case -18: return "CL_DEVICE_PARTITION_FAILED"; + case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; + + // compile-time errors + case -30: return "CL_INVALID_VALUE"; + case -31: return "CL_INVALID_DEVICE_TYPE"; + case -32: return "CL_INVALID_PLATFORM"; + case -33: return "CL_INVALID_DEVICE"; + case -34: return "CL_INVALID_CONTEXT"; + case -35: return "CL_INVALID_QUEUE_PROPERTIES"; + case -36: return "CL_INVALID_COMMAND_QUEUE"; + case -37: return "CL_INVALID_HOST_PTR"; + case -38: return "CL_INVALID_MEM_OBJECT"; + case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case -40: return "CL_INVALID_IMAGE_SIZE"; + case -41: return "CL_INVALID_SAMPLER"; + case -42: return "CL_INVALID_BINARY"; + case -43: return "CL_INVALID_BUILD_OPTIONS"; + case -44: return 
"CL_INVALID_PROGRAM"; + case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; + case -46: return "CL_INVALID_KERNEL_NAME"; + case -47: return "CL_INVALID_KERNEL_DEFINITION"; + case -48: return "CL_INVALID_KERNEL"; + case -49: return "CL_INVALID_ARG_INDEX"; + case -50: return "CL_INVALID_ARG_VALUE"; + case -51: return "CL_INVALID_ARG_SIZE"; + case -52: return "CL_INVALID_KERNEL_ARGS"; + case -53: return "CL_INVALID_WORK_DIMENSION"; + case -54: return "CL_INVALID_WORK_GROUP_SIZE"; + case -55: return "CL_INVALID_WORK_ITEM_SIZE"; + case -56: return "CL_INVALID_GLOBAL_OFFSET"; + case -57: return "CL_INVALID_EVENT_WAIT_LIST"; + case -58: return "CL_INVALID_EVENT"; + case -59: return "CL_INVALID_OPERATION"; + case -60: return "CL_INVALID_GL_OBJECT"; + case -61: return "CL_INVALID_BUFFER_SIZE"; + case -62: return "CL_INVALID_MIP_LEVEL"; + case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; + case -64: return "CL_INVALID_PROPERTY"; + case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; + case -66: return "CL_INVALID_COMPILER_OPTIONS"; + case -67: return "CL_INVALID_LINKER_OPTIONS"; + case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; + + // extension errors + case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; + case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; + case -1002: return "CL_INVALID_D3D10_DEVICE_KHR"; + case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR"; + case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"; + case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"; + default: return "Unknown OpenCL error"; + } } - } - void __OpenCLSafeCall( cl_int err, const char *file, const int line ) { + void __OpenCLSafeCall( cl_int err, const char *file, const int line ) { #ifdef OPENCL_ERROR_CHECK - if ( CL_SUCCESS != err ) - { - fprintf( stderr, "OpenCLSafeCall() failed at %s:%i : %s\n", - file, line, openclGetErrorString( err ) ); - exit( -1 ); - } + if ( CL_SUCCESS != err ) + { + fprintf( stderr, "OpenCLSafeCall() failed at %s:%i : %s\n", + file, line, 
openclGetErrorString( err ) ); + exit( -1 ); + } #endif - return; - } - - void GetDeviceCount(int *count) { - cl_platform_id platforms[MAX_PLATFORM_ENTRIES]; - cl_uint num_platforms; - OpenCLSafeCall(clGetPlatformIDs(MAX_PLATFORM_ENTRIES, platforms, &num_platforms)); - printf("GPUAPI: %d OpenCL platform(s) found\n", num_platforms); - char *env = getenv("CHPL_GPU_PLATFORM_ID"); - int specified_pid = -1; - if (env) { - specified_pid = atoi(env); - printf("GPUAPI: CHPL_GPU_PLATFORM_ID is specified: %d\n", specified_pid); - } else { - specified_pid = 0; - printf("GPUAPI: CHPL_GPU_PLATFORM_ID is NOT specified. Set to 0\n"); + return; } - *count = 0; - for (int i = 0; i < num_platforms; i++) { - char buffer[1024]; - OpenCLSafeCall(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 1024, buffer, NULL)); - printf("GPUAPI: platform[%d].VENDOR = %s\n", i, buffer); - cl_device_id devices[MAX_DEVICE_ENTRIES]; - cl_uint num_devices; - OpenCLSafeCall(clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, MAX_DEVICE_ENTRIES, devices, &num_devices)); - printf("GPUAPI: \t%d OpenCL device(s)\n", num_devices); - if (specified_pid == i) { - *count = num_devices; - } - for (int i = 0; i < num_devices; i++) { - OpenCLSafeCall(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL)); - printf("GPUAPI: \tdevice[%d].NAME = %s\n", i, buffer); - } + + void GetDeviceCount(int *count) { + cl_platform_id platforms[MAX_PLATFORM_ENTRIES]; + cl_uint num_platforms; + OpenCLSafeCall(clGetPlatformIDs(MAX_PLATFORM_ENTRIES, platforms, &num_platforms)); +#ifdef DEBUG + printf("GPUAPI: %d OpenCL platform(s) found\n", num_platforms); +#endif + char *env = getenv("CHPL_GPU_PLATFORM_ID"); + int specified_pid = -1; + if (env) { + specified_pid = atoi(env); + printf("GPUAPI: CHPL_GPU_PLATFORM_ID is specified: %d\n", specified_pid); + } else { + specified_pid = 0; + printf("GPUAPI: CHPL_GPU_PLATFORM_ID is NOT specified. 
Set to 0\n"); + } + *count = 0; + for (int i = 0; i < num_platforms; i++) { + char buffer[1024]; + OpenCLSafeCall(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 1024, buffer, NULL)); +#ifdef DEBUG + printf("GPUAPI: platform[%d].VENDOR = %s\n", i, buffer); +#endif + cl_device_id devices[MAX_DEVICE_ENTRIES]; + cl_uint num_devices; + OpenCLSafeCall(clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, MAX_DEVICE_ENTRIES, devices, &num_devices)); +#ifdef DEBUG + printf("GPUAPI: \t%d OpenCL device(s)\n", num_devices); +#endif + if (specified_pid == i) { + *count = num_devices; + } + for (int i = 0; i < num_devices; i++) { + OpenCLSafeCall(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL)); +#ifdef DEBUG + printf("GPUAPI: \tdevice[%d].NAME = %s\n", i, buffer); +#endif + } + } } - } - void GetDevice(int *device) { + void GetDevice(int *device) { - } + } - void SetDevice(int device) { + void SetDevice(int device) { - } + } - void ProfilerStart() { - } + void ProfilerStart() { + } - void ProfilerStop() { - } + void ProfilerStop() { + } - void DeviceSynchronize() { - } + void DeviceSynchronize() { + } - void Malloc(void** devPtr, size_t size) { - } + void Malloc(void** devPtr, size_t size) { + } - void Memcpy(void* dst, void* src, size_t count, int kind) { - switch (kind) { - case 0: - //CudaSafeCall(cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice)); - break; - case 1: - //CudaSafeCall(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost)); - break; - default: - printf("Fatal: Wrong Memcpy kind!\n"); - exit(1); - } - } + void Memcpy(void* dst, void* src, size_t count, int kind) { + switch (kind) { + case 0: + //CudaSafeCall(cudaMemcpy(dst, src, count, cudaMemcpyHostToDevice)); + break; + case 1: + //CudaSafeCall(cudaMemcpy(dst, src, count, cudaMemcpyDeviceToHost)); + break; + default: + printf("Fatal: Wrong Memcpy kind!\n"); + exit(1); + } + } - void Free(void* devPtr) { - } + void Free(void* devPtr) { + } #ifdef __cplusplus } #endif diff --git 
a/src/GPUIterator.chpl b/src/GPUIterator.chpl index d81e5d0..931c0cd 100644 --- a/src/GPUIterator.chpl +++ b/src/GPUIterator.chpl @@ -316,6 +316,78 @@ module GPUIterator { } } + // leader (domain) + iter GPU(param tag: iterKind, + D: domain(1), + GPUWrapper, + CPUPercent: int = 0 + ) + where tag == iterKind.leader { + + if (debugGPUIterator) then + writeln("[DEBUG GPUITERATOR] In GPUIterator (leader range)"); + + var r = D.low..D.high; + const (CPURange, GPURange) = computeSubranges(r, CPUPercent); + for i in createTaskAndYield(tag, r, CPURange, GPURange, GPUWrapper) { + yield i; + } + } + + // follower (domain) + iter GPU(param tag: iterKind, + D: domain(1), + GPUWrapper, + CPUPercent: int = 0, + followThis + ) + where tag == iterKind.follower + && followThis.size == 1 { + + // index-neutral + const (followInds,) = followThis; + const lowBasedIters = followInds.translate(D.low); + + if (debugGPUIterator) { + writeln("[DEBUG GPUITERATOR] GPUIterator (follower)"); + writeln("[DEBUG GPUITERATOR] Follower received ", followThis, " as work chunk; shifting to ", + lowBasedIters); + } + + for i in lowBasedIters do + yield i; + } + + // standalone (domain) + iter GPU(param tag: iterKind, + D: domain(1), + GPUWrapper, + CPUPercent: int = 0 + ) + where tag == iterKind.standalone { + + if (debugGPUIterator) then + writeln("[DEBUG GPUITERATOR] In GPUIterator (standalone)"); + + var r = D.low..D.high; + const (CPURange, GPURange) = computeSubranges(r, CPUPercent); + for i in createTaskAndYield(tag, r, CPURange, GPURange, GPUWrapper) { + yield i; + } + } + + // serial iterators (domain) + iter GPU(D: domain(1), + GPUWrapper, + CPUPercent: int = 0 + ) { + if (debugGPUIterator) then + writeln("[DEBUG GPUITERATOR] In GPUIterator (serial)"); + + for i in D do + yield i; + } + // leader (range) iter GPU(param tag: iterKind, r: range(?), diff --git a/test/GPUIteratorDomainTest.chpl b/test/GPUIteratorDomainTest.chpl new file mode 100644 index 0000000..fc7a232 --- /dev/null +++ 
b/test/GPUIteratorDomainTest.chpl @@ -0,0 +1,41 @@ +use GPUIterator; + +const n = 1024; +var A: [1..n] real(32); +var B: [1..n] real(32); + +for CPUPercent in (0, 25, 50, 75, 100) { + + for i in 1..n { + A(i) = -1: real(32); + B(i) = i: real(32); + } + + var GPUCallBack = lambda(lo: int, hi: int, nElems: int) { + if (hi-lo+1 != nElems) { + exit(1); + } + // this is where an external GPU function is supposed to be invoked + // for testing purpose, do nothing + }; + + // Vector Copy with GPUIterator + var D: domain(1) = {1..n}; + forall i in GPU(D, GPUCallBack, CPUPercent) { + A(i) = B(i); + } + + // verify + for i in 1..n { + if (i <= n * CPUPercent/100) { + if (A(i) != i) { + exit(1); + } + } else { + if (A(i) != -1) { + exit(1); + } + } + } + writeln("CPUPercent: ", CPUPercent, " (Verified)"); +} diff --git a/test/GPUIteratorDomainZipTest.chpl b/test/GPUIteratorDomainZipTest.chpl new file mode 100644 index 0000000..bd227d1 --- /dev/null +++ b/test/GPUIteratorDomainZipTest.chpl @@ -0,0 +1,41 @@ +use GPUIterator; + +const n = 1024; +var A: [1..n] real(32); +var B: [1..n] real(32); + +for CPUPercent in (0, 25, 50, 75, 100) { + + for i in 1..n { + A(i) = -1: real(32); + B(i) = i: real(32); + } + + var GPUCallBack = lambda(lo: int, hi: int, nElems: int) { + if (hi-lo+1 != nElems) { + exit(1); + } + // this is where an external GPU function is supposed to be invoked + // for testing purpose, do nothing + }; + + // Vector Copy with GPUIterator + var D: domain(1) = {1..n}; + forall (_, a, b) in zip(GPU(D, GPUCallBack, CPUPercent), A, B) { + a = b; + } + + // verify + for i in 1..n { + if (i <= n * CPUPercent/100) { + if (A(i) != i) { + exit(1); + } + } else { + if (A(i) != -1) { + exit(1); + } + } + } + writeln("CPUPercent: ", CPUPercent, " (Verified)"); +} diff --git a/test/GPUIteratorZipTest.chpl b/test/GPUIteratorRangeZipTest.chpl similarity index 100% rename from test/GPUIteratorZipTest.chpl rename to test/GPUIteratorRangeZipTest.chpl From 
ee7b0ae31e675b6b3043b55d4dee1ac75a239b70 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Sun, 13 Sep 2020 21:12:42 -0400 Subject: [PATCH 117/118] Update doc --- doc/rst/index.rst | 2 -- doc/rst/instructions/build.rst | 2 -- 2 files changed, 4 deletions(-) diff --git a/doc/rst/index.rst b/doc/rst/index.rst index 113fde3..13a63a7 100644 --- a/doc/rst/index.rst +++ b/doc/rst/index.rst @@ -4,8 +4,6 @@ Chapel GPU Documentation ================================ -.. note:: This document is mostly complete, but still under internal review. If you wish to try Chapel-GPU right now, please use `the feature/explicit branch `_. We'll soon release version 0.2.0, which includes all the features described in this document, and update `the master branch `_ accordingly. - Overview -------- This document describes the following two Chapel modules that facilitate GPU programming in Chapel: diff --git a/doc/rst/instructions/build.rst b/doc/rst/instructions/build.rst index 98b888a..c70a33f 100644 --- a/doc/rst/instructions/build.rst +++ b/doc/rst/instructions/build.rst @@ -28,8 +28,6 @@ Instructions git clone https://github.com/ahayashi/chapel-gpu.git -.. note:: For now, please use the ``feature/explicit`` branch: ``git fetch`` and then ``git checkout feature/explicit``. - 2. Build ``libGPUAPI`` library using ``cmake`` .. code-block:: bash From 79d1f80827fa4503322bf06f5f0a320d222125d2 Mon Sep 17 00:00:00 2001 From: Akihiro Hayashi Date: Sun, 13 Sep 2020 21:22:43 -0400 Subject: [PATCH 118/118] Update doc --- doc/rst/instructions/build.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/rst/instructions/build.rst b/doc/rst/instructions/build.rst index c70a33f..4717baf 100644 --- a/doc/rst/instructions/build.rst +++ b/doc/rst/instructions/build.rst @@ -46,7 +46,7 @@ This produces the following files: :widths: 20, 20, 20, 50 env.sh, Shell Script, ``bin``, Sets environment variables. - libGPUAPIX.so, Shared Library, ``lib``, "X is eiher CUDA, HIP, or OpenCL." 
+ libGPUAPIX.so, Shared Library, ``lib``, "X is either CUDA, HIP, or OpenCL." libGPUAPIX_static.a, Static Library, ``lib``, "A static version of libGPUAPI. Mainly used in this document." GPUAPI.h, Header File, ``include``, "A header file that includes the declaration of GPUAPI functions." lambda.h, Header File, ``include``, "A header file that facilitates writing device lambdas."