From c0941a81164e14ecc4fba415e4964cc4282b561f Mon Sep 17 00:00:00 2001 From: Dan Giles Date: Wed, 14 Jul 2021 13:43:05 +0100 Subject: [PATCH 1/4] Test version of the spherical polar --- sp/NumericalFluxes_sph.h | 17 +-- sp/computeFluxes.h | 8 +- sp/computeFluxes_sph.h | 222 +++++++++++++++++++-------------------- volna_init/mesh.hpp | 84 +++++++++++---- 4 files changed, 187 insertions(+), 144 deletions(-) diff --git a/sp/NumericalFluxes_sph.h b/sp/NumericalFluxes_sph.h index beee7f0..58946e5 100644 --- a/sp/NumericalFluxes_sph.h +++ b/sp/NumericalFluxes_sph.h @@ -1,25 +1,26 @@ inline void NumericalFluxes_sph(float *left, //OP_INC float *right, //OP_INC const float *leftcellCenters, const float *rightcellCenters, + // const float *edgeCenters, const float *edgeFluxes, //OP_READ const float *bathySource, //OP_READ const float *edgeNormals, const int *isRightBoundary, const float *cellVolumes0, //OP_READ const float *cellVolumes1) //OP_READ) { - left[0] -= (edgeFluxes[0])/(Radius*cellVolumes0[0]); + left[0] -= (edgeFluxes[0])/(Radius*cellVolumes0[0]*cos(M_PI*leftcellCenters[1]/180.0)); left[1] -= (edgeFluxes[1] + bathySource[0] * edgeNormals[0])/(Radius*cellVolumes0[0]*cos(M_PI*leftcellCenters[1]/180.0)); - left[2] -= (edgeFluxes[2] + bathySource[0] * edgeNormals[1])/(Radius*cellVolumes0[0]); + left[2] -= (edgeFluxes[2] + bathySource[0] * edgeNormals[1])/(Radius*cellVolumes0[0]*cos(M_PI*leftcellCenters[1]/180.0)); // Added centered Source Term - left[1] += (bathySource[2] *edgeNormals[0])/(Radius*cellVolumes0[0]*cos(M_PI*leftcellCenters[1]/180.0)); - left[2] += (bathySource[2] *edgeNormals[1])/(Radius*cellVolumes0[0]); + // left[1] += (bathySource[2] *edgeNormals[0])/(Radius*cellVolumes0[0]*cos(M_PI*46.0/180.0));//*cos(M_PI*leftcellCenters[1]/180.0)); + left[2] += (bathySource[2] *edgeNormals[1])/(Radius*cellVolumes0[0]*cos(M_PI*leftcellCenters[1]/180.0)); if (!*isRightBoundary) { - right[0] += edgeFluxes[0]/cellVolumes1[0]; + right[0] += 
(edgeFluxes[0])/(Radius*cellVolumes1[0]*cos(M_PI*rightcellCenters[1]/180.0)); right[1] += (edgeFluxes[1] + bathySource[1] * edgeNormals[0])/(Radius*cellVolumes1[0]*cos(M_PI*rightcellCenters[1]/180.0)); - right[2] += (edgeFluxes[2] + bathySource[1] * edgeNormals[1])/(Radius*cellVolumes1[0]); + right[2] += (edgeFluxes[2] + bathySource[1] * edgeNormals[1])/(Radius*cellVolumes1[0]*cos(M_PI*rightcellCenters[1]/180.0)); // Added centered Source Term - right[1] -= (bathySource[3] *edgeNormals[0])/(Radius*cellVolumes1[0]*cos(M_PI*rightcellCenters[1]/180.0)); - right[2] -= (bathySource[3] *edgeNormals[1])/(Radius*cellVolumes1[0]); + // right[1] -= (bathySource[3] *edgeNormals[0])/(Radius*cellVolumes1[0]*cos(M_PI*46.0/180.0));//*cos(M_PI*rightcellCenters[1]/180.0)); + right[2] -= (bathySource[3] *edgeNormals[1])/(Radius*cellVolumes1[0]*cos(M_PI*rightcellCenters[1]/180.0)); } else { right[0] += 0.0f; right[1] += 0.0f; diff --git a/sp/computeFluxes.h b/sp/computeFluxes.h index dde89ba..748746c 100644 --- a/sp/computeFluxes.h +++ b/sp/computeFluxes.h @@ -29,7 +29,7 @@ inline void computeFluxes(const float *cellLeft, const float *cellRight, leftCellValues[1] += alphaleft[0] * ((dxl * leftGradient[2])+(dyl * leftGradient[3])); leftCellValues[2] += alphaleft[0] * ((dxl * leftGradient[4])+(dyl * leftGradient[5])); leftCellValues[3] += alphaleft[0] * ((dxl * leftGradient[6])+(dyl * leftGradient[7])); - if (leftCellValues[0] >= 1e-3){ + if (leftCellValues[0] >= EPS){ uL = leftCellValues[1]/leftCellValues[0]; vL = leftCellValues[2]/leftCellValues[0]; } else { @@ -49,7 +49,7 @@ inline void computeFluxes(const float *cellLeft, const float *cellRight, rightCellValues[1] += alpharight[0] * ((dxr * rightGradient[2])+(dyr * rightGradient[3])); rightCellValues[2] += alpharight[0] * ((dxr * rightGradient[4])+(dyr * rightGradient[5])); rightCellValues[3] += alpharight[0] * ((dxr * rightGradient[6])+(dyr * rightGradient[7])); - if (rightCellValues[0] >= 1e-3){ + if (rightCellValues[0] >= EPS){ 
uR = rightCellValues[1]/rightCellValues[0]; vR = rightCellValues[2]/rightCellValues[0]; } else { @@ -135,12 +135,12 @@ inline void computeFluxes(const float *cellLeft, const float *cellRight, sStar = (sL*hR*(uRn - sR) - sR*hL*(uLn - sL))/ (hR*(uRn - sR) - hL*(uLn - sL)); - if ((leftCellValues[0] <= EPS) && (rightCellValues[0] > EPS)) { + if ((hL <= EPS) && (hR > EPS)) { sL = uRn - 2.0f*cR; sR = uRn + cR; sStar = sL; } - if ((rightCellValues[0] <= EPS) && (leftCellValues[0] > EPS)) { + if ((hR <= EPS) && (hL > EPS)) { sR = uLn + 2.0f*cL; sL = uLn - cL; sStar = sR; diff --git a/sp/computeFluxes_sph.h b/sp/computeFluxes_sph.h index 13c07dd..f5133be 100644 --- a/sp/computeFluxes_sph.h +++ b/sp/computeFluxes_sph.h @@ -19,17 +19,18 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, leftCellValues[2] = cellLeft[2]; leftCellValues[3] = cellLeft[3]; float dxl, dyl, dxr, dyr; - dxl = (edgeCenters[0] - leftcellCenters[0]); - dyl = (edgeCenters[1] - leftcellCenters[1]); - dxr = (edgeCenters[0] - rightcellCenters[0]); - dyr = (edgeCenters[1] - rightcellCenters[1]); + // Convert to arc length + dxl = (edgeCenters[0] - leftcellCenters[0])*Radius; + dyl = (edgeCenters[1] - leftcellCenters[1])*Radius; + dxr = (edgeCenters[0] - rightcellCenters[0])*Radius; + dyr = (edgeCenters[1] - rightcellCenters[1])*Radius; // Second order Reconstruction - leftCellValues[0] += alphaleft[0] * ((dxl * leftGradient[0])+(dyl * leftGradient[1])); - leftCellValues[1] += alphaleft[0] * ((dxl * leftGradient[2])+(dyl * leftGradient[3])); - leftCellValues[2] += alphaleft[0] * ((dxl * leftGradient[4])+(dyl * leftGradient[5])); - leftCellValues[3] += alphaleft[0] * ((dxl * leftGradient[6])+(dyl * leftGradient[7])); - if (leftCellValues[0] >= 1e-3){ + // leftCellValues[0] += alphaleft[0] * ((dxl * leftGradient[0])+(dyl * leftGradient[1])); + // leftCellValues[1] += alphaleft[0] * ((dxl * leftGradient[2])+(dyl * leftGradient[3])); + // leftCellValues[2] += alphaleft[0] * ((dxl * 
leftGradient[4])+(dyl * leftGradient[5])); + // leftCellValues[3] += alphaleft[0] * ((dxl * leftGradient[6])+(dyl * leftGradient[7])); + if (leftCellValues[0] >= EPS){ uL = leftCellValues[1]/leftCellValues[0]; vL = leftCellValues[2]/leftCellValues[0]; } else { @@ -37,6 +38,7 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, vL = 0.0f; } zL = cellLeft[3] - cellLeft[0]; + zR = cellRight[3] - cellRight[0]; if (!*isRightBoundary) { rightCellValues[0] = cellRight[0]; @@ -45,11 +47,11 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, rightCellValues[3] = cellRight[3]; // Second order Reconstruction - rightCellValues[0] += alpharight[0] * ((dxr * rightGradient[0])+(dyr * rightGradient[1])); - rightCellValues[1] += alpharight[0] * ((dxr * rightGradient[2])+(dyr * rightGradient[3])); - rightCellValues[2] += alpharight[0] * ((dxr * rightGradient[4])+(dyr * rightGradient[5])); - rightCellValues[3] += alpharight[0] * ((dxr * rightGradient[6])+(dyr * rightGradient[7])); - if (rightCellValues[0] >= 1e-3){ + // rightCellValues[0] += alpharight[0] * ((dxr * rightGradient[0])+(dyr * rightGradient[1])); + // rightCellValues[1] += alpharight[0] * ((dxr * rightGradient[2])+(dyr * rightGradient[3])); + // rightCellValues[2] += alpharight[0] * ((dxr * rightGradient[4])+(dyr * rightGradient[5])); + // rightCellValues[3] += alpharight[0] * ((dxr * rightGradient[6])+(dyr * rightGradient[7])); + if (rightCellValues[0] >= EPS){ uR = rightCellValues[1]/rightCellValues[0]; vR = rightCellValues[2]/rightCellValues[0]; } else { @@ -57,37 +59,40 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, vR = 0.0f; } } else { - float nx = edgeNormals[0]; - float ny = edgeNormals[1]; - float inNormalVelocity = uL * nx + vL * ny; - float inTangentVelocity = -1.0f * uL * ny + vL * nx; - float outNormalVelocity = 0.0f; - float outTangentVelocity = 0.0f; - //rightCellValues[3] = leftCellValues[3]; - // Outflow - // float 
wet = (fabs(*zmin) - zL) > 0.0f ? (fabs(*zmin) - zL) : EPS; - // rightCellValues[3] = zL + wet; - // rightCellValues[0] = wet; - rightCellValues[3] = leftCellValues[3]; - rightCellValues[0] = leftCellValues[0]; - outNormalVelocity = inNormalVelocity; - outTangentVelocity = inTangentVelocity; - uR = outNormalVelocity * nx - outTangentVelocity * ny; - vR = outNormalVelocity * ny + outTangentVelocity * nx; - - // Wall - // rightCellValues[3] = leftCellValues[3]; - // rightCellValues[0] = leftCellValues[0]; - // outNormalVelocity = -1.0f*inNormalVelocity; - // outTangentVelocity = inTangentVelocity; - // uR = outNormalVelocity * nx - outTangentVelocity * ny; - // vR = outNormalVelocity * ny + outTangentVelocity * nx; + float nx = edgeNormals[0]; + float ny = edgeNormals[1]; + float inNormalVelocity = uL * nx + vL * ny; + float inTangentVelocity = -1.0f * uL * ny + vL * nx; + float outNormalVelocity = 0.0f; + float outTangentVelocity = 0.0f; - - rightCellValues[1] = uR*rightCellValues[0]; - rightCellValues[2] = vR*rightCellValues[0]; + // Outflow + float wet = (fabs(*zmin) - zL) > 0.0f ? 
(fabs(*zmin) - zL) : EPS; + float critical = sqrt(uL*uL + vL*vL); + outTangentVelocity = inTangentVelocity; + if (critical < sqrt(g*leftCellValues[0])){ + rightCellValues[0] = wet; + rightCellValues[3] = wet + zL; + outNormalVelocity = 0.0f; + } else { + rightCellValues[0] = leftCellValues[0]; + rightCellValues[3] = leftCellValues[3]; + outNormalVelocity = inNormalVelocity; + } + /* + // Wall + rightCellValues[3] = leftCellValues[3]; + rightCellValues[0] = leftCellValues[0]; + outNormalVelocity = -1.0f*inNormalVelocity; + outTangentVelocity = inTangentVelocity; + */ + uR = outNormalVelocity * nx - outTangentVelocity * ny; + vR = outNormalVelocity * ny + outTangentVelocity * nx; + rightCellValues[1] = uR*rightCellValues[0]; + rightCellValues[2] = vR*rightCellValues[0]; + zR = zL; } - zR = cellRight[3] - cellRight[0]; + rightCellValues[3] -= rightCellValues[0]; leftCellValues[3] -= leftCellValues[0]; // ------------------------------------------------------------------------------------ @@ -104,10 +109,11 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, hR = hR > 0.0f ? 
hR : 0.0f; bathySource[0] -= .5f * g * (hL * hL); bathySource[1] -= .5f * g * (hR * hR); - // Audusse Reconstruction(2005) 2nd order Centered term - bathySource[2] = -.5f * g *(leftCellValues[0] + cellLeft[0])*(leftCellValues[3] - zL); - bathySource[3] = -.5f * g *(rightCellValues[0] + cellRight[0])*(rightCellValues[3] - zR); - + // Audusse Reconstruction(2005) 2nd order centered term + // bathySource[2] = -.5f * g *(leftCellValues[0] + cellLeft[0])*(leftCellValues[3] - zL); + // bathySource[3] = -.5f * g *(rightCellValues[0] + cellRight[0])*(rightCellValues[3] - zR); + bathySource[2] = .5f * g * (hL * hL)*sin(M_PI*leftcellCenters[1]/180.0); + bathySource[3] = .5f * g * (hR * hR)*sin(M_PI*rightcellCenters[1]/180.0); bathySource[0] *= *edgeLength; bathySource[1] *= *edgeLength; bathySource[2] *= *edgeLength; @@ -134,14 +140,12 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, sStar = (sL*hR*(uRn - sR) - sR*hL*(uLn - sL))/ (hR*(uRn - sR) - hL*(uLn - sL)); - //if ((leftCellValues[0] <= EPS) && (rightCellValues[0] > EPS)) { - if ((leftCellValues[0] <= 1e-3) && (rightCellValues[0] > 1e-3)) { + if ((hL <= EPS) && (hR > EPS)) { sL = uRn - 2.0f*cR; sR = uRn + cR; sStar = sL; } - //if ((rightCellValues[0] <= EPS) && (leftCellValues[0] > EPS)) { - if ((rightCellValues[0] <= 1e-3) && (leftCellValues[0] > 1e-3)) { + if ((hR <= EPS) && (hL > EPS)) { sR = uLn + 2.0f*cL; sL = uLn - cL; sStar = sR; @@ -152,35 +156,29 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, float uRp = vR*edgeNormals[0] - uR*edgeNormals[1]; float LeftFluxes_H, LeftFluxes_U, LeftFluxes_V, LeftFluxes_N; - //inlined ProjectedPhysicalFluxes(leftCellValues, Normals, params, LeftFluxes); - float HuDotN = (leftCellValues[1]/cos(M_PI*leftcellCenters[1]/180.0)) * edgeNormals[0] + - (leftCellValues[2]) * edgeNormals[1]; - + // float HuDotN = (hL*uL) * edgeNormals[0] + (hL*vL) * edgeNormals[1]; + float HuDotN = (hL*uL) * edgeNormals[0] + 
(hL*vL*cos(M_PI*edgeCenters[1]/180.0)) * edgeNormals[1]; LeftFluxes_H = HuDotN; LeftFluxes_U = HuDotN * uL; LeftFluxes_V = HuDotN * vL; // Normal Momentum flux term LeftFluxes_N = HuDotN * uLn; - LeftFluxes_U += ((.5f * g * edgeNormals[0] ) * ( hL * hL ))/(cos(M_PI*leftcellCenters[1]/180.0)); - LeftFluxes_V += (.5f * g * edgeNormals[1] ) * ( hL * hL ); + LeftFluxes_U += (.5f * g * edgeNormals[0] ) * ( hL * hL); + LeftFluxes_V += (.5f * g * edgeNormals[1] ) * ( hL * hL* cos(M_PI*edgeCenters[1]/180.0)); LeftFluxes_N += (.5f * g ) * ( hL * hL ); - //end of inlined float RightFluxes_H, RightFluxes_U, RightFluxes_V, RightFluxes_N; - //inlined ProjectedPhysicalFluxes(rightCellValues, Normals, params, RightFluxes); - HuDotN = (rightCellValues[1]/cos(M_PI*rightcellCenters[1]/180.0)) * edgeNormals[0] + - (rightCellValues[2]) * edgeNormals[1]; - + HuDotN = (hR*uR) * edgeNormals[0] + (hR*vR*cos(M_PI*edgeCenters[1]/180.0)) * edgeNormals[1]; + // HuDotN = (hR*uR) * edgeNormals[0] + (hR*vR) * edgeNormals[1]; RightFluxes_H = HuDotN; RightFluxes_U = HuDotN * uR; RightFluxes_V = HuDotN * vR; // Normal Momentum flux term RightFluxes_N = HuDotN * uRn; - - RightFluxes_U += ((.5f * g * edgeNormals[0] ) * ( hR * hR ))/(cos(M_PI*rightcellCenters[1]/180.0)); - RightFluxes_V += (.5f * g * edgeNormals[1] ) * ( hR * hR ); - // RightFluxes_N += (.5f * g ) * ( hR * hR ); + RightFluxes_U += (.5f * g * edgeNormals[0] ) * ( hR * hR); + RightFluxes_V += (.5f * g * edgeNormals[1] ) * ( hR * hR* cos(M_PI*edgeCenters[1]/180.0)); + RightFluxes_N += (.5f * g ) * ( hR * hR ); // ------------------------------------------------------------------------ // HLLC Flux Solver (Batten et al. 
1997) // "On the choice of wavespeeds for the HLLC Reimann solver" @@ -212,7 +210,7 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, //------------------------------------------------------------------------ // HLL Flux Solver // ------------------------------------------------------------------------ - /*float sLMinus = sL < 0.0f ? sL : 0.0f; + float sLMinus = sL < 0.0f ? sL : 0.0f; float sRPlus = sR > 0.0f ? sR : 0.0f; float sRMinussL = sRPlus - sLMinus; sRMinussL = sRMinussL < EPS ? EPS : sRMinussL; @@ -223,67 +221,65 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, ( t1 * LeftFluxes_H ) + ( t2 * RightFluxes_H ) + ( t3 * ( hR - hL ) ); - + // op_printf("out %f \n", out[0]); out[1] = ( t1 * LeftFluxes_U ) + ( t2 * RightFluxes_U ) + - ( t3 * ( (rightCellValues[1]) - - (leftCellValues[1]) ) ); + ( t3 * ( (hR*uR) - (hL*uL) ) ); out[2] = ( t1 * LeftFluxes_V ) + ( t2 * RightFluxes_V ) + - ( t3 * ( (rightCellValues[2]) - - (leftCellValues[2]) ) ); - */ + ( t3 * ( (hR*vR) - (hL*vL) ) ); + // ------------------------------------------------------------------------ // HLLC Flux Solver (Huang et al. 2013) // "Well-balanced finite volume scheme for shallow water flooding and drying // over arbitrary topography" // ------------------------------------------------------------------------ - float sLMinus = sL < 0.0f ? sL : 0.0f; - float sRPlus = sR > 0.0f ? sR : 0.0f; - float sRMinussL = sRPlus - sLMinus; - sRMinussL = sRMinussL < EPS ? 
EPS : sRMinussL; - float t1 = sRPlus / sRMinussL; - float t2 = ( -1.0 * sLMinus ) / sRMinussL; - float t3 = ( sRPlus * sLMinus ) / sRMinussL; - float FStar[3]; - FStar[0] = - ( t1 * LeftFluxes_H ) + - ( t2 * RightFluxes_H ) + - ( t3 * ( hR - hL ) ); - - - FStar[1] = - ( t1 * LeftFluxes_N ) + - ( t2 * RightFluxes_N ) + - ( t3 * ( (hR * uRn) - - (hL * uLn) ) ); - - if( sL >= 0.0f) { - out[0] = t1*LeftFluxes_H; - out[1] = t1*LeftFluxes_U; - out[2] = t1*LeftFluxes_V; - } else if ((sL < 0.0f) && (sStar >= 0.0f)){ - out[0] = FStar[0]; - FStar[2] = FStar[0] * uLp; - out[1] = FStar[1]*edgeNormals[0] - FStar[2]*edgeNormals[1]; - out[2] = FStar[1]*edgeNormals[1] + FStar[2]*edgeNormals[0]; - } else if((sStar < 0.0f) && (sR >= 0.0f)){ - out[0] = FStar[0]; - FStar[2] = FStar[0] * uRp; - out[1] = FStar[1]*edgeNormals[0] - FStar[2]*edgeNormals[1]; - out[2] = FStar[1]*edgeNormals[1] + FStar[2]*edgeNormals[0]; - } else { - out[0] = t2*RightFluxes_H; - out[1] = t2*RightFluxes_U; - out[2] = t2*RightFluxes_V; - } + // float sLMinus = sL < 0.0f ? sL : 0.0f; + // float sRPlus = sR > 0.0f ? sR : 0.0f; + // float sRMinussL = sRPlus - sLMinus; + // sRMinussL = sRMinussL < EPS ? 
EPS : sRMinussL; + // float t1 = sRPlus / sRMinussL; + // float t2 = ( -1.0 * sLMinus ) / sRMinussL; + // float t3 = ( sRPlus * sLMinus ) / sRMinussL; + // float FStar[3]; + // FStar[0] = + // ( t1 * LeftFluxes_H ) + + // ( t2 * RightFluxes_H ) + + // ( t3 * ( hR - hL ) ); + // + // + // FStar[1] = + // ( t1 * LeftFluxes_N ) + + // ( t2 * RightFluxes_N ) + + // ( t3 * ( (hR * uRn) - + // (hL * uLn) ) ); + // + // if( sL >= 0.0f) { + // out[0] = t1*LeftFluxes_H; + // out[1] = t1*LeftFluxes_U; + // out[2] = t1*LeftFluxes_V; + // } else if ((sL < 0.0f) && (sStar >= 0.0f)){ + // out[0] = FStar[0]; + // FStar[2] = FStar[0] * uLp; + // out[1] = FStar[1]*edgeNormals[0] - FStar[2]*edgeNormals[1]; + // out[2] = FStar[1]*edgeNormals[1] + FStar[2]*edgeNormals[0]; + // } else if((sStar < 0.0f) && (sR >= 0.0f)){ + // out[0] = FStar[0]; + // FStar[2] = FStar[0] * uRp; + // out[1] = FStar[1]*edgeNormals[0] - FStar[2]*edgeNormals[1]; + // out[2] = FStar[1]*edgeNormals[1] + FStar[2]*edgeNormals[0]; + // } else { + // out[0] = t2*RightFluxes_H; + // out[1] = t2*RightFluxes_U; + // out[2] = t2*RightFluxes_V; + // } out[0] *= *edgeLength; out[1] *= *edgeLength; out[2] *= *edgeLength; - // op_printf("Length %f \n", *edgeLength); + float maximum = fabs(uLn + cL); maximum = maximum > fabs(uLn - cL) ? maximum : fabs(uLn - cL); maximum = maximum > fabs(uRn + cR) ? 
maximum : fabs(uRn + cR); diff --git a/volna_init/mesh.hpp b/volna_init/mesh.hpp index 066e760..bc442da 100755 --- a/volna_init/mesh.hpp +++ b/volna_init/mesh.hpp @@ -18,6 +18,7 @@ #include "meshObjects.hpp" #include "meshIo.hpp" #include "mathParser.hpp" +namespace bg = boost::geometry; inline bool PairCompare( const std::pair &elem1, const std::pair &elem2 ) @@ -63,7 +64,7 @@ class Mesh { void WriteMeshBandwith( std::ofstream & ); void ComputeConnectivity(); void LegacyInterface(); - void ComputeGeometricQuantities(); + void ComputeGeometricQuantities(unsigned int&); // void ComputeGradientInterpolator(); }; @@ -408,14 +409,14 @@ void Mesh::RCMRenumbering() { if ( neighbors.at(j) != -1 ) trueNeighbors.push_back( Cells.at(j) ); - + } - + for ( int j = 0; j < 3; ++j ) { int neighbor = neighbors.at( 2 - j ); - + if ( neighbor != -1 && parent[neighbor] == -1 ) { // neighbor is unvisited @@ -612,7 +613,7 @@ void Mesh::LegacyInterface() { // detect boundary Triangles if ( Cells.at(i).neighbors()[2] == -1 ) { - + Cells.at(i).boundary = true; Cells.at(i).boundary_type = -1; @@ -675,10 +676,14 @@ void Mesh::LegacyInterface() { // // } -void Mesh::ComputeGeometricQuantities() { +void Mesh::ComputeGeometricQuantities(unsigned int& spherical) { std::cerr << "Computing mesh geometric quantities... 
"; - + if (spherical){ + std::cerr << "Spherical coordinate system" << std::endl; + } else { + std::cerr << "Cartesian coordinate system" << std::endl; + } CellCenters = GeomValues( NVolumes ); CellVolumes = ScalarValue::Zero( NVolumes ); #pragma omp parallel for @@ -696,19 +701,37 @@ void Mesh::ComputeGeometricQuantities() { CellCenters.y( i ) = center( 1 ); CellCenters.z( i ) = center( 2 ); - // Cell volumes + Point p1 = Nodes[ Cells.at(i).vertices()[0] ]; Point p2 = Nodes[ Cells.at(i).vertices()[1] ]; Point p3 = Nodes[ Cells.at(i).vertices()[2] ]; - - RealType volume = 0.5 * std::abs( orient2d( p1, p2, p3 ) ); - + RealType volume; + + if (spherical){ + // Area is computed using boost libraries + typedef bg::model::point + > spherical_point; + // > spherical_point; + spherical_point vert1(p1.x(), p1.y()); + spherical_point vert2(p2.x(), p2.y()); + spherical_point vert3(p3.x(), p3.y()); + bg::model::polygon< spherical_point > sph_poly; + bg::append( sph_poly, vert1); + bg::append( sph_poly, vert2); + bg::append( sph_poly, vert3); + bg::append( sph_poly, vert1); + bg::correct(sph_poly); + double const earth_radius = 6378136.6; // miles + bg::strategy::area::spherical< spherical_point > spherical_earth(earth_radius); + volume = bg::area(sph_poly,spherical_earth); + } else { + volume = 0.5 * std::abs( orient2d( p1, p2, p3 ) ); + } CellVolumes( i ) = volume; } - - FacetCenters = GeomValues::GeomValues( NFaces ); - FacetNormals = GeomValues::GeomValues( NFaces ); + FacetCenters = GeomValues( NFaces ); + FacetNormals = GeomValues( NFaces ); FacetVolumes = ScalarValue::Zero( NFaces ); #pragma omp parallel for for ( int i = 0; i < NFaces; ++i ) { @@ -728,17 +751,40 @@ void Mesh::ComputeGeometricQuantities() { // Facet volume (e.g. 
length) Point p1 = Nodes[ Facets.at(i).vertices()[0] ]; Point p2 = Nodes[ Facets.at(i).vertices()[1] ]; - - RealType volume = (p2 - p1).norm(); - + RealType volume; + + if (spherical){ + // Length is computed using n-vector formulation (spherical assumption) + // double const earth_radius = 6378136.6; + // double rad = M_PI/180.0; + // double test_length = earth_radius*acos(cos(p1.x()*rad)*cos(p1.y()*rad)*cos(p2.x()*rad)*cos(p2.y()*rad) + + // cos(p1.y()*rad)*sin(p1.x()*rad)*cos(p2.y()*rad)*sin(p2.x()*rad) + + // sin(p1.y()*rad)*sin(p2.y()*rad)); + // Length is computed using boost libraries (geoid assumption) + typedef bg::model::point<double, 2, bg::cs::spherical_equatorial<bg::degree> + > spherical_point; + spherical_point vert1(p1.x(), p1.y()); + spherical_point vert2(p2.x(), p2.y()); + double const earth_radius = 6378136.6; // meters + volume = bg::distance(vert1, vert2)*earth_radius; + } else { + volume = (p2 - p1).norm(); + } FacetVolumes( i ) = volume; // Facet normal Vector normal = Vector::UnitX(); normal.x() = - ( p2 - p1 ).y(); normal.y() = ( p2 - p1).x(); - normal.normalize(); - + // normal.normalize(); + if (spherical){ + double rad = M_PI/180.0; + double sigma = sqrt((normal.x()/cos(FacetCenters.y(i)*rad))*(normal.x()/cos(FacetCenters.y(i)*rad)) + (normal.y()*normal.y())); + normal.x() = normal.x()/(sigma*cos(FacetCenters.y(i)*rad)); + normal.y() = normal.y()/(sigma); + } else { + normal.normalize(); + } // Flip the normal if needed.
// RealType // ScalarProduct = ( FacetCenters.col(i) - From 1a93fa5856e7445214d587fca380b11da9ba8b72 Mon Sep 17 00:00:00 2001 From: DanGiles Date: Mon, 15 Nov 2021 12:48:01 +0000 Subject: [PATCH 2/4] Removal of EvolveValuesRK3* and support for a100 GPUs --- sp/Makefile | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/sp/Makefile b/sp/Makefile index d10997e..200f08a 100755 --- a/sp/Makefile +++ b/sp/Makefile @@ -42,7 +42,7 @@ ifdef PTSCOTCH_INSTALL_PATH PTSCOTCH_INC = -I$(PTSCOTCH_INSTALL_PATH)/include -DHAVE_PTSCOTCH PTSCOTCH_INC = -I$(PTSCOTCH_INSTALL_PATH)/include PTSCOTCH_LIB = -L$(PTSCOTCH_INSTALL_PATH)/lib/ -lptscotch \ - -L$(PTSCOTCH_INSTALL_PATH)/lib/ -lptscotcherr + -L$(PTSCOTCH_INSTALL_PATH)/lib/ -lscotch -lptscotcherr endif ifeq ($(OP2_COMPILER),gnu) @@ -56,7 +56,7 @@ else ifeq ($(OP2_COMPILER),intel) CPP = icpc # CPPFLAGS = -g -O0 - CPPFLAGS = -O3 -xHost -parallel -g -DMPICH_IGNORE_CXX_SEEK -DMPICH_SKIP_MPICXX + CPPFLAGS = -O3 -parallel -g -DMPICH_IGNORE_CXX_SEEK -DMPICH_SKIP_MPICXX # CPPFLAGS = -g -O0 -vec-report -xSSE4.2 -parallel OMPFLAGS = -qopenmp MPICPP = $(MPI_INSTALL_PATH)/bin/mpicxx @@ -75,7 +75,7 @@ endif # -NVCCFLAGS = -arch=sm_60 -Xptxas=-v -O2 -m64 -g #-G +NVCCFLAGS = -arch=sm_80 -Xptxas=-v -O2 -m64 -g #-G # # master to make all versions @@ -114,16 +114,15 @@ else endif -cuda/volna_kernels_cu.o: cuda/volna_kernels.cu \ - EvolveValuesRK2_1.h EvolveValuesRK2_2.h EvolveValuesRK3_1.h EvolveValuesRK3_2.h EvolveValuesRK3_3.h EvolveValuesRK3_4.h applyConst.h getMaxElevation.h getTotalVol.h \ +cuda/volna_kernels_cu.o: cuda/volna_kernels.cu \ + EvolveValuesRK2_1.h EvolveValuesRK2_2.h applyConst.h getMaxElevation.h getMaxSpeed.h \ initBathymetry_formula.h initBathymetry_update.h initBore_select.h initEta_formula.h initGaussianLandslide.h \ - initU_formula.h initV_formula.h computeGradient.h computeFluxes.h limiter.h SpaceDiscretization.h NumericalFluxes.h simulation_1.h \ - values_operation2.h 
cuda/applyConst_kernel.cu cuda/EvolveValuesRK2_1_kernel.cu cuda/EvolveValuesRK2_2_kernel.cu cuda/EvolveValuesRK3_1_kernel.cu\ - cuda/EvolveValuesRK3_2_kernel.cu cuda/EvolveValuesRK3_3_kernel.cu cuda/EvolveValuesRK3_4_kernel.cu cuda/computeGradient_kernel.cu \ - cuda/computeFluxes_kernel.cu cuda/limiter_kernel.cu cuda/SpaceDiscretization_kernel.cu cuda/NumericalFluxes_kernel.cu \ - cuda/simulation_1_kernel.cu \ + initU_formula.h initV_formula.h computeGradient.h computeFluxes.h limiter.h Timestep.h \ + NumericalFluxes.h simulation_1.h values_operation2.h cuda/applyConst_kernel.cu \ + cuda/EvolveValuesRK2_1_kernel.cu cuda/EvolveValuesRK2_2_kernel.cu cuda/computeFluxes_kernel.cu \ + cuda/limiter_kernel.cu cuda/Timestep_kernel.cu cuda/NumericalFluxes_kernel.cu \ + cuda/simulation_1_kernel.cu cuda/computeGradient_kernel.cu cuda/getMaxSpeed_kernel.cu cuda/getMaxElevation_kernel.cu \ cuda/values_operation2_kernel.cu Makefile - nvcc $(VAR) $(INC) $(NVCCFLAGS) $(OP2_INC) $(HDF5_INC) -I$(MPI_INC) -c -o cuda/volna_kernels_cu.o cuda/volna_kernels.cu volna_mpi: volna.cpp volna_event.cpp volna_init.cpp volna_output.cpp volna_writeVTK.cpp volna_simulation.cpp volna_util.cpp Makefile From dd6f10a83e9abe6574518f7850056c48fbb9dfb1 Mon Sep 17 00:00:00 2001 From: DanGiles Date: Mon, 15 Nov 2021 13:00:37 +0000 Subject: [PATCH 3/4] Removal of EvolveValuesRK3* from Makefile, support for A100 GPUs and Regenerated cuda files --- sp/Friction_manning.h | 4 +- sp/Makefile | 22 +- sp/NumericalFluxes_sph.h | 17 +- sp/computeFluxes.h | 17 +- sp/computeFluxes_sph.h | 222 +++---- sp/cuda/EvolveValuesRK2_1_kernel.cu | 126 +--- sp/cuda/EvolveValuesRK2_2_kernel.cu | 105 +--- sp/cuda/Friction_manning_kernel.cu | 125 ++++ sp/cuda/NumericalFluxes_kernel.cu | 246 ++++---- sp/cuda/Timestep_kernel.cu | 218 +++++++ sp/cuda/applyConst_kernel.cu | 30 +- sp/cuda/computeFluxes_kernel.cu | 253 +++++--- sp/cuda/computeGradient_kernel.cu | 160 +++-- sp/cuda/gatherLocations_kernel.cu | 51 +- 
sp/cuda/getMaxElevation_kernel.cu | 29 +- sp/cuda/getMaxSpeed_kernel.cu | 27 +- sp/cuda/getTotalVol_kernel.cu | 10 +- sp/cuda/incConst_kernel.cu | 34 +- sp/cuda/initBathyRelative_formula_kernel.cu | 34 +- sp/cuda/initBathymetry_formula_kernel.cu | 32 +- sp/cuda/initBathymetry_large_kernel.cu | 84 ++- sp/cuda/initBathymetry_update_kernel.cu | 82 ++- sp/cuda/initBore_select_kernel.cu | 26 +- sp/cuda/initEta_formula_kernel.cu | 29 +- sp/cuda/initGaussianLandslide_kernel.cu | 26 +- sp/cuda/initU_formula_kernel.cu | 27 +- sp/cuda/initV_formula_kernel.cu | 28 +- sp/cuda/limiter_kernel.cu | 67 +- sp/cuda/simulation_1_kernel.cu | 26 +- sp/cuda/values_operation2_kernel.cu | 24 +- sp/cuda/volna_hybkernels.cu | 644 ++++++++++++++------ sp/cuda/volna_kernels.cu | 42 +- sp/volna_op.cpp | 52 +- sp/volna_simulation_op.cpp | 113 +++- 34 files changed, 1935 insertions(+), 1097 deletions(-) create mode 100644 sp/cuda/Friction_manning_kernel.cu create mode 100644 sp/cuda/Timestep_kernel.cu diff --git a/sp/Friction_manning.h b/sp/Friction_manning.h index 2652b19..0fa4e4b 100644 --- a/sp/Friction_manning.h +++ b/sp/Friction_manning.h @@ -14,10 +14,10 @@ inline void Friction_manning(const float *dT,const float *M_n, //OP_RW, discard // Update Momentum values[0] = TruncatedH; values[3] = values[3]; - if (values[0] <= 1e-3){ + if (values[0] <= EPS){ values[1] = 0.0f; values[2] = 0.0f; - } else if (1e-3 < values[0] <= 50.0f) { + } else if (EPS < values[0] <= 50.0f) { values[1] = values[1] / (1.0f + Fr * *dT); values[2] = values[2] / (1.0f + Fr * *dT); } else { diff --git a/sp/Makefile b/sp/Makefile index d10997e..101973a 100755 --- a/sp/Makefile +++ b/sp/Makefile @@ -42,7 +42,7 @@ ifdef PTSCOTCH_INSTALL_PATH PTSCOTCH_INC = -I$(PTSCOTCH_INSTALL_PATH)/include -DHAVE_PTSCOTCH PTSCOTCH_INC = -I$(PTSCOTCH_INSTALL_PATH)/include PTSCOTCH_LIB = -L$(PTSCOTCH_INSTALL_PATH)/lib/ -lptscotch \ - -L$(PTSCOTCH_INSTALL_PATH)/lib/ -lptscotcherr + -L$(PTSCOTCH_INSTALL_PATH)/lib/ -lscotch -lptscotcherr endif 
ifeq ($(OP2_COMPILER),gnu) @@ -56,7 +56,7 @@ else ifeq ($(OP2_COMPILER),intel) CPP = icpc # CPPFLAGS = -g -O0 - CPPFLAGS = -O3 -xHost -parallel -g -DMPICH_IGNORE_CXX_SEEK -DMPICH_SKIP_MPICXX + CPPFLAGS = -O3 -parallel -g -DMPICH_IGNORE_CXX_SEEK -DMPICH_SKIP_MPICXX # CPPFLAGS = -g -O0 -vec-report -xSSE4.2 -parallel OMPFLAGS = -qopenmp MPICPP = $(MPI_INSTALL_PATH)/bin/mpicxx @@ -75,7 +75,7 @@ endif # -NVCCFLAGS = -arch=sm_60 -Xptxas=-v -O2 -m64 -g #-G +NVCCFLAGS = -arch=sm_80 -Xptxas=-v -O2 -m64 -g #-G # # master to make all versions @@ -114,16 +114,14 @@ else endif -cuda/volna_kernels_cu.o: cuda/volna_kernels.cu \ - EvolveValuesRK2_1.h EvolveValuesRK2_2.h EvolveValuesRK3_1.h EvolveValuesRK3_2.h EvolveValuesRK3_3.h EvolveValuesRK3_4.h applyConst.h getMaxElevation.h getTotalVol.h \ +cuda/volna_kernels_cu.o: cuda/volna_kernels.cu \ + EvolveValuesRK2_1.h EvolveValuesRK2_2.h applyConst.h getMaxElevation.h getMaxSpeed.h \ initBathymetry_formula.h initBathymetry_update.h initBore_select.h initEta_formula.h initGaussianLandslide.h \ - initU_formula.h initV_formula.h computeGradient.h computeFluxes.h limiter.h SpaceDiscretization.h NumericalFluxes.h simulation_1.h \ - values_operation2.h cuda/applyConst_kernel.cu cuda/EvolveValuesRK2_1_kernel.cu cuda/EvolveValuesRK2_2_kernel.cu cuda/EvolveValuesRK3_1_kernel.cu\ - cuda/EvolveValuesRK3_2_kernel.cu cuda/EvolveValuesRK3_3_kernel.cu cuda/EvolveValuesRK3_4_kernel.cu cuda/computeGradient_kernel.cu \ - cuda/computeFluxes_kernel.cu cuda/limiter_kernel.cu cuda/SpaceDiscretization_kernel.cu cuda/NumericalFluxes_kernel.cu \ - cuda/simulation_1_kernel.cu \ - cuda/values_operation2_kernel.cu Makefile - + initU_formula.h initV_formula.h computeGradient.h computeFluxes.h limiter.h Timestep.h NumericalFluxes.h simulation_1.h \ + values_operation2.h Friction_manning.h cuda/applyConst_kernel.cu cuda/EvolveValuesRK2_1_kernel.cu cuda/EvolveValuesRK2_2_kernel.cu \ + cuda/computeFluxes_kernel.cu cuda/limiter_kernel.cu cuda/Timestep_kernel.cu 
cuda/NumericalFluxes_kernel.cu \ + cuda/simulation_1_kernel.cu cuda/computeGradient_kernel.cu cuda/getMaxSpeed_kernel.cu cuda/getMaxElevation_kernel.cu \ + cuda/values_operation2_kernel.cu cuda/Friction_manning_kernel.cu Makefile nvcc $(VAR) $(INC) $(NVCCFLAGS) $(OP2_INC) $(HDF5_INC) -I$(MPI_INC) -c -o cuda/volna_kernels_cu.o cuda/volna_kernels.cu volna_mpi: volna.cpp volna_event.cpp volna_init.cpp volna_output.cpp volna_writeVTK.cpp volna_simulation.cpp volna_util.cpp Makefile diff --git a/sp/NumericalFluxes_sph.h b/sp/NumericalFluxes_sph.h index 58946e5..beee7f0 100644 --- a/sp/NumericalFluxes_sph.h +++ b/sp/NumericalFluxes_sph.h @@ -1,26 +1,25 @@ inline void NumericalFluxes_sph(float *left, //OP_INC float *right, //OP_INC const float *leftcellCenters, const float *rightcellCenters, - // const float *edgeCenters, const float *edgeFluxes, //OP_READ const float *bathySource, //OP_READ const float *edgeNormals, const int *isRightBoundary, const float *cellVolumes0, //OP_READ const float *cellVolumes1) //OP_READ) { - left[0] -= (edgeFluxes[0])/(Radius*cellVolumes0[0]*cos(M_PI*leftcellCenters[1]/180.0)); + left[0] -= (edgeFluxes[0])/(Radius*cellVolumes0[0]); left[1] -= (edgeFluxes[1] + bathySource[0] * edgeNormals[0])/(Radius*cellVolumes0[0]*cos(M_PI*leftcellCenters[1]/180.0)); - left[2] -= (edgeFluxes[2] + bathySource[0] * edgeNormals[1])/(Radius*cellVolumes0[0]*cos(M_PI*leftcellCenters[1]/180.0)); + left[2] -= (edgeFluxes[2] + bathySource[0] * edgeNormals[1])/(Radius*cellVolumes0[0]); // Added centered Source Term - // left[1] += (bathySource[2] *edgeNormals[0])/(Radius*cellVolumes0[0]*cos(M_PI*46.0/180.0));//*cos(M_PI*leftcellCenters[1]/180.0)); - left[2] += (bathySource[2] *edgeNormals[1])/(Radius*cellVolumes0[0]*cos(M_PI*leftcellCenters[1]/180.0)); + left[1] += (bathySource[2] *edgeNormals[0])/(Radius*cellVolumes0[0]*cos(M_PI*leftcellCenters[1]/180.0)); + left[2] += (bathySource[2] *edgeNormals[1])/(Radius*cellVolumes0[0]); if (!*isRightBoundary) { - right[0] += 
(edgeFluxes[0])/(Radius*cellVolumes1[0]*cos(M_PI*rightcellCenters[1]/180.0)); + right[0] += edgeFluxes[0]/cellVolumes1[0]; right[1] += (edgeFluxes[1] + bathySource[1] * edgeNormals[0])/(Radius*cellVolumes1[0]*cos(M_PI*rightcellCenters[1]/180.0)); - right[2] += (edgeFluxes[2] + bathySource[1] * edgeNormals[1])/(Radius*cellVolumes1[0]*cos(M_PI*rightcellCenters[1]/180.0)); + right[2] += (edgeFluxes[2] + bathySource[1] * edgeNormals[1])/(Radius*cellVolumes1[0]); // Added centered Source Term - // right[1] -= (bathySource[3] *edgeNormals[0])/(Radius*cellVolumes1[0]*cos(M_PI*46.0/180.0));//*cos(M_PI*rightcellCenters[1]/180.0)); - right[2] -= (bathySource[3] *edgeNormals[1])/(Radius*cellVolumes1[0]*cos(M_PI*rightcellCenters[1]/180.0)); + right[1] -= (bathySource[3] *edgeNormals[0])/(Radius*cellVolumes1[0]*cos(M_PI*rightcellCenters[1]/180.0)); + right[2] -= (bathySource[3] *edgeNormals[1])/(Radius*cellVolumes1[0]); } else { right[0] += 0.0f; right[1] += 0.0f; diff --git a/sp/computeFluxes.h b/sp/computeFluxes.h index 748746c..e9cf409 100644 --- a/sp/computeFluxes.h +++ b/sp/computeFluxes.h @@ -29,7 +29,7 @@ inline void computeFluxes(const float *cellLeft, const float *cellRight, leftCellValues[1] += alphaleft[0] * ((dxl * leftGradient[2])+(dyl * leftGradient[3])); leftCellValues[2] += alphaleft[0] * ((dxl * leftGradient[4])+(dyl * leftGradient[5])); leftCellValues[3] += alphaleft[0] * ((dxl * leftGradient[6])+(dyl * leftGradient[7])); - if (leftCellValues[0] >= EPS){ + if (leftCellValues[0] >= 1e-3){ uL = leftCellValues[1]/leftCellValues[0]; vL = leftCellValues[2]/leftCellValues[0]; } else { @@ -49,7 +49,7 @@ inline void computeFluxes(const float *cellLeft, const float *cellRight, rightCellValues[1] += alpharight[0] * ((dxr * rightGradient[2])+(dyr * rightGradient[3])); rightCellValues[2] += alpharight[0] * ((dxr * rightGradient[4])+(dyr * rightGradient[5])); rightCellValues[3] += alpharight[0] * ((dxr * rightGradient[6])+(dyr * rightGradient[7])); - if (rightCellValues[0] 
>= EPS){ + if (rightCellValues[0] >= 1e-3){ uR = rightCellValues[1]/rightCellValues[0]; vR = rightCellValues[2]/rightCellValues[0]; } else { @@ -77,9 +77,9 @@ inline void computeFluxes(const float *cellLeft, const float *cellRight, rightCellValues[3] = leftCellValues[3]; outNormalVelocity = inNormalVelocity; } - /* + // Wall - rightCellValues[3] = leftCellValues[3]; + /*rightCellValues[3] = leftCellValues[3]; rightCellValues[0] = leftCellValues[0]; outNormalVelocity = -1.0f*inNormalVelocity; outTangentVelocity = inTangentVelocity; @@ -135,12 +135,14 @@ inline void computeFluxes(const float *cellLeft, const float *cellRight, sStar = (sL*hR*(uRn - sR) - sR*hL*(uLn - sL))/ (hR*(uRn - sR) - hL*(uLn - sL)); - if ((hL <= EPS) && (hR > EPS)) { + if ((leftCellValues[0] <= EPS) && (rightCellValues[0] > EPS)) { + // if ((leftCellValues[0] <= 1e-3) && (rightCellValues[0] > 1e-3)) { sL = uRn - 2.0f*cR; sR = uRn + cR; sStar = sL; } - if ((hR <= EPS) && (hL > EPS)) { + if ((rightCellValues[0] <= EPS) && (leftCellValues[0] > EPS)) { + // if ((rightCellValues[0] <= 1e-3) && (leftCellValues[0] > 1e-3)) { sR = uLn + 2.0f*cL; sL = uLn - cL; sStar = sR; @@ -151,6 +153,7 @@ inline void computeFluxes(const float *cellLeft, const float *cellRight, float uRp = vR*edgeNormals[0] - uR*edgeNormals[1]; float LeftFluxes_H, LeftFluxes_N, LeftFluxes_U, LeftFluxes_V; + //inlined ProjectedPhysicalFluxes(leftCellValues, Normals, params, LeftFluxes); float HuDotN = (hL*uL) * edgeNormals[0] + (hL*vL) * edgeNormals[1]; LeftFluxes_H = HuDotN; @@ -162,8 +165,10 @@ inline void computeFluxes(const float *cellLeft, const float *cellRight, LeftFluxes_U += (.5f * g * edgeNormals[0] ) * ( hL * hL ); LeftFluxes_V += (.5f * g * edgeNormals[1] ) * ( hL * hL ); LeftFluxes_N += (.5f * g ) * ( hL * hL ); + //end of inlined float RightFluxes_H,RightFluxes_N, RightFluxes_U, RightFluxes_V; + //inlined ProjectedPhysicalFluxes(rightCellValues, Normals, params, RightFluxes); HuDotN = (hR*uR) * edgeNormals[0] + (hR*vR) * 
edgeNormals[1]; RightFluxes_H = HuDotN; diff --git a/sp/computeFluxes_sph.h b/sp/computeFluxes_sph.h index f5133be..13c07dd 100644 --- a/sp/computeFluxes_sph.h +++ b/sp/computeFluxes_sph.h @@ -19,18 +19,17 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, leftCellValues[2] = cellLeft[2]; leftCellValues[3] = cellLeft[3]; float dxl, dyl, dxr, dyr; - // Convert to arc length - dxl = (edgeCenters[0] - leftcellCenters[0])*Radius; - dyl = (edgeCenters[1] - leftcellCenters[1])*Radius; - dxr = (edgeCenters[0] - rightcellCenters[0])*Radius; - dyr = (edgeCenters[1] - rightcellCenters[1])*Radius; + dxl = (edgeCenters[0] - leftcellCenters[0]); + dyl = (edgeCenters[1] - leftcellCenters[1]); + dxr = (edgeCenters[0] - rightcellCenters[0]); + dyr = (edgeCenters[1] - rightcellCenters[1]); // Second order Reconstruction - // leftCellValues[0] += alphaleft[0] * ((dxl * leftGradient[0])+(dyl * leftGradient[1])); - // leftCellValues[1] += alphaleft[0] * ((dxl * leftGradient[2])+(dyl * leftGradient[3])); - // leftCellValues[2] += alphaleft[0] * ((dxl * leftGradient[4])+(dyl * leftGradient[5])); - // leftCellValues[3] += alphaleft[0] * ((dxl * leftGradient[6])+(dyl * leftGradient[7])); - if (leftCellValues[0] >= EPS){ + leftCellValues[0] += alphaleft[0] * ((dxl * leftGradient[0])+(dyl * leftGradient[1])); + leftCellValues[1] += alphaleft[0] * ((dxl * leftGradient[2])+(dyl * leftGradient[3])); + leftCellValues[2] += alphaleft[0] * ((dxl * leftGradient[4])+(dyl * leftGradient[5])); + leftCellValues[3] += alphaleft[0] * ((dxl * leftGradient[6])+(dyl * leftGradient[7])); + if (leftCellValues[0] >= 1e-3){ uL = leftCellValues[1]/leftCellValues[0]; vL = leftCellValues[2]/leftCellValues[0]; } else { @@ -38,7 +37,6 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, vL = 0.0f; } zL = cellLeft[3] - cellLeft[0]; - zR = cellRight[3] - cellRight[0]; if (!*isRightBoundary) { rightCellValues[0] = cellRight[0]; @@ -47,11 +45,11 @@ inline void 
computeFluxes_sph(const float *cellLeft, const float *cellRight, rightCellValues[3] = cellRight[3]; // Second order Reconstruction - // rightCellValues[0] += alpharight[0] * ((dxr * rightGradient[0])+(dyr * rightGradient[1])); - // rightCellValues[1] += alpharight[0] * ((dxr * rightGradient[2])+(dyr * rightGradient[3])); - // rightCellValues[2] += alpharight[0] * ((dxr * rightGradient[4])+(dyr * rightGradient[5])); - // rightCellValues[3] += alpharight[0] * ((dxr * rightGradient[6])+(dyr * rightGradient[7])); - if (rightCellValues[0] >= EPS){ + rightCellValues[0] += alpharight[0] * ((dxr * rightGradient[0])+(dyr * rightGradient[1])); + rightCellValues[1] += alpharight[0] * ((dxr * rightGradient[2])+(dyr * rightGradient[3])); + rightCellValues[2] += alpharight[0] * ((dxr * rightGradient[4])+(dyr * rightGradient[5])); + rightCellValues[3] += alpharight[0] * ((dxr * rightGradient[6])+(dyr * rightGradient[7])); + if (rightCellValues[0] >= 1e-3){ uR = rightCellValues[1]/rightCellValues[0]; vR = rightCellValues[2]/rightCellValues[0]; } else { @@ -59,40 +57,37 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, vR = 0.0f; } } else { - float nx = edgeNormals[0]; - float ny = edgeNormals[1]; - float inNormalVelocity = uL * nx + vL * ny; - float inTangentVelocity = -1.0f * uL * ny + vL * nx; - float outNormalVelocity = 0.0f; - float outTangentVelocity = 0.0f; + float nx = edgeNormals[0]; + float ny = edgeNormals[1]; + float inNormalVelocity = uL * nx + vL * ny; + float inTangentVelocity = -1.0f * uL * ny + vL * nx; + float outNormalVelocity = 0.0f; + float outTangentVelocity = 0.0f; + //rightCellValues[3] = leftCellValues[3]; + // Outflow + // float wet = (fabs(*zmin) - zL) > 0.0f ? 
(fabs(*zmin) - zL) : EPS; + // rightCellValues[3] = zL + wet; + // rightCellValues[0] = wet; + rightCellValues[3] = leftCellValues[3]; + rightCellValues[0] = leftCellValues[0]; + outNormalVelocity = inNormalVelocity; + outTangentVelocity = inTangentVelocity; + uR = outNormalVelocity * nx - outTangentVelocity * ny; + vR = outNormalVelocity * ny + outTangentVelocity * nx; - // Outflow - float wet = (fabs(*zmin) - zL) > 0.0f ? (fabs(*zmin) - zL) : EPS; - float critical = sqrt(uL*uL + vL*vL); - outTangentVelocity = inTangentVelocity; - if (critical < sqrt(g*leftCellValues[0])){ - rightCellValues[0] = wet; - rightCellValues[3] = wet + zL; - outNormalVelocity = 0.0f; - } else { - rightCellValues[0] = leftCellValues[0]; - rightCellValues[3] = leftCellValues[3]; - outNormalVelocity = inNormalVelocity; - } - /* - // Wall - rightCellValues[3] = leftCellValues[3]; - rightCellValues[0] = leftCellValues[0]; - outNormalVelocity = -1.0f*inNormalVelocity; - outTangentVelocity = inTangentVelocity; - */ - uR = outNormalVelocity * nx - outTangentVelocity * ny; - vR = outNormalVelocity * ny + outTangentVelocity * nx; - rightCellValues[1] = uR*rightCellValues[0]; - rightCellValues[2] = vR*rightCellValues[0]; - zR = zL; - } + // Wall + // rightCellValues[3] = leftCellValues[3]; + // rightCellValues[0] = leftCellValues[0]; + // outNormalVelocity = -1.0f*inNormalVelocity; + // outTangentVelocity = inTangentVelocity; + // uR = outNormalVelocity * nx - outTangentVelocity * ny; + // vR = outNormalVelocity * ny + outTangentVelocity * nx; + + rightCellValues[1] = uR*rightCellValues[0]; + rightCellValues[2] = vR*rightCellValues[0]; + } + zR = cellRight[3] - cellRight[0]; rightCellValues[3] -= rightCellValues[0]; leftCellValues[3] -= leftCellValues[0]; // ------------------------------------------------------------------------------------ @@ -109,11 +104,10 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, hR = hR > 0.0f ? 
hR : 0.0f; bathySource[0] -= .5f * g * (hL * hL); bathySource[1] -= .5f * g * (hR * hR); - // Audusse Reconstruction(2005) 2nd order centered term - // bathySource[2] = -.5f * g *(leftCellValues[0] + cellLeft[0])*(leftCellValues[3] - zL); - // bathySource[3] = -.5f * g *(rightCellValues[0] + cellRight[0])*(rightCellValues[3] - zR); - bathySource[2] = .5f * g * (hL * hL)*sin(M_PI*leftcellCenters[1]/180.0); - bathySource[3] = .5f * g * (hR * hR)*sin(M_PI*rightcellCenters[1]/180.0); + // Audusse Reconstruction(2005) 2nd order Centered term + bathySource[2] = -.5f * g *(leftCellValues[0] + cellLeft[0])*(leftCellValues[3] - zL); + bathySource[3] = -.5f * g *(rightCellValues[0] + cellRight[0])*(rightCellValues[3] - zR); + bathySource[0] *= *edgeLength; bathySource[1] *= *edgeLength; bathySource[2] *= *edgeLength; @@ -140,12 +134,14 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, sStar = (sL*hR*(uRn - sR) - sR*hL*(uLn - sL))/ (hR*(uRn - sR) - hL*(uLn - sL)); - if ((hL <= EPS) && (hR > EPS)) { + //if ((leftCellValues[0] <= EPS) && (rightCellValues[0] > EPS)) { + if ((leftCellValues[0] <= 1e-3) && (rightCellValues[0] > 1e-3)) { sL = uRn - 2.0f*cR; sR = uRn + cR; sStar = sL; } - if ((hR <= EPS) && (hL > EPS)) { + //if ((rightCellValues[0] <= EPS) && (leftCellValues[0] > EPS)) { + if ((rightCellValues[0] <= 1e-3) && (leftCellValues[0] > 1e-3)) { sR = uLn + 2.0f*cL; sL = uLn - cL; sStar = sR; @@ -156,29 +152,35 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, float uRp = vR*edgeNormals[0] - uR*edgeNormals[1]; float LeftFluxes_H, LeftFluxes_U, LeftFluxes_V, LeftFluxes_N; - // float HuDotN = (hL*uL) * edgeNormals[0] + (hL*vL) * edgeNormals[1]; - float HuDotN = (hL*uL) * edgeNormals[0] + (hL*vL*cos(M_PI*edgeCenters[1]/180.0)) * edgeNormals[1]; + //inlined ProjectedPhysicalFluxes(leftCellValues, Normals, params, LeftFluxes); + float HuDotN = (leftCellValues[1]/cos(M_PI*leftcellCenters[1]/180.0)) * edgeNormals[0] + + 
(leftCellValues[2]) * edgeNormals[1]; + LeftFluxes_H = HuDotN; LeftFluxes_U = HuDotN * uL; LeftFluxes_V = HuDotN * vL; // Normal Momentum flux term LeftFluxes_N = HuDotN * uLn; - LeftFluxes_U += (.5f * g * edgeNormals[0] ) * ( hL * hL); - LeftFluxes_V += (.5f * g * edgeNormals[1] ) * ( hL * hL* cos(M_PI*edgeCenters[1]/180.0)); + LeftFluxes_U += ((.5f * g * edgeNormals[0] ) * ( hL * hL ))/(cos(M_PI*leftcellCenters[1]/180.0)); + LeftFluxes_V += (.5f * g * edgeNormals[1] ) * ( hL * hL ); LeftFluxes_N += (.5f * g ) * ( hL * hL ); + //end of inlined float RightFluxes_H, RightFluxes_U, RightFluxes_V, RightFluxes_N; - HuDotN = (hR*uR) * edgeNormals[0] + (hR*vR*cos(M_PI*edgeCenters[1]/180.0)) * edgeNormals[1]; - // HuDotN = (hR*uR) * edgeNormals[0] + (hR*vR) * edgeNormals[1]; + //inlined ProjectedPhysicalFluxes(rightCellValues, Normals, params, RightFluxes); + HuDotN = (rightCellValues[1]/cos(M_PI*rightcellCenters[1]/180.0)) * edgeNormals[0] + + (rightCellValues[2]) * edgeNormals[1]; + RightFluxes_H = HuDotN; RightFluxes_U = HuDotN * uR; RightFluxes_V = HuDotN * vR; // Normal Momentum flux term RightFluxes_N = HuDotN * uRn; - RightFluxes_U += (.5f * g * edgeNormals[0] ) * ( hR * hR); - RightFluxes_V += (.5f * g * edgeNormals[1] ) * ( hR * hR* cos(M_PI*edgeCenters[1]/180.0)); - RightFluxes_N += (.5f * g ) * ( hR * hR ); + + RightFluxes_U += ((.5f * g * edgeNormals[0] ) * ( hR * hR ))/(cos(M_PI*rightcellCenters[1]/180.0)); + RightFluxes_V += (.5f * g * edgeNormals[1] ) * ( hR * hR ); + // RightFluxes_N += (.5f * g ) * ( hR * hR ); // ------------------------------------------------------------------------ // HLLC Flux Solver (Batten et al. 
1997) // "On the choice of wavespeeds for the HLLC Reimann solver" @@ -210,7 +212,7 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, //------------------------------------------------------------------------ // HLL Flux Solver // ------------------------------------------------------------------------ - float sLMinus = sL < 0.0f ? sL : 0.0f; + /*float sLMinus = sL < 0.0f ? sL : 0.0f; float sRPlus = sR > 0.0f ? sR : 0.0f; float sRMinussL = sRPlus - sLMinus; sRMinussL = sRMinussL < EPS ? EPS : sRMinussL; @@ -221,65 +223,67 @@ inline void computeFluxes_sph(const float *cellLeft, const float *cellRight, ( t1 * LeftFluxes_H ) + ( t2 * RightFluxes_H ) + ( t3 * ( hR - hL ) ); - // op_printf("out %f \n", out[0]); + out[1] = ( t1 * LeftFluxes_U ) + ( t2 * RightFluxes_U ) + - ( t3 * ( (hR*uR) - (hL*uL) ) ); + ( t3 * ( (rightCellValues[1]) - + (leftCellValues[1]) ) ); out[2] = ( t1 * LeftFluxes_V ) + ( t2 * RightFluxes_V ) + - ( t3 * ( (hR*vR) - (hL*vL) ) ); - + ( t3 * ( (rightCellValues[2]) - + (leftCellValues[2]) ) ); + */ // ------------------------------------------------------------------------ // HLLC Flux Solver (Huang et al. 2013) // "Well-balanced finite volume scheme for shallow water flooding and drying // over arbitrary topography" // ------------------------------------------------------------------------ - // float sLMinus = sL < 0.0f ? sL : 0.0f; - // float sRPlus = sR > 0.0f ? sR : 0.0f; - // float sRMinussL = sRPlus - sLMinus; - // sRMinussL = sRMinussL < EPS ? 
EPS : sRMinussL; - // float t1 = sRPlus / sRMinussL; - // float t2 = ( -1.0 * sLMinus ) / sRMinussL; - // float t3 = ( sRPlus * sLMinus ) / sRMinussL; - // float FStar[3]; - // FStar[0] = - // ( t1 * LeftFluxes_H ) + - // ( t2 * RightFluxes_H ) + - // ( t3 * ( hR - hL ) ); - // - // - // FStar[1] = - // ( t1 * LeftFluxes_N ) + - // ( t2 * RightFluxes_N ) + - // ( t3 * ( (hR * uRn) - - // (hL * uLn) ) ); - // - // if( sL >= 0.0f) { - // out[0] = t1*LeftFluxes_H; - // out[1] = t1*LeftFluxes_U; - // out[2] = t1*LeftFluxes_V; - // } else if ((sL < 0.0f) && (sStar >= 0.0f)){ - // out[0] = FStar[0]; - // FStar[2] = FStar[0] * uLp; - // out[1] = FStar[1]*edgeNormals[0] - FStar[2]*edgeNormals[1]; - // out[2] = FStar[1]*edgeNormals[1] + FStar[2]*edgeNormals[0]; - // } else if((sStar < 0.0f) && (sR >= 0.0f)){ - // out[0] = FStar[0]; - // FStar[2] = FStar[0] * uRp; - // out[1] = FStar[1]*edgeNormals[0] - FStar[2]*edgeNormals[1]; - // out[2] = FStar[1]*edgeNormals[1] + FStar[2]*edgeNormals[0]; - // } else { - // out[0] = t2*RightFluxes_H; - // out[1] = t2*RightFluxes_U; - // out[2] = t2*RightFluxes_V; - // } + float sLMinus = sL < 0.0f ? sL : 0.0f; + float sRPlus = sR > 0.0f ? sR : 0.0f; + float sRMinussL = sRPlus - sLMinus; + sRMinussL = sRMinussL < EPS ? 
EPS : sRMinussL; + float t1 = sRPlus / sRMinussL; + float t2 = ( -1.0 * sLMinus ) / sRMinussL; + float t3 = ( sRPlus * sLMinus ) / sRMinussL; + float FStar[3]; + FStar[0] = + ( t1 * LeftFluxes_H ) + + ( t2 * RightFluxes_H ) + + ( t3 * ( hR - hL ) ); + + + FStar[1] = + ( t1 * LeftFluxes_N ) + + ( t2 * RightFluxes_N ) + + ( t3 * ( (hR * uRn) - + (hL * uLn) ) ); + + if( sL >= 0.0f) { + out[0] = t1*LeftFluxes_H; + out[1] = t1*LeftFluxes_U; + out[2] = t1*LeftFluxes_V; + } else if ((sL < 0.0f) && (sStar >= 0.0f)){ + out[0] = FStar[0]; + FStar[2] = FStar[0] * uLp; + out[1] = FStar[1]*edgeNormals[0] - FStar[2]*edgeNormals[1]; + out[2] = FStar[1]*edgeNormals[1] + FStar[2]*edgeNormals[0]; + } else if((sStar < 0.0f) && (sR >= 0.0f)){ + out[0] = FStar[0]; + FStar[2] = FStar[0] * uRp; + out[1] = FStar[1]*edgeNormals[0] - FStar[2]*edgeNormals[1]; + out[2] = FStar[1]*edgeNormals[1] + FStar[2]*edgeNormals[0]; + } else { + out[0] = t2*RightFluxes_H; + out[1] = t2*RightFluxes_U; + out[2] = t2*RightFluxes_V; + } out[0] *= *edgeLength; out[1] *= *edgeLength; out[2] *= *edgeLength; - + // op_printf("Length %f \n", *edgeLength); float maximum = fabs(uLn + cL); maximum = maximum > fabs(uLn - cL) ? maximum : fabs(uLn - cL); maximum = maximum > fabs(uRn + cR) ? 
maximum : fabs(uRn + cR); diff --git a/sp/cuda/EvolveValuesRK2_1_kernel.cu b/sp/cuda/EvolveValuesRK2_1_kernel.cu index 5cc23d8..6f3cb62 100644 --- a/sp/cuda/EvolveValuesRK2_1_kernel.cu +++ b/sp/cuda/EvolveValuesRK2_1_kernel.cu @@ -3,42 +3,26 @@ // //user function -__device__ -inline void EvolveValuesRK2_1_gpu(const float *dT, float *midPointConservative, //OP_RW //temp - const float *in, //OP_READ - float *inConservative, //OP_WRITE //temp - float *midPoint) //OP_WRITE -{ - midPointConservative[0] *= *dT; - midPointConservative[1] *= *dT; - midPointConservative[2] *= *dT; - - //call to ToConservativeVariables inlined - inConservative[0] = in[0]; - inConservative[1] = in[0] * (in[1]); - inConservative[2] = in[0] * (in[2]); - inConservative[3] = in[3]; - - midPointConservative[0] += inConservative[0]; - midPointConservative[1] += inConservative[1]; - midPointConservative[2] += inConservative[2]; - midPointConservative[3] += inConservative[3]; - - //call to ToPhysicalVariables inlined - float TruncatedH = midPointConservative[0] < EPS ? EPS : midPointConservative[0]; - midPoint[0] = midPointConservative[0]; - midPoint[1] = midPointConservative[1] / TruncatedH; - midPoint[2] = midPointConservative[2] / TruncatedH; - midPoint[3] = midPointConservative[3]; +__device__ void EvolveValuesRK2_1_gpu( const float *dT, const float *Lw_n, + const float *in, + float *out) { + out[0] = Lw_n[0] * *dT + in[0]; + out[1] = Lw_n[1] * *dT + in[1]; + out[2] = Lw_n[2] * *dT + in[2]; + out[3] = in[3]-in[0]; + + float TruncatedH = out[0] < EPS_cuda ? 
EPS_cuda : out[0]; + out[0] = TruncatedH; + out[3] += TruncatedH; + } // CUDA kernel function __global__ void op_cuda_EvolveValuesRK2_1( const float *arg0, - float *arg1, + const float *__restrict arg1, const float *__restrict arg2, float *arg3, - float *arg4, int set_size ) { @@ -49,29 +33,26 @@ __global__ void op_cuda_EvolveValuesRK2_1( EvolveValuesRK2_1_gpu(arg0, arg1+n*4, arg2+n*4, - arg3+n*4, - arg4+n*4); + arg3+n*4); } } -//GPU host stub function -void op_par_loop_EvolveValuesRK2_1_gpu(char const *name, op_set set, +//host stub function +void op_par_loop_EvolveValuesRK2_1(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, - op_arg arg3, - op_arg arg4){ + op_arg arg3){ float*arg0h = (float *)arg0.data; - int nargs = 5; - op_arg args[5]; + int nargs = 4; + op_arg args[4]; args[0] = arg0; args[1] = arg1; args[2] = arg2; args[3] = arg3; - args[4] = arg4; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; @@ -79,15 +60,14 @@ void op_par_loop_EvolveValuesRK2_1_gpu(char const *name, op_set set, op_timers_core(&cpu_t1, &wall_t1); OP_kernels[0].name = name; OP_kernels[0].count += 1; - if (OP_kernels[0].count==1) op_register_strides(); if (OP_diags>2) { printf(" kernel routine w/o indirection: EvolveValuesRK2_1"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; @@ -107,7 +87,6 @@ void op_par_loop_EvolveValuesRK2_1_gpu(char const *name, op_set set, int nthread = OP_BLOCK_SIZE_0; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -117,69 +96,16 @@ void op_par_loop_EvolveValuesRK2_1_gpu(char const *name, op_set set, (float *) arg1.data_d, (float *) arg2.data_d, (float *) arg3.data_d, - (float *) arg4.data_d, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + 
cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[0].time += wall_t2 - wall_t1; - OP_kernels[0].transfer += (float)set->size * arg1.size * 2.0f; + OP_kernels[0].transfer += (float)set->size * arg1.size; OP_kernels[0].transfer += (float)set->size * arg2.size; - OP_kernels[0].transfer += (float)set->size * arg3.size; - OP_kernels[0].transfer += (float)set->size * arg4.size; + OP_kernels[0].transfer += (float)set->size * arg3.size * 2.0f; } - -void op_par_loop_EvolveValuesRK2_1_cpu(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2, - op_arg arg3, - op_arg arg4); - - -//GPU host stub function -#if OP_HYBRID_GPU -void op_par_loop_EvolveValuesRK2_1(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2, - op_arg arg3, - op_arg arg4){ - - if (OP_hybrid_gpu) { - op_par_loop_EvolveValuesRK2_1_gpu(name, set, - arg0, - arg1, - arg2, - arg3, - arg4); - - }else{ - op_par_loop_EvolveValuesRK2_1_cpu(name, set, - arg0, - arg1, - arg2, - arg3, - arg4); - - } -} -#else -void op_par_loop_EvolveValuesRK2_1(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2, - op_arg arg3, - op_arg arg4){ - - op_par_loop_EvolveValuesRK2_1_gpu(name, set, - arg0, - arg1, - arg2, - arg3, - arg4); - - } -#endif //OP_HYBRID_GPU diff --git a/sp/cuda/EvolveValuesRK2_2_kernel.cu b/sp/cuda/EvolveValuesRK2_2_kernel.cu index a2cdfcd..f0a89d9 100644 --- a/sp/cuda/EvolveValuesRK2_2_kernel.cu +++ b/sp/cuda/EvolveValuesRK2_2_kernel.cu @@ -3,32 +3,24 @@ // //user function -__device__ -inline void EvolveValuesRK2_2_gpu(const float *dT, float *outConservative, //OP_RW, discard - const float *inConservative, //OP_READ, discard - const float *midPointConservative, //OP_READ, discard - float *out) //OP_WRITE - -{ - outConservative[0] = 0.5*(outConservative[0] * *dT + midPointConservative[0] + inConservative[0]); - outConservative[1] = 0.5*(outConservative[1] * *dT + midPointConservative[1] + 
inConservative[1]); - outConservative[2] = 0.5*(outConservative[2] * *dT + midPointConservative[2] + inConservative[2]); - - outConservative[0] = outConservative[0] <= EPS ? EPS : outConservative[0]; - outConservative[3] = inConservative[3]; - - //call to ToPhysicalVariables inlined - float TruncatedH = outConservative[0] < EPS ? EPS : outConservative[0]; - out[0] = outConservative[0]; - out[1] = outConservative[1] / TruncatedH; - out[2] = outConservative[2] / TruncatedH; - out[3] = outConservative[3]; +__device__ void EvolveValuesRK2_2_gpu( const float *dT,const float *Lw_1, + const float *values, + const float *w_1, + float *out) { + out[0] = 0.5*(Lw_1[0] * *dT + w_1[0] + values[0]); + out[1] = 0.5*(Lw_1[1] * *dT + w_1[1] + values[1]); + out[2] = 0.5*(Lw_1[2] * *dT + w_1[2] + values[2]); + out[3] = values[3]-values[0]; + float TruncatedH = out[0] < EPS_cuda ? EPS_cuda : out[0]; + out[0] = TruncatedH; + out[3] += TruncatedH; + } // CUDA kernel function __global__ void op_cuda_EvolveValuesRK2_2( const float *arg0, - float *arg1, + const float *__restrict arg1, const float *__restrict arg2, const float *__restrict arg3, float *arg4, @@ -48,8 +40,8 @@ __global__ void op_cuda_EvolveValuesRK2_2( } -//GPU host stub function -void op_par_loop_EvolveValuesRK2_2_gpu(char const *name, op_set set, +//host stub function +void op_par_loop_EvolveValuesRK2_2(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, @@ -72,15 +64,14 @@ void op_par_loop_EvolveValuesRK2_2_gpu(char const *name, op_set set, op_timers_core(&cpu_t1, &wall_t1); OP_kernels[1].name = name; OP_kernels[1].count += 1; - if (OP_kernels[1].count==1) op_register_strides(); if (OP_diags>2) { printf(" kernel routine w/o indirection: EvolveValuesRK2_2"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; @@ -100,7 +91,6 @@ void 
op_par_loop_EvolveValuesRK2_2_gpu(char const *name, op_set set, int nthread = OP_BLOCK_SIZE_1; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -114,65 +104,14 @@ void op_par_loop_EvolveValuesRK2_2_gpu(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[1].time += wall_t2 - wall_t1; - OP_kernels[1].transfer += (float)set->size * arg1.size * 2.0f; + OP_kernels[1].transfer += (float)set->size * arg1.size; OP_kernels[1].transfer += (float)set->size * arg2.size; OP_kernels[1].transfer += (float)set->size * arg3.size; - OP_kernels[1].transfer += (float)set->size * arg4.size; + OP_kernels[1].transfer += (float)set->size * arg4.size * 2.0f; } - -void op_par_loop_EvolveValuesRK2_2_cpu(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2, - op_arg arg3, - op_arg arg4); - - -//GPU host stub function -#if OP_HYBRID_GPU -void op_par_loop_EvolveValuesRK2_2(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2, - op_arg arg3, - op_arg arg4){ - - if (OP_hybrid_gpu) { - op_par_loop_EvolveValuesRK2_2_gpu(name, set, - arg0, - arg1, - arg2, - arg3, - arg4); - - }else{ - op_par_loop_EvolveValuesRK2_2_cpu(name, set, - arg0, - arg1, - arg2, - arg3, - arg4); - - } -} -#else -void op_par_loop_EvolveValuesRK2_2(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2, - op_arg arg3, - op_arg arg4){ - - op_par_loop_EvolveValuesRK2_2_gpu(name, set, - arg0, - arg1, - arg2, - arg3, - arg4); - - } -#endif //OP_HYBRID_GPU diff --git a/sp/cuda/Friction_manning_kernel.cu b/sp/cuda/Friction_manning_kernel.cu new file mode 100644 index 0000000..9f7f9f8 --- /dev/null +++ b/sp/cuda/Friction_manning_kernel.cu @@ -0,0 +1,125 @@ +// +// auto-generated by op2.py +// + +//user function +__device__ void 
Friction_manning_gpu( const float *dT,const float *M_n, + float *values) { + float Fr; + float TruncatedH = values[0] < EPS_cuda ? EPS_cuda : values[0]; + float u = values[1]/TruncatedH; + float v = values[2]/TruncatedH; + float speed = sqrt(u*u + v*v); + Fr = g_cuda* (*M_n * *M_n) *speed; + Fr = Fr/(pow(TruncatedH,4.0f/3.0f)); + + + + values[0] = TruncatedH; + values[3] = values[3]; + if (values[0] <= EPS_cuda){ + values[1] = 0.0f; + values[2] = 0.0f; + } else if (EPS_cuda < values[0] <= 50.0f) { + values[1] = values[1] / (1.0f + Fr * *dT); + values[2] = values[2] / (1.0f + Fr * *dT); + } else { + values[1] = values[1]; + values[2] = values[2]; + } + +} + +// CUDA kernel function +__global__ void op_cuda_Friction_manning( + const float *arg0, + const float *arg1, + float *arg2, + int set_size ) { + + + //process set elements + for ( int n=threadIdx.x+blockIdx.x*blockDim.x; n2) { + printf(" kernel routine w/o indirection: Friction_manning"); + } + + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { + + //transfer constants to GPU + int consts_bytes = 0; + consts_bytes += ROUND_UP(1*sizeof(float)); + consts_bytes += ROUND_UP(1*sizeof(float)); + reallocConstArrays(consts_bytes); + consts_bytes = 0; + arg0.data = OP_consts_h + consts_bytes; + arg0.data_d = OP_consts_d + consts_bytes; + for ( int d=0; d<1; d++ ){ + ((float *)arg0.data)[d] = arg0h[d]; + } + consts_bytes += ROUND_UP(1*sizeof(float)); + arg1.data = OP_consts_h + consts_bytes; + arg1.data_d = OP_consts_d + consts_bytes; + for ( int d=0; d<1; d++ ){ + ((float *)arg1.data)[d] = arg1h[d]; + } + consts_bytes += ROUND_UP(1*sizeof(float)); + mvConstArraysToDevice(consts_bytes); + + //set CUDA execution parameters + #ifdef OP_BLOCK_SIZE_2 + int nthread = OP_BLOCK_SIZE_2; + #else + int nthread = OP_block_size; + #endif + + int nblocks = 200; + + op_cuda_Friction_manning<<>>( + (float *) arg0.data_d, + (float *) arg1.data_d, + (float *) arg2.data_d, + set->size ); + } + 
op_mpi_set_dirtybit_cuda(nargs, args); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } + //update kernel record + op_timers_core(&cpu_t2, &wall_t2); + OP_kernels[2].time += wall_t2 - wall_t1; + OP_kernels[2].transfer += (float)set->size * arg2.size * 2.0f; +} diff --git a/sp/cuda/NumericalFluxes_kernel.cu b/sp/cuda/NumericalFluxes_kernel.cu index a93ef4a..41f2d39 100644 --- a/sp/cuda/NumericalFluxes_kernel.cu +++ b/sp/cuda/NumericalFluxes_kernel.cu @@ -3,34 +3,48 @@ // //user function -__device__ void NumericalFluxes_gpu( const float *maxEdgeEigenvalues0, - const float *maxEdgeEigenvalues1, - const float *maxEdgeEigenvalues2, - const float *EdgeVolumes0, - const float *EdgeVolumes1, - const float *EdgeVolumes2, - const float *cellVolumes, - float *zeroInit, float *minTimeStep ) { - float local = 0.0f; - local += *maxEdgeEigenvalues0 * *(EdgeVolumes0); - local += *maxEdgeEigenvalues1 * *(EdgeVolumes1); - local += *maxEdgeEigenvalues2 * *(EdgeVolumes2); - zeroInit[0] = 0.0f; - zeroInit[1] = 0.0f; - zeroInit[2] = 0.0f; - zeroInit[3] = 0.0f; - - *minTimeStep = MIN(*minTimeStep, 2.0f * *cellVolumes / local); +__device__ void NumericalFluxes_gpu( float *left, + float *right, + const float *edgeFluxes, + const float *bathySource, + const float *edgeNormals, const int *isRightBoundary, + const float *cellVolumes0, + const float *cellVolumes1) { + left[0] -= (edgeFluxes[0])/cellVolumes0[0]; + left[1] -= (edgeFluxes[1] + bathySource[0] * edgeNormals[0])/cellVolumes0[0]; + left[2] -= (edgeFluxes[2] + bathySource[0] * edgeNormals[1])/cellVolumes0[0]; + + left[1] += (bathySource[2] *edgeNormals[0])/cellVolumes0[0]; + left[2] += (bathySource[2] *edgeNormals[1])/cellVolumes0[0]; + if (!*isRightBoundary) { + right[0] += edgeFluxes[0]/cellVolumes1[0]; + right[1] += (edgeFluxes[1] + bathySource[1] * edgeNormals[0])/cellVolumes1[0]; + right[2] += (edgeFluxes[2] + bathySource[1] * edgeNormals[1])/cellVolumes1[0]; + + right[1] -= (bathySource[3] 
*edgeNormals[0])/cellVolumes1[0]; + right[2] -= (bathySource[3] *edgeNormals[1])/cellVolumes1[0]; + } else { + right[0] += 0.0f; + right[1] += 0.0f; + right[2] += 0.0f; + } + + } // CUDA kernel function __global__ void op_cuda_NumericalFluxes( - const float *__restrict ind_arg0, + float *__restrict ind_arg0, const float *__restrict ind_arg1, const int *__restrict opDat0Map, - const float *__restrict arg6, - float *arg7, - float *arg8, + const float *__restrict arg2, + const float *__restrict arg3, + const float *__restrict arg4, + const int *__restrict arg5, + int *ind_map, + short *arg_map, + int *ind_arg_sizes, + int *ind_arg_offs, int block_offset, int *blkmap, int *offset, @@ -39,11 +53,13 @@ __global__ void op_cuda_NumericalFluxes( int *colors, int nblocks, int set_size) { - float arg8_l[1]; - for ( int d=0; d<1; d++ ){ - arg8_l[d]=arg8[d+blockIdx.x*1]; - } + float arg0_l[4]; + float arg1_l[4]; + __shared__ int *ind_arg0_map, ind_arg0_size; + __shared__ float *ind_arg0_s; + + __shared__ int nelems2, ncolor; __shared__ int nelem, offset_b; extern __shared__ char shared[]; @@ -60,34 +76,86 @@ __global__ void op_cuda_NumericalFluxes( nelem = nelems[blockId]; offset_b = offset[blockId]; + nelems2 = blockDim.x*(1+(nelem-1)/blockDim.x); + ncolor = ncolors[blockId]; + + ind_arg0_size = ind_arg_sizes[0+blockId*1]; + + ind_arg0_map = &ind_map[0*set_size] + ind_arg_offs[0+blockId*1]; + + //set shared memory pointers + int nbytes = 0; + ind_arg0_s = (float *) &shared[nbytes]; } __syncthreads(); // make sure all of above completed - for ( int n=threadIdx.x; n=0) { + arg0_map = arg_map[0*set_size+n+offset_b]; + arg1_map = arg_map[1*set_size+n+offset_b]; + } - for ( int d=0; d<1; d++ ){ - op_reduction(&arg8[d+blockIdx.x*1],arg8_l[d]); + for ( int col=0; col2) { printf(" kernel routine with indirection: NumericalFluxes\n"); } //get plan - #ifdef OP_PART_SIZE_24 - int part_size = OP_PART_SIZE_24; + #ifdef OP_PART_SIZE_26 + int part_size = OP_PART_SIZE_26; #else int part_size 
= OP_part_size; #endif - int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { - op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds); - - //transfer global reduction data to GPU - int maxblocks = 0; - for ( int col=0; colncolors; col++ ){ - maxblocks = MAX(maxblocks,Plan->ncolblk[col]); - } - int reduct_bytes = 0; - int reduct_size = 0; - reduct_bytes += ROUND_UP(maxblocks*1*sizeof(float)); - reduct_size = MAX(reduct_size,sizeof(float)); - reallocReductArrays(reduct_bytes); - reduct_bytes = 0; - arg8.data = OP_reduct_h + reduct_bytes; - arg8.data_d = OP_reduct_d + reduct_bytes; - for ( int b=0; bncolors; col++ ){ if (col==Plan->ncolors_core) { - op_mpi_wait_all_cuda(nargs, args); + op_mpi_wait_all_grouped(nargs, args, 2); } - #ifdef OP_BLOCK_SIZE_24 - int nthread = OP_BLOCK_SIZE_24; + #ifdef OP_BLOCK_SIZE_26 + int nthread = OP_BLOCK_SIZE_26; #else int nthread = OP_block_size; #endif @@ -182,14 +226,19 @@ void op_par_loop_NumericalFluxes(char const *name, op_set set, dim3 nblocks = dim3(Plan->ncolblk[col] >= (1<<16) ? 65535 : Plan->ncolblk[col], Plan->ncolblk[col] >= (1<<16) ? 
(Plan->ncolblk[col]-1)/65535+1: 1, 1); if (Plan->ncolblk[col] > 0) { - int nshared = reduct_size*nthread; + int nshared = Plan->nsharedCol[col]; op_cuda_NumericalFluxes<<>>( (float *)arg0.data_d, - (float *)arg3.data_d, + (float *)arg6.data_d, arg0.map_data_d, - (float*)arg6.data_d, - (float*)arg7.data_d, - (float*)arg8.data_d, + (float*)arg2.data_d, + (float*)arg3.data_d, + (float*)arg4.data_d, + (int*)arg5.data_d, + Plan->ind_map, + Plan->loc_map, + Plan->ind_sizes, + Plan->ind_offs, block_offset, Plan->blkmap, Plan->offset, @@ -199,26 +248,17 @@ void op_par_loop_NumericalFluxes(char const *name, op_set set, Plan->ncolblk[col], set->size+set->exec_size); - //transfer global reduction data back to CPU - if (col == Plan->ncolors_owned-1) { - mvReductArraysToHost(reduct_bytes); - } } block_offset += Plan->ncolblk[col]; } - OP_kernels[24].transfer += Plan->transfer; - OP_kernels[24].transfer2 += Plan->transfer2; - for ( int b=0; btransfer; + OP_kernels[26].transfer2 += Plan->transfer2; } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[24].time += wall_t2 - wall_t1; + OP_kernels[26].time += wall_t2 - wall_t1; } diff --git a/sp/cuda/Timestep_kernel.cu b/sp/cuda/Timestep_kernel.cu new file mode 100644 index 0000000..6675f74 --- /dev/null +++ b/sp/cuda/Timestep_kernel.cu @@ -0,0 +1,218 @@ +// +// auto-generated by op2.py +// + +//user function +__device__ void Timestep_gpu( const float *maxEdgeEigenvalues0, + const float *maxEdgeEigenvalues1, + const float *maxEdgeEigenvalues2, + const float *EdgeVolumes0, + const float *EdgeVolumes1, + const float *EdgeVolumes2, + const float *cellVolumes, + float *minTimeStep ) { + float local = 0.0f; + local += *maxEdgeEigenvalues0 * *(EdgeVolumes0); + local += *maxEdgeEigenvalues1 * *(EdgeVolumes1); + local += *maxEdgeEigenvalues2 * *(EdgeVolumes2); + *minTimeStep = 
MIN(*minTimeStep, 2.0f * *cellVolumes / local); + +} + +// CUDA kernel function +__global__ void op_cuda_Timestep( + const float *__restrict ind_arg0, + const float *__restrict ind_arg1, + const int *__restrict opDat0Map, + const float *__restrict arg6, + float *arg7, + int block_offset, + int *blkmap, + int *offset, + int *nelems, + int *ncolors, + int *colors, + int nblocks, + int set_size) { + float arg7_l[1]; + for ( int d=0; d<1; d++ ){ + arg7_l[d]=arg7[d+blockIdx.x*1]; + } + + + __shared__ int nelem, offset_b; + + extern __shared__ char shared[]; + + if (blockIdx.x+blockIdx.y*gridDim.x >= nblocks) { + return; + } + if (threadIdx.x==0) { + + //get sizes and shift pointers and direct-mapped data + + int blockId = blkmap[blockIdx.x + blockIdx.y*gridDim.x + block_offset]; + + nelem = nelems[blockId]; + offset_b = offset[blockId]; + + } + __syncthreads(); // make sure all of above completed + + for ( int n=threadIdx.x; n(&arg7[d+blockIdx.x*1],arg7_l[d]); + } +} + + +//host stub function +void op_par_loop_Timestep(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3, + op_arg arg4, + op_arg arg5, + op_arg arg6, + op_arg arg7){ + + float*arg7h = (float *)arg7.data; + int nargs = 8; + op_arg args[8]; + + args[0] = arg0; + args[1] = arg1; + args[2] = arg2; + args[3] = arg3; + args[4] = arg4; + args[5] = arg5; + args[6] = arg6; + args[7] = arg7; + + // initialise timers + double cpu_t1, cpu_t2, wall_t1, wall_t2; + op_timing_realloc(25); + op_timers_core(&cpu_t1, &wall_t1); + OP_kernels[25].name = name; + OP_kernels[25].count += 1; + + + int ninds = 2; + int inds[8] = {0,0,0,1,1,1,-1,-1}; + + if (OP_diags>2) { + printf(" kernel routine with indirection: Timestep\n"); + } + + //get plan + #ifdef OP_PART_SIZE_25 + int part_size = OP_PART_SIZE_25; + #else + int part_size = OP_part_size; + #endif + + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { + + op_plan *Plan = 
op_plan_get(name,set,part_size,nargs,args,ninds,inds); + + //transfer global reduction data to GPU + int maxblocks = 0; + for ( int col=0; colncolors; col++ ){ + maxblocks = MAX(maxblocks,Plan->ncolblk[col]); + } + int reduct_bytes = 0; + int reduct_size = 0; + reduct_bytes += ROUND_UP(maxblocks*1*sizeof(float)); + reduct_size = MAX(reduct_size,sizeof(float)); + reallocReductArrays(reduct_bytes); + reduct_bytes = 0; + arg7.data = OP_reduct_h + reduct_bytes; + arg7.data_d = OP_reduct_d + reduct_bytes; + for ( int b=0; bncolors; col++ ){ + if (col==Plan->ncolors_core) { + op_mpi_wait_all_grouped(nargs, args, 2); + } + #ifdef OP_BLOCK_SIZE_25 + int nthread = OP_BLOCK_SIZE_25; + #else + int nthread = OP_block_size; + #endif + + dim3 nblocks = dim3(Plan->ncolblk[col] >= (1<<16) ? 65535 : Plan->ncolblk[col], + Plan->ncolblk[col] >= (1<<16) ? (Plan->ncolblk[col]-1)/65535+1: 1, 1); + if (Plan->ncolblk[col] > 0) { + int nshared = MAX(Plan->nshared,reduct_size*nthread); + op_cuda_Timestep<<>>( + (float *)arg0.data_d, + (float *)arg3.data_d, + arg0.map_data_d, + (float*)arg6.data_d, + (float*)arg7.data_d, + block_offset, + Plan->blkmap, + Plan->offset, + Plan->nelems, + Plan->nthrcol, + Plan->thrcol, + Plan->ncolblk[col], + set->size+set->exec_size); + + //transfer global reduction data back to CPU + if (col == Plan->ncolors_owned-1) { + mvReductArraysToHost(reduct_bytes); + } + } + block_offset += Plan->ncolblk[col]; + } + OP_kernels[25].transfer += Plan->transfer; + OP_kernels[25].transfer2 += Plan->transfer2; + for ( int b=0; b1) { + cutilSafeCall(cudaDeviceSynchronize()); + } + //update kernel record + op_timers_core(&cpu_t2, &wall_t2); + OP_kernels[25].time += wall_t2 - wall_t1; +} diff --git a/sp/cuda/applyConst_kernel.cu b/sp/cuda/applyConst_kernel.cu index 8acabea..eccfb93 100644 --- a/sp/cuda/applyConst_kernel.cu +++ b/sp/cuda/applyConst_kernel.cu @@ -17,9 +17,10 @@ __device__ void applyConst_gpu( const float *in, float *out, const int *variable } if (*variables & 8) 
{ - out[3] = *in; + *out = *in; } + } // CUDA kernel function @@ -35,7 +36,7 @@ __global__ void op_cuda_applyConst( //user-supplied kernel call applyConst_gpu(arg0+n*1, - arg1+n*4, + arg1+n*1, arg2); } } @@ -57,18 +58,18 @@ void op_par_loop_applyConst(char const *name, op_set set, // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(10); + op_timing_realloc(8); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[10].name = name; - OP_kernels[10].count += 1; + OP_kernels[8].name = name; + OP_kernels[8].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: applyConst"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; @@ -84,11 +85,10 @@ void op_par_loop_applyConst(char const *name, op_set set, mvConstArraysToDevice(consts_bytes); //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_10 - int nthread = OP_BLOCK_SIZE_10; + #ifdef OP_BLOCK_SIZE_8 + int nthread = OP_BLOCK_SIZE_8; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -100,10 +100,12 @@ void op_par_loop_applyConst(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[10].time += wall_t2 - wall_t1; - OP_kernels[10].transfer += (float)set->size * arg0.size; - OP_kernels[10].transfer += (float)set->size * arg1.size * 2.0f; + OP_kernels[8].time += wall_t2 - wall_t1; + OP_kernels[8].transfer += (float)set->size * arg0.size; + OP_kernels[8].transfer += (float)set->size * arg1.size * 2.0f; } diff --git a/sp/cuda/computeFluxes_kernel.cu b/sp/cuda/computeFluxes_kernel.cu index fb203d9..983eace 100644 --- a/sp/cuda/computeFluxes_kernel.cu +++ 
b/sp/cuda/computeFluxes_kernel.cu @@ -11,11 +11,13 @@ __device__ void computeFluxes_gpu( const float *cellLeft, const float *cellRight const float *leftGradient, const float *rightGradient, const int *isRightBoundary, float *bathySource, float *out, - float *maxEdgeEigenvalues) { + float *maxEdgeEigenvalues, const float *zmin) { float leftCellValues[4]; float rightCellValues[4]; float InterfaceBathy; + float zL, zR; + float uR,vR,uL,vL; leftCellValues[0] = cellLeft[0]; leftCellValues[1] = cellLeft[1]; leftCellValues[2] = cellLeft[2]; @@ -23,65 +25,86 @@ __device__ void computeFluxes_gpu( const float *cellLeft, const float *cellRight float dxl, dyl, dxr, dyr; dxl = (edgeCenters[0] - leftcellCenters[0]); dyl = (edgeCenters[1] - leftcellCenters[1]); - dxr = (edgeCenters[0] - rightcellCenters[0]); dyr = (edgeCenters[1] - rightcellCenters[1]); + leftCellValues[0] += alphaleft[0] * ((dxl * leftGradient[0])+(dyl * leftGradient[1])); + leftCellValues[1] += alphaleft[0] * ((dxl * leftGradient[2])+(dyl * leftGradient[3])); + leftCellValues[2] += alphaleft[0] * ((dxl * leftGradient[4])+(dyl * leftGradient[5])); + leftCellValues[3] += alphaleft[0] * ((dxl * leftGradient[6])+(dyl * leftGradient[7])); + if (leftCellValues[0] >= 1e-3){ + uL = leftCellValues[1]/leftCellValues[0]; + vL = leftCellValues[2]/leftCellValues[0]; + } else { + uL = 0.0f; + vL = 0.0f; + } + zL = cellLeft[3] - cellLeft[0]; + zR = cellRight[3] - cellRight[0]; if (!*isRightBoundary) { rightCellValues[0] = cellRight[0]; rightCellValues[1] = cellRight[1]; rightCellValues[2] = cellRight[2]; rightCellValues[3] = cellRight[3]; - } else { - rightCellValues[3] = cellLeft[3]; + rightCellValues[0] += alpharight[0] * ((dxr * rightGradient[0])+(dyr * rightGradient[1])); + rightCellValues[1] += alpharight[0] * ((dxr * rightGradient[2])+(dyr * rightGradient[3])); + rightCellValues[2] += alpharight[0] * ((dxr * rightGradient[4])+(dyr * rightGradient[5])); + rightCellValues[3] += alpharight[0] * ((dxr * 
rightGradient[6])+(dyr * rightGradient[7])); + if (rightCellValues[0] >= 1e-3){ + uR = rightCellValues[1]/rightCellValues[0]; + vR = rightCellValues[2]/rightCellValues[0]; + } else { + uR = 0.0f; + vR = 0.0f; + } + } else { float nx = edgeNormals[0]; float ny = edgeNormals[1]; - float inNormalVelocity = cellLeft[1] * nx + cellLeft[2] * ny; - float inTangentVelocity = -1.0f * cellLeft[1] * ny + cellLeft[2] * nx; + float inNormalVelocity = uL * nx + vL * ny; + float inTangentVelocity = -1.0f * uL * ny + vL * nx; float outNormalVelocity = 0.0f; float outTangentVelocity = 0.0f; - rightCellValues[0] = cellLeft[0]; - outNormalVelocity = -1.0f*inNormalVelocity; + float wet = (fabs(*zmin) - zL) > 0.0f ? (fabs(*zmin) - zL) : EPS_cuda; + float critical = sqrt(uL*uL + vL*vL); outTangentVelocity = inTangentVelocity; + if (critical < sqrt(g_cuda*leftCellValues[0])){ + rightCellValues[0] = wet; + rightCellValues[3] = wet + zL; + outNormalVelocity = 0.0f; + } else { + rightCellValues[0] = leftCellValues[0]; + rightCellValues[3] = leftCellValues[3]; + outNormalVelocity = inNormalVelocity; + } + - rightCellValues[1] = outNormalVelocity * nx - outTangentVelocity * ny; - rightCellValues[2] = outNormalVelocity * ny + outTangentVelocity * nx; + uR = outNormalVelocity * nx - outTangentVelocity * ny; + vR = outNormalVelocity * ny + outTangentVelocity * nx; + rightCellValues[1] = uR*rightCellValues[0]; + rightCellValues[2] = vR*rightCellValues[0]; + zR = zL; } + rightCellValues[3] -= rightCellValues[0]; + leftCellValues[3] -= leftCellValues[0]; - if (!*isRightBoundary) { - leftCellValues[0] += alphaleft[0] * ((dxl * leftGradient[0])+(dyl * leftGradient[1])); - leftCellValues[0] = leftCellValues[0] > 0.0f ? 
leftCellValues[0] : 0.0f; - leftCellValues[3] += alphaleft[0] * ((dxl * leftGradient[6])+(dyl * leftGradient[7])); - leftCellValues[1] += alphaleft[0] * ((dxl * leftGradient[2])+(dyl * leftGradient[3])); - leftCellValues[2] += alphaleft[0] * ((dxl * leftGradient[4])+(dyl * leftGradient[5])); - - rightCellValues[0] += alpharight[0] * ((dxr * rightGradient[0])+(dyr * rightGradient[1])); - rightCellValues[0] = rightCellValues[0] > 0.0f ? rightCellValues[0] : 0.0f; - rightCellValues[3] += alpharight[0] * ((dxr * rightGradient[6])+(dyr * rightGradient[7])); - rightCellValues[1] += alpharight[0] * ((dxr * rightGradient[2])+(dyr * rightGradient[3])); - rightCellValues[2] += alpharight[0] * ((dxr * rightGradient[4])+(dyr * rightGradient[5])); - } InterfaceBathy = leftCellValues[3] > rightCellValues[3] ? leftCellValues[3] : rightCellValues[3]; - bathySource[0] =.5f * g * (leftCellValues[0]*leftCellValues[0]); - bathySource[1] =.5f * g * (rightCellValues[0]*rightCellValues[0]); + bathySource[0] =0.5f * g_cuda * (leftCellValues[0]*leftCellValues[0]); + bathySource[1] =0.5f * g_cuda * (rightCellValues[0]*rightCellValues[0]); float hL = (leftCellValues[0] + leftCellValues[3] - InterfaceBathy); - hL = hL > 0.0f? hL : 0.0f; + hL = hL > 0.0f ? hL : 0.0f; float hR = (rightCellValues[0] + rightCellValues[3] - InterfaceBathy); hR = hR > 0.0f ? 
hR : 0.0f; - bathySource[0] -= .5f * g * (hL * hL); - bathySource[1] -= .5f * g * (hR * hR); - - bathySource[2] = -.5f * g *(leftCellValues[0] + cellLeft[0])*(leftCellValues[3] - cellLeft[3]); - bathySource[3] = -.5f * g *(rightCellValues[0] + cellRight[0])*(rightCellValues[3] - cellRight[3]); + bathySource[0] -= .5f * g_cuda * (hL * hL); + bathySource[1] -= .5f * g_cuda * (hR * hR); - leftCellValues[0] = hL; - rightCellValues[0] = hR; + bathySource[2] = -.5f * g_cuda *(leftCellValues[0] + cellLeft[0])*(leftCellValues[3] - zL); + bathySource[3] = -.5f * g_cuda *(rightCellValues[0] + cellRight[0])*(rightCellValues[3] - zR); bathySource[0] *= *edgeLength; bathySource[1] *= *edgeLength; @@ -90,13 +113,13 @@ __device__ void computeFluxes_gpu( const float *cellLeft, const float *cellRight - - float cL = sqrt(g * leftCellValues[0]); + float cL = sqrt(g_cuda * hL); cL = cL > 0.0f ? cL : 0.0f; - float cR = sqrt(g * rightCellValues[0]); + float cR = sqrt(g_cuda * hR); cR = cR > 0.0f ? cR : 0.0f; - float uLn = leftCellValues[1] * edgeNormals[0] + leftCellValues[2] * edgeNormals[1]; - float uRn = rightCellValues[1] * edgeNormals[0] + rightCellValues[2] * edgeNormals[1]; + + float uLn = uL * edgeNormals[0] + vL * edgeNormals[1]; + float uRn = uR * edgeNormals[0] + vR * edgeNormals[1]; float unStar = 0.5f * (uLn + uRn) + (cL-cR); float cStar = 0.5f * (cL + cR) - 0.25f* (uRn-uLn); @@ -104,85 +127,91 @@ __device__ void computeFluxes_gpu( const float *cellLeft, const float *cellRight float sR = (uRn + cR) > (unStar + cStar) ? 
(uRn + cR) : (unStar + cStar); float sStar; - sStar = (sL*rightCellValues[0]*(uRn - sR) - sR*leftCellValues[0]*(uLn - sL))/ - (rightCellValues[0]*(uRn - sR) - leftCellValues[0]*(uLn - sL)); + sStar = (sL*hR*(uRn - sR) - sR*hL*(uLn - sL))/ + (hR*(uRn - sR) - hL*(uLn - sL)); + + if ((leftCellValues[0] <= EPS_cuda) && (rightCellValues[0] > EPS_cuda)) { - if ((leftCellValues[0] <= EPS) && (rightCellValues[0] > EPS)) { sL = uRn - 2.0f*cR; sR = uRn + cR; sStar = sL; } - if ((rightCellValues[0] <= EPS) && (leftCellValues[0] > EPS)) { + if ((rightCellValues[0] <= EPS_cuda) && (leftCellValues[0] > EPS_cuda)) { + sR = uLn + 2.0f*cL; sL = uLn - cL; sStar = sR; } - float sLMinus = sL < 0.0f ? sL : 0.0f; - float sRPlus = sR > 0.0f ? sR : 0.0f; - float sRMinussL = sRPlus - sLMinus; - sRMinussL = sRMinussL < EPS ? EPS : sRMinussL; - float t1 = sRPlus / sRMinussL; + float uLp = vL*edgeNormals[0] - uL*edgeNormals[1]; + float uRp = vR*edgeNormals[0] - uR*edgeNormals[1]; + float LeftFluxes_H, LeftFluxes_N, LeftFluxes_U, LeftFluxes_V; - float t2 = ( -1.0 * sLMinus ) / sRMinussL; + float HuDotN = (hL*uL) * edgeNormals[0] + (hL*vL) * edgeNormals[1]; + LeftFluxes_H = HuDotN; + LeftFluxes_U = HuDotN * uL; + LeftFluxes_V = HuDotN * vL; - float t3 = ( sRPlus * sLMinus ) / sRMinussL; + LeftFluxes_N = HuDotN * uLn; + + LeftFluxes_U += (.5f * g_cuda * edgeNormals[0] ) * ( hL * hL ); + LeftFluxes_V += (.5f * g_cuda * edgeNormals[1] ) * ( hL * hL ); + LeftFluxes_N += (.5f * g_cuda ) * ( hL * hL ); + + + float RightFluxes_H,RightFluxes_N, RightFluxes_U, RightFluxes_V; + + HuDotN = (hR*uR) * edgeNormals[0] + (hR*vR) * edgeNormals[1]; + + RightFluxes_H = HuDotN; + RightFluxes_U = HuDotN * uR; + RightFluxes_V = HuDotN * vR; + + RightFluxes_N = HuDotN * uRn; + + RightFluxes_U += (.5f * g_cuda * edgeNormals[0] ) * ( hR * hR ); + RightFluxes_V += (.5f * g_cuda * edgeNormals[1] ) * ( hR * hR ); + RightFluxes_N += (.5f * g_cuda ) * ( hR * hR ); - float uLp = leftCellValues[2]*edgeNormals[0] - 
leftCellValues[1]*edgeNormals[1]; - float uRp = rightCellValues[2]*edgeNormals[0] - rightCellValues[1]*edgeNormals[1]; - float LeftFluxes_H, LeftFluxes_U, LeftFluxes_V, LeftFluxes_N; - float HuDotN = (leftCellValues[0] * leftCellValues[1]) * edgeNormals[0] + - (leftCellValues[0] * leftCellValues[2]) * edgeNormals[1]; - LeftFluxes_H = HuDotN; - LeftFluxes_U = HuDotN * leftCellValues[1]; - LeftFluxes_V = HuDotN * leftCellValues[2]; - LeftFluxes_N = HuDotN * uLn; - LeftFluxes_U += (.5f * g * edgeNormals[0] ) * ( leftCellValues[0] * leftCellValues[0] ); - LeftFluxes_V += (.5f * g * edgeNormals[1] ) * ( leftCellValues[0] * leftCellValues[0] ); - LeftFluxes_N += (.5f * g ) * ( leftCellValues[0] * leftCellValues[0] ); - float RightFluxes_H,RightFluxes_U, RightFluxes_V, RightFluxes_N; - HuDotN = (rightCellValues[0] * rightCellValues[1] * edgeNormals[0]) + - (rightCellValues[0] * rightCellValues[2] * edgeNormals[1]); - RightFluxes_H = HuDotN; - RightFluxes_U = HuDotN * rightCellValues[1]; - RightFluxes_V = HuDotN * rightCellValues[2]; - RightFluxes_N = HuDotN * uRn; - RightFluxes_U += (.5f * g * edgeNormals[0] ) * ( rightCellValues[0] * rightCellValues[0] ); - RightFluxes_V += (.5f * g * edgeNormals[1] ) * ( rightCellValues[0] * rightCellValues[0] ); - RightFluxes_N += (.5f * g) * ( rightCellValues[0] * rightCellValues[0] ); + float sLMinus = sL < 0.0f ? sL : 0.0f; + float sRPlus = sR > 0.0f ? sR : 0.0f; + float sRMinussL = sRPlus - sLMinus; + sRMinussL = sRMinussL < EPS_cuda ? 
EPS_cuda : sRMinussL; + float t1 = sRPlus / sRMinussL; + float t2 = ( -1.0 * sLMinus ) / sRMinussL; + float t3 = ( sRPlus * sLMinus ) / sRMinussL; float FStar[3]; FStar[0] = ( t1 * LeftFluxes_H ) + ( t2 * RightFluxes_H ) + - ( t3 * ( rightCellValues[0] - leftCellValues[0] ) ); + ( t3 * ( hR - hL ) ); FStar[1] = ( t1 * LeftFluxes_N ) + ( t2 * RightFluxes_N ) + - ( t3 * ( (rightCellValues[0] * uRn) - - (leftCellValues[0] * uLn) ) ); - + ( t3 * ( (hR * uRn) - + (hL * uLn) ) ); if( sL >= 0.0f) { - out[0] = LeftFluxes_H; - out[1] = LeftFluxes_U; - out[2] = LeftFluxes_V; + out[0] = t1*LeftFluxes_H; + out[1] = t1*LeftFluxes_U; + out[2] = t1*LeftFluxes_V; } else if ((sL < 0.0f) && (sStar >= 0.0f)){ out[0] = FStar[0]; FStar[2] = FStar[0] * uLp; @@ -194,18 +223,20 @@ __device__ void computeFluxes_gpu( const float *cellLeft, const float *cellRight out[1] = FStar[1]*edgeNormals[0] - FStar[2]*edgeNormals[1]; out[2] = FStar[1]*edgeNormals[1] + FStar[2]*edgeNormals[0]; } else { - out[0] = RightFluxes_H; - out[1] = RightFluxes_U; - out[2] = RightFluxes_V; + out[0] = t2*RightFluxes_H; + out[1] = t2*RightFluxes_U; + out[2] = t2*RightFluxes_V; } out[0] *= *edgeLength; out[1] *= *edgeLength; out[2] *= *edgeLength; + float maximum = fabs(uLn + cL); maximum = maximum > fabs(uLn - cL) ? maximum : fabs(uLn - cL); maximum = maximum > fabs(uRn + cR) ? maximum : fabs(uRn + cR); maximum = maximum > fabs(uRn - cR) ? 
maximum : fabs(uRn - cR); *maxEdgeEigenvalues = maximum; + } // CUDA kernel function @@ -222,6 +253,7 @@ __global__ void op_cuda_computeFluxes( float *arg12, float *arg13, float *arg14, + const float *arg15, int block_offset, int *blkmap, int *offset, @@ -231,6 +263,7 @@ __global__ void op_cuda_computeFluxes( int nblocks, int set_size) { + __shared__ int nelem, offset_b; extern __shared__ char shared[]; @@ -272,7 +305,8 @@ __global__ void op_cuda_computeFluxes( arg11+(n+offset_b)*1, arg12+(n+offset_b)*4, arg13+(n+offset_b)*3, - arg14+(n+offset_b)*1); + arg14+(n+offset_b)*1, + arg15); } } @@ -293,10 +327,12 @@ void op_par_loop_computeFluxes(char const *name, op_set set, op_arg arg11, op_arg arg12, op_arg arg13, - op_arg arg14){ + op_arg arg14, + op_arg arg15){ - int nargs = 15; - op_arg args[15]; + float*arg15h = (float *)arg15.data; + int nargs = 16; + op_arg args[16]; args[0] = arg0; args[1] = arg1; @@ -313,43 +349,57 @@ void op_par_loop_computeFluxes(char const *name, op_set set, args[12] = arg12; args[13] = arg13; args[14] = arg14; + args[15] = arg15; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(23); + op_timing_realloc(24); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[23].name = name; - OP_kernels[23].count += 1; + OP_kernels[24].name = name; + OP_kernels[24].count += 1; int ninds = 4; - int inds[15] = {0,0,1,1,-1,-1,2,2,-1,3,3,-1,-1,-1,-1}; + int inds[16] = {0,0,1,1,-1,-1,2,2,-1,3,3,-1,-1,-1,-1,-1}; if (OP_diags>2) { printf(" kernel routine with indirection: computeFluxes\n"); } //get plan - #ifdef OP_PART_SIZE_23 - int part_size = OP_PART_SIZE_23; + #ifdef OP_PART_SIZE_24 + int part_size = OP_PART_SIZE_24; #else int part_size = OP_part_size; #endif - int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds); + //transfer constants to 
GPU + int consts_bytes = 0; + consts_bytes += ROUND_UP(1*sizeof(float)); + reallocConstArrays(consts_bytes); + consts_bytes = 0; + arg15.data = OP_consts_h + consts_bytes; + arg15.data_d = OP_consts_d + consts_bytes; + for ( int d=0; d<1; d++ ){ + ((float *)arg15.data)[d] = arg15h[d]; + } + consts_bytes += ROUND_UP(1*sizeof(float)); + mvConstArraysToDevice(consts_bytes); + //execute plan int block_offset = 0; for ( int col=0; colncolors; col++ ){ if (col==Plan->ncolors_core) { - op_mpi_wait_all_cuda(nargs, args); + op_mpi_wait_all_grouped(nargs, args, 2); } - #ifdef OP_BLOCK_SIZE_23 - int nthread = OP_BLOCK_SIZE_23; + #ifdef OP_BLOCK_SIZE_24 + int nthread = OP_BLOCK_SIZE_24; #else int nthread = OP_block_size; #endif @@ -370,6 +420,7 @@ void op_par_loop_computeFluxes(char const *name, op_set set, (float*)arg12.data_d, (float*)arg13.data_d, (float*)arg14.data_d, + (float*)arg15.data_d, block_offset, Plan->blkmap, Plan->offset, @@ -382,12 +433,14 @@ void op_par_loop_computeFluxes(char const *name, op_set set, } block_offset += Plan->ncolblk[col]; } - OP_kernels[23].transfer += Plan->transfer; - OP_kernels[23].transfer2 += Plan->transfer2; + OP_kernels[24].transfer += Plan->transfer; + OP_kernels[24].transfer2 += Plan->transfer2; } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[23].time += wall_t2 - wall_t1; + OP_kernels[24].time += wall_t2 - wall_t1; } diff --git a/sp/cuda/computeGradient_kernel.cu b/sp/cuda/computeGradient_kernel.cu index 98eef74..987edb9 100644 --- a/sp/cuda/computeGradient_kernel.cu +++ b/sp/cuda/computeGradient_kernel.cu @@ -13,8 +13,7 @@ __device__ void computeGradient_gpu( const float *center, const float *nb3Center, float *q, float *out) { - - if( (cellCenter[0] != nb3Center[0]) && (cellCenter[1] != nb3Center[1])){ + if(center[0]> EPS_cuda){ float total, Rhs[8]; float dh[3], 
dz[3],du[3], dv[3], weights[3]; float Gram[2][2], inverse[2][2], delta[3][2]; @@ -27,17 +26,24 @@ __device__ void computeGradient_gpu( const float *center, delta[1][0] = (nb2Center[0] - x); delta[1][1] = (nb2Center[1] - y); - delta[2][0] = (nb3Center[0] - x); - delta[2][1] = (nb3Center[1] - y); + if( (cellCenter[0] != nb3Center[0]) && (cellCenter[1] != nb3Center[1])){ + delta[2][0] = (nb3Center[0] - x); + delta[2][1] = (nb3Center[1] - y); + } else { + delta[2][0] = 0.5f*(delta[0][0] + delta[1][0]); + delta[2][1] = 0.5f*(delta[0][1] + delta[1][1]); + } weights[0] = sqrt(delta[0][0] * delta[0][0] + delta[0][1] * delta[0][1]); weights[1] = sqrt(delta[1][0] * delta[1][0] + delta[1][1] * delta[1][1]); weights[2] = sqrt(delta[2][0] * delta[2][0] + delta[2][1] * delta[2][1]); + total = weights[0] + weights[1] + weights[2]; + weights[0] = total/weights[0]; weights[1] = total/weights[1]; - weights[2] = total/ weights[2]; + weights[2] = total/weights[2]; delta[0][0] *= weights[0]; delta[0][1] *= weights[0]; @@ -105,7 +111,18 @@ __device__ void computeGradient_gpu( const float *center, Rhs[7] = (delta[0][1]*dz[0]) + (delta[1][1]*dz[1]) + (delta[2][1]*dz[2]); out[6] = (inverse[0][0] * Rhs[6]) + (inverse[0][1] * Rhs[7]); out[7] = (inverse[1][0] * Rhs[6]) + (inverse[1][1] * Rhs[7]); - }else { + + if((isnan(out[0])) || (isnan(out[1])) || (isnan(out[2])) || (isnan(out[3])) || (isnan(out[4])) || (isnan(out[5])) || (isnan(out[6])) || (isnan(out[7]))){ + out[0] = 0.0f; + out[1] = 0.0f; + out[2] = 0.0f; + out[3] = 0.0f; + out[4] = 0.0f; + out[5] = 0.0f; + out[6] = 0.0f; + out[7] = 0.0f; + } + } else { out[0] = 0.0f; out[1] = 0.0f; @@ -115,7 +132,7 @@ __device__ void computeGradient_gpu( const float *center, out[5] = 0.0f; out[6] = 0.0f; out[7] = 0.0f; - } + } @@ -148,65 +165,45 @@ __device__ void computeGradient_gpu( const float *center, q[7] = center[3] > neighbour1[3] ? center[3] : neighbour1[3]; q[7] = q[7] > neighbour2[3] ? q[7] : neighbour2[3]; q[7] = q[7] > neighbour3[3] ? 
q[7] : neighbour3[3]; + } // CUDA kernel function __global__ void op_cuda_computeGradient( const float *__restrict ind_arg0, const float *__restrict ind_arg1, + float *__restrict ind_arg2, const int *__restrict opDat1Map, const float *__restrict arg0, const float *__restrict arg4, float *arg8, float *arg9, - int block_offset, - int *blkmap, - int *offset, - int *nelems, - int *ncolors, - int *colors, - int nblocks, + int start, + int end, + int *col_reord, int set_size) { - - __shared__ int nelem, offset_b; - - extern __shared__ char shared[]; - - if (blockIdx.x+blockIdx.y*gridDim.x >= nblocks) { - return; - } - if (threadIdx.x==0) { - - //get sizes and shift pointers and direct-mapped data - - int blockId = blkmap[blockIdx.x + blockIdx.y*gridDim.x + block_offset]; - - nelem = nelems[blockId]; - offset_b = offset[blockId]; - - } - __syncthreads(); // make sure all of above completed - - for ( int n=threadIdx.x; n2) { printf(" kernel routine with indirection: computeGradient\n"); } //get plan - #ifdef OP_PART_SIZE_21 - int part_size = OP_PART_SIZE_21; + #ifdef OP_PART_SIZE_22 + int part_size = OP_PART_SIZE_22; #else int part_size = OP_part_size; #endif - int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { - op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds); + op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2); //execute plan - - int block_offset = 0; for ( int col=0; colncolors; col++ ){ if (col==Plan->ncolors_core) { - op_mpi_wait_all_cuda(nargs, args); + op_mpi_wait_all_grouped(nargs, args, 2); } - #ifdef OP_BLOCK_SIZE_21 - int nthread = OP_BLOCK_SIZE_21; + #ifdef OP_BLOCK_SIZE_22 + int nthread = OP_BLOCK_SIZE_22; #else int nthread = OP_block_size; #endif - dim3 nblocks = dim3(Plan->ncolblk[col] >= (1<<16) ? 65535 : Plan->ncolblk[col], - Plan->ncolblk[col] >= (1<<16) ? 
(Plan->ncolblk[col]-1)/65535+1: 1, 1); - if (Plan->ncolblk[col] > 0) { - op_cuda_computeGradient<<>>( - (float *)arg1.data_d, - (float *)arg5.data_d, - arg1.map_data_d, - (float*)arg0.data_d, - (float*)arg4.data_d, - (float*)arg8.data_d, - (float*)arg9.data_d, - block_offset, - Plan->blkmap, - Plan->offset, - Plan->nelems, - Plan->nthrcol, - Plan->thrcol, - Plan->ncolblk[col], - set->size+set->exec_size); + int start = Plan->col_offsets[0][col]; + int end = Plan->col_offsets[0][col+1]; + int nblocks = (end - start - 1)/nthread + 1; + op_cuda_computeGradient<<>>( + (float *)arg1.data_d, + (float *)arg5.data_d, + (float *)arg7.data_d, + arg1.map_data_d, + (float*)arg0.data_d, + (float*)arg4.data_d, + (float*)arg8.data_d, + (float*)arg9.data_d, + start, + end, + Plan->col_reord, + set->size+set->exec_size); - } - block_offset += Plan->ncolblk[col]; } - OP_kernels[21].transfer += Plan->transfer; - OP_kernels[21].transfer2 += Plan->transfer2; + OP_kernels[22].transfer += Plan->transfer; + OP_kernels[22].transfer2 += Plan->transfer2; } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[21].time += wall_t2 - wall_t1; + OP_kernels[22].time += wall_t2 - wall_t1; } diff --git a/sp/cuda/gatherLocations_kernel.cu b/sp/cuda/gatherLocations_kernel.cu index adf6ea9..e66ed62 100644 --- a/sp/cuda/gatherLocations_kernel.cu +++ b/sp/cuda/gatherLocations_kernel.cu @@ -3,19 +3,21 @@ // //user function -__device__ void gatherLocations_gpu( const float *values, float *dest) { - dest[0] = values[0] + values[3]; +__device__ void gatherLocations_gpu( const float *values, const float *zmin, float *dest) { + dest[0] = values[0] + (values[3] - values[0] + *zmin); dest[1] = values[0]; - dest[2] = values[1]; - dest[3] = values[2]; + dest[2] = values[1]/values[0]; + dest[3] = values[2]/values[0]; dest[4] = values[3]; + } // CUDA 
kernel function __global__ void op_cuda_gatherLocations( const float *__restrict ind_arg0, const int *__restrict opDat0Map, - float *arg1, + const float *arg1, + float *arg2, int block_offset, int *blkmap, int *offset, @@ -25,6 +27,7 @@ __global__ void op_cuda_gatherLocations( int nblocks, int set_size) { + __shared__ int nelem, offset_b; extern __shared__ char shared[]; @@ -51,7 +54,8 @@ __global__ void op_cuda_gatherLocations( //user-supplied kernel call gatherLocations_gpu(ind_arg0+map0idx*4, - arg1+(n+offset_b)*5); + arg1, + arg2+(n+offset_b)*5); } } @@ -59,13 +63,16 @@ __global__ void op_cuda_gatherLocations( //host stub function void op_par_loop_gatherLocations(char const *name, op_set set, op_arg arg0, - op_arg arg1){ + op_arg arg1, + op_arg arg2){ - int nargs = 2; - op_arg args[2]; + float*arg1h = (float *)arg1.data; + int nargs = 3; + op_arg args[3]; args[0] = arg0; args[1] = arg1; + args[2] = arg2; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; @@ -76,7 +83,7 @@ void op_par_loop_gatherLocations(char const *name, op_set set, int ninds = 1; - int inds[2] = {0,-1}; + int inds[3] = {0,-1,-1}; if (OP_diags>2) { printf(" kernel routine with indirection: gatherLocations\n"); @@ -89,17 +96,30 @@ void op_par_loop_gatherLocations(char const *name, op_set set, int part_size = OP_part_size; #endif - int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds); + //transfer constants to GPU + int consts_bytes = 0; + consts_bytes += ROUND_UP(1*sizeof(float)); + reallocConstArrays(consts_bytes); + consts_bytes = 0; + arg1.data = OP_consts_h + consts_bytes; + arg1.data_d = OP_consts_d + consts_bytes; + for ( int d=0; d<1; d++ ){ + ((float *)arg1.data)[d] = arg1h[d]; + } + consts_bytes += ROUND_UP(1*sizeof(float)); + mvConstArraysToDevice(consts_bytes); + //execute plan int 
block_offset = 0; for ( int col=0; colncolors; col++ ){ if (col==Plan->ncolors_core) { - op_mpi_wait_all_cuda(nargs, args); + op_mpi_wait_all_grouped(nargs, args, 2); } #ifdef OP_BLOCK_SIZE_20 int nthread = OP_BLOCK_SIZE_20; @@ -114,6 +134,7 @@ void op_par_loop_gatherLocations(char const *name, op_set set, (float *)arg0.data_d, arg0.map_data_d, (float*)arg1.data_d, + (float*)arg2.data_d, block_offset, Plan->blkmap, Plan->offset, @@ -130,7 +151,9 @@ void op_par_loop_gatherLocations(char const *name, op_set set, OP_kernels[20].transfer2 += Plan->transfer2; } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[20].time += wall_t2 - wall_t1; diff --git a/sp/cuda/getMaxElevation_kernel.cu b/sp/cuda/getMaxElevation_kernel.cu index 11f3d8e..4856dc5 100644 --- a/sp/cuda/getMaxElevation_kernel.cu +++ b/sp/cuda/getMaxElevation_kernel.cu @@ -4,13 +4,23 @@ //user function __device__ void getMaxElevation_gpu( const float* values, float* currentMaxElevation) { + float tmp = values[0]+values[3]; + float tmpmax = currentMaxElevation[0]+currentMaxElevation[3]; - if (values[0] > currentMaxElevation[0]) { - currentMaxElevation[0] = values[0]; - currentMaxElevation[1] = values[1]; - currentMaxElevation[2] = values[2]; - currentMaxElevation[3] = values[3]; + if (tmp > tmpmax ) { + + currentMaxElevation[0] = values[0]; + currentMaxElevation[1] = values[1]; + currentMaxElevation[2] = values[2]; + currentMaxElevation[3] = values[3]; } + + + + + + + } // CUDA kernel function @@ -53,15 +63,14 @@ void op_par_loop_getMaxElevation(char const *name, op_set set, printf(" kernel routine w/o indirection: getMaxElevation"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //set CUDA execution parameters #ifdef 
OP_BLOCK_SIZE_18 int nthread = OP_BLOCK_SIZE_18; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -72,7 +81,9 @@ void op_par_loop_getMaxElevation(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[18].time += wall_t2 - wall_t1; diff --git a/sp/cuda/getMaxSpeed_kernel.cu b/sp/cuda/getMaxSpeed_kernel.cu index 2cf7676..adb78e8 100644 --- a/sp/cuda/getMaxSpeed_kernel.cu +++ b/sp/cuda/getMaxSpeed_kernel.cu @@ -5,12 +5,20 @@ //user function __device__ void getMaxSpeed_gpu( const float* values, float* currentMaxSpeed) { - if (sqrt(values[1]*values[1]+values[2]*values[2]) > sqrt(currentMaxSpeed[1]*currentMaxSpeed[1]+currentMaxSpeed[2]*currentMaxSpeed[2])) { - currentMaxSpeed[0] = values[0]; - currentMaxSpeed[1] = values[1]; - currentMaxSpeed[2] = values[2]; - currentMaxSpeed[3] = values[3]; + if (values[0] > 1e-3){ + float TruncatedH = values[0]; + float u = values[1]/TruncatedH; + float v = values[2]/TruncatedH; + float umax = currentMaxSpeed[1]/currentMaxSpeed[0]; + float vmax = currentMaxSpeed[2]/currentMaxSpeed[0]; + if (sqrt(u*u + v*v) > sqrt(umax*umax+vmax*vmax)) { + currentMaxSpeed[0] = values[0]; + currentMaxSpeed[1] = values[1]; + currentMaxSpeed[2] = values[2]; + currentMaxSpeed[3] = values[3]; + } } + } // CUDA kernel function @@ -53,15 +61,14 @@ void op_par_loop_getMaxSpeed(char const *name, op_set set, printf(" kernel routine w/o indirection: getMaxSpeed"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //set CUDA execution parameters #ifdef OP_BLOCK_SIZE_19 int nthread = OP_BLOCK_SIZE_19; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -72,7 +79,9 @@ void 
op_par_loop_getMaxSpeed(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[19].time += wall_t2 - wall_t1; diff --git a/sp/cuda/getTotalVol_kernel.cu b/sp/cuda/getTotalVol_kernel.cu index 596761d..20ca771 100644 --- a/sp/cuda/getTotalVol_kernel.cu +++ b/sp/cuda/getTotalVol_kernel.cu @@ -5,6 +5,7 @@ //user function __device__ void getTotalVol_gpu( const float* cellVolume, const float* value, float* totalVol) { (*totalVol) += (*cellVolume) * value[0]; + } // CUDA kernel function @@ -62,15 +63,14 @@ void op_par_loop_getTotalVol(char const *name, op_set set, printf(" kernel routine w/o indirection: getTotalVol"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //set CUDA execution parameters #ifdef OP_BLOCK_SIZE_17 int nthread = OP_BLOCK_SIZE_17; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -110,7 +110,9 @@ void op_par_loop_getTotalVol(char const *name, op_set set, op_mpi_reduce(&arg2,arg2h); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); OP_kernels[17].time += wall_t2 - wall_t1; diff --git a/sp/cuda/incConst_kernel.cu b/sp/cuda/incConst_kernel.cu index 21dfe37..9344b64 100644 --- a/sp/cuda/incConst_kernel.cu +++ b/sp/cuda/incConst_kernel.cu @@ -4,18 +4,23 @@ //user function __device__ void incConst_gpu( const float *in, float *out, const int *variables) { + float H; if (*variables & 1) { out[0] += *in; + out[3] += *in; } if (*variables & 2) { - out[1] += *in; + H = out[0] > EPS_cuda ? 
out[0] : EPS_cuda; + out[1] += *in * H; } if (*variables & 4) { - out[2] += *in; + H = out[0] > EPS_cuda ? out[0] : EPS_cuda; + out[2] += *in * H; } if (*variables & 8) { out[3] += *in; } + } // CUDA kernel function @@ -53,18 +58,18 @@ void op_par_loop_incConst(char const *name, op_set set, // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(5); + op_timing_realloc(4); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[5].name = name; - OP_kernels[5].count += 1; + OP_kernels[4].name = name; + OP_kernels[4].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: incConst"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; @@ -80,11 +85,10 @@ void op_par_loop_incConst(char const *name, op_set set, mvConstArraysToDevice(consts_bytes); //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_5 - int nthread = OP_BLOCK_SIZE_5; + #ifdef OP_BLOCK_SIZE_4 + int nthread = OP_BLOCK_SIZE_4; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -96,10 +100,12 @@ void op_par_loop_incConst(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[5].time += wall_t2 - wall_t1; - OP_kernels[5].transfer += (float)set->size * arg0.size; - OP_kernels[5].transfer += (float)set->size * arg1.size * 2.0f; + OP_kernels[4].time += wall_t2 - wall_t1; + OP_kernels[4].transfer += (float)set->size * arg0.size; + OP_kernels[4].transfer += (float)set->size * arg1.size * 2.0f; } diff --git a/sp/cuda/initBathyRelative_formula_kernel.cu b/sp/cuda/initBathyRelative_formula_kernel.cu index c986f32..96faa0c 100644 --- a/sp/cuda/initBathyRelative_formula_kernel.cu +++ 
b/sp/cuda/initBathyRelative_formula_kernel.cu @@ -7,8 +7,9 @@ __device__ void initBathyRelative_formula_gpu( const float *coords, float *value float x = coords[0]; float y = coords[1]; float t = *time; - float val = exp(-(2.f*sqrt(x*0.01f*0.01f/(tan((5.7f*2.f*M_PI)/360.f)))-sqrt(g)*0.01f*t)*(2.f*sqrt(x*0.01f*0.01f/(tan((5.7f*2.f*M_PI)/360.f)))-sqrt(g)*0.01f*t));; - values[3] = *bathy0 + val; + float val = exp(-(2.f*sqrt(x*0.01f*0.01f/(tan((5.7f*2.f*M_PI)/360.f)))-sqrt(g_cuda)*0.01f*t)*(2.f*sqrt(x*0.01f*0.01f/(tan((5.7f*2.f*M_PI)/360.f)))-sqrt(g_cuda)*0.01f*t));; + *values = *bathy0 + val; + } // CUDA kernel function @@ -25,7 +26,7 @@ __global__ void op_cuda_initBathyRelative_formula( //user-supplied kernel call initBathyRelative_formula_gpu(arg0+n*2, - arg1+n*4, + arg1+n*1, arg2+n*1, arg3); } @@ -50,18 +51,18 @@ void op_par_loop_initBathyRelative_formula(char const *name, op_set set, // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(12); + op_timing_realloc(10); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[12].name = name; - OP_kernels[12].count += 1; + OP_kernels[10].name = name; + OP_kernels[10].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: initBathyRelative_formula"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; @@ -77,11 +78,10 @@ void op_par_loop_initBathyRelative_formula(char const *name, op_set set, mvConstArraysToDevice(consts_bytes); //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_12 - int nthread = OP_BLOCK_SIZE_12; + #ifdef OP_BLOCK_SIZE_10 + int nthread = OP_BLOCK_SIZE_10; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -94,11 +94,13 @@ void op_par_loop_initBathyRelative_formula(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - 
cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[12].time += wall_t2 - wall_t1; - OP_kernels[12].transfer += (float)set->size * arg0.size; - OP_kernels[12].transfer += (float)set->size * arg1.size * 2.0f; - OP_kernels[12].transfer += (float)set->size * arg2.size; + OP_kernels[10].time += wall_t2 - wall_t1; + OP_kernels[10].transfer += (float)set->size * arg0.size; + OP_kernels[10].transfer += (float)set->size * arg1.size * 2.0f; + OP_kernels[10].transfer += (float)set->size * arg2.size; } diff --git a/sp/cuda/initBathymetry_formula_kernel.cu b/sp/cuda/initBathymetry_formula_kernel.cu index ba2de25..b9f6c31 100644 --- a/sp/cuda/initBathymetry_formula_kernel.cu +++ b/sp/cuda/initBathymetry_formula_kernel.cu @@ -7,8 +7,9 @@ __device__ void initBathymetry_formula_gpu( const float *coords, float *values, float x = coords[0]; float y = coords[1]; float t = *time; - float val = .2f*(-5.0f-x)*(x<0.0f)-(x>=0.0f)+.2f*(t<1.0f)*exp(-(x+3.0f-2.0f*t)*(x+3.0f-2.0f*t)-y*y)+.2f*(t>=1.0f)*exp(-(x+1.0f)*(x+1.0f)-y*y);; - values[3] = val; + float val = -1.0f ;; + *values = val; + } // CUDA kernel function @@ -24,7 +25,7 @@ __global__ void op_cuda_initBathymetry_formula( //user-supplied kernel call initBathymetry_formula_gpu(arg0+n*2, - arg1+n*4, + arg1+n*1, arg2); } } @@ -46,18 +47,18 @@ void op_par_loop_initBathymetry_formula(char const *name, op_set set, // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(13); + op_timing_realloc(11); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[13].name = name; - OP_kernels[13].count += 1; + OP_kernels[11].name = name; + OP_kernels[11].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: initBathymetry_formula"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) 
{ //transfer constants to GPU int consts_bytes = 0; @@ -73,11 +74,10 @@ void op_par_loop_initBathymetry_formula(char const *name, op_set set, mvConstArraysToDevice(consts_bytes); //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_13 - int nthread = OP_BLOCK_SIZE_13; + #ifdef OP_BLOCK_SIZE_11 + int nthread = OP_BLOCK_SIZE_11; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -89,10 +89,12 @@ void op_par_loop_initBathymetry_formula(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[13].time += wall_t2 - wall_t1; - OP_kernels[13].transfer += (float)set->size * arg0.size; - OP_kernels[13].transfer += (float)set->size * arg1.size * 2.0f; + OP_kernels[11].time += wall_t2 - wall_t1; + OP_kernels[11].transfer += (float)set->size * arg0.size; + OP_kernels[11].transfer += (float)set->size * arg1.size * 2.0f; } diff --git a/sp/cuda/initBathymetry_large_kernel.cu b/sp/cuda/initBathymetry_large_kernel.cu index 0973112..fcf9925 100644 --- a/sp/cuda/initBathymetry_large_kernel.cu +++ b/sp/cuda/initBathymetry_large_kernel.cu @@ -45,8 +45,9 @@ __device__ void initBathymetry_large_gpu( float *values, const float *cellCenter float b = -(node1[0]-node0[0])*(*bathy2-*bathy0)+(node2[0]-node0[0])*(*bathy1-*bathy0); float c = (node1[0]-node0[0])*(node2[1]-node0[1])-(node2[0]-node0[0])*(node1[1]-node0[1]); - values[3] += *bathy0 - (a*(cellCenter[0]-node0[0]) + b*(cellCenter[1]-node0[1]))/c; + *values += *bathy0 - (a*(cellCenter[0]-node0[0]) + b*(cellCenter[1]-node0[1]))/c; } + } // CUDA kernel function @@ -57,6 +58,10 @@ __global__ void op_cuda_initBathymetry_large( const float *__restrict ind_arg3, const int *__restrict opDat0Map, const int *__restrict opDat2Map, + int *ind_map, + short *arg_map, + int *ind_arg_sizes, + int *ind_arg_offs, 
int block_offset, int *blkmap, int *offset, @@ -65,7 +70,10 @@ __global__ void op_cuda_initBathymetry_large( int *colors, int nblocks, int set_size) { - float arg0_l[4]; + float arg0_l[1]; + + __shared__ int *ind_arg0_map, ind_arg0_size; + __shared__ float *ind_arg0_s; __shared__ int nelems2, ncolor; __shared__ int nelem, offset_b; @@ -87,9 +95,22 @@ __global__ void op_cuda_initBathymetry_large( nelems2 = blockDim.x*(1+(nelem-1)/blockDim.x); ncolor = ncolors[blockId]; + ind_arg0_size = ind_arg_sizes[0+blockId*1]; + + ind_arg0_map = &ind_map[0*set_size] + ind_arg_offs[0+blockId*1]; + + //set shared memory pointers + int nbytes = 0; + ind_arg0_s = (float *) &shared[nbytes]; } __syncthreads(); // make sure all of above completed + for ( int n=threadIdx.x; n=0) { + arg0_map = arg_map[0*set_size+n+offset_b]; + } + for ( int col=0; colsize > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { - op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds); + op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_STAGE_INC); //execute plan int block_offset = 0; for ( int col=0; colncolors; col++ ){ if (col==Plan->ncolors_core) { - op_mpi_wait_all_cuda(nargs, args); + op_mpi_wait_all_grouped(nargs, args, 2); } - #ifdef OP_BLOCK_SIZE_11 - int nthread = OP_BLOCK_SIZE_11; + #ifdef OP_BLOCK_SIZE_9 + int nthread = OP_BLOCK_SIZE_9; #else int nthread = OP_block_size; #endif @@ -204,13 +227,18 @@ void op_par_loop_initBathymetry_large(char const *name, op_set set, dim3 nblocks = dim3(Plan->ncolblk[col] >= (1<<16) ? 65535 : Plan->ncolblk[col], Plan->ncolblk[col] >= (1<<16) ? 
(Plan->ncolblk[col]-1)/65535+1: 1, 1); if (Plan->ncolblk[col] > 0) { - op_cuda_initBathymetry_large<<>>( + int nshared = Plan->nsharedCol[col]; + op_cuda_initBathymetry_large<<>>( (float *)arg0.data_d, (float *)arg1.data_d, (float *)arg2.data_d, (float *)arg5.data_d, arg0.map_data_d, arg2.map_data_d, + Plan->ind_map, + Plan->loc_map, + Plan->ind_sizes, + Plan->ind_offs, block_offset, Plan->blkmap, Plan->offset, @@ -223,12 +251,14 @@ void op_par_loop_initBathymetry_large(char const *name, op_set set, } block_offset += Plan->ncolblk[col]; } - OP_kernels[11].transfer += Plan->transfer; - OP_kernels[11].transfer2 += Plan->transfer2; + OP_kernels[9].transfer += Plan->transfer; + OP_kernels[9].transfer2 += Plan->transfer2; } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[11].time += wall_t2 - wall_t1; + OP_kernels[9].time += wall_t2 - wall_t1; } diff --git a/sp/cuda/initBathymetry_update_kernel.cu b/sp/cuda/initBathymetry_update_kernel.cu index e6fa3b5..48f56e8 100644 --- a/sp/cuda/initBathymetry_update_kernel.cu +++ b/sp/cuda/initBathymetry_update_kernel.cu @@ -3,17 +3,31 @@ // //user function -__device__ void initBathymetry_update_gpu( float *values, const int *firstTime) { - if (*firstTime) - values[0] -= values[3]; +__device__ void initBathymetry_update_gpu( float *values, const float *z_zero, const float *zmin, const int *firstTime) { + if (*firstTime){ + if (*z_zero > 0.0f){ + values[0] = EPS_cuda; + values[3] = -1.0f* *zmin + *z_zero; + } else { + values[0] = -1.0f* *z_zero; + values[3] = -1.0f* *zmin; + } + } else { + if (*z_zero > 0.0f){ + values[3] = -1.0f* *zmin + *z_zero + values[0]; + } else { + values[3] = values[0] + *z_zero - *zmin; + } + } - values[0] = values[0] < EPS ? 
EPS : values[0]; } // CUDA kernel function __global__ void op_cuda_initBathymetry_update( float *arg0, - const int *arg1, + const float *__restrict arg1, + const float *arg2, + const int *arg3, int set_size ) { @@ -22,7 +36,9 @@ __global__ void op_cuda_initBathymetry_update( //user-supplied kernel call initBathymetry_update_gpu(arg0+n*4, - arg1); + arg1+n*1, + arg2, + arg3); } } @@ -30,62 +46,78 @@ __global__ void op_cuda_initBathymetry_update( //host stub function void op_par_loop_initBathymetry_update(char const *name, op_set set, op_arg arg0, - op_arg arg1){ + op_arg arg1, + op_arg arg2, + op_arg arg3){ - int*arg1h = (int *)arg1.data; - int nargs = 2; - op_arg args[2]; + float*arg2h = (float *)arg2.data; + int*arg3h = (int *)arg3.data; + int nargs = 4; + op_arg args[4]; args[0] = arg0; args[1] = arg1; + args[2] = arg2; + args[3] = arg3; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(14); + op_timing_realloc(13); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[14].name = name; - OP_kernels[14].count += 1; + OP_kernels[13].name = name; + OP_kernels[13].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: initBathymetry_update"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; + consts_bytes += ROUND_UP(1*sizeof(float)); consts_bytes += ROUND_UP(1*sizeof(int)); reallocConstArrays(consts_bytes); consts_bytes = 0; - arg1.data = OP_consts_h + consts_bytes; - arg1.data_d = OP_consts_d + consts_bytes; + arg2.data = OP_consts_h + consts_bytes; + arg2.data_d = OP_consts_d + consts_bytes; + for ( int d=0; d<1; d++ ){ + ((float *)arg2.data)[d] = arg2h[d]; + } + consts_bytes += ROUND_UP(1*sizeof(float)); + arg3.data = OP_consts_h + consts_bytes; + arg3.data_d = OP_consts_d + consts_bytes; for ( int d=0; d<1; d++ ){ - ((int *)arg1.data)[d] = arg1h[d]; + ((int 
*)arg3.data)[d] = arg3h[d]; } consts_bytes += ROUND_UP(1*sizeof(int)); mvConstArraysToDevice(consts_bytes); //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_14 - int nthread = OP_BLOCK_SIZE_14; + #ifdef OP_BLOCK_SIZE_13 + int nthread = OP_BLOCK_SIZE_13; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; op_cuda_initBathymetry_update<<>>( (float *) arg0.data_d, - (int *) arg1.data_d, + (float *) arg1.data_d, + (float *) arg2.data_d, + (int *) arg3.data_d, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[14].time += wall_t2 - wall_t1; - OP_kernels[14].transfer += (float)set->size * arg0.size * 2.0f; + OP_kernels[13].time += wall_t2 - wall_t1; + OP_kernels[13].transfer += (float)set->size * arg0.size * 2.0f; + OP_kernels[13].transfer += (float)set->size * arg1.size; } diff --git a/sp/cuda/initBore_select_kernel.cu b/sp/cuda/initBore_select_kernel.cu index c47cb90..6e516a6 100644 --- a/sp/cuda/initBore_select_kernel.cu +++ b/sp/cuda/initBore_select_kernel.cu @@ -14,6 +14,7 @@ __device__ void initBore_select_gpu( float *values, const float *center, values[0] = center[0] < *x0 ? *Hl : *Hr; values[1] = center[0] < *x0 ? *ul : *ur; values[2] = center[0] < *x0 ? 
*vl : *vr; + } // CUDA kernel function @@ -81,18 +82,18 @@ void op_par_loop_initBore_select(char const *name, op_set set, // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(15); + op_timing_realloc(14); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[15].name = name; - OP_kernels[15].count += 1; + OP_kernels[14].name = name; + OP_kernels[14].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: initBore_select"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; @@ -150,11 +151,10 @@ void op_par_loop_initBore_select(char const *name, op_set set, mvConstArraysToDevice(consts_bytes); //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_15 - int nthread = OP_BLOCK_SIZE_15; + #ifdef OP_BLOCK_SIZE_14 + int nthread = OP_BLOCK_SIZE_14; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -172,10 +172,12 @@ void op_par_loop_initBore_select(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[15].time += wall_t2 - wall_t1; - OP_kernels[15].transfer += (float)set->size * arg0.size * 2.0f; - OP_kernels[15].transfer += (float)set->size * arg1.size; + OP_kernels[14].time += wall_t2 - wall_t1; + OP_kernels[14].transfer += (float)set->size * arg0.size * 2.0f; + OP_kernels[14].transfer += (float)set->size * arg1.size; } diff --git a/sp/cuda/initEta_formula_kernel.cu b/sp/cuda/initEta_formula_kernel.cu index 2edbc43..5e3c07e 100644 --- a/sp/cuda/initEta_formula_kernel.cu +++ b/sp/cuda/initEta_formula_kernel.cu @@ -7,9 +7,9 @@ __device__ void initEta_formula_gpu( const float *coords, float *values, const d float x = coords[0]; float 
y = coords[1]; float t = *time; - float val = 0.0f; - + float val = -1.0f ;; values[0] += val; + } // CUDA kernel function @@ -47,18 +47,18 @@ void op_par_loop_initEta_formula(char const *name, op_set set, // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(6); + op_timing_realloc(5); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[6].name = name; - OP_kernels[6].count += 1; + OP_kernels[5].name = name; + OP_kernels[5].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: initEta_formula"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; @@ -74,11 +74,10 @@ void op_par_loop_initEta_formula(char const *name, op_set set, mvConstArraysToDevice(consts_bytes); //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_6 - int nthread = OP_BLOCK_SIZE_6; + #ifdef OP_BLOCK_SIZE_5 + int nthread = OP_BLOCK_SIZE_5; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -90,10 +89,12 @@ void op_par_loop_initEta_formula(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[6].time += wall_t2 - wall_t1; - OP_kernels[6].transfer += (float)set->size * arg0.size; - OP_kernels[6].transfer += (float)set->size * arg1.size * 2.0f; + OP_kernels[5].time += wall_t2 - wall_t1; + OP_kernels[5].transfer += (float)set->size * arg0.size; + OP_kernels[5].transfer += (float)set->size * arg1.size * 2.0f; } diff --git a/sp/cuda/initGaussianLandslide_kernel.cu b/sp/cuda/initGaussianLandslide_kernel.cu index f801422..1401fdc 100644 --- a/sp/cuda/initGaussianLandslide_kernel.cu +++ b/sp/cuda/initGaussianLandslide_kernel.cu @@ -9,6 +9,7 @@ __device__ void 
initGaussianLandslide_gpu( const float *center, float *values, c values[3] = (*mesh_xmin-x)*(x<0.0)-5.0*(x>=0.0)+ *A*(*t<1.0/(*v))*exp(-1.0* *lx* *lx*(x+3.0-*v**t)*(x+3.0-*v**t)-*ly**ly*y*y) +*A*(*t>=1.0/(*v))*exp(-*lx*(x+3.0-1.0)**lx*(x+3.0-1.0)-*ly**ly*y*y); + } // CUDA kernel function @@ -71,18 +72,18 @@ void op_par_loop_initGaussianLandslide(char const *name, op_set set, // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(16); + op_timing_realloc(15); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[16].name = name; - OP_kernels[16].count += 1; + OP_kernels[15].name = name; + OP_kernels[15].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: initGaussianLandslide"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; @@ -133,11 +134,10 @@ void op_par_loop_initGaussianLandslide(char const *name, op_set set, mvConstArraysToDevice(consts_bytes); //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_16 - int nthread = OP_BLOCK_SIZE_16; + #ifdef OP_BLOCK_SIZE_15 + int nthread = OP_BLOCK_SIZE_15; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -154,10 +154,12 @@ void op_par_loop_initGaussianLandslide(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[16].time += wall_t2 - wall_t1; - OP_kernels[16].transfer += (float)set->size * arg0.size; - OP_kernels[16].transfer += (float)set->size * arg1.size * 2.0f; + OP_kernels[15].time += wall_t2 - wall_t1; + OP_kernels[15].transfer += (float)set->size * arg0.size; + OP_kernels[15].transfer += (float)set->size * arg1.size * 2.0f; } diff --git 
a/sp/cuda/initU_formula_kernel.cu b/sp/cuda/initU_formula_kernel.cu index 078818a..dfb9a63 100644 --- a/sp/cuda/initU_formula_kernel.cu +++ b/sp/cuda/initU_formula_kernel.cu @@ -8,8 +8,8 @@ __device__ void initU_formula_gpu( const float *coords, float *values, const dou float y = coords[1]; float t = *time; float val = 0.0f; - values[1] += val; + } // CUDA kernel function @@ -47,18 +47,18 @@ void op_par_loop_initU_formula(char const *name, op_set set, // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(7); + op_timing_realloc(6); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[7].name = name; - OP_kernels[7].count += 1; + OP_kernels[6].name = name; + OP_kernels[6].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: initU_formula"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; @@ -74,11 +74,10 @@ void op_par_loop_initU_formula(char const *name, op_set set, mvConstArraysToDevice(consts_bytes); //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_7 - int nthread = OP_BLOCK_SIZE_7; + #ifdef OP_BLOCK_SIZE_6 + int nthread = OP_BLOCK_SIZE_6; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -90,10 +89,12 @@ void op_par_loop_initU_formula(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[7].time += wall_t2 - wall_t1; - OP_kernels[7].transfer += (float)set->size * arg0.size; - OP_kernels[7].transfer += (float)set->size * arg1.size * 2.0f; + OP_kernels[6].time += wall_t2 - wall_t1; + OP_kernels[6].transfer += (float)set->size * arg0.size; + OP_kernels[6].transfer += (float)set->size * arg1.size * 2.0f; } diff 
--git a/sp/cuda/initV_formula_kernel.cu b/sp/cuda/initV_formula_kernel.cu index 566dfbd..e7dad86 100644 --- a/sp/cuda/initV_formula_kernel.cu +++ b/sp/cuda/initV_formula_kernel.cu @@ -7,8 +7,9 @@ __device__ void initV_formula_gpu( const float *coords, float *values, const dou float x = coords[0]; float y = coords[1]; float t = *time; - float val = 0.0f ;; + float val = 0.0f ;; values[2] += val; + } // CUDA kernel function @@ -46,18 +47,18 @@ void op_par_loop_initV_formula(char const *name, op_set set, // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(8); + op_timing_realloc(7); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[8].name = name; - OP_kernels[8].count += 1; + OP_kernels[7].name = name; + OP_kernels[7].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: initV_formula"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; @@ -73,11 +74,10 @@ void op_par_loop_initV_formula(char const *name, op_set set, mvConstArraysToDevice(consts_bytes); //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_8 - int nthread = OP_BLOCK_SIZE_8; + #ifdef OP_BLOCK_SIZE_7 + int nthread = OP_BLOCK_SIZE_7; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -89,10 +89,12 @@ void op_par_loop_initV_formula(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[8].time += wall_t2 - wall_t1; - OP_kernels[8].transfer += (float)set->size * arg0.size; - OP_kernels[8].transfer += (float)set->size * arg1.size * 2.0f; + OP_kernels[7].time += wall_t2 - wall_t1; + OP_kernels[7].transfer += (float)set->size * arg0.size; + OP_kernels[7].transfer 
+= (float)set->size * arg1.size * 2.0f; } diff --git a/sp/cuda/limiter_kernel.cu b/sp/cuda/limiter_kernel.cu index 94f4e75..93104cc 100644 --- a/sp/cuda/limiter_kernel.cu +++ b/sp/cuda/limiter_kernel.cu @@ -6,12 +6,13 @@ __device__ void limiter_gpu( const float *q, float *lim, const float *value, const float *gradient, const float *edgecenter1, const float *edgecenter2, - const float *edgecenter3, const float *cellcenter) { + const float *edgecenter3, + float *zeroInit, + const float *cellcenter) { float facevalue[3], dx[3], dy[3]; int i, j; float max[3], edgealpha[3]; - dx[0] = (edgecenter1[0] - cellcenter[0]); dy[0] = (edgecenter1[1] - cellcenter[1]); dx[1] = (edgecenter2[0] - cellcenter[0]); @@ -19,7 +20,7 @@ __device__ void limiter_gpu( const float *q, float *lim, dx[2] = (edgecenter3[0] - cellcenter[0]); dy[2] = (edgecenter3[1] - cellcenter[1]); - if((value[0] > EPS) && (q[0]> EPS)){ + if(q[0] > EPS_cuda){ @@ -28,10 +29,10 @@ __device__ void limiter_gpu( const float *q, float *lim, for(j=0;j<4;j++){ for(i =0 ; i<3; i++){ - facevalue[i] = value[j] + ((gradient[2*j]*dx[i]) + (gradient[2*j + 1]*dy[i])); - if(facevalue[i] > value[j]) { + facevalue[i] = value[j] + (((gradient[2*j]*dx[i]) + (gradient[2*j + 1]*dy[i]))); + if(facevalue[i] > q[2*j + 1]) { edgealpha[i] = (q[2*j + 1] - value[j]) / (facevalue[i] - value[j]); - } else if (facevalue[i] < value[j]){ + } else if (facevalue[i] < q[2*j]){ edgealpha[i] = (q[2*j] - value[j]) / (facevalue[i] - value[j]); } else{ edgealpha[i] = 1.0f; @@ -44,13 +45,17 @@ __device__ void limiter_gpu( const float *q, float *lim, lim[0] = lim[0] < lim[1] ? lim[0]: lim[1]; lim[0] = lim[0] < lim[2] ? lim[0]: lim[2]; lim[0] = lim[0] < lim[3] ? 
lim[0]: lim[3]; - } else { lim[0] = 0.0f; lim[1] = 0.0f; lim[2] = 0.0f; lim[3] = 0.0f; } + zeroInit[0] = 0.0f; + zeroInit[1] = 0.0f; + zeroInit[2] = 0.0f; + zeroInit[3] = 0.0f; + } // CUDA kernel function @@ -61,7 +66,8 @@ __global__ void op_cuda_limiter( float *arg1, const float *__restrict arg2, const float *__restrict arg3, - const float *__restrict arg7, + float *arg7, + const float *__restrict arg8, int block_offset, int *blkmap, int *offset, @@ -71,6 +77,7 @@ __global__ void op_cuda_limiter( int nblocks, int set_size) { + __shared__ int nelem, offset_b; extern __shared__ char shared[]; @@ -107,7 +114,8 @@ __global__ void op_cuda_limiter( ind_arg0+map4idx*2, ind_arg0+map5idx*2, ind_arg0+map6idx*2, - arg7+(n+offset_b)*2); + arg7+(n+offset_b)*4, + arg8+(n+offset_b)*2); } } @@ -121,10 +129,11 @@ void op_par_loop_limiter(char const *name, op_set set, op_arg arg4, op_arg arg5, op_arg arg6, - op_arg arg7){ + op_arg arg7, + op_arg arg8){ - int nargs = 8; - op_arg args[8]; + int nargs = 9; + op_arg args[9]; args[0] = arg0; args[1] = arg1; @@ -134,31 +143,32 @@ void op_par_loop_limiter(char const *name, op_set set, args[5] = arg5; args[6] = arg6; args[7] = arg7; + args[8] = arg8; // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(22); + op_timing_realloc(23); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[22].name = name; - OP_kernels[22].count += 1; + OP_kernels[23].name = name; + OP_kernels[23].count += 1; int ninds = 1; - int inds[8] = {-1,-1,-1,-1,0,0,0,-1}; + int inds[9] = {-1,-1,-1,-1,0,0,0,-1,-1}; if (OP_diags>2) { printf(" kernel routine with indirection: limiter\n"); } //get plan - #ifdef OP_PART_SIZE_22 - int part_size = OP_PART_SIZE_22; + #ifdef OP_PART_SIZE_23 + int part_size = OP_PART_SIZE_23; #else int part_size = OP_part_size; #endif - int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { op_plan *Plan = 
op_plan_get(name,set,part_size,nargs,args,ninds,inds); @@ -167,10 +177,10 @@ void op_par_loop_limiter(char const *name, op_set set, int block_offset = 0; for ( int col=0; colncolors; col++ ){ if (col==Plan->ncolors_core) { - op_mpi_wait_all_cuda(nargs, args); + op_mpi_wait_all_grouped(nargs, args, 2); } - #ifdef OP_BLOCK_SIZE_22 - int nthread = OP_BLOCK_SIZE_22; + #ifdef OP_BLOCK_SIZE_23 + int nthread = OP_BLOCK_SIZE_23; #else int nthread = OP_block_size; #endif @@ -186,6 +196,7 @@ void op_par_loop_limiter(char const *name, op_set set, (float*)arg2.data_d, (float*)arg3.data_d, (float*)arg7.data_d, + (float*)arg8.data_d, block_offset, Plan->blkmap, Plan->offset, @@ -198,12 +209,14 @@ void op_par_loop_limiter(char const *name, op_set set, } block_offset += Plan->ncolblk[col]; } - OP_kernels[22].transfer += Plan->transfer; - OP_kernels[22].transfer2 += Plan->transfer2; + OP_kernels[23].transfer += Plan->transfer; + OP_kernels[23].transfer2 += Plan->transfer2; } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[22].time += wall_t2 - wall_t1; + OP_kernels[23].time += wall_t2 - wall_t1; } diff --git a/sp/cuda/simulation_1_kernel.cu b/sp/cuda/simulation_1_kernel.cu index c17d11f..c32b693 100644 --- a/sp/cuda/simulation_1_kernel.cu +++ b/sp/cuda/simulation_1_kernel.cu @@ -8,6 +8,7 @@ __device__ void simulation_1_gpu( float *out, const float *in) { out[1] = in[1]; out[2] = in[2]; out[3] = in[3]; + } // CUDA kernel function @@ -40,25 +41,24 @@ void op_par_loop_simulation_1(char const *name, op_set set, // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(4); + op_timing_realloc(3); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[4].name = name; - OP_kernels[4].count += 1; + OP_kernels[3].name = name; + OP_kernels[3].count += 1; if (OP_diags>2) { printf(" kernel routine w/o 
indirection: simulation_1"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2); + if (set_size > 0) { //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_4 - int nthread = OP_BLOCK_SIZE_4; + #ifdef OP_BLOCK_SIZE_3 + int nthread = OP_BLOCK_SIZE_3; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -69,10 +69,12 @@ void op_par_loop_simulation_1(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[4].time += wall_t2 - wall_t1; - OP_kernels[4].transfer += (float)set->size * arg0.size * 2.0f; - OP_kernels[4].transfer += (float)set->size * arg1.size; + OP_kernels[3].time += wall_t2 - wall_t1; + OP_kernels[3].transfer += (float)set->size * arg0.size * 2.0f; + OP_kernels[3].transfer += (float)set->size * arg1.size; } diff --git a/sp/cuda/values_operation2_kernel.cu b/sp/cuda/values_operation2_kernel.cu index 1b0fc07..cc739fb 100644 --- a/sp/cuda/values_operation2_kernel.cu +++ b/sp/cuda/values_operation2_kernel.cu @@ -18,6 +18,7 @@ __device__ void values_operation2_gpu( float *values, const int *result, const i values[*result] = values[*left] / values[*right]; break; } + } // CUDA kernel function @@ -66,18 +67,18 @@ void op_par_loop_values_operation2(char const *name, op_set set, // initialise timers double cpu_t1, cpu_t2, wall_t1, wall_t2; - op_timing_realloc(9); + op_timing_realloc(16); op_timers_core(&cpu_t1, &wall_t1); - OP_kernels[9].name = name; - OP_kernels[9].count += 1; + OP_kernels[16].name = name; + OP_kernels[16].count += 1; if (OP_diags>2) { printf(" kernel routine w/o indirection: values_operation2"); } - op_mpi_halo_exchanges_cuda(set, nargs, args); - if (set->size > 0) { + int set_size = op_mpi_halo_exchanges_grouped(set, 
nargs, args, 2); + if (set_size > 0) { //transfer constants to GPU int consts_bytes = 0; @@ -114,11 +115,10 @@ void op_par_loop_values_operation2(char const *name, op_set set, mvConstArraysToDevice(consts_bytes); //set CUDA execution parameters - #ifdef OP_BLOCK_SIZE_9 - int nthread = OP_BLOCK_SIZE_9; + #ifdef OP_BLOCK_SIZE_16 + int nthread = OP_BLOCK_SIZE_16; #else int nthread = OP_block_size; - // int nthread = 128; #endif int nblocks = 200; @@ -132,9 +132,11 @@ void op_par_loop_values_operation2(char const *name, op_set set, set->size ); } op_mpi_set_dirtybit_cuda(nargs, args); - cutilSafeCall(cudaDeviceSynchronize()); + if (OP_diags>1) { + cutilSafeCall(cudaDeviceSynchronize()); + } //update kernel record op_timers_core(&cpu_t2, &wall_t2); - OP_kernels[9].time += wall_t2 - wall_t1; - OP_kernels[9].transfer += (float)set->size * arg0.size * 2.0f; + OP_kernels[16].time += wall_t2 - wall_t1; + OP_kernels[16].transfer += (float)set->size * arg0.size * 2.0f; } diff --git a/sp/cuda/volna_hybkernels.cu b/sp/cuda/volna_hybkernels.cu index c96ba4e..b9a2436 100644 --- a/sp/cuda/volna_hybkernels.cu +++ b/sp/cuda/volna_hybkernels.cu @@ -4,209 +4,175 @@ //header #ifdef GPUPASS -#define op_par_loop_EvolveValuesRK3_1 op_par_loop_EvolveValuesRK3_1_gpu -#define op_par_loop_EvolveValuesRK3_2 op_par_loop_EvolveValuesRK3_2_gpu -#define op_par_loop_EvolveValuesRK3_3 op_par_loop_EvolveValuesRK3_3_gpu -#define op_par_loop_EvolveValuesRK3_4 op_par_loop_EvolveValuesRK3_4_gpu +#define op_par_loop_EvolveValuesRK2_1 op_par_loop_EvolveValuesRK2_1_gpu +#define op_par_loop_EvolveValuesRK2_2 op_par_loop_EvolveValuesRK2_2_gpu +#define op_par_loop_Friction_manning op_par_loop_Friction_manning_gpu #define op_par_loop_simulation_1 op_par_loop_simulation_1_gpu #define op_par_loop_incConst op_par_loop_incConst_gpu #define op_par_loop_initEta_formula op_par_loop_initEta_formula_gpu #define op_par_loop_initU_formula op_par_loop_initU_formula_gpu #define op_par_loop_initV_formula 
op_par_loop_initV_formula_gpu -#define op_par_loop_values_operation2 op_par_loop_values_operation2_gpu #define op_par_loop_applyConst op_par_loop_applyConst_gpu #define op_par_loop_initBathymetry_large op_par_loop_initBathymetry_large_gpu #define op_par_loop_initBathyRelative_formula op_par_loop_initBathyRelative_formula_gpu #define op_par_loop_initBathymetry_formula op_par_loop_initBathymetry_formula_gpu +#define op_par_loop_zero_bathy op_par_loop_zero_bathy_gpu #define op_par_loop_initBathymetry_update op_par_loop_initBathymetry_update_gpu #define op_par_loop_initBore_select op_par_loop_initBore_select_gpu #define op_par_loop_initGaussianLandslide op_par_loop_initGaussianLandslide_gpu +#define op_par_loop_values_operation2 op_par_loop_values_operation2_gpu #define op_par_loop_getTotalVol op_par_loop_getTotalVol_gpu #define op_par_loop_getMaxElevation op_par_loop_getMaxElevation_gpu #define op_par_loop_getMaxSpeed op_par_loop_getMaxSpeed_gpu #define op_par_loop_gatherLocations op_par_loop_gatherLocations_gpu +#define op_par_loop_toOutputs op_par_loop_toOutputs_gpu #define op_par_loop_computeGradient op_par_loop_computeGradient_gpu #define op_par_loop_limiter op_par_loop_limiter_gpu #define op_par_loop_computeFluxes op_par_loop_computeFluxes_gpu +#define op_par_loop_Timestep op_par_loop_Timestep_gpu #define op_par_loop_NumericalFluxes op_par_loop_NumericalFluxes_gpu -#define op_par_loop_SpaceDiscretization op_par_loop_SpaceDiscretization_gpu +#define op_par_loop_computeFluxes_sph op_par_loop_computeFluxes_sph_gpu +#define op_par_loop_NumericalFluxes_sph op_par_loop_NumericalFluxes_sph_gpu #include "volna_kernels.cu" -#undef op_par_loop_EvolveValuesRK3_1 -#undef op_par_loop_EvolveValuesRK3_2 -#undef op_par_loop_EvolveValuesRK3_3 -#undef op_par_loop_EvolveValuesRK3_4 +#undef op_par_loop_EvolveValuesRK2_1 +#undef op_par_loop_EvolveValuesRK2_2 +#undef op_par_loop_Friction_manning #undef op_par_loop_simulation_1 #undef op_par_loop_incConst #undef 
op_par_loop_initEta_formula #undef op_par_loop_initU_formula #undef op_par_loop_initV_formula -#undef op_par_loop_values_operation2 #undef op_par_loop_applyConst #undef op_par_loop_initBathymetry_large #undef op_par_loop_initBathyRelative_formula #undef op_par_loop_initBathymetry_formula +#undef op_par_loop_zero_bathy #undef op_par_loop_initBathymetry_update #undef op_par_loop_initBore_select #undef op_par_loop_initGaussianLandslide +#undef op_par_loop_values_operation2 #undef op_par_loop_getTotalVol #undef op_par_loop_getMaxElevation #undef op_par_loop_getMaxSpeed #undef op_par_loop_gatherLocations +#undef op_par_loop_toOutputs #undef op_par_loop_computeGradient #undef op_par_loop_limiter #undef op_par_loop_computeFluxes +#undef op_par_loop_Timestep #undef op_par_loop_NumericalFluxes -#undef op_par_loop_SpaceDiscretization +#undef op_par_loop_computeFluxes_sph +#undef op_par_loop_NumericalFluxes_sph #else -#define op_par_loop_EvolveValuesRK3_1 op_par_loop_EvolveValuesRK3_1_cpu -#define op_par_loop_EvolveValuesRK3_2 op_par_loop_EvolveValuesRK3_2_cpu -#define op_par_loop_EvolveValuesRK3_3 op_par_loop_EvolveValuesRK3_3_cpu -#define op_par_loop_EvolveValuesRK3_4 op_par_loop_EvolveValuesRK3_4_cpu +#define op_par_loop_EvolveValuesRK2_1 op_par_loop_EvolveValuesRK2_1_cpu +#define op_par_loop_EvolveValuesRK2_2 op_par_loop_EvolveValuesRK2_2_cpu +#define op_par_loop_Friction_manning op_par_loop_Friction_manning_cpu #define op_par_loop_simulation_1 op_par_loop_simulation_1_cpu #define op_par_loop_incConst op_par_loop_incConst_cpu #define op_par_loop_initEta_formula op_par_loop_initEta_formula_cpu #define op_par_loop_initU_formula op_par_loop_initU_formula_cpu #define op_par_loop_initV_formula op_par_loop_initV_formula_cpu -#define op_par_loop_values_operation2 op_par_loop_values_operation2_cpu #define op_par_loop_applyConst op_par_loop_applyConst_cpu #define op_par_loop_initBathymetry_large op_par_loop_initBathymetry_large_cpu #define op_par_loop_initBathyRelative_formula 
op_par_loop_initBathyRelative_formula_cpu #define op_par_loop_initBathymetry_formula op_par_loop_initBathymetry_formula_cpu +#define op_par_loop_zero_bathy op_par_loop_zero_bathy_cpu #define op_par_loop_initBathymetry_update op_par_loop_initBathymetry_update_cpu #define op_par_loop_initBore_select op_par_loop_initBore_select_cpu #define op_par_loop_initGaussianLandslide op_par_loop_initGaussianLandslide_cpu +#define op_par_loop_values_operation2 op_par_loop_values_operation2_cpu #define op_par_loop_getTotalVol op_par_loop_getTotalVol_cpu #define op_par_loop_getMaxElevation op_par_loop_getMaxElevation_cpu #define op_par_loop_getMaxSpeed op_par_loop_getMaxSpeed_cpu #define op_par_loop_gatherLocations op_par_loop_gatherLocations_cpu +#define op_par_loop_toOutputs op_par_loop_toOutputs_cpu #define op_par_loop_computeGradient op_par_loop_computeGradient_cpu #define op_par_loop_limiter op_par_loop_limiter_cpu #define op_par_loop_computeFluxes op_par_loop_computeFluxes_cpu +#define op_par_loop_Timestep op_par_loop_Timestep_cpu #define op_par_loop_NumericalFluxes op_par_loop_NumericalFluxes_cpu -#define op_par_loop_SpaceDiscretization op_par_loop_SpaceDiscretization_cpu +#define op_par_loop_computeFluxes_sph op_par_loop_computeFluxes_sph_cpu +#define op_par_loop_NumericalFluxes_sph op_par_loop_NumericalFluxes_sph_cpu #include "../openmp/volna_kernels.cpp" -#undef op_par_loop_EvolveValuesRK3_1 -#undef op_par_loop_EvolveValuesRK3_2 -#undef op_par_loop_EvolveValuesRK3_3 -#undef op_par_loop_EvolveValuesRK3_4 +#undef op_par_loop_EvolveValuesRK2_1 +#undef op_par_loop_EvolveValuesRK2_2 +#undef op_par_loop_Friction_manning #undef op_par_loop_simulation_1 #undef op_par_loop_incConst #undef op_par_loop_initEta_formula #undef op_par_loop_initU_formula #undef op_par_loop_initV_formula -#undef op_par_loop_values_operation2 #undef op_par_loop_applyConst #undef op_par_loop_initBathymetry_large #undef op_par_loop_initBathyRelative_formula #undef op_par_loop_initBathymetry_formula +#undef 
op_par_loop_zero_bathy #undef op_par_loop_initBathymetry_update #undef op_par_loop_initBore_select #undef op_par_loop_initGaussianLandslide +#undef op_par_loop_values_operation2 #undef op_par_loop_getTotalVol #undef op_par_loop_getMaxElevation #undef op_par_loop_getMaxSpeed #undef op_par_loop_gatherLocations +#undef op_par_loop_toOutputs #undef op_par_loop_computeGradient #undef op_par_loop_limiter #undef op_par_loop_computeFluxes +#undef op_par_loop_Timestep #undef op_par_loop_NumericalFluxes -#undef op_par_loop_SpaceDiscretization +#undef op_par_loop_computeFluxes_sph +#undef op_par_loop_NumericalFluxes_sph //user kernel files -void op_par_loop_EvolveValuesRK3_1_gpu(char const *name, op_set set, +void op_par_loop_EvolveValuesRK2_1_gpu(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, - op_arg arg3, - op_arg arg4); + op_arg arg3); //GPU host stub function #if OP_HYBRID_GPU -void op_par_loop_EvolveValuesRK3_1(char const *name, op_set set, +void op_par_loop_EvolveValuesRK2_1(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, - op_arg arg3, - op_arg arg4){ + op_arg arg3){ if (OP_hybrid_gpu) { - op_par_loop_EvolveValuesRK3_1_gpu(name, set, + op_par_loop_EvolveValuesRK2_1_gpu(name, set, arg0, arg1, arg2, - arg3, - arg4); + arg3); }else{ - op_par_loop_EvolveValuesRK3_1_cpu(name, set, + op_par_loop_EvolveValuesRK2_1_cpu(name, set, arg0, arg1, arg2, - arg3, - arg4); + arg3); } } #else -void op_par_loop_EvolveValuesRK3_1(char const *name, op_set set, +void op_par_loop_EvolveValuesRK2_1(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, - op_arg arg3, - op_arg arg4){ + op_arg arg3){ - op_par_loop_EvolveValuesRK3_1_gpu(name, set, + op_par_loop_EvolveValuesRK2_1_gpu(name, set, arg0, arg1, arg2, - arg3, - arg4); - - } -#endif //OP_HYBRID_GPU - -void op_par_loop_EvolveValuesRK3_2_gpu(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2); - -//GPU host stub function -#if OP_HYBRID_GPU -void 
op_par_loop_EvolveValuesRK3_2(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2){ - - if (OP_hybrid_gpu) { - op_par_loop_EvolveValuesRK3_2_gpu(name, set, - arg0, - arg1, - arg2); - - }else{ - op_par_loop_EvolveValuesRK3_2_cpu(name, set, - arg0, - arg1, - arg2); - - } -} -#else -void op_par_loop_EvolveValuesRK3_2(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2){ - - op_par_loop_EvolveValuesRK3_2_gpu(name, set, - arg0, - arg1, - arg2); + arg3); } #endif //OP_HYBRID_GPU -void op_par_loop_EvolveValuesRK3_3_gpu(char const *name, op_set set, +void op_par_loop_EvolveValuesRK2_2_gpu(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, @@ -215,7 +181,7 @@ void op_par_loop_EvolveValuesRK3_3_gpu(char const *name, op_set set, //GPU host stub function #if OP_HYBRID_GPU -void op_par_loop_EvolveValuesRK3_3(char const *name, op_set set, +void op_par_loop_EvolveValuesRK2_2(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, @@ -223,7 +189,7 @@ void op_par_loop_EvolveValuesRK3_3(char const *name, op_set set, op_arg arg4){ if (OP_hybrid_gpu) { - op_par_loop_EvolveValuesRK3_3_gpu(name, set, + op_par_loop_EvolveValuesRK2_2_gpu(name, set, arg0, arg1, arg2, @@ -231,7 +197,7 @@ void op_par_loop_EvolveValuesRK3_3(char const *name, op_set set, arg4); }else{ - op_par_loop_EvolveValuesRK3_3_cpu(name, set, + op_par_loop_EvolveValuesRK2_2_cpu(name, set, arg0, arg1, arg2, @@ -241,14 +207,14 @@ void op_par_loop_EvolveValuesRK3_3(char const *name, op_set set, } } #else -void op_par_loop_EvolveValuesRK3_3(char const *name, op_set set, +void op_par_loop_EvolveValuesRK2_2(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, op_arg arg3, op_arg arg4){ - op_par_loop_EvolveValuesRK3_3_gpu(name, set, + op_par_loop_EvolveValuesRK2_2_gpu(name, set, arg0, arg1, arg2, @@ -258,48 +224,42 @@ void op_par_loop_EvolveValuesRK3_3(char const *name, op_set set, } #endif //OP_HYBRID_GPU -void 
op_par_loop_EvolveValuesRK3_4_gpu(char const *name, op_set set, +void op_par_loop_Friction_manning_gpu(char const *name, op_set set, op_arg arg0, op_arg arg1, - op_arg arg2, - op_arg arg3); + op_arg arg2); //GPU host stub function #if OP_HYBRID_GPU -void op_par_loop_EvolveValuesRK3_4(char const *name, op_set set, +void op_par_loop_Friction_manning(char const *name, op_set set, op_arg arg0, op_arg arg1, - op_arg arg2, - op_arg arg3){ + op_arg arg2){ if (OP_hybrid_gpu) { - op_par_loop_EvolveValuesRK3_4_gpu(name, set, + op_par_loop_Friction_manning_gpu(name, set, arg0, arg1, - arg2, - arg3); + arg2); }else{ - op_par_loop_EvolveValuesRK3_4_cpu(name, set, + op_par_loop_Friction_manning_cpu(name, set, arg0, arg1, - arg2, - arg3); + arg2); } } #else -void op_par_loop_EvolveValuesRK3_4(char const *name, op_set set, +void op_par_loop_Friction_manning(char const *name, op_set set, op_arg arg0, op_arg arg1, - op_arg arg2, - op_arg arg3){ + op_arg arg2){ - op_par_loop_EvolveValuesRK3_4_gpu(name, set, + op_par_loop_Friction_manning_gpu(name, set, arg0, arg1, - arg2, - arg3); + arg2); } #endif //OP_HYBRID_GPU @@ -498,58 +458,6 @@ void op_par_loop_initV_formula(char const *name, op_set set, } #endif //OP_HYBRID_GPU -void op_par_loop_values_operation2_gpu(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2, - op_arg arg3, - op_arg arg4); - -//GPU host stub function -#if OP_HYBRID_GPU -void op_par_loop_values_operation2(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2, - op_arg arg3, - op_arg arg4){ - - if (OP_hybrid_gpu) { - op_par_loop_values_operation2_gpu(name, set, - arg0, - arg1, - arg2, - arg3, - arg4); - - }else{ - op_par_loop_values_operation2_cpu(name, set, - arg0, - arg1, - arg2, - arg3, - arg4); - - } -} -#else -void op_par_loop_values_operation2(char const *name, op_set set, - op_arg arg0, - op_arg arg1, - op_arg arg2, - op_arg arg3, - op_arg arg4){ - - op_par_loop_values_operation2_gpu(name, set, - arg0, - arg1, - arg2, - 
arg3, - arg4); - - } -#endif //OP_HYBRID_GPU - void op_par_loop_applyConst_gpu(char const *name, op_set set, op_arg arg0, op_arg arg1, @@ -746,40 +654,86 @@ void op_par_loop_initBathymetry_formula(char const *name, op_set set, } #endif //OP_HYBRID_GPU -void op_par_loop_initBathymetry_update_gpu(char const *name, op_set set, +void op_par_loop_zero_bathy_gpu(char const *name, op_set set, op_arg arg0, op_arg arg1); //GPU host stub function #if OP_HYBRID_GPU -void op_par_loop_initBathymetry_update(char const *name, op_set set, +void op_par_loop_zero_bathy(char const *name, op_set set, op_arg arg0, op_arg arg1){ if (OP_hybrid_gpu) { - op_par_loop_initBathymetry_update_gpu(name, set, + op_par_loop_zero_bathy_gpu(name, set, arg0, arg1); }else{ - op_par_loop_initBathymetry_update_cpu(name, set, + op_par_loop_zero_bathy_cpu(name, set, arg0, arg1); } } #else -void op_par_loop_initBathymetry_update(char const *name, op_set set, +void op_par_loop_zero_bathy(char const *name, op_set set, op_arg arg0, op_arg arg1){ - op_par_loop_initBathymetry_update_gpu(name, set, + op_par_loop_zero_bathy_gpu(name, set, arg0, arg1); } #endif //OP_HYBRID_GPU +void op_par_loop_initBathymetry_update_gpu(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3); + +//GPU host stub function +#if OP_HYBRID_GPU +void op_par_loop_initBathymetry_update(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3){ + + if (OP_hybrid_gpu) { + op_par_loop_initBathymetry_update_gpu(name, set, + arg0, + arg1, + arg2, + arg3); + + }else{ + op_par_loop_initBathymetry_update_cpu(name, set, + arg0, + arg1, + arg2, + arg3); + + } +} +#else +void op_par_loop_initBathymetry_update(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3){ + + op_par_loop_initBathymetry_update_gpu(name, set, + arg0, + arg1, + arg2, + arg3); + + } +#endif //OP_HYBRID_GPU + void op_par_loop_initBore_select_gpu(char const *name, op_set set, 
op_arg arg0, op_arg arg1, @@ -926,6 +880,58 @@ void op_par_loop_initGaussianLandslide(char const *name, op_set set, } #endif //OP_HYBRID_GPU +void op_par_loop_values_operation2_gpu(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3, + op_arg arg4); + +//GPU host stub function +#if OP_HYBRID_GPU +void op_par_loop_values_operation2(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3, + op_arg arg4){ + + if (OP_hybrid_gpu) { + op_par_loop_values_operation2_gpu(name, set, + arg0, + arg1, + arg2, + arg3, + arg4); + + }else{ + op_par_loop_values_operation2_cpu(name, set, + arg0, + arg1, + arg2, + arg3, + arg4); + + } +} +#else +void op_par_loop_values_operation2(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3, + op_arg arg4){ + + op_par_loop_values_operation2_gpu(name, set, + arg0, + arg1, + arg2, + arg3, + arg4); + + } +#endif //OP_HYBRID_GPU + void op_par_loop_getTotalVol_gpu(char const *name, op_set set, op_arg arg0, op_arg arg1, @@ -1036,34 +1042,80 @@ void op_par_loop_getMaxSpeed(char const *name, op_set set, void op_par_loop_gatherLocations_gpu(char const *name, op_set set, op_arg arg0, - op_arg arg1); + op_arg arg1, + op_arg arg2); //GPU host stub function #if OP_HYBRID_GPU void op_par_loop_gatherLocations(char const *name, op_set set, op_arg arg0, - op_arg arg1){ + op_arg arg1, + op_arg arg2){ if (OP_hybrid_gpu) { op_par_loop_gatherLocations_gpu(name, set, arg0, - arg1); + arg1, + arg2); }else{ op_par_loop_gatherLocations_cpu(name, set, arg0, - arg1); + arg1, + arg2); } } #else void op_par_loop_gatherLocations(char const *name, op_set set, op_arg arg0, - op_arg arg1){ + op_arg arg1, + op_arg arg2){ op_par_loop_gatherLocations_gpu(name, set, arg0, - arg1); + arg1, + arg2); + + } +#endif //OP_HYBRID_GPU + +void op_par_loop_toOutputs_gpu(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2); + +//GPU host stub function +#if 
OP_HYBRID_GPU +void op_par_loop_toOutputs(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2){ + + if (OP_hybrid_gpu) { + op_par_loop_toOutputs_gpu(name, set, + arg0, + arg1, + arg2); + + }else{ + op_par_loop_toOutputs_cpu(name, set, + arg0, + arg1, + arg2); + + } +} +#else +void op_par_loop_toOutputs(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2){ + + op_par_loop_toOutputs_gpu(name, set, + arg0, + arg1, + arg2); } #endif //OP_HYBRID_GPU @@ -1158,7 +1210,8 @@ void op_par_loop_limiter_gpu(char const *name, op_set set, op_arg arg4, op_arg arg5, op_arg arg6, - op_arg arg7); + op_arg arg7, + op_arg arg8); //GPU host stub function #if OP_HYBRID_GPU @@ -1170,7 +1223,8 @@ void op_par_loop_limiter(char const *name, op_set set, op_arg arg4, op_arg arg5, op_arg arg6, - op_arg arg7){ + op_arg arg7, + op_arg arg8){ if (OP_hybrid_gpu) { op_par_loop_limiter_gpu(name, set, @@ -1181,7 +1235,8 @@ void op_par_loop_limiter(char const *name, op_set set, arg4, arg5, arg6, - arg7); + arg7, + arg8); }else{ op_par_loop_limiter_cpu(name, set, @@ -1192,7 +1247,8 @@ void op_par_loop_limiter(char const *name, op_set set, arg4, arg5, arg6, - arg7); + arg7, + arg8); } } @@ -1205,7 +1261,8 @@ void op_par_loop_limiter(char const *name, op_set set, op_arg arg4, op_arg arg5, op_arg arg6, - op_arg arg7){ + op_arg arg7, + op_arg arg8){ op_par_loop_limiter_gpu(name, set, arg0, @@ -1215,7 +1272,8 @@ void op_par_loop_limiter(char const *name, op_set set, arg4, arg5, arg6, - arg7); + arg7, + arg8); } #endif //OP_HYBRID_GPU @@ -1235,7 +1293,8 @@ void op_par_loop_computeFluxes_gpu(char const *name, op_set set, op_arg arg11, op_arg arg12, op_arg arg13, - op_arg arg14); + op_arg arg14, + op_arg arg15); //GPU host stub function #if OP_HYBRID_GPU @@ -1254,7 +1313,8 @@ void op_par_loop_computeFluxes(char const *name, op_set set, op_arg arg11, op_arg arg12, op_arg arg13, - op_arg arg14){ + op_arg arg14, + op_arg arg15){ if (OP_hybrid_gpu) { 
op_par_loop_computeFluxes_gpu(name, set, @@ -1272,7 +1332,8 @@ void op_par_loop_computeFluxes(char const *name, op_set set, arg11, arg12, arg13, - arg14); + arg14, + arg15); }else{ op_par_loop_computeFluxes_cpu(name, set, @@ -1290,7 +1351,8 @@ void op_par_loop_computeFluxes(char const *name, op_set set, arg11, arg12, arg13, - arg14); + arg14, + arg15); } } @@ -1310,7 +1372,8 @@ void op_par_loop_computeFluxes(char const *name, op_set set, op_arg arg11, op_arg arg12, op_arg arg13, - op_arg arg14){ + op_arg arg14, + op_arg arg15){ op_par_loop_computeFluxes_gpu(name, set, arg0, @@ -1327,7 +1390,78 @@ void op_par_loop_computeFluxes(char const *name, op_set set, arg11, arg12, arg13, - arg14); + arg14, + arg15); + + } +#endif //OP_HYBRID_GPU + +void op_par_loop_Timestep_gpu(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3, + op_arg arg4, + op_arg arg5, + op_arg arg6, + op_arg arg7); + +//GPU host stub function +#if OP_HYBRID_GPU +void op_par_loop_Timestep(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3, + op_arg arg4, + op_arg arg5, + op_arg arg6, + op_arg arg7){ + + if (OP_hybrid_gpu) { + op_par_loop_Timestep_gpu(name, set, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7); + + }else{ + op_par_loop_Timestep_cpu(name, set, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7); + + } +} +#else +void op_par_loop_Timestep(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3, + op_arg arg4, + op_arg arg5, + op_arg arg6, + op_arg arg7){ + + op_par_loop_Timestep_gpu(name, set, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7); } #endif //OP_HYBRID_GPU @@ -1340,8 +1474,7 @@ void op_par_loop_NumericalFluxes_gpu(char const *name, op_set set, op_arg arg4, op_arg arg5, op_arg arg6, - op_arg arg7, - op_arg arg8); + op_arg arg7); //GPU host stub function #if OP_HYBRID_GPU @@ -1353,8 +1486,7 @@ void op_par_loop_NumericalFluxes(char 
const *name, op_set set, op_arg arg4, op_arg arg5, op_arg arg6, - op_arg arg7, - op_arg arg8){ + op_arg arg7){ if (OP_hybrid_gpu) { op_par_loop_NumericalFluxes_gpu(name, set, @@ -1365,8 +1497,7 @@ void op_par_loop_NumericalFluxes(char const *name, op_set set, arg4, arg5, arg6, - arg7, - arg8); + arg7); }else{ op_par_loop_NumericalFluxes_cpu(name, set, @@ -1377,8 +1508,7 @@ void op_par_loop_NumericalFluxes(char const *name, op_set set, arg4, arg5, arg6, - arg7, - arg8); + arg7); } } @@ -1391,10 +1521,119 @@ void op_par_loop_NumericalFluxes(char const *name, op_set set, op_arg arg4, op_arg arg5, op_arg arg6, - op_arg arg7, - op_arg arg8){ + op_arg arg7){ op_par_loop_NumericalFluxes_gpu(name, set, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7); + + } +#endif //OP_HYBRID_GPU + +void op_par_loop_computeFluxes_sph_gpu(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3, + op_arg arg4, + op_arg arg5, + op_arg arg6, + op_arg arg7, + op_arg arg8, + op_arg arg9, + op_arg arg10, + op_arg arg11, + op_arg arg12, + op_arg arg13, + op_arg arg14, + op_arg arg15); + +//GPU host stub function +#if OP_HYBRID_GPU +void op_par_loop_computeFluxes_sph(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg arg2, + op_arg arg3, + op_arg arg4, + op_arg arg5, + op_arg arg6, + op_arg arg7, + op_arg arg8, + op_arg arg9, + op_arg arg10, + op_arg arg11, + op_arg arg12, + op_arg arg13, + op_arg arg14, + op_arg arg15){ + + if (OP_hybrid_gpu) { + op_par_loop_computeFluxes_sph_gpu(name, set, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15); + + }else{ + op_par_loop_computeFluxes_sph_cpu(name, set, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15); + + } +} +#else +void op_par_loop_computeFluxes_sph(char const *name, op_set set, + op_arg arg0, + op_arg arg1, + op_arg 
arg2, + op_arg arg3, + op_arg arg4, + op_arg arg5, + op_arg arg6, + op_arg arg7, + op_arg arg8, + op_arg arg9, + op_arg arg10, + op_arg arg11, + op_arg arg12, + op_arg arg13, + op_arg arg14, + op_arg arg15){ + + op_par_loop_computeFluxes_sph_gpu(name, set, arg0, arg1, arg2, @@ -1403,12 +1642,19 @@ void op_par_loop_NumericalFluxes(char const *name, op_set set, arg5, arg6, arg7, - arg8); + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15); } #endif //OP_HYBRID_GPU -void op_par_loop_SpaceDiscretization_gpu(char const *name, op_set set, +void op_par_loop_NumericalFluxes_sph_gpu(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, @@ -1422,7 +1668,7 @@ void op_par_loop_SpaceDiscretization_gpu(char const *name, op_set set, //GPU host stub function #if OP_HYBRID_GPU -void op_par_loop_SpaceDiscretization(char const *name, op_set set, +void op_par_loop_NumericalFluxes_sph(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, @@ -1435,7 +1681,7 @@ void op_par_loop_SpaceDiscretization(char const *name, op_set set, op_arg arg9){ if (OP_hybrid_gpu) { - op_par_loop_SpaceDiscretization_gpu(name, set, + op_par_loop_NumericalFluxes_sph_gpu(name, set, arg0, arg1, arg2, @@ -1448,7 +1694,7 @@ void op_par_loop_SpaceDiscretization(char const *name, op_set set, arg9); }else{ - op_par_loop_SpaceDiscretization_cpu(name, set, + op_par_loop_NumericalFluxes_sph_cpu(name, set, arg0, arg1, arg2, @@ -1463,7 +1709,7 @@ void op_par_loop_SpaceDiscretization(char const *name, op_set set, } } #else -void op_par_loop_SpaceDiscretization(char const *name, op_set set, +void op_par_loop_NumericalFluxes_sph(char const *name, op_set set, op_arg arg0, op_arg arg1, op_arg arg2, @@ -1475,7 +1721,7 @@ void op_par_loop_SpaceDiscretization(char const *name, op_set set, op_arg arg8, op_arg arg9){ - op_par_loop_SpaceDiscretization_gpu(name, set, + op_par_loop_NumericalFluxes_sph_gpu(name, set, arg0, arg1, arg2, diff --git a/sp/cuda/volna_kernels.cu 
b/sp/cuda/volna_kernels.cu index 5a70bac..7a05762 100644 --- a/sp/cuda/volna_kernels.cu +++ b/sp/cuda/volna_kernels.cu @@ -7,9 +7,12 @@ #define MAX_CONST_SIZE 128 #endif -__constant__ float CFL; -__constant__ float EPS; -__constant__ float g; +__constant__ float CFL_cuda; +__constant__ float EPS_cuda; +__constant__ float g_cuda; +__constant__ float Mn_cuda; +__constant__ int spherical_cuda; +__constant__ float Radius_cuda; //header #include "op_lib_cpp.h" @@ -20,15 +23,27 @@ void op_decl_const_char(int dim, char const *type, int size, char *dat, char const *name){ if (!OP_hybrid_gpu) return; if (!strcmp(name,"CFL")) { - cutilSafeCall(cudaMemcpyToSymbol(CFL, dat, dim*size)); + cutilSafeCall(cudaMemcpyToSymbol(CFL_cuda, dat, dim*size)); } else if (!strcmp(name,"EPS")) { - cutilSafeCall(cudaMemcpyToSymbol(EPS, dat, dim*size)); + cutilSafeCall(cudaMemcpyToSymbol(EPS_cuda, dat, dim*size)); } else if (!strcmp(name,"g")) { - cutilSafeCall(cudaMemcpyToSymbol(g, dat, dim*size)); + cutilSafeCall(cudaMemcpyToSymbol(g_cuda, dat, dim*size)); + } + else + if (!strcmp(name,"Mn")) { + cutilSafeCall(cudaMemcpyToSymbol(Mn_cuda, dat, dim*size)); + } + else + if (!strcmp(name,"spherical")) { + cutilSafeCall(cudaMemcpyToSymbol(spherical_cuda, dat, dim*size)); + } + else + if (!strcmp(name,"Radius")) { + cutilSafeCall(cudaMemcpyToSymbol(Radius_cuda, dat, dim*size)); } else { @@ -37,29 +52,32 @@ int size, char *dat, char const *name){ } //user kernel files -#include "EvolveValuesRK3_1_kernel.cu" -#include "EvolveValuesRK3_2_kernel.cu" -#include "EvolveValuesRK3_3_kernel.cu" -#include "EvolveValuesRK3_4_kernel.cu" +#include "EvolveValuesRK2_1_kernel.cu" +#include "EvolveValuesRK2_2_kernel.cu" +#include "Friction_manning_kernel.cu" #include "simulation_1_kernel.cu" #include "incConst_kernel.cu" #include "initEta_formula_kernel.cu" #include "initU_formula_kernel.cu" #include "initV_formula_kernel.cu" -#include "values_operation2_kernel.cu" #include "applyConst_kernel.cu" #include 
"initBathymetry_large_kernel.cu" #include "initBathyRelative_formula_kernel.cu" #include "initBathymetry_formula_kernel.cu" +#include "zero_bathy_kernel.cu" #include "initBathymetry_update_kernel.cu" #include "initBore_select_kernel.cu" #include "initGaussianLandslide_kernel.cu" +#include "values_operation2_kernel.cu" #include "getTotalVol_kernel.cu" #include "getMaxElevation_kernel.cu" #include "getMaxSpeed_kernel.cu" #include "gatherLocations_kernel.cu" +#include "toOutputs_kernel.cu" #include "computeGradient_kernel.cu" #include "limiter_kernel.cu" #include "computeFluxes_kernel.cu" +#include "Timestep_kernel.cu" #include "NumericalFluxes_kernel.cu" -#include "SpaceDiscretization_kernel.cu" +#include "computeFluxes_sph_kernel.cu" +#include "NumericalFluxes_sph_kernel.cu" diff --git a/sp/volna_op.cpp b/sp/volna_op.cpp index fa4b32e..ac781f9 100644 --- a/sp/volna_op.cpp +++ b/sp/volna_op.cpp @@ -69,9 +69,9 @@ double timestamp = 0.0; int itercount = 0; // Constants -float CFL, g, EPS; +float CFL, g, EPS, Mn, Radius; +int spherical; bool new_format = true; - // Store maximum elevation and speed in global variable, for the sake of max search op_dat currentMaxElevation = NULL; op_dat currentMaxSpeed = NULL; @@ -241,7 +241,8 @@ int main(int argc, char **argv) { * Read constants from HDF5 */ op_get_const_hdf5("CFL", 1, "float", (char *) &CFL, filename_h5); - // op_get_const_hdf5("Mn", 1, "float", (char *) &CFL, filename_h5); + op_get_const_hdf5("Mn", 1, "float", (char *) &Mn, filename_h5); + op_get_const_hdf5("Spherical", 1, "int", (char *) &spherical, filename_h5); // Final time: as defined by Volna the end of real-time simulation float ftime_tmp, dtmax_tmp; op_get_const_hdf5("ftime", 1, "float", (char *) &ftime_tmp, filename_h5); @@ -253,7 +254,19 @@ int main(int argc, char **argv) { op_decl_const2("CFL",1,"float",&CFL); op_decl_const2("EPS",1,"float",&EPS); op_decl_const2("g",1,"float",&g); - + op_decl_const2("Mn",1,"float",&Mn); + 
op_decl_const2("spherical",1,"int",&spherical); + + Radius = 6378136.6; + op_decl_const2("Radius",1,"float",&Radius); + + printf("Mn = %f, CFL = %f \n", Mn, CFL); + if (spherical){ + printf("Spherical Polar coordinates \n"); + printf("Radius of the Earth = %f \n", Radius); + } else { + printf("Cartesian coordinates \n"); + } //Read InitBathymetry and InitEta event data when they come from files for (unsigned int i = 0; i < events.size(); i++) { if (!strcmp(events[i].className.c_str(), "InitEta")) { @@ -342,8 +355,8 @@ int main(int argc, char **argv) { // op_partition("PARMETIS", "GEOM", NULL, NULL, cellCenters); // op_partition("PTSCOTCH", "GEOM", NULL, NULL, cellCenters); // op_partition("", "", NULL, NULL, NULL); - op_partition("PARMETIS", "KWAY", NULL, edgesToCells, NULL); -// op_partition("PTSCOTCH", "KWAY", NULL, edgesToCells, NULL); + // op_partition("PARMETIS", "KWAY", NULL, edgesToCells, NULL); + op_partition("PTSCOTCH", "KWAY", NULL, edgesToCells, NULL); // op_partition("PARMETIS", "GEOMKWAY", edges, edgesToCells, cellCenters); // op_partition("PARMETIS", "KWAY", NULL, NULL, NULL); // op_partition("PARMETIS", "KWAY", edges, edgesToCells, cellCenters); @@ -385,16 +398,19 @@ int main(int argc, char **argv) { while (timestamp < ftime) { //process post_update==false events (usually Init events) processEvents(&timers, &events, 0, 0, 0.0, 0, 0, cells, values, cellVolumes, cellCenters, nodeCoords, cellsToNodes, temp_initEta, temp_initU, temp_initV, bathy_nodes, lifted_cells, liftedcellsToBathyNodes, liftedcellsToCells, bathy_xy, initial_zb, temp_initBathymetry, z_zero, n_initBathymetry, &zmin, outputLocation_map, outputLocation_dat, writeOption); - - -#ifdef DEBUG -#endif { - float minTimestep = 0.0; + float minTimestep = INFINITY; + if (!spherical){ spaceDiscretization(values, Lw_n, &minTimestep, bathySource, edgeFluxes, maxEdgeEigenvalues, edgeNormals, edgeLength, cellVolumes, isBoundary, cells, edges, edgesToCells, cellsToEdges, cellsToCells, edgeCenters, 
cellCenters, GradientatCell, q, lim, &zmin); + } else { + spaceDiscretization_sph(values, Lw_n, &minTimestep, + bathySource, edgeFluxes, maxEdgeEigenvalues, + edgeNormals, edgeLength, cellVolumes, isBoundary, + cells, edges, edgesToCells, cellsToEdges, cellsToCells, edgeCenters, cellCenters, GradientatCell, q, lim, &zmin); + } #ifdef DEBUG printf("Return of SpaceDiscretization #1 midPointConservative H %g U %g V %g Zb %g \n", normcomp(Lw_n, 0), normcomp(Lw_n, 1),normcomp(Lw_n, 2),normcomp(Lw_n, 3)); #endif @@ -411,12 +427,19 @@ int main(int argc, char **argv) { #endif float dummy = -1.0f; + if (!spherical){ spaceDiscretization(w_1, Lw_1, &dummy, bathySource, edgeFluxes, maxEdgeEigenvalues, edgeNormals, edgeLength, cellVolumes, isBoundary, cells, edges, edgesToCells, cellsToEdges, cellsToCells, edgeCenters, cellCenters, GradientatCell, q, lim, &zmin); - + } else { + spaceDiscretization_sph(w_1, Lw_1, &dummy, + bathySource, edgeFluxes, maxEdgeEigenvalues, + edgeNormals, edgeLength, cellVolumes, isBoundary, + cells, edges, edgesToCells, cellsToEdges, + cellsToCells, edgeCenters, cellCenters, GradientatCell, q, lim, &zmin); + } op_par_loop_EvolveValuesRK2_2("EvolveValuesRK2_2",cells, op_arg_gbl(&dT,1,"float",OP_READ), @@ -427,16 +450,15 @@ int main(int argc, char **argv) { timestep=dT; - float Mn = 0.025f; op_par_loop_Friction_manning("Friction_manning",cells, op_arg_gbl(&dT,1,"float",OP_READ), op_arg_gbl(&Mn,1,"float",OP_READ), op_arg_dat(values_new,-1,OP_ID,4,"float",OP_RW)); - } + op_par_loop_simulation_1("simulation_1",cells, op_arg_dat(values,-1,OP_ID,4,"float",OP_WRITE), op_arg_dat(values_new,-1,OP_ID,4,"float",OP_READ)); -// printf("Return of SpaceDiscretization #1 midPointConservative H %g U %g V %g Zb %g \n", normcomp(values_new, 0), normcomp(values_new, 1),normcomp(values_new, 2),normcomp(values_new, 3)); + } itercount++; timestamp += timestep; diff --git a/sp/volna_simulation_op.cpp b/sp/volna_simulation_op.cpp index 303827f..a4925c5 100644 --- 
a/sp/volna_simulation_op.cpp +++ b/sp/volna_simulation_op.cpp @@ -84,6 +84,36 @@ void op_par_loop_NumericalFluxes(char const *, op_set, op_arg, op_arg, op_arg ); + +void op_par_loop_computeFluxes_sph(char const *, op_set, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg ); + +void op_par_loop_NumericalFluxes_sph(char const *, op_set, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg, + op_arg ); #ifdef OPENACC #ifdef __cplusplus } @@ -97,7 +127,8 @@ void op_par_loop_NumericalFluxes(char const *, op_set, #include "computeFluxes.h" #include "Timestep.h" #include "NumericalFluxes.h" - +#include "computeFluxes_sph.h" +#include "NumericalFluxes_sph.h" void spaceDiscretization(op_dat data_in, op_dat data_out, float *minTimestep, @@ -154,7 +185,6 @@ void spaceDiscretization(op_dat data_in, op_dat data_out, float *minTimestep, } if (*minTimestep >= 0.0){ - *minTimestep = INFINITY; op_par_loop_Timestep("Timestep",cells, op_arg_dat(maxEdgeEigenvalues,0,cellsToEdges,1,"float",OP_READ), op_arg_dat(maxEdgeEigenvalues,1,cellsToEdges,1,"float",OP_READ), @@ -177,3 +207,82 @@ void spaceDiscretization(op_dat data_in, op_dat data_out, float *minTimestep, op_arg_dat(cellVolumes,1,edgesToCells,1,"float",OP_READ)); } } + +void spaceDiscretization_sph(op_dat data_in, op_dat data_out, float *minTimestep, + op_dat bathySource, op_dat edgeFluxes, op_dat maxEdgeEigenvalues, + op_dat edgeNormals, op_dat edgeLength, op_dat cellVolumes, op_dat isBoundary, + op_set cells, op_set edges, op_map edgesToCells, op_map cellsToEdges, + op_map cellsToCells, op_dat edgeCenters, op_dat cellCenters, op_dat GradientatCell, op_dat q, op_dat lim, float *zmin) { + { + + { + // TO DO: Pre calculate the geometric mesh quantities + op_par_loop_computeGradient("computeGradient",cells, + op_arg_dat(data_in,-1,OP_ID,4,"float",OP_READ), + 
op_arg_dat(data_in,0,cellsToCells,4,"float",OP_READ), + op_arg_dat(data_in,1,cellsToCells,4,"float",OP_READ), + op_arg_dat(data_in,2,cellsToCells,4,"float",OP_READ), + op_arg_dat(cellCenters,-1,OP_ID,2,"float",OP_READ), + op_arg_dat(cellCenters,0,cellsToCells,2,"float",OP_READ), + op_arg_dat(cellCenters,1,cellsToCells,2,"float",OP_READ), + op_arg_dat(cellCenters,2,cellsToCells,2,"float",OP_RW), + op_arg_dat(q,-1,OP_ID,8,"float",OP_WRITE), + op_arg_dat(GradientatCell,-1,OP_ID,8,"float",OP_WRITE)); + } + op_par_loop_limiter("limiter",cells, + op_arg_dat(q,-1,OP_ID,8,"float",OP_READ), + op_arg_dat(lim,-1,OP_ID,4,"float",OP_WRITE), + op_arg_dat(data_in,-1,OP_ID,4,"float",OP_READ), + op_arg_dat(GradientatCell,-1,OP_ID,8,"float",OP_READ), + op_arg_dat(edgeCenters,0,cellsToEdges,2,"float",OP_READ), + op_arg_dat(edgeCenters,1,cellsToEdges,2,"float",OP_READ), + op_arg_dat(edgeCenters,2,cellsToEdges,2,"float",OP_READ), + op_arg_dat(data_out,-1,OP_ID,4,"float",OP_WRITE), + op_arg_dat(cellCenters,-1,OP_ID,2,"float",OP_READ)); + + { + op_par_loop_computeFluxes_sph("computeFluxes_sph",edges, + op_arg_dat(data_in,0,edgesToCells,4,"float",OP_READ), + op_arg_dat(data_in,1,edgesToCells,4,"float",OP_READ), + op_arg_dat(lim,0,edgesToCells,4,"float",OP_READ), + op_arg_dat(lim,1,edgesToCells,4,"float",OP_READ), + op_arg_dat(edgeLength,-1,OP_ID,1,"float",OP_READ), + op_arg_dat(edgeNormals,-1,OP_ID,2,"float",OP_READ), + op_arg_dat(cellCenters,0,edgesToCells,2,"float",OP_READ), + op_arg_dat(cellCenters,1,edgesToCells,2,"float",OP_READ), + op_arg_dat(edgeCenters,-1,OP_ID,2,"float",OP_READ), + op_arg_dat(GradientatCell,0,edgesToCells,8,"float",OP_READ), + op_arg_dat(GradientatCell,1,edgesToCells,8,"float",OP_READ), + op_arg_dat(isBoundary,-1,OP_ID,1,"int",OP_READ), + op_arg_dat(bathySource,-1,OP_ID,4,"float",OP_WRITE), + op_arg_dat(edgeFluxes,-1,OP_ID,3,"float",OP_WRITE), + op_arg_dat(maxEdgeEigenvalues,-1,OP_ID,1,"float",OP_WRITE), + op_arg_gbl(zmin,1,"float",OP_READ)); + + } + + if 
(*minTimestep >= 0.0){ + op_par_loop_Timestep("Timestep",cells, + op_arg_dat(maxEdgeEigenvalues,0,cellsToEdges,1,"float",OP_READ), + op_arg_dat(maxEdgeEigenvalues,1,cellsToEdges,1,"float",OP_READ), + op_arg_dat(maxEdgeEigenvalues,2,cellsToEdges,1,"float",OP_READ), + op_arg_dat(edgeLength,0,cellsToEdges,1,"float",OP_READ), + op_arg_dat(edgeLength,1,cellsToEdges,1,"float",OP_READ), + op_arg_dat(edgeLength,2,cellsToEdges,1,"float",OP_READ), + op_arg_dat(cellVolumes,-1,OP_ID,1,"float",OP_READ), + op_arg_gbl(minTimestep,1,"float",OP_MIN)); + } + + op_par_loop_NumericalFluxes_sph("NumericalFluxes_sph",edges, + op_arg_dat(data_out,0,edgesToCells,4,"float",OP_INC), + op_arg_dat(data_out,1,edgesToCells,4,"float",OP_INC), + op_arg_dat(cellCenters,0,edgesToCells,2,"float",OP_READ), + op_arg_dat(cellCenters,1,edgesToCells,2,"float",OP_READ), + op_arg_dat(edgeFluxes,-1,OP_ID,3,"float",OP_READ), + op_arg_dat(bathySource,-1,OP_ID,4,"float",OP_READ), + op_arg_dat(edgeNormals,-1,OP_ID,2,"float",OP_READ), + op_arg_dat(isBoundary,-1,OP_ID,1,"int",OP_READ), + op_arg_dat(cellVolumes,0,edgesToCells,1,"float",OP_READ), + op_arg_dat(cellVolumes,1,edgesToCells,1,"float",OP_READ)); + } +} From 0008584b0457478da8b1a69ca7d0d2a81b644d89 Mon Sep 17 00:00:00 2001 From: DanGiles Date: Mon, 15 Nov 2021 13:03:15 +0000 Subject: [PATCH 4/4] Corrected Makefile --- sp/Makefile | 9 --------- 1 file changed, 9 deletions(-) diff --git a/sp/Makefile b/sp/Makefile index 32a8d13..101973a 100755 --- a/sp/Makefile +++ b/sp/Makefile @@ -117,20 +117,11 @@ endif cuda/volna_kernels_cu.o: cuda/volna_kernels.cu \ EvolveValuesRK2_1.h EvolveValuesRK2_2.h applyConst.h getMaxElevation.h getMaxSpeed.h \ initBathymetry_formula.h initBathymetry_update.h initBore_select.h initEta_formula.h initGaussianLandslide.h \ -<<<<<<< HEAD initU_formula.h initV_formula.h computeGradient.h computeFluxes.h limiter.h Timestep.h NumericalFluxes.h simulation_1.h \ values_operation2.h Friction_manning.h cuda/applyConst_kernel.cu 
cuda/EvolveValuesRK2_1_kernel.cu cuda/EvolveValuesRK2_2_kernel.cu \ cuda/computeFluxes_kernel.cu cuda/limiter_kernel.cu cuda/Timestep_kernel.cu cuda/NumericalFluxes_kernel.cu \ cuda/simulation_1_kernel.cu cuda/computeGradient_kernel.cu cuda/getMaxSpeed_kernel.cu cuda/getMaxElevation_kernel.cu \ cuda/values_operation2_kernel.cu cuda/Friction_manning_kernel.cu Makefile -======= - initU_formula.h initV_formula.h computeGradient.h computeFluxes.h limiter.h Timestep.h \ - NumericalFluxes.h simulation_1.h values_operation2.h cuda/applyConst_kernel.cu \ - cuda/EvolveValuesRK2_1_kernel.cu cuda/EvolveValuesRK2_2_kernel.cu cuda/computeFluxes_kernel.cu \ - cuda/limiter_kernel.cu cuda/Timestep_kernel.cu cuda/NumericalFluxes_kernel.cu \ - cuda/simulation_1_kernel.cu cuda/computeGradient_kernel.cu cuda/getMaxSpeed_kernel.cu cuda/getMaxElevation_kernel.cu \ - cuda/values_operation2_kernel.cu Makefile ->>>>>>> 1a93fa5856e7445214d587fca380b11da9ba8b72 nvcc $(VAR) $(INC) $(NVCCFLAGS) $(OP2_INC) $(HDF5_INC) -I$(MPI_INC) -c -o cuda/volna_kernels_cu.o cuda/volna_kernels.cu volna_mpi: volna.cpp volna_event.cpp volna_init.cpp volna_output.cpp volna_writeVTK.cpp volna_simulation.cpp volna_util.cpp Makefile