diff --git a/apps/c/airfoil/airfoil_hdf5/dp/Makefile.legacy b/apps/c/airfoil/airfoil_hdf5/dp/Makefile.legacy
new file mode 100644
index 000000000..083ee2504
--- /dev/null
+++ b/apps/c/airfoil/airfoil_hdf5/dp/Makefile.legacy
@@ -0,0 +1,392 @@
+#
+# The following environment variables should be predefined:
+#
+# CUDA_INSTALL_PATH
+# PARMETIS_INSTALL_PATH
+# PTSCOTCH_INSTALL_PATH
+# HDF5_INSTALL_PATH
+#
+# OP2_INSTALL_PATH
+# OP2_COMPILER (gnu,intel,etc)
+#
+
+include ../../../../make-common.inc
+
+ifeq ($(OP2_COMPILER),gnu)
+  CPP		= g++
+  CPPFLAGS	= -O3 -g -fPIC -DUNIX -Wall -Wextra
+  OMPFLAGS	= -fopenmp
+  MPICPP	= $(MPICXX_PATH)
+  MPIFLAGS	= $(CCFLAGS)
+else
+ifeq ($(OP2_COMPILER),intel)
+  CPP		= icpc
+  CCFLAGS	= -O3 -g -xHost -DMPICH_IGNORE_CXX_SEEK -restrict -inline-forceinline -qopt-report=5 -DVECTORIZE #-parallel #-DCOMM_PERF #-DDEBUG #-vec-report
+  # CCFLAGS	= -O3 -xHost -DMPICH_IGNORE_CXX_SEEK -fno-alias -inline-forceinline -qopt-report -parallel -prec-div -DVECTORIZE #-parallel #-DCOMM_PERF #-DDEBUG #-vec-report
+  CPPFLAGS 	= $(CCFLAGS)
+  OMPFLAGS	= -qopenmp
+  MPICPP	= $(MPICXX_PATH)
+  # NVCCFLAGS	~= -ccbin=$(MPICPP)
+  MPIFLAGS	= $(CPPFLAGS)
+else
+ifeq ($(OP2_COMPILER),xl)
+  CPP		= xlc++
+  CCFLAGS	= -O3 -qarch=pwr8 -qtune=pwr8 -qhot -qxflag=nrcptpo -qinline=level=10 -Wx,-nvvm-compile-options=-ftz=1 -Wx,-nvvm-compile-options=-prec-div=0 -Wx,-nvvm-compile-options=-prec-sqrt=0
+#  CCFLAGS	= -O3 -xHost -DMPICH_IGNORE_CXX_SEEK -fno-alias -inline-forceinline -qopt-report -parallel -prec-div -DVECTORIZE #-parallel #-DCOMM_PERF #-DDEBUG #-vec-report
+  CPPFLAGS 	= $(CCFLAGS)
+  OMPFLAGS	= -qsmp=omp -qthreaded
+  OMPOFFLOAD	= -qsmp=omp -qoffload -Xptxas -v -g1
+  MPICPP	= $(MPICXX_PATH)
+  MPIFLAGS	= $(CPPFLAGS)
+else
+ifeq ($(OP2_COMPILER),pgi)
+  CPP       	= pgc++
+  CCFLAGS  	= -O3
+  CPPFLAGS 	= $(CCFLAGS)
+  OMPFLAGS 	= -mp
+  MPICC   	= $(MPICC_PATH)
+  MPICPP   	= $(MPICXX_PATH)
+  MPIFLAGS 	= $(CPPFLAGS)
+  # NVCCFLAGS	+= -ccbin=$(MPICPP)
+  ACCFLAGS      = -acc -Minfo=acc -ta=tesla:cc35,fastmath,lineinfo -DOPENACC -fast -Minfo=accel -Mcuda=ptxinfo
+else
+ifeq ($(OP2_COMPILER),cray)
+  CPP           = CC
+  CCFLAGS       = -O3 -h fp3 -h ipa5
+  CPPFLAGS      = $(CCFLAGS)
+  OMPFLAGS      = -h omp
+  MPICPP        = CC
+  MPIFLAGS      = $(CPPFLAGS)
+else
+ifeq ($(OP2_COMPILER),clang)
+  CPP       	= clang++
+  CCFLAGS  	= -O3 -ffast-math
+  CPPFLAGS 	= $(CCFLAGS)
+  OMPFLAGS 	= -I$(OMPTARGET_LIBS)/../include -fopenmp=libomp -Rpass-analysis
+  OMPOFFLOAD 	= $(OMPFLAGS) -fopenmp-targets=nvptx64-nvidia-cuda -ffp-contract=fast -Xcuda-ptxas -v #-Xclang -target-feature -Xclang +ptx35
+  MPICC   	= $(MPICC_PATH)
+  MPICPP   	= $(MPICXX_PATH)
+  MPIFLAGS 	= $(CPPFLAGS)
+else
+print:
+	@echo "unrecognised value for OP2_COMPILER"
+endif
+endif
+endif
+endif
+endif
+endif
+
+
+
+#
+# set flags for NVCC compilation and linking
+#
+ifndef NV_ARCH
+  MESSAGE=select an NVIDA device to compile in CUDA, e.g. make NV_ARCH=KEPLER
+  NV_ARCH=Kepler
+endif
+ifeq ($(NV_ARCH),Fermi)
+  CODE_GEN_CUDA=-gencode arch=compute_20,code=sm_21
+else
+ifeq ($(NV_ARCH),Kepler)
+  CODE_GEN_CUDA=-gencode arch=compute_35,code=sm_35
+else
+ifeq ($(NV_ARCH),Maxwell)
+  CODE_GEN_CUDA=-gencode arch=compute_50,code=sm_50
+else
+ifeq ($(NV_ARCH),Pascal)
+  CODE_GEN_CUDA=-gencode arch=compute_60,code=sm_60
+else
+ifeq ($(NV_ARCH),Volta)
+  CODE_GEN_CUDA=-gencode arch=compute_70,code=sm_70
+endif
+endif
+endif
+endif
+endif
+
+ifdef MAX_REG_COUNT
+  REG_COUNT = --maxrregcount $(MAX_REG_COUNT)
+  PGI_REG_COUNT = -ta=nvidia,maxregcount:$(MAX_REG_COUNT)
+  ifeq ($(OP2_COMPILER),xl)
+    OMP4_REG_COUNT= -Xptxas -maxrregcount=$(MAX_REG_COUNT)
+  else
+  ifeq ($(OP2_COMPILER),clang)
+    OMP4_REG_COUNT= -Xcuda-ptxas --maxrregcount=$(MAX_REG_COUNT)
+  endif
+  endif
+endif
+
+NVCCFLAGS += $(CODE_GEN_CUDA) -m64 -Xptxas=-v --use_fast_math -O3 $(REG_COUNT) -lineinfo -DOP2_CUDA #-g -G -O0
+
+VAR		= #-DOP_PART_SIZE_1=160 -DOP_PART_SIZE_2=320 -DOP_PART_SIZE_3=64 #-DOP_BLOCK_SIZE_0=64 -DOP_BLOCK_SIZE_1=64 -DOP_BLOCK_SIZE_2=64 -DOP_BLOCK_SIZE_3=64 -DOP_BLOCK_SIZE_4=64
+
+
+#
+# master to make all versions
+#
+ALL_TARGETS = clean airfoil_mpi airfoil_cuda airfoil_openmp airfoil_seq airfoil_genseq airfoil_mpi_genseq airfoil_mpi_cuda airfoil_mpi_cuda_hyb airfoil_mpi_openmp convert_mesh_seq convert_mesh_mpi
+ifeq ($(OP2_COMPILER),pgi)
+	ALL_TARGETS += airfoil_openacc airfoil_mpi_openacc
+endif
+ifeq ($(OP2_COMPILER),intel)
+	ALL_TARGETS += airfoil_mpi_vec
+endif
+
+all: $(ALL_TARGETS)
+
+#
+# simple sequential version
+#
+
+airfoil_seq: airfoil.cpp save_soln.h adt_calc.h res_calc.h bres_calc.h
+	     $(MPICPP) $(CPPFLAGS) airfoil.cpp $(OP2_INC) $(HDF5_INC) $(OP2_LIB) -lop2_seq -lop2_hdf5 $(HDF5_LIB) -o airfoil_seq
+
+airfoil_genseq: airfoil_op.cpp seq/airfoil_seqkernels.cpp
+	     $(MPICPP) $(CPPFLAGS) $^ \
+	     $(OP2_INC) $(HDF5_INC) \
+	     $(OP2_LIB) -lop2_seq -lop2_hdf5 \
+	     $(HDF5_LIB) \
+	     -o airfoil_genseq
+
+#
+# x86 version using kernel files generated by op2.py
+#
+
+airfoil_openmp:	airfoil_op.cpp openmp/airfoil_kernels.cpp \
+		openmp/save_soln_kernel.cpp  save_soln.h \
+		openmp/adt_calc_kernel.cpp   adt_calc.h  \
+		openmp/res_calc_kernel.cpp   res_calc.h  \
+		openmp/bres_calc_kernel.cpp  bres_calc.h \
+		openmp/update_kernel.cpp     update.h    \
+                Makefile
+		$(MPICPP) $(VAR) $(CPPFLAGS) $(OMPFLAGS) $(OP2_INC) $(OP2_LIB) $(HDF5_INC) -Iopenmp -I. \
+		airfoil_op.cpp -lm openmp/airfoil_kernels.cpp -lm -lop2_openmp -lop2_hdf5 $(HDF5_LIB) -o airfoil_openmp
+
+#
+# OpenACC version using kernel files generated by op2.py
+#
+
+airfoil_openacc: airfoil_op.cpp openacc/airfoil_acckernels.c \
+                openacc/save_soln_acckernel.c  \
+                openacc/adt_calc_acckernel.c   \
+                openacc/res_calc_acckernel.c   \
+                openacc/bres_calc_acckernel.c  \
+                openacc/update_acckernel.c   \
+                Makefile
+		$(MPICC) $(VAR) $(CPPFLAGS) $(ACCFLAGS) $(OMPFLAGS) $(OP2_INC) $(PGI_REG_COUNT) -Iopenacc -I. \
+                openacc/airfoil_acckernels.c -c -o openacc/airfoil_acckernels.o
+		$(MPICPP) $(VAR) $(CPPFLAGS) $(ACCFLAGS) $(OMPFLAGS) $(OP2_INC) $(OP2_LIB) $(HDF5_INC) \
+                airfoil_op.cpp -lm openacc/airfoil_acckernels.o -lm $(CUDA_LIB) -lcudart -lop2_cuda -lop2_hdf5 $(HDF5_LIB) -o airfoil_openacc
+
+
+#
+# CUDA version using kernel files generated by op2.py
+#
+
+airfoil_cuda:	airfoil_op.cpp cuda/airfoil_kernels_cu.o Makefile
+		$(MPICPP) $(VAR) $(CPPFLAGS) airfoil_op.cpp cuda/airfoil_kernels_cu.o \
+		$(CUDA_INC) $(OP2_INC) $(HDF5_INC) \
+		$(OP2_LIB) $(CUDA_LIB) -lcudart -lop2_cuda -lop2_hdf5 $(HDF5_LIB) -o airfoil_cuda
+
+cuda/airfoil_kernels_cu.o:	cuda/airfoil_kernels.cu      \
+		cuda/save_soln_kernel.cu save_soln.h \
+		cuda/adt_calc_kernel.cu  adt_calc.h  \
+		cuda/res_calc_kernel.cu  res_calc.h  \
+		cuda/bres_calc_kernel.cu bres_calc.h \
+		cuda/update_kernel.cu    update.h    \
+                Makefile
+		nvcc  $(VAR) $(INC) $(NVCCFLAGS) $(OP2_INC) $(HDF5_INC) -Icuda -I. \
+		-c -o cuda/airfoil_kernels_cu.o cuda/airfoil_kernels.cu
+
+#
+#  CUDA with clang
+#
+
+airfoil_cuda_clang:     airfoil_op.cpp cuda/airfoil_kernels_cu_clang.o Makefile
+	$(MPICPP) $(VAR) $(CPPFLAGS) airfoil_op.cpp cuda/airfoil_kernels_cu_clang.o \
+        $(CUDA_INC) $(OP2_INC) $(HDF5_INC) \
+	$(OP2_LIB) $(CUDA_LIB) -lcudart -lop2_cuda -lop2_hdf5 $(HDF5_LIB) -o airfoil_cuda_clang
+
+cuda/airfoil_kernels_cu_clang.o:     cuda/airfoil_kernels.cu      \
+                cuda/save_soln_kernel.cu save_soln.h \
+                cuda/adt_calc_kernel.cu  adt_calc.h  \
+                cuda/res_calc_kernel.cu  res_calc.h  \
+                cuda/bres_calc_kernel.cu bres_calc.h \
+                cuda/update_kernel.cu    update.h    \
+                Makefile
+	clang++  $(VAR) $(INC) -m64 -O3 $(OP2_INC) $(HDF5_INC) -gline-tables-only -I$(MPI_INSTALL_PATH)/include -Icuda -I. \
+	-c -o cuda/airfoil_kernels_cu_clang.o cuda/airfoil_kernels.cu --cuda-gpu-arch=sm_35 -ffast-math $(OMP4_REG_COUNT) -Xcuda-ptxas -v
+
+#
+# mpi with sequential-nodes version
+#
+
+airfoil_mpi: airfoil.cpp save_soln.h adt_calc.h res_calc.h bres_calc.h Makefile
+	$(MPICPP) $(MPIFLAGS) airfoil.cpp $(OP2_INC) $(PARMETIS_INC) $(PTSCOTCH_INC) $(HDF5_INC) \
+	$(OP2_LIB) -lop2_mpi $(PARMETIS_LIB) $(PTSCOTCH_LIB) $(HDF5_LIB) -o airfoil_mpi
+
+#
+# mpi genseq version using kernel files generated by op2.py
+#
+
+airfoil_mpi_genseq: airfoil_op.cpp seq/airfoil_seqkernels.cpp \
+                seq/save_soln_seqkernel.cpp  save_soln.h \
+                seq/adt_calc_seqkernel.cpp   adt_calc.h  \
+                seq/res_calc_seqkernel.cpp   res_calc.h  \
+                seq/bres_calc_seqkernel.cpp  bres_calc.h \
+                seq/update_seqkernel.cpp     update.h    \
+                Makefile
+		$(MPICPP) $(VAR) $(CPPFLAGS) $(OP2_INC) $(OP2_INC) $(HDF5_INC) \
+		$(PARMETIS_INC) $(PTSCOTCH_INC) -Iseq  -I. \
+		airfoil_op.cpp -lm seq/airfoil_seqkernels.cpp $(OP2_LIB) -lop2_mpi \
+		$(PARMETIS_LIB) $(PTSCOTCH_LIB) $(HDF5_LIB) -o airfoil_mpi_genseq
+
+#
+# mpi vectorized seq version using kernel files generated by op2.py
+#
+
+airfoil_mpi_vec: airfoil_op.cpp vec/airfoil_veckernels.cpp \
+                vec/save_soln_veckernel.cpp  save_soln.h \
+                vec/adt_calc_veckernel.cpp   adt_calc.h  \
+                vec/res_calc_veckernel.cpp   res_calc.h  \
+                vec/bres_calc_veckernel.cpp  bres_calc.h \
+                vec/update_veckernel.cpp     update.h    \
+                Makefile
+		$(MPICPP) $(VAR) $(CPPFLAGS) $(OMPFLAGS) $(OP2_INC) $(OP2_INC) $(HDF5_INC) \
+                $(PARMETIS_INC) $(PTSCOTCH_INC) -Ivec  -I. \
+                airfoil_op.cpp -lm vec/airfoil_veckernels.cpp $(OP2_LIB) -lop2_mpi \
+                $(PARMETIS_LIB) $(PTSCOTCH_LIB) $(HDF5_LIB) -o airfoil_mpi_vec
+
+
+#
+# mpi openmp version using kernel files generated by op2.py
+#
+
+airfoil_mpi_openmp: airfoil_op.cpp openmp/airfoil_kernels.cpp \
+                openmp/save_soln_kernel.cpp  save_soln.h \
+                openmp/adt_calc_kernel.cpp   adt_calc.h  \
+                openmp/res_calc_kernel.cpp   res_calc.h  \
+                openmp/bres_calc_kernel.cpp  bres_calc.h \
+                openmp/update_kernel.cpp     update.h    \
+                Makefile
+		$(MPICPP) $(VAR) $(CPPFLAGS) $(OMPFLAGS) $(OP2_INC) $(OP2_INC) $(HDF5_INC) \
+		$(PARMETIS_INC) $(PTSCOTCH_INC) -Iopenmp -I. \
+		airfoil_op.cpp -lm openmp/airfoil_kernels.cpp $(OP2_LIB) -lop2_mpi \
+		$(PARMETIS_LIB) $(PTSCOTCH_LIB) $(HDF5_LIB) -o airfoil_mpi_openmp
+
+#
+# mpi openmp 4.5 version using kernel files generated by op2.py
+#
+
+airfoil_openmp4: airfoil_op.cpp openmp4/airfoil_omp4kernels.cpp openmp4/airfoil_omp4kernel_funcs.cpp \
+			openmp4/save_soln_omp4kernel_func.cpp  save_soln.h openmp4/save_soln_omp4kernel.cpp \
+			openmp4/adt_calc_omp4kernel_func.cpp   adt_calc.h  openmp4/adt_calc_omp4kernel.cpp \
+			openmp4/res_calc_omp4kernel_func.cpp   res_calc.h  openmp4/res_calc_omp4kernel.cpp \
+			openmp4/bres_calc_omp4kernel_func.cpp  bres_calc.h openmp4/bres_calc_omp4kernel.cpp \
+			openmp4/update_omp4kernel_func.cpp     update.h    openmp4/update_omp4kernel.cpp \
+			Makefile
+	$(CPP) $(VAR) $(CPPFLAGS) $(OMPOFFLOAD) $(OP2_INC) $(HDF5_INC) $(OMP4_REG_COUNT) \
+		-Iopenmp4/  -I. -c openmp4/airfoil_omp4kernel_funcs.cpp  -o openmp4/airfoil_omp4kernel_funcs.o
+	$(CPP) $(VAR) $(CPPFLAGS) $(OMPOFFLOAD) $(OP2_INC) $(HDF5_INC) $(OMP4_REG_COUNT) \
+		-Iopenmp4/ -I. -c openmp4/airfoil_omp4kernels.cpp -o openmp4/airfoil_omp4kernels.o
+	$(CPP) $(VAR) $(CPPFLAGS) $(OMPOFFLOAD) $(OP2_INC) $(HDF5_INC)  \
+		airfoil_op.cpp openmp4/airfoil_omp4kernels.o openmp4/airfoil_omp4kernel_funcs.o -o airfoil_openmp4 \
+	   	$(OP2_LIB) -lop2_openmp4 -lop2_hdf5 $(HDF5_LIB) -L$(CUDA_INSTALL_PATH)/lib64 -lcudart
+
+
+airfoil_mpi_openmp4: airfoil_op.cpp openmp4/airfoil_omp4kernels.cpp openmp4/airfoil_omp4kernel_funcs.cpp \
+			openmp4/save_soln_omp4kernel_func.cpp  save_soln.h openmp4/save_soln_omp4kernel.cpp \
+			openmp4/adt_calc_omp4kernel_func.cpp   adt_calc.h  openmp4/adt_calc_omp4kernel.cpp \
+			openmp4/res_calc_omp4kernel_func.cpp   res_calc.h  openmp4/res_calc_omp4kernel.cpp \
+			openmp4/bres_calc_omp4kernel_func.cpp  bres_calc.h openmp4/bres_calc_omp4kernel.cpp \
+			openmp4/update_omp4kernel_func.cpp     update.h    openmp4/update_omp4kernel.cpp \
+			Makefile
+	$(CPP) $(VAR) $(CPPFLAGS) $(OMPOFFLOAD) $(OP2_INC) $(HDF5_INC) \
+		-Iopenmp4/ -I. -c openmp4/airfoil_omp4kernel_funcs.cpp  -o openmp4/airfoil_omp4kernel_funcs.o
+	$(CPP) $(VAR) $(CPPFLAGS) $(OMPOFFLOAD) $(OP2_INC) $(HDF5_INC) \
+		-Iopenmp4/ -I. -c openmp4/airfoil_omp4kernels.cpp -o openmp4/airfoil_omp4kernels.o
+	$(MPICPP) $(VAR) $(CPPFLAGS) $(OMPOFFLOAD) $(OP2_INC) $(HDF5_INC) $(CUDA_LIB) -lcudart  \
+		airfoil_op.cpp openmp4/airfoil_omp4kernels.o openmp4/airfoil_omp4kernel_funcs.o -o airfoil_mpi_openmp4 \
+	   	$(OP2_LIB) -lop2_mpi_cuda $(HDF5_LIB) $(PARMETIS_LIB) $(PTSCOTCH_LIB) $(CUDA_LIB) -lcudart
+#
+# mpi openacc version using kernel files generated by op2.py
+#
+
+airfoil_mpi_openacc: airfoil_op.cpp openacc/airfoil_acckernels.c \
+                openacc/save_soln_acckernel.c  save_soln.h \
+                openacc/adt_calc_acckernel.c   adt_calc.h  \
+                openacc/res_calc_acckernel.c   res_calc.h  \
+                openacc/bres_calc_acckernel.c  bres_calc.h \
+                openacc/update_acckernel.c     update.h    \
+                Makefile
+		$(MPICC) $(VAR) $(CPPFLAGS) $(ACCFLAGS) $(OMPFLAGS) $(OP2_INC) -Iopenacc/ -I. \
+		openacc/airfoil_acckernels.c -c -o airfoil_acckernels.o
+		$(MPICPP) $(VAR) $(CPPFLAGS) $(ACCFLAGS) $(OP2_INC) $(OP2_INC) $(HDF5_INC) \
+		$(PARMETIS_INC) $(PTSCOTCH_INC) \
+		airfoil_op.cpp -lm openacc/airfoil_acckernels.o -DOPENACC $(CUDA_LIB) -lcudart $(OP2_LIB) -lop2_mpi_cuda \
+		$(PARMETIS_LIB) $(PTSCOTCH_LIB) $(HDF5_LIB) -o airfoil_mpi_openacc
+#
+# mpi with CUDA version
+#
+
+airfoil_mpi_cuda: airfoil_op.cpp cuda/airfoil_kernels_mpi_cu.o Makefile
+		  $(MPICPP) $(MPIFLAGS) airfoil_op.cpp -lm cuda/airfoil_kernels_mpi_cu.o \
+		  $(OP2_INC) $(PARMETIS_INC) $(PTSCOTCH_INC) $(HDF5_INC) \
+		  $(OP2_LIB) -lop2_mpi_cuda $(PARMETIS_LIB) $(PTSCOTCH_LIB) \
+		  $(HDF5_LIB) $(CUDA_LIB) -lcudart -o airfoil_mpi_cuda
+
+cuda/airfoil_kernels_mpi_cu.o: cuda/airfoil_kernels.cu \
+		cuda/save_soln_kernel.cu  save_soln.h \
+		cuda/adt_calc_kernel.cu   adt_calc.h  \
+		cuda/res_calc_kernel.cu   res_calc.h  \
+		cuda/bres_calc_kernel.cu  bres_calc.h \
+		cuda/update_kernel.cu     update.h    \
+		Makefile
+		nvcc  $(INC) $(NVCCFLAGS) $(OP2_INC) -I $(MPI_INSTALL_PATH)/include -Icuda -I. \
+		-c -o cuda/airfoil_kernels_mpi_cu.o cuda/airfoil_kernels.cu
+
+#
+# Hybrid CPU+GPU version
+#
+
+airfoil_mpi_cuda_hyb: airfoil_op.cpp cuda/airfoil_hybkernels.o Makefile
+		  $(MPICPP) $(MPIFLAGS) $(OMPFLAGS) -DOP_HYBRID_GPU airfoil_op.cpp -lm cuda/airfoil_hybkernels.o cuda/airfoil_hybkernels2.o \
+		  $(OP2_INC) $(PARMETIS_INC) $(PTSCOTCH_INC) $(HDF5_INC) \
+		  $(OP2_LIB) -lop2_mpi_cuda $(PARMETIS_LIB) $(PTSCOTCH_LIB) \
+		  $(HDF5_LIB) $(CUDA_LIB) -lcudart -o airfoil_mpi_cuda_hyb
+
+cuda/airfoil_hybkernels.o: cuda/airfoil_hybkernels.cu \
+                cuda/airfoil_kernels.cu save_soln.h \
+                adt_calc.h  res_calc.h  \
+                bres_calc.h update.h    \
+                Makefile
+		nvcc -DOP_HYBRID_GPU -DGPUPASS $(INC) $(NVCCFLAGS) $(OP2_INC) -I $(MPI_INSTALL_PATH)/include -Icuda -I. \
+                -c -o cuda/airfoil_hybkernels.o cuda/airfoil_hybkernels.cu
+		cat cuda/airfoil_hybkernels.cu > cuda/airfoil_hybkernels2.cpp
+		$(MPICPP) -DOP_HYBRID_GPU $(OMPFLAGS) $(INC) $(OP2_INC) -I $(MPI_INSTALL_PATH)/include -Icuda -I. -Iopenmp \
+                -c -o cuda/airfoil_hybkernels2.o cuda/airfoil_hybkernels2.cpp
+		rm cuda/airfoil_hybkernels2.cpp
+
+#
+# convert ASCI new_gird.dat to HDF5 new_grid.h5
+#
+
+convert_mesh_seq: convert_mesh.cpp
+	$(MPICPP) $(MPIFLAGS) convert_mesh.cpp $(OP2_INC) $(PARMETIS_INC) $(PTSCOTCH_INC) $(HDF5_INC) \
+	$(OP2_LIB) -lop2_seq -lop2_hdf5 $(PARMETIS_LIB) $(PTSCOTCH_LIB) $(HDF5_LIB) -o convert_mesh_seq
+
+convert_mesh_mpi: convert_mesh_mpi.cpp
+	$(MPICPP) $(MPIFLAGS) convert_mesh_mpi.cpp $(OP2_INC) $(PARMETIS_INC) $(PTSCOTCH_INC) $(HDF5_INC) \
+	$(OP2_LIB) -lop2_mpi $(PARMETIS_LIB) $(PTSCOTCH_LIB) $(HDF5_LIB) -o convert_mesh_mpi
+
+
+
+
+#
+# cleanup
+#
+
+clean:
+		rm -f airfoil_seq airfoil_genseq airfoil_openmp airfoil_cuda airfoil_mpi airfoil_mpi_genseq airfoil_mpi_vec airfoil_mpi_cuda_hyb airfoil_mpi_openmp airfoil_mpi_cuda convert_mesh_seq convert_mesh_mpi airfoil_openacc airfoil_mpi_openacc airfoil_openmp4 airfoil_mpi_openmp4 *.o cuda/*.o openacc/*.o openmp4/*.o *.optrpt
diff --git a/apps/make-common.inc b/apps/make-common.inc
new file mode 100644
index 000000000..51a93a862
--- /dev/null
+++ b/apps/make-common.inc
@@ -0,0 +1,78 @@
+
+# Locate MPI compilers:
+ifdef MPI_INSTALL_PATH
+  ifneq ("","$(wildcard $(MPI_INSTALL_PATH)/bin/mpic++)")
+    MPICPP_PATH = $(MPI_INSTALL_PATH)/bin/mpic++
+  else
+  ifneq ("","$(wildcard $(MPI_INSTALL_PATH)/intel64/bin/mpic++)")
+    MPICPP_PATH = $(MPI_INSTALL_PATH)/intel64/bin/mpic++
+  else
+    MPICPP_PATH = mpic++
+  endif
+  endif
+
+  ifneq ("","$(wildcard $(MPI_INSTALL_PATH)/bin/mpicxx)")
+    MPICXX_PATH = $(MPI_INSTALL_PATH)/bin/mpicxx
+  else
+  ifneq ("","$(wildcard $(MPI_INSTALL_PATH)/intel64/bin/mpicxx)")
+    MPICXX_PATH = $(MPI_INSTALL_PATH)/intel64/bin/mpicxx
+  else
+    MPICXX_PATH = mpicxx
+  endif
+  endif
+
+  ifneq ("","$(wildcard $(MPI_INSTALL_PATH)/bin/mpicc)")
+    MPICC_PATH = $(MPI_INSTALL_PATH)/bin/mpicc
+  else
+  ifneq ("","$(wildcard $(MPI_INSTALL_PATH)/intel64/bin/mpicc)")
+    MPICC_PATH = $(MPI_INSTALL_PATH)/intel64/bin/mpicc
+  else
+    MPICC_PATH = mpicc
+  endif
+  endif
+else
+  MPICXX_PATH = mpicxx
+  MPICC_PATH  = mpicc
+endif
+
+# OP2 paths
+ifdef OP2_INSTALL_PATH
+  OP2_INC = -I$(OP2_INSTALL_PATH)/include
+  OP2_LIB = -L$(OP2_INSTALL_PATH)/lib
+endif
+
+# CUDA paths
+ifdef CUDA_INSTALL_PATH
+  CUDA_INC = -I$(CUDA_INSTALL_PATH)/include
+  CUDA_LIB = -L$(CUDA_INSTALL_PATH)/lib64
+endif
+
+# HDF5 paths
+ifdef HDF5_INSTALL_PATH
+  HDF5_INC := -I$(HDF5_INSTALL_PATH)/include
+  HDF5_LIB := -L$(HDF5_INSTALL_PATH)/lib
+endif
+HDF5_LIB += -lhdf5 -lz
+
+#
+# partitioning software for MPI versions
+#
+# ParMETIS
+PARMETIS_VER=4
+ifdef PARMETIS_INSTALL_PATH
+  PARMETIS_INC = -I$(PARMETIS_INSTALL_PATH)/include
+  PARMETIS_LIB = -L$(PARMETIS_INSTALL_PATH)/lib
+endif
+PARMETIS_INC += -DHAVE_PARMETIS
+PARMETIS_LIB += -lparmetis -lmetis
+ifeq ($(PARMETIS_VER),4)
+  PARMETIS_INC += -DPARMETIS_VER_4
+endif
+
+# PT-Scotch
+ifdef PTSCOTCH_INSTALL_PATH
+  PTSCOTCH_INC = -I$(PTSCOTCH_INSTALL_PATH)/include
+  PTSCOTCH_LIB = -L$(PTSCOTCH_INSTALL_PATH)/lib
+endif
+PTSCOTCH_INC += -DHAVE_PTSCOTCH
+PTSCOTCH_LIB += -lptscotch -lscotch  -lptscotcherr
diff --git a/op2/src/cuda/op_cuda_decl.cpp b/op2/src/cuda/op_cuda_decl.cpp
index 3f6b4b98c..64a9c4543 100644
--- a/op2/src/cuda/op_cuda_decl.cpp
+++ b/op2/src/cuda/op_cuda_decl.cpp
@@ -73,22 +73,23 @@ op_dat op_decl_dat_char(op_set set, int dim, char const *type, int size,
   op_dat dat = op_decl_dat_core(set, dim, type, size, data, name);
 
   // transpose data
-  size_t set_size = dat->set->size + dat->set->exec_size + dat->set->nonexec_size;
   if (data != NULL && (strstr(type, ":soa") != NULL || (OP_auto_soa && dim > 1))) {
-    char *temp_data = (char *)malloc(dat->size * round32(set_size) * sizeof(char));
+    size_t set_size = round32(dat->set->size + dat->set->exec_size + dat->set->nonexec_size);
+    char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char));
     int element_size = dat->size / dat->dim;
     for (int i = 0; i < dat->dim; i++) {
       for (int j = 0; j < set_size; j++) {
         for (int c = 0; c < element_size; c++) {
-          temp_data[element_size * i * round32(set_size) + element_size * j + c] =
+          temp_data[element_size * i * set_size + element_size * j + c] =
               dat->data[dat->size * j + element_size * i + c];
         }
       }
     }
     op_cpHostToDevice((void **)&(dat->data_d), (void **)&(temp_data),
-                      (size_t)dat->size * round32(set_size));
+                      (size_t)dat->size * set_size);
     free(temp_data);
   } else {
+    size_t set_size = dat->set->size + dat->set->exec_size + dat->set->nonexec_size;
     op_cpHostToDevice((void **)&(dat->data_d), (void **)&(dat->data),
                       (size_t)dat->size * set_size);
   }
@@ -150,15 +151,15 @@ op_set op_decl_set(int size, char const *name) {
 op_map op_decl_map(op_set from, op_set to, int dim, int *imap,
                    char const *name) {
   op_map map = op_decl_map_core(from, to, dim, imap, name);
-  int set_size = map->from->size + map->from->exec_size;
-  int *temp_map = (int *)malloc(map->dim * round32(set_size) * sizeof(int));
+  int set_size = round32(map->from->size + map->from->exec_size);
+  int *temp_map = (int *)malloc(map->dim * set_size * sizeof(int));
   for (int i = 0; i < map->dim; i++) {
     for (int j = 0; j < set_size; j++) {
-      temp_map[i * round32(set_size) + j] = map->map[map->dim * j + i];
+      temp_map[i * set_size + j] = map->map[map->dim * j + i];
     }
   }
   op_cpHostToDevice((void **)&(map->map_d), (void **)&(temp_map),
-                    sizeof(int) * map->dim * round32(set_size));
+                    sizeof(int) * map->dim * set_size);
   free(temp_map);
   return map;
 }
@@ -290,9 +291,9 @@ void op_upload_all() {
   op_dat_entry *item;
   TAILQ_FOREACH(item, &OP_dat_list, entries) {
     op_dat dat = item->dat;
-    size_t set_size = dat->set->size + dat->set->exec_size + dat->set->nonexec_size;
     if (dat->data_d) {
       if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) {
+        size_t set_size = round32(dat->set->size + dat->set->exec_size + dat->set->nonexec_size);
         char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char));
         int element_size = dat->size / dat->dim;
         for (int i = 0; i < dat->dim; i++) {
@@ -308,6 +309,7 @@ void op_upload_all() {
         dat->dirty_hd = 0;
         free(temp_data);
       } else {
+        size_t set_size = dat->set->size + dat->set->exec_size + dat->set->nonexec_size;
         cutilSafeCall(gpuMemcpy(dat->data_d, dat->data, dat->size * set_size,
                                  gpuMemcpyHostToDevice));
         dat->dirty_hd = 0;
diff --git a/op2/src/cuda/op_cuda_rt_support.cpp b/op2/src/cuda/op_cuda_rt_support.cpp
index eb30a54e6..13ab70a0b 100644
--- a/op2/src/cuda/op_cuda_rt_support.cpp
+++ b/op2/src/cuda/op_cuda_rt_support.cpp
@@ -281,7 +281,7 @@ void op_cuda_get_data(op_dat dat) {
   else
     return;
   // transpose data
-  size_t set_size = dat->set->size + dat->set->exec_size + dat->set->nonexec_size;
+  size_t set_size = round32(dat->set->size + dat->set->exec_size + dat->set->nonexec_size);
   if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) {
     char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char));
     cutilSafeCall(gpuMemcpy(temp_data, dat->data_d, dat->size * set_size,
@@ -347,19 +347,19 @@ void cutilDeviceInit(int argc, char **argv) {
 void op_upload_dat(op_dat dat) {
   if (!OP_hybrid_gpu)
     return;
-  size_t set_size = dat->set->size + dat->set->exec_size + dat->set->nonexec_size;
+  size_t set_size = round32(dat->set->size + dat->set->exec_size + dat->set->nonexec_size);
   if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) {
-    char *temp_data = (char *)malloc(dat->size * round32(set_size) * sizeof(char));
+    char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char));
     int element_size = dat->size / dat->dim;
     for (int i = 0; i < dat->dim; i++) {
       for (int j = 0; j < set_size; j++) {
         for (int c = 0; c < element_size; c++) {
-          temp_data[element_size * i * round32(set_size) + element_size * j + c] =
+          temp_data[element_size * i * set_size + element_size * j + c] =
               dat->data[dat->size * j + element_size * i + c];
         }
       }
     }
-    cutilSafeCall(gpuMemcpy(dat->data_d, temp_data, round32(set_size) * dat->size,
+    cutilSafeCall(gpuMemcpy(dat->data_d, temp_data, set_size * dat->size,
                              gpuMemcpyHostToDevice));
     free(temp_data);
   } else {
@@ -371,17 +371,17 @@ void op_upload_dat(op_dat dat) {
 void op_download_dat(op_dat dat) {
   if (!OP_hybrid_gpu)
     return;
-  size_t set_size = dat->set->size + dat->set->exec_size + dat->set->nonexec_size;
+  size_t set_size = round32(dat->set->size + dat->set->exec_size + dat->set->nonexec_size);
   if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) {
-    char *temp_data = (char *)malloc(dat->size * round32(set_size) * sizeof(char));
-    cutilSafeCall(gpuMemcpy(temp_data, dat->data_d, round32(set_size) * dat->size,
+    char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char));
+    cutilSafeCall(gpuMemcpy(temp_data, dat->data_d, set_size * dat->size,
                              gpuMemcpyDeviceToHost));
     int element_size = dat->size / dat->dim;
     for (int i = 0; i < dat->dim; i++) {
       for (int j = 0; j < set_size; j++) {
         for (int c = 0; c < element_size; c++) {
           dat->data[dat->size * j + element_size * i + c] =
-              temp_data[element_size * i * round32(set_size) + element_size * j + c];
+              temp_data[element_size * i * set_size + element_size * j + c];
         }
       }
     }
diff --git a/translator-v2/resources/templates/cpp/master_kernel.cpp.jinja b/translator-v2/resources/templates/cpp/master_kernel.cpp.jinja
index dbe828ada..8a2844253 100644
--- a/translator-v2/resources/templates/cpp/master_kernel.cpp.jinja
+++ b/translator-v2/resources/templates/cpp/master_kernel.cpp.jinja
@@ -20,11 +20,6 @@ extern {{const.typ}} {{const.ptr}}{% if const.dim > 1 %}[{{const.dim}}]{% endif
 void op_decl_const_char(int dim, const char *type, int size, char *dat, const char *name) {
     {{guard}}
 
-    if (size > MAX_CONST_SIZE) {
-        printf("error: requested size %d for const %s exceeds MAX_CONST_SIZE\n", size, name);
-        exit(1);
-    }
-
     {% for const in app.consts() %}
     if (!strcmp(name, "{{const.ptr}}")) {
         {{caller(const)}}
diff --git a/translator/c/op2.py b/translator/c/op2.py
index 97a9997d2..3710fec7a 100755
--- a/translator/c/op2.py
+++ b/translator/c/op2.py
@@ -90,7 +90,7 @@ def op_decl_set_parse(text):
   """Parsing for op_decl_set calls"""
 
   sets = []
-  for m in re.finditer('op_decl_set\((.*)\)', text):
+  for m in re.finditer(r'op_decl_set\((.*)\)', text):
     args = m.group(1).split(',')
 
     # check for syntax errors
@@ -101,7 +101,7 @@ def op_decl_set_parse(text):
     sets.append({
       'name': args[1].strip()
       })
-  for m in re.finditer('op_decl_set_hdf5\((.*)\)', text):
+  for m in re.finditer(r'op_decl_set_hdf5\((.*)\)', text):
     args = m.group(1).split(',')
 
     # check for syntax errors
@@ -120,7 +120,7 @@ def op_decl_const_parse(text):
   """Parsing for op_decl_const calls"""
 
   consts = []
-  for m in re.finditer('op_decl_const\((.*)\)', text):
+  for m in re.finditer(r'op_decl_const\((.*)\)', text):
     args = m.group(1).split(',')
 
     # check for syntax errors
@@ -242,8 +242,8 @@ def get_arg_gbl(arg_string, k):
   return temp_gbl
 
 def append_init_soa(text):
-  text = re.sub('\\bop_init\\b\\s*\((.*)\)','op_init_soa(\\1,1)', text)
-  text = re.sub('\\bop_mpi_init\\b\\s*\((.*)\)','op_mpi_init_soa(\\1,1)', text)
+  text = re.sub(r'\\bop_init\\b\\s*\((.*)\)','op_init_soa(\\1,1)', text)
+  text = re.sub(r'\\bop_mpi_init\\b\\s*\((.*)\)','op_mpi_init_soa(\\1,1)', text)
   return text
 
 def op_par_loop_parse(text):
@@ -305,9 +305,9 @@ def op_par_loop_parse(text):
 
 def op_check_kernel_in_text(text, name):
   match = False
-  inline_impl_pattern = r'inline[ \n]+void[ \n]+'+name+'\s*\('
+  inline_impl_pattern = r'inline[ \n]+void[ \n]+'+name+r'\s*\('
   matches = re.findall(inline_impl_pattern, text)
-  decl_pattern = r'([$\n]+)(void[ \n]+'+name+'\([ \n]*'+'[ \nA-Za-z0-9\*\_\.,#]+\);)'
+  decl_pattern = r'([$\n]+)(void[ \n]+'+name+r'\([ \n]*'+r'[ \nA-Za-z0-9\*\_\.,#]+\);)'
   if len(re.findall(inline_impl_pattern, text)) == 1:
     match = True
   elif len(re.findall(decl_pattern, text)) == 1:
diff --git a/translator/c/op2_gen_cuda.py b/translator/c/op2_gen_cuda.py
index 16f2c8af1..7d0379d75 100644
--- a/translator/c/op2_gen_cuda.py
+++ b/translator/c/op2_gen_cuda.py
@@ -961,10 +961,10 @@ def op2_gen_cuda(master, date, consts, kernels, sets):
   code('')
 
   for nc in range (0,len(consts)):
-    if consts[nc]['dim']==1:
+    if str(consts[nc]['dim']).isdigit() and int(consts[nc]['dim'])==1:
       code('__constant__ '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+';')
     else:
-      if consts[nc]['dim'] > 0:
+      if str(consts[nc]['dim']).isdigit() and int(consts[nc]['dim']) > 0:
         num = str(consts[nc]['dim'])
       else:
         num = 'MAX_CONST_SIZE'
@@ -992,8 +992,8 @@ def op2_gen_cuda(master, date, consts, kernels, sets):
     code('                       '+consts[nc]['type'][1:-1]+' *dat){')
     depth = depth + 2
     code('if (!OP_hybrid_gpu) return;')
-    if not consts[nc]['dim'] or int(consts[nc]['dim']) > 1:
-      IF('dim*sizeof('+consts[nc]['type'][1:-1]+')>MAX_CONST_SIZE')
+    if not str(consts[nc]['dim']).isdigit() or int(consts[nc]['dim']) <= 0:
+      IF('dim>MAX_CONST_SIZE')
       code('printf("error: MAX_CONST_SIZE not big enough\\n"); exit(1);')
       ENDIF()
     code('cutilSafeCall(cudaMemcpyToSymbol('+consts[nc]['name']+'_cuda, dat, dim*sizeof('+consts[nc]['type'][1:-1]+')));')
diff --git a/translator/c/op2_gen_cuda_simple.py b/translator/c/op2_gen_cuda_simple.py
index 436f1537e..81b4975a4 100644
--- a/translator/c/op2_gen_cuda_simple.py
+++ b/translator/c/op2_gen_cuda_simple.py
@@ -113,8 +113,8 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):
   for nk in range (0,len(kernels)):
 
 #Optimization settings
-    inc_stage=0
-    op_color2_force=1
+    inc_stage=1
+    op_color2_force=0
     atomics=0
 
 
@@ -353,7 +353,8 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):
       else:
         code('int start,           ')
         code('int end,             ')
-      code('int   set_size) {    ')
+      code('int   set_stride,     ')
+      code('int   set_size ) {    ')
     else:
       code('int   set_size ) {')
       code('')
@@ -497,7 +498,7 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):
       for g_m in range(0,nargs):
         if maps[g_m] == OP_MAP and (not optflags[g_m]) and (not mapinds[g_m] in k):
           k = k + [mapinds[g_m]]
-          code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + offset_b + set_size * '+str(int(idxs[g_m]))+'];')
+          code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + offset_b + set_stride * '+str(int(idxs[g_m]))+'];')
 
       #whatever didn't come up and is opt
       for g_m in range(0,nargs):
@@ -507,7 +508,7 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):
           else:
             k = k + [mapinds[g_m]]
 
-          code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + offset_b + set_size * '+str(int(idxs[g_m]))+'];')
+          code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + offset_b + set_stride * '+str(int(idxs[g_m]))+'];')
           if optflags[g_m]==1:
             ENDIF()
 
@@ -564,7 +565,7 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):
       for g_m in range(0,nargs):
         if maps[g_m] == OP_MAP and (not optflags[g_m]) and (not mapinds[g_m] in k):
           k = k + [mapinds[g_m]] #non-opt
-          code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + set_size * '+str(int(idxs[g_m]))+'];')
+          code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + set_stride * '+str(int(idxs[g_m]))+'];')
 
       #whatever didn't come up and is opt
       for g_m in range(0,nargs):
@@ -574,7 +575,7 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):
           else:
             k = k + [mapinds[g_m]]
 
-          code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + set_size * '+str(int(idxs[g_m]))+'];')
+          code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + set_stride * '+str(int(idxs[g_m]))+'];')
           if optflags[g_m]==1:
             ENDIF()
 
@@ -964,13 +965,13 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):
         for g_m in range(0,nargs):
           if maps[g_m] == OP_MAP and (not mapnames[g_m] in k):
             k = k + [mapnames[g_m]]
-            IF('(OP_kernels[' +str(nk)+ '].count==1) || (opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(g_m)+'))')
-            code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(g_m)+');')
+            IF('(OP_kernels[' +str(nk)+ '].count==1) || (opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST != round32(getSetSizeFromOpArg(&arg'+str(g_m)+')))')
+            code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST = round32(getSetSizeFromOpArg(&arg'+str(g_m)+'));')
             code('cudaMemcpyToSymbol(opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT, &opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST,sizeof(int));')
             ENDIF()
       if dir_soa!=-1:
-          IF('(OP_kernels[' +str(nk)+ '].count==1) || (direct_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(dir_soa)+'))')
-          code('direct_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(dir_soa)+');')
+          IF('(OP_kernels[' +str(nk)+ '].count==1) || (direct_'+name+'_stride_OP2HOST != round32(getSetSizeFromOpArg(&arg'+str(dir_soa)+')))')
+          code('direct_'+name+'_stride_OP2HOST = round32(getSetSizeFromOpArg(&arg'+str(dir_soa)+'));')
           code('cudaMemcpyToSymbol(direct_'+name+'_stride_OP2CONSTANT,&direct_'+name+'_stride_OP2HOST,sizeof(int));')
           ENDIF()
 
@@ -1112,6 +1113,7 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):
         code('Plan->nthrcol,')
         code('Plan->thrcol,')
         code('Plan->ncolblk[col],')
+      code('round32(set->size+set->exec_size),')
       code('set->size+set->exec_size);')
       code('')
       if reduct:
@@ -1163,7 +1165,9 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):
       for g_m in range(0,nargs):
         if inds[g_m]==0:
           code('(<TYP>*)<ARG>.data_d,')
-      code('start,end,set->size+set->exec_size);')
+      code('start,end,')
+      code('round32(set->size+set->exec_size),')
+      code('set->size+set->exec_size);')
       ENDIF()
       if reduct:
         code('if (round==1) mvReductArraysToHost(reduct_bytes);')
@@ -1187,7 +1191,6 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):
           code(indent+'(<TYP> *) <ARG>.data_d,')
         else:
           code(indent+'(<TYP> *) <ARG>.data_d,')
-
       code(indent+'set->size );')
 
     if ninds>0 and not atomics:
@@ -1291,10 +1294,10 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):
   code('')
 
   for nc in range (0,len(consts)):
-    if consts[nc]['dim']==1:
+    if str(consts[nc]['dim']).isdigit() and int(consts[nc]['dim'])==1:
       code('__constant__ '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'_cuda;')
     else:
-      if consts[nc]['dim'].isdigit() and int(consts[nc]['dim']) > 0:
+      if str(consts[nc]['dim']).isdigit() and int(consts[nc]['dim']) > 0:
         num = str(consts[nc]['dim'])
       else:
         num = 'MAX_CONST_SIZE'
@@ -1317,8 +1320,8 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs):
     code('                       '+consts[nc]['type'][1:-1]+' *dat){')
     depth = depth + 2
     code('if (!OP_hybrid_gpu) return;')
-    if not consts[nc]['dim'] or int(consts[nc]['dim']) > 1:
-      IF('dim*sizeof('+consts[nc]['type'][1:-1]+')>MAX_CONST_SIZE')
+    if not str(consts[nc]['dim']).isdigit() or int(consts[nc]['dim']) <= 0:
+      IF('dim>MAX_CONST_SIZE')
       code('printf("error: MAX_CONST_SIZE not big enough\\n"); exit(1);')
       ENDIF()
     code('cutilSafeCall(cudaMemcpyToSymbol('+consts[nc]['name']+'_cuda, dat, dim*sizeof('+consts[nc]['type'][1:-1]+')));')