diff --git a/apps/c/airfoil/airfoil_hdf5/dp/Makefile.legacy b/apps/c/airfoil/airfoil_hdf5/dp/Makefile.legacy new file mode 100644 index 000000000..083ee2504 --- /dev/null +++ b/apps/c/airfoil/airfoil_hdf5/dp/Makefile.legacy @@ -0,0 +1,392 @@ +# +# The following environment variables should be predefined: +# +# CUDA_INSTALL_PATH +# PARMETIS_INSTALL_PATH +# PTSCOTCH_INSTALL_PATH +# HDF5_INSTALL_PATH +# +# OP2_INSTALL_PATH +# OP2_COMPILER (gnu,intel,etc) +# + +include ../../../../make-common.inc + +ifeq ($(OP2_COMPILER),gnu) + CPP = g++ + CPPFLAGS = -O3 -g -fPIC -DUNIX -Wall -Wextra + OMPFLAGS = -fopenmp + MPICPP = $(MPICXX_PATH) + MPIFLAGS = $(CCFLAGS) +else +ifeq ($(OP2_COMPILER),intel) + CPP = icpc + CCFLAGS = -O3 -g -xHost -DMPICH_IGNORE_CXX_SEEK -restrict -inline-forceinline -qopt-report=5 -DVECTORIZE #-parallel #-DCOMM_PERF #-DDEBUG #-vec-report + # CCFLAGS = -O3 -xHost -DMPICH_IGNORE_CXX_SEEK -fno-alias -inline-forceinline -qopt-report -parallel -prec-div -DVECTORIZE #-parallel #-DCOMM_PERF #-DDEBUG #-vec-report + CPPFLAGS = $(CCFLAGS) + OMPFLAGS = -qopenmp + MPICPP = $(MPICXX_PATH) + # NVCCFLAGS ~= -ccbin=$(MPICPP) + MPIFLAGS = $(CPPFLAGS) +else +ifeq ($(OP2_COMPILER),xl) + CPP = xlc++ + CCFLAGS = -O3 -qarch=pwr8 -qtune=pwr8 -qhot -qxflag=nrcptpo -qinline=level=10 -Wx,-nvvm-compile-options=-ftz=1 -Wx,-nvvm-compile-options=-prec-div=0 -Wx,-nvvm-compile-options=-prec-sqrt=0 +# CCFLAGS = -O3 -xHost -DMPICH_IGNORE_CXX_SEEK -fno-alias -inline-forceinline -qopt-report -parallel -prec-div -DVECTORIZE #-parallel #-DCOMM_PERF #-DDEBUG #-vec-report + CPPFLAGS = $(CCFLAGS) + OMPFLAGS = -qsmp=omp -qthreaded + OMPOFFLOAD = -qsmp=omp -qoffload -Xptxas -v -g1 + MPICPP = $(MPICXX_PATH) + MPIFLAGS = $(CPPFLAGS) +else +ifeq ($(OP2_COMPILER),pgi) + CPP = pgc++ + CCFLAGS = -O3 + CPPFLAGS = $(CCFLAGS) + OMPFLAGS = -mp + MPICC = $(MPICC_PATH) + MPICPP = $(MPICXX_PATH) + MPIFLAGS = $(CPPFLAGS) + # NVCCFLAGS += -ccbin=$(MPICPP) + ACCFLAGS = -acc -Minfo=acc -ta=tesla:cc35,fastmath,lineinfo -DOPENACC -fast -Minfo=accel -Mcuda=ptxinfo +else +ifeq ($(OP2_COMPILER),cray) + CPP = CC + CCFLAGS = -O3 -h fp3 -h ipa5 + CPPFLAGS = $(CCFLAGS) + OMPFLAGS = -h omp + MPICPP = CC + MPIFLAGS = $(CPPFLAGS) +else +ifeq ($(OP2_COMPILER),clang) + CPP = clang++ + CCFLAGS = -O3 -ffast-math + CPPFLAGS = $(CCFLAGS) + OMPFLAGS = -I$(OMPTARGET_LIBS)/../include -fopenmp=libomp -Rpass-analysis + OMPOFFLOAD = $(OMPFLAGS) -fopenmp-targets=nvptx64-nvidia-cuda -ffp-contract=fast -Xcuda-ptxas -v #-Xclang -target-feature -Xclang +ptx35 + MPICC = $(MPICC_PATH) + MPICPP = $(MPICXX_PATH) + MPIFLAGS = $(CPPFLAGS) +else +print: + @echo "unrecognised value for OP2_COMPILER" +endif +endif +endif +endif +endif +endif + + + +# +# set flags for NVCC compilation and linking +# +ifndef NV_ARCH + MESSAGE=select an NVIDA device to compile in CUDA, e.g. make NV_ARCH=KEPLER + NV_ARCH=Kepler +endif +ifeq ($(NV_ARCH),Fermi) + CODE_GEN_CUDA=-gencode arch=compute_20,code=sm_21 +else +ifeq ($(NV_ARCH),Kepler) + CODE_GEN_CUDA=-gencode arch=compute_35,code=sm_35 +else +ifeq ($(NV_ARCH),Maxwell) + CODE_GEN_CUDA=-gencode arch=compute_50,code=sm_50 +else +ifeq ($(NV_ARCH),Pascal) + CODE_GEN_CUDA=-gencode arch=compute_60,code=sm_60 +else +ifeq ($(NV_ARCH),Volta) + CODE_GEN_CUDA=-gencode arch=compute_70,code=sm_70 +endif +endif +endif +endif +endif + +ifdef MAX_REG_COUNT + REG_COUNT = --maxrregcount $(MAX_REG_COUNT) + PGI_REG_COUNT = -ta=nvidia,maxregcount:$(MAX_REG_COUNT) + ifeq ($(OP2_COMPILER),xl) + OMP4_REG_COUNT= -Xptxas -maxrregcount=$(MAX_REG_COUNT) + else + ifeq ($(OP2_COMPILER),clang) + OMP4_REG_COUNT= -Xcuda-ptxas --maxrregcount=$(MAX_REG_COUNT) + endif + endif +endif + +NVCCFLAGS += $(CODE_GEN_CUDA) -m64 -Xptxas=-v --use_fast_math -O3 $(REG_COUNT) -lineinfo -DOP2_CUDA #-g -G -O0 + +VAR = #-DOP_PART_SIZE_1=160 -DOP_PART_SIZE_2=320 -DOP_PART_SIZE_3=64 #-DOP_BLOCK_SIZE_0=64 -DOP_BLOCK_SIZE_1=64 -DOP_BLOCK_SIZE_2=64 -DOP_BLOCK_SIZE_3=64 -DOP_BLOCK_SIZE_4=64 + + +# +# master to make all versions +# +ALL_TARGETS = clean airfoil_mpi airfoil_cuda airfoil_openmp airfoil_seq airfoil_genseq airfoil_mpi_genseq airfoil_mpi_cuda airfoil_mpi_cuda_hyb airfoil_mpi_openmp convert_mesh_seq convert_mesh_mpi +ifeq ($(OP2_COMPILER),pgi) + ALL_TARGETS += airfoil_openacc airfoil_mpi_openacc +endif +ifeq ($(OP2_COMPILER),intel) + ALL_TARGETS += airfoil_mpi_vec +endif + +all: $(ALL_TARGETS) + +# +# simple sequential version +# + +airfoil_seq: airfoil.cpp save_soln.h adt_calc.h res_calc.h bres_calc.h + $(MPICPP) $(CPPFLAGS) airfoil.cpp $(OP2_INC) $(HDF5_INC) $(OP2_LIB) -lop2_seq -lop2_hdf5 $(HDF5_LIB) -o airfoil_seq + +airfoil_genseq: airfoil_op.cpp seq/airfoil_seqkernels.cpp + $(MPICPP) $(CPPFLAGS) $^ \ + $(OP2_INC) $(HDF5_INC) \ + $(OP2_LIB) -lop2_seq -lop2_hdf5 \ + $(HDF5_LIB) \ + -o airfoil_genseq + +# +# x86 version using kernel files generated by op2.py +# + +airfoil_openmp: airfoil_op.cpp openmp/airfoil_kernels.cpp \ + openmp/save_soln_kernel.cpp save_soln.h \ + openmp/adt_calc_kernel.cpp adt_calc.h \ + openmp/res_calc_kernel.cpp res_calc.h \ + openmp/bres_calc_kernel.cpp bres_calc.h \ + openmp/update_kernel.cpp update.h \ + Makefile + $(MPICPP) $(VAR) $(CPPFLAGS) $(OMPFLAGS) $(OP2_INC) $(OP2_LIB) $(HDF5_INC) -Iopenmp -I. \ + airfoil_op.cpp -lm openmp/airfoil_kernels.cpp -lm -lop2_openmp -lop2_hdf5 $(HDF5_LIB) -o airfoil_openmp + +# +# OpenACC version using kernel files generated by op2.py +# + +airfoil_openacc: airfoil_op.cpp openacc/airfoil_acckernels.c \ + openacc/save_soln_acckernel.c \ + openacc/adt_calc_acckernel.c \ + openacc/res_calc_acckernel.c \ + openacc/bres_calc_acckernel.c \ + openacc/update_acckernel.c \ + Makefile + $(MPICC) $(VAR) $(CPPFLAGS) $(ACCFLAGS) $(OMPFLAGS) $(OP2_INC) $(PGI_REG_COUNT) -Iopenacc -I. \ + openacc/airfoil_acckernels.c -c -o openacc/airfoil_acckernels.o + $(MPICPP) $(VAR) $(CPPFLAGS) $(ACCFLAGS) $(OMPFLAGS) $(OP2_INC) $(OP2_LIB) $(HDF5_INC) \ + airfoil_op.cpp -lm openacc/airfoil_acckernels.o -lm $(CUDA_LIB) -lcudart -lop2_cuda -lop2_hdf5 $(HDF5_LIB) -o airfoil_openacc + + +# +# CUDA version using kernel files generated by op2.py +# + +airfoil_cuda: airfoil_op.cpp cuda/airfoil_kernels_cu.o Makefile + $(MPICPP) $(VAR) $(CPPFLAGS) airfoil_op.cpp cuda/airfoil_kernels_cu.o \ + $(CUDA_INC) $(OP2_INC) $(HDF5_INC) \ + $(OP2_LIB) $(CUDA_LIB) -lcudart -lop2_cuda -lop2_hdf5 $(HDF5_LIB) -o airfoil_cuda + +cuda/airfoil_kernels_cu.o: cuda/airfoil_kernels.cu \ + cuda/save_soln_kernel.cu save_soln.h \ + cuda/adt_calc_kernel.cu adt_calc.h \ + cuda/res_calc_kernel.cu res_calc.h \ + cuda/bres_calc_kernel.cu bres_calc.h \ + cuda/update_kernel.cu update.h \ + Makefile + nvcc $(VAR) $(INC) $(NVCCFLAGS) $(OP2_INC) $(HDF5_INC) -Icuda -I. \ + -c -o cuda/airfoil_kernels_cu.o cuda/airfoil_kernels.cu + +# +# CUDA with clang +# + +airfoil_cuda_clang: airfoil_op.cpp cuda/airfoil_kernels_cu_clang.o Makefile + $(MPICPP) $(VAR) $(CPPFLAGS) airfoil_op.cpp cuda/airfoil_kernels_cu_clang.o \ + $(CUDA_INC) $(OP2_INC) $(HDF5_INC) \ + $(OP2_LIB) $(CUDA_LIB) -lcudart -lop2_cuda -lop2_hdf5 $(HDF5_LIB) -o airfoil_cuda_clang + +cuda/airfoil_kernels_cu_clang.o: cuda/airfoil_kernels.cu \ + cuda/save_soln_kernel.cu save_soln.h \ + cuda/adt_calc_kernel.cu adt_calc.h \ + cuda/res_calc_kernel.cu res_calc.h \ + cuda/bres_calc_kernel.cu bres_calc.h \ + cuda/update_kernel.cu update.h \ + Makefile + clang++ $(VAR) $(INC) -m64 -O3 $(OP2_INC) $(HDF5_INC) -gline-tables-only -I$(MPI_INSTALL_PATH)/include -Icuda -I. \ + -c -o cuda/airfoil_kernels_cu_clang.o cuda/airfoil_kernels.cu --cuda-gpu-arch=sm_35 -ffast-math $(OMP4_REG_COUNT) -Xcuda-ptxas -v + +# +# mpi with sequential-nodes version +# + +airfoil_mpi: airfoil.cpp save_soln.h adt_calc.h res_calc.h bres_calc.h Makefile + $(MPICPP) $(MPIFLAGS) airfoil.cpp $(OP2_INC) $(PARMETIS_INC) $(PTSCOTCH_INC) $(HDF5_INC) \ + $(OP2_LIB) -lop2_mpi $(PARMETIS_LIB) $(PTSCOTCH_LIB) $(HDF5_LIB) -o airfoil_mpi + +# +# mpi genseq version using kernel files generated by op2.py +# + +airfoil_mpi_genseq: airfoil_op.cpp seq/airfoil_seqkernels.cpp \ + seq/save_soln_seqkernel.cpp save_soln.h \ + seq/adt_calc_seqkernel.cpp adt_calc.h \ + seq/res_calc_seqkernel.cpp res_calc.h \ + seq/bres_calc_seqkernel.cpp bres_calc.h \ + seq/update_seqkernel.cpp update.h \ + Makefile + $(MPICPP) $(VAR) $(CPPFLAGS) $(OP2_INC) $(OP2_INC) $(HDF5_INC) \ + $(PARMETIS_INC) $(PTSCOTCH_INC) -Iseq -I. \ + airfoil_op.cpp -lm seq/airfoil_seqkernels.cpp $(OP2_LIB) -lop2_mpi \ + $(PARMETIS_LIB) $(PTSCOTCH_LIB) $(HDF5_LIB) -o airfoil_mpi_genseq + +# +# mpi vectorized seq version using kernel files generated by op2.py +# + +airfoil_mpi_vec: airfoil_op.cpp vec/airfoil_veckernels.cpp \ + vec/save_soln_veckernel.cpp save_soln.h \ + vec/adt_calc_veckernel.cpp adt_calc.h \ + vec/res_calc_veckernel.cpp res_calc.h \ + vec/bres_calc_veckernel.cpp bres_calc.h \ + vec/update_veckernel.cpp update.h \ + Makefile + $(MPICPP) $(VAR) $(CPPFLAGS) $(OMPFLAGS) $(OP2_INC) $(OP2_INC) $(HDF5_INC) \ + $(PARMETIS_INC) $(PTSCOTCH_INC) -Ivec -I. \ + airfoil_op.cpp -lm vec/airfoil_veckernels.cpp $(OP2_LIB) -lop2_mpi \ + $(PARMETIS_LIB) $(PTSCOTCH_LIB) $(HDF5_LIB) -o airfoil_mpi_vec + + +# +# mpi openmp version using kernel files generated by op2.py +# + +airfoil_mpi_openmp: airfoil_op.cpp openmp/airfoil_kernels.cpp \ + openmp/save_soln_kernel.cpp save_soln.h \ + openmp/adt_calc_kernel.cpp adt_calc.h \ + openmp/res_calc_kernel.cpp res_calc.h \ + openmp/bres_calc_kernel.cpp bres_calc.h \ + openmp/update_kernel.cpp update.h \ + Makefile + $(MPICPP) $(VAR) $(CPPFLAGS) $(OMPFLAGS) $(OP2_INC) $(OP2_INC) $(HDF5_INC) \ + $(PARMETIS_INC) $(PTSCOTCH_INC) -Iopenmp -I. \ + airfoil_op.cpp -lm openmp/airfoil_kernels.cpp $(OP2_LIB) -lop2_mpi \ + $(PARMETIS_LIB) $(PTSCOTCH_LIB) $(HDF5_LIB) -o airfoil_mpi_openmp + +# +# mpi openmp 4.5 version using kernel files generated by op2.py +# + +airfoil_openmp4: airfoil_op.cpp openmp4/airfoil_omp4kernels.cpp openmp4/airfoil_omp4kernel_funcs.cpp \ + openmp4/save_soln_omp4kernel_func.cpp save_soln.h openmp4/save_soln_omp4kernel.cpp \ + openmp4/adt_calc_omp4kernel_func.cpp adt_calc.h openmp4/adt_calc_omp4kernel.cpp \ + openmp4/res_calc_omp4kernel_func.cpp res_calc.h openmp4/res_calc_omp4kernel.cpp \ + openmp4/bres_calc_omp4kernel_func.cpp bres_calc.h openmp4/bres_calc_omp4kernel.cpp \ + openmp4/update_omp4kernel_func.cpp update.h openmp4/update_omp4kernel.cpp \ + Makefile + $(CPP) $(VAR) $(CPPFLAGS) $(OMPOFFLOAD) $(OP2_INC) $(HDF5_INC) $(OMP4_REG_COUNT) \ + -Iopenmp4/ -I. -c openmp4/airfoil_omp4kernel_funcs.cpp -o openmp4/airfoil_omp4kernel_funcs.o + $(CPP) $(VAR) $(CPPFLAGS) $(OMPOFFLOAD) $(OP2_INC) $(HDF5_INC) $(OMP4_REG_COUNT) \ + -Iopenmp4/ -I. -c openmp4/airfoil_omp4kernels.cpp -o openmp4/airfoil_omp4kernels.o + $(CPP) $(VAR) $(CPPFLAGS) $(OMPOFFLOAD) $(OP2_INC) $(HDF5_INC) \ + airfoil_op.cpp openmp4/airfoil_omp4kernels.o openmp4/airfoil_omp4kernel_funcs.o -o airfoil_openmp4 \ + $(OP2_LIB) -lop2_openmp4 -lop2_hdf5 $(HDF5_LIB) -L$(CUDA_INSTALL_PATH)/lib64 -lcudart + + +airfoil_mpi_openmp4: airfoil_op.cpp openmp4/airfoil_omp4kernels.cpp openmp4/airfoil_omp4kernel_funcs.cpp \ + openmp4/save_soln_omp4kernel_func.cpp save_soln.h openmp4/save_soln_omp4kernel.cpp \ + openmp4/adt_calc_omp4kernel_func.cpp adt_calc.h openmp4/adt_calc_omp4kernel.cpp \ + openmp4/res_calc_omp4kernel_func.cpp res_calc.h openmp4/res_calc_omp4kernel.cpp \ + openmp4/bres_calc_omp4kernel_func.cpp bres_calc.h openmp4/bres_calc_omp4kernel.cpp \ + openmp4/update_omp4kernel_func.cpp update.h openmp4/update_omp4kernel.cpp \ + Makefile + $(CPP) $(VAR) $(CPPFLAGS) $(OMPOFFLOAD) $(OP2_INC) $(HDF5_INC) \ + -Iopenmp4/ -I. -c openmp4/airfoil_omp4kernel_funcs.cpp -o openmp4/airfoil_omp4kernel_funcs.o + $(CPP) $(VAR) $(CPPFLAGS) $(OMPOFFLOAD) $(OP2_INC) $(HDF5_INC) \ + -Iopenmp4/ -I. -c openmp4/airfoil_omp4kernels.cpp -o openmp4/airfoil_omp4kernels.o + $(MPICPP) $(VAR) $(CPPFLAGS) $(OMPOFFLOAD) $(OP2_INC) $(HDF5_INC) $(CUDA_LIB) -lcudart \ + airfoil_op.cpp openmp4/airfoil_omp4kernels.o openmp4/airfoil_omp4kernel_funcs.o -o airfoil_mpi_openmp4 \ + $(OP2_LIB) -lop2_mpi_cuda $(HDF5_LIB) $(PARMETIS_LIB) $(PTSCOTCH_LIB) $(CUDA_LIB) -lcudart +# +# mpi openacc version using kernel files generated by op2.py +# + +airfoil_mpi_openacc: airfoil_op.cpp openacc/airfoil_acckernels.c \ + openacc/save_soln_acckernel.c save_soln.h \ + openacc/adt_calc_acckernel.c adt_calc.h \ + openacc/res_calc_acckernel.c res_calc.h \ + openacc/bres_calc_acckernel.c bres_calc.h \ + openacc/update_acckernel.c update.h \ + Makefile + $(MPICC) $(VAR) $(CPPFLAGS) $(ACCFLAGS) $(OMPFLAGS) $(OP2_INC) -Iopenacc/ -I. \ + openacc/airfoil_acckernels.c -c -o airfoil_acckernels.o + $(MPICPP) $(VAR) $(CPPFLAGS) $(ACCFLAGS) $(OP2_INC) $(OP2_INC) $(HDF5_INC) \ + $(PARMETIS_INC) $(PTSCOTCH_INC) \ + airfoil_op.cpp -lm openacc/airfoil_acckernels.o -DOPENACC $(CUDA_LIB) -lcudart $(OP2_LIB) -lop2_mpi_cuda \ + $(PARMETIS_LIB) $(PTSCOTCH_LIB) $(HDF5_LIB) -o airfoil_mpi_openacc +# +# mpi with CUDA version +# + +airfoil_mpi_cuda: airfoil_op.cpp cuda/airfoil_kernels_mpi_cu.o Makefile + $(MPICPP) $(MPIFLAGS) airfoil_op.cpp -lm cuda/airfoil_kernels_mpi_cu.o \ + $(OP2_INC) $(PARMETIS_INC) $(PTSCOTCH_INC) $(HDF5_INC) \ + $(OP2_LIB) -lop2_mpi_cuda $(PARMETIS_LIB) $(PTSCOTCH_LIB) \ + $(HDF5_LIB) $(CUDA_LIB) -lcudart -o airfoil_mpi_cuda + +cuda/airfoil_kernels_mpi_cu.o: cuda/airfoil_kernels.cu \ + cuda/save_soln_kernel.cu save_soln.h \ + cuda/adt_calc_kernel.cu adt_calc.h \ + cuda/res_calc_kernel.cu res_calc.h \ + cuda/bres_calc_kernel.cu bres_calc.h \ + cuda/update_kernel.cu update.h \ + Makefile + nvcc $(INC) $(NVCCFLAGS) $(OP2_INC) -I $(MPI_INSTALL_PATH)/include -Icuda -I. \ + -c -o cuda/airfoil_kernels_mpi_cu.o cuda/airfoil_kernels.cu + +# +# Hybrid CPU+GPU version +# + +airfoil_mpi_cuda_hyb: airfoil_op.cpp cuda/airfoil_hybkernels.o Makefile + $(MPICPP) $(MPIFLAGS) $(OMPFLAGS) -DOP_HYBRID_GPU airfoil_op.cpp -lm cuda/airfoil_hybkernels.o cuda/airfoil_hybkernels2.o \ + $(OP2_INC) $(PARMETIS_INC) $(PTSCOTCH_INC) $(HDF5_INC) \ + $(OP2_LIB) -lop2_mpi_cuda $(PARMETIS_LIB) $(PTSCOTCH_LIB) \ + $(HDF5_LIB) $(CUDA_LIB) -lcudart -o airfoil_mpi_cuda_hyb + +cuda/airfoil_hybkernels.o: cuda/airfoil_hybkernels.cu \ + cuda/airfoil_kernels.cu save_soln.h \ + adt_calc.h res_calc.h \ + bres_calc.h update.h \ + Makefile + nvcc -DOP_HYBRID_GPU -DGPUPASS $(INC) $(NVCCFLAGS) $(OP2_INC) -I $(MPI_INSTALL_PATH)/include -Icuda -I. \ + -c -o cuda/airfoil_hybkernels.o cuda/airfoil_hybkernels.cu + cat cuda/airfoil_hybkernels.cu > cuda/airfoil_hybkernels2.cpp + $(MPICPP) -DOP_HYBRID_GPU $(OMPFLAGS) $(INC) $(OP2_INC) -I $(MPI_INSTALL_PATH)/include -Icuda -I. -Iopenmp \ + -c -o cuda/airfoil_hybkernels2.o cuda/airfoil_hybkernels2.cpp + rm cuda/airfoil_hybkernels2.cpp + +# +# convert ASCI new_gird.dat to HDF5 new_grid.h5 +# + +convert_mesh_seq: convert_mesh.cpp + $(MPICPP) $(MPIFLAGS) convert_mesh.cpp $(OP2_INC) $(PARMETIS_INC) $(PTSCOTCH_INC) $(HDF5_INC) \ + $(OP2_LIB) -lop2_seq -lop2_hdf5 $(PARMETIS_LIB) $(PTSCOTCH_LIB) $(HDF5_LIB) -o convert_mesh_seq + +convert_mesh_mpi: convert_mesh_mpi.cpp + $(MPICPP) $(MPIFLAGS) convert_mesh_mpi.cpp $(OP2_INC) $(PARMETIS_INC) $(PTSCOTCH_INC) $(HDF5_INC) \ + $(OP2_LIB) -lop2_mpi $(PARMETIS_LIB) $(PTSCOTCH_LIB) $(HDF5_LIB) -o convert_mesh_mpi + + + + +# +# cleanup +# + +clean: + rm -f airfoil_seq airfoil_genseq airfoil_openmp airfoil_cuda airfoil_mpi airfoil_mpi_genseq airfoil_mpi_vec airfoil_mpi_cuda_hyb airfoil_mpi_openmp airfoil_mpi_cuda convert_mesh_seq convert_mesh_mpi airfoil_openacc airfoil_mpi_openacc airfoil_openmp4 airfoil_mpi_openmp4 *.o cuda/*.o openacc/*.o openmp4/*.o *.optrpt diff --git a/apps/make-common.inc b/apps/make-common.inc new file mode 100644 index 000000000..51a93a862 --- /dev/null +++ b/apps/make-common.inc @@ -0,0 +1,78 @@ + +# Locate MPI compilers: +ifdef MPI_INSTALL_PATH + ifneq ("","$(wildcard $(MPI_INSTALL_PATH)/bin/mpic++)") + MPICPP_PATH = $(MPI_INSTALL_PATH)/bin/mpic++ + else + ifneq ("","$(wildcard $(MPI_INSTALL_PATH)/intel64/bin/mpic++)") + MPICPP_PATH = $(MPI_INSTALL_PATH)/intel64/bin/mpic++ + else + MPICPP_PATH = mpic++ + endif + endif + + ifneq ("","$(wildcard $(MPI_INSTALL_PATH)/bin/mpicxx)") + MPICXX_PATH = $(MPI_INSTALL_PATH)/bin/mpicxx + else + ifneq ("","$(wildcard $(MPI_INSTALL_PATH)/intel64/bin/mpicxx)") + MPICXX_PATH = $(MPI_INSTALL_PATH)/intel64/bin/mpicxx + else + MPICXX_PATH = mpicxx + endif + endif + + ifneq ("","$(wildcard $(MPI_INSTALL_PATH)/bin/mpicc)") + MPICC_PATH = $(MPI_INSTALL_PATH)/bin/mpicc + else + ifneq ("","$(wildcard $(MPI_INSTALL_PATH)/intel64/bin/mpicc)") + MPICC_PATH = $(MPI_INSTALL_PATH)/intel64/bin/mpicc + else + MPICC_PATH = mpicc + endif + endif +else + MPICXX_PATH = mpicxx + MPICC_PATH = mpicc +endif + +# OP2 paths +ifdef OP2_INSTALL_PATH + OP2_INC = -I$(OP2_INSTALL_PATH)/include + OP2_LIB = -L$(OP2_INSTALL_PATH)/lib +endif + +# CUDA paths +ifdef CUDA_INSTALL_PATH + CUDA_INC = -I$(CUDA_INSTALL_PATH)/include + CUDA_LIB = -L$(CUDA_INSTALL_PATH)/lib64 +endif + +# HDF5 paths +ifdef HDF5_INSTALL_PATH + HDF5_INC := -I$(HDF5_INSTALL_PATH)/include + HDF5_LIB := -L$(HDF5_INSTALL_PATH)/lib +endif +HDF5_LIB += -lhdf5 -lz + +# +# partitioning software for MPI versions +# +# ParMETIS +PARMETIS_VER=4 +ifdef PARMETIS_INSTALL_PATH + PARMETIS_INC = -I$(PARMETIS_INSTALL_PATH)/include + PARMETIS_LIB = -L$(PARMETIS_INSTALL_PATH)/lib +endif +PARMETIS_INC += -DHAVE_PARMETIS +PARMETIS_LIB += -lparmetis -lmetis +ifeq ($(PARMETIS_VER),4) + PARMETIS_INC += -DPARMETIS_VER_4 +endif + +# PT-Scotch +ifdef PTSCOTCH_INSTALL_PATH + PTSCOTCH_INC = -I$(PTSCOTCH_INSTALL_PATH)/include + PTSCOTCH_LIB = -L$(PTSCOTCH_INSTALL_PATH)/lib +endif +PTSCOTCH_INC += -DHAVE_PTSCOTCH +PTSCOTCH_LIB += -lptscotch -lscotch -lptscotcherr diff --git a/op2/src/cuda/op_cuda_decl.cpp b/op2/src/cuda/op_cuda_decl.cpp index 3f6b4b98c..64a9c4543 100644 --- a/op2/src/cuda/op_cuda_decl.cpp +++ b/op2/src/cuda/op_cuda_decl.cpp @@ -73,22 +73,23 @@ op_dat op_decl_dat_char(op_set set, int dim, char const *type, int size, op_dat dat = op_decl_dat_core(set, dim, type, size, data, name); // transpose data - size_t set_size = dat->set->size + dat->set->exec_size + dat->set->nonexec_size; if (data != NULL && (strstr(type, ":soa") != NULL || (OP_auto_soa && dim > 1))) { - char *temp_data = (char *)malloc(dat->size * round32(set_size) * sizeof(char)); + size_t set_size = round32(dat->set->size + dat->set->exec_size + dat->set->nonexec_size); + char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char)); int element_size = dat->size / dat->dim; for (int i = 0; i < dat->dim; i++) { for (int j = 0; j < set_size; j++) { for (int c = 0; c < element_size; c++) { - temp_data[element_size * i * round32(set_size) + element_size * j + c] = + temp_data[element_size * i * set_size + element_size * j + c] = dat->data[dat->size * j + element_size * i + c]; } } } op_cpHostToDevice((void **)&(dat->data_d), (void **)&(temp_data), - (size_t)dat->size * round32(set_size)); + (size_t)dat->size * set_size); free(temp_data); } else { + size_t set_size = dat->set->size + dat->set->exec_size + dat->set->nonexec_size; op_cpHostToDevice((void **)&(dat->data_d), (void **)&(dat->data), (size_t)dat->size * set_size); } @@ -150,15 +151,15 @@ op_set op_decl_set(int size, char const *name) { op_map op_decl_map(op_set from, op_set to, int dim, int *imap, char const *name) { op_map map = op_decl_map_core(from, to, dim, imap, name); - int set_size = map->from->size + map->from->exec_size; - int *temp_map = (int *)malloc(map->dim * round32(set_size) * sizeof(int)); + int set_size = round32(map->from->size + map->from->exec_size); + int *temp_map = (int *)malloc(map->dim * set_size * sizeof(int)); for (int i = 0; i < map->dim; i++) { for (int j = 0; j < set_size; j++) { - temp_map[i * round32(set_size) + j] = map->map[map->dim * j + i]; + temp_map[i * set_size + j] = map->map[map->dim * j + i]; } } op_cpHostToDevice((void **)&(map->map_d), (void **)&(temp_map), - sizeof(int) * map->dim * round32(set_size)); + sizeof(int) * map->dim * set_size); free(temp_map); return map; } @@ -290,9 +291,9 @@ void op_upload_all() { op_dat_entry *item; TAILQ_FOREACH(item, &OP_dat_list, entries) { op_dat dat = item->dat; - size_t set_size = dat->set->size + dat->set->exec_size + dat->set->nonexec_size; if (dat->data_d) { if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) { + size_t set_size = round32(dat->set->size + dat->set->exec_size + dat->set->nonexec_size); char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char)); int element_size = dat->size / dat->dim; for (int i = 0; i < dat->dim; i++) { @@ -308,6 +309,7 @@ void op_upload_all() { dat->dirty_hd = 0; free(temp_data); } else { + size_t set_size = dat->set->size + dat->set->exec_size + dat->set->nonexec_size; cutilSafeCall(gpuMemcpy(dat->data_d, dat->data, dat->size * set_size, gpuMemcpyHostToDevice)); dat->dirty_hd = 0; diff --git a/op2/src/cuda/op_cuda_rt_support.cpp b/op2/src/cuda/op_cuda_rt_support.cpp index eb30a54e6..13ab70a0b 100644 --- a/op2/src/cuda/op_cuda_rt_support.cpp +++ b/op2/src/cuda/op_cuda_rt_support.cpp @@ -281,7 +281,7 @@ void op_cuda_get_data(op_dat dat) { else return; // transpose data - size_t set_size = dat->set->size + dat->set->exec_size + dat->set->nonexec_size; + size_t set_size = round32(dat->set->size + dat->set->exec_size + dat->set->nonexec_size); if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) { char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char)); cutilSafeCall(gpuMemcpy(temp_data, dat->data_d, dat->size * set_size, @@ -347,19 +347,19 @@ void cutilDeviceInit(int argc, char **argv) { void op_upload_dat(op_dat dat) { if (!OP_hybrid_gpu) return; - size_t set_size = dat->set->size + dat->set->exec_size + dat->set->nonexec_size; + size_t set_size = round32(dat->set->size + dat->set->exec_size + dat->set->nonexec_size); if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) { - char *temp_data = (char *)malloc(dat->size * round32(set_size) * sizeof(char)); + char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char)); int element_size = dat->size / dat->dim; for (int i = 0; i < dat->dim; i++) { for (int j = 0; j < set_size; j++) { for (int c = 0; c < element_size; c++) { - temp_data[element_size * i * round32(set_size) + element_size * j + c] = + temp_data[element_size * i * set_size + element_size * j + c] = dat->data[dat->size * j + element_size * i + c]; } } } - cutilSafeCall(gpuMemcpy(dat->data_d, temp_data, round32(set_size) * dat->size, + cutilSafeCall(gpuMemcpy(dat->data_d, temp_data, set_size * dat->size, gpuMemcpyHostToDevice)); free(temp_data); } else { @@ -371,17 +371,17 @@ void op_upload_dat(op_dat dat) { void op_download_dat(op_dat dat) { if (!OP_hybrid_gpu) return; - size_t set_size = dat->set->size + dat->set->exec_size + dat->set->nonexec_size; + size_t set_size = round32(dat->set->size + dat->set->exec_size + dat->set->nonexec_size); if (strstr(dat->type, ":soa") != NULL || (OP_auto_soa && dat->dim > 1)) { - char *temp_data = (char *)malloc(dat->size * round32(set_size) * sizeof(char)); - cutilSafeCall(gpuMemcpy(temp_data, dat->data_d, round32(set_size) * dat->size, + char *temp_data = (char *)malloc(dat->size * set_size * sizeof(char)); + cutilSafeCall(gpuMemcpy(temp_data, dat->data_d, set_size * dat->size, gpuMemcpyDeviceToHost)); int element_size = dat->size / dat->dim; for (int i = 0; i < dat->dim; i++) { for (int j = 0; j < set_size; j++) { for (int c = 0; c < element_size; c++) { dat->data[dat->size * j + element_size * i + c] = - temp_data[element_size * i * round32(set_size) + element_size * j + c]; + temp_data[element_size * i * set_size + element_size * j + c]; } } } diff --git a/translator-v2/resources/templates/cpp/master_kernel.cpp.jinja b/translator-v2/resources/templates/cpp/master_kernel.cpp.jinja index dbe828ada..8a2844253 100644 --- a/translator-v2/resources/templates/cpp/master_kernel.cpp.jinja +++ b/translator-v2/resources/templates/cpp/master_kernel.cpp.jinja @@ -20,11 +20,6 @@ extern {{const.typ}} {{const.ptr}}{% if const.dim > 1 %}[{{const.dim}}]{% endif void op_decl_const_char(int dim, const char *type, int size, char *dat, const char *name) { {{guard}} - if (size > MAX_CONST_SIZE) { - printf("error: requested size %d for const %s exceeds MAX_CONST_SIZE\n", size, name); - exit(1); - } - {% for const in app.consts() %} if (!strcmp(name, "{{const.ptr}}")) { {{caller(const)}} diff --git a/translator/c/op2.py b/translator/c/op2.py index 97a9997d2..3710fec7a 100755 --- a/translator/c/op2.py +++ b/translator/c/op2.py @@ -90,7 +90,7 @@ def op_decl_set_parse(text): """Parsing for op_decl_set calls""" sets = [] - for m in re.finditer('op_decl_set\((.*)\)', text): + for m in re.finditer(r'op_decl_set\((.*)\)', text): args = m.group(1).split(',') # check for syntax errors @@ -101,7 +101,7 @@ def op_decl_set_parse(text): sets.append({ 'name': args[1].strip() }) - for m in re.finditer('op_decl_set_hdf5\((.*)\)', text): + for m in re.finditer(r'op_decl_set_hdf5\((.*)\)', text): args = m.group(1).split(',') # check for syntax errors @@ -120,7 +120,7 @@ def op_decl_const_parse(text): """Parsing for op_decl_const calls""" consts = [] - for m in re.finditer('op_decl_const\((.*)\)', text): + for m in re.finditer(r'op_decl_const\((.*)\)', text): args = m.group(1).split(',') # check for syntax errors @@ -242,8 +242,8 @@ def get_arg_gbl(arg_string, k): return temp_gbl def append_init_soa(text): - text = re.sub('\\bop_init\\b\\s*\((.*)\)','op_init_soa(\\1,1)', text) - text = re.sub('\\bop_mpi_init\\b\\s*\((.*)\)','op_mpi_init_soa(\\1,1)', text) + text = re.sub(r'\\bop_init\\b\\s*\((.*)\)','op_init_soa(\\1,1)', text) + text = re.sub(r'\\bop_mpi_init\\b\\s*\((.*)\)','op_mpi_init_soa(\\1,1)', text) return text def op_par_loop_parse(text): @@ -305,9 +305,9 @@ def op_par_loop_parse(text): def op_check_kernel_in_text(text, name): match = False - inline_impl_pattern = r'inline[ \n]+void[ \n]+'+name+'\s*\(' + inline_impl_pattern = r'inline[ \n]+void[ \n]+'+name+r'\s*\(' matches = re.findall(inline_impl_pattern, text) - decl_pattern = r'([$\n]+)(void[ \n]+'+name+'\([ \n]*'+'[ \nA-Za-z0-9\*\_\.,#]+\);)' + decl_pattern = r'([$\n]+)(void[ \n]+'+name+r'\([ \n]*'+r'[ \nA-Za-z0-9\*\_\.,#]+\);)' if len(re.findall(inline_impl_pattern, text)) == 1: match = True elif len(re.findall(decl_pattern, text)) == 1: diff --git a/translator/c/op2_gen_cuda.py b/translator/c/op2_gen_cuda.py index 16f2c8af1..7d0379d75 100644 --- a/translator/c/op2_gen_cuda.py +++ b/translator/c/op2_gen_cuda.py @@ -961,10 +961,10 @@ def op2_gen_cuda(master, date, consts, kernels, sets): code('') for nc in range (0,len(consts)): - if consts[nc]['dim']==1: + if str(consts[nc]['dim']).isdigit() and int(consts[nc]['dim'])==1: code('__constant__ '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+';') else: - if consts[nc]['dim'] > 0: + if str(consts[nc]['dim']).isdigit() and int(consts[nc]['dim']) > 0: num = str(consts[nc]['dim']) else: num = 'MAX_CONST_SIZE' @@ -992,8 +992,8 @@ def op2_gen_cuda(master, date, consts, kernels, sets): code(' '+consts[nc]['type'][1:-1]+' *dat){') depth = depth + 2 code('if (!OP_hybrid_gpu) return;') - if not consts[nc]['dim'] or int(consts[nc]['dim']) > 1: - IF('dim*sizeof('+consts[nc]['type'][1:-1]+')>MAX_CONST_SIZE') + if not str(consts[nc]['dim']).isdigit() or int(consts[nc]['dim']) <= 0: + IF('dim>MAX_CONST_SIZE') code('printf("error: MAX_CONST_SIZE not big enough\\n"); exit(1);') ENDIF() code('cutilSafeCall(cudaMemcpyToSymbol('+consts[nc]['name']+'_cuda, dat, dim*sizeof('+consts[nc]['type'][1:-1]+')));') diff --git a/translator/c/op2_gen_cuda_simple.py b/translator/c/op2_gen_cuda_simple.py index 436f1537e..81b4975a4 100644 --- a/translator/c/op2_gen_cuda_simple.py +++ b/translator/c/op2_gen_cuda_simple.py @@ -113,8 +113,8 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs): for nk in range (0,len(kernels)): #Optimization settings - inc_stage=0 - op_color2_force=1 + inc_stage=1 + op_color2_force=0 atomics=0 @@ -353,7 +353,8 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs): else: code('int start, ') code('int end, ') - code('int set_size) { ') + code('int set_stride, ') + code('int set_size ) { ') else: code('int set_size ) {') code('') @@ -497,7 +498,7 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs): for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not optflags[g_m]) and (not mapinds[g_m] in k): k = k + [mapinds[g_m]] - code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + offset_b + set_size * '+str(int(idxs[g_m]))+'];') + code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + offset_b + set_stride * '+str(int(idxs[g_m]))+'];') #whatever didn't come up and is opt for g_m in range(0,nargs): @@ -507,7 +508,7 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs): else: k = k + [mapinds[g_m]] - code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + offset_b + set_size * '+str(int(idxs[g_m]))+'];') + code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + offset_b + set_stride * '+str(int(idxs[g_m]))+'];') if optflags[g_m]==1: ENDIF() @@ -564,7 +565,7 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs): for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not optflags[g_m]) and (not mapinds[g_m] in k): k = k + [mapinds[g_m]] #non-opt - code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + set_size * '+str(int(idxs[g_m]))+'];') + code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + set_stride * '+str(int(idxs[g_m]))+'];') #whatever didn't come up and is opt for g_m in range(0,nargs): @@ -574,7 +575,7 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs): else: k = k + [mapinds[g_m]] - code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + set_size * '+str(int(idxs[g_m]))+'];') + code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + set_stride * '+str(int(idxs[g_m]))+'];') if optflags[g_m]==1: ENDIF() @@ -964,13 +965,13 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs): for g_m in range(0,nargs): if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): k = k + [mapnames[g_m]] - IF('(OP_kernels[' +str(nk)+ '].count==1) || (opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(g_m)+'))') - code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(g_m)+');') + IF('(OP_kernels[' +str(nk)+ '].count==1) || (opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST != round32(getSetSizeFromOpArg(&arg'+str(g_m)+')))') + code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST = round32(getSetSizeFromOpArg(&arg'+str(g_m)+'));') code('cudaMemcpyToSymbol(opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT, &opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST,sizeof(int));') ENDIF() if dir_soa!=-1: - IF('(OP_kernels[' +str(nk)+ '].count==1) || (direct_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(dir_soa)+'))') - code('direct_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(dir_soa)+');') + IF('(OP_kernels[' +str(nk)+ '].count==1) || (direct_'+name+'_stride_OP2HOST != round32(getSetSizeFromOpArg(&arg'+str(dir_soa)+')))') + code('direct_'+name+'_stride_OP2HOST = round32(getSetSizeFromOpArg(&arg'+str(dir_soa)+'));') code('cudaMemcpyToSymbol(direct_'+name+'_stride_OP2CONSTANT,&direct_'+name+'_stride_OP2HOST,sizeof(int));') ENDIF() @@ -1112,6 +1113,7 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs): code('Plan->nthrcol,') code('Plan->thrcol,') code('Plan->ncolblk[col],') + code('round32(set->size+set->exec_size),') code('set->size+set->exec_size);') code('') if reduct: @@ -1163,7 +1165,9 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs): for g_m in range(0,nargs): if inds[g_m]==0: code('(*).data_d,') - code('start,end,set->size+set->exec_size);') + code('start,end,') + code('round32(set->size+set->exec_size),') + code('set->size+set->exec_size);') ENDIF() if reduct: code('if (round==1) mvReductArraysToHost(reduct_bytes);') @@ -1187,7 +1191,6 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs): code(indent+'( *) .data_d,') else: code(indent+'( *) .data_d,') - code(indent+'set->size );') if ninds>0 and not atomics: @@ -1291,10 +1294,10 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs): code('') for nc in range (0,len(consts)): - if consts[nc]['dim']==1: + if str(consts[nc]['dim']).isdigit() and int(consts[nc]['dim'])==1: code('__constant__ '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'_cuda;') else: - if consts[nc]['dim'].isdigit() and int(consts[nc]['dim']) > 0: + if str(consts[nc]['dim']).isdigit() and int(consts[nc]['dim']) > 0: num = str(consts[nc]['dim']) else: num = 'MAX_CONST_SIZE' @@ -1317,8 +1320,8 @@ def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs): code(' '+consts[nc]['type'][1:-1]+' *dat){') depth = depth + 2 code('if (!OP_hybrid_gpu) return;') - if not consts[nc]['dim'] or int(consts[nc]['dim']) > 1: - IF('dim*sizeof('+consts[nc]['type'][1:-1]+')>MAX_CONST_SIZE') + if not str(consts[nc]['dim']).isdigit() or int(consts[nc]['dim']) <= 0: + IF('dim>MAX_CONST_SIZE') code('printf("error: MAX_CONST_SIZE not big enough\\n"); exit(1);') ENDIF() code('cutilSafeCall(cudaMemcpyToSymbol('+consts[nc]['name']+'_cuda, dat, dim*sizeof('+consts[nc]['type'][1:-1]+')));')