PhD thesis

fpga-opencl-benchmarks · Aug 20, 2018 · 4e47818 · 4e47818
1 parent 849efb0
commit 4e47818
Show file tree

Hide file tree

Showing 203 changed files with 9,629 additions and 11,913 deletions.
diff --git a/README.md b/README.md
@@ -1,33 +1,45 @@
 # Rodinia Benchmark Suite for OpenCL-based FPGAs
 
-The Rodinia Benchmark Suite is a set of benchmarks originally developed at University of Virginia. See `README_original` for the original benchmarks, or visit [here](https://www.cs.virginia.edu/~skadron/wiki/rodinia/index.php) for more details.
+The Rodinia Benchmark Suite is a set of benchmarks originally developed at University of Virginia. See `README_original` for the original description, or visit [here](https://www.cs.virginia.edu/~skadron/wiki/rodinia/index.php) for more details.
 
-The Rodinia Benchmark Suite for OpenCL-based FPGAs is our modified version of the original benchmarks for FPGAs using OpenCL. As of now, only the following benchmarks are ported to Altera FPGAs.
+The Rodinia Benchmark Suite for OpenCL-based FPGAs is our modified version of the original benchmarks for FPGAs using Intel FPGA SDK for OpenCL. Xilinx FPGAs are NOT supported. The following benchmarks are ported to Intel FPGAs:
 
-- nw
-- hotspot
-- pathfinder
+- nw (full optimization)
+- hotspot (full optimization)
+- hotspot 3D (full optimization)
+- pathfinder (full optimization)
+- srad (full optimization)
+- lud (full optimization)
 - cfd
-- srad
-- lud
+- bfs
+- b+tree
+- backprop
+- lavaMD
 
-Each modified benchmark is available under [the opencl directory](opencl). See [the original README file](README_original) for more details about each benchmark.
+Each modified benchmark is available under [the opencl directory](opencl). See [the original README file](README_original) for more details about each benchmark. Each FPGA version has a readme included that describes the parameters and optimizations for that version.
 
 The input data files for the benchmarks are not included in this distribution and need to be separately downloaded from [here](https://www.cs.virginia.edu/~skadron/wiki/rodinia/index.php).
 
-## Publication
+## Important note
 
-- Hamid Reza Zohouri, Naoya Maruyama, Aaron Smith, Motohiko Matsuda, and Satoshi Matsuoka, "Evaluating and Optimizing OpenCL Kernels for High Performance Computing with FPGAs," Proceedings of the ACM/IEEE International Conference for High Performance Computing, Networking, Storage and Analysis (SC'16), Nov 2016. [Paper](http://dl.acm.org/ft_gateway.cfm?id=3014951&ftid=1810066&dwn=1&CFID=863386528&CFTOKEN=11866610)
+Makefiles are NOT guaranteed to work correctly for kernel compilation and should only be used for compiling the host codes using the HOST_ONLY=1 flag. Compile kernels manually with the settings reported in Hamid's PhD thesis.
+
+## Publications
+
+- Hamid Reza Zohouri, Naoya Maruyama, Aaron Smith, Motohiko Matsuda, and Satoshi Matsuoka, "Evaluating and Optimizing OpenCL Kernels for High Performance Computing with FPGAs," Proceedings of the ACM/IEEE International Conference for High Performance Computing, Networking, Storage and Analysis (SC'16), Nov. 2016. [Paper](https://dl.acm.org/citation.cfm?id=3014951)
+- Artur Podobas, Hamid Reza Zohouri, Naoya Maruyama, Satoshi Matsuoka, "Evaluating High-Level Design Strategies on FPGAs for High-Performance Computing," Proceedings of the 27th International Conference on Field Programmable Logic and Applications (FPL'17), Sep. 2017. [Paper](https://ieeexplore.ieee.org/abstract/document/8056760/)
+- Hamid Reza Zohouri, Artur Podobas, Satoshi Matsuoka, "Combined Spatial and Temporal Blocking for High-Performance Stencil Computation on FPGAs Using OpenCL," Proceedings of the 2018 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays (FPGA'18), Feb. 2018. [Paper](https://dl.acm.org/citation.cfm?id=3174248)
+- Hamid Reza Zohouri, "High Performance Computing with FPGAs and OpenCL," PhD thesis, Tokyo Institute of Technology, Tokyo, Japan, Aug. 2018
 
 ## Contact
 
+Hamid Reza Zohouri <br />
+Tokyo Institute of Technology <br />
+zohouri.h.aa@m.titech.ac.jp <br />
+http://github.com/zohourih
+
 Naoya Maruyama <br />
 RIKEN Advanced Institute for Computational Science / Tokyo Institute of Technology <br />
 nmaruyama@riken.jp <br />
 http://github.com/naoyam <br />
 http://mt.aics.riken.jp/~nmaruyama/
-
-Hamid Reza Zohouri <br />
-Tokyo Institute of Technology <br />
-zohouri.h.aa@m.titech.ac.jp <br />
-http://github.com/zohourih
diff --git a/common/make.config b/common/make.config
@@ -30,18 +30,25 @@ else
 	else
 		CFLAGS = -g -O3 -Wall
 	endif
+
+	# Intel toolchain: choosing either icc or icpc switches BOTH the C and the C++
+	# compiler to Intel and adds -fp-model precise to both flag sets.
+	# The compiler names must not be followed by ';' -- make keeps everything up to
+	# the end of the line as the variable's value, so "icpc;" would end up verbatim
+	# in every command that expands $(CXX) (same for $(CC)).
+	# The two cases are chained with "else" so the flags are appended only once.
+	ifeq ($(CC),icc)
+		CXX = icpc
+		CFLAGS   += -fp-model precise
+		CXXFLAGS += -fp-model precise
+	else ifeq ($(CXX),icpc)
+		CC = icc
+		CFLAGS   += -fp-model precise
+		CXXFLAGS += -fp-model precise
+	endif
 
 	RM = rm -rf
 endif
 
 # Placeholder for all preprocessor macros. Will be appended to compiler flag variables 
 DEFINE_MACROS = $(EXTRA_MACROS)
 
-# RESTRICT switch
-ifneq ($(USE_RESTRICT),0)
-      DEFINE_MACROS += -DUSE_RESTRICT
-endif	    
-
 # Use single precision by default. Pass FP=DOUBLE for double precision.
 ifeq ($(FP),DOUBLE)
 	DEFINE_MACROS += -DFP_DOUBLE
@@ -72,54 +79,27 @@ endif
 
 ### CUDA ###
 
-# CUDA toolkit installation path
-#CUDA_DIR = /usr/local/cuda
-
 CUDA_CC = nvcc
-CUDA_FLAGS = -Xcompiler -fopenmp
+CUDA_FLAGS = -Xcompiler -fopenmp -arch sm_35
 ifdef DEBUG
       CUDA_FLAGS += -g
 else
       CUDA_FLAGS += -O3
 endif
 
-# CUDA toolkit libraries
-#CUDA_LIB_DIR := $(CUDA_DIR)/lib
-#ifeq ($(shell uname -m), x86_64)
-#	ifeq ($(shell if test -d $(CUDA_DIR)/lib64; then echo T; else echo F; fi), T)
-#		CUDA_LIB_DIR := $(CUDA_DIR)/lib64
-#	endif
-#endif
-
-# CUDA samples installation path
-#SDK_DIR = /usr/local/cuda/samples/
-
 ifdef LINUX_TARGET
       CUDA_HELPER_INC =  -I$(dir $(shell which $(CUDA_CC)))../samples/common/inc
 endif
 CUDA_FLAGS += $(CUDA_HELPER_INC)
 
-# Nvidia NVML from Nvidia GDK, NVIDIA_GDK_DIR must be defined in bashrc, for Nvidia GPU power measurement
-NVML_INC = -I$(NVIDIA_GDK_DIR)/usr/include/nvidia/gdk
-NVML_LIB = -L$(NVIDIA_GDK_DIR)/usr/src/gdk/nvml/lib -lnvidia-ml
-
-# Bittware BmcLib, BITTWARE_SDK must be defined in bashrc, for power measurement on Bittware FPGA boards
-BITTWARE_INC = -I$(BITTWARE_SDK)/include -I$(BITTWARE_SDK)/include/resources
-BITTWARE_LIB = -L$(BITTWARE_SDK) -lbwhil -lbmclib
-BITTWARE_FLAGS = -fopenmp -DLINUX -DAOCL_BOARD_a10pl4_dd4gb_gx115es3
-ifeq ($(BOARD),a10pl4_dd4gb_gx115es3)
-	CFLAGS += $(BITTWARE_INC) $(BITTWARE_LIB) $(BITTWARE_FLAGS)
-	CXXFLAGS += $(BITTWARE_INC) $(BITTWARE_LIB) $(BITTWARE_FLAGS)
-endif
-
 ### OpenCL ###
 
 # Use the Apple OpenCL by default on OSX or if APPLE=1 is passed
 ifneq "$(or $(OSX_TARGET),$(APPLE))" ""
 	OPENCL_INC = 
 	OPENCL_LIB = -framework OpenCL
 	USE_JIT = 1
-	OPENCL_MACROS += -DAPPLE_CL
+	OPENCL_MACROS += -DAPPLE
 endif
 
 # NVIDIA OpenCL SDK
@@ -128,7 +108,7 @@ ifdef NVIDIA
 	OPENCL_INC = -I$(OPENCL_DIR)/include
 	OPENCL_LIB = -L$(OPENCL_DIR)/lib64 -lOpenCL
 	USE_JIT = 1
-	OPENCL_MACROS += -DNVIDIA_CL
+	OPENCL_MACROS += -DNVIDIA
 endif
 
 # AMD OpenCL SDK
@@ -137,7 +117,7 @@ ifdef AMD
 	OPENCL_INC = -I$(OPENCL_DIR)/include/
 	OPENCL_LIB = -L$(OPENCL_DIR)/lib/x86_64/ -lOpenCL
 	USE_JIT = 1
-	OPENCL_MACROS += -DAMD_CL -Wno-deprecated-declarations
+	OPENCL_MACROS += -DAMD -Wno-deprecated-declarations
 endif
 
 #ifeq ($(shell uname -m), x86_64)
@@ -150,7 +130,7 @@ endif
 ifdef ALTERA
 	USE_JIT = 0
 	CFPGA_FLAGS = -g -v --report
-	OPENCL_MACROS += -DALTERA_CL
+	OPENCL_MACROS += -DALTERA
 
 	ifeq ($(OS),Windows_NT)
 		OPENCL_LIB = /link $(shell aocl link-config) /nodefaultlib:libcmt 
@@ -161,8 +141,13 @@ ifdef ALTERA
 
 	OPENCL_INC = $(shell aocl compile-config)
 
+	ifdef BOARD
+		CFLAGS += -DAOCL_BOARD_$(BOARD)
+	endif
+
 	ifdef EMULATOR
 		CFPGA_FLAGS += -march=emulator
+		CFLAGS += -DEMULATOR
 	endif
 
 	ifdef ARM
@@ -188,6 +173,31 @@ endif
 
 OPENCL_INC += $(OPENCL_MACROS)
 
+### Power ###
+
+# Nvidia NVML from CUDA Toolkit for Nvidia GPU power measurement, CUDA_DIR must be defined in bashrc
+NVML_INC = -I$(CUDA_DIR)/include
+NVML_LIB = -L$(CUDA_DIR)/lib64/stubs -lnvidia-ml
+
+# Bittware BmcLib for power measurement on Bittware FPGA boards
+# BITTWARE_TOOLKIT must be defined in bashrc and point to Bittware II Toolkit
+# All Bittware paths below are rooted at BITTWARE_TOOLKIT; no other Bittware
+# environment variable is required.
+BITTWARE_INC = -I$(BITTWARE_TOOLKIT)/include -I$(BITTWARE_TOOLKIT)/include/resources
+BITTWARE_LIB = -L$(BITTWARE_TOOLKIT) -lbwhil -lbmclib
+BITTWARE_FLAGS = -fopenmp -DLINUX -DAOCL_BOARD_a10pl4_dd4gb_gx115
+# Only add the Bittware headers/libraries when building for the Bittware board
+ifeq ($(BOARD),a10pl4_dd4gb_gx115)
+	CFLAGS += $(BITTWARE_INC) $(BITTWARE_LIB) $(BITTWARE_FLAGS)
+	CXXFLAGS += $(BITTWARE_INC) $(BITTWARE_LIB) $(BITTWARE_FLAGS)
+endif
+
+# Power measurement on Nallatech FPGA boards
+# AOCL_BOARD_PACKAGE_ROOT should point to a Nallatech BSP that includes the aocl_mmd.h header
+NALLATECH_INC = -I$(AOCL_BOARD_PACKAGE_ROOT)/software/include
+NALLATECH_FLAGS = -fopenmp -DLINUX -DAOCL_BOARD_p385a_sch_ax115
+ifeq ($(BOARD),p385a_sch_ax115)
+	CFLAGS += $(NALLATECH_INC) $(NALLATECH_FLAGS)
+	CXXFLAGS += $(NALLATECH_INC) $(NALLATECH_FLAGS)
+endif
+
 
 %.aoco : %.cl
 	$(RM) $*

diff --git a/common/power_fpga.h b/common/power_fpga.h
@@ -1,20 +1,26 @@
-// These functions are based on the "read_sensor" example provided by Bittware and depend on Bittware's headers and libraries.
+// The first set of functions is based on the "read_sensor" example provided by Bittware and depends on Bittware's headers and libraries.
+// The second set of functions is based on the example provided in Section 7.5 of Nallatech's OpenCL A10 BSP Reference Guide and depends on Nallatech's headers and libraries.
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #include <omp.h>
-#include "hil.h"
-#include "bmclib.h"
 
-#define DeviceNum 0 // Used to choose target FPGA board
-#define SDR       0 // Used to choose the target sensor, sensor 0 is the power sensor on the AL10P4 board
+#ifdef AOCL_BOARD_a10pl4_dd4gb_gx115
+	#include "hil.h"
+	#include "bmclib.h"
+#elif AOCL_BOARD_p385a_sch_ax115
+	#include "aocl_mmd.h"
 
-BMC_Handle bmc;
-HHil hil;
-HDevice hdev;
+	#ifdef __APPLE__
+		#include <OpenCL/opencl.h>
+	#else
+		#include <CL/cl.h>
+	#endif
+#endif
 
+#ifdef AOCL_BOARD_a10pl4_dd4gb_gx115
 //====================================================================================================================================
 // FPGA Energy Calculator for Bittware's FPGA boards
 //====================================================================================================================================
@@ -30,6 +36,13 @@ HDevice hdev;
 // A "#pragma omp barrier" should be put before the kernel call
 // Flag should become one in the kernel thread after kernel execution has finished (after clFinish())
 
+#define DeviceNum 0 // Used to choose target FPGA board
+#define SDR       0 // Used to choose the target sensor, sensor 0 is the power sensor on the AL10P4 board
+
+BMC_Handle bmc;
+HHil hil;
+HDevice hdev;
+
 static inline void cleanup()
 {
 	if(bmc)
@@ -94,7 +107,7 @@ static inline double GetPowerFPGA(int* flag)
 
 	if (!p_periph_table)
 	{
-		printf("Unsupported board type %d\n", boardtype);
+		printf("Unsupported board type %d.\n", boardtype);
 		cleanup();
 		return -1;
 	}
@@ -117,6 +130,7 @@ static inline double GetPowerFPGA(int* flag)
 			{
 				// Returns device power usage in Watt
 				bmc_sdr_read_sensor(bmc, record, value, sizeof(value), state, sizeof(state));
+
 				power = atof(value);
 				powerSum = powerSum + power;
 				count++;
@@ -138,6 +152,77 @@ static inline double GetPowerFPGA(int* flag)
 	return (double)(powerSum)/(double)(count);
 }
 
+#elif AOCL_BOARD_p385a_sch_ax115
+//====================================================================================================================================
+// FPGA Energy Calculator for Nallatech's FPGA boards
+//====================================================================================================================================
+
+// AOCL_BOARD_PACKAGE_ROOT must point to Nallatech's BSP that includes the aocl_mmd.h header file.
+// Unlike the function for the Bittware board, the function for the Nallatech board needs the OpenCL device list as input
+
+// This function works very similarly to the GPU power function
+// Returns average power usage in Watt from when it is called until when "flag" becomes one
+// Sampling is done every 10 milliseconds
+// The host code should have two OpenMP threads, one running the OpenCL kernel and the other calling this function
+// A "#pragma omp barrier" should be put before the kernel call
+// Flag should become one in the kernel thread after kernel execution has finished (after clFinish())
+
+// Signatures of the two vendor entry points that are resolved by name at run time:
+// the board-extension lookup obtained through clGetExtensionFunctionAddress(), and
+// the "aocl_mmd_card_info" query which that lookup returns for a given device.
+typedef void* (*get_board_extension_function_address_fn_t)(const char* func_name, cl_device_id device);
+typedef void* (*aocl_mmd_card_info_fn_t)(const char*, aocl_mmd_info_t, size_t, void*, size_t* );
+
+// Used on the early-exit paths of GetPowerFPGA(), which return before reaching the
+// barrier that precedes the sampling loop. Hitting a barrier here instead presumably
+// keeps the kernel thread, which waits at its own barrier, from hanging -- confirm
+// against the host code.
+static inline void cleanup()
+{
+	#pragma omp barrier
+}
+
+// Samples the board power every 10 ms until "*flag" becomes non-zero and returns the average in Watt.
+// "flag"   : set to one by the kernel thread once kernel execution has finished
+// "device" : OpenCL device list; only device[0] is used to resolve the board extension
+// Returns -1 if the vendor functions cannot be resolved or if no valid sample was collected.
+static inline double GetPowerFPGA(int* flag, cl_device_id* device)
+{
+	// Readings outside this window are dropped instead of being averaged
+	const float minValidPower = 10.0f;
+	const float maxValidPower = 80.0f;
+
+	// Accumulate in double so that long runs do not lose precision in the sum
+	double powerSum = 0;
+	size_t count = 0;
+
+	get_board_extension_function_address_fn_t board_extension_function_address = (get_board_extension_function_address_fn_t) clGetExtensionFunctionAddress ("clGetBoardExtensionFunctionAddressAltera");
+	if (board_extension_function_address == NULL)
+	{
+		printf ("Failed to get clGetBoardExtensionFunctionAddressAltera.\n");
+		cleanup();
+		return -1;
+	}
+
+	aocl_mmd_card_info_fn_t aocl_mmd_card_info_fn = (aocl_mmd_card_info_fn_t) board_extension_function_address("aocl_mmd_card_info", device[0]);
+	if (aocl_mmd_card_info_fn == NULL)
+	{
+		printf ("Failed to get aocl_mmd_card_info_fn address.\n");
+		cleanup();
+		return -1;
+	}
+
+	#pragma omp barrier
+	while (*flag == 0)
+	{
+		// Start every sample from a known value: if the query does not write the buffer,
+		// the reading is 0 and gets rejected by the range check below instead of being
+		// an uninitialized (or stale) value.
+		float power = 0;
+		size_t returnedSize = 0;
+
+		// Returns device power usage in Watt
+		// aclnalla_pcie0 is the board name string
+		aocl_mmd_card_info_fn("aclnalla_pcie0", AOCL_MMD_POWER, sizeof(float), (void*) &power, &returnedSize);
+
+		if (power >= minValidPower && power <= maxValidPower)
+		{
+			powerSum += power;
+			count++;
+		}
+
+		// Sleep for 10 ms
+		usleep(10000);
+	}
+
+	// No accepted sample (flag already set on entry, or every reading rejected):
+	// report an error instead of dividing zero by zero.
+	if (count == 0)
+	{
+		return -1;
+	}
+
+	return powerSum / (double)(count);
+}
+
+#endif
+
 // Returns amount of energy used in joules
 // "power" is average power usage in Watt from the GetPowerGPU() function
 // "time" is run time in ms from one of our time measurement helper functions

diff --git a/cuda/bfs/run.201509 b/cuda/bfs/run.201509
diff --git a/cuda/cuda_benchmark.sh b/cuda/cuda_benchmark.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-benchmarks=(nw hotspot pathfinder srad lud cfd)
+benchmarks=(nw hotspot hotspot3D pathfinder srad lud cfd)
 runs=5
 
 echo "Benchmark     Time (ms)     Energy Usage (J)  Average Power Consumption (Watts)"