From c005bb5f6bcc7d79bbfeacb27d08a166ef450946 Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Tue, 13 Aug 2013 13:18:47 -0500
Subject: [PATCH 01/59] Adding link to our API documentation on GitHub Pages

---
 README.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6b763cf1..dfdaa645 100644
--- a/README.md
+++ b/README.md
@@ -9,8 +9,11 @@ This repository houses the code for the OpenCL™ BLAS portion of APPML.  The co
 
 The primary goal of clBLAS is to make it easier for developers to utilize the inherent performance and power efficiency benefits of heterogeneous computing.  clBLAS interfaces do not hide nor wrap OpenCL interfaces, but rather leaves OpenCL state management to the control of the user to allow for maximum performance and flexibility.  The clBLAS library does generate and enqueue optimized OpenCL kernels, relieving the user from the task of writing, optimizing and maintaining kernel code themselves.
 
+## clBLAS library user documentation
+[Library and API documentation]( http://clmathlibraries.github.io/clBLAS/ ) for developers is available online as a GitHub Pages website
+
 ## clBLAS Wiki
-The [project wiki](https://github.com/kknox/clBLAS/wiki) contains helpful documentation, including a [build primer](https://github.com/kknox/clBLAS/wiki/Build)
+The [project wiki](https://github.com/clMathLibraries/clBLAS/wiki) contains helpful documentation, including a [build primer](https://github.com/clMathLibraries/clBLAS/wiki/Build)
 
 ## Contributing code
 Please refer to and read the [Contributing](CONTRIBUTING.md) document for guidelines on how to contribute code to this open source project

From 5f603a8b7c68b9c8aaee9d5a642760c8c98cb5dc Mon Sep 17 00:00:00 2001
From: braga <braga@braga-lnx05.(none)>
Date: Tue, 13 Aug 2013 18:42:05 -0500
Subject: [PATCH 02/59] fixing cmake files to allow for netlib blas reference

---
 src/CMakeLists.txt | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 5c257817..1b840346 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -98,18 +98,7 @@ endif()
 # TODO: maybe this could be written using the FindBLAS module in the future
 if( BUILD_TEST )
 	if(NOT CORR_TEST_WITH_ACML)
-		if( WIN32 )
-			find_package( Netlib COMPONENTS BLAS REQUIRED )
-		else( )
-			if( $ENV{REFBLAS_ROOT} )
-				set( REFBLAS_ROOT $ENV{REFBLAS_ROOT} CACHE PATH "NetLib BLAS root path")
-			else( )
-				message(FATAL_ERROR "Cannot find reference BLAS, please set REFBLAS_ROOT environment variable")
-			endif( )
-			
-			# Find reference BLAS implementation
-			include( ${REFBLAS_ROOT}/package/cmake/exportBLAS.cmake )
-		endif( )
+		find_package( Netlib COMPONENTS BLAS REQUIRED )
 	else( )
 		# Find ACML BLAS implementation
 		# platform dependent ACML subdirectory

From 14ab8e667f817e210e7dd5899b9a09386fc5c762 Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Wed, 14 Aug 2013 16:09:06 -0500
Subject: [PATCH 03/59] Fixed paths to source doxygen files

---
 doc/clBLAS.doxy | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/doc/clBLAS.doxy b/doc/clBLAS.doxy
index 86fbbfc4..afc15ae0 100644
--- a/doc/clBLAS.doxy
+++ b/doc/clBLAS.doxy
@@ -52,7 +52,7 @@ PROJECT_LOGO           =
 # If a relative path is entered, it will be relative to the location 
 # where doxygen was started. If left blank the current directory will be used.
 
-OUTPUT_DIRECTORY       = F:\code\git-svn\clBLAS.head\bin\master\vs10x64.superbuild\docs
+OUTPUT_DIRECTORY       = ..\..\bin\clBLAS.doxy
 
 # If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 
 # 4096 sub-directories (in 2 levels) under the output directory of each output 
@@ -651,17 +651,17 @@ WARN_LOGFILE           =
 # directories like "/usr/src/myproject". Separate the files or directories 
 # with spaces.
 
-INPUT                  = clBLAS.h \
-                         include/cltypes.h \
-                         include/kerngen.h \
-                         include/solver.h \
-                         include/mempat.h \
-                         src/blas/gens/blas_kgen.h \
-                         src/blas/include/clblas-internal.h \
-                         src/blas/include/kernel_extra.h \
-                         src/blas/include/solution_seq.h \
-                         include/granulation.h \
-                         src/tools/ktest/step.h
+INPUT                  = ../src/clBLAS.h \
+                         ../src/include/cltypes.h \
+                         ../src/include/kerngen.h \
+                         ../src/include/solver.h \
+                         ../src/include/mempat.h \
+                         ../src/library/gens/blas_kgen.h \
+                         ../src/library/include/clblas-internal.h \
+                         ../src/library/include/kernel_extra.h \
+                         ../src/library/include/solution_seq.h \
+                         ../src/include/granulation.h \
+                         ../src/library/tools/ktest/step.h
 
 # This tag can be used to specify the character encoding of the source files 
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is 
@@ -721,7 +721,7 @@ EXCLUDE_SYMBOLS        =
 # directories that contain example code fragments that are included (see 
 # the \include command).
 
-EXAMPLE_PATH           = samples
+EXAMPLE_PATH           = ../src/samples
 
 # If the value of the EXAMPLE_PATH tag contains directories, you can use the 
 # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp 

From a7af65d0ba655a6aebb2a1214f22f5a957a739c7 Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Sun, 18 Aug 2013 19:27:49 -0500
Subject: [PATCH 04/59] Changing ACMLROOT to the more standard cmake name
 ACML_ROOT

---
 src/CMakeLists.txt | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 1b840346..995fa0c0 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -58,7 +58,7 @@ if( CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT )
 	set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" FORCE )
 endif( )
 
-set( ACMLROOT $ENV{ACMLROOT} CACHE PATH "AMD ACML root path")
+set( ACML_ROOT $ENV{ACML_ROOT} CACHE PATH "AMD ACML root path")
 
 if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE Debug CACHE STRING
@@ -110,9 +110,9 @@ if( BUILD_TEST )
 
 		find_path(ACML_INCLUDE_DIRS acml.h
 			HINTS
-				$ENV{ACMLROOT}/include
-				${ACMLROOT}/include
-				${ACMLROOT}/${ACML_SUBDIR}/include
+				$ENV{ACML_ROOT}/include
+				${ACML_ROOT}/include
+				${ACML_ROOT}/${ACML_SUBDIR}/include
 		)
 
 		if( ACML_INCLUDE_DIRS )
@@ -123,15 +123,15 @@ if( BUILD_TEST )
 		if( UNIX )
 			find_library(ACML_LIBRARIES acml acml_mp
 				HINTS
-					$ENV{ACMLROOT}/lib
-					${ACMLROOT}/lib
-					${ACMLROOT}/${ACML_SUBDIR}/lib
+					$ENV{ACML_ROOT}/lib
+					${ACML_ROOT}/lib
+					${ACML_ROOT}/${ACML_SUBDIR}/lib
 			)
 			find_library(_acml_mv_library acml_mv
 				HINTS
-					$ENV{ACMLROOT}/lib
-					${ACMLROOT}/lib
-					${ACMLROOT}/${ACML_SUBDIR}/lib
+					$ENV{ACML_ROOT}/lib
+					${ACML_ROOT}/lib
+					${ACML_ROOT}/${ACML_SUBDIR}/lib
 			)
 			mark_as_advanced(_acml_mv_library)
 		endif( )
@@ -139,9 +139,9 @@ if( BUILD_TEST )
 		if(WIN32)
 			find_library(ACML_LIBRARIES libacml_dll libacml_mp_dll
 				HINTS
-					$ENV{ACMLROOT}/lib
-					${ACMLROOT}/lib
-					${ACMLROOT}/${ACML_SUBDIR}/lib
+					$ENV{ACML_ROOT}/lib
+					${ACML_ROOT}/lib
+					${ACML_ROOT}/${ACML_SUBDIR}/lib
 			)
 		endif( )
 		

From edd8a9e68c9dd10e6979f3f79291f484273fd74f Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Sun, 18 Aug 2013 21:05:51 -0500
Subject: [PATCH 05/59] Adding support for msvc11 compilers.  Changing the
 output directory of tplgen to generate the .clT files out-of-source.  This
 way the source directory is not polluted with generated files

---
 src/include/defbool.h      |  6 +++---
 src/library/CMakeLists.txt | 44 +++++++++++++-------------------------
 src/tests/CMakeLists.txt   |  6 ++++++
 3 files changed, 24 insertions(+), 32 deletions(-)

diff --git a/src/include/defbool.h b/src/include/defbool.h
index e90736dd..26caf6af 100644
--- a/src/include/defbool.h
+++ b/src/include/defbool.h
@@ -18,7 +18,7 @@
 #ifndef DEFBOOL_H_
 #define DEFBOOL_H_
 
-#if defined(_MSC_VER) && _MSC_VER <= 1600
+#if defined(_MSC_VER) && _MSC_VER <= 1700
 
 /*
 FIX for windows compilation
@@ -48,10 +48,10 @@ typedef  int  _Bool;
 #endif /* !__cplusplus */
 
 
-#else /* defined(_MSC_VER) && _MSC_VER <= 1600 */
+#else /* defined(_MSC_VER) && _MSC_VER <= 1700 */
 
 #include <stdbool.h>
 
-#endif /* defined(_MSC_VER) && _MSC_VER <= 1600 */
+#endif /* defined(_MSC_VER) && _MSC_VER <= 1700 */
 
 #endif /* DEFBOOL_H_ */
diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt
index 5bc8e2aa..24e0d3fd 100644
--- a/src/library/CMakeLists.txt
+++ b/src/library/CMakeLists.txt
@@ -277,35 +277,21 @@ if( BLAS_PRINT_BUILD_ERRORS )
     add_definitions( -DPRINT_BUILD_ERRORS )
 endif()
 
-#add_executable(tplgen tools/tplgen/tplgen.cpp)
-if (CMAKE_COMPILER_IS_GNUCXX)
-    include(ExternalProject)
-    ExternalProject_Add(
-        tplgen
-        URL "${CMAKE_SOURCE_DIR}/library/tools/tplgen"
-        INSTALL_COMMAND ""
-    )
-    add_custom_target( GENERATE_CLT
-                   COMMAND ${CMAKE_BINARY_DIR}/library/tplgen-prefix/src/tplgen-build/tplgen -o ../../include/ ${SRC_CL_TEMPLATES}
-                   WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/library/blas/gens/clTemplates
-                )
-    add_dependencies(GENERATE_CLT tplgen)
-else()
-    include(ExternalProject)
-    ExternalProject_Add(
-        tplgen
-        URL "${CMAKE_SOURCE_DIR}/library/tools/tplgen"
-        CONFIGURE_COMMAND "${CMAKE_BINARY_DIR}\\library\\tplgen-prefix\\src\\tplgen\\configure.bat"
-        BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Debug
-#        BUILD_COMMAND MSBuild.exe tplgen.sln /m /fl /flp1:logfile=errors.log;errorsonly /flp2:logfile=warnings.log;warningsonly /t:rebuild
-        INSTALL_COMMAND ""
-    )
-    add_custom_target( GENERATE_CLT
-        COMMAND ${CMAKE_BINARY_DIR}\\library\\tplgen-prefix\\src\\tplgen-build\\Debug\\tplgen.exe -o ..\\..\\include ${SRC_CL_TEMPLATES}
-        WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}\\library\\blas\\gens\\clTemplates
-    )
-    add_dependencies(GENERATE_CLT tplgen)
-endif()
+include( ExternalProject )
+ExternalProject_Add( tplgen
+    URL "${CMAKE_SOURCE_DIR}/library/tools/tplgen"
+    INSTALL_COMMAND ""
+)
+
+ExternalProject_Get_Property( tplgen binary_dir )
+
+add_custom_target( GENERATE_CLT
+    COMMAND ${binary_dir}/Debug/tplgen -o ${clBLAS_BINARY_DIR}/include ${SRC_CL_TEMPLATES}
+    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/library/blas/gens/clTemplates
+)
+
+add_dependencies(GENERATE_CLT tplgen)
+
 add_library(clBLAS SHARED ${CLBLAS_SOURCES} ${GLOBAL_HEADERS} ${SRC_BLAS_HEADERS} ${SRC_BLAS_GENS_HEADERS})
 add_dependencies(clBLAS GENERATE_CLT)
 set_target_properties(clBLAS PROPERTIES VERSION ${clBLAS_VERSION})
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt
index 1f0e07c4..be2473e7 100644
--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -242,6 +242,12 @@ source_group(functional  FILES ${SRC_FUNC} ${FUNC_HEADERS})
 # at paramVal = CL_PROGRAM_BINARIES and several devices in the context
 add_definitions( -DTEST_WITH_SINGLE_DEVICE )
 
+# vs11 needs std::tuples compiled with 10 parameters by default
+# NOTE: this assumes that googletest is compiled with the same preprocessor macro; they must match
+if( MSVC11 )
+	add_definitions( "/D_VARIADIC_MAX=10" )
+endif()
+
 # Having problems on build server, compiling gtest headers with -pedantic; disabling detection of long long
 # http://code.google.com/p/googletest/issues/detail?id=334
 if( CMAKE_COMPILER_IS_GNUCXX )

From b298b43db787307a11385b96aa22c9211de04992 Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Tue, 20 Aug 2013 12:13:09 -0500
Subject: [PATCH 06/59] Removing tplgen configure.bat which is not used
 anymore.  Fixing linux build problem with path differences between gcc and
 msvc

---
 src/library/CMakeLists.txt             | 11 +++++++++--
 src/library/tools/tplgen/configure.bat | 14 --------------
 2 files changed, 9 insertions(+), 16 deletions(-)
 delete mode 100644 src/library/tools/tplgen/configure.bat

diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt
index 24e0d3fd..61464acd 100644
--- a/src/library/CMakeLists.txt
+++ b/src/library/CMakeLists.txt
@@ -285,12 +285,19 @@ ExternalProject_Add( tplgen
 
 ExternalProject_Get_Property( tplgen binary_dir )
 
+set( tplgenBinaryDir "" )
+if( CMAKE_COMPILER_IS_GNUCXX )
+    set( tplgenBinaryDir ${binary_dir} )
+else()
+    set( tplgenBinaryDir "${binary_dir}/Debug" )
+endif()
+
 add_custom_target( GENERATE_CLT
-    COMMAND ${binary_dir}/Debug/tplgen -o ${clBLAS_BINARY_DIR}/include ${SRC_CL_TEMPLATES}
+    COMMAND ${tplgenBinaryDir}/tplgen -o ${clBLAS_BINARY_DIR}/include ${SRC_CL_TEMPLATES}
     WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/library/blas/gens/clTemplates
 )
 
-add_dependencies(GENERATE_CLT tplgen)
+add_dependencies( GENERATE_CLT tplgen )
 
 add_library(clBLAS SHARED ${CLBLAS_SOURCES} ${GLOBAL_HEADERS} ${SRC_BLAS_HEADERS} ${SRC_BLAS_GENS_HEADERS})
 add_dependencies(clBLAS GENERATE_CLT)
diff --git a/src/library/tools/tplgen/configure.bat b/src/library/tools/tplgen/configure.bat
deleted file mode 100644
index b1f3db68..00000000
--- a/src/library/tools/tplgen/configure.bat
+++ /dev/null
@@ -1,14 +0,0 @@
-del CMakeCache.txt
-cmake -DCMAKE_BUILD_TYPE=debug -G "Visual Studio 10" ..\tplgen
-if NOT ERRORLEVEL 1 goto end
-IF ERRORLEVEL 4 goto try9
-IF ERRORLEVEL 3 goto try9
-IF ERRORLEVEL 2 goto try9
-IF ERRORLEVEL 1 goto try9
-goto end
-
-:try9
-del CMakeCache.txt
-cmake -DCMAKE_BUILD_TYPE=Debug -G "Visual Studio 9 2008" ..\tplgen
-
-:end
\ No newline at end of file

From cf89e377c0eb3ee92ba0336bc8dcc7b1ac536938 Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Tue, 20 Aug 2013 13:22:43 -0500
Subject: [PATCH 07/59] Updates to the readme.md file

---
 README.md | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index dfdaa645..43ccdf64 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,6 @@
 clBLAS
 =====
-
-clMATH is a software library containing FFT and BLAS functions written in OpenCL. In addition to GPU devices, the libraries also support running on CPU devices to facilitate debugging and multicore programming.
-
-<a href="http://developer.amd.com/tools-and-sdks/heterogeneous-computing/amd-accelerated-parallel-processing-math-libraries/">APPML 1.10</a> is the most current generally available version of the library, and pre-built binaries are available for download on both Linux and Windows platforms.
-
-This repository houses the code for the OpenCL™ BLAS portion of APPML.  The complete set of BLAS level 1, 2 & 3 routines has been  implemented. Please see <a href="http://www.netlib.org/blas/index.html"> Netlib BLAS </a> for the list of routines.  For more information on supported graphics cards, see the <a href="http://developer.amd.com/tools-and-sdks/heterogeneous-computing/amd-accelerated-parallel-processing-app-sdk/system-requirements-driver-compatibility/">AMD APP System Requirements</a>.
+This repository houses the code for the OpenCL™ BLAS portion of clMath.  The complete set of BLAS level 1, 2 & 3 routines is implemented. Please see <a href="http://www.netlib.org/blas/index.html"> Netlib BLAS </a> for the list of supported routines.  In addition to GPU devices, the library also supports running on CPU devices to facilitate debugging and multicore programming.  <a href="http://developer.amd.com/tools-and-sdks/heterogeneous-computing/amd-accelerated-parallel-processing-math-libraries/">APPML 1.10</a> is the most current generally available pre-packaged binary version of the library available for download for both Linux and Windows platforms.
 
 The primary goal of clBLAS is to make it easier for developers to utilize the inherent performance and power efficiency benefits of heterogeneous computing.  clBLAS interfaces do not hide nor wrap OpenCL interfaces, but rather leaves OpenCL state management to the control of the user to allow for maximum performance and flexibility.  The clBLAS library does generate and enqueue optimized OpenCL kernels, relieving the user from the task of writing, optimizing and maintaining kernel code themselves.
 
@@ -144,7 +139,7 @@ int main( void )
 ## Build dependencies
 ### Library for Windows
 *  Windows® 7/8
-*  Visual Studio 2010 SP1
+*  Visual Studio 2010 SP1, 2012
 *  An OpenCL SDK, such as APP SDK 2.8
 *  Latest CMake
 

From bc5647aa7311d56735b7b25422f822cd8d66cfc2 Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Fri, 30 Aug 2013 13:59:36 -0500
Subject: [PATCH 08/59] Fixed a bug wherein the build would fail if a release
 build was built before a debug build.  Removing /stack from windows builds;
 adding printscreen support for nmake builds.  Changing install suffixes based
 upon user-settable cmake cache variables: SUFFIX_BIN/SUFFIX_LIB

---
 src/CMakeLists.txt                            | 33 +++++++--
 src/client/CMakeLists.txt                     | 14 +---
 src/library/CMakeLists.txt                    | 23 ++----
 .../blas/gens/legacy/tests/CMakeLists.txt     | 21 ++----
 src/library/blas/gens/tests/CMakeLists.txt    | 21 ++----
 src/library/common/tests/CMakeLists.txt       | 21 ++----
 src/library/tools/ktest/CMakeLists.txt        | 21 ++----
 src/library/tools/tune/CMakeLists.txt         | 21 ++----
 src/samples/CMakeLists.txt                    | 57 +++++----------
 src/scripts/perf/CMakeLists.txt               |  8 +--
 src/tests/CMakeLists.txt                      | 71 ++++++-------------
 11 files changed, 106 insertions(+), 205 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 995fa0c0..5a8f427c 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -28,10 +28,14 @@ option( BUILD_KTEST "A command line tool for testing single clBLAS kernel" ON )
 # However, test-correctness can instead use NETLIB as a reference library
 set(CORR_TEST_WITH_ACML ON CACHE BOOL "Use ACML library in correctness tests")
 
-# uncomment these to print compiler invocation lines for nmake files
-# set( CMAKE_START_TEMP_FILE "" )
-# set( CMAKE_END_TEMP_FILE "" )
-# set( CMAKE_VERBOSE_MAKEFILE 1 )
+if( CMAKE_GENERATOR MATCHES "NMake" )
+  option( NMAKE_COMPILE_VERBOSE "Print compile and link strings to the console" OFF )
+  if( NMAKE_COMPILE_VERBOSE )
+    set( CMAKE_START_TEMP_FILE "" )
+    set( CMAKE_END_TEMP_FILE "" )
+    set( CMAKE_VERBOSE_MAKEFILE 1 )
+  endif( )
+endif( )
 
 # If we are on linux, and we wish to link with the netlib BLAS implementation, we need to have a valid fortran compiler
 if( NOT CORR_TEST_WITH_ACML AND NOT WIN32)
@@ -66,16 +70,28 @@ if(NOT CMAKE_BUILD_TYPE)
       FORCE)
 endif()
 
+# These variables are meant to contain string which should be appended to the installation paths 
+# of library and executable binaries, respectively.  They are meant to be user configurable/overridable.  
+set( SUFFIX_LIB_DEFAULT "" )
+set( SUFFIX_BIN_DEFAULT "" )
+
 if(TARGET_PLATFORM EQUAL 32 OR TARGET_PLATFORM EQUAL 64)
     set(TARGET_PLATFORM ${TARGET_PLATFORM} CACHE STRING "Target platform type (32-bit or 64-bit)" FORCE)
+    if( TARGET_PLATFORM EQUAL 64 )
+        set( SUFFIX_LIB_DEFAULT "64" )
+    endif( )
 else()
     if(CMAKE_SIZEOF_VOID_P MATCHES 8)
         set(TARGET_PLATFORM "64" CACHE STRING "Target platform type (32-bit or 64-bit)" FORCE)
+        set( SUFFIX_LIB_DEFAULT "64" )
     else()
         set(TARGET_PLATFORM "32" CACHE STRING "Target platform type (32-bit or 64-bit)" FORCE)
     endif()
 endif()
 
+set( SUFFIX_LIB ${SUFFIX_LIB_DEFAULT} CACHE STRING "String to append to 'lib' install path" )
+set( SUFFIX_BIN ${SUFFIX_BIN_DEFAULT} CACHE STRING "String to append to 'bin' install path" )
+
 if( MSVC_IDE )
     set_property( GLOBAL PROPERTY USE_FOLDERS TRUE )
 endif( )
@@ -202,11 +218,16 @@ if(CMAKE_COMPILER_IS_GNUCXX)
     if(TARGET_PLATFORM EQUAL 32)
         set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-builtin")
     endif()
-endif()
+elseif( MSVC )
+	# CMake sets huge stack frames for windows, for whatever reason.  We go with compiler default.
+	string( REGEX REPLACE "/STACK:[0-9]+" "" CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}" )
+	string( REGEX REPLACE "/STACK:[0-9]+" "" CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}" )
+	string( REGEX REPLACE "/STACK:[0-9]+" "" CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS}" ) 
+endif( )
 
 if (WIN32)
     add_definitions(-D_CRT_SECURE_NO_WARNINGS)
-endif()
+endif( )
 
 #TODO:  We should remove this pre-processor define for our 1.8 build; this means removing our deprecated image functions such as calls clCreateImage2D( )
 add_definitions( -DCL_USE_DEPRECATED_OPENCL_1_1_APIS )
diff --git a/src/client/CMakeLists.txt b/src/client/CMakeLists.txt
index 5154a313..360173d1 100644
--- a/src/client/CMakeLists.txt
+++ b/src/client/CMakeLists.txt
@@ -52,17 +52,9 @@ target_link_libraries(client ${Boost_LIBRARIES} ${OPENCL_LIBRARIES} clBLAS)
 add_executable(testPerfWrapper ${WRAPPER_SRC})
 target_link_libraries(testPerfWrapper ${Boost_LIBRARIES})
 
-if( TARGET_PLATFORM EQUAL 64 )
-    set( BIN_DIR bin64 )
-    set( LIB_DIR lib64 )
-else()
-    set( BIN_DIR bin32 )
-    set( LIB_DIR lib32 )
-endif()
-
 # CPack configuration; include the executable into the package
 install( TARGETS client testPerfWrapper
-		RUNTIME DESTINATION ${BIN_DIR}
-		LIBRARY DESTINATION ${LIB_DIR}
-		ARCHIVE DESTINATION ${LIB_DIR}/import
+		RUNTIME DESTINATION bin${SUFFIX_BIN}
+		LIBRARY DESTINATION lib${SUFFIX_LIB}
+		ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
 		)
diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt
index 61464acd..5ab545a8 100644
--- a/src/library/CMakeLists.txt
+++ b/src/library/CMakeLists.txt
@@ -289,7 +289,7 @@ set( tplgenBinaryDir "" )
 if( CMAKE_COMPILER_IS_GNUCXX )
     set( tplgenBinaryDir ${binary_dir} )
 else()
-    set( tplgenBinaryDir "${binary_dir}/Debug" )
+    set( tplgenBinaryDir "${binary_dir}/${CMAKE_CFG_INTDIR}" )
 endif()
 
 add_custom_target( GENERATE_CLT
@@ -305,18 +305,9 @@ set_target_properties(clBLAS PROPERTIES VERSION ${clBLAS_VERSION})
 set_target_properties(clBLAS PROPERTIES SOVERSION ${clBLAS_SOVERSION})
 target_link_libraries(clBLAS ${OPENCL_LIBRARIES} ${MATH_LIBRARY})
 
-if( TARGET_PLATFORM EQUAL 64 )
-	# CPack configuration; include the executable into the package
-	install( TARGETS clBLAS
-			RUNTIME DESTINATION bin64
-			LIBRARY DESTINATION lib64
-			ARCHIVE DESTINATION lib64/import
-			)
-else()
-	# CPack configuration; include the executable into the package
-	install( TARGETS clBLAS
-			RUNTIME DESTINATION bin32
-			LIBRARY DESTINATION lib32
-			ARCHIVE DESTINATION lib32/import
-			)
-endif()
+# CPack configuration; include the executable into the package
+install( TARGETS clBLAS
+		RUNTIME DESTINATION bin${SUFFIX_BIN}
+		LIBRARY DESTINATION lib${SUFFIX_LIB}
+		ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+		)
diff --git a/src/library/blas/gens/legacy/tests/CMakeLists.txt b/src/library/blas/gens/legacy/tests/CMakeLists.txt
index 9c5a0f37..cf31f1ec 100644
--- a/src/library/blas/gens/legacy/tests/CMakeLists.txt
+++ b/src/library/blas/gens/legacy/tests/CMakeLists.txt
@@ -46,18 +46,9 @@ include_directories(${OPENCL_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_
 add_executable(t_blkmul ${SRC_BLKMUL})
 target_link_libraries(t_blkmul ${OPENCL_LIBRARIES})
 
-if( TARGET_PLATFORM EQUAL 64 )
-	# CPack configuration; include the executable into the package
-	install( TARGETS t_blkmul
-			RUNTIME DESTINATION bin64
-			LIBRARY DESTINATION lib64
-			ARCHIVE DESTINATION lib64/import
-			)
-else()
-	# CPack configuration; include the executable into the package
-	install( TARGETS t_blkmul
-			RUNTIME DESTINATION bin32
-			LIBRARY DESTINATION lib32
-			ARCHIVE DESTINATION lib32/import
-			)
-endif()
+# CPack configuration; include the executable into the package
+install( TARGETS t_blkmul
+		RUNTIME DESTINATION bin${SUFFIX_BIN}
+		LIBRARY DESTINATION lib${SUFFIX_LIB}
+		ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+		)
diff --git a/src/library/blas/gens/tests/CMakeLists.txt b/src/library/blas/gens/tests/CMakeLists.txt
index f945b1eb..3490426d 100644
--- a/src/library/blas/gens/tests/CMakeLists.txt
+++ b/src/library/blas/gens/tests/CMakeLists.txt
@@ -43,18 +43,9 @@ include_directories(${OPENCL_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_
 add_executable(t_tilemul ${SRC_TILEMUL})
 target_link_libraries(t_tilemul ${OPENCL_LIBRARIES})
 
-if( TARGET_PLATFORM EQUAL 64 )
-	# CPack configuration; include the executable into the package
-	install( TARGETS t_tilemul
-			RUNTIME DESTINATION bin64
-			LIBRARY DESTINATION lib64
-			ARCHIVE DESTINATION lib64/import
-			)
-else()
-	# CPack configuration; include the executable into the package
-	install( TARGETS t_tilemul
-			RUNTIME DESTINATION bin32
-			LIBRARY DESTINATION lib32
-			ARCHIVE DESTINATION lib32/import
-			)
-endif()
+# CPack configuration; include the executable into the package
+install( TARGETS t_tilemul
+		RUNTIME DESTINATION bin${SUFFIX_BIN}
+		LIBRARY DESTINATION lib${SUFFIX_LIB}
+		ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+		)
diff --git a/src/library/common/tests/CMakeLists.txt b/src/library/common/tests/CMakeLists.txt
index 213e0bca..c38e59d4 100644
--- a/src/library/common/tests/CMakeLists.txt
+++ b/src/library/common/tests/CMakeLists.txt
@@ -48,18 +48,9 @@ target_link_libraries(t_dblock_kgen ${OPENCL_LIBRARIES} ${MATH_LIBRARY})
 add_executable(t_gens_cache ${SRC_GENS_CACHE})
 target_link_libraries(t_gens_cache ${OPENCL_LIBRARIES} ${MATH_LIBRARY})
 
-if( TARGET_PLATFORM EQUAL 64 )
-	# CPack configuration; include the executable into the package
-	install( TARGETS t_dblock_kgen t_gens_cache
-			RUNTIME DESTINATION bin64
-			LIBRARY DESTINATION lib64
-			ARCHIVE DESTINATION lib64/import
-			)
-else()
-	# CPack configuration; include the executable into the package
-	install( TARGETS t_dblock_kgen t_gens_cache
-			RUNTIME DESTINATION bin32
-			LIBRARY DESTINATION lib32
-			ARCHIVE DESTINATION lib32/import
-			)
-endif()
+# CPack configuration; include the executable into the package
+install( TARGETS t_dblock_kgen t_gens_cache
+		RUNTIME DESTINATION bin${SUFFIX_BIN}
+		LIBRARY DESTINATION lib${SUFFIX_LIB}
+		ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+		)
diff --git a/src/library/tools/ktest/CMakeLists.txt b/src/library/tools/ktest/CMakeLists.txt
index 34828f0e..e86ea004 100644
--- a/src/library/tools/ktest/CMakeLists.txt
+++ b/src/library/tools/ktest/CMakeLists.txt
@@ -141,18 +141,9 @@ add_executable(make-ktest ${KTEST_SRC} ${KTEST_EXTERNAL_SRC})
 add_dependencies(make-ktest GENERATE_CLT)
 target_link_libraries(make-ktest ${OPENCL_LIBRARIES} ${Boost_LIBRARIES} ${MATH_LIBRARY})
 
-if( TARGET_PLATFORM EQUAL 64 )
-	# CPack configuration; include the executable into the package
-	install( TARGETS make-ktest
-			RUNTIME DESTINATION bin64
-			LIBRARY DESTINATION lib64
-			ARCHIVE DESTINATION lib64/import
-			)
-else()
-	# CPack configuration; include the executable into the package
-	install( TARGETS make-ktest
-			RUNTIME DESTINATION bin32
-			LIBRARY DESTINATION lib32
-			ARCHIVE DESTINATION lib32/import
-			)
-endif()
+# CPack configuration; include the executable into the package
+install( TARGETS make-ktest
+		RUNTIME DESTINATION bin${SUFFIX_BIN}
+		LIBRARY DESTINATION lib${SUFFIX_LIB}
+		ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+		)
diff --git a/src/library/tools/tune/CMakeLists.txt b/src/library/tools/tune/CMakeLists.txt
index dbfcce97..b792cc91 100644
--- a/src/library/tools/tune/CMakeLists.txt
+++ b/src/library/tools/tune/CMakeLists.txt
@@ -139,18 +139,9 @@ add_executable(tune ${TOOLS_SRC} ${TOOLS_EXTERNAL_SRC})
 add_dependencies(tune GENERATE_CLT)
 target_link_libraries(tune ${OPENCL_LIBRARIES} ${TIME_LIBRARY} ${MATH_LIBRARY})
 
-if( TARGET_PLATFORM EQUAL 64 )
-	# CPack configuration; include the executable into the package
-	install( TARGETS tune
-			RUNTIME DESTINATION bin64
-			LIBRARY DESTINATION lib64
-			ARCHIVE DESTINATION lib64/import
-			)
-else()
-	# CPack configuration; include the executable into the package
-	install( TARGETS tune
-			RUNTIME DESTINATION bin32
-			LIBRARY DESTINATION lib32
-			ARCHIVE DESTINATION lib32/import
-			)
-endif()
+# CPack configuration; include the executable into the package
+install( TARGETS tune
+		RUNTIME DESTINATION bin${SUFFIX_BIN}
+		LIBRARY DESTINATION lib${SUFFIX_LIB}
+		ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+		)
diff --git a/src/samples/CMakeLists.txt b/src/samples/CMakeLists.txt
index ea9e2b5f..c354ba8c 100644
--- a/src/samples/CMakeLists.txt
+++ b/src/samples/CMakeLists.txt
@@ -260,45 +260,24 @@ add_executable(example_sasum ${SASUM_SAMPLE_SRC})
 target_link_libraries(example_sasum ${OPENCL_LIBRARIES} clBLAS)
 set_property( TARGET example_sasum PROPERTY FOLDER "Samples")
 
-if( TARGET_PLATFORM EQUAL 64 )
-	# CPack configuration; include the executable into the package
-	install( TARGETS example_sgemm example_sgemv example_ssymv example_ssyrk
-             example_ssyr2k example_strmm example_strsm 
-		     example_strmv example_strsv example_sger example_cher example_ssyr 
-		     example_ssyr2 example_cherk example_ssymm example_chemm
-		     example_stpmv example_chpmv example_stpsv example_sspmv example_sspr example_chpr
-		     example_sspr2 example_zhpr2 
-		     example_sgbmv example_stbmv example_ssbmv example_chbmv example_stbsv
-		     example_cher2k
-		     example_sswap example_sscal example_csscal example_scopy example_saxpy example_sdot
-		     example_srotg example_srotmg example_srot example_srotm
-		     example_snrm2 example_sasum example_isamax
-
-			 version
-			 RUNTIME DESTINATION bin64
-			 LIBRARY DESTINATION lib64
-			 ARCHIVE DESTINATION lib64/import
-			)
-else()
-	# CPack configuration; include the executable into the package
-	install( TARGETS example_sgemm example_sgemv example_ssymv example_ssyrk
-             example_ssyr2k example_strmm example_strsm 
-		     example_strmv example_strsv example_sger example_cher example_ssyr 
-		     example_ssyr2 example_cherk example_ssymm example_chemm
-		     example_stpmv example_chpmv example_stpsv example_sspmv example_sspr example_chpr
-		     example_sspr2 example_zhpr2 
-		     example_sgbmv example_stbmv example_ssbmv example_chbmv example_stbsv
-		     example_cher2k
-		     example_sswap example_sscal example_csscal example_scopy example_saxpy example_sdot
-		     example_srotg example_srotmg example_srot example_srotm
-		     example_snrm2 example_sasum example_isamax
-
-			 version
-			 RUNTIME DESTINATION bin32
-			 LIBRARY DESTINATION lib32
-			 ARCHIVE DESTINATION lib32/import
-			)
-endif()
+# CPack configuration; include the executable into the package
+install( TARGETS example_sgemm example_sgemv example_ssymv example_ssyrk
+         example_ssyr2k example_strmm example_strsm 
+         example_strmv example_strsv example_sger example_cher example_ssyr 
+         example_ssyr2 example_cherk example_ssymm example_chemm
+         example_stpmv example_chpmv example_stpsv example_sspmv example_sspr example_chpr
+         example_sspr2 example_zhpr2 
+         example_sgbmv example_stbmv example_ssbmv example_chbmv example_stbsv
+         example_cher2k
+         example_sswap example_sscal example_csscal example_scopy example_saxpy example_sdot
+         example_srotg example_srotmg example_srot example_srotm
+         example_snrm2 example_sasum example_isamax
+
+         version
+        RUNTIME DESTINATION bin${SUFFIX_BIN}
+        LIBRARY DESTINATION lib${SUFFIX_LIB}
+        ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+        )
 
 configure_file( "${PROJECT_SOURCE_DIR}/samples/CMakeLists.pack"
 		"${PROJECT_BINARY_DIR}/samples/CMakeLists.txt" COPYONLY )
diff --git a/src/scripts/perf/CMakeLists.txt b/src/scripts/perf/CMakeLists.txt
index 7b71a092..81d33857 100644
--- a/src/scripts/perf/CMakeLists.txt
+++ b/src/scripts/perf/CMakeLists.txt
@@ -21,10 +21,4 @@ set(GRAPHING_SCRIPTS 	measurePerformance.py
 						performanceUtility.py
 						)
 
-if( TARGET_PLATFORM EQUAL 64 )
-    set( BIN_DIR bin64 )
-else()
-    set( BIN_DIR bin32 )
-endif()
-
-install( FILES ${GRAPHING_SCRIPTS} DESTINATION ${BIN_DIR} )
+install( FILES ${GRAPHING_SCRIPTS} DESTINATION bin${SUFFIX_BIN} )
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt
index be2473e7..61f5e849 100644
--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -276,11 +276,7 @@ endif()
 # It stitches together a path to a previously built static library, based on our 'make install' logic
 # Search for 64bit libs if FIND_LIBRARY_USE_LIB64_PATHS is set to true in the global environment, 32bit libs else
 get_property( LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS )
-if( LIB64 )
-	set( runtime.library "${CMAKE_INSTALL_PREFIX}/lib64" )
-else( )
-	set( runtime.library "${CMAKE_INSTALL_PREFIX}/lib32" )
-endif( )
+set( runtime.library "${CMAKE_INSTALL_PREFIX}/lib${SUFFIX_LIB}" )
 
 if( WIN32 )
 	set( runtime.library "${runtime.library}/import/clBLAS${CMAKE_STATIC_LIBRARY_SUFFIX}" )
@@ -371,22 +367,13 @@ if( GTEST_FOUND )
     set_property( TARGET test-medium PROPERTY FOLDER "Test")
     set_property( TARGET test-short PROPERTY FOLDER "Test")
 
-    if( TARGET_PLATFORM EQUAL 64 )
-	    # CPack configuration; include the executable into the package
-	    install( TARGETS test-correctness test-medium test-short
-			    RUNTIME DESTINATION bin64
-			    LIBRARY DESTINATION lib64
-			    ARCHIVE DESTINATION lib64/import
-			    )
-    else()
-	    # CPack configuration; include the executable into the package
-	    install( TARGETS test-correctness test-medium test-short
-			    RUNTIME DESTINATION bin32
-			    LIBRARY DESTINATION lib32
-			    ARCHIVE DESTINATION lib32/import
-			    )
-    endif()
-
+    # CPack configuration; include the executable into the package
+    install( TARGETS test-correctness test-medium test-short
+            RUNTIME DESTINATION bin${SUFFIX_BIN}
+            LIBRARY DESTINATION lib${SUFFIX_LIB}
+            ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+            )
+    
 	if( ACML_FOUND )
 		include_directories(${OPENCL_INCLUDE_DIRS} ${GTEST_INCLUDE_DIRS}
 			${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_DIR}/tests/include  ${clBLAS_SOURCE_DIR}/include)
@@ -407,21 +394,12 @@ if( GTEST_FOUND )
 
         set_property( TARGET test-performance PROPERTY FOLDER "Test")
 
-		if( TARGET_PLATFORM EQUAL 64 )
-			# CPack configuration; include the executable into the package
-			install( TARGETS test-performance
-					RUNTIME DESTINATION bin64
-					LIBRARY DESTINATION lib64
-					ARCHIVE DESTINATION lib64/import
-					)
-		else()
-			# CPack configuration; include the executable into the package
-			install( TARGETS test-performance
-					RUNTIME DESTINATION bin32
-					LIBRARY DESTINATION lib32
-					ARCHIVE DESTINATION lib32/import
-					)
-		endif()
+        # CPack configuration; include the executable into the package
+        install( TARGETS test-performance
+                RUNTIME DESTINATION bin${SUFFIX_BIN}
+                LIBRARY DESTINATION lib${SUFFIX_LIB}
+                ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+                )
 	endif()
 
 	include_directories(${OPENCL_INCLUDE_DIRS} ${GTEST_INCLUDE_DIRS}
@@ -438,19 +416,10 @@ if( GTEST_FOUND )
 
     set_property( TARGET test-functional PROPERTY FOLDER "Test")
 
-	if( TARGET_PLATFORM EQUAL 64 )
-		# CPack configuration; include the executable into the package
-		install( TARGETS test-functional
-				RUNTIME DESTINATION bin64
-				LIBRARY DESTINATION lib64
-				ARCHIVE DESTINATION lib64/import
-				)
-	else()
-		# CPack configuration; include the executable into the package
-		install( TARGETS test-functional
-				RUNTIME DESTINATION bin32
-				LIBRARY DESTINATION lib32
-				ARCHIVE DESTINATION lib32/import
-				)
-	endif()
+    # CPack configuration; include the executable into the package
+    install( TARGETS test-functional
+            RUNTIME DESTINATION bin${SUFFIX_BIN}
+            LIBRARY DESTINATION lib${SUFFIX_LIB}
+            ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+            )
 endif()

From 2b3d7be163424e835849e4edaf5019ad6378361e Mon Sep 17 00:00:00 2001
From: Pavan Yalamanchili <pavan@accelereyes.com>
Date: Fri, 6 Sep 2013 11:44:58 -0400
Subject: [PATCH 09/59] Change CMAKE version requirement to 2.8

ExternalProject is a feature added in version 2.8
---
 src/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 1b840346..784836c4 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -14,7 +14,7 @@
 # limitations under the License.
 # ########################################################################
 
-cmake_minimum_required(VERSION 2.6)
+cmake_minimum_required(VERSION 2.8)
 
 #User toggle-able options that can be changed on the command line with -D
 option( BUILD_RUNTIME "Build the BLAS runtime library" ON )

From cbb8d82c708f03ab29c42f674c75a604c7585699 Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Sat, 7 Sep 2013 22:23:28 -0500
Subject: [PATCH 10/59] Adding basic support for pkg-config based configuration
 management, for gcc based distributions.

---
 src/library/CMakeLists.txt |  8 ++++++++
 src/library/clBLAS.pc.in   | 12 ++++++++++++
 2 files changed, 20 insertions(+)
 create mode 100644 src/library/clBLAS.pc.in

diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt
index 5ab545a8..f3ac63ee 100644
--- a/src/library/CMakeLists.txt
+++ b/src/library/CMakeLists.txt
@@ -299,6 +299,14 @@ add_custom_target( GENERATE_CLT
 
 add_dependencies( GENERATE_CLT tplgen )
 
+if( CMAKE_COMPILER_IS_GNUCC )
+    configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/clBLAS.pc.in
+                    ${CMAKE_CURRENT_BINARY_DIR}/clBLAS.pc @ONLY )
+
+    install( FILES ${CMAKE_CURRENT_BINARY_DIR}/clBLAS.pc
+             DESTINATION lib${SUFFIX_LIB}/pkgconfig )
+endif( )
+
 add_library(clBLAS SHARED ${CLBLAS_SOURCES} ${GLOBAL_HEADERS} ${SRC_BLAS_HEADERS} ${SRC_BLAS_GENS_HEADERS})
 add_dependencies(clBLAS GENERATE_CLT)
 set_target_properties(clBLAS PROPERTIES VERSION ${clBLAS_VERSION})
diff --git a/src/library/clBLAS.pc.in b/src/library/clBLAS.pc.in
new file mode 100644
index 00000000..3eef330c
--- /dev/null
+++ b/src/library/clBLAS.pc.in
@@ -0,0 +1,12 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}/bin@SUFFIX_BIN@
+includedir=${prefix}/include
+libdir=${exec_prefix}/lib@SUFFIX_LIB@
+
+Name: clFFT
+Description: Open source OpenCL BLAS library
+Version: @CLFFT_VERSION@
+URL: https://github.com/clMathLibraries/clBLAS
+
+Cflags: -I${includedir}
+Libs: -L${libdir} -lclBLAS

From cc2538714e95ec193dde3ea71031438901cd933b Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Tue, 10 Sep 2013 14:42:36 -0500
Subject: [PATCH 11/59] Bumped the SOVERSION to 2 to account for ABI changes;
 fixed several typos in .pc file

---
 src/CMakeLists.txt       | 2 +-
 src/library/clBLAS.pc.in | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a063aeb8..92bfb16d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -51,7 +51,7 @@ set( clBLAS_VERSION_PATCH 0 )
 set( clBLAS_VERSION "${clBLAS_VERSION_MAJOR}.${clBLAS_VERSION_MINOR}.${clBLAS_VERSION_PATCH}")
 
 # Increment this if we break backward compatibility.
-set(clBLAS_SOVERSION 1)
+set( clBLAS_SOVERSION 2 )
 
 # We have custom written Find* modules now in the root source directory
 set( CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${PROJECT_SOURCE_DIR} )
diff --git a/src/library/clBLAS.pc.in b/src/library/clBLAS.pc.in
index 3eef330c..433ca635 100644
--- a/src/library/clBLAS.pc.in
+++ b/src/library/clBLAS.pc.in
@@ -1,11 +1,11 @@
 prefix=@CMAKE_INSTALL_PREFIX@
 exec_prefix=${prefix}/bin@SUFFIX_BIN@
 includedir=${prefix}/include
-libdir=${exec_prefix}/lib@SUFFIX_LIB@
+libdir=${prefix}/lib@SUFFIX_LIB@
 
-Name: clFFT
+Name: clBLAS
 Description: Open source OpenCL BLAS library
-Version: @CLFFT_VERSION@
+Version: @clBLAS_VERSION@
 URL: https://github.com/clMathLibraries/clBLAS
 
 Cflags: -I${includedir}

From d42e68c886f2157cbccf3d77d55863370f030b31 Mon Sep 17 00:00:00 2001
From: Pavan Yalamanchili <pavan@accelereyes.com>
Date: Wed, 11 Sep 2013 18:18:48 -0400
Subject: [PATCH 12/59] Adding CUDA_PATH as an option for OPENCL_ROOT

---
 src/FindOpenCL.cmake | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/FindOpenCL.cmake b/src/FindOpenCL.cmake
index 1cdc43de..7398e03e 100644
--- a/src/FindOpenCL.cmake
+++ b/src/FindOpenCL.cmake
@@ -48,6 +48,8 @@
 #-----------------------
 if( DEFINED ENV{AMDAPPSDKROOT} )
 	set( OPENCL_ROOT $ENV{AMDAPPSDKROOT} CACHE PATH "Environment variable defining the root of OPENCL implementation" )
+elseif( DEFINED ENV{CUDA_PATH} )
+        set( OPENCL_ROOT $ENV{CUDA_PATH} CACHE PATH "Environment variable defining the root of OPENCL implementation" )
 else( )
 	set( OPENCL_ROOT "/usr/lib" CACHE PATH "Environment variable defining the root of OPENCL implementation" )
 endif( )

From bffcca7065ce4cd47f48cc2a3c2ad979ae7e6256 Mon Sep 17 00:00:00 2001
From: Pavan Yalamanchili <pavan@accelereyes.com>
Date: Wed, 11 Sep 2013 18:21:28 -0400
Subject: [PATCH 13/59] CUDA uses Win32 to store 32 bit libraries

---
 src/FindOpenCL.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/FindOpenCL.cmake b/src/FindOpenCL.cmake
index 7398e03e..4491269e 100644
--- a/src/FindOpenCL.cmake
+++ b/src/FindOpenCL.cmake
@@ -85,7 +85,7 @@ else( )
             ${OPENCL_ROOT}/lib
             ENV AMDAPPSDKROOT/lib
 		DOC "OpenCL dynamic library path"
-		PATH_SUFFIXES x86
+		PATH_SUFFIXES x86 Win32
 	)
 endif( )
 mark_as_advanced( OPENCL_LIBRARIES )

From 89d693c98be128fce0aa793be002f30adc69f116 Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Thu, 12 Sep 2013 11:26:44 -0500
Subject: [PATCH 14/59] Cleanup of txt not in the spirit of the clMath Apache
 license

---
 CHANGELOG                           | 31 -----------------------------
 src/clBLAS.def                      | 18 ++++++++++++++---
 src/library/blas/xrotg.c            |  4 ----
 src/library/blas/xrotmg.c           |  4 ----
 src/samples/CMakeLists.pack         | 18 ++++++++++++++---
 src/tests/correctness/corr-rotg.cpp |  4 ----
 6 files changed, 30 insertions(+), 49 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 9cd3d900..03b9faff 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -243,34 +243,3 @@ For example:
 	./example_sgemm
 		- Run a simple client; one example is provided for each supported main 
 		BLAS function family.
-_______________________________________________________________________________
-(C) 2010-2013 Advanced Micro Devices, Inc. All rights reserved. AMD, the AMD 
-Arrow logo, ATI, the ATI logo, Radeon, FireStream, FireGL, Catalyst, and 
-combinations thereof are trademarks of Advanced Micro Devices, Inc. Microsoft 
-(R), Windows, and Windows Vista (R) are registered trademarks of Microsoft 
-Corporation in the U.S. and/or other jurisdictions. OpenCL and the OpenCL logo 
-are trademarks of Apple Inc. used by permission by Khronos. Other names are for 
-informational purposes only and may be trademarks of their respective owners.
-
-The contents of this document are provided in connection with Advanced Micro 
-Devices, Inc. ("AMD") products. AMD makes no representations or warranties with 
-respect to the accuracy or completeness of the contents of this publication and 
-reserves the right to make changes to specifications and product descriptions 
-at any time without notice. The information contained herein may be of a 
-preliminary or advance nature and is subject to change without notice. No 
-license, whether express, implied, arising by estoppel or otherwise, to any 
-intellectual property rights is granted by this publication. Except as set forth
-in AMD's Standard Terms and Conditions of Sale, AMD assumes no liability 
-whatsoever, and disclaims any express or implied warranty, relating to its 
-products including, but not limited to, the implied warranty of 
-merchantability, fitness for a particular purpose, or infringement of any 
-intellectual property right.
-
-AMD's products are not designed, intended, authorized or warranted for use as 
-components in systems intended for surgical implant into the body, or in other 
-applications intended to support or sustain life, or in any other application 
-in which the failure of AMD's product could create a situation where personal 
-injury, death, or severe property or environmental damage may occur. AMD 
-reserves the right to discontinue or make changes to its products at any time 
-without notice.
-_______________________________________________________________________________
diff --git a/src/clBLAS.def b/src/clBLAS.def
index 5111ff2a..0a9f9b6b 100644
--- a/src/clBLAS.def
+++ b/src/clBLAS.def
@@ -1,6 +1,18 @@
-;/***********************************************************************
-;**	Copyright (C) 2010 Advanced Micro Devices, Inc. All Rights Reserved.
-;***********************************************************************/
+;/* ************************************************************************
+; * Copyright 2013 Advanced Micro Devices, Inc.
+; *
+; * Licensed under the Apache License, Version 2.0 (the "License");
+; * you may not use this file except in compliance with the License.
+; * You may obtain a copy of the License at
+; *
+; * http://www.apache.org/licenses/LICENSE-2.0
+; *
+; * Unless required by applicable law or agreed to in writing, software
+; * distributed under the License is distributed on an "AS IS" BASIS,
+; * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; * See the License for the specific language governing permissions and
+; * limitations under the License.
+; * ************************************************************************/
 
 LIBRARY	clBLAS
 
diff --git a/src/library/blas/xrotg.c b/src/library/blas/xrotg.c
index fb9c8e1b..b7e5a0f8 100644
--- a/src/library/blas/xrotg.c
+++ b/src/library/blas/xrotg.c
@@ -14,10 +14,6 @@
  * limitations under the License.
  * ************************************************************************/
 
-/***********************************************************************
-**  Rotgright (C) 2011 Advanced Micro Devices, Inc. All Rights Reserved.
-***********************************************************************/
-
 //#define DEBUG_ROTG
 
 #include <stdio.h>
diff --git a/src/library/blas/xrotmg.c b/src/library/blas/xrotmg.c
index b3c22298..6598229d 100644
--- a/src/library/blas/xrotmg.c
+++ b/src/library/blas/xrotmg.c
@@ -14,10 +14,6 @@
  * limitations under the License.
  * ************************************************************************/
 
-/***********************************************************************
-**  Rotgright (C) 2011 Advanced Micro Devices, Inc. All Rights Reserved.
-***********************************************************************/
-
 #include <stdio.h>
 #include <string.h>
 #include <clBLAS.h>
diff --git a/src/samples/CMakeLists.pack b/src/samples/CMakeLists.pack
index dbf8e7b4..22e16cca 100644
--- a/src/samples/CMakeLists.pack
+++ b/src/samples/CMakeLists.pack
@@ -1,6 +1,18 @@
-#############################################################################
-## Copyright (C) 2010,2011 Advanced Micro Devices, Inc. All Rights Reserved.
-#############################################################################
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
 cmake_minimum_required(VERSION 2.6)
 project(clblas.samples)
 
diff --git a/src/tests/correctness/corr-rotg.cpp b/src/tests/correctness/corr-rotg.cpp
index e26e7cd6..21ef905b 100644
--- a/src/tests/correctness/corr-rotg.cpp
+++ b/src/tests/correctness/corr-rotg.cpp
@@ -14,10 +14,6 @@
  * limitations under the License.
  * ************************************************************************/
 
-/***********************************************************************
-**  Rotgright (C) 2011 Advanced Micro Devices, Inc. All Rights Reserved.
-***********************************************************************/
-
 #include <stdlib.h>             // srand()
 #include <string.h>             // memcpy()
 #include <gtest/gtest.h>

From 1100fc037b8ab759c86f68db159c022139ecb5f6 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Fri, 13 Sep 2013 21:36:09 -0400
Subject: [PATCH 15/59] Fix build problems on OS X.

---
 src/library/blas/generic/kdump.c              |  2 +-
 src/library/blas/gens/legacy/tests/t_blkmul.c |  4 ++
 src/library/blas/gens/tests/t_tilemul.c       |  5 ++-
 src/library/common/tests/t_gens_cache.c       |  4 ++
 src/library/tools/ktest/step.h                |  4 ++
 src/library/tools/ktest/var.h                 |  4 ++
 src/library/tools/tune/CMakeLists.txt         |  2 +-
 src/library/tools/tune/storage_data.h         |  5 ++-
 src/library/tools/tune/storage_io.c           |  1 -
 src/library/tools/tune/toolslib.c             |  1 -
 src/library/tools/tune/toolslib.h             |  4 ++
 src/library/tools/tune/tune.c                 | 37 ++++++++++++++++++-
 12 files changed, 65 insertions(+), 8 deletions(-)

diff --git a/src/library/blas/generic/kdump.c b/src/library/blas/generic/kdump.c
index 5345fc78..a48204a0 100644
--- a/src/library/blas/generic/kdump.c
+++ b/src/library/blas/generic/kdump.c
@@ -17,7 +17,7 @@
 
 #include <stdio.h>
 #include <string.h>
-#include <malloc.h>
+#include <stdlib.h>
 
 #include <cltypes.h>
 #include <clblas-internal.h>
diff --git a/src/library/blas/gens/legacy/tests/t_blkmul.c b/src/library/blas/gens/legacy/tests/t_blkmul.c
index 4983ce0d..590231ee 100644
--- a/src/library/blas/gens/legacy/tests/t_blkmul.c
+++ b/src/library/blas/gens/legacy/tests/t_blkmul.c
@@ -15,7 +15,11 @@
  * ************************************************************************/
 
 
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
 #include <CL/cl.h>
+#endif
 #include <string.h>
 #include <stdlib.h>
 #include <math.h>
diff --git a/src/library/blas/gens/tests/t_tilemul.c b/src/library/blas/gens/tests/t_tilemul.c
index ba4b49c9..4b4dd803 100644
--- a/src/library/blas/gens/tests/t_tilemul.c
+++ b/src/library/blas/gens/tests/t_tilemul.c
@@ -14,8 +14,11 @@
  * limitations under the License.
  * ************************************************************************/
 
-
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
 #include <CL/cl.h>
+#endif
 #include <string.h>
 #include <stdlib.h>
 #include <assert.h>
diff --git a/src/library/common/tests/t_gens_cache.c b/src/library/common/tests/t_gens_cache.c
index 177a25b3..5a2b9823 100644
--- a/src/library/common/tests/t_gens_cache.c
+++ b/src/library/common/tests/t_gens_cache.c
@@ -23,7 +23,11 @@
 #include <stdlib.h>
 #include <string.h>
 
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
 #include <CL/cl.h>
+#endif
 
 #include <kerngen.h>
 #include <kern_cache.h>
diff --git a/src/library/tools/ktest/step.h b/src/library/tools/ktest/step.h
index 7148c726..0472e499 100644
--- a/src/library/tools/ktest/step.h
+++ b/src/library/tools/ktest/step.h
@@ -18,7 +18,11 @@
 #ifndef KTEST_PATTERN_H__
 #define KTEST_PATTERN_H__
 
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
 #include <CL/cl.h>
+#endif
 #include <list>
 #include <map>
 #include <string>
diff --git a/src/library/tools/ktest/var.h b/src/library/tools/ktest/var.h
index 0ebb1078..8bab85e6 100644
--- a/src/library/tools/ktest/var.h
+++ b/src/library/tools/ktest/var.h
@@ -18,7 +18,11 @@
 #ifndef KTEST_VAR_H__
 #define KTEST_VAR_H__
 
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
 #include <CL/cl.h>
+#endif
 #include <string>
 
 namespace clMath {
diff --git a/src/library/tools/tune/CMakeLists.txt b/src/library/tools/tune/CMakeLists.txt
index b792cc91..65bf00e5 100644
--- a/src/library/tools/tune/CMakeLists.txt
+++ b/src/library/tools/tune/CMakeLists.txt
@@ -130,7 +130,7 @@ if( BLAS_DEBUG_TOOLS )
 endif()
 
 # Library with functions for time measurement. In Windows they are included automatically
-if(UNIX)
+if(UNIX AND NOT APPLE)
     set(TIME_LIBRARY "rt")
 endif()
 
diff --git a/src/library/tools/tune/storage_data.h b/src/library/tools/tune/storage_data.h
index 5efcf5ee..3e72e76b 100644
--- a/src/library/tools/tune/storage_data.h
+++ b/src/library/tools/tune/storage_data.h
@@ -18,13 +18,16 @@
 #ifndef STORAGEDATA_H_
 #define STORAGEDATA_H_
 
-#include <malloc.h>
 #include <math.h>
 #include <string.h>
 #include <stdlib.h>
 #include <stdio.h>
 
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
 #include <CL/cl.h>
+#endif
 
 #include <trace_malloc.h>
 
diff --git a/src/library/tools/tune/storage_io.c b/src/library/tools/tune/storage_io.c
index 4d9dd375..b90792b4 100644
--- a/src/library/tools/tune/storage_io.c
+++ b/src/library/tools/tune/storage_io.c
@@ -16,7 +16,6 @@
 
 
 
-#include <malloc.h>
 #include <string.h>
 #include <stdlib.h>
 #include <signal.h>
diff --git a/src/library/tools/tune/toolslib.c b/src/library/tools/tune/toolslib.c
index 680a2197..fc55b8a4 100644
--- a/src/library/tools/tune/toolslib.c
+++ b/src/library/tools/tune/toolslib.c
@@ -15,7 +15,6 @@
  * ************************************************************************/
 
 
-#include <malloc.h>
 #include <string.h>
 #include <stdlib.h>
 #include <signal.h>
diff --git a/src/library/tools/tune/toolslib.h b/src/library/tools/tune/toolslib.h
index 48c27e62..9e08a9a1 100644
--- a/src/library/tools/tune/toolslib.h
+++ b/src/library/tools/tune/toolslib.h
@@ -18,7 +18,11 @@
 #ifndef TOOLSLIB_H__
 #define TOOLSLIB_H__
 
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
 #include <CL/cl.h>
+#endif
 
 #include <defbool.h>
 #include <devinfo.h>
diff --git a/src/library/tools/tune/tune.c b/src/library/tools/tune/tune.c
index d41e45a3..17d7d476 100644
--- a/src/library/tools/tune/tune.c
+++ b/src/library/tools/tune/tune.c
@@ -15,13 +15,16 @@
  * ************************************************************************/
 
 
-#include <malloc.h>
 #include <math.h>
 #include <string.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <assert.h>
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
 #include <CL/cl.h>
+#endif
 
 // #include "fileio.h"
 #include "toolslib.h"
@@ -35,6 +38,10 @@
 
 #if defined(_MSC_VER)
 #include "Windows.h"
+#elif defined(__APPLE__)
+#include <stdint.h>
+#include <mach/mach.h>
+#include <mach/mach_time.h>
 #else
 #include "time.h"
 #endif
@@ -83,7 +90,33 @@ getCurrentTime(void)
      }
      return (nano_time_t)count.QuadPart;
 }
-#else /* defined(_MCS_VER) */
+
+#elif defined(__APPLE__)
+
+typedef uint64_t nano_time_t;
+#define NANOTIME_MAX UINT64_MAX
+
+/* Convert a Mach absolute-time tick count to nanoseconds (ticks * numer / denom). */
+nano_time_t
+conv2nanosec(nano_time_t t)
+{
+    static mach_timebase_info_data_t timebase_info = {0};
+    /* denom stays 0 until the timebase has been queried once */
+    if (timebase_info.denom == 0)
+    {
+        (void)mach_timebase_info(&timebase_info);
+    }
+    /* Let's hope we don't overflow */
+    return (t * timebase_info.numer) / timebase_info.denom;
+}
+
+nano_time_t
+getCurrentTime(void)
+{
+    return mach_absolute_time();
+}
+
+#else
 
 typedef unsigned long nano_time_t;
 #define NANOTIME_MAX (~0UL - 1)

From ba54c93e11de42340288a711983b33831977fbbd Mon Sep 17 00:00:00 2001
From: Timmy <timmy.liu@amd.com>
Date: Wed, 18 Sep 2013 11:06:57 -0500
Subject: [PATCH 16/59] Add Travis CI build; change default device type to CPU.
 Need to enable build-testcorrectness (link to ACML) in the build.

---
 .travis.yml           | 44 +++++++++++++++++++++++++++++++++++++++++++
 README.md             |  4 +++-
 src/client/client.cpp |  2 +-
 3 files changed, 48 insertions(+), 2 deletions(-)
 create mode 100644 .travis.yml

diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 00000000..1039bc46
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,44 @@
+language: cpp
+
+compiler:
+  - gcc
+
+before_install:
+  - sudo apt-get update -qq
+  - sudo apt-get install -qq fglrx opencl-headers libboost-program-options-dev libgtest-dev
+# Uncomment below to help verify the installs above work
+#  - ls -la /usr/lib/libboost*
+#  - ls -la /usr/include/boost
+#  - ls -la /usr/src/gtest
+
+install:
+  - mkdir -p bin/gTest
+  - cd bin/gTest
+  - cmake -DCMAKE_BUILD_TYPE=Release /usr/src/gtest
+  - make
+  - sudo mv libg* /usr/lib
+
+before_script:
+  - cd ${TRAVIS_BUILD_DIR}
+  - mkdir -p bin/clBLAS
+  - cd bin/clBLAS
+  - cmake -DBUILD_TEST=OFF -DBUILD_CLIENT=ON ../../src
+
+script: 
+  - make install
+#  - ls -Rla package
+# Run a simple test to validate that the build works; CPU device in a VM
+  - cd client
+  - export LD_LIBRARY_PATH=${TRAVIS_BUILD_DIR}/bin/clBLAS/package/lib64:${LD_LIBRARY_PATH}
+  - ./client 
+
+after_success:
+  - cd ${TRAVIS_BUILD_DIR}/bin/clBLAS
+  - make package
+
+notifications:
+   email:
+     - clmath-developers@googlegroups.com
+   on_success: change
+   on_failure: always
+   
\ No newline at end of file
diff --git a/README.md b/README.md
index 43ccdf64..caa0e15d 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 clBLAS
 =====
+[![Build Status](https://travis-ci.org/TimmyLiu/clBLAS.png)](https://travis-ci.org/TimmyLiu/clBLAS)
+
 This repository houses the code for the OpenCL™ BLAS portion of clMath.  The complete set of BLAS level 1, 2 & 3 routines is implemented. Please see <a href="http://www.netlib.org/blas/index.html"> Netlib BLAS </a> for the list of supported routines.  In addition to GPU devices, the library also supports running on CPU devices to facilitate debugging and multicore programming.  <a href="http://developer.amd.com/tools-and-sdks/heterogeneous-computing/amd-accelerated-parallel-processing-math-libraries/">APPML 1.10</a> is the most current generally available pre-packaged binary version of the library available for download for both Linux and Windows platforms.
 
 The primary goal of clBLAS is to make it easier for developers to utilize the inherent performance and power efficiency benefits of heterogeneous computing.  clBLAS interfaces do not hide nor wrap OpenCL interfaces, but rather leaves OpenCL state management to the control of the user to allow for maximum performance and flexibility.  The clBLAS library does generate and enqueue optimized OpenCL kernels, relieving the user from the task of writing, optimizing and maintaining kernel code themselves.
@@ -154,4 +156,4 @@ int main( void )
 * Latest Boost
 
 ### Performance infrastructure
-* Python
\ No newline at end of file
+* Python
diff --git a/src/client/client.cpp b/src/client/client.cpp
index 8f60a07a..1bf24541 100644
--- a/src/client/client.cpp
+++ b/src/client/client.cpp
@@ -51,7 +51,7 @@ int main(int argc, char *argv[])
   cl_double beta;
   cl_uint profileCount;
   cl_uint commandQueueFlags = 0;
-  cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
+  cl_device_type deviceType = CL_DEVICE_TYPE_CPU;
   int order_option;
   //clblasOrder order;
   //clblasTranspose transA;

From 10089ceb81397c038b0e9b1907e6e9b865278636 Mon Sep 17 00:00:00 2001
From: Timmy <timmy.liu@amd.com>
Date: Wed, 18 Sep 2013 11:42:01 -0500
Subject: [PATCH 17/59] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index caa0e15d..fdc62cd8 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 clBLAS
 =====
-[![Build Status](https://travis-ci.org/TimmyLiu/clBLAS.png)](https://travis-ci.org/TimmyLiu/clBLAS)
+[![Build Status](https://travis-ci.org/clMathLibraries/clBLAS.png)](https://travis-ci.org/clMathLibraries/clBLAS)
 
 This repository houses the code for the OpenCL™ BLAS portion of clMath.  The complete set of BLAS level 1, 2 & 3 routines is implemented. Please see <a href="http://www.netlib.org/blas/index.html"> Netlib BLAS </a> for the list of supported routines.  In addition to GPU devices, the library also supports running on CPU devices to facilitate debugging and multicore programming.  <a href="http://developer.amd.com/tools-and-sdks/heterogeneous-computing/amd-accelerated-parallel-processing-math-libraries/">APPML 1.10</a> is the most current generally available pre-packaged binary version of the library available for download for both Linux and Windows platforms.
 

From bb0ce37c16d3004eeea202ac851e99d4687be496 Mon Sep 17 00:00:00 2001
From: Arnaud Bergeron <abergeron@gmail.com>
Date: Wed, 18 Sep 2013 21:24:44 -0400
Subject: [PATCH 18/59] Fix compilation warnings:  - assignment used as
 condition warnings for the error checking  - writeable char constant warning.
  - Fix sprintf of size_t values to not (potentially) print garbage.  - Remove
 extra parentheses around condition in if().  - Replace explicit NULL
 dereference by a call to abort().

---
 src/include/kerngen.h                 |  6 ++++++
 src/library/blas/gens/ger_lds.cpp     |  4 ++--
 src/library/blas/gens/her2_lds.cpp    |  2 +-
 src/library/blas/gens/her_lds.cpp     |  2 +-
 src/library/blas/gens/kprintf.cpp     |  2 +-
 src/library/blas/gens/symm_cached.cpp |  8 ++++----
 src/library/blas/gens/syr2_lds.cpp    |  2 +-
 src/library/blas/gens/syr_lds.cpp     |  2 +-
 src/library/blas/gens/trmv_reg.cpp    |  4 ++--
 src/library/blas/gens/trsv_gemv.cpp   | 10 +++++-----
 src/library/blas/xaxpy.c              |  4 ++--
 src/library/blas/xcopy.c              |  4 ++--
 src/library/blas/xdot.c               |  8 ++++----
 src/library/blas/xgemm2.c             |  8 ++++----
 src/library/blas/xger.c               |  8 ++++----
 src/library/blas/xhemv.c              | 10 +++++-----
 src/library/blas/xher.c               |  6 +++---
 src/library/blas/xher2.c              |  8 ++++----
 src/library/blas/xher2k.c             |  8 ++++----
 src/library/blas/xherk.c              |  6 +++---
 src/library/blas/xhpmv.c              | 12 ++++++------
 src/library/blas/xrot.c               |  4 ++--
 src/library/blas/xrotg.c              |  8 ++++----
 src/library/blas/xrotm.c              |  6 +++---
 src/library/blas/xrotmg.c             | 10 +++++-----
 src/library/blas/xscal.c              |  2 +-
 src/library/blas/xshbmv.c             | 10 +++++-----
 src/library/blas/xspmv.c              | 12 ++++++------
 src/library/blas/xswap.c              |  4 ++--
 src/library/blas/xsymm.c              | 10 +++++-----
 src/library/blas/xsymv.c              | 10 +++++-----
 src/library/blas/xsyr.c               |  6 +++---
 src/library/blas/xsyr2.c              |  8 ++++----
 src/library/blas/xsyr2k.c             |  8 ++++----
 src/library/blas/xsyrk.c              |  6 +++---
 src/library/blas/xtbmv.c              |  8 ++++----
 src/library/blas/xtrmm.c              | 10 +++++-----
 src/library/blas/xtrmv.c              |  8 ++++----
 src/library/blas/xtrsm.c              | 10 +++++-----
 src/library/blas/xtrsv.c              |  6 +++---
 src/library/tools/tplgen/tplgen.cpp   |  2 +-
 src/library/tools/tune/subdim.c       |  2 +-
 42 files changed, 140 insertions(+), 134 deletions(-)

diff --git a/src/include/kerngen.h b/src/include/kerngen.h
index dd44b9ea..73ee1912 100644
--- a/src/include/kerngen.h
+++ b/src/include/kerngen.h
@@ -42,6 +42,12 @@
  */
 /*@{*/
 
+#ifdef _MSC_VER
+#define SPREFIX "I"
+#else
+#define SPREFIX "z"
+#endif
+
 #define SUBDIM_UNUSED (size_t)-1
 
 enum {
diff --git a/src/library/blas/gens/ger_lds.cpp b/src/library/blas/gens/ger_lds.cpp
index b74945ea..ebcd577c 100644
--- a/src/library/blas/gens/ger_lds.cpp
+++ b/src/library/blas/gens/ger_lds.cpp
@@ -282,8 +282,8 @@ generator(
 
 	BH = subdims->y;
 	BW = subdims->x;
-	sprintf( bhStr, "%d", BH );
-	sprintf( bwStr, "%d", BW );
+	sprintf( bhStr, "%" SPREFIX "u", BH );
+	sprintf( bwStr, "%" SPREFIX "u", BW );
 
 	#ifdef DEBUG_GER
     printf("BH = %s\n", bhStr);
diff --git a/src/library/blas/gens/her2_lds.cpp b/src/library/blas/gens/her2_lds.cpp
index a409c1ad..e724f118 100644
--- a/src/library/blas/gens/her2_lds.cpp
+++ b/src/library/blas/gens/her2_lds.cpp
@@ -301,7 +301,7 @@ generator(
 	}
 	kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD);
 
-	sprintf( targetRows, "%d", TARGETROWS );
+	sprintf( targetRows, "%" SPREFIX "u", TARGETROWS );
 	sprintf( blockSize, "%d", BLOCKSIZE );
 
 	#ifdef DEBUG_HER2
diff --git a/src/library/blas/gens/her_lds.cpp b/src/library/blas/gens/her_lds.cpp
index e174de2d..6b489cf5 100644
--- a/src/library/blas/gens/her_lds.cpp
+++ b/src/library/blas/gens/her_lds.cpp
@@ -300,7 +300,7 @@ generator(
 	}
 	kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD);
 
-	sprintf( targetRows, "%d", TARGETROWS );
+	sprintf( targetRows, "%" SPREFIX "u", TARGETROWS );
 	sprintf( blockSize, "%d", BLOCKSIZE );
 
 	#ifdef DEBUG_HER
diff --git a/src/library/blas/gens/kprintf.cpp b/src/library/blas/gens/kprintf.cpp
index 54772fa2..d5cbecb8 100644
--- a/src/library/blas/gens/kprintf.cpp
+++ b/src/library/blas/gens/kprintf.cpp
@@ -346,7 +346,7 @@ char* kprintf::mystrtok( char* in, const char* tok)
             bool tokenFound = false;
             for( size_t i=0 ; i <= (strlen(tok) - 1); i++)
             {
-                if ((*strtokPtr == tok[i]))
+                if (*strtokPtr == tok[i])
                 {
                     if ( tok[i] == '(')
                     {
diff --git a/src/library/blas/gens/symm_cached.cpp b/src/library/blas/gens/symm_cached.cpp
index cc8c0350..40011823 100644
--- a/src/library/blas/gens/symm_cached.cpp
+++ b/src/library/blas/gens/symm_cached.cpp
@@ -193,10 +193,10 @@ generator(
 		printf("WARNING: SYMM- generator: subdim and blocksize in-compatible.\n");
 	}
 
-	sprintf(width, "%d", Y);
-	sprintf(itemy, "%lu", ITEMY);
-	sprintf(itemx, "%lu", ITEMX);
-	sprintf(itemy_by_width, "%lu", (size_t) ITEMY/kextra->vecLenA);
+	sprintf(width, "%" SPREFIX "u", Y);
+	sprintf(itemy, "%" SPREFIX "u", ITEMY);
+	sprintf(itemx, "%" SPREFIX "u", ITEMX);
+	sprintf(itemy_by_width, "%" SPREFIX "u", (size_t) ITEMY/kextra->vecLenA);
 
 	kobj.put("%WIDTH", width);
 	kobj.put("%ITEMX", itemx);
diff --git a/src/library/blas/gens/syr2_lds.cpp b/src/library/blas/gens/syr2_lds.cpp
index 9fccb059..4abb4ad9 100644
--- a/src/library/blas/gens/syr2_lds.cpp
+++ b/src/library/blas/gens/syr2_lds.cpp
@@ -308,7 +308,7 @@ generator(
 	}
 	kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD);
 
-	sprintf( targetRows, "%d", TARGETROWS );
+	sprintf( targetRows, "%" SPREFIX "u", TARGETROWS );
 	sprintf( blockSize, "%d", BLOCKSIZE );
 
 	#ifdef DEBUG_SYR2
diff --git a/src/library/blas/gens/syr_lds.cpp b/src/library/blas/gens/syr_lds.cpp
index 0a12ef4e..2379a6b6 100644
--- a/src/library/blas/gens/syr_lds.cpp
+++ b/src/library/blas/gens/syr_lds.cpp
@@ -308,7 +308,7 @@ generator(
 	}
 	kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD);
 
-	sprintf( targetRows, "%d", TARGETROWS );
+	sprintf( targetRows, "%" SPREFIX "u", TARGETROWS );
 	sprintf( blockSize, "%d", BLOCKSIZE );
 
 	#ifdef DEBUG_SYR
diff --git a/src/library/blas/gens/trmv_reg.cpp b/src/library/blas/gens/trmv_reg.cpp
index 28ee1f26..25e750e4 100644
--- a/src/library/blas/gens/trmv_reg.cpp
+++ b/src/library/blas/gens/trmv_reg.cpp
@@ -381,8 +381,8 @@ generator(
 	}
 	kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD);
 
-    sprintf( targetRows, "%d", TARGETROWS );
-	sprintf( blockSize, "%d", BLOCKSIZE );
+    sprintf( targetRows, "%" SPREFIX "u", TARGETROWS );
+	sprintf( blockSize, "%" SPREFIX "u", BLOCKSIZE );
 
 	#ifdef DEBUG_TRMV
     printf("TARGET ROWS = %s\n", targetRows);
diff --git a/src/library/blas/gens/trsv_gemv.cpp b/src/library/blas/gens/trsv_gemv.cpp
index 49d5371b..5047bf17 100644
--- a/src/library/blas/gens/trsv_gemv.cpp
+++ b/src/library/blas/gens/trsv_gemv.cpp
@@ -415,9 +415,9 @@ generator(
 		{
 			return 0;
 		}
-        sprintf( TARGETHEIGHT_S, "%d", TARGETHEIGHT );
+        sprintf( TARGETHEIGHT_S, "%" SPREFIX "u", TARGETHEIGHT );
 	    sprintf( BLOCKSIZE_S, "%d", BLOCKSIZE );
-        sprintf( TRIANGLE_HEIGHT_S, "%d", subdims->y );
+        sprintf( TRIANGLE_HEIGHT_S, "%" SPREFIX "u", subdims->y );
 
 		kobj.put("%TARGET_HEIGHT", TARGETHEIGHT_S);
 		kobj.put("%BLOCKSIZE", BLOCKSIZE_S);
@@ -433,9 +433,9 @@ generator(
 		{
 			return 0;
 		}
-        sprintf( TARGETROWS_S, "%d", TARGETROWS );
-	    sprintf( TARGETWIDTH_S, "%d", TARGETWIDTH );
-        sprintf( NLOOPS_S, "%d", NLOOPS );
+        sprintf( TARGETROWS_S, "%" SPREFIX "u", TARGETROWS );
+	    sprintf( TARGETWIDTH_S, "%" SPREFIX "u", TARGETWIDTH );
+        sprintf( NLOOPS_S, "%" SPREFIX "u", NLOOPS );
 		kobj.put("%TARGET_ROWS", TARGETROWS_S);
 		kobj.put("%TARGET_WIDTH", TARGETWIDTH_S);
 		kobj.put("%NLOOPS", NLOOPS_S);
diff --git a/src/library/blas/xaxpy.c b/src/library/blas/xaxpy.c
index 7499c414..d57b4c23 100644
--- a/src/library/blas/xaxpy.c
+++ b/src/library/blas/xaxpy.c
@@ -60,11 +60,11 @@ doAxpy(
 
 		// Check wheather enough memory was allocated
 
-		if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
 			printf("Invalid Size for X\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
 			printf("Invalid Size for Y\n");
             return retCode;
 		}
diff --git a/src/library/blas/xcopy.c b/src/library/blas/xcopy.c
index e0ea2a03..8e375976 100644
--- a/src/library/blas/xcopy.c
+++ b/src/library/blas/xcopy.c
@@ -60,11 +60,11 @@ doCopy(
 
 		// Check wheather enough memory was allocated
 
-		if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
 			printf("Invalid Size for X\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
 			printf("Invalid Size for Y\n");
             return retCode;
 		}
diff --git a/src/library/blas/xdot.c b/src/library/blas/xdot.c
index f29cdb6f..67bf4cd2 100644
--- a/src/library/blas/xdot.c
+++ b/src/library/blas/xdot.c
@@ -67,20 +67,20 @@ doDot(
 
 		// Check wheather enough memory was allocated
 
-		if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
 			printf("Invalid Size for X\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
 			printf("Invalid Size for Y\n");
             return retCode;
 		}
 		// Minimum size of scratchBuff is N
-		if (retCode = checkVectorSizes(kargs->dtype, N, scratchBuff, 0, 1, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, scratchBuff, 0, 1, X_VEC_ERRSET))) {
 			printf("Insufficient ScratchBuff\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, 1, dotProduct, offDP, 1, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, 1, dotProduct, offDP, 1, Y_VEC_ERRSET))) {
 			printf("Invalid Size for dotProduct\n");
             return retCode;
 		}
diff --git a/src/library/blas/xgemm2.c b/src/library/blas/xgemm2.c
index 0a5ae436..2bba00ae 100644
--- a/src/library/blas/xgemm2.c
+++ b/src/library/blas/xgemm2.c
@@ -209,18 +209,18 @@ doGemm(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET)) {
+    if ((retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET))) {
         return retCode;
     }
     if (K != 0) {
-        if (retCode = checkMatrixSizes(kargs->dtype, order, transA, M, K, A, offA, lda, A_MAT_ERRSET )) {
+        if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, M, K, A, offA, lda, A_MAT_ERRSET))) {
             return retCode;
         }
-        if (retCode = checkMatrixSizes(kargs->dtype, order, transB, K, N, B, offB, ldb, B_MAT_ERRSET )) {
+        if ((retCode = checkMatrixSizes(kargs->dtype, order, transB, K, N, B, offB, ldb, B_MAT_ERRSET))) {
             return retCode;
         }
     }
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, C, offC, ldc, C_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, C, offC, ldc, C_MAT_ERRSET))) {
             return retCode;
     }
 
diff --git a/src/library/blas/xger.c b/src/library/blas/xger.c
index 92d4b311..c9e9e1c9 100644
--- a/src/library/blas/xger.c
+++ b/src/library/blas/xger.c
@@ -58,23 +58,23 @@ doGer(
 
 		/* Validate arguments */
 
-		if (retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET )) {
+		if ((retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
 			printf("Invalid mem object..\n");
             return retCode;
 		}
 
 		// Check wheather enough memory was allocated
 
-		if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, A, offa, lda, A_MAT_ERRSET )) {
+		if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, A, offa, lda, A_MAT_ERRSET))) {
 
 			printf("Invalid Size for A %d\n",retCode );
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, M, X, offx, incx, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, M, X, offx, incx, X_VEC_ERRSET))) {
 			printf("Invalid Size for X\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
 			printf("Invalid Size for Y\n");
             return retCode;
 		}
diff --git a/src/library/blas/xhemv.c b/src/library/blas/xhemv.c
index 0db6a8f9..21011dd7 100644
--- a/src/library/blas/xhemv.c
+++ b/src/library/blas/xhemv.c
@@ -54,17 +54,17 @@ doHemv(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET )) {
+    if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A,
-                         offA, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N,
+                                    A, offA, lda, A_MAT_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET))) {
         return retCode;
     }
 	if ((commandQueues == NULL) || (numCommandQueues == 0))
diff --git a/src/library/blas/xher.c b/src/library/blas/xher.c
index af36962b..7131057c 100644
--- a/src/library/blas/xher.c
+++ b/src/library/blas/xher.c
@@ -56,16 +56,16 @@ doher(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, X, 0, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET )) {
+    if ((retCode = checkMemObjects(A, X, 0, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET))) {
    		printf("Invalid mem object..\n");
         return retCode;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET))) {
         printf("Invalid Size for A\n");
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET)) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
         printf("Invalid Size for X\n");
         return retCode;
     }
diff --git a/src/library/blas/xher2.c b/src/library/blas/xher2.c
index cb676592..21a8ddcf 100644
--- a/src/library/blas/xher2.c
+++ b/src/library/blas/xher2.c
@@ -59,21 +59,21 @@ doHer2(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET )) {
+    if ((retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
         printf("Invalid mem object..\n");
         return retCode;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET))) {
         printf("Invalid Size for A\n");
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
         printf("Invalid Size for X\n");
         return retCode;
     }
 
-	if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+	if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
         printf("Invalid Size for Y\n");
         return retCode;
     }
diff --git a/src/library/blas/xher2k.c b/src/library/blas/xher2k.c
index 302a648b..4c3d2f2a 100644
--- a/src/library/blas/xher2k.c
+++ b/src/library/blas/xher2k.c
@@ -71,7 +71,7 @@ doHer2k(
     }
 
     // Validate arguments
-    if (retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET )) {
+    if ((retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET))) {
         return retCode;
     }
 
@@ -79,15 +79,15 @@ doHer2k(
         return clblasInvalidValue;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offa, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offa, lda, A_MAT_ERRSET))) {
         return retCode;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, B, offb, ldb, B_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, B, offb, ldb, B_MAT_ERRSET))) {
         return retCode;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, C, offc, ldc, C_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, C, offc, ldc, C_MAT_ERRSET))) {
         return retCode;
     }
 
diff --git a/src/library/blas/xherk.c b/src/library/blas/xherk.c
index 18d1fb4d..b4f409d7 100644
--- a/src/library/blas/xherk.c
+++ b/src/library/blas/xherk.c
@@ -64,7 +64,7 @@ doHerk(
     }
 
     // Validate arguments
-    if (retCode = checkMemObjects(A, C, NULL, false, A_MAT_ERRSET, C_MAT_ERRSET, END_ERRSET )) {
+    if ((retCode = checkMemObjects(A, C, NULL, false, A_MAT_ERRSET, C_MAT_ERRSET, END_ERRSET))) {
         return retCode;
     }
 
@@ -72,11 +72,11 @@ doHerk(
         return clblasInvalidValue;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offA, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offA, lda, A_MAT_ERRSET))) {
         return retCode;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, false, N, N, C, offC, ldc, C_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, false, N, N, C, offC, ldc, C_MAT_ERRSET))) {
         return retCode;
     }
 
diff --git a/src/library/blas/xhpmv.c b/src/library/blas/xhpmv.c
index 991819c4..1f0fe67b 100644
--- a/src/library/blas/xhpmv.c
+++ b/src/library/blas/xhpmv.c
@@ -53,17 +53,17 @@ doHpmv(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(AP, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET)) {
+    if ((retCode = checkMemObjects(AP, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, AP,
-                         offa, 0, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N,
+                                    AP, offa, 0, A_MAT_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
         return retCode;
     }
 	if ((commandQueues == NULL) || (numCommandQueues == 0))
@@ -182,4 +182,4 @@ clblasZhpmv(
     return doHpmv(&kargs, order, uplo, N, AP, offa, X, offx, incx,
                   Y, offy, incy, numCommandQueues, commandQueues,
                   numEventsInWaitList, eventWaitList, events);
-}
\ No newline at end of file
+}
diff --git a/src/library/blas/xrot.c b/src/library/blas/xrot.c
index 7fd981bc..d07ec87d 100644
--- a/src/library/blas/xrot.c
+++ b/src/library/blas/xrot.c
@@ -58,11 +58,11 @@ doRot(
 
 		// Check wheather enough memory was allocated
 
-		if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
 			printf("Invalid Size for X\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
 			printf("Invalid Size for Y\n");
             return retCode;
 		}
diff --git a/src/library/blas/xrotg.c b/src/library/blas/xrotg.c
index b7e5a0f8..e4971480 100644
--- a/src/library/blas/xrotg.c
+++ b/src/library/blas/xrotg.c
@@ -69,21 +69,21 @@ doRotg(
 
 		// Check wheather enough memory was allocated
 
-		if (retCode = checkVectorSizes(kargs->dtype, 1, A, offA, 1, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, 1, A, offA, 1, X_VEC_ERRSET))) {
 			printf("Invalid Size for A\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, 1, B, offB, 1, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, 1, B, offB, 1, Y_VEC_ERRSET))) {
 			printf("Invalid Size for B\n");
             return retCode;
 		}
 
-		if (retCode = checkVectorSizes(cType, 1, C, offC, 1, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(cType, 1, C, offC, 1, X_VEC_ERRSET))) {
 			printf("Invalid Size for C\n");
             return retCode;
 		}
 
-		if (retCode = checkVectorSizes(kargs->dtype, 1, S, offS, 1, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, 1, S, offS, 1, Y_VEC_ERRSET))) {
 			printf("Invalid Size for S\n");
             return retCode;
 		}
diff --git a/src/library/blas/xrotm.c b/src/library/blas/xrotm.c
index fcdfcb08..4130cf5d 100644
--- a/src/library/blas/xrotm.c
+++ b/src/library/blas/xrotm.c
@@ -60,15 +60,15 @@ doRotm(
 
 		// Check wheather enough memory was allocated
 
-		if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
 			printf("Invalid Size for X\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
 			printf("Invalid Size for Y\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, 5, param, offParam, 1, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, 5, param, offParam, 1, Y_VEC_ERRSET))) {
 			printf("Invalid Size for PARAM\n"); // PARAM is of minimum length 5
             return retCode;
 		}
diff --git a/src/library/blas/xrotmg.c b/src/library/blas/xrotmg.c
index 6598229d..e6e48b6d 100644
--- a/src/library/blas/xrotmg.c
+++ b/src/library/blas/xrotmg.c
@@ -65,23 +65,23 @@ doRotmg(
 
 		// Check wheather enough memory was allocated
 
-		if (retCode = checkVectorSizes(kargs->dtype, 1, D1, offD1, 1, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, 1, D1, offD1, 1, X_VEC_ERRSET))) {
 			printf("Invalid Size for D1\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, 1, D2, offD2, 1, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, 1, D2, offD2, 1, Y_VEC_ERRSET))) {
 			printf("Invalid Size for D2\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, 1, X1, offX1, 1, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, 1, X1, offX1, 1, X_VEC_ERRSET))) {
 			printf("Invalid Size for X1\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, 1, Y1, offY1, 1, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, 1, Y1, offY1, 1, Y_VEC_ERRSET))) {
 			printf("Invalid Size for Y1\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, 1, param, offParam, 1, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, 1, param, offParam, 1, Y_VEC_ERRSET))) {
 			printf("Invalid Size for PARAM\n");
             return retCode;
 		}
diff --git a/src/library/blas/xscal.c b/src/library/blas/xscal.c
index 6722383a..b2620310 100644
--- a/src/library/blas/xscal.c
+++ b/src/library/blas/xscal.c
@@ -57,7 +57,7 @@ doScal(
 
 		// Check wheather enough memory was allocated
 
-		if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
 			printf("Invalid Size for X\n");
             return retCode;
 		}
diff --git a/src/library/blas/xshbmv.c b/src/library/blas/xshbmv.c
index e0a5087a..94f733da 100644
--- a/src/library/blas/xshbmv.c
+++ b/src/library/blas/xshbmv.c
@@ -68,19 +68,19 @@ doSHbmv(
     }
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET ))
+    if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET)))
     {
         return retCode;
     }
 
-    if (retCode = checkBandedMatrixSizes(kargs->dtype, order, clblasNoTrans,
-                                            N, N, K, 0, A, offa, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkBandedMatrixSizes(kargs->dtype, order, clblasNoTrans,
+                                          N, N, K, 0, A, offa, lda, A_MAT_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET))) {
         return retCode;
     }
 
diff --git a/src/library/blas/xspmv.c b/src/library/blas/xspmv.c
index d522ba84..b40e0269 100644
--- a/src/library/blas/xspmv.c
+++ b/src/library/blas/xspmv.c
@@ -53,17 +53,17 @@ doSpmv(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(AP, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET )) {
+    if ((retCode = checkMemObjects(AP, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, AP,
-                         offa, 0, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N,
+                                    AP, offa, 0, A_MAT_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
         return retCode;
     }
 	if ((commandQueues == NULL) || (numCommandQueues == 0))
@@ -184,4 +184,4 @@ clblasDspmv(
     return doSpmv(&kargs, order, uplo, N, AP, offa, X, offx, incx,
                   Y, offy, incy, numCommandQueues, commandQueues,
                   numEventsInWaitList, eventWaitList, events);
-}
\ No newline at end of file
+}
diff --git a/src/library/blas/xswap.c b/src/library/blas/xswap.c
index 38066186..1d83a5b2 100644
--- a/src/library/blas/xswap.c
+++ b/src/library/blas/xswap.c
@@ -60,11 +60,11 @@ doSwap(
 
 		// Check wheather enough memory was allocated
 
-		if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
 			printf("Invalid Size for X\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
 			printf("Invalid Size for Y\n");
             return retCode;
 		}
diff --git a/src/library/blas/xsymm.c b/src/library/blas/xsymm.c
index e61a33f6..5c87fc6e 100644
--- a/src/library/blas/xsymm.c
+++ b/src/library/blas/xsymm.c
@@ -50,31 +50,31 @@ doSymm(	CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, clblasSide side,
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET )) {
+    if ((retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET))) {
 		printf("SYMM:- Invalid mem object..\n");
         return retCode;
     }
 
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, B, offb, ldb, B_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, B, offb, ldb, B_MAT_ERRSET))) {
 		printf("Invalid Size for B\n");
         return retCode;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, C, offc, ldc, C_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, C, offc, ldc, C_MAT_ERRSET))) {
 		printf("Invalid Size for C\n");
         return retCode;
     }
 	if (side == clblasLeft)
 	{
 		// MxM x MxN
-    	if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, M, A, offa, lda, A_MAT_ERRSET )) {
+    	if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, M, A, offa, lda, A_MAT_ERRSET))) {
 			printf("Invalid Size for A\n");
             return retCode;
     	}
 	} else {
 		// MxN x NxN
-    	if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET )) {
+    	if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET))) {
 			printf("Invalid Size for A\n");
             return retCode;
     	}
diff --git a/src/library/blas/xsymv.c b/src/library/blas/xsymv.c
index 55b23e85..790e8720 100644
--- a/src/library/blas/xsymv.c
+++ b/src/library/blas/xsymv.c
@@ -60,17 +60,17 @@ doSymv(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET)) {
+    if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A,
-                         offA, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N,
+                                    A, offA, lda, A_MAT_ERRSET ))) {
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET ))) {
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET ))) {
         return retCode;
     }
 
diff --git a/src/library/blas/xsyr.c b/src/library/blas/xsyr.c
index d2d1ae7c..9358920f 100644
--- a/src/library/blas/xsyr.c
+++ b/src/library/blas/xsyr.c
@@ -55,7 +55,7 @@ doSyr(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, X, 0, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET )) {
+    if ((retCode = checkMemObjects(A, X, 0, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET))) {
    		printf("Invalid mem object..\n");
         return retCode;
     }
@@ -65,11 +65,11 @@ doSyr(
      * checkMatrixSizes() does not account of "offa" argument.
      * Need to be added.
      */
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET))) {
         printf("Invalid Size for A\n");
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
         printf("Invalid Size for X\n");
         return retCode;
     }
diff --git a/src/library/blas/xsyr2.c b/src/library/blas/xsyr2.c
index 2f0a1856..fddcfbd2 100644
--- a/src/library/blas/xsyr2.c
+++ b/src/library/blas/xsyr2.c
@@ -58,21 +58,21 @@ doSyr2(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET )) {
+    if ((retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
         printf("Invalid mem object..\n");
         return retCode;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET ))) {
         printf("Invalid Size for A\n");
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
         printf("Invalid Size for X\n");
         return retCode;
     }
 
-	if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+	if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
         printf("Invalid Size for Y\n");
         return retCode;
     }
diff --git a/src/library/blas/xsyr2k.c b/src/library/blas/xsyr2k.c
index e99a617b..25ed438c 100644
--- a/src/library/blas/xsyr2k.c
+++ b/src/library/blas/xsyr2k.c
@@ -58,7 +58,7 @@ doSyr2k(
     }
 
     // Validate arguments
-    if (retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET )) {
+    if ((retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET))) {
         return retCode;
     }
 
@@ -66,13 +66,13 @@ doSyr2k(
         return clblasInvalidValue;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, transAB, N, K, A, offA, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, transAB, N, K, A, offA, lda, A_MAT_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkMatrixSizes(kargs->dtype, order, transAB, N, K, B, offB, ldb, B_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, transAB, N, K, B, offB, ldb, B_MAT_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkMatrixSizes(kargs->dtype, order, false, N, N, C, offC, ldc, C_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, false, N, N, C, offC, ldc, C_MAT_ERRSET))) {
         return retCode;
     }
 
diff --git a/src/library/blas/xsyrk.c b/src/library/blas/xsyrk.c
index 4157d5e8..2582830e 100644
--- a/src/library/blas/xsyrk.c
+++ b/src/library/blas/xsyrk.c
@@ -55,7 +55,7 @@ doSyrk(
     }
 
     // Validate arguments
-    if (retCode = checkMemObjects(A, C, NULL, false, A_MAT_ERRSET, C_MAT_ERRSET, END_ERRSET)) {
+    if ((retCode = checkMemObjects(A, C, NULL, false, A_MAT_ERRSET, C_MAT_ERRSET, END_ERRSET))) {
         return retCode;
     }
 
@@ -63,10 +63,10 @@ doSyrk(
         return clblasInvalidValue;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offA, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offA, lda, A_MAT_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkMatrixSizes(kargs->dtype, order, false, N, N, C, offC, ldc, C_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, false, N, N, C, offC, ldc, C_MAT_ERRSET))) {
         return retCode;
     }
 
diff --git a/src/library/blas/xtbmv.c b/src/library/blas/xtbmv.c
index 8f59bc99..b3b0d3b7 100644
--- a/src/library/blas/xtbmv.c
+++ b/src/library/blas/xtbmv.c
@@ -59,20 +59,20 @@ doTbmv(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET )) {
+    if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
 	printf("Invalid mem object..\n");
         return retCode;
     }
 
-    if (retCode = checkBandedMatrixSizes(kargs->dtype, order, trans, N, N, K, 0, A, offa, lda, A_MAT_ERRSET)) {
+    if ((retCode = checkBandedMatrixSizes(kargs->dtype, order, trans, N, N, K, 0, A, offa, lda, A_MAT_ERRSET))) {
 		printf("Invalid Size for A\n");
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET)) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) {
 		printf("Invalid Size for X\n");
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, y, 0, incx, Y_VEC_ERRSET)) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, y, 0, incx, Y_VEC_ERRSET))) {
 		printf("Invalid Size for scratch vector\n");
         return retCode;
     }
diff --git a/src/library/blas/xtrmm.c b/src/library/blas/xtrmm.c
index b7611dae..8aff2079 100644
--- a/src/library/blas/xtrmm.c
+++ b/src/library/blas/xtrmm.c
@@ -55,16 +55,16 @@ doTrmm(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, B, NULL, false, A_MAT_ERRSET, B_MAT_ERRSET, END_ERRSET )) {
+    if ((retCode = checkMemObjects(A, B, NULL, false, A_MAT_ERRSET, B_MAT_ERRSET, END_ERRSET))) {
         return retCode;
     }
     msize = (side == clblasLeft) ? M : N;
-    if (retCode = checkMatrixSizes(kargs->dtype, order, transA, msize, msize, A,
-                         offA, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, msize, msize,
+                                    A, offA, lda, A_MAT_ERRSET ))) {
         return retCode;
     }
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, B,
-                         offB, ldb, B_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N,
+                                    B, offB, ldb, B_MAT_ERRSET ))) {
         return retCode;
     }
 
diff --git a/src/library/blas/xtrmv.c b/src/library/blas/xtrmv.c
index 2f4e2166..145c799f 100644
--- a/src/library/blas/xtrmv.c
+++ b/src/library/blas/xtrmv.c
@@ -57,20 +57,20 @@ doTrmv(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET)) {
+    if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
 	printf("Invalid mem object..\n");
         return retCode;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, trans, N, N, A, offa, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, trans, N, N, A, offa, lda, A_MAT_ERRSET))) {
 		printf("Invalid Size for A\n");
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) {
 		printf("Invalid Size for X\n");
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, y, 0, incx, Y_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, y, 0, incx, Y_VEC_ERRSET))) {
 		printf("Invalid Size for scratch vector\n");
         return retCode;
     }
diff --git a/src/library/blas/xtrsm.c b/src/library/blas/xtrsm.c
index 9fb5b4af..d2fd7f09 100644
--- a/src/library/blas/xtrsm.c
+++ b/src/library/blas/xtrsm.c
@@ -55,17 +55,17 @@ doTrsm(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, B, NULL, false, A_MAT_ERRSET, B_MAT_ERRSET, END_ERRSET )) {
+    if ((retCode = checkMemObjects(A, B, NULL, false, A_MAT_ERRSET, B_MAT_ERRSET, END_ERRSET ))) {
         return retCode;
     }
     msize = (side == clblasLeft) ? M : N;
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, transA, msize, msize, A,
-                         offA, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, msize, msize,
+                                    A, offA, lda, A_MAT_ERRSET ))) {
         return retCode;
     }
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, B,
-                         offB, ldb, B_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N,
+                                    B, offB, ldb, B_MAT_ERRSET ))) {
         return retCode;
     }
 
diff --git a/src/library/blas/xtrsv.c b/src/library/blas/xtrsv.c
index c3342287..1e48349a 100644
--- a/src/library/blas/xtrsv.c
+++ b/src/library/blas/xtrsv.c
@@ -351,7 +351,7 @@ doTrsv(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, x, (cl_mem) NULL, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET)) {
+    if ((retCode = checkMemObjects(A, x, (cl_mem) NULL, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET))) {
 		#ifdef DEBUG_TRSV
 		printf("Invalid mem object..\n");
 		#endif
@@ -363,13 +363,13 @@ doTrsv(
  	 * checkMatrixSizes() does not account for "offa" argument.
  	 * Need to pass "offa" when "checkMatrixSizes()" is changed.
 	 */
-    if (retCode = checkMatrixSizes(kargs->dtype, order, trans, N, N, A, offa, lda, A_MAT_ERRSET)) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, trans, N, N, A, offa, lda, A_MAT_ERRSET))) {
 		#ifdef DEBUG_TRSV
 		printf("Invalid Size for A\n");
 		#endif
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) {
 		#ifdef DEBUG_TRSV
 		printf("Invalid Size for X\n");
 		#endif
diff --git a/src/library/tools/tplgen/tplgen.cpp b/src/library/tools/tplgen/tplgen.cpp
index 25150aa7..e81ecd2d 100644
--- a/src/library/tools/tplgen/tplgen.cpp
+++ b/src/library/tools/tplgen/tplgen.cpp
@@ -73,7 +73,7 @@ int main( int argc, char *argv[] )
     size_t found;
     string str;
     int startOptions = 1;
-    char *outputPrefix = "";
+    const char *outputPrefix = "";
 
     std::cout << "TPLGEN Running.....\n";
     if (argc < 2)
diff --git a/src/library/tools/tune/subdim.c b/src/library/tools/tune/subdim.c
index 37ead334..6eed76f1 100644
--- a/src/library/tools/tune/subdim.c
+++ b/src/library/tools/tune/subdim.c
@@ -364,7 +364,7 @@ nextSubdimElem(SubDimInfo* sd)
 
     // !!! DEBUG
     if (sd->count > 500) {
-        *(int*)0 = 0;
+        abort();
     }
 
     sd->count ++;

From 7ba6961e31aa1e1849ffa69cd16a990a3ed4be4c Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Wed, 25 Sep 2013 16:08:29 -0500
Subject: [PATCH 19/59] Adding guards around the clBLAS version numbers, to
 support an external script setting the library version through the command
 line -D mechanism. Changing the name of AMD_CLBLAS_STORAGE_PATH to
 CLBLAS_STORAGE_PATH. Bugfix for clAmdBlasCgemm() in the legacy wrapper header:
 eliminated the offX parameters, which were not present in the original header.

---
 src/CMakeLists.txt                  | 15 ++++++++++++---
 src/clAmdBlas.h                     |  3 ---
 src/library/tools/tune/storage_io.c |  2 +-
 src/library/tools/tune/tune.c       |  2 +-
 4 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 92bfb16d..a5152420 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -45,9 +45,18 @@ else( )
 endif( )
 
 # Define a version for the code
-set( clBLAS_VERSION_MAJOR 2 )
-set( clBLAS_VERSION_MINOR 1 )
-set( clBLAS_VERSION_PATCH 0 )
+if( NOT DEFINED clBLAS_VERSION_MAJOR )
+    set( clBLAS_VERSION_MAJOR 2 )
+endif( )
+
+if( NOT DEFINED clBLAS_VERSION_MINOR )
+    set( clBLAS_VERSION_MINOR 1 )
+endif( )
+
+if( NOT DEFINED clBLAS_VERSION_PATCH )
+    set( clBLAS_VERSION_PATCH 0 )
+endif( )
+
 set( clBLAS_VERSION "${clBLAS_VERSION_MAJOR}.${clBLAS_VERSION_MINOR}.${clBLAS_VERSION_PATCH}")
 
 # Increment this if we break backward compatibility.
diff --git a/src/clAmdBlas.h b/src/clAmdBlas.h
index 1921473e..c994eccc 100644
--- a/src/clAmdBlas.h
+++ b/src/clAmdBlas.h
@@ -8528,14 +8528,11 @@ clAmdBlasCgemm(
     size_t K,
     FloatComplex alpha,
     const cl_mem A,
-    size_t offA,
     size_t lda,
     const cl_mem B,
-    size_t offB,
     size_t ldb,
     FloatComplex beta,
     cl_mem C,
-    size_t offC,
     size_t ldc,
     cl_uint numCommandQueues,
     cl_command_queue *commandQueues,
diff --git a/src/library/tools/tune/storage_io.c b/src/library/tools/tune/storage_io.c
index b90792b4..8fd3ec83 100644
--- a/src/library/tools/tune/storage_io.c
+++ b/src/library/tools/tune/storage_io.c
@@ -24,7 +24,7 @@
 #include "storage_data.h"
 
 #define  SUBDIM_UNUSED_FILE_VALUE 10000
-const char *ENV_FILE_PATH = "AMD_CLBLAS_STORAGE_PATH";
+const char *ENV_FILE_PATH = "CLBLAS_STORAGE_PATH";
 const char *FileID  = "CBS";
 const char *FileExt = "kdb";
 const char *FileExtTmp = "kdb.tmp";
diff --git a/src/library/tools/tune/tune.c b/src/library/tools/tune/tune.c
index 17d7d476..8050cb5a 100644
--- a/src/library/tools/tune/tune.c
+++ b/src/library/tools/tune/tune.c
@@ -2641,7 +2641,7 @@ main(int argc, char*  argv[])
     clblasSetup();
 
     if (!FILE_PATH){
-        printf("The environment variable 'AMD_CLBLAS_STORAGE_PATH' is not defined\n");
+        printf("The environment variable 'CLBLAS_STORAGE_PATH' is not defined\n");
         exit(EXIT_COD_NO_ENVIRONMENT_VARIABLE);
     }
 

From a9856a66028b0fe5d229cd6ffc6a0ad5a9204662 Mon Sep 17 00:00:00 2001
From: Timmy <timmy.liu@amd.com>
Date: Thu, 26 Sep 2013 10:26:10 -0500
Subject: [PATCH 20/59] fix a bug in performance client. Since the performance
 is calculated by the average of 20 runs, clReleaseMemObject and delete buffer
 should be done in between runs instead of after all runs in the destructor.
 On certain devices and matrix sizes, the client will report slower
 performance due to this inappropriate handling of memory. Also updated the
 Travis CI build script. Force the client test to CPU from the command line
 instead of in the source code.

---
 .travis.yml                  |  2 +-
 src/client/clfunc_common.hpp |  2 +-
 src/client/clfunc_xgemm.hpp  | 23 ++++++++++++++---------
 src/client/clfunc_xgemv.hpp  |  6 ++++++
 src/client/clfunc_xger.hpp   |  6 ++++++
 src/client/clfunc_xgerc.hpp  |  7 ++++++-
 src/client/clfunc_xgeru.hpp  |  7 ++++++-
 src/client/clfunc_xhemm.hpp  |  6 ++++++
 src/client/clfunc_xhemv.hpp  |  7 ++++++-
 src/client/clfunc_xher.hpp   |  7 ++++++-
 src/client/clfunc_xher2.hpp  |  6 ++++++
 src/client/clfunc_xsymm.hpp  | 17 +++++++++++------
 src/client/clfunc_xsymv.hpp  |  6 ++++++
 src/client/clfunc_xsyr.hpp   |  6 ++++++
 src/client/clfunc_xsyr2.hpp  |  7 ++++++-
 src/client/clfunc_xsyr2k.hpp |  7 ++++++-
 src/client/clfunc_xsyrk.hpp  |  6 ++++++
 src/client/clfunc_xtrmm.hpp  | 17 +++++++++++------
 src/client/clfunc_xtrmv.hpp  |  6 ++++++
 src/client/clfunc_xtrsm.hpp  | 17 +++++++++++------
 src/client/clfunc_xtrsv.hpp  |  6 ++++++
 src/client/client.cpp        | 10 ++++++----
 22 files changed, 145 insertions(+), 39 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 1039bc46..725e2020 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -30,7 +30,7 @@ script:
 # Run a simple test to validate that the build works; CPU device in a VM
   - cd client
   - export LD_LIBRARY_PATH=${TRAVIS_BUILD_DIR}/bin/clBLAS/package/lib64:${LD_LIBRARY_PATH}
-  - ./client 
+  - ./client --cpu
 
 after_success:
   - cd ${TRAVIS_BUILD_DIR}/bin/clBLAS
diff --git a/src/client/clfunc_common.hpp b/src/client/clfunc_common.hpp
index 4876daf5..293a3b60 100644
--- a/src/client/clfunc_common.hpp
+++ b/src/client/clfunc_common.hpp
@@ -313,7 +313,7 @@ class clblasFunc
                               size_t M, size_t N, size_t K, size_t lda,
                               size_t ldb, size_t ldc, size_t offA, size_t offBX,
                               size_t offCY, double alpha, double beta) = 0;
-
+	virtual void releaseGPUBuffer_deleteCPUBuffer()=0;
     StatisticalTimer& timer;
     StatisticalTimer::sTimerID timer_id;
 
diff --git a/src/client/clfunc_xgemm.hpp b/src/client/clfunc_xgemm.hpp
index 17223a62..c5f706c0 100644
--- a/src/client/clfunc_xgemm.hpp
+++ b/src/client/clfunc_xgemm.hpp
@@ -62,15 +62,6 @@ class xGemm : public clblasFunc
 
     ~xGemm()
     {
-        delete buffer_.a_;
-        delete buffer_.b_;
-        delete buffer_.c_;
-        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
-                        "releasing buffer A");
-        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_b_),
-                        "releasing buffer B");
-        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
-                        "releasing buffer C");
     }
 
     void call_func()
@@ -659,6 +650,20 @@ class xGemm : public clblasFunc
         buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_ ];
 
     }
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+		//need to do this before we eventually hit the destructor
+		delete buffer_.a_;
+        delete buffer_.b_;
+        delete buffer_.c_;
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
+                        "releasing buffer A");
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_b_),
+                        "releasing buffer B");
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
+                        "releasing buffer C");
+	}
 
 protected:
     void initialize_scalars(double alpha, double beta)
diff --git a/src/client/clfunc_xgemv.hpp b/src/client/clfunc_xgemv.hpp
index 2d1d5b06..cc851094 100644
--- a/src/client/clfunc_xgemv.hpp
+++ b/src/client/clfunc_xgemv.hpp
@@ -286,6 +286,12 @@ class xGemv : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+		//need to do this before we eventually hit the destructor
+		//to-do
+	}
 
 protected:
     void initialize_scalars(double alpha, double beta)
diff --git a/src/client/clfunc_xger.hpp b/src/client/clfunc_xger.hpp
index 05899cd7..d2f36dbc 100644
--- a/src/client/clfunc_xger.hpp
+++ b/src/client/clfunc_xger.hpp
@@ -217,6 +217,12 @@ class xGer : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+		//need to do this before we eventually hit the destructor
+		//to-do
+	}
 
 protected:
   void initialize_scalars(double alpha, double beta)
diff --git a/src/client/clfunc_xgerc.hpp b/src/client/clfunc_xgerc.hpp
index 829d9380..ed39f797 100644
--- a/src/client/clfunc_xgerc.hpp
+++ b/src/client/clfunc_xgerc.hpp
@@ -98,7 +98,12 @@ class xGerc : public clblasFunc
 		{}
 
   void call_func();
-
+  void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+		//need to do this before we eventually hit the destructor
+		//to do
+	}
 protected:
   void initialize_scalars(double alpha, double beta)
   {
diff --git a/src/client/clfunc_xgeru.hpp b/src/client/clfunc_xgeru.hpp
index 8c7d02c9..dbcecc9e 100644
--- a/src/client/clfunc_xgeru.hpp
+++ b/src/client/clfunc_xgeru.hpp
@@ -94,7 +94,12 @@ class xGeru : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
-
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+		//need to do this before we eventually hit the destructor
+		//to-do
+	}
 protected:
 protected:
   void initialize_scalars(double alpha, double beta)
diff --git a/src/client/clfunc_xhemm.hpp b/src/client/clfunc_xhemm.hpp
index 8e46d1e3..8a0c5550 100644
--- a/src/client/clfunc_xhemm.hpp
+++ b/src/client/clfunc_xhemm.hpp
@@ -120,6 +120,12 @@ class xHemm : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+		//need to do this before we eventually hit the destructor
+		//to do
+	}
 
 protected:
 protected:
diff --git a/src/client/clfunc_xhemv.hpp b/src/client/clfunc_xhemv.hpp
index 570c3fce..6211114c 100644
--- a/src/client/clfunc_xhemv.hpp
+++ b/src/client/clfunc_xhemv.hpp
@@ -95,7 +95,12 @@ class xHemv : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
-
+  	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+		//need to do this before we eventually hit the destructor
+		//to do
+	}
 protected:
 protected:
   void initialize_scalars(double alpha, double beta)
diff --git a/src/client/clfunc_xher.hpp b/src/client/clfunc_xher.hpp
index e624b558..5144b22b 100644
--- a/src/client/clfunc_xher.hpp
+++ b/src/client/clfunc_xher.hpp
@@ -90,7 +90,12 @@ class xHer : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
-
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+		//need to do this before we eventually hit the destructor
+		//to do
+	}
 protected:
 protected:
   void initialize_scalars(double alpha, double beta)
diff --git a/src/client/clfunc_xher2.hpp b/src/client/clfunc_xher2.hpp
index 27d95f34..aec7cc83 100644
--- a/src/client/clfunc_xher2.hpp
+++ b/src/client/clfunc_xher2.hpp
@@ -94,6 +94,12 @@ class xHer2 : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+		//need to do this before we eventually hit the destructor
+		//to do
+	}
 protected:
 protected:
   void initialize_scalars(double alpha, double beta)
diff --git a/src/client/clfunc_xsymm.hpp b/src/client/clfunc_xsymm.hpp
index e9fe9818..d067870f 100644
--- a/src/client/clfunc_xsymm.hpp
+++ b/src/client/clfunc_xsymm.hpp
@@ -58,12 +58,6 @@ class xSymm : public clblasFunc
 
   ~xSymm()
   {
-    delete buffer.cpuA;
-    delete buffer.cpuB;
-    delete buffer.cpuC;
-    OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
-    OPENCL_V_THROW( clReleaseMemObject(buffer.B), "releasing buffer B");
-    OPENCL_V_THROW( clReleaseMemObject(buffer.C), "releasing buffer C");
   }
 
   double gflops()
@@ -212,6 +206,17 @@ class xSymm : public clblasFunc
   buffer.cpuC = new T[buffer.N * buffer.ldc];
   buffer.cpuA = new T[buffer.a_num_vectors * buffer.lda];
   }
+  	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+		//need to do this before we eventually hit the destructor
+		delete buffer.cpuA;
+		delete buffer.cpuB;
+		delete buffer.cpuC;
+		OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
+		OPENCL_V_THROW( clReleaseMemObject(buffer.B), "releasing buffer B");
+		OPENCL_V_THROW( clReleaseMemObject(buffer.C), "releasing buffer C");
+	}
 protected:
   void initialize_scalars(double alpha, double beta)
   {
diff --git a/src/client/clfunc_xsymv.hpp b/src/client/clfunc_xsymv.hpp
index 625c7ec7..c9285410 100644
--- a/src/client/clfunc_xsymv.hpp
+++ b/src/client/clfunc_xsymv.hpp
@@ -209,6 +209,12 @@ class xSymv : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+		//need to do this before we eventually hit the destructor
+		//to-do
+	}
 protected:
     void initialize_scalars(double alpha, double beta)
     {
diff --git a/src/client/clfunc_xsyr.hpp b/src/client/clfunc_xsyr.hpp
index 172032c9..4c70e69c 100644
--- a/src/client/clfunc_xsyr.hpp
+++ b/src/client/clfunc_xsyr.hpp
@@ -90,6 +90,12 @@ class xSyr : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+		//need to do this before we eventually hit the destructor
+        //to-do
+	}
 
 protected:
 protected:
diff --git a/src/client/clfunc_xsyr2.hpp b/src/client/clfunc_xsyr2.hpp
index 761c6167..9977d08a 100644
--- a/src/client/clfunc_xsyr2.hpp
+++ b/src/client/clfunc_xsyr2.hpp
@@ -94,7 +94,12 @@ class xSyr2 : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
-
+ 	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+		//need to do this before we eventually hit the destructor
+		//to-do
+	}
 protected:
 protected:
   void initialize_scalars(double alpha, double beta)
diff --git a/src/client/clfunc_xsyr2k.hpp b/src/client/clfunc_xsyr2k.hpp
index 4faa3997..9fb33812 100644
--- a/src/client/clfunc_xsyr2k.hpp
+++ b/src/client/clfunc_xsyr2k.hpp
@@ -376,7 +376,12 @@ class xSyr2k : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
-
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+		//need to do this before we eventually hit the destructor
+		//to-do
+	}
 protected:
     void initialize_scalars(double alpha, double beta)
     {
diff --git a/src/client/clfunc_xsyrk.hpp b/src/client/clfunc_xsyrk.hpp
index 5bfd0e3c..ec842e28 100644
--- a/src/client/clfunc_xsyrk.hpp
+++ b/src/client/clfunc_xsyrk.hpp
@@ -293,6 +293,12 @@ class xSyrk : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+		//need to do this before we eventually hit the destructor
+		//to-do
+	}
 protected:
     void initialize_scalars(double alpha, double beta)
     {
diff --git a/src/client/clfunc_xtrmm.hpp b/src/client/clfunc_xtrmm.hpp
index d47ddfdb..68034570 100644
--- a/src/client/clfunc_xtrmm.hpp
+++ b/src/client/clfunc_xtrmm.hpp
@@ -57,12 +57,6 @@ class xTrmm : public clblasFunc
 
     ~xTrmm()
     {
-        delete buffer_.a_;
-        delete buffer_.b_;
-        OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_a_),
-                       "releasing buffer A");
-        OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_b_),
-                       "releasing buffer B");
     }
 
     void call_func()
@@ -450,6 +444,17 @@ class xTrmm : public clblasFunc
         buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
         buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
 	}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+		//need to do this before we eventually hit the destructor
+        delete buffer_.a_;
+        delete buffer_.b_;
+        OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_a_),
+                       "releasing buffer A");
+        OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_b_),
+                       "releasing buffer B");
+	}
 protected:
     void initialize_scalars(double alpha, double beta)
     {
diff --git a/src/client/clfunc_xtrmv.hpp b/src/client/clfunc_xtrmv.hpp
index 725e9f31..80d5004c 100644
--- a/src/client/clfunc_xtrmv.hpp
+++ b/src/client/clfunc_xtrmv.hpp
@@ -225,6 +225,12 @@ class xTrmv : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+		//need to do this before we eventually hit the destructor
+		//to-do
+	}
 protected:
   void initialize_scalars(double alpha, double beta)
   {
diff --git a/src/client/clfunc_xtrsm.hpp b/src/client/clfunc_xtrsm.hpp
index 8ae85c30..7a86be9e 100644
--- a/src/client/clfunc_xtrsm.hpp
+++ b/src/client/clfunc_xtrsm.hpp
@@ -57,12 +57,6 @@ class xTrsm : public clblasFunc
 
     ~xTrsm()
     {
-        delete buffer_.a_;
-        delete buffer_.b_;
-        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
-                        "releasing buffer A");
-        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_b_),
-                        "releasing buffer B");
     }
 
     void call_func()
@@ -456,6 +450,17 @@ class xTrsm : public clblasFunc
         buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
         buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
 	}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+		//need to do this before we eventually hit the destructor
+        delete buffer_.a_;
+        delete buffer_.b_;
+        OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_a_),
+                       "releasing buffer A");
+        OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_b_),
+                       "releasing buffer B");
+	}
 protected:
     void initialize_scalars(double alpha, double beta)
     {
diff --git a/src/client/clfunc_xtrsv.hpp b/src/client/clfunc_xtrsv.hpp
index f0b728ab..4eb0e5b8 100644
--- a/src/client/clfunc_xtrsv.hpp
+++ b/src/client/clfunc_xtrsv.hpp
@@ -218,6 +218,12 @@ class xTrsv : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+		//need to do this before we eventually hit the destructor
+		//to-do
+	}
 protected:
   void initialize_scalars(double alpha, double beta)
   {
diff --git a/src/client/client.cpp b/src/client/client.cpp
index 1bf24541..74a8eb8d 100644
--- a/src/client/client.cpp
+++ b/src/client/client.cpp
@@ -51,7 +51,7 @@ int main(int argc, char *argv[])
   cl_double beta;
   cl_uint profileCount;
   cl_uint commandQueueFlags = 0;
-  cl_device_type deviceType = CL_DEVICE_TYPE_CPU;
+  cl_device_type deviceType = CL_DEVICE_TYPE_GPU;
   int order_option;
   //clblasOrder order;
   //clblasTranspose transA;
@@ -484,7 +484,8 @@ int main(int argc, char *argv[])
 	my_function->read_gpu_buffer();
     my_function->reset_gpu_write_buffer();*/
 	my_function->roundtrip_func();
-	my_function->reset_gpu_write_buffer();
+	//my_function->reset_gpu_write_buffer();
+	my_function->releaseGPUBuffer_deleteCPUBuffer();
   }
 
   if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE )
@@ -512,7 +513,8 @@ int main(int argc, char *argv[])
     my_function->initialize_gpu_buffer();
     my_function->call_func();
 	my_function->read_gpu_buffer();
-    my_function->reset_gpu_write_buffer();
+    //my_function->reset_gpu_write_buffer();
+	my_function->releaseGPUBuffer_deleteCPUBuffer();
   }
 
   if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE )
@@ -525,7 +527,7 @@ int main(int argc, char *argv[])
       std::endl;
   }
   }
-
+  delete my_function;
   return 0;
 }
 

From a414380026aa7bc1f0927c785c14aaf2d1304e6a Mon Sep 17 00:00:00 2001
From: Christian Kellner <christian@kellner.me>
Date: Tue, 5 Nov 2013 15:04:44 +0100
Subject: [PATCH 21/59] Make GTest work on OS X

* Link against the Accelerate framework for BLAS
* Use cblas for _dotu, et al. calls
* No need for a fortran compiler since we are using Accelerate
---
 src/CMakeLists.txt                         | 10 +++++++--
 src/tests/BlasBase.cpp                     |  2 ++
 src/tests/CMakeLists.txt                   | 18 ++++++++--------
 src/tests/correctness/blas-lapack.c        | 13 +++++++++++-
 src/tests/correctness/blas-lapack.h        |  2 +-
 src/tests/correctness/test-correctness.cpp |  8 ++++++--
 src/tests/timer.c                          | 24 ++++++++++++++++++++++
 7 files changed, 63 insertions(+), 14 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a5152420..76547fc5 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -38,7 +38,7 @@ if( CMAKE_GENERATOR MATCHES "NMake" )
 endif( )
 
 # If we are on linux, and we wish to link with the netlib BLAS implementation, we need to have a valid fortran compiler
-if( NOT CORR_TEST_WITH_ACML AND NOT WIN32)
+if( NOT CORR_TEST_WITH_ACML AND NOT WIN32 AND NOT APPLE )
   project(clBLAS Fortran C CXX )
 else( )
   project(clBLAS C CXX)
@@ -123,7 +123,13 @@ endif()
 # TODO: maybe this could be written using the FindBLAS module in the future
 if( BUILD_TEST )
 	if(NOT CORR_TEST_WITH_ACML)
-		find_package( Netlib COMPONENTS BLAS REQUIRED )
+	        if(APPLE)
+			find_library(BLAS_LIBRARIES Accelerate)
+		       	MARK_AS_ADVANCED(BLAS_LIBRARIES)
+		       	message(STATUS "Using Accelerate framework on Mac OS-X")
+	       	else()
+			find_package( Netlib COMPONENTS BLAS REQUIRED )
+              	endif()
 	else( )
 		# Find ACML BLAS implementation
 		# platform dependent ACML subdirectory
diff --git a/src/tests/BlasBase.cpp b/src/tests/BlasBase.cpp
index 85905994..c012803d 100644
--- a/src/tests/BlasBase.cpp
+++ b/src/tests/BlasBase.cpp
@@ -506,6 +506,8 @@ BlasBase::printEnvInfo(void)
     #else
             std::cout << "(x32)" << std::endl;
     #endif
+#elif defined( __APPLE__ )
+        std::cout << "Apple OS X" << std::endl;
 #else
         std::cout << "Linux" << std::endl;
 #endif
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt
index 61f5e849..d4a03f73 100644
--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -268,7 +268,9 @@ endif( )
 
 # Library with functions for time measurement. In Windows they are included automatically
 if(UNIX)
-    set(TIME_LIBRARY "rt")
+    if(NOT APPLE)
+        set(TIME_LIBRARY "rt")
+    endif()
     set(THREAD_LIBRARY "pthread")
 endif()
 
@@ -334,7 +336,7 @@ if( GTEST_FOUND )
 					  ${CORR_HEADERS} ${TESTS_HEADERS})
 		set_target_properties( test-short PROPERTIES COMPILE_DEFINITIONS SHORT_TESTS )
 
-		if( NOT CORR_TEST_WITH_ACML AND NOT WIN32)
+		if( NOT CORR_TEST_WITH_ACML AND NOT WIN32 AND NOT APPLE)
 			set_target_properties( test-correctness PROPERTIES LINKER_LANGUAGE Fortran )
 			set_target_properties( test-medium PROPERTIES LINKER_LANGUAGE Fortran )
 			set_target_properties( test-short PROPERTIES LINKER_LANGUAGE Fortran )
@@ -346,9 +348,9 @@ if( GTEST_FOUND )
 				target_link_libraries(test-medium ${Netlib_LIBRARIES} ${GTEST_LIBRARIES} clBLAS)
 				target_link_libraries(test-short ${Netlib_LIBRARIES} ${GTEST_LIBRARIES} clBLAS)
 			else( )
-				target_link_libraries(test-correctness BLAS_LIBRARIES ${GTEST_LIBRARIES} clBLAS)
-				target_link_libraries(test-medium BLAS_LIBRARIES ${GTEST_LIBRARIES} clBLAS)
-				target_link_libraries(test-short BLAS_LIBRARIES ${GTEST_LIBRARIES} clBLAS)
+				target_link_libraries(test-correctness ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} clBLAS)
+				target_link_libraries(test-medium ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} clBLAS)
+				target_link_libraries(test-short ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} clBLAS)
 			endif( )
 		else( )
 			if( NETLIB_FOUND )
@@ -356,9 +358,9 @@ if( GTEST_FOUND )
 				target_link_libraries(test-medium ${Netlib_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} )
 				target_link_libraries(test-short ${Netlib_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} )
 			else( )
-				target_link_libraries(test-correctness BLAS_LIBRARIES ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} )
-				target_link_libraries(test-medium BLAS_LIBRARIES ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} )
-				target_link_libraries(test-short BLAS_LIBRARIES ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} )
+				target_link_libraries(test-correctness ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} )
+				target_link_libraries(test-medium ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} )
+				target_link_libraries(test-short ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} )
 			endif( )
 		endif( )
     endif( )
diff --git a/src/tests/correctness/blas-lapack.c b/src/tests/correctness/blas-lapack.c
index a010b7b8..4c93104a 100644
--- a/src/tests/correctness/blas-lapack.c
+++ b/src/tests/correctness/blas-lapack.c
@@ -24,6 +24,9 @@
 #if !defined CORR_TEST_WITH_ACML
 
 #include "blas-lapack.h"
+#if defined(__APPLE__)
+#include <Accelerate/Accelerate.h>
+#endif
 
 void
 sgemv(char transa, int m, int n, float alpha, float *a, int lda, float *x, int incx, float beta, float *y, int incy)
@@ -639,8 +642,10 @@ complex cdotu( int n, complex *x, int incx, complex *y, int incy)
 {
     complex ans;
 
-    #if defined( _WIN32 ) || defined( _WIN64 )
+#if defined( _WIN32 ) || defined( _WIN64 )
         ans = cdotu_(&n, x, &incx, y, &incy);
+    #elif defined( __APPLE__)
+        cblas_cdotu_sub(n, x, incx, y, incy, &ans);
     #else
         cdotusub_(&n, x, &incx, y, &incy, &ans);
     #endif
@@ -654,6 +659,8 @@ doublecomplex zdotu( int n, doublecomplex *x, int incx,  doublecomplex *y, int i
 
     #if defined( _WIN32 ) || defined( _WIN64 )
         ans = zdotu_(&n, x, &incx, y, &incy);
+    #elif defined(__APPLE__)
+        cblas_zdotu_sub(n, x, incx, y, incy, &ans);
     #else
         zdotusub_(&n, x, &incx, y, &incy, &ans);
     #endif
@@ -667,6 +674,8 @@ complex cdotc( int n, complex *x, int incx, complex *y, int incy)
 
     #if defined( _WIN32 ) || defined( _WIN64 )
         ans = cdotc_(&n, x, &incx, y, &incy);
+    #elif defined(__APPLE__)
+        cblas_cdotc_sub(n, x, incx, y, incy, &ans);
     #else
         cdotcsub_(&n, x, &incx, y, &incy, &ans);
     #endif
@@ -680,6 +689,8 @@ doublecomplex zdotc( int n, doublecomplex *x, int incx,  doublecomplex *y, int i
 
     #if defined( _WIN32 ) || defined( _WIN64 )
         ans = zdotc_(&n, x, &incx, y, &incy);
+    #elif defined(__APPLE__)
+        cblas_zdotc_sub(n, x, incx, y, incy, &ans);
     #else
         zdotcsub_(&n, x, &incx, y, &incy, &ans);
     #endif
diff --git a/src/tests/correctness/blas-lapack.h b/src/tests/correctness/blas-lapack.h
index 6dc55ee3..d2db1aa3 100644
--- a/src/tests/correctness/blas-lapack.h
+++ b/src/tests/correctness/blas-lapack.h
@@ -1164,7 +1164,7 @@ void zcopy_(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy);
 float sdot_(int *n, float *x, int *incx, float* y, int *incy);
 double ddot_(int *n, double *x, int *incx, double* y, int *incy);
 
-#if defined( _WIN32 ) || defined( _WIN64 )
+#if defined( _WIN32 ) || defined( _WIN64 ) || defined( __APPLE__)
     complex cdotu_(int *n, complex *x, int *incx, complex* y, int *incy);
     doublecomplex zdotu_(int *n, doublecomplex *x, int *incx, doublecomplex* y, int *incy);
     complex cdotc_(int *n, complex *x, int *incx, complex* y, int *incy);
diff --git a/src/tests/correctness/test-correctness.cpp b/src/tests/correctness/test-correctness.cpp
index 950382e9..7a1a0841 100644
--- a/src/tests/correctness/test-correctness.cpp
+++ b/src/tests/correctness/test-correctness.cpp
@@ -205,7 +205,11 @@ const ComplexLong alphaBetaRange[] = {{50,50}, {20,20}};
 const ComplexLong alphaBeta = {10,10};
 const ComplexLong sflagRange[] = {{-1,0}, {0,0}, {1,0}, {-2,0}};
 
+const ComplexLong rotCosMedium = {0, 3};
+const ComplexLong rotSinMedium = {0, 4};
 
+const ComplexLong rotCosShort = {1, 6};
+const ComplexLong rotSinShort = {1, 2};
 
 #ifdef DO_SPL
 
@@ -316,10 +320,10 @@ INSTANTIATE_TEST_CASE_P(ALL_ROTM, ROTM, Combine(
 #ifdef DO_ROT
 #if defined(SHORT_TESTS)
 INSTANTIATE_TEST_CASE_P(Small_ROT, ROT, Combine(
-        Values(1, 5, 10, 20), Values(1, 6), Values(1, -1), Values(1, 6), Values(1, -1), Values(1, 6), Values(1, 2), Values(1)));
+        Values(1, 5, 10, 20), Values(1, 6), Values(1, -1), Values(1, 6), Values(1, -1), Values(rotCosShort), Values(rotSinShort), Values(1)));
 #elif defined(MEDIUM_TESTS)
 INSTANTIATE_TEST_CASE_P(Medium_ROT, ROT, Combine(
-        Values(64,128,256,512), Values(0,3), Values(1, -3, 3, 1), Values(0,3), Values(1, -3, 3, 1), Values(0, 3), Values(0, 4), Values(1)));
+        Values(64,128,256,512), Values(0,3), Values(1, -3, 3, 1), Values(0,3), Values(1, -3, 3, 1), Values(rotCosMedium), Values(rotSinMedium), Values(1)));
 #else
 INSTANTIATE_TEST_CASE_P(ALL_ROT, ROT, Combine(
         ValuesIn(completeRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(incs),
diff --git a/src/tests/timer.c b/src/tests/timer.c
index e304f4f5..8b9c54d4 100644
--- a/src/tests/timer.c
+++ b/src/tests/timer.c
@@ -79,6 +79,30 @@ sleepTime(nano_time_t time) {
 
 #include <time.h>
 
+#ifdef __APPLE__
+#include <sys/time.h>
+// we dont have clock_gettime on mac, fake it
+// NB: this is *not* nano-second precision
+#define CLOCK_REALTIME 0
+static int
+clock_gettime(int time_id, struct timespec *t)
+{
+  struct timeval nuc;
+  int err;
+
+  err = gettimeofday(&nuc, NULL);
+  if (err != 0) {
+    return err;
+  }
+
+  t->tv_sec = nuc.tv_sec;
+  t->tv_nsec = nuc.tv_usec * 1000;
+
+  return 0;
+}
+#endif
+
+
 nano_time_t
 conv2nanosec(nano_time_t t)
 {

From 6bd10142f48ade74d1ecf4430b2728b2926233ee Mon Sep 17 00:00:00 2001
From: Christian Kellner <christian@kellner.me>
Date: Wed, 6 Nov 2013 14:14:58 +0100
Subject: [PATCH 22/59] OSX: Provide nanosecond resolution timer for tests

Directly implement the getCurrentTime function on OSX instead
of providing a gettimeofday based clock_gettime implementation;
we use mach_absolute_time together with mach_timebase_info.
NB: The multiplication is done first and then the division
because the risk of overflowing seems smaller than precision
loss together with the risk of actually having a ratio of zero,
i.e. when denom > numer, when doing it the other way around.
---
 src/tests/include/timer.h |  6 ++++
 src/tests/timer.c         | 69 +++++++++++++++++++++++----------------
 2 files changed, 47 insertions(+), 28 deletions(-)

diff --git a/src/tests/include/timer.h b/src/tests/include/timer.h
index 29353ff8..41c8e275 100644
--- a/src/tests/include/timer.h
+++ b/src/tests/include/timer.h
@@ -27,6 +27,12 @@ extern "C" {
 typedef unsigned long long nano_time_t;
 #define NANOTIME_MAX (~0ULL - 1)
 
+#elif defined(__APPLE__)
+#include <stdint.h>
+
+typedef uint64_t nano_time_t;
+#define NANOTIME_MAX (UINT64_MAX - 1)
+
 #else
 
 typedef unsigned long nano_time_t;
diff --git a/src/tests/timer.c b/src/tests/timer.c
index 8b9c54d4..01844793 100644
--- a/src/tests/timer.c
+++ b/src/tests/timer.c
@@ -79,27 +79,52 @@ sleepTime(nano_time_t time) {
 
 #include <time.h>
 
-#ifdef __APPLE__
-#include <sys/time.h>
-// we dont have clock_gettime on mac, fake it
-// NB: this is *not* nano-second precision
-#define CLOCK_REALTIME 0
-static int
-clock_gettime(int time_id, struct timespec *t)
+#if defined(__APPLE__) && defined(__MACH__)
+
+#include <assert.h>
+#include <mach/mach.h>
+#include <mach/mach_time.h>
+#include <pthread.h>
+
+// see https://developer.apple.com/library/mac/qa/qa1398/_index.html
+static mach_timebase_info_data_t mtb_;
+
+static void
+init_timebase_conv_(void)
 {
-  struct timeval nuc;
-  int err;
+    kern_return_t err;
 
-  err = gettimeofday(&nuc, NULL);
-  if (err != 0) {
-    return err;
-  }
+    err = mach_timebase_info(&mtb_);
+    assert(err == KERN_SUCCESS);
+}
 
-  t->tv_sec = nuc.tv_sec;
-  t->tv_nsec = nuc.tv_usec * 1000;
+nano_time_t
+getCurrentTime(void)
+{
+     static pthread_once_t once = PTHREAD_ONCE_INIT;
+     uint64_t              now;
 
-  return 0;
+     pthread_once(&once, init_timebase_conv_);
+     now = mach_absolute_time();
+
+     return (now * mtb_.numer) / mtb_.denom;
 }
+
+#else /* ! (_MCS_VER || __APPLE__) */
+
+nano_time_t
+getCurrentTime(void)
+{
+    int err;
+    struct timespec t;
+
+    err = clock_gettime(CLOCK_REALTIME, &t);
+    if (err == 0) {
+        return (t.tv_sec * 1000000000UL + t.tv_nsec);
+    }
+    return 0;
+}
+
 #endif
 
 
@@ -122,18 +147,6 @@ conv2millisec(nano_time_t t)
     return t/1000000;
 }
 
-nano_time_t
-getCurrentTime(void)
-{
-    int err;
-    struct timespec t;
-
-    err = clock_gettime(CLOCK_REALTIME, &t);
-    if (err == 0) {
-        return (t.tv_sec * 1000000000UL + t.tv_nsec);
-    }
-    return 0;
-}
 
 void
 sleepTime(nano_time_t time) {

From 29d0c282fcf3112b9e9cf780bdde9e9527f2a802 Mon Sep 17 00:00:00 2001
From: Christian Kellner <christian@kellner.me>
Date: Thu, 7 Nov 2013 16:45:14 +0100
Subject: [PATCH 23/59] Kernel cache: Store if we have the source for a kernel

Then don't try to estimate the kernel source size if we don't have it:
in fullKernelSize() only add the size of the source code if we still have it,
i.e. did not drop it earlier, because querying it otherwise will crash on OSX 10.9.
This should fix issue #21
---
 src/include/kern_cache.h          | 1 +
 src/library/blas/generic/common.c | 2 ++
 src/library/common/kern_cache.c   | 4 +++-
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/include/kern_cache.h b/src/include/kern_cache.h
index b6749c59..af14a855 100644
--- a/src/include/kern_cache.h
+++ b/src/include/kern_cache.h
@@ -55,6 +55,7 @@ typedef struct Kernel {
     void *extra;
     size_t extraSize;
     void (*dtor)(struct Kernel *kern);
+    int noSource;
 } Kernel;
 
 typedef int
diff --git a/src/library/blas/generic/common.c b/src/library/blas/generic/common.c
index 9e26887d..de99f72a 100644
--- a/src/library/blas/generic/common.c
+++ b/src/library/blas/generic/common.c
@@ -364,6 +364,7 @@ Kernel VISIBILITY_HIDDEN
         kernel->extra = calloc(1, kernel->extraSize);
         *(CLBLASKernExtra*)(kernel->extra) = *extra;
         kernel->dtor = extraDtor;
+        kernel->noSource = 1;
     }
     else {
         putKernel(NULL, kernel);
@@ -491,6 +492,7 @@ Kernel
 #if !defined(KEEP_CLBLAS_KERNEL_SOURCES)
     if (err == CL_SUCCESS) {
         err = dropProgramSource(&kernel->program, context, device);
+        kernel->noSource = 1;
     }
 #endif  /* !DUMP_CLBLAS_KERNELS */
 
diff --git a/src/library/common/kern_cache.c b/src/library/common/kern_cache.c
index 787d139f..1006e482 100644
--- a/src/library/common/kern_cache.c
+++ b/src/library/common/kern_cache.c
@@ -425,7 +425,9 @@ fullKernelSize(Kernel *kern)
         size += allSizes[i];
     }
 
-    clGetProgramInfo(kern->program, CL_PROGRAM_SOURCE, 0, NULL, &retSize);
+    if (!kern->noSource) {
+        clGetProgramInfo(kern->program, CL_PROGRAM_SOURCE, 0, NULL, &retSize);
+    }
 
     return (size + retSize + sizeof(Kernel) + kern->extraSize);
 }

From 3bb2214fe5626689d19976440fbec72d31a5fef8 Mon Sep 17 00:00:00 2001
From: Timmy <timmy.liu@amd.com>
Date: Tue, 12 Nov 2013 09:17:12 -0600
Subject: [PATCH 24/59] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index fdc62cd8..e3a24d1f 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ The [project wiki](https://github.com/clMathLibraries/clBLAS/wiki) contains help
 Please refer to and read the [Contributing](CONTRIBUTING.md) document for guidelines on how to contribute code to this open source project
 
 ## License
-The source for clFFT is licensed under the [Apache License, Version 2.0]( http://www.apache.org/licenses/LICENSE-2.0 )
+The source for clBLAS is licensed under the [Apache License, Version 2.0]( http://www.apache.org/licenses/LICENSE-2.0 )
 
 ## Example
 The simple example below shows how to use clBLAS to compute an OpenCL accelerated SGEMM

From 5472441a834c3a307ab6d98c7311b533ac38e08f Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Fri, 1 Nov 2013 16:26:23 -0500
Subject: [PATCH 25/59] Adding 'staging' directory to make it easier to debug;
 all binary code is copied together Adding new install logic for debug builds
 to copy debug runtimes into package Adding new install logic to copy test
 dependencies into package if tests are built

---
 src/client/CMakeLists.txt                     |  2 +
 src/library/CMakeLists.txt                    | 32 +++++++++
 .../blas/gens/legacy/tests/CMakeLists.txt     |  1 +
 src/library/blas/gens/tests/CMakeLists.txt    |  1 +
 src/library/common/tests/CMakeLists.txt       |  2 +
 src/library/tools/ktest/CMakeLists.txt        |  1 +
 src/library/tools/tune/CMakeLists.txt         |  1 +
 src/tests/CMakeLists.txt                      | 21 +++++-
 src/tests/copyTestDependencies.cmake.in       | 72 +++++++++++++++++++
 9 files changed, 132 insertions(+), 1 deletion(-)
 create mode 100644 src/tests/copyTestDependencies.cmake.in

diff --git a/src/client/CMakeLists.txt b/src/client/CMakeLists.txt
index 360173d1..a647da08 100644
--- a/src/client/CMakeLists.txt
+++ b/src/client/CMakeLists.txt
@@ -48,9 +48,11 @@ include_directories(
 
 add_executable(client ${CLIENT_SRC} ${CLIENT_HEADER})
 target_link_libraries(client ${Boost_LIBRARIES} ${OPENCL_LIBRARIES} clBLAS)
+set_target_properties( client PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 add_executable(testPerfWrapper ${WRAPPER_SRC})
 target_link_libraries(testPerfWrapper ${Boost_LIBRARIES})
+set_target_properties( testPerfWrapper PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 # CPack configuration; include the executable into the package
 install( TARGETS client testPerfWrapper
diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt
index f3ac63ee..f06282e6 100644
--- a/src/library/CMakeLists.txt
+++ b/src/library/CMakeLists.txt
@@ -311,6 +311,7 @@ add_library(clBLAS SHARED ${CLBLAS_SOURCES} ${GLOBAL_HEADERS} ${SRC_BLAS_HEADERS
 add_dependencies(clBLAS GENERATE_CLT)
 set_target_properties(clBLAS PROPERTIES VERSION ${clBLAS_VERSION})
 set_target_properties(clBLAS PROPERTIES SOVERSION ${clBLAS_SOVERSION})
+set_target_properties( clBLAS PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 target_link_libraries(clBLAS ${OPENCL_LIBRARIES} ${MATH_LIBRARY})
 
 # CPack configuration; include the executable into the package
@@ -319,3 +320,34 @@ install( TARGETS clBLAS
 		LIBRARY DESTINATION lib${SUFFIX_LIB}
 		ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
 		)
+
+# For debug builds, include the debug runtimes into the package for testing on non-developer machines
+set( CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS_SKIP true )
+set( CMAKE_INSTALL_DEBUG_LIBRARIES true )
+set( CMAKE_INSTALL_DEBUG_LIBRARIES_ONLY true )
+
+if( WIN32 )
+    set( CLBLAS_RUNTIME_DESTINATION bin${SUFFIX_BIN} )
+else( )
+    set( CLBLAS_RUNTIME_DESTINATION lib${SUFFIX_LIB} )
+endif( )
+
+include( InstallRequiredSystemLibraries )
+
+# Install necessary runtime files for debug builds
+install(    PROGRAMS ${CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS}
+            CONFIGURATIONS Debug
+            DESTINATION ${CLBLAS_RUNTIME_DESTINATION} )
+
+# Install all *.pdb files for debug builds
+install(    DIRECTORY ${PROJECT_BINARY_DIR}/staging/
+            DESTINATION ${CLBLAS_RUNTIME_DESTINATION}
+            OPTIONAL
+            CONFIGURATIONS Debug
+            FILES_MATCHING PATTERN "*.pdb" )
+
+# Install a snapshot of the source as it was for this build; useful for the .pdb's
+install(    DIRECTORY ${PROJECT_SOURCE_DIR}
+            DESTINATION ${CLBLAS_RUNTIME_DESTINATION}
+            OPTIONAL
+            CONFIGURATIONS Debug )
diff --git a/src/library/blas/gens/legacy/tests/CMakeLists.txt b/src/library/blas/gens/legacy/tests/CMakeLists.txt
index cf31f1ec..fae11cc5 100644
--- a/src/library/blas/gens/legacy/tests/CMakeLists.txt
+++ b/src/library/blas/gens/legacy/tests/CMakeLists.txt
@@ -45,6 +45,7 @@ include_directories(${OPENCL_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_
 
 add_executable(t_blkmul ${SRC_BLKMUL})
 target_link_libraries(t_blkmul ${OPENCL_LIBRARIES})
+set_target_properties( t_blkmul PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 # CPack configuration; include the executable into the package
 install( TARGETS t_blkmul
diff --git a/src/library/blas/gens/tests/CMakeLists.txt b/src/library/blas/gens/tests/CMakeLists.txt
index 3490426d..6d10e3fe 100644
--- a/src/library/blas/gens/tests/CMakeLists.txt
+++ b/src/library/blas/gens/tests/CMakeLists.txt
@@ -42,6 +42,7 @@ include_directories(${OPENCL_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_
 
 add_executable(t_tilemul ${SRC_TILEMUL})
 target_link_libraries(t_tilemul ${OPENCL_LIBRARIES})
+set_target_properties( t_tilemul PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 # CPack configuration; include the executable into the package
 install( TARGETS t_tilemul
diff --git a/src/library/common/tests/CMakeLists.txt b/src/library/common/tests/CMakeLists.txt
index c38e59d4..b1e34871 100644
--- a/src/library/common/tests/CMakeLists.txt
+++ b/src/library/common/tests/CMakeLists.txt
@@ -44,9 +44,11 @@ include_directories(${OPENCL_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_
 
 add_executable(t_dblock_kgen ${SRC_DBLOCK_KGEN})
 target_link_libraries(t_dblock_kgen ${OPENCL_LIBRARIES} ${MATH_LIBRARY})
+set_target_properties( t_dblock_kgen PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 add_executable(t_gens_cache ${SRC_GENS_CACHE})
 target_link_libraries(t_gens_cache ${OPENCL_LIBRARIES} ${MATH_LIBRARY})
+set_target_properties( t_gens_cache PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 # CPack configuration; include the executable into the package
 install( TARGETS t_dblock_kgen t_gens_cache
diff --git a/src/library/tools/ktest/CMakeLists.txt b/src/library/tools/ktest/CMakeLists.txt
index e86ea004..2cc8c318 100644
--- a/src/library/tools/ktest/CMakeLists.txt
+++ b/src/library/tools/ktest/CMakeLists.txt
@@ -140,6 +140,7 @@ source_group(\\ FILES ${KTEST_SRC})
 add_executable(make-ktest ${KTEST_SRC} ${KTEST_EXTERNAL_SRC})
 add_dependencies(make-ktest GENERATE_CLT)
 target_link_libraries(make-ktest ${OPENCL_LIBRARIES} ${Boost_LIBRARIES} ${MATH_LIBRARY})
+set_target_properties( make-ktest PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 # CPack configuration; include the executable into the package
 install( TARGETS make-ktest
diff --git a/src/library/tools/tune/CMakeLists.txt b/src/library/tools/tune/CMakeLists.txt
index 65bf00e5..2de5bf3b 100644
--- a/src/library/tools/tune/CMakeLists.txt
+++ b/src/library/tools/tune/CMakeLists.txt
@@ -138,6 +138,7 @@ endif()
 add_executable(tune ${TOOLS_SRC} ${TOOLS_EXTERNAL_SRC})
 add_dependencies(tune GENERATE_CLT)
 target_link_libraries(tune ${OPENCL_LIBRARIES} ${TIME_LIBRARY} ${MATH_LIBRARY})
+set_target_properties( tune PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 # CPack configuration; include the executable into the package
 install( TARGETS tune
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt
index 61f5e849..feda6244 100644
--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -294,14 +294,17 @@ if( GTEST_FOUND )
 	    
 	    add_executable(test-correctness ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL}
 									    ${CORR_HEADERS} ${TESTS_HEADERS})
+        set_target_properties( test-correctness PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 	    add_executable(test-medium ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL}
 							      ${CORR_HEADERS} ${TESTS_HEADERS})
 	    set_target_properties(test-medium PROPERTIES COMPILE_DEFINITIONS MEDIUM_TESTS)
+        set_target_properties( test-medium PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 	    add_executable(test-short ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL}
 							      ${CORR_HEADERS} ${TESTS_HEADERS})
 	    set_target_properties(test-short PROPERTIES COMPILE_DEFINITIONS SHORT_TESTS)
+        set_target_properties( test-short PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 	    # The build server builds the library with gcc 4.1.2 to support Red Hat 5.5, but the test programs must be built with 
 	    # gcc > 4.3.2 to support ACML.  
@@ -325,14 +328,17 @@ if( GTEST_FOUND )
 
 		add_executable(test-correctness ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL}
 						${CORR_HEADERS} ${TESTS_HEADERS})
+        set_target_properties( test-correctness PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 		add_executable(test-medium ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL}
 					  ${CORR_HEADERS} ${TESTS_HEADERS})
 		set_target_properties( test-medium PROPERTIES COMPILE_DEFINITIONS MEDIUM_TESTS )
+        set_target_properties( test-medium PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 		add_executable(test-short ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL}
 					  ${CORR_HEADERS} ${TESTS_HEADERS})
 		set_target_properties( test-short PROPERTIES COMPILE_DEFINITIONS SHORT_TESTS )
+        set_target_properties( test-short PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 		if( NOT CORR_TEST_WITH_ACML AND NOT WIN32)
 			set_target_properties( test-correctness PROPERTIES LINKER_LANGUAGE Fortran )
@@ -374,6 +380,17 @@ if( GTEST_FOUND )
             ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
             )
     
+    get_target_property( testLocation test-correctness LOCATION )
+
+    configure_file(
+        "${CMAKE_CURRENT_SOURCE_DIR}/copyTestDependencies.cmake.in"
+        "${CMAKE_CURRENT_BINARY_DIR}/copyTestDependencies.cmake"
+        @ONLY
+    )
+
+    # Register script at run at install time to analyze the executable and copy dependencies into package
+    install( SCRIPT "${CMAKE_CURRENT_BINARY_DIR}/copyTestDependencies.cmake")
+ 
 	if( ACML_FOUND )
 		include_directories(${OPENCL_INCLUDE_DIRS} ${GTEST_INCLUDE_DIRS}
 			${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_DIR}/tests/include  ${clBLAS_SOURCE_DIR}/include)
@@ -384,6 +401,7 @@ if( GTEST_FOUND )
 			${SRC_COMMON_TIMER} ${PERF_HEADERS} ${TESTS_HEADERS}
 			${SRC_COMMON_REFIMPL})
 		target_link_libraries(test-performance ${ACML_LIBRARIES})
+        set_target_properties( test-performance PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 
 		if( BUILD_RUNTIME )
@@ -408,8 +426,9 @@ if( GTEST_FOUND )
 	add_executable(test-functional ${SRC_FUNC} ${SRC_COMMON} ${SRC_COMMON_TIMER}
 								  ${FUNC_HEADERS} ${TESTS_HEADERS})
 								  
+    set_target_properties( test-functional PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 	if( BUILD_RUNTIME )
-		target_link_libraries(test-functional ${GTEST_LIBRARIES} ${TIME_LIBRARY} clBLAS)
+		target_link_libraries(test-functional ${GTEST_LIBRARIES} ${TIME_LIBRARY} ${THREAD_LIBRARY} clBLAS )
 	else()
 		target_link_libraries(test-functional ${GTEST_LIBRARIES} ${TIME_LIBRARY} ${THREAD_LIBRARY} ${OPENCL_LIBRARIES} ${runtime.library} )
 	endif()
diff --git a/src/tests/copyTestDependencies.cmake.in b/src/tests/copyTestDependencies.cmake.in
new file mode 100644
index 00000000..3a5d395b
--- /dev/null
+++ b/src/tests/copyTestDependencies.cmake.in
@@ -0,0 +1,72 @@
+# Customized install script for the test-correctness program; analyzes all of its shared library
+# dependencies and installs the dependencies into the package
+#
+# This file is a configure_file( ) template.  Tokens delimited by at-signs are replaced when the
+# project is configured; ${...} references are left alone and evaluated when the install runs.
+# Configure-time inputs: testLocation, SUFFIX_BIN, SUFFIX_LIB, ACML_ROOT, GTEST_ROOT
+# Install-time inputs:   CMAKE_INSTALL_CONFIG_NAME, CMAKE_INSTALL_PREFIX, DESTDIR (environment)
+include( GetPrerequisites )
+
+# The Microsoft IDE presents a challenge because the full configuration is not known at cmake time;
+# the recorded test location can carry a literal $(Configuration) placeholder instead.
+# Substitute the name of the configuration that is actually being installed.  Using the name
+# verbatim covers Debug, Release, MinSizeRel, RelWithDebInfo and any custom configuration, and a
+# location that holds no placeholder passes through unchanged.
+string( REPLACE "\$(Configuration)" "${CMAKE_INSTALL_CONFIG_NAME}" fixedTestLocation "@testLocation@" )
+
+# Nothing can be analyzed when the executable was not built; say so instead of failing obscurely
+if( NOT EXISTS "${fixedTestLocation}" )
+    message( WARNING "copyTestDependencies: '${fixedTestLocation}' not found; no dependencies copied" )
+    return( )
+endif( )
+
+# Get the directory that the test executable resides in; this helps get_prerequisites( ) find
+# dependent libraries.  PATH is the spelling of DIRECTORY that older CMake releases understand.
+get_filename_component( testDir "${fixedTestLocation}" PATH )
+
+# Dependencies are copied below the install prefix: bin on Windows, lib everywhere else.
+# DESTDIR is prepended just as the generated install rules do, so a staged install keeps these
+# files together with the rest of the package; it expands to nothing when DESTDIR is not set.
+set( installRoot "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}" )
+if( WIN32 )
+    set( installPath "${installRoot}/bin@SUFFIX_BIN@" )
+else( )
+    set( installPath "${installRoot}/lib@SUFFIX_LIB@" )
+endif( )
+
+# Directories searched for dependencies; only locations that exist on this machine are added
+set( depList "" )
+
+if( EXISTS "@ACML_ROOT@" )
+    list( APPEND depList "@ACML_ROOT@/lib" )
+endif( )
+
+if( EXISTS "@GTEST_ROOT@" )
+    list( APPEND depList "@GTEST_ROOT@/lib@SUFFIX_LIB@" )
+endif( )
+
+if( EXISTS "${testDir}" )
+    list( APPEND depList "${testDir}" )
+endif( )
+
+# This retrieves a list of shared library dependencies from the target; they are not full path names
+# The arguments 1 and 0 skip system dependencies and skip recursion
+get_prerequisites( "${fixedTestLocation}" testDependencies 1 0 "" "${depList}" )
+
+# Loop on queried library dependencies and copy them into package
+foreach( dep ${testDependencies} )
+    # This converts the dependency into a full path
+    gp_resolve_item( "${fixedTestLocation}" "${dep}" "" "${depList}" dep_test_path )
+
+    # In linux, the dep_test_path may point to a symbolic link, we also need to copy real file
+    get_filename_component( dep_realpath "${dep_test_path}" REALPATH )
+    get_filename_component( dep_name "${dep_test_path}" NAME )
+
+    # Leave the library alone when a file of the same name already sits in the destination
+    if( NOT EXISTS "${installPath}/${dep_name}" )
+        file( INSTALL "${dep_test_path}" "${dep_realpath}"
+              USE_SOURCE_PERMISSIONS
+              DESTINATION "${installPath}"
+            )
+    endif( )
+endforeach( )

From 546bd32d55d479df7856ca67d5c2f165077187ae Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Wed, 20 Nov 2013 11:04:16 -0600
Subject: [PATCH 26/59] Modified travis build file to make release builds, and
 to run unit test in package directory

---
 .travis.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 725e2020..2d30e3b3 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -22,13 +22,13 @@ before_script:
   - cd ${TRAVIS_BUILD_DIR}
   - mkdir -p bin/clBLAS
   - cd bin/clBLAS
-  - cmake -DBUILD_TEST=OFF -DBUILD_CLIENT=ON ../../src
+  - cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TEST=OFF -DBUILD_CLIENT=ON ../../src
 
 script: 
   - make install
 #  - ls -Rla package
 # Run a simple test to validate that the build works; CPU device in a VM
-  - cd client
+  - cd package/bin
   - export LD_LIBRARY_PATH=${TRAVIS_BUILD_DIR}/bin/clBLAS/package/lib64:${LD_LIBRARY_PATH}
   - ./client --cpu
 

From 4ee2b8fabdceaef8f5d26566783df9365529ebe0 Mon Sep 17 00:00:00 2001
From: AMD-FirePro <FirePro.Developers@amd.com>
Date: Thu, 21 Nov 2013 20:48:10 +0000
Subject: [PATCH 27/59] fix bug in ssyrk, now use only uint for mad24

The problem is only visible if we tune syrk for single precision.
---
 src/library/blas/gens/syrxk.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/library/blas/gens/syrxk.c b/src/library/blas/gens/syrxk.c
index a0f6a295..e4a00eec 100644
--- a/src/library/blas/gens/syrxk.c
+++ b/src/library/blas/gens/syrxk.c
@@ -1401,7 +1401,7 @@ genUpdateGenericDiagTile(
                                 "cc%u = ((%s)mask &\n"
                                 "       %s) >>\n"
                                 "      %s;\n"
-                                "cc%u = %u - mad24(cc%u, %s, 0);\n",
+                                "cc%u = %u - mad24(cc%u, %s, 0u);\n",
 
                                 iter.row,
                                 (1 << (nrCols - 1)),
@@ -1416,7 +1416,7 @@ genUpdateGenericDiagTile(
                                 "cc%u = ((%s)mask &\n"
                                 "       %s) >>\n"
                                 "      %s;\n"
-                                "cc%u = mad24(cc%u, %s, 0);\n",
+                                "cc%u = mad24(cc%u, %s, 0u);\n",
 
                                 nrRows - 1, iter.row,
                                 i, vctype.buf, constMasks.buf, constShifts.buf,

From 361d574c1c2727bb3d967bffafa8b0c0c4546257 Mon Sep 17 00:00:00 2001
From: Nicolas Kaiser <nikai@nikai.net>
Date: Fri, 6 Dec 2013 11:32:06 +0100
Subject: [PATCH 28/59] fix typos in CONTRIBUTING.md

---
 CONTRIBUTING.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 61932f1f..0dc5c7e8 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -8,7 +8,7 @@ Firstly, in order to contribute code to this project, a contributor must have a
 * After forking, the contributor [clones their repository](https://help.github.com/articles/create-a-repo) locally on their machine
 * Code is developed and checked into the contributor's repository.  These commits are eventually pushed upstream to their GitHub repository
 * The contributor then issues a [pull-request](https://help.github.com/articles/using-pull-requests) against the **develop** branch of this repository, which is the [git flow](http://nvie.com/posts/a-successful-git-branching-model/) workflow which is well suited for working with GitHub
-    * A [git extention](https://github.com/nvie/gitflow) has been developed to ease the use of the 'git flow' methodology, but requires manual installation by the user.  Refer to the projects wiki
+    * A [git extension](https://github.com/nvie/gitflow) has been developed to ease the use of the 'git flow' methodology, but requires manual installation by the user.  Refer to the projects wiki
 
 At this point, the repository maintainers will be notified by GitHub that a 'pull request' exists pending against their repository.  A code review should be completed within a few days, depending on the scope of submitted code, and the code will either be accepted, rejected or commented on for extra feedback.
 
@@ -32,5 +32,5 @@ guidelines over time
 Pull requests will be reviewed by the set of collaborators that are assigned for the repository.  Pull requests may be accepted, declined or a conversation may start on the pull request thread with feedback.  If the pull request is trivial and all the submission guidelines defined above are honored, the pull request may be accepted without delay.  If the pull request is good, but the guidelines defined above are not followed, the collaborators may leave feedback on the pull request and engage in a conversation with the contributor with what they can do to improve the pull request.  At any time, collaborators may decline a pull request if they decide the contribution is not appropriate for the project, or the feedback from reviewers on a pull request is not being addressed in an appropriate amount of time.
 
 ## Is it possible to become an official collaborator of the repository?
-Yes, we hope to promote trusted members of the community, who have proven themselves to be competent and request to take on the extra responsibility to be official collaborators of the project.  When an individual requests to be an official collaborator, current project collaborators will browse through the history of the requester's prior pull requests and take a vote amongst themselves if the requester should be promoted to collaborator.  These individuals will then have the right to approve/decline pull requests and help shape the path that the project goes.  It is worth noting, that on GitHub everybody has read-only access to the source and that everybody has the ability to issue a pull request to contribute to the project.  The benefit of being a repository collaborator allows you to be able to be able to manage other peoples pull requests.
+Yes, we hope to promote trusted members of the community, who have proven themselves to be competent and request to take on the extra responsibility to be official collaborators of the project.  When an individual requests to be an official collaborator, current project collaborators will browse through the history of the requester's prior pull requests and take a vote amongst themselves if the requester should be promoted to collaborator.  These individuals will then have the right to approve/decline pull requests and help shape the path that the project goes.  It is worth noting, that on GitHub everybody has read-only access to the source and that everybody has the ability to issue a pull request to contribute to the project.  The benefit of being a repository collaborator allows you to be able to manage other peoples pull requests.
 

From 7e7fa103ef320169008493fa291f299922b30c04 Mon Sep 17 00:00:00 2001
From: Timmy <timmy.liu@amd.com>
Date: Wed, 11 Dec 2013 14:05:06 -0600
Subject: [PATCH 29/59] add support of hemm, herk, her2k, syrk, syr2k to
 performance test suite(client)

---
 src/client/CMakeLists.txt              |   6 +-
 src/client/clfunc_xhemm.hpp            | 252 +++++++++-
 src/client/clfunc_xher2k.hpp           | 672 +++++++++++++++++++++++++
 src/client/clfunc_xherk.hpp            | 531 +++++++++++++++++++
 src/client/clfunc_xsymm.hpp            |  12 +-
 src/client/clfunc_xsyr2k.hpp           | 394 ++++++++++++++-
 src/client/clfunc_xsyrk.hpp            | 308 +++++++++++-
 src/client/clfunc_xtrmm.hpp            |  10 +-
 src/client/clfunc_xtrsm.hpp            |  10 +-
 src/client/client.cpp                  |  28 ++
 src/scripts/perf/measurePerformance.py |   2 +-
 11 files changed, 2154 insertions(+), 71 deletions(-)
 create mode 100644 src/client/clfunc_xher2k.hpp
 create mode 100644 src/client/clfunc_xherk.hpp

diff --git a/src/client/CMakeLists.txt b/src/client/CMakeLists.txt
index a647da08..2ebebf11 100644
--- a/src/client/CMakeLists.txt
+++ b/src/client/CMakeLists.txt
@@ -26,7 +26,11 @@ set(CLIENT_HEADER
     clfunc_xtrmm.hpp
     clfunc_xtrsm.hpp
     clfunc_xsyrk.hpp
-    clfunc_xsyr2k.hpp)
+    clfunc_xsyr2k.hpp
+	clfunc_xhemm.hpp
+	clfunc_xsymm.hpp
+	clfunc_xherk.hpp
+	clfunc_xher2k.hpp)
 
 set(WRAPPER_SRC testPerfWrapper.cpp)
 
diff --git a/src/client/clfunc_xhemm.hpp b/src/client/clfunc_xhemm.hpp
index 8a0c5550..9f4047e2 100644
--- a/src/client/clfunc_xhemm.hpp
+++ b/src/client/clfunc_xhemm.hpp
@@ -45,7 +45,7 @@
 template <typename T>
 struct xHemmBuffer
 {
-	clblasOrder order;
+  clblasOrder order;
   clblasSide side;
   clblasUplo uplo;
   size_t M;
@@ -78,22 +78,30 @@ class xHemm : public clblasFunc
 
   ~xHemm()
   {
-    delete buffer.cpuA;
-    delete buffer.cpuB;
-    delete buffer.cpuC;
-    OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
-    OPENCL_V_THROW( clReleaseMemObject(buffer.B), "releasing buffer B");
-    OPENCL_V_THROW( clReleaseMemObject(buffer.C), "releasing buffer C");
   }
 
   double gflops()
   {
-    return (buffer.N*(buffer.N+1))/time_in_ns();
+	  if (buffer.side == clblasLeft)
+	  {
+		return (8*buffer.M*buffer.M*buffer.N)/time_in_ns();
+	  }
+	  else
+	  {
+		return (8*buffer.N*buffer.N*buffer.M)/time_in_ns();
+	  }
   }
 
   std::string gflops_formula()
   {
-    return "M*(M+1)/time";
+	  if (buffer.side == clblasLeft)
+	  {
+		  return "8*M*M*N/time";
+	  }
+	  else
+	  {
+		  return "8*N*N*M/time";
+	  }
   }
 
   void setup_buffer(int order_option, int side_option, int
@@ -106,25 +114,136 @@ class xHemm : public clblasFunc
   void initialize_gpu_buffer();
   void reset_gpu_write_buffer();
   void call_func();
-  	void read_gpu_buffer()
+  void read_gpu_buffer()
 	{
-		//cl_int err;
-		//to-do need to fill up
+		cl_int err;
+		err = clEnqueueReadBuffer(queue_, buffer.C, CL_TRUE,
+			                    buffer.offc * sizeof(T),
+								buffer.ldc*buffer.N*sizeof(T),
+								buffer.cpuC,0,NULL,NULL);
 	}
-	void roundtrip_func()
-	{//to-do need to fill up
+  void roundtrip_func()
+	{
+		std::cout << "xHemm::roundtrip_func" <<std::endl;
 	}
-	void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+  void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
                       int diag_option, int transA_option, int  transB_option,
                       size_t M, size_t N, size_t K, size_t lda, size_t ldb,
-                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      size_t ldc, size_t offA, size_t offB, size_t offC,
                       double alpha, double beta)
-		{}
-	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		  initialize_scalars(alpha, beta);
+		  buffer.offa = offA;
+		  buffer.offb = offB;
+		  buffer.offc = offC;
+		  buffer.M = M;
+		  buffer.N = N;
+		  if (order_option == 0)
+		  {
+			buffer.order = clblasRowMajor;
+		  }
+		  else
+		  {
+			buffer.order = clblasColumnMajor;
+		  }
+		  if (uplo_option == 0)
+		  {
+			buffer.uplo = clblasUpper;
+		  }
+		  else
+		  {
+			buffer.uplo = clblasLower;
+		  }
+		  if (side_option == 0)
+		  {
+			  buffer.side = clblasLeft;
+			  buffer.a_num_vectors = M;
+			  if (lda == 0)
+			  {
+				buffer.lda = buffer.M;
+			  }
+			  else if (lda < buffer.M)
+			  {
+				std::cerr << "lda:wrong size\n";
+				exit(1);
+			  }
+			  else
+			  {
+				buffer.lda = lda;
+			  }
+		  }
+		  else
+		  {
+			  buffer.side = clblasRight;
+			  buffer.a_num_vectors = N;
+			  if (lda == 0)
+			  {
+				buffer.lda = buffer.N;
+			  }
+			  else if (lda < buffer.N)
+			  {
+				std::cerr << "lda:wrong size\n";
+				exit(1);
+			  }
+			  else
+			  {
+				buffer.lda = lda;
+			  }
+		  }
+		  /*}
+		  if (lda == 0)
+		  {
+			buffer.lda = buffer.M;
+		  }
+		  else if (lda < buffer.M)
+		  {
+			std::cerr << "lda:wrong size\n";
+			exit(1);
+		  }
+		  else
+		  {
+			buffer.lda = lda;
+		  }*/
+		  if (ldb == 0)
+		  {
+			buffer.ldb = buffer.M;
+		  }
+		  else if (ldb < buffer.M)
+		  {
+			std::cerr << "ldb:wrong size\n";
+			exit(1);
+		  }
+		  else
+		  {
+			buffer.ldb = ldb;
+		  }
+		  if (ldc == 0)
+		  {
+			buffer.ldc = buffer.M;
+		  }
+		  else if (ldc < buffer.M)
+		  {
+			std::cerr << "ldc:wrong size\n";
+			exit(1);
+		  }
+		  else
+		  {
+			buffer.ldc = ldc;
+		  }
+		  buffer.cpuB = new T[buffer.N * buffer.ldb];
+		  buffer.cpuC = new T[buffer.N * buffer.ldc];
+		  buffer.cpuA = new T[buffer.a_num_vectors * buffer.lda];
+	}
+  void releaseGPUBuffer_deleteCPUBuffer()
 	{
 		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
 		//need to do this before we eventually hit the destructor
-		//to do
+		delete[] buffer.cpuA; // host arrays are allocated with new T[] (see roundtrip_setup_buffer), so array delete is required
+		delete[] buffer.cpuB;
+		delete[] buffer.cpuC;
+		OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
+		OPENCL_V_THROW( clReleaseMemObject(buffer.B), "releasing buffer B");
+		OPENCL_V_THROW( clReleaseMemObject(buffer.C), "releasing buffer C");
 	}
 
 protected:
@@ -253,7 +372,7 @@ void xHemm<T>::setup_buffer(int order_option, int side_option, int
                                 buffer.a_num_vectors * buffer.lda*sizeof(T),
                                 NULL, &err);
 
-  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                     buffer.N*buffer.ldb*sizeof(T),
                                     NULL, &err);
   buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
@@ -356,10 +475,12 @@ void xHemm<T>::initialize_gpu_buffer()
                               buffer.a_num_vectors * buffer.lda*sizeof(T),
                               buffer.cpuA, 0, NULL, NULL);
 
-  err = clEnqueueWriteBuffer(queue_, buffer.B, CL_TRUE, 0,
+  err = clEnqueueWriteBuffer(queue_, buffer.B, CL_TRUE,
+	                          buffer.offb * sizeof(T),
                               buffer.ldb*buffer.N*sizeof(T),
                               buffer.cpuB, 0, NULL, NULL);
-  err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE, 0,
+  err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE,
+							  buffer.offc * sizeof(T),
                               buffer.ldc*buffer.N*sizeof(T),
                               buffer.cpuC, 0, NULL, NULL);
 }
@@ -385,6 +506,50 @@ void xHemm<cl_float2>::call_func()
   timer.Stop(timer_id);
 }
 
+// Timed round trip for CHEMM: create the device buffers, upload A/B/C, run clblasChemm, read C back.
+template <>
+void xHemm<cl_float2>::roundtrip_func()
+{
+	timer.Start(timer_id);
+	cl_int err;
+	// Each buffer holds its matrix plus the leading element offset, because the writes,
+	// the BLAS call and the read-back below all start at offa/offb/offc.
+	buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                (buffer.a_num_vectors * buffer.lda + buffer.offa)*sizeof(cl_float2),
+                                NULL, &err);
+    buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                    (buffer.N*buffer.ldb + buffer.offb)*sizeof(cl_float2),
+                                    NULL, &err);
+    buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                    (buffer.N*buffer.ldc + buffer.offc)*sizeof(cl_float2),
+                                    NULL, &err);
+	// Blocking uploads of the host matrices (CL_TRUE).
+	err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+                              buffer.offa * sizeof(cl_float2),
+                              buffer.a_num_vectors * buffer.lda*sizeof(cl_float2),
+                              buffer.cpuA, 0, NULL, NULL);
+
+    err = clEnqueueWriteBuffer(queue_, buffer.B, CL_TRUE,
+	                          buffer.offb * sizeof(cl_float2),
+                              buffer.ldb*buffer.N*sizeof(cl_float2),
+                              buffer.cpuB, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE,
+							  buffer.offc * sizeof(cl_float2),
+                              buffer.ldc*buffer.N*sizeof(cl_float2),
+                              buffer.cpuC, 0, NULL, NULL);
+
+	clblasChemm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
+      buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
+      buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, 1, &queue_,
+      0, NULL,NULL);
+	// Blocking read of the result; event_ is waited on before the timer is stopped.
+	err = clEnqueueReadBuffer(queue_, buffer.C, CL_TRUE,
+							  buffer.offc * sizeof(cl_float2),
+                              buffer.ldc*buffer.N*sizeof(cl_float2),
+                              buffer.cpuC, 0, NULL, &event_);
+	clWaitForEvents(1, &event_);
+	timer.Stop(timer_id);
+}
 template <>
 void xHemm<cl_double2>::call_func()
 {
@@ -396,5 +561,48 @@ void xHemm<cl_double2>::call_func()
   clWaitForEvents(1, &event_);
   timer.Stop(timer_id);
 }
+// Timed round trip for ZHEMM: create the device buffers, upload A/B/C, run clblasZhemm, read C back.
+template <>
+void xHemm<cl_double2>::roundtrip_func()
+{
+	timer.Start(timer_id);
+	cl_int err;
+	// Each buffer holds its matrix plus the leading element offset, because the writes,
+	// the BLAS call and the read-back below all start at offa/offb/offc.
+	buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                (buffer.a_num_vectors * buffer.lda + buffer.offa)*sizeof(cl_double2),
+                                NULL, &err);
+    buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                    (buffer.N*buffer.ldb + buffer.offb)*sizeof(cl_double2),
+                                    NULL, &err);
+    buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                    (buffer.N*buffer.ldc + buffer.offc)*sizeof(cl_double2),
+                                    NULL, &err);
+	// Blocking uploads of the host matrices (CL_TRUE).
+	err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+                              buffer.offa * sizeof(cl_double2),
+                              buffer.a_num_vectors * buffer.lda*sizeof(cl_double2),
+                              buffer.cpuA, 0, NULL, NULL);
+
+    err = clEnqueueWriteBuffer(queue_, buffer.B, CL_TRUE,
+	                          buffer.offb * sizeof(cl_double2),
+                              buffer.ldb*buffer.N*sizeof(cl_double2),
+                              buffer.cpuB, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE,
+							  buffer.offc * sizeof(cl_double2),
+                              buffer.ldc*buffer.N*sizeof(cl_double2),
+                              buffer.cpuC, 0, NULL, NULL);
 
+	clblasZhemm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
+      buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
+      buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, 1, &queue_,
+      0, NULL,NULL);
+	// Blocking read of the result; event_ is waited on before the timer is stopped.
+	err = clEnqueueReadBuffer(queue_, buffer.C, CL_TRUE,
+							  buffer.offc * sizeof(cl_double2),
+                              buffer.ldc*buffer.N*sizeof(cl_double2),
+                              buffer.cpuC, 0, NULL, &event_);
+	clWaitForEvents(1, &event_);
+	timer.Stop(timer_id);
+}
 #endif // ifndef CLBLAS_BENCHMARK_XSYR_HXX__
\ No newline at end of file
diff --git a/src/client/clfunc_xher2k.hpp b/src/client/clfunc_xher2k.hpp
new file mode 100644
index 00000000..088d9283
--- /dev/null
+++ b/src/client/clfunc_xher2k.hpp
@@ -0,0 +1,672 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XHER2K_HXX__
+#define CLBLAS_BENCHMARK_XHER2K_HXX__
+
+#include "clfunc_common.hpp"
+
+template <typename T>
+struct xHer2kBuffer
+{
+    clblasOrder order_;
+    clblasUplo uplo_;
+    clblasTranspose transA_;
+    size_t N_;
+    size_t K_;
+    T alpha_;
+	cl_mem A_;
+    size_t offa_;
+    size_t lda_;
+	cl_mem B_;
+	size_t offb_;
+	size_t ldb_;
+    T beta_;
+    cl_mem C_;
+    size_t offc_;
+    size_t ldc_;
+	size_t a_num_vectors_;
+	size_t b_num_vectors_;
+    size_t c_num_vectors_;
+	T* cpuA_;
+	T* cpuB_;
+	T* cpuC_;
+}; // struct buffer
+
+template <typename T>
+class xHer2k : public clblasFunc
+{
+public:
+  xHer2k(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer,  devType)
+  {
+    timer.getUniqueID("clHer2k", 0);
+  }
+
+  ~xHer2k()
+  {
+  }
+
+  double gflops()
+  {
+    return static_cast<double>(8*(buffer_.K_ * buffer_.N_ * buffer_.N_)/time_in_ns()+2*buffer_.N_/time_in_ns());
+  }
+
+  std::string gflops_formula()
+  {
+    return "(8*K*N*N+2*N)/time";
+  }
+
+  void setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+					          size_t offB, size_t offC, double alpha,
+                    double beta)
+  {
+        DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+
+		initialize_scalars(alpha,beta);
+
+		buffer_.N_ = N;
+		buffer_.K_ = K;
+		buffer_.offa_ = offA;
+		buffer_.offb_ = offB;
+		buffer_.offc_ = offC;
+
+		if (uplo_option == 0)
+        {
+            buffer_.uplo_ = clblasUpper;
+        }
+        else
+        {
+            buffer_.uplo_ = clblasLower;
+        }
+		
+        // Resolve the leading dimension of C before it is used to size the host and device buffers.
+        if (ldc != 0 && ldc < N)
+        {
+            // Abort like the lda/ldb checks below; continuing would leave ldc_ uninitialised.
+            std::cerr << "ldc:wrong size\n";
+            exit(1);
+        }
+        else
+        {
+            // ldc == 0 selects the default leading dimension N.
+            buffer_.ldc_ = (ldc == 0) ? N : ldc;
+        }
+		      
+		buffer_.c_num_vectors_ = N;
+
+		if (order_option == 0)
+        {
+            order_ = clblasRowMajor;
+            if (transA_option == 0)
+            {
+                buffer_.transA_ = clblasNoTrans;
+                buffer_.a_num_vectors_ = N;
+				buffer_.b_num_vectors_ = N;
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = K;
+                }
+                else if (ldb < K)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = K;
+				buffer_.b_num_vectors_ = K;
+                if (transA_option == 1)
+                {
+                    buffer_.transA_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.transA_ = clblasConjTrans;
+                }
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = N;
+                }
+                else if (ldb < N)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+        }
+        else
+        {
+            order_ = clblasColumnMajor;
+            if (transA_option == 0)
+            {
+                buffer_.a_num_vectors_ = K;
+                buffer_.b_num_vectors_ = K;
+                buffer_.transA_ = clblasNoTrans;
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = N;
+                }
+                else if (ldb < N)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = N;
+                buffer_.b_num_vectors_ = N;
+                if (transA_option == 1)
+                {
+                    buffer_.transA_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.transA_ = clblasConjTrans;
+                }
+
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = K;
+                }
+                else if (ldb < K)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+        }
+
+        buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+		buffer_.cpuB_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
+        buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+
+        cl_int err;
+        buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offa_) * sizeof(T),
+                                        NULL, &err);
+
+	    buffer_.B_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offb_) * sizeof(T),
+                                        NULL, &err);
+
+        buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offc_) * sizeof(T),
+                                        NULL, &err);
+  }
+  void initialize_cpu_buffer()
+  {
+	  srand(10);
+	  for (size_t i = 0; i < buffer_.a_num_vectors_; ++i)
+	  {
+		  for (size_t j = 0; j < buffer_.lda_; ++j)
+		  {
+                buffer_.cpuA_[i*buffer_.lda_+j] = random<T>(UPPER_BOUND<T>()) /
+                                               randomScale<T>();
+		  }
+	  }
+	  for (size_t i = 0; i < buffer_.N_; ++i)
+	  {
+		  for (size_t j = 0; j < buffer_.ldc_; ++j)
+		  {
+                buffer_.cpuC_[i*buffer_.ldc_+j] = random<T>(UPPER_BOUND<T>()) /
+                                               randomScale<T>();
+		  }
+	  }
+  }
+  void initialize_gpu_buffer()
+  {
+        // Blocking uploads (CL_TRUE) of host A and C into their device buffers, each at its own element offset.
+        // NOTE(review): B_ is not uploaded here and cpuB_ is not filled by initialize_cpu_buffer - confirm whether intentional.
+        cl_int err = clEnqueueWriteBuffer(queue_, buffer_.A_, CL_TRUE,
+                                   buffer_.offa_ * sizeof(T),
+                                   buffer_.lda_ * buffer_.a_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.cpuA_, 0, NULL, NULL);
+        // C must be written at offc_ (the offset C_ was sized with), matching reset_gpu_write_buffer and read_gpu_buffer.
+        err = clEnqueueWriteBuffer(queue_, buffer_.C_, CL_TRUE,
+                                   buffer_.offc_ * sizeof(T),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.cpuC_, 0, NULL, NULL);
+  }
+  void reset_gpu_write_buffer()
+  {
+	    cl_int err;
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.C_, CL_TRUE,
+                                   buffer_.offc_ * sizeof(T),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.cpuC_, 0, NULL, NULL);
+  }
+  void call_func();
+  void read_gpu_buffer()
+	{
+		cl_int err;
+		err = clEnqueueReadBuffer(queue_, buffer_.C_, CL_TRUE,
+								  buffer_.offc_*sizeof(T), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(T),
+								  buffer_.cpuC_, 0, NULL, NULL);
+	}
+	void roundtrip_func();
+	void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+	{
+        DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+
+		initialize_scalars(alpha,beta);
+
+		buffer_.N_ = N;
+		buffer_.K_ = K;
+		buffer_.offa_ = offA;
+		buffer_.offb_ = offBX;
+		buffer_.offc_ = offCY;
+
+		if (uplo_option == 0)
+        {
+            buffer_.uplo_ = clblasUpper;
+        }
+        else
+        {
+            buffer_.uplo_ = clblasLower;
+        }
+		
+        // Resolve the leading dimension of C before it is used to size the host buffer.
+        if (ldc != 0 && ldc < N)
+        {
+            // Abort like the lda/ldb checks below; continuing would leave ldc_ uninitialised.
+            std::cerr << "ldc:wrong size\n";
+            exit(1);
+        }
+        else
+        {
+            // ldc == 0 selects the default leading dimension N.
+            buffer_.ldc_ = (ldc == 0) ? N : ldc;
+        }
+		      
+		buffer_.c_num_vectors_ = N;
+
+		if (order_option == 0)
+        {
+            order_ = clblasRowMajor;
+            if (transA_option == 0)
+            {
+                buffer_.transA_ = clblasNoTrans;
+                buffer_.a_num_vectors_ = N;
+				buffer_.b_num_vectors_ = N;
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = K;
+                }
+                else if (ldb < K)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = K;
+				buffer_.b_num_vectors_ = K;
+                if (transA_option == 1)
+                {
+                    buffer_.transA_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.transA_ = clblasConjTrans;
+                }
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = N;
+                }
+                else if (ldb < N)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+        }
+        else
+        {
+            order_ = clblasColumnMajor;
+            if (transA_option == 0)
+            {
+                buffer_.a_num_vectors_ = K;
+                buffer_.b_num_vectors_ = K;
+                buffer_.transA_ = clblasNoTrans;
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = N;
+                }
+                else if (ldb < N)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = N;
+                buffer_.b_num_vectors_ = N;
+                if (transA_option == 1)
+                {
+                    buffer_.transA_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.transA_ = clblasConjTrans;
+                }
+
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = K;
+                }
+                else if (ldb < K)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+        }
+
+        buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+		buffer_.cpuB_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
+        buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+	}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		// Called after every timed iteration (see client.cpp), before the destructor eventually runs.
+		// The host arrays are allocated with new T[], so they must be freed with delete[], not delete.
+		delete[] buffer_.cpuA_;
+		delete[] buffer_.cpuB_;
+		delete[] buffer_.cpuC_;
+		OPENCL_V_THROW( clReleaseMemObject(buffer_.A_), "releasing buffer A");
+		OPENCL_V_THROW( clReleaseMemObject(buffer_.B_), "releasing buffer B");
+		OPENCL_V_THROW( clReleaseMemObject(buffer_.C_), "releasing buffer C");
+	}
+protected:
+protected:
+  void initialize_scalars(double alpha, double beta)
+  {
+      buffer_.alpha_ = makeScalar<T>(alpha);
+      buffer_.beta_ = makeScalar<T>(beta);
+  }
+
+private:
+  xHer2kBuffer<T> buffer_;
+};
+
+template<>
+void 
+xHer2k<cl_float2>::call_func()
+{
+	timer.Start(timer_id); // timed region: the clblasCher2k enqueue plus the wait for its event
+	clblasCher2k(order_, buffer_.uplo_, buffer_.transA_, // NOTE: the returned status is not checked
+				buffer_.N_, buffer_.K_, buffer_.alpha_,
+				buffer_.A_, buffer_.offa_, buffer_.lda_, 
+				buffer_.B_, buffer_.offb_, buffer_.ldb_,
+				buffer_.beta_.s[0], buffer_.C_, buffer_.offc_, // only the first component of beta_ is passed
+				buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
+    clWaitForEvents(1, &event_); // block until the enqueued work has finished before stopping the timer
+    timer.Stop(timer_id);
+}
+
+template<>
+void
+xHer2k<cl_float2>::roundtrip_func()
+{
+		timer.Start(timer_id); // timed region: buffer creation, host-to-device upload, CHER2K, read-back of C
+        cl_int err;
+        buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offa_) * sizeof(cl_float2),
+                                        NULL, &err);
+	    buffer_.B_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offb_) * sizeof(cl_float2),
+                                        NULL, &err);
+        buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offc_) * sizeof(cl_float2),
+                                        NULL, &err);
+		this->initialize_gpu_buffer();
+		clblasCher2k(order_, buffer_.uplo_, buffer_.transA_,
+				buffer_.N_, buffer_.K_, buffer_.alpha_,
+				buffer_.A_, buffer_.offa_, buffer_.lda_,
+				buffer_.B_, buffer_.offb_, buffer_.ldb_,
+				buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+				buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+		// A round trip ends by reading C back to the host; writing cpuC_ here would overwrite the result.
+		err = clEnqueueReadBuffer(queue_, buffer_.C_, CL_TRUE,
+                                   buffer_.offc_ * sizeof(cl_float2),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(cl_float2),
+                                   buffer_.cpuC_, 0, NULL, &event_);
+		clWaitForEvents(1, &event_);
+		timer.Stop(timer_id);
+}
+
+template<>
+void 
+xHer2k<cl_double2>::call_func()
+{
+	timer.Start(timer_id);
+	// Timed region: enqueue clblasZher2k on queue_; its returned status is not checked here.
+	clblasZher2k(order_, buffer_.uplo_, buffer_.transA_,
+				buffer_.N_, buffer_.K_, buffer_.alpha_,
+				buffer_.A_, buffer_.offa_, buffer_.lda_, 
+				buffer_.B_, buffer_.offb_, buffer_.ldb_,
+				buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+				buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
+	// Only the first component of beta_ is passed above; wait for the event before stopping the timer.
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+template<>
+void
+xHer2k<cl_double2>::roundtrip_func()
+{
+		timer.Start(timer_id); // timed region: buffer creation, host-to-device upload, ZHER2K, read-back of C
+        cl_int err;
+        buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offa_) * sizeof(cl_double2),
+                                        NULL, &err);
+	    buffer_.B_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offb_) * sizeof(cl_double2),
+                                        NULL, &err);
+        buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offc_) * sizeof(cl_double2),
+                                        NULL, &err);
+		this->initialize_gpu_buffer();
+
+	   clblasZher2k(order_, buffer_.uplo_, buffer_.transA_,
+				buffer_.N_, buffer_.K_, buffer_.alpha_,
+				buffer_.A_, buffer_.offa_, buffer_.lda_,
+				buffer_.B_, buffer_.offb_, buffer_.ldb_,
+				buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+				buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+		// A round trip ends by reading C back to the host; writing cpuC_ here would overwrite the result.
+		err = clEnqueueReadBuffer(queue_, buffer_.C_, CL_TRUE,
+                                   buffer_.offc_ * sizeof(cl_double2),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(cl_double2),
+                                   buffer_.cpuC_, 0, NULL, &event_);
+		clWaitForEvents(1, &event_);
+		timer.Stop(timer_id);
+}
+#endif // ifndef CLBLAS_BENCHMARK_XSYR_HXX__
\ No newline at end of file
diff --git a/src/client/clfunc_xherk.hpp b/src/client/clfunc_xherk.hpp
new file mode 100644
index 00000000..110c1078
--- /dev/null
+++ b/src/client/clfunc_xherk.hpp
@@ -0,0 +1,531 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XHERK_HXX__
+#define CLBLAS_BENCHMARK_XHERK_HXX__
+
+#include "clfunc_common.hpp"
+
+template <typename T>
+struct xHerkBuffer // arguments for one clblasCherk/clblasZherk benchmark call, plus host copies of A and C
+{
+    clblasOrder order_;
+    clblasUplo uplo_;          // clblasUpper or clblasLower, chosen from uplo_option
+    clblasTranspose transA_;   // clblasNoTrans / clblasTrans / clblasConjTrans, chosen from transA_option
+    size_t N_;
+    size_t K_;
+    T alpha_;                  // built by makeScalar<T>(alpha); only .s[0] is passed to the herk call
+	cl_mem A_;                 // device buffer for A
+    size_t offa_;              // offset of A inside A_, in elements (multiplied by sizeof(T) for byte offsets)
+    size_t lda_;               // leading dimension of A, in elements
+    T beta_;                   // built by makeScalar<T>(beta); only .s[0] is passed to the herk call
+    cl_mem C_;                 // device buffer for C
+    size_t offc_;              // offset of C inside C_, in elements
+    size_t ldc_;               // leading dimension of C, in elements
+	size_t a_num_vectors_;     // A holds a_num_vectors_ vectors of lda_ elements each
+    size_t c_num_vectors_;     // C holds c_num_vectors_ vectors of ldc_ elements each
+	T* cpuA_;                  // host copy of A, allocated with new T[lda_ * a_num_vectors_]
+	T* cpuC_;                  // host copy of C, allocated with new T[ldc_ * c_num_vectors_]
+}; // struct buffer
+
+template <typename T>
+class xHerk : public clblasFunc
+{
+public:
+  xHerk(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer,  devType)
+  {
+    timer.getUniqueID("clHerk", 0);
+  }
+
+  ~xHerk()
+  {
+  }
+
+  double gflops()
+  {
+    return static_cast<double>(4*(buffer_.K_ * buffer_.N_ * (buffer_.N_+1))/time_in_ns());
+  }
+
+  std::string gflops_formula()
+  {
+    return "4*K*N*(N+1)/time";
+  }
+
+  void setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+                    size_t offB, size_t offC, double alpha,
+                    double beta)
+  {
+        DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+        DUMMY_ARGS_USAGE_2(ldb, offB);
+        // Translate the integer options into clblas enums, choose leading dimensions, then allocate buffers.
+		initialize_scalars(alpha,beta);
+
+		buffer_.N_ = N;
+		buffer_.K_ = K;
+		buffer_.offa_ = offA;
+		buffer_.offc_ = offC;
+
+		if (uplo_option == 0)
+        {
+            buffer_.uplo_ = clblasUpper;
+        }
+        else
+        {
+            buffer_.uplo_ = clblasLower;
+        }
+		// ldc == 0 selects the default N; a too-small ldc must abort, otherwise ldc_ is used uninitialized below.
+		if (ldc == 0)
+        {
+            buffer_.ldc_ = N;
+        }
+        else if (ldc < N)
+        {
+            std::cerr << "ldc:wrong size\n";
+            exit(1);
+        }
+        else
+        {
+            buffer_.ldc_ = ldc;
+        }
+		buffer_.c_num_vectors_ = N;
+
+		if (order_option == 0)
+        {
+            order_ = clblasRowMajor;
+            if (transA_option == 0)
+            {
+                buffer_.transA_ = clblasNoTrans;
+                buffer_.a_num_vectors_ = N;
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = K;
+                if (transA_option == 1)
+                {
+                    buffer_.transA_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.transA_ = clblasConjTrans;
+                }
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+        }
+        else
+        {
+            order_ = clblasColumnMajor;
+            if (transA_option == 0)
+            {
+                buffer_.a_num_vectors_ = K;
+                buffer_.transA_ = clblasNoTrans;
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = N;
+                if (transA_option == 1)
+                {
+                    buffer_.transA_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.transA_ = clblasConjTrans;
+                }
+
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+        }
+
+        buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+        buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+
+        cl_int err;
+        buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offa_) * sizeof(T),
+                                        NULL, &err);
+
+        buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offc_) * sizeof(T),
+                                        NULL, &err);
+  }
+  void initialize_cpu_buffer()
+  {
+	  srand(10);
+	  for (size_t i = 0; i < buffer_.a_num_vectors_; ++i)
+	  {
+		  for (size_t j = 0; j < buffer_.lda_; ++j)
+		  {
+                buffer_.cpuA_[i*buffer_.lda_+j] = random<T>(UPPER_BOUND<T>()) /
+                                               randomScale<T>();
+		  }
+	  }
+	  for (size_t i = 0; i < buffer_.N_; ++i)
+	  {
+		  for (size_t j = 0; j < buffer_.ldc_; ++j)
+		  {
+                buffer_.cpuC_[i*buffer_.ldc_+j] = random<T>(UPPER_BOUND<T>()) /
+                                               randomScale<T>();
+		  }
+	  }
+  }
+  void initialize_gpu_buffer()
+  {
+	    cl_int err;
+        // Blocking uploads of the host copies of A and C into their device buffers.
+        err = clEnqueueWriteBuffer(queue_, buffer_.A_, CL_TRUE,
+                                   buffer_.offa_ * sizeof(T),
+                                   buffer_.lda_ * buffer_.a_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.cpuA_, 0, NULL, NULL);
+        // C is written at its own offset (offc_), the same offset the herk call and read_gpu_buffer() use.
+        err = clEnqueueWriteBuffer(queue_, buffer_.C_, CL_TRUE,
+                                   buffer_.offc_ * sizeof(T),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.cpuC_, 0, NULL, NULL);
+  }
+  void reset_gpu_write_buffer()
+  {
+	    cl_int err;
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.C_, CL_TRUE,
+                                   buffer_.offc_ * sizeof(T),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.cpuC_, 0, NULL, NULL);
+  }
+  void call_func();
+  void read_gpu_buffer()
+	{
+		cl_int err;
+		err = clEnqueueReadBuffer(queue_, buffer_.C_, CL_TRUE,
+								  buffer_.offc_*sizeof(T), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(T),
+								  buffer_.cpuC_, 0, NULL, NULL);
+	}
+	void roundtrip_func();
+	void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+	{
+        DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+        DUMMY_ARGS_USAGE_2(ldb, offBX);
+        // Fills in options and sizes and allocates the host arrays only; no device buffers are created here.
+		initialize_scalars(alpha,beta);
+
+		buffer_.N_ = N;
+		buffer_.K_ = K;
+		buffer_.offa_ = offA;
+		buffer_.offc_ = offCY;
+
+		if (uplo_option == 0)
+        {
+            buffer_.uplo_ = clblasUpper;
+        }
+        else
+        {
+            buffer_.uplo_ = clblasLower;
+        }
+		// ldc == 0 selects the default N; a too-small ldc must abort, otherwise ldc_ is used uninitialized below.
+		if (ldc == 0)
+        {
+            buffer_.ldc_ = N;
+        }
+        else if (ldc < N)
+        {
+            std::cerr << "ldc:wrong size\n";
+            exit(1);
+        }
+        else
+        {
+            buffer_.ldc_ = ldc;
+        }
+		buffer_.c_num_vectors_ = N;
+
+		if (order_option == 0)
+        {
+            order_ = clblasRowMajor;
+            if (transA_option == 0)
+            {
+                buffer_.transA_ = clblasNoTrans;
+                buffer_.a_num_vectors_ = N;
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = K;
+                if (transA_option == 1)
+                {
+                    buffer_.transA_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.transA_ = clblasConjTrans;
+                }
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+        }
+        else
+        {
+            order_ = clblasColumnMajor;
+            if (transA_option == 0)
+            {
+                buffer_.a_num_vectors_ = K;
+                buffer_.transA_ = clblasNoTrans;
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = N;
+                if (transA_option == 1)
+                {
+                    buffer_.transA_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.transA_ = clblasConjTrans;
+                }
+
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+        }
+
+        buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+        buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+	}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		// Called after every timed iteration (see client.cpp), before the destructor eventually runs.
+		// The host arrays are allocated with new T[], so they must be freed with delete[], not delete.
+		delete[] buffer_.cpuA_;
+		delete[] buffer_.cpuC_;
+		OPENCL_V_THROW( clReleaseMemObject(buffer_.A_), "releasing buffer A");
+		OPENCL_V_THROW( clReleaseMemObject(buffer_.C_), "releasing buffer C");
+	}
+protected:
+protected:
+  void initialize_scalars(double alpha, double beta)
+  {
+      buffer_.alpha_ = makeScalar<T>(alpha);
+      buffer_.beta_ = makeScalar<T>(beta);
+  }
+
+private:
+  xHerkBuffer<T> buffer_;
+};
+
+template<>
+void 
+xHerk<cl_float2>::call_func()
+{
+	timer.Start(timer_id);
+	// Timed region: enqueue clblasCherk on queue_; only alpha_.s[0] and beta_.s[0] are passed as scalars.
+	clblasCherk(order_, buffer_.uplo_, buffer_.transA_,
+				buffer_.N_, buffer_.K_, buffer_.alpha_.s[0],
+				buffer_.A_, buffer_.offa_, buffer_.lda_, 
+				buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+				buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
+	// The status returned by clblasCherk is not checked; wait for its event before stopping the timer.
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+template<>
+void
+xHerk<cl_float2>::roundtrip_func()
+{
+		timer.Start(timer_id); // timed region: buffer creation, host-to-device upload, CHERK, read-back of C
+        cl_int err;
+        buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offa_) * sizeof(cl_float2),
+                                        NULL, &err);
+
+        buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offc_) * sizeof(cl_float2),
+                                        NULL, &err);
+		this->initialize_gpu_buffer();
+
+		clblasCherk(order_, buffer_.uplo_, buffer_.transA_,
+				buffer_.N_, buffer_.K_, buffer_.alpha_.s[0],
+				buffer_.A_, buffer_.offa_, buffer_.lda_,
+				buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+				buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+		// A round trip ends by reading C back to the host; writing cpuC_ here would overwrite the result.
+		err = clEnqueueReadBuffer(queue_, buffer_.C_, CL_TRUE,
+                                   buffer_.offc_ * sizeof(cl_float2),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(cl_float2),
+                                   buffer_.cpuC_, 0, NULL, &event_);
+		clWaitForEvents(1, &event_);
+		timer.Stop(timer_id);
+}
+
+template<>
+void 
+xHerk<cl_double2>::call_func()
+{
+	timer.Start(timer_id);
+	// Timed region: enqueue clblasZherk on queue_; only alpha_.s[0] and beta_.s[0] are passed as scalars.
+	clblasZherk(order_, buffer_.uplo_, buffer_.transA_,
+				buffer_.N_, buffer_.K_, buffer_.alpha_.s[0],
+				buffer_.A_, buffer_.offa_, buffer_.lda_, 
+				buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+				buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
+	// The status returned by clblasZherk is not checked; wait for its event before stopping the timer.
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+template<>
+void
+xHerk<cl_double2>::roundtrip_func()
+{
+		timer.Start(timer_id); // timed region: buffer creation, host-to-device upload, ZHERK, read-back of C
+        cl_int err;
+        buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offa_) * sizeof(cl_double2),
+                                        NULL, &err);
+
+        buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offc_) * sizeof(cl_double2),
+                                        NULL, &err);
+		this->initialize_gpu_buffer();
+
+		clblasZherk(order_, buffer_.uplo_, buffer_.transA_,
+				buffer_.N_, buffer_.K_, buffer_.alpha_.s[0],
+				buffer_.A_, buffer_.offa_, buffer_.lda_,
+				buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+				buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+		// A round trip ends by reading C back to the host; writing cpuC_ here would overwrite the result.
+		err = clEnqueueReadBuffer(queue_, buffer_.C_, CL_TRUE,
+                                   buffer_.offc_ * sizeof(cl_double2),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(cl_double2),
+                                   buffer_.cpuC_, 0, NULL, &event_);
+		clWaitForEvents(1, &event_);
+		timer.Stop(timer_id);
+}
+#endif // ifndef CLBLAS_BENCHMARK_XHERK_HXX__
\ No newline at end of file
diff --git a/src/client/clfunc_xsymm.hpp b/src/client/clfunc_xsymm.hpp
index d067870f..25a29244 100644
--- a/src/client/clfunc_xsymm.hpp
+++ b/src/client/clfunc_xsymm.hpp
@@ -342,7 +342,7 @@ void xSymm<T>::setup_buffer(int order_option, int side_option, int
                                 buffer.a_num_vectors * buffer.lda*sizeof(T),
                                 NULL, &err);
 
-  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                     buffer.N*buffer.ldb*sizeof(T),
                                     NULL, &err);
   buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
@@ -428,7 +428,7 @@ void xSymm<cl_float>::roundtrip_func()
                                 buffer.a_num_vectors * buffer.lda*sizeof(cl_float),
                                 NULL, &err);
 
-  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                     buffer.N*buffer.ldb*sizeof(cl_float),
                                     NULL, &err);
   buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
@@ -481,7 +481,7 @@ void xSymm<cl_double>::roundtrip_func()
                                 buffer.a_num_vectors * buffer.lda*sizeof(cl_double),
                                 NULL, &err);
 
-  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                     buffer.N*buffer.ldb*sizeof(cl_double),
                                     NULL, &err);
   buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
@@ -534,7 +534,7 @@ void xSymm<cl_float2>::roundtrip_func()
                                 buffer.a_num_vectors * buffer.lda*sizeof(cl_float2),
                                 NULL, &err);
 
-  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                     buffer.N*buffer.ldb*sizeof(cl_float2),
                                     NULL, &err);
   buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
@@ -582,12 +582,12 @@ void xSymm<cl_double2>::roundtrip_func()
 {
   timer.Start(timer_id);
   //set up buffer
-    cl_int err;
+  cl_int err;
   buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                 buffer.a_num_vectors * buffer.lda*sizeof(cl_double2),
                                 NULL, &err);
 
-  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                     buffer.N*buffer.ldb*sizeof(cl_double2),
                                     NULL, &err);
   buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
diff --git a/src/client/clfunc_xsyr2k.hpp b/src/client/clfunc_xsyr2k.hpp
index 9fb33812..087329e7 100644
--- a/src/client/clfunc_xsyr2k.hpp
+++ b/src/client/clfunc_xsyr2k.hpp
@@ -61,15 +61,6 @@ class xSyr2k : public clblasFunc
 
     ~xSyr2k()
     {
-        delete buffer_.a_;
-        delete buffer_.b_;
-        delete buffer_.c_;
-        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
-                        "releasing buffer A");
-        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_b_),
-                        "releasing buffer B");
-        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
-                        "releasing buffer C");
     }
 
     void call_func()
@@ -293,7 +284,7 @@ class xSyr2k : public clblasFunc
                                         (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                             buffer_.offB_) * sizeof(T),
                                         NULL, &err);
-        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                         (buffer_.ldc_ * buffer_.c_num_vectors_ +
                                             buffer_.offC_) * sizeof(T),
                                         NULL, &err);
@@ -364,23 +355,227 @@ class xSyr2k : public clblasFunc
     }
 	void read_gpu_buffer()
 	{
-		//cl_int err;
-		//to-do need to fill up
+		cl_int err;
+		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+								  buffer_.offC_ * sizeof(T),
+								  buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(T),
+								  buffer_.c_, 0, NULL, NULL);
 	}
 	void roundtrip_func()
-	{//to-do need to fill up
+	{
 	}
 	void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
                       int diag_option, int transA_option, int  transB_option,
                       size_t M, size_t N, size_t K, size_t lda, size_t ldb,
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
-		{}
+	{
+		DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+
+        initialize_scalars(alpha, beta);
+
+        buffer_.n_ = N;
+        buffer_.k_ = K;
+        buffer_.offA_ = offA;
+        buffer_.offB_ = offBX;
+        buffer_.offC_ = offCY;
+
+        if (uplo_option == 0)
+        {
+            buffer_.uplo_ = clblasUpper;
+        }
+        else
+        {
+            buffer_.uplo_ = clblasLower;
+        }
+
+
+        if (ldc == 0)
+        {
+            buffer_.ldc_ = N;
+        }
+        else if (ldc < N)
+        {
+            std::cerr << "ldc:wrong size\n";
+        }
+        else
+        {
+            buffer_.ldc_ = ldc;
+        }
+        buffer_.c_num_vectors_ = N;
+
+        if (order_option == 0)
+        {
+            order_ = clblasRowMajor;
+            if (transA_option == 0)
+            {
+                buffer_.trans_ = clblasNoTrans;
+                buffer_.a_num_vectors_ = N;
+                buffer_.b_num_vectors_ = N;
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = K;
+                }
+                else if (ldb < K)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = K;
+                buffer_.b_num_vectors_ = K;
+                if (transA_option == 1)
+                {
+                    buffer_.trans_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.trans_ = clblasConjTrans;
+                }
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = N;
+                }
+                else if (ldb < N)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+        }
+        else
+        {
+            order_ = clblasColumnMajor;
+            if (transA_option == 0)
+            {
+                buffer_.a_num_vectors_ = K;
+                buffer_.b_num_vectors_ = K;
+                buffer_.trans_ = clblasNoTrans;
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = N;
+                }
+                else if (ldb < N)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = N;
+                buffer_.b_num_vectors_ = N;
+                if (transA_option == 1)
+                {
+                    buffer_.trans_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.trans_ = clblasConjTrans;
+                }
+
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = K;
+                }
+                else if (ldb < K)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+        }
+
+        buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+        buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
+        buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+
+	}
 	void releaseGPUBuffer_deleteCPUBuffer()
 	{
 		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
 		//need to do this before we eventually hit the destructor
-		//to-do
+        delete[] buffer_.a_; // host arrays come from new T[] (see roundtrip_setup_buffer), so delete[] is required
+        delete[] buffer_.b_;
+        delete[] buffer_.c_;
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
+                        "releasing buffer A");
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_b_),
+                        "releasing buffer B");
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
+                        "releasing buffer C");
 	}
 protected:
     void initialize_scalars(double alpha, double beta)
@@ -411,6 +606,41 @@ call_func()
     timer.Stop(timer_id);
 }
 
+template<>
+void
+xSyr2k<float>::
+roundtrip_func()
+{
+    timer.Start(timer_id);
+	cl_int err;
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(float),
+                                        NULL, &err);
+    buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(float),
+                                        NULL, &err);
+	buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(float),
+                                        NULL, &err);
+
+	this->initialize_gpu_buffer();
+	clblasSsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_,
+                      buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+                      buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                      buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                      buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+	err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+								  buffer_.offC_ * sizeof(float),
+								  buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(float),
+								  buffer_.c_, 0, NULL, &event_);
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
 template<>
 void
 xSyr2k<double>::
@@ -428,6 +658,41 @@ call_func()
     timer.Stop(timer_id);
 }
 
+template<>
+void
+xSyr2k<double>::
+roundtrip_func()
+{
+    timer.Start(timer_id);
+	cl_int err;
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(double),
+                                        NULL, &err);
+    buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(double),
+                                        NULL, &err);
+	buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(double),
+                                        NULL, &err);
+
+	this->initialize_gpu_buffer();
+    clblasDsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_,
+                      buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+                      buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                      buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                      buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+	err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+								  buffer_.offC_ * sizeof(double),
+								  buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(double),
+								  buffer_.c_, 0, NULL, &event_);
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
 template<>
 void
 xSyr2k<cl_float2>::
@@ -445,6 +710,56 @@ call_func()
     timer.Stop(timer_id);
 }
 
+template<>
+void
+xSyr2k<cl_float2>::
+roundtrip_func()
+{
+    timer.Start(timer_id);
+	cl_int err;
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(cl_float2),
+                                        NULL, &err);
+    buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(cl_float2),
+                                        NULL, &err);
+	buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(cl_float2),
+                                        NULL, &err);
+	this->initialize_gpu_buffer();
+
+	clblasCsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_,
+                      buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+                      buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                      buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                      buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+	err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+								  buffer_.offC_ * sizeof(cl_float2),
+								  buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(cl_float2),
+								  buffer_.c_, 0, NULL, &event_);
+
+	clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+template<>
+double
+xSyr2k<cl_float2>::gflops()
+{
+        return 8*buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns();
+}
+
+template<>
+std::string 
+xSyr2k<cl_float2>::gflops_formula()
+{
+        return "(8*N*(N+1)*K)/time";
+}
+
 template<>
 void
 xSyr2k<cl_double2>::
@@ -462,4 +777,53 @@ call_func()
     timer.Stop(timer_id);
 }
 
+template<>
+void
+xSyr2k<cl_double2>::
+roundtrip_func()
+{
+    timer.Start(timer_id);
+	cl_int err;
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(cl_double2),
+                                        NULL, &err);
+    buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(cl_double2),
+                                        NULL, &err);
+	buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(cl_double2),
+                                        NULL, &err);
+	this->initialize_gpu_buffer();
+    clblasZsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_,
+                      buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+                      buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                      buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                      buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+	err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+								  buffer_.offC_ * sizeof(cl_double2),
+								  buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(cl_double2),
+								  buffer_.c_, 0, NULL, &event_);
+
+	clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+template<>
+double
+xSyr2k<cl_double2>::gflops()
+{
+        return 8*buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns();
+}
+
+template<>
+std::string 
+xSyr2k<cl_double2>::gflops_formula()
+{
+        return "(8*N*(N+1)*K)/time";
+}
+
 #endif // ifndef CLBLAS_BENCHMARK_XSYR2K_HXX__
diff --git a/src/client/clfunc_xsyrk.hpp b/src/client/clfunc_xsyrk.hpp
index ec842e28..c04cc1fb 100644
--- a/src/client/clfunc_xsyrk.hpp
+++ b/src/client/clfunc_xsyrk.hpp
@@ -56,13 +56,7 @@ class xSyrk : public clblasFunc
 
     ~xSyrk()
     {
-        delete buffer_.a_;
-        delete buffer_.c_;
-        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
-                        "releasing buffer A");
-        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
-                        "releasing buffer C");
-        }
+    }
 
     void call_func()
     {
@@ -70,13 +64,12 @@ class xSyrk : public clblasFunc
 
     double gflops()
     {
-        return buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns() +
-            buffer_.n_*(buffer_.n_+1)/time_in_ns();
+        return buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns();
     }
 
     std::string gflops_formula()
     {
-        return "(N*(N+1)*K+N*(N+1))/time";
+        return "(N*(N+1)*K)/time";
     }
 
     void setup_buffer(int order_option, int side_option, int uplo_option,
@@ -224,7 +217,7 @@ class xSyrk : public clblasFunc
                                             buffer_.offA_) * sizeof(T),
                                         NULL, &err);
 
-        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                         (buffer_.ldc_ * buffer_.c_num_vectors_ +
                                             buffer_.offC_) * sizeof(T),
                                         NULL, &err);
@@ -281,23 +274,163 @@ class xSyrk : public clblasFunc
     }
  	void read_gpu_buffer()
 	{
-		//cl_int err;
-		//to-do need to fill up
+		cl_int err;
+		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+								  buffer_.offC_*sizeof(T), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(T),
+								  buffer_.c_, 0, NULL, NULL);
 	}
 	void roundtrip_func()
-	{//to-do need to fill up
+	{
 	}
 	void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
                       int diag_option, int transA_option, int  transB_option,
                       size_t M, size_t N, size_t K, size_t lda, size_t ldb,
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
-		{}
+	{
+		DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+        DUMMY_ARGS_USAGE_2(ldb, offBX);
+
+        initialize_scalars(alpha, beta);
+
+        buffer_.n_ = N;
+        buffer_.k_ = K;
+        buffer_.offA_ = offA;
+        buffer_.offC_ = offCY;
+
+        if (uplo_option == 0)
+        {
+            buffer_.uplo_ = clblasUpper;
+        }
+        else
+        {
+            buffer_.uplo_ = clblasLower;
+        }
+
+
+        if (ldc == 0)
+        {
+            buffer_.ldc_ = N;
+        }
+        else if (ldc < N)
+        {
+            std::cerr << "ldc:wrong size\n";
+        }
+        else
+        {
+            buffer_.ldc_ = ldc;
+        }
+        buffer_.c_num_vectors_ = N;
+
+        if (order_option == 0)
+        {
+            order_ = clblasRowMajor;
+            if (transA_option == 0)
+            {
+                buffer_.trans_a_ = clblasNoTrans;
+                buffer_.a_num_vectors_ = N;
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = K;
+                if (transA_option == 1)
+                {
+                    buffer_.trans_a_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.trans_a_ = clblasConjTrans;
+                }
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+        }
+        else
+        {
+            order_ = clblasColumnMajor;
+            if (transA_option == 0)
+            {
+                buffer_.a_num_vectors_ = K;
+                buffer_.trans_a_ = clblasNoTrans;
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = N;
+                if (transA_option == 1)
+                {
+                    buffer_.trans_a_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.trans_a_ = clblasConjTrans;
+                }
+
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+        }
+
+        buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+        buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+	}
 	void releaseGPUBuffer_deleteCPUBuffer()
 	{
 		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
 		//need to do this before we eventually hit the destructor
-		//to-do
+		delete buffer_.a_;
+        delete buffer_.c_;
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
+                        "releasing buffer A");
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
+                        "releasing buffer C");
 	}
 protected:
     void initialize_scalars(double alpha, double beta)
@@ -327,6 +460,35 @@ call_func()
     timer.Stop(timer_id);
 }
 
+template<>
+void
+xSyrk<float>::roundtrip_func()
+{
+	timer.Start(timer_id);
+
+	cl_int err;
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(float),
+                                        NULL, &err);
+
+    buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(float),
+                                        NULL, &err);
+	this->initialize_gpu_buffer();
+	clblasSsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_,
+                     buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+                     buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                     buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+	err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+								  buffer_.offC_*sizeof(float), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(float),
+								  buffer_.c_, 0, NULL, &event_);
+
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
 template<>
 void
 xSyrk<double>::
@@ -343,6 +505,35 @@ call_func()
     timer.Stop(timer_id);
 }
 
+template<>
+void
+xSyrk<double>::roundtrip_func()
+{
+	timer.Start(timer_id);
+
+	cl_int err;
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(double),
+                                        NULL, &err);
+
+    buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(double),
+                                        NULL, &err);
+	this->initialize_gpu_buffer();
+	clblasDsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_,
+                     buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+                     buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                     buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+	err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+								  buffer_.offC_*sizeof(double), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(double),
+								  buffer_.c_, 0, NULL, &event_);
+
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
 template<>
 void
 xSyrk<cl_float2>::
@@ -359,6 +550,48 @@ call_func()
     timer.Stop(timer_id);
 }
 
+template<>
+void
+xSyrk<cl_float2>::roundtrip_func()
+{
+	timer.Start(timer_id);
+
+	cl_int err;
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(cl_float2),
+                                        NULL, &err);
+
+    buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(cl_float2),
+                                        NULL, &err);
+	this->initialize_gpu_buffer();
+	clblasCsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_,
+                     buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+                     buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                     buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+	err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+								  buffer_.offC_*sizeof(cl_float2), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(cl_float2),
+								  buffer_.c_, 0, NULL, &event_);
+
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+template<>
+double
+xSyrk<cl_float2>::gflops()
+{
+        return 4*buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns();
+}
+
+template<>
+std::string 
+xSyrk<cl_float2>::gflops_formula()
+{
+        return "(4*N*(N+1)*K)/time";
+}
 template<>
 void
 xSyrk<cl_double2>::
@@ -375,4 +608,47 @@ call_func()
     timer.Stop(timer_id);
 }
 
+template<>
+void
+xSyrk<cl_double2>::roundtrip_func()
+{
+	timer.Start(timer_id);
+
+	cl_int err;
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(cl_double2),
+                                        NULL, &err);
+
+    buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(cl_double2),
+                                        NULL, &err);
+	this->initialize_gpu_buffer();
+	clblasZsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_,
+                     buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+                     buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                     buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+	err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+								  buffer_.offC_*sizeof(cl_double2), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(cl_double2),
+								  buffer_.c_, 0, NULL, &event_);
+
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+template<>
+double
+xSyrk<cl_double2>::gflops()
+{
+        return 4*buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns();
+}
+
+template<>
+std::string 
+xSyrk<cl_double2>::gflops_formula()
+{
+        return "(4*N*(N+1)*K)/time";
+}
+
 #endif // ifndef CLBLAS_BENCHMARK_XSYRK_HXX__
diff --git a/src/client/clfunc_xtrmm.hpp b/src/client/clfunc_xtrmm.hpp
index 68034570..a018e833 100644
--- a/src/client/clfunc_xtrmm.hpp
+++ b/src/client/clfunc_xtrmm.hpp
@@ -232,7 +232,7 @@ class xTrmm : public clblasFunc
                                             buffer_.offA_) * sizeof(T),
                                         NULL, &err);
 
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                         (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                             buffer_.offB_) * sizeof(T),
                                         NULL, &err);
@@ -498,7 +498,7 @@ roundtrip_func()
                                             buffer_.offA_) * sizeof(cl_float),
                                         NULL, &err);
 
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                         (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                             buffer_.offB_) * sizeof(cl_float),
                                         NULL, &err);
@@ -562,7 +562,7 @@ roundtrip_func()
                                             buffer_.offA_) * sizeof(cl_double),
                                         NULL, &err);
 
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                         (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                             buffer_.offB_) * sizeof(cl_double),
                                         NULL, &err);
@@ -626,7 +626,7 @@ roundtrip_func()
                                             buffer_.offA_) * sizeof(cl_float2),
                                         NULL, &err);
 
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                         (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                             buffer_.offB_) * sizeof(cl_float2),
                                         NULL, &err);
@@ -690,7 +690,7 @@ roundtrip_func()
                                             buffer_.offA_) * sizeof(cl_double2),
                                         NULL, &err);
 
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                         (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                             buffer_.offB_) * sizeof(cl_double2),
                                         NULL, &err);
diff --git a/src/client/clfunc_xtrsm.hpp b/src/client/clfunc_xtrsm.hpp
index 7a86be9e..456c4880 100644
--- a/src/client/clfunc_xtrsm.hpp
+++ b/src/client/clfunc_xtrsm.hpp
@@ -231,7 +231,7 @@ class xTrsm : public clblasFunc
                                             buffer_.offA_) * sizeof(T),
                                         NULL, &err);
 
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                         (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                             buffer_.offB_) * sizeof(T),
                                          NULL, &err);
@@ -504,7 +504,7 @@ roundtrip_func()
                                             buffer_.offA_) * sizeof(cl_float),
                                         NULL, &err);
 
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                         (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                             buffer_.offB_) * sizeof(cl_float),
                                          NULL, &err);
@@ -567,7 +567,7 @@ roundtrip_func()
                                             buffer_.offA_) * sizeof(cl_double),
                                         NULL, &err);
 
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                         (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                             buffer_.offB_) * sizeof(cl_double),
                                          NULL, &err);
@@ -630,7 +630,7 @@ roundtrip_func()
                                             buffer_.offA_) * sizeof(cl_float2),
                                         NULL, &err);
 
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                         (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                             buffer_.offB_) * sizeof(cl_float2),
                                          NULL, &err);
@@ -693,7 +693,7 @@ roundtrip_func()
                                             buffer_.offA_) * sizeof(cl_double2),
                                         NULL, &err);
 
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                         (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                             buffer_.offB_) * sizeof(cl_double2),
                                          NULL, &err);
diff --git a/src/client/client.cpp b/src/client/client.cpp
index 74a8eb8d..4ce3f346 100644
--- a/src/client/client.cpp
+++ b/src/client/client.cpp
@@ -39,6 +39,8 @@
 #include "clfunc_xhemv.hpp"
 #include "clfunc_xhemm.hpp"
 #include "clfunc_xsymm.hpp"
+#include "clfunc_xherk.hpp"
+#include "clfunc_xher2k.hpp"
 
 namespace po = boost::program_options;
 
@@ -130,6 +132,8 @@ int main(int argc, char *argv[])
       && function != "hemv"
       && function != "hemm"
       && function != "symm"
+	  && function != "herk"
+	  && function != "her2k"
       )
   {
     std::cerr << "Invalid value for --function" << std::endl;
@@ -432,6 +436,30 @@ int main(int argc, char *argv[])
       return -1;
     }
   }
+  else if (function == "herk")
+  {
+    if (precision == "c")
+      my_function = new xHerk<cl_float2>(timer, deviceType);
+    else if (precision == "z")
+      my_function = new xHerk<cl_double2>(timer, deviceType);
+    else
+    {
+      std::cerr << "Unknown her function" << std::endl;
+      return -1;
+    }
+  }
+  else if (function == "her2k")
+  {
+    if (precision == "c")
+      my_function = new xHer2k<cl_float2>(timer, deviceType);
+    else if (precision == "z")
+      my_function = new xHer2k<cl_double2>(timer, deviceType);
+    else
+    {
+      std::cerr << "Unknown her2 function" << std::endl;
+      return -1;
+    }
+  }
   else if (function == "symm")
   {
     if (precision == "s")
diff --git a/src/scripts/perf/measurePerformance.py b/src/scripts/perf/measurePerformance.py
index 659d4ef6..f89674e7 100644
--- a/src/scripts/perf/measurePerformance.py
+++ b/src/scripts/perf/measurePerformance.py
@@ -42,7 +42,7 @@
 sidevalues = ['left','right']
 uplovalues = ['upper','lower']
 diagvalues = ['unit','nonunit']
-functionvalues = ['gemm', 'trmm', 'trsm', 'syrk', 'syr2k', 'gemv', 'symv' ]
+functionvalues = ['gemm', 'trmm', 'trsm', 'syrk', 'syr2k', 'gemv', 'symv', 'symm', 'hemm', 'herk', 'her2k' ]
 precisionvalues = ['s', 'd', 'c', 'z']
 roundtripvalues = ['roundtrip','noroundtrip','both']
 

From 2a0ed3b81afc3c9312171f88548ea2d666fce128 Mon Sep 17 00:00:00 2001
From: Timmy <timmy.liu@amd.com>
Date: Thu, 12 Dec 2013 14:22:39 -0600
Subject: [PATCH 30/59] fix the flops calculation formula

---
 src/client/clfunc_xsyr2k.hpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/client/clfunc_xsyr2k.hpp b/src/client/clfunc_xsyr2k.hpp
index 087329e7..d937cd5f 100644
--- a/src/client/clfunc_xsyr2k.hpp
+++ b/src/client/clfunc_xsyr2k.hpp
@@ -69,13 +69,12 @@ class xSyr2k : public clblasFunc
 
     double gflops()
     {
-        return 2.0*buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns() +
-            buffer_.n_*(buffer_.n_+1)/time_in_ns();
+        return 2*buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns();
     }
 
     std::string gflops_formula()
     {
-        return "2.0*(M*(M+1)*N+M*(M+1))/time";
+        return "(2*N*(N+1)*K)/time";
     }
 
     void setup_buffer(int order_option, int side_option, int uplo_option,

From 5d47c236279cb0a3c3f97a586004c2bfb8533d7e Mon Sep 17 00:00:00 2001
From: Timmy <timmy.liu@amd.com>
Date: Thu, 12 Dec 2013 14:36:07 -0600
Subject: [PATCH 31/59] fix the flop calculation formula for syr2k

---
 src/client/clfunc_xsyr2k.hpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/client/clfunc_xsyr2k.hpp b/src/client/clfunc_xsyr2k.hpp
index d937cd5f..414fa09a 100644
--- a/src/client/clfunc_xsyr2k.hpp
+++ b/src/client/clfunc_xsyr2k.hpp
@@ -69,12 +69,12 @@ class xSyr2k : public clblasFunc
 
     double gflops()
     {
-        return 2*buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns();
+        return (2*buffer_.k_*buffer_.n_*buffer_.n_+buffer_.n_)/time_in_ns();
     }
 
     std::string gflops_formula()
     {
-        return "(2*N*(N+1)*K)/time";
+        return "(2*K*N*N+N)/time";
     }
 
     void setup_buffer(int order_option, int side_option, int uplo_option,
@@ -749,14 +749,14 @@ template<>
 double
 xSyr2k<cl_float2>::gflops()
 {
-        return 8*buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns();
+        return (8*buffer_.k_*buffer_.n_*buffer_.n_+2*buffer_.n_)/time_in_ns();
 }
 
 template<>
 std::string 
 xSyr2k<cl_float2>::gflops_formula()
 {
-        return "(8*N*(N+1)*K)/time";
+        return "(8*K*N*N+2*N)/time";
 }
 
 template<>
@@ -815,14 +815,14 @@ template<>
 double
 xSyr2k<cl_double2>::gflops()
 {
-        return 8*buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns();
+        return (8*buffer_.k_*buffer_.n_*buffer_.n_+2*buffer_.n_)/time_in_ns();
 }
 
 template<>
 std::string 
 xSyr2k<cl_double2>::gflops_formula()
 {
-        return "(8*N*(N+1)*K)/time";
+        return "(8*K*N*N+2*N)/time";
 }
 
 #endif // ifndef CLBLAS_BENCHMARK_XSYR2K_HXX__

From 9f7538a5a771d382142c0e8a103bb0503b08cee6 Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Tue, 7 Jan 2014 16:26:19 -0600
Subject: [PATCH 32/59] The ACML_ROOT variable was being clobbered even if the
 user passed in a valid value for ACML_ROOT through the command line.  Removed
 the overwrite statement and instead added the ACML_ROOT environment variable
 to the HINTS section of find_library().

Also, added a new location on Linux to find the dependent clBLAS.so
library, for the packaging step.
---
 src/CMakeLists.txt                      | 14 ++++++++------
 src/tests/copyTestDependencies.cmake.in |  4 ++++
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 76547fc5..44f51af3 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -71,8 +71,6 @@ if( CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT )
 	set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" FORCE )
 endif( )
 
-set( ACML_ROOT $ENV{ACML_ROOT} CACHE PATH "AMD ACML root path")
-
 if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE Debug CACHE STRING
       "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel."
@@ -141,9 +139,10 @@ if( BUILD_TEST )
 
 		find_path(ACML_INCLUDE_DIRS acml.h
 			HINTS
-				$ENV{ACML_ROOT}/include
 				${ACML_ROOT}/include
 				${ACML_ROOT}/${ACML_SUBDIR}/include
+				$ENV{ACML_ROOT}/include
+                                $ENV{ACML_ROOT}/${ACML_SUBDIR}/include
 		)
 
 		if( ACML_INCLUDE_DIRS )
@@ -154,15 +153,17 @@ if( BUILD_TEST )
 		if( UNIX )
 			find_library(ACML_LIBRARIES acml acml_mp
 				HINTS
-					$ENV{ACML_ROOT}/lib
 					${ACML_ROOT}/lib
 					${ACML_ROOT}/${ACML_SUBDIR}/lib
+					$ENV{ACML_ROOT}/lib
+                                        $ENV{ACML_ROOT}/${ACML_SUBDIR}/lib
 			)
 			find_library(_acml_mv_library acml_mv
 				HINTS
-					$ENV{ACML_ROOT}/lib
 					${ACML_ROOT}/lib
 					${ACML_ROOT}/${ACML_SUBDIR}/lib
+					$ENV{ACML_ROOT}/lib
+                                        $ENV{ACML_ROOT}/${ACML_SUBDIR}/lib
 			)
 			mark_as_advanced(_acml_mv_library)
 		endif( )
@@ -170,9 +171,10 @@ if( BUILD_TEST )
 		if(WIN32)
 			find_library(ACML_LIBRARIES libacml_dll libacml_mp_dll
 				HINTS
-					$ENV{ACML_ROOT}/lib
 					${ACML_ROOT}/lib
 					${ACML_ROOT}/${ACML_SUBDIR}/lib
+					$ENV{ACML_ROOT}/lib
+                                        $ENV{ACML_ROOT}/${ACML_SUBDIR}/lib
 			)
 		endif( )
 		
diff --git a/src/tests/copyTestDependencies.cmake.in b/src/tests/copyTestDependencies.cmake.in
index 3a5d395b..357ac7af 100644
--- a/src/tests/copyTestDependencies.cmake.in
+++ b/src/tests/copyTestDependencies.cmake.in
@@ -43,6 +43,10 @@ endif( )
 
 if( EXISTS "${testDir}" )
     list( APPEND depList "${testDir}" )
+    # On linux, the .so files are not staged with the rest of the executables
+    if( UNIX )
+       list( APPEND depList "${testDir}/../library" )
+    endif( )
 endif( )
 
 # message( STATUS "depList: ${depList}" )

From 85e0803db1b5a8b57f2e30c06da0a61db67766b5 Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Fri, 10 Jan 2014 13:46:11 -0600
Subject: [PATCH 33/59] Modified search path for ACML; if looking in _mp
 subdir, make sure to search and link _mp libs accordingly

---
 src/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 44f51af3..73a12962 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -151,7 +151,7 @@ if( BUILD_TEST )
 		endif()
 		
 		if( UNIX )
-			find_library(ACML_LIBRARIES acml acml_mp
+			find_library(ACML_LIBRARIES acml_mp
 				HINTS
 					${ACML_ROOT}/lib
 					${ACML_ROOT}/${ACML_SUBDIR}/lib
@@ -169,7 +169,7 @@ if( BUILD_TEST )
 		endif( )
 		
 		if(WIN32)
-			find_library(ACML_LIBRARIES libacml_dll libacml_mp_dll
+			find_library(ACML_LIBRARIES libacml_mp_dll
 				HINTS
 					${ACML_ROOT}/lib
 					${ACML_ROOT}/${ACML_SUBDIR}/lib

From 2d5a3ea56e406cb1a74f576c587623faea7749b7 Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Fri, 24 Jan 2014 17:43:08 -0600
Subject: [PATCH 34/59] Renamed version.h to clBLAS.version.h

If the clBLAS and clFFT projects are installed into the same directory,
there is a filename collision with version.h.  Only the last file to
be installed survives, which breaks the other project.
---
 src/CMakeLists.txt                        | 4 ++--
 src/{version.h.in => clBLAS.version.h.in} | 0
 src/library/blas/init.c                   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)
 rename src/{version.h.in => clBLAS.version.h.in} (100%)

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 73a12962..4e0cc749 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -249,7 +249,7 @@ endif( )
 #TODO:  We should remove this pre-processor define for our 1.8 build; this means removing our deprecated image functions such as calls clCreateImage2D( )
 add_definitions( -DCL_USE_DEPRECATED_OPENCL_1_1_APIS )
 
-configure_file( "${PROJECT_SOURCE_DIR}/version.h.in" "${PROJECT_BINARY_DIR}/include/version.h" )
+configure_file( "${PROJECT_SOURCE_DIR}/clBLAS.version.h.in" "${PROJECT_BINARY_DIR}/include/clBLAS.version.h" )
 
 # configure a header file to pass the CMake version settings to the source, and package the header files in the output archive
 install( FILES 
@@ -257,7 +257,7 @@ install( FILES
 			"clAmdBlas.h"
 			"clAmdBlas.version.h"
 			"clBLAS-complex.h"
-			"${PROJECT_BINARY_DIR}/include/version.h"
+			"${PROJECT_BINARY_DIR}/include/clBLAS.version.h"
 		DESTINATION 
 			"./include" )
 
diff --git a/src/version.h.in b/src/clBLAS.version.h.in
similarity index 100%
rename from src/version.h.in
rename to src/clBLAS.version.h.in
diff --git a/src/library/blas/init.c b/src/library/blas/init.c
index 5095cb0f..2b257a8e 100644
--- a/src/library/blas/init.c
+++ b/src/library/blas/init.c
@@ -18,7 +18,7 @@
 #include <clBLAS.h>
 #include <toolslib.h>
 #include <kern_cache.h>
-#include <version.h>
+#include <clBLAS.version.h>
 #include <trace_malloc.h>
 
 #include "clblas-internal.h"

From 711444988a4b1ea7600f7ad0fc54229426b09db2 Mon Sep 17 00:00:00 2001
From: BenjaminCoquelle <benjamin.coquelle@amd.com>
Date: Thu, 6 Feb 2014 08:19:21 +0000
Subject: [PATCH 35/59] updating documentation

The documentation was not clear about the thread safety of this
library, so I updated it to state it explicitly.
---
 src/clBLAS.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/clBLAS.h b/src/clBLAS.h
index 6d219c33..7d89b9f6 100644
--- a/src/clBLAS.h
+++ b/src/clBLAS.h
@@ -56,6 +56,10 @@ extern "C" {
  * keeping interfaces familiar to users who know how to use BLAS. All
  * functions accept matrices through buffer objects.
  *
+ * This library is entirely thread-safe with the exception of the following API :
+ * clblasSetup and clblasTeardown. 
+ * Developers using the library can safely using any blas routine from different thread. 
+ *
  * @section deprecated
  * This library provided support for the creation of scratch images to achieve better performance
  * on older <a href="http://developer.amd.com/gpu/AMDAPPSDK/Pages/default.aspx">AMD APP SDK's</a>.

From d481f72992b552724c3c8b2596263a0ee8bda2a2 Mon Sep 17 00:00:00 2001
From: Timmy <timmy.liu@amd.com>
Date: Wed, 26 Feb 2014 15:43:08 -0600
Subject: [PATCH 36/59] add memalloc options to client (gemm and trsm)

---
 src/client/clfunc_common.hpp           |  10 +
 src/client/clfunc_xgemm.hpp            | 490 ++++++++++++-------------
 src/client/clfunc_xher2k.hpp           |   4 +
 src/client/clfunc_xherk.hpp            |   4 +
 src/client/clfunc_xsymm.hpp            |   4 +
 src/client/clfunc_xsyr2k.hpp           |   4 +
 src/client/clfunc_xsyrk.hpp            |   4 +
 src/client/clfunc_xtrmm.hpp            |   6 +-
 src/client/clfunc_xtrsm.hpp            | 409 ++++++++++-----------
 src/client/client.cpp                  |  26 +-
 src/scripts/perf/measurePerformance.py |   9 +-
 11 files changed, 512 insertions(+), 458 deletions(-)

diff --git a/src/client/clfunc_common.hpp b/src/client/clfunc_common.hpp
index 293a3b60..5f736130 100644
--- a/src/client/clfunc_common.hpp
+++ b/src/client/clfunc_common.hpp
@@ -28,6 +28,11 @@
 #include "dis_warning.h"
 
 #include "clBLAS.h"
+#if defined(__APPLE__) || defined(__MACOSX)
+#include <OpenCL/cl_ext.h>
+#else
+#include <CL/cl_ext.h>
+#endif
 
 template<typename T>
 static T
@@ -243,6 +248,7 @@ class clblasFunc
         OPENCL_V_THROW(err, "creating context");
         queue_ = clCreateCommandQueue(ctx_, device_, 0, &err);
 
+
         timer_id = timer.getUniqueID( "clfunc", 0 );
 
 
@@ -307,6 +313,10 @@ class clblasFunc
     virtual void reset_gpu_write_buffer() = 0;
 	virtual void read_gpu_buffer() = 0;
 	virtual void roundtrip_func() = 0;
+	virtual void allochostptr_roundtrip_func() {}
+	virtual void usehostptr_roundtrip_func() {}
+	virtual void copyhostptr_roundtrip_func() {}
+	virtual void usepersismem_roundtrip_func() {}
 	virtual void roundtrip_setup_buffer(int order_option, int side_option,
                               int uplo_option, int diag_option, int
                               transA_option, int transB_option,
diff --git a/src/client/clfunc_xgemm.hpp b/src/client/clfunc_xgemm.hpp
index c5f706c0..df843922 100644
--- a/src/client/clfunc_xgemm.hpp
+++ b/src/client/clfunc_xgemm.hpp
@@ -66,7 +66,9 @@ class xGemm : public clblasFunc
 
     void call_func()
     {
-        std::cout << "xGemm::call_func\n";
+		timer.Start(timer_id);
+		xGemm_Function(true);
+		timer.Stop(timer_id);
     }
 
     double gflops()
@@ -411,7 +413,215 @@ class xGemm : public clblasFunc
 
 	void roundtrip_func()
 	{
-		std::cout << "xGemm::roundtrip_func\n";
+	timer.Start(timer_id);
+		cl_int err;
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                       (buffer_.lda_*buffer_.a_num_vectors_ +
+                                           buffer_.offA_) * sizeof(T),
+                                       NULL, &err);
+
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T),
+                                        NULL, &err);
+
+        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(T),
+                                        NULL, &err);
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+                                   buffer_.offA_ * sizeof(T),
+                                   buffer_.lda_ * buffer_.a_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.a_, 0, NULL, NULL);
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                                   buffer_.offB_ * sizeof(T),
+                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.b_, 0, NULL, NULL);
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+                                   buffer_.offC_ * sizeof(T),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                   sizeof(T),
+                                   buffer_.c_, 0, NULL, NULL);
+		xGemm_Function(false);
+		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+			                      buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(T),
+								  buffer_.c_, 0, NULL, &event_);
+		clWaitForEvents(1, &event_);
+	timer.Stop(timer_id);
+	}
+	void allochostptr_roundtrip_func()
+	{
+	timer.Start(timer_id);
+
+		cl_int err;
+		// Create buffers with CL_MEM_ALLOC_HOST_PTR for zero copy
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+                                       (buffer_.lda_*buffer_.a_num_vectors_ +
+                                           buffer_.offA_) * sizeof(T),
+                                       NULL, &err);
+
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T),
+                                        NULL, &err);
+
+        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(T),
+                                        NULL, &err);
+
+		// map the buffers to pointers at host device
+		T *map_a,*map_b,*map_c;
+		map_a = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0, 
+										  (buffer_.lda_*buffer_.a_num_vectors_ +
+                                           buffer_.offA_) * sizeof(T),
+										   0, NULL, NULL, &err);
+		map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0, 
+										  (buffer_.ldb_*buffer_.b_num_vectors_ +
+                                           buffer_.offB_) * sizeof(T),
+										   0, NULL, NULL, &err);
+	    map_c = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0, 
+										  (buffer_.lda_*buffer_.c_num_vectors_ +
+                                           buffer_.offC_) * sizeof(T),
+										   0, NULL, NULL, &err);
+		// memcpy the input A, B, C to the host pointers
+		memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) );
+		memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
+		memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
+		// unmap the buffers
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_a_, map_a, 0, NULL, NULL);
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL);
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_c_, map_c, 0, NULL, NULL);
+		// calling clBLAS
+		xGemm_Function(false);
+		// map the C buffer again to read output
+	    map_c = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0, 
+										  (buffer_.lda_*buffer_.c_num_vectors_ +
+                                           buffer_.offC_) * sizeof(T),
+										   0, NULL, NULL, &err);
+		memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_c_, map_c, 0, NULL, &event_);
+		clWaitForEvents(1, &event_);
+
+	timer.Stop(timer_id);
+	}
+	void usehostptr_roundtrip_func()
+	{
+	timer.Start(timer_id);
+		cl_int err;
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+                                       (buffer_.lda_*buffer_.a_num_vectors_ +
+                                           buffer_.offA_) * sizeof(T),
+                                       buffer_.a_, &err);
+
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T),
+                                        buffer_.b_, &err);
+
+        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(T),
+                                        buffer_.c_, &err);
+		xGemm_Function(false);
+		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+			                      buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(T),
+								  buffer_.c_, 0, NULL, &event_);
+		clWaitForEvents(1, &event_);
+	timer.Stop(timer_id);
+	}
+	void copyhostptr_roundtrip_func()
+	{
+	timer.Start(timer_id);
+		cl_int err;
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                       (buffer_.lda_*buffer_.a_num_vectors_ +
+                                           buffer_.offA_) * sizeof(T),
+                                       buffer_.a_, &err);
+
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T),
+                                        buffer_.b_, &err);
+
+        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(T),
+                                        buffer_.c_, &err);
+		xGemm_Function(false);
+		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+			                      buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(T),
+								  buffer_.c_, 0, NULL, &event_);
+		clWaitForEvents(1, &event_);
+	timer.Stop(timer_id);
+	}
+	void usepersismem_roundtrip_func()
+	{
+#if defined(CL_MEM_USE_PERSISTENT_MEM_AMD)
+	timer.Start(timer_id);
+
+		cl_int err;
+
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD,
+                                       (buffer_.lda_*buffer_.a_num_vectors_ +
+                                           buffer_.offA_) * sizeof(T),
+                                       NULL, &err);
+
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T),
+                                        NULL, &err);
+
+        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_USE_PERSISTENT_MEM_AMD,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(T),
+                                        NULL, &err);
+
+		// map the buffers to pointers at host devices
+		T *map_a,*map_b,*map_c;
+		map_a = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0, 
+										  (buffer_.lda_*buffer_.a_num_vectors_ +
+                                           buffer_.offA_) * sizeof(T),
+										   0, NULL, NULL, &err);
+		map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0, 
+										  (buffer_.ldb_*buffer_.b_num_vectors_ +
+                                           buffer_.offB_) * sizeof(T),
+										   0, NULL, NULL, &err);
+	    map_c = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0, 
+										  (buffer_.lda_*buffer_.c_num_vectors_ +
+                                           buffer_.offC_) * sizeof(T),
+										   0, NULL, NULL, &err);
+		// memcpy the input A, B, C to the host pointers
+		memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) );
+		memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
+		memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
+		// unmap the buffers
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_a_, map_a, 0, NULL, NULL);
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL);
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_c_, map_c, 0, NULL, NULL);
+		// calling clBLAS
+		xGemm_Function(false);
+		// map the C buffer again to read output
+	    map_c = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0, 
+										  (buffer_.lda_*buffer_.c_num_vectors_ +
+                                           buffer_.offC_) * sizeof(T),
+										   0, NULL, NULL, &err);
+		memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_c_, map_c, 0, NULL, &event_);
+		clWaitForEvents(1, &event_);
+
+	timer.Stop(timer_id);
+#else
+		std::cout<<"CL_MEM_USE_PERSISTENT_MEM_AMD is only supported on AMD hardware"<<std::endl;
+#endif
+
 	}
 	void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
                       int diag_option, int transA_option, int  transB_option,
@@ -674,296 +884,86 @@ class xGemm : public clblasFunc
 
 private:
     xGemmBuffer<T> buffer_;
-
-}; // class xgemm
+	void xGemm_Function(bool flush);
 
 
+}; // class xgemm
 
 template<>
-void
+void 
 xGemm<cl_float>::
-call_func()
+xGemm_Function(bool flush)
 {
-    timer.Start(timer_id);
-
 	clblasSgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
                      buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
                      buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                      buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                      buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
                      buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
-
-	clWaitForEvents(1, &event_);
-    timer.Stop(timer_id);
-}
-
-template<>
-void
-xGemm<cl_float>::
-roundtrip_func()
-{
-    timer.Start(timer_id);
-	cl_int err;
-        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                       (buffer_.lda_*buffer_.a_num_vectors_ +
-                                           buffer_.offA_) * sizeof(cl_float),
-                                       NULL, &err);
-
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
-                                            buffer_.offB_) * sizeof(cl_float),
-                                        NULL, &err);
-
-        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
-                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
-                                            buffer_.offC_) * sizeof(cl_float),
-                                        NULL, &err);
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
-                                   buffer_.offA_ * sizeof(cl_float),
-                                   buffer_.lda_ * buffer_.a_num_vectors_ *
-                                       sizeof(cl_float),
-                                   buffer_.a_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-                                   buffer_.offB_ * sizeof(cl_float),
-                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_float),
-                                   buffer_.b_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
-                                   buffer_.offC_ * sizeof(cl_float),
-                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
-                                   sizeof(cl_float),
-                                   buffer_.c_, 0, NULL, NULL);
-		clblasSgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
-                     buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
-                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
-                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
-                     buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
-                     buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
-		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
-			                      buffer_.offC_ * sizeof(cl_float), buffer_.ldc_ * buffer_.c_num_vectors_ *
-                                       sizeof(cl_float),
-								  buffer_.c_, 0, NULL, &event_);
-	clWaitForEvents(1, &event_);
-	timer.Stop(timer_id);
+	//flush==true if only the kernel time (library call) is timed
+	//flush==false if memory time is also timed
+	if (flush==true)
+	{
+		clWaitForEvents(1, &event_);
+	}
 }
 
-
 template<>
-void
+void 
 xGemm<cl_double>::
-call_func()
+xGemm_Function(bool flush)
 {
-    timer.Start(timer_id);
-
 	clblasDgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
                      buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
                      buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                      buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                      buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
                      buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
-
-    clWaitForEvents(1, &event_);
-    timer.Stop(timer_id);
-}
-
-template<>
-void
-xGemm<cl_double>::
-roundtrip_func()
+	//flush==true if only the kernel time (library call) is timed
+	//flush==false if memory time is also timed
+	if (flush==true)
 	{
-    timer.Start(timer_id);
-	cl_int err;
-	//set up buffer
-        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                       (buffer_.lda_*buffer_.a_num_vectors_ +
-                                           buffer_.offA_) * sizeof(cl_double),
-                                       NULL, &err);
-
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
-                                            buffer_.offB_) * sizeof(cl_double),
-                                        NULL, &err);
-
-        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
-                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
-                                            buffer_.offC_) * sizeof(cl_double),
-                                        NULL, &err);
-		//initialize gpu buffer
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
-                                   buffer_.offA_ * sizeof(cl_double),
-                                   buffer_.lda_ * buffer_.a_num_vectors_ *
-                                       sizeof(cl_double),
-                                   buffer_.a_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-                                   buffer_.offB_ * sizeof(cl_double),
-                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_double),
-                                   buffer_.b_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
-                                   buffer_.offC_ * sizeof(cl_double),
-                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
-                                   sizeof(cl_double),
-                                   buffer_.c_, 0, NULL, NULL);
-		//call_func
-		clblasDgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
-                     buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
-                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
-                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
-                     buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
-                     buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
-		//read gpu buffer
-		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
-			                      buffer_.offC_ * sizeof(cl_double), buffer_.ldc_ * buffer_.c_num_vectors_ *
-                                       sizeof(cl_double),
-								  buffer_.c_, 0, NULL, &event_);
-	clWaitForEvents(1, &event_);
-	timer.Stop(timer_id);
+		clWaitForEvents(1, &event_);
 	}
+}
 
 template<>
-void
+void 
 xGemm<cl_float2>::
-call_func()
+xGemm_Function(bool flush)
 {
-    timer.Start(timer_id);
-
-    clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+	clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
                      buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
                      buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                      buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                      buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
                      buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
-
-	clWaitForEvents(1, &event_);
-    timer.Stop(timer_id);
-}
-
-template<>
-void
-xGemm<cl_float2>::
-roundtrip_func()
+	//flush==true if only the kernel time (library call) is timed
+	//flush==false if memory time is also timed
+	if (flush==true)
 	{
-    timer.Start(timer_id);
-	cl_int err;
-        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                       (buffer_.lda_*buffer_.a_num_vectors_ +
-                                           buffer_.offA_) * sizeof(cl_float2),
-                                       NULL, &err);
-
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
-                                            buffer_.offB_) * sizeof(cl_float2),
-                                        NULL, &err);
-
-        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
-                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
-                                            buffer_.offC_) * sizeof(cl_float2),
-                                        NULL, &err);
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
-                                   buffer_.offA_ * sizeof(cl_float2),
-                                   buffer_.lda_ * buffer_.a_num_vectors_ *
-                                       sizeof(cl_float2),
-                                   buffer_.a_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-                                   buffer_.offB_ * sizeof(cl_float2),
-                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_float2),
-                                   buffer_.b_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
-                                   buffer_.offC_ * sizeof(cl_float2),
-                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
-                                   sizeof(cl_float2),
-                                   buffer_.c_, 0, NULL, NULL);
-		clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
-                     buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
-                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
-                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
-                     buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
-                     buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
-		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
-			                      buffer_.offC_ * sizeof(cl_float2), buffer_.ldc_ * buffer_.c_num_vectors_ *
-                                       sizeof(cl_float2),
-								  buffer_.c_, 0, NULL, &event_);
-	clWaitForEvents(1, &event_);
-	timer.Stop(timer_id);
+		clWaitForEvents(1, &event_);
 	}
+}
 
 template<>
-void
+void 
 xGemm<cl_double2>::
-call_func()
+xGemm_Function(bool flush)
 {
-    timer.Start(timer_id);
-
-    clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+	clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
                      buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
                      buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                      buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                      buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
                      buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
-
-	clWaitForEvents(1, &event_);
-    timer.Stop(timer_id);
-}
-
-template<>
-void
-xGemm<cl_double2>::
-roundtrip_func()
+	//flush==true if only the kernel time (library call) is timed
+	//flush==false if memory time is also timed
+	if (flush==true)
 	{
-    timer.Start(timer_id);
-	cl_int err;
-        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                       (buffer_.lda_*buffer_.a_num_vectors_ +
-                                           buffer_.offA_) * sizeof(cl_double2),
-                                       NULL, &err);
-
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
-                                            buffer_.offB_) * sizeof(cl_double2),
-                                        NULL, &err);
-
-        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
-                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
-                                            buffer_.offC_) * sizeof(cl_double2),
-                                        NULL, &err);
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
-                                   buffer_.offA_ * sizeof(cl_double2),
-                                   buffer_.lda_ * buffer_.a_num_vectors_ *
-                                       sizeof(cl_double2),
-                                   buffer_.a_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-                                   buffer_.offB_ * sizeof(cl_double2),
-                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_double2),
-                                   buffer_.b_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
-                                   buffer_.offC_ * sizeof(cl_double2),
-                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
-                                   sizeof(cl_double2),
-                                   buffer_.c_, 0, NULL, NULL);
-		clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
-                     buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
-                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
-                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
-                     buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
-                     buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
-		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
-			                      buffer_.offC_ * sizeof(cl_double2), buffer_.ldc_ * buffer_.c_num_vectors_ *
-                                       sizeof(cl_double2),
-								  buffer_.c_, 0, NULL, &event_);
-	clWaitForEvents(1, &event_);
-	timer.Stop(timer_id);
+		clWaitForEvents(1, &event_);
 	}
-
+}
 
 template<>
 double
diff --git a/src/client/clfunc_xher2k.hpp b/src/client/clfunc_xher2k.hpp
index 088d9283..15095fa8 100644
--- a/src/client/clfunc_xher2k.hpp
+++ b/src/client/clfunc_xher2k.hpp
@@ -344,6 +344,10 @@ class xHer2k : public clblasFunc
 								  buffer_.cpuC_, 0, NULL, NULL);
 	}
 	void roundtrip_func();
+	void zerocopy_roundtrip_func()
+	{
+		std::cout << "xHer2k::zerocopy_roundtrip_func\n";	// placeholder: no zero-copy round trip is implemented here; only prints this class's own name
+	}
 	void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
                       int diag_option, int transA_option, int  transB_option,
                       size_t M, size_t N, size_t K, size_t lda, size_t ldb,
diff --git a/src/client/clfunc_xherk.hpp b/src/client/clfunc_xherk.hpp
index 110c1078..74871a39 100644
--- a/src/client/clfunc_xherk.hpp
+++ b/src/client/clfunc_xherk.hpp
@@ -273,6 +273,10 @@ class xHerk : public clblasFunc
 								  buffer_.cpuC_, 0, NULL, NULL);
 	}
 	void roundtrip_func();
+	void zerocopy_roundtrip_func()
+	{
+		std::cout << "xHerk::zerocopy_roundtrip_func\n";	// placeholder: no zero-copy round trip is implemented here; only prints this class's own name
+	}
 	void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
                       int diag_option, int transA_option, int  transB_option,
                       size_t M, size_t N, size_t K, size_t lda, size_t ldb,
diff --git a/src/client/clfunc_xsymm.hpp b/src/client/clfunc_xsymm.hpp
index 25a29244..a7558e92 100644
--- a/src/client/clfunc_xsymm.hpp
+++ b/src/client/clfunc_xsymm.hpp
@@ -98,6 +98,10 @@ class xSymm : public clblasFunc
 	{
 				std::cout << "xSymm::roundtrip_func\n";
 	}
+	void zerocopy_roundtrip_func()
+	{
+		std::cout << "xSymm::zerocopy_roundtrip_func\n";	// placeholder: performs no OpenCL work, only prints the function name
+	}
   void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
                       int diag_option, int transA_option, int  transB_option,
                       size_t M, size_t N, size_t K, size_t lda, size_t ldb,
diff --git a/src/client/clfunc_xsyr2k.hpp b/src/client/clfunc_xsyr2k.hpp
index 414fa09a..ae60f9e0 100644
--- a/src/client/clfunc_xsyr2k.hpp
+++ b/src/client/clfunc_xsyr2k.hpp
@@ -364,6 +364,10 @@ class xSyr2k : public clblasFunc
 	void roundtrip_func()
 	{
 	}
+	void zerocopy_roundtrip_func()
+	{
+		std::cout << "xSyr2k::zerocopy_roundtrip_func\n";	// placeholder: no zero-copy round trip is implemented here; only prints this class's own name
+	}
 	void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
                       int diag_option, int transA_option, int  transB_option,
                       size_t M, size_t N, size_t K, size_t lda, size_t ldb,
diff --git a/src/client/clfunc_xsyrk.hpp b/src/client/clfunc_xsyrk.hpp
index c04cc1fb..e9b6a7a5 100644
--- a/src/client/clfunc_xsyrk.hpp
+++ b/src/client/clfunc_xsyrk.hpp
@@ -282,6 +282,10 @@ class xSyrk : public clblasFunc
 	void roundtrip_func()
 	{
 	}
+	void zerocopy_roundtrip_func()
+	{
+		std::cout << "xSyrk::zerocopy_roundtrip_func\n";	// placeholder: performs no OpenCL work, only prints the function name
+	}
 	void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
                       int diag_option, int transA_option, int  transB_option,
                       size_t M, size_t N, size_t K, size_t lda, size_t ldb,
diff --git a/src/client/clfunc_xtrmm.hpp b/src/client/clfunc_xtrmm.hpp
index a018e833..2e05300c 100644
--- a/src/client/clfunc_xtrmm.hpp
+++ b/src/client/clfunc_xtrmm.hpp
@@ -304,7 +304,11 @@ class xTrmm : public clblasFunc
 	}
 	void roundtrip_func()
 	{
-		std::cout << "xGemm::roundtrip_func\n";
+		std::cout << "xTrmm::roundtrip_func\n";	// placeholder: performs no OpenCL work, only prints the function name
+	}
+	void zerocopy_roundtrip_func()
+	{
+		std::cout << "xTrmm::zerocopy_roundtrip_func\n";	// placeholder: performs no OpenCL work, only prints the function name
 	}
 	void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
                       int diag_option, int transA_option, int  transB_option,
diff --git a/src/client/clfunc_xtrsm.hpp b/src/client/clfunc_xtrsm.hpp
index 456c4880..2eb64cfb 100644
--- a/src/client/clfunc_xtrsm.hpp
+++ b/src/client/clfunc_xtrsm.hpp
@@ -22,6 +22,7 @@
 
 #include "clfunc_common.hpp"
 
+
 template <typename T>
 struct xTrsmBuffer
 {
@@ -61,7 +62,9 @@ class xTrsm : public clblasFunc
 
     void call_func()
     {
-        std::cout << "xtrsm::call_func\n";
+    timer.Start(timer_id);	// the timed region covers only the TRSM call below
+	xTrsm_Function(true);	// true: the per-type specialization waits on event_ before returning
+    timer.Stop(timer_id);
     }
 
     double gflops()
@@ -311,7 +314,179 @@ class xTrsm : public clblasFunc
 	}
 	void roundtrip_func()
 	{
-		std::cout << "xtrsm::call_func\n";
+	timer.Start(timer_id);
+	    // create device buffers with default allocation flags: A is read-only, B is read-write (TRSM result is read back from B)
+        cl_int err;
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(T),
+                                        NULL, &err);
+
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T),
+                                         NULL, &err);
+		// blocking writes of the host arrays into the device buffers, starting at byte offset off * sizeof(T)
+		err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+                                   buffer_.offA_ * sizeof(T),
+                                   buffer_.lda_ * buffer_.a_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.a_, 0, NULL, NULL);
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                                   buffer_.offB_ * sizeof(T),
+                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.b_, 0, NULL, NULL);
+		// enqueue the TRSM; false = do not wait on its event here
+		xTrsm_Function(false);
+		// blocking read of B back into the host array; event_ is overwritten with the read's event
+		err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+			                      buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                       sizeof(T),
+								  buffer_.b_, 0, NULL, &event_);
+		clWaitForEvents(1, &event_);
+	timer.Stop(timer_id);
+	}
+	void allochostptr_roundtrip_func()	// timed TRSM round trip using CL_MEM_ALLOC_HOST_PTR buffers that are filled and read back through map/unmap
+	{
+	timer.Start(timer_id);
+	    // create A (read-only) and B (read-write); each holds ld * num_vectors + offset elements of T
+        cl_int err;
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(T),
+                                        NULL, &err);
+
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T),
+                                         NULL, &err);
+		// map both buffers for writing (blocking); each mapping must span its OWN buffer's size, which is what the memcpy below writes
+		T *map_a,*map_b;
+		map_a = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0,
+                                          (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(T),
+											0, NULL, NULL, &err);
+		map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0,
+                                          (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T),
+											0, NULL, NULL, &err);
+		// memcpy the input A, B to the mapped regions
+		memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) );
+		memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
+		// unmap the buffers so the written contents are handed back before the kernel is enqueued
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_a_, map_a, 0, NULL, NULL);
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL);
+		// enqueue the TRSM; false = do not wait on its event here
+		xTrsm_Function(false);
+		// map the B buffer again (blocking, read-only) and copy the result OUT of the mapping into the host array
+		map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_READ, 0,
+                                          (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T),
+											0, NULL, NULL, &err);
+		memcpy( buffer_.b_, map_b, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL);
+		clWaitForEvents(1, &event_);
+	timer.Stop(timer_id);
+	}
+	void usehostptr_roundtrip_func()	// timed TRSM round trip with buffers created over the existing host arrays (CL_MEM_USE_HOST_PTR)
+	{
+	timer.Start(timer_id);
+	    // create A (read-only) and B (read-write) using buffer_.a_ / buffer_.b_ as the host_ptr; no explicit upload follows
+        cl_int err;
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(T),
+                                        buffer_.a_, &err);
+
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T),
+                                         buffer_.b_, &err);
+		// enqueue the TRSM; false = do not wait on its event here
+		xTrsm_Function(false);
+		// blocking read of B. NOTE(review): the destination buffer_.b_ is the same allocation passed as host_ptr for buf_b_, and the read starts at byte offset offB_ -- confirm this overlap is intended when offB_ != 0
+		err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+			                      buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                       sizeof(T),
+								  buffer_.b_, 0, NULL, &event_);
+		clWaitForEvents(1, &event_);
+	timer.Stop(timer_id);
+	}
+	void copyhostptr_roundtrip_func()	// timed TRSM round trip with buffers initialised from the host arrays at creation (CL_MEM_COPY_HOST_PTR)
+	{
+	timer.Start(timer_id);
+	    // create A (read-only) and B (read-write); buffer_.a_ / buffer_.b_ are passed as host_ptr, so no separate write is enqueued
+        cl_int err;
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(T),
+                                        buffer_.a_, &err);
+
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T),
+                                         buffer_.b_, &err);
+		// enqueue the TRSM; false = do not wait on its event here
+		xTrsm_Function(false);
+		// blocking read of B back into the host array; event_ is overwritten with the read's event
+		err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+			                      buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                       sizeof(T),
+								  buffer_.b_, 0, NULL, &event_);
+	clWaitForEvents(1, &event_);
+	timer.Stop(timer_id);
+	}
+	void usepersismem_roundtrip_func()	// timed TRSM round trip using CL_MEM_USE_PERSISTENT_MEM_AMD buffers; prints a notice when that flag is not defined
+	{
+#if defined(CL_MEM_USE_PERSISTENT_MEM_AMD)
+	timer.Start(timer_id);
+	    // create A (read-only) and B (read-write); each holds ld * num_vectors + offset elements of T
+        cl_int err;
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(T),
+                                        NULL, &err);
+
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_USE_PERSISTENT_MEM_AMD,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T),
+                                         NULL, &err);
+		// map both buffers for writing (blocking); each mapping must span its OWN buffer's size, which is what the memcpy below writes
+		T *map_a,*map_b;
+		map_a = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0,
+                                          (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(T),
+											0, NULL, NULL, &err);
+		map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0,
+                                          (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T),
+											0, NULL, NULL, &err);
+		// memcpy the input A, B to the mapped regions
+		memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) );
+		memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
+		// unmap the buffers so the written contents are handed back before the kernel is enqueued
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_a_, map_a, 0, NULL, NULL);
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL);
+		// enqueue the TRSM; false = do not wait on its event here
+		xTrsm_Function(false);
+		// map the B buffer again (blocking, read-only) and copy the result OUT of the mapping into the host array
+		map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_READ, 0,
+                                          (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T),
+											0, NULL, NULL, &err);
+		memcpy( buffer_.b_, map_b, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL);
+	clWaitForEvents(1, &event_);
+	timer.Stop(timer_id);
+#else
+		std::cout<<"CL_MEM_USE_PERSISTENT_MEM_AMD is only supported on AMD hardware"<<std::endl;
+#endif
+	}
+	void zerocopy_roundtrip_func()
+	{
+		std::cout << "xTrsm::zerocopy_roundtrip_func\n";	// placeholder: no zero-copy round trip is implemented here; only prints this class's own name
 	}
 	void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
                       int diag_option, int transA_option, int  transB_option,
@@ -470,261 +645,79 @@ class xTrsm : public clblasFunc
 
 private:
     xTrsmBuffer<T> buffer_;
+	void xTrsm_Function(bool flush);
 
 }; // class xtrsm
 
 template<>
 void
 xTrsm<cl_float>::
-call_func()
+xTrsm_Function(bool flush)
 {
-    timer.Start(timer_id);
-
     clblasStrsm(order_, buffer_.side_, buffer_.uplo_,
                      buffer_.trans_a_, buffer_.diag_,
                      buffer_.m_, buffer_.n_, buffer_.alpha_,
                      buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                      buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                      1, &queue_, 0, NULL, &event_);
-
-    clWaitForEvents(1, &event_);
-    timer.Stop(timer_id);
-}
-
-template<>
-void
-xTrsm<cl_float>::
-roundtrip_func()
-{
-	timer.Start(timer_id);
-	    //set up buffer
-        cl_int err;
-        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
-                                            buffer_.offA_) * sizeof(cl_float),
-                                        NULL, &err);
-
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
-                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
-                                            buffer_.offB_) * sizeof(cl_float),
-                                         NULL, &err);
-		//initialize gpu buffer
-		err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
-                                   buffer_.offA_ * sizeof(cl_float),
-                                   buffer_.lda_ * buffer_.a_num_vectors_ *
-                                       sizeof(cl_float),
-                                   buffer_.a_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-                                   buffer_.offB_ * sizeof(cl_float),
-                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_float),
-                                   buffer_.b_, 0, NULL, NULL);
-		//call func
-		clblasStrsm(order_, buffer_.side_, buffer_.uplo_,
-                     buffer_.trans_a_, buffer_.diag_,
-                     buffer_.m_, buffer_.n_, buffer_.alpha_,
-                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
-                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
-                     1, &queue_, 0, NULL, NULL);
-		//read gpu buffer
-		err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-			                      buffer_.offB_ * sizeof(cl_float), buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_float),
-								  buffer_.b_, 0, NULL, &event_);
-	clWaitForEvents(1, &event_);
-	timer.Stop(timer_id);
+	if(flush==true)
+	{
+		clWaitForEvents(1, &event_);
+	}
 }
 
 template<>
 void
 xTrsm<cl_double>::
-call_func()
+xTrsm_Function(bool flush)
 {
-    timer.Start(timer_id);
-
     clblasDtrsm(order_, buffer_.side_, buffer_.uplo_,
                      buffer_.trans_a_, buffer_.diag_,
                      buffer_.m_, buffer_.n_, buffer_.alpha_,
                      buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                      buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                      1, &queue_, 0, NULL, &event_);
-
-    clWaitForEvents(1, &event_);
-    timer.Stop(timer_id);
-}
-
-template<>
-void
-xTrsm<cl_double>::
-roundtrip_func()
-{
-	timer.Start(timer_id);
-	    //set up buffer
-        cl_int err;
-        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
-                                            buffer_.offA_) * sizeof(cl_double),
-                                        NULL, &err);
-
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
-                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
-                                            buffer_.offB_) * sizeof(cl_double),
-                                         NULL, &err);
-		//initialize gpu buffer
-		err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
-                                   buffer_.offA_ * sizeof(cl_double),
-                                   buffer_.lda_ * buffer_.a_num_vectors_ *
-                                       sizeof(cl_double),
-                                   buffer_.a_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-                                   buffer_.offB_ * sizeof(cl_double),
-                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_double),
-                                   buffer_.b_, 0, NULL, NULL);
-		//call func
-		clblasDtrsm(order_, buffer_.side_, buffer_.uplo_,
-                     buffer_.trans_a_, buffer_.diag_,
-                     buffer_.m_, buffer_.n_, buffer_.alpha_,
-                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
-                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
-                     1, &queue_, 0, NULL, NULL);
-		//read gpu buffer
-		err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-			                      buffer_.offB_ * sizeof(cl_double), buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_double),
-								  buffer_.b_, 0, NULL, &event_);
-	clWaitForEvents(1, &event_);
-	timer.Stop(timer_id);
+	if(flush==true)
+	{
+		clWaitForEvents(1, &event_);
+	}
 }
 
 template<>
 void
 xTrsm<cl_float2>::
-call_func()
+xTrsm_Function(bool flush)
 {
-    timer.Start(timer_id);
-
     clblasCtrsm(order_, buffer_.side_, buffer_.uplo_,
                      buffer_.trans_a_, buffer_.diag_,
                      buffer_.m_, buffer_.n_, buffer_.alpha_,
                      buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                      buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                      1, &queue_, 0, NULL, &event_);
-
-  clWaitForEvents(1, &event_);
-  timer.Stop(timer_id);
-}
-
-template<>
-void
-xTrsm<cl_float2>::
-roundtrip_func()
-{
-	timer.Start(timer_id);
-	    //set up buffer
-        cl_int err;
-        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
-                                            buffer_.offA_) * sizeof(cl_float2),
-                                        NULL, &err);
-
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
-                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
-                                            buffer_.offB_) * sizeof(cl_float2),
-                                         NULL, &err);
-		//initialize gpu buffer
-		err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
-                                   buffer_.offA_ * sizeof(cl_float2),
-                                   buffer_.lda_ * buffer_.a_num_vectors_ *
-                                       sizeof(cl_float2),
-                                   buffer_.a_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-                                   buffer_.offB_ * sizeof(cl_float2),
-                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_float2),
-                                   buffer_.b_, 0, NULL, NULL);
-		//call func
-		clblasCtrsm(order_, buffer_.side_, buffer_.uplo_,
-                     buffer_.trans_a_, buffer_.diag_,
-                     buffer_.m_, buffer_.n_, buffer_.alpha_,
-                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
-                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
-                     1, &queue_, 0, NULL, NULL);
-		//read gpu buffer
-		err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-			                      buffer_.offB_ * sizeof(cl_float2), buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_float2),
-								  buffer_.b_, 0, NULL, &event_);
-	clWaitForEvents(1, &event_);
-	timer.Stop(timer_id);
-}
-
-template<>
-void
-xTrsm<cl_double2>::
-call_func()
-{
-  timer.Start(timer_id);
-
-  clblasZtrsm(order_, buffer_.side_, buffer_.uplo_,
-                   buffer_.trans_a_, buffer_.diag_,
-                   buffer_.m_, buffer_.n_, buffer_.alpha_,
-                   buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
-                   buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
-                   1, &queue_, 0, NULL, &event_);
-
-      clWaitForEvents(1, &event_);
-      timer.Stop(timer_id);
+	if(flush==true)
+	{
+		clWaitForEvents(1, &event_);
+	}
 }
 
 template<>
 void
 xTrsm<cl_double2>::
-roundtrip_func()
+xTrsm_Function(bool flush)
 {
-	timer.Start(timer_id);
-	    //set up buffer
-        cl_int err;
-        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
-                                            buffer_.offA_) * sizeof(cl_double2),
-                                        NULL, &err);
-
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
-                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
-                                            buffer_.offB_) * sizeof(cl_double2),
-                                         NULL, &err);
-		//initialize gpu buffer
-		err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
-                                   buffer_.offA_ * sizeof(cl_double2),
-                                   buffer_.lda_ * buffer_.a_num_vectors_ *
-                                       sizeof(cl_double2),
-                                   buffer_.a_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-                                   buffer_.offB_ * sizeof(cl_double2),
-                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_double2),
-                                   buffer_.b_, 0, NULL, NULL);
-		//call func
-		clblasZtrsm(order_, buffer_.side_, buffer_.uplo_,
+    clblasZtrsm(order_, buffer_.side_, buffer_.uplo_,
                      buffer_.trans_a_, buffer_.diag_,
                      buffer_.m_, buffer_.n_, buffer_.alpha_,
                      buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                      buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
-                     1, &queue_, 0, NULL, NULL);
-		//read gpu buffer
-		err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-			                      buffer_.offB_ * sizeof(cl_double2), buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_double2),
-								  buffer_.b_, 0, NULL, &event_);
-	clWaitForEvents(1, &event_);
-	timer.Stop(timer_id);
+                     1, &queue_, 0, NULL, &event_);
+	if(flush==true)
+	{
+		clWaitForEvents(1, &event_);
+	}
 }
 
+
 template<>
 double
 xTrsm<cl_float2>::
diff --git a/src/client/client.cpp b/src/client/client.cpp
index 4ce3f346..a55def31 100644
--- a/src/client/client.cpp
+++ b/src/client/client.cpp
@@ -69,6 +69,7 @@ int main(int argc, char *argv[])
   std::string function;
   std::string precision;
   std::string roundtrip;
+  std::string memalloc;
   int side_option;
   int uplo_option;
   int diag_option;
@@ -100,7 +101,8 @@ int main(int argc, char *argv[])
     ( "uplo", po::value<int>( &uplo_option )->default_value(0), "0 = upper, 1 = lower. only used with [list of function families]" )  // xsymv xsyrk xsyr2k xtrsm xtrmm
     ( "diag", po::value<int>( &diag_option )->default_value(0), "0 = unit diagonal, 1 = non unit diagonal. only used with [list of function families]" ) // xtrsm xtrmm
     ( "profile,p", po::value<cl_uint>( &profileCount )->default_value(20), "Time and report the kernel speed (default: profiling off)" )
-	( "roundtrip", po::value<std::string>( &roundtrip )->default_value("noroundtrip"),"calculate the time for round trips")
+	( "roundtrip", po::value<std::string>( &roundtrip )->default_value("noroundtrip"),"including the time of OpenCL memory allocation and transportation; options:roundtrip, noroundtrip(default)")
+	( "memalloc", po::value<std::string>( &memalloc )->default_value("default"),"setting the memory allocation flags for OpenCL; would not take effect if roundtrip time is not measured; options:default(default),alloc_host_ptr,use_host_ptr,copy_host_ptr,use_persistent_mem_amd")
     ;
 
   po::variables_map vm;
@@ -511,7 +513,27 @@ int main(int argc, char *argv[])
     my_function->call_func();
 	my_function->read_gpu_buffer();
     my_function->reset_gpu_write_buffer();*/
-	my_function->roundtrip_func();
+	
+	if(memalloc=="default")
+	{
+		my_function->roundtrip_func();
+	}
+	else if (memalloc=="alloc_host_ptr")
+	{
+		my_function->allochostptr_roundtrip_func();
+	}
+	else if (memalloc=="use_host_ptr")
+	{
+		my_function->usehostptr_roundtrip_func();
+	}
+	else if (memalloc=="copy_host_ptr")
+	{
+		my_function->copyhostptr_roundtrip_func();
+	}
+	else if (memalloc=="use_persistent_mem_amd")
+	{
+		my_function->usepersismem_roundtrip_func();
+	}
 	//my_function->reset_gpu_write_buffer();
 	my_function->releaseGPUBuffer_deleteCPUBuffer();
   }
diff --git a/src/scripts/perf/measurePerformance.py b/src/scripts/perf/measurePerformance.py
index f89674e7..8559e66d 100644
--- a/src/scripts/perf/measurePerformance.py
+++ b/src/scripts/perf/measurePerformance.py
@@ -45,6 +45,7 @@
 functionvalues = ['gemm', 'trmm', 'trsm', 'syrk', 'syr2k', 'gemv', 'symv', 'symm', 'hemm', 'herk', 'her2k' ]
 precisionvalues = ['s', 'd', 'c', 'z']
 roundtripvalues = ['roundtrip','noroundtrip','both']
+memallocvalues = ['default','alloc_host_ptr','use_host_ptr','copy_host_ptr','use_persistent_mem_amd']
 
 parser = argparse.ArgumentParser(description='Measure performance of the clblas library')
 parser.add_argument('--device',
@@ -125,6 +126,9 @@
 parser.add_argument('--roundtrip',
     dest='roundtrip', default='noroundtrip',
     help='whether measure the roundtrips or not. choices are ' + str(roundtripvalues) + '. (default noroundtrip); should not be specified when calling ACML')
+parser.add_argument('--memalloc',
+	dest='memalloc', default='default',
+	help='set the flags for OpenCL memory allocation. Choices are ' + str(memallocvalues) + '. (default is default); do not need to set when calling ACML or if roundtrip is not set')
 ini_group = parser.add_mutually_exclusive_group()
 ini_group.add_argument('--createini',
     dest='createIniFilename', default=None, type=argparse.FileType('w'),
@@ -138,6 +142,7 @@
 label = str(args.label)
 roundtrip = str(args.roundtrip)
 library = str(args.library)
+memalloc = str(args.memalloc)
 
 subprocess.call('mkdir perfLog', shell = True)
 logfile = os.path.join('perfLog', (label+'-'+'blasMeasurePerfLog.txt'))
@@ -145,7 +150,6 @@
 def printLog(txt):
     print txt
     log(logfile, txt)
-printLog(roundtrip)
 printLog("=========================MEASURE PERFORMANCE START===========================")
 printLog("Process id of Measure Performance:"+str(os.getpid()))
 
@@ -449,7 +453,8 @@ def executeCommand():
                      '--function', function,
                      '--precision', precision,
                      '-p', '10',
-					 '--roundtrip', roundtrip]
+					 '--roundtrip', roundtrip,
+					 '--memalloc', memalloc]
     else:
         printLog( 'ERROR: unknown library:"' +library+ '" can\'t assemble command')
         quit()

From da0a6383a335fa7e21d3ce7749d2347949217ced Mon Sep 17 00:00:00 2001
From: Timmy <timmy.liu@amd.com>
Date: Fri, 7 Mar 2014 16:18:40 -0600
Subject: [PATCH 37/59] bug fix

---
 src/client/clfunc_xgemm.hpp | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/src/client/clfunc_xgemm.hpp b/src/client/clfunc_xgemm.hpp
index df843922..9e6836d6 100644
--- a/src/client/clfunc_xgemm.hpp
+++ b/src/client/clfunc_xgemm.hpp
@@ -554,12 +554,7 @@ class xGemm : public clblasFunc
                                         (buffer_.ldc_ * buffer_.c_num_vectors_ +
                                             buffer_.offC_) * sizeof(T),
                                         buffer_.c_, &err);
-		xGemm_Function(false);
-		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
-			                      buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
-                                       sizeof(T),
-								  buffer_.c_, 0, NULL, &event_);
-		clWaitForEvents(1, &event_);
+		xGemm_Function(true);
 	timer.Stop(timer_id);
 	}
 	void usepersismem_roundtrip_func()

From fae4a1b508c5f9ce664d67c5b7640aa773f272f4 Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Tue, 11 Mar 2014 12:11:32 -0500
Subject: [PATCH 38/59] A fix for an uninitialized variable in the tune tool. 
 The problem would manifest itself when the --store-kernels flag was used on
 the command line. Comments were added to clarify a few sections of code.

---
 src/include/trace_malloc.h    |  2 +-
 src/library/tools/tune/tune.c | 53 ++++++++++++++++++++++++++++-------
 2 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/src/include/trace_malloc.h b/src/include/trace_malloc.h
index 3dfa3152..acc97531 100644
--- a/src/include/trace_malloc.h
+++ b/src/include/trace_malloc.h
@@ -48,7 +48,7 @@ void releaseMallocTrace(void);
 
 static __inline void initMallocTrace(void)
 {
-    /* do noting */
+    /* do nothing */
 }
 
 static __inline void printMallocStatistics(void)
diff --git a/src/library/tools/tune/tune.c b/src/library/tools/tune/tune.c
index 8050cb5a..b6174c4e 100644
--- a/src/library/tools/tune/tune.c
+++ b/src/library/tools/tune/tune.c
@@ -309,12 +309,12 @@ struct GeneratorInfoRec {
     DeviceInfo          deviceInfos;    // Todo delete this member. Use TargetDevice.
     char                *deviceName;    //
 
-    bool       aFunc[BLAS_FUNCTIONS_NUMBER];
+    bool       aFunc[BLAS_FUNCTIONS_NUMBER];    //  True/false value if the corresponding function should be tuned
     int        aPattern;
-    bool       aDType[TYPE_NUMBER];
+    bool       aDType[TYPE_NUMBER]; //  True/false value if the precision should be tuned; s/d/c/z
     int        aFlag;
     int        aCommand;
-    bool       aIsKernel;
+    bool       aIsKernel;   // True/false value to store binary kernels into the kernel database
     int        aMaxparam;
     bool       aExtendedOutput;
     bool       aAll;
@@ -2247,14 +2247,29 @@ generateKernelForOthersFlag( BlasExtraInfo* bExtra,
                             bestParamOther[nDim]->count++;
                     }
                 }
+
+                //  If the user selected that they want to store the kernel binaries to disk,
+                //  and we do not have those binaries, generate them again
                 if (genInfo.aIsKernel && bestParamOther[nDim]->kernel == NULL) {
+                    MatrixInfo mi [DIMARRAYCOUNT];
                     unsigned int func = bFunc->funcNo;
                     unsigned int patt = bPatt->pattNo;
+
+                    //  Initialize resources to generate kernels in genAllKernel
                     initCLBLASExtra(&extra, bExtra);
-                    genAllKernel(&args, extra, bestParamOther[nDim],
-                                 pattern, func, patt);
-                    logKernalGen();
+                    initMatrixInfo( mi, extra.dtype, &genInfo.deviceInfos, bExtra );
+                    initCLBlasKArgDim( &args, mi, extra.flags );
+
+                    genAllKernel(&args, extra, bestParamOther[nDim], pattern, func, patt);
+
+                    //  Free those resources when finished
+                    releaseMemObjAll( mi, bExtra );
+                    destroyMatrixInfo( mi, bExtra );
+
+                    logKernalGen( );
                 }
+
+                //  This stores the kernel binaries to disk
                 saveBestParams(bExtraOther, bestParamOther);
             }
             deleteGParams(bExtraOther, bestParamOther);
@@ -2304,13 +2319,22 @@ createFile(void)
     bool isEnvPattSelected = false;
     unsigned int dev;
 
-    initOpenCl();
+    //  This intializes global genInfo with either the last detected platform, or the
+    //  first AMD platform it finds.  It records the number of devices in that platform.
+    initOpenCl( );
+
     // For each devices
     for (dev = 0; dev < genInfo.numDevices; dev++) {
     	initDevice(dev);
+
+        //  The following creates the .kdb file on disk according to the set environment variable
         writeStorageCache(&genInfo.targetDevice);
-        getContext();
-        configurePattern();
+
+        //  The following creates the OpenCL context and commanqueue for the first device in global genInfo struct
+        getContext( );
+
+        //  Does nothing; nop
+        configurePattern( );
 
         // for each function
         for (funcId = 0; funcId < BLAS_FUNCTIONS_NUMBER; funcId++) {
@@ -2378,6 +2402,9 @@ createFile(void)
                     bExtra = &(bPatt->extra[nExtra]);
                     genInfo.last = 0;
 
+                    //  This evaluates whether the current combination of parameters from the given function should be tuned or not
+                    //  If skipFlags returns 1, then the this combination is skipped
+                    //  This checks for hardcoded combinations which are skipped because of known runtime bugs.  
                     if ( skipFlags(bExtra,
                             pattId,
                             funcId,
@@ -2386,6 +2413,7 @@ createFile(void)
                         continue;
                     }
 
+                    //  Similar logic to skipFlags, but this mostly filters out cases that were specified on the command line
                     if (isFilter(bExtra, pattId, funcId)) {
                         continue;
                     }
@@ -2636,8 +2664,13 @@ main(int argc, char*  argv[])
 {
     FILE_PATH = getenv(ENV_FILE_PATH);
 
-    initGeneratorInfoRec();
+    //  This clears and initializes the global GeneratorInfoRec genInfo struct
+    initGeneratorInfoRec( );
     parseArg(argc, argv);
+
+    //  This will
+    //  Set up the global clblasSolvers for all function families supported within blas, including initializing memory patterns
+    //  Identify all recognized devices in the system
     clblasSetup();
 
     if (!FILE_PATH){

From 15c03be40361c56bc5109ced72864607077efc00 Mon Sep 17 00:00:00 2001
From: Timmy <timmy.liu@amd.com>
Date: Fri, 14 Mar 2014 16:04:27 -0500
Subject: [PATCH 39/59] enable rect read/write for gemm

---
 src/client/clfunc_common.hpp |  1 +
 src/client/clfunc_xgemm.hpp  | 97 +++++++++++++++++++++++++++++++++---
 src/client/client.cpp        |  6 ++-
 3 files changed, 96 insertions(+), 8 deletions(-)

diff --git a/src/client/clfunc_common.hpp b/src/client/clfunc_common.hpp
index 5f736130..bda11866 100644
--- a/src/client/clfunc_common.hpp
+++ b/src/client/clfunc_common.hpp
@@ -313,6 +313,7 @@ class clblasFunc
     virtual void reset_gpu_write_buffer() = 0;
 	virtual void read_gpu_buffer() = 0;
 	virtual void roundtrip_func() = 0;
+	virtual void roundtrip_func_rect() {}
 	virtual void allochostptr_roundtrip_func() {}
 	virtual void usehostptr_roundtrip_func() {}
 	virtual void copyhostptr_roundtrip_func() {}
diff --git a/src/client/clfunc_xgemm.hpp b/src/client/clfunc_xgemm.hpp
index 9e6836d6..f5552b2d 100644
--- a/src/client/clfunc_xgemm.hpp
+++ b/src/client/clfunc_xgemm.hpp
@@ -454,6 +454,89 @@ class xGemm : public clblasFunc
 		clWaitForEvents(1, &event_);
 	timer.Stop(timer_id);
 	}
+	void roundtrip_func_rect()
+	{
+	timer.Start(timer_id);
+		cl_int err;
+		//rect
+		size_t a_buffer_origin[3] = {0,0,0}; 
+		size_t a_host_origin[3] = {0,0,0};
+		size_t a_region[3] = {buffer_.m_*sizeof(T),buffer_.k_,1};
+		size_t a_buffer_row_pitch=0*sizeof(T);//lda
+		size_t a_buffer_slice_pitch=0;
+		size_t a_host_row_pitch=buffer_.lda_*sizeof(T);
+		size_t a_host_slice_pitch=0;
+
+		size_t b_buffer_origin[3] = {0,0,0}; 
+		size_t b_host_origin[3] = {0,0,0};
+		size_t b_region[3] = {buffer_.k_*sizeof(T),buffer_.n_,1};
+		size_t b_buffer_row_pitch=0*sizeof(T);//ldb
+		size_t b_buffer_slice_pitch=0;
+		size_t b_host_row_pitch=buffer_.ldb_*sizeof(T);
+		size_t b_host_slice_pitch=0;
+
+		size_t c_buffer_origin[3] = {0,0,0}; 
+		size_t c_host_origin[3] = {0,0,0};
+		size_t c_region[3] = {buffer_.m_*sizeof(T),buffer_.n_,1};
+		size_t c_buffer_row_pitch=0*sizeof(T);//ldc
+		size_t c_buffer_slice_pitch=0;
+		size_t c_host_row_pitch=buffer_.ldc_*sizeof(T);
+		size_t c_host_slice_pitch=0;
+
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                       (buffer_.k_*buffer_.m_ +
+                                           buffer_.offA_) * sizeof(T),
+                                       NULL, &err);
+
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.k_ * buffer_.n_ +
+                                            buffer_.offB_) * sizeof(T),
+                                        NULL, &err);
+
+        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.m_ * buffer_.n_ +
+                                            buffer_.offC_) * sizeof(T),
+                                        NULL, &err);
+        /*
+		err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+                                   buffer_.offA_ * sizeof(T),
+                                   buffer_.lda_ * buffer_.a_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.a_, 0, NULL, NULL);
+		
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                                   buffer_.offB_ * sizeof(T),
+                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.b_, 0, NULL, NULL);
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+                                   buffer_.offC_ * sizeof(T),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                   sizeof(T),
+                                   buffer_.c_, 0, NULL, NULL);*/
+        err = clEnqueueWriteBufferRect(queue_, buffer_.buf_a_, CL_TRUE, a_buffer_origin, a_host_origin, a_region, a_buffer_row_pitch,
+										a_buffer_slice_pitch, a_host_row_pitch, a_host_slice_pitch, buffer_.a_, 0, NULL, NULL);
+        err = clEnqueueWriteBufferRect(queue_, buffer_.buf_b_, CL_TRUE, b_buffer_origin, b_host_origin, b_region, b_buffer_row_pitch,
+										b_buffer_slice_pitch, b_host_row_pitch, b_host_slice_pitch, buffer_.b_, 0, NULL, NULL);
+        err = clEnqueueWriteBufferRect(queue_, buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch,
+										c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, NULL);
+
+		buffer_.lda_ = 0;
+        buffer_.ldb_ = 0;
+        buffer_.ldc_ = 0;
+		xGemm_Function(false);
+		/*
+		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+			                      buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(T),
+								  buffer_.c_, 0, NULL, &event_);
+		*/
+		err = ::clEnqueueReadBufferRect(queue_, buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch,
+										c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, &event_);
+		clWaitForEvents(1, &event_);
+	timer.Stop(timer_id);
+	}	
 	void allochostptr_roundtrip_func()
 	{
 	timer.Start(timer_id);
@@ -528,12 +611,7 @@ class xGemm : public clblasFunc
                                         (buffer_.ldc_ * buffer_.c_num_vectors_ +
                                             buffer_.offC_) * sizeof(T),
                                         buffer_.c_, &err);
-		xGemm_Function(false);
-		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
-			                      buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
-                                       sizeof(T),
-								  buffer_.c_, 0, NULL, &event_);
-		clWaitForEvents(1, &event_);
+		xGemm_Function(true);
 	timer.Stop(timer_id);
 	}
 	void copyhostptr_roundtrip_func()
@@ -554,7 +632,12 @@ class xGemm : public clblasFunc
                                         (buffer_.ldc_ * buffer_.c_num_vectors_ +
                                             buffer_.offC_) * sizeof(T),
                                         buffer_.c_, &err);
-		xGemm_Function(true);
+		xGemm_Function(false);
+		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+			                      buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(T),
+								  buffer_.c_, 0, NULL, &event_);
+		clWaitForEvents(1, &event_);
 	timer.Stop(timer_id);
 	}
 	void usepersismem_roundtrip_func()
diff --git a/src/client/client.cpp b/src/client/client.cpp
index a55def31..16186095 100644
--- a/src/client/client.cpp
+++ b/src/client/client.cpp
@@ -102,7 +102,7 @@ int main(int argc, char *argv[])
     ( "diag", po::value<int>( &diag_option )->default_value(0), "0 = unit diagonal, 1 = non unit diagonal. only used with [list of function families]" ) // xtrsm xtrmm
     ( "profile,p", po::value<cl_uint>( &profileCount )->default_value(20), "Time and report the kernel speed (default: profiling off)" )
 	( "roundtrip", po::value<std::string>( &roundtrip )->default_value("noroundtrip"),"including the time of OpenCL memory allocation and transportation; options:roundtrip, noroundtrip(default)")
-	( "memalloc", po::value<std::string>( &memalloc )->default_value("default"),"setting the memory allocation flags for OpenCL; would not take effect if roundtrip time is not measured; options:default(default),alloc_host_ptr,use_host_ptr,copy_host_ptr,use_persistent_mem_amd")
+	( "memalloc", po::value<std::string>( &memalloc )->default_value("default"),"setting the memory allocation flags for OpenCL; would not take effect if roundtrip time is not measured; options:default(default),alloc_host_ptr,use_host_ptr,copy_host_ptr,use_persistent_mem_amd,rect_mem")
     ;
 
   po::variables_map vm;
@@ -534,6 +534,10 @@ int main(int argc, char *argv[])
 	{
 		my_function->usepersismem_roundtrip_func();
 	}
+	else if (memalloc=="rect_mem")
+	{
+		my_function->roundtrip_func_rect();
+	}
 	//my_function->reset_gpu_write_buffer();
 	my_function->releaseGPUBuffer_deleteCPUBuffer();
   }

From e9dc0f10c2c9239d08aa8e0b5b0210362bdaf80b Mon Sep 17 00:00:00 2001
From: Timmy <timmy.liu@amd.com>
Date: Mon, 17 Mar 2014 15:32:37 -0500
Subject: [PATCH 40/59] update

---
 src/client/clfunc_xgemm.hpp | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/src/client/clfunc_xgemm.hpp b/src/client/clfunc_xgemm.hpp
index f5552b2d..cb2725d2 100644
--- a/src/client/clfunc_xgemm.hpp
+++ b/src/client/clfunc_xgemm.hpp
@@ -522,9 +522,23 @@ class xGemm : public clblasFunc
         err = clEnqueueWriteBufferRect(queue_, buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch,
 										c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, NULL);
 
-		buffer_.lda_ = 0;
-        buffer_.ldb_ = 0;
-        buffer_.ldc_ = 0;
+		if(buffer_.trans_a_==clblasNoTrans)
+		{
+			buffer_.lda_=buffer_.m_;
+		}
+		else
+		{
+			buffer_.lda_=buffer_.k_;
+		}
+		if(buffer_.trans_b_==clblasNoTrans)
+		{
+			buffer_.ldb_=buffer_.k_;
+		}
+		else
+		{
+			buffer_.ldb_=buffer_.m_;
+		}
+		buffer_.ldc_=buffer_.m_;
 		xGemm_Function(false);
 		/*
 		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,

From 7e239b69a582d9185a701ea36f78215191ccb4d2 Mon Sep 17 00:00:00 2001
From: Timmy <timmy.liu@amd.com>
Date: Mon, 17 Mar 2014 17:20:54 -0500
Subject: [PATCH 41/59] bug fix

---
 src/client/clfunc_xgemm.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/client/clfunc_xgemm.hpp b/src/client/clfunc_xgemm.hpp
index cb2725d2..fcd40a79 100644
--- a/src/client/clfunc_xgemm.hpp
+++ b/src/client/clfunc_xgemm.hpp
@@ -536,7 +536,7 @@ class xGemm : public clblasFunc
 		}
 		else
 		{
-			buffer_.ldb_=buffer_.m_;
+			buffer_.ldb_=buffer_.n_;
 		}
 		buffer_.ldc_=buffer_.m_;
 		xGemm_Function(false);

From f607ed6026d4e7c84e933e2d93f4557133d142bb Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Thu, 27 Mar 2014 14:45:49 -0500
Subject: [PATCH 42/59] Improved detection of the OpenCL dependencies during
 the 'copy' step. A little refactoring of the FindOpenCL to make it more
 standard

---
 src/FindOpenCL.cmake                    | 58 ++++++++++++-------------
 src/tests/copyTestDependencies.cmake.in | 10 ++++-
 2 files changed, 38 insertions(+), 30 deletions(-)

diff --git a/src/FindOpenCL.cmake b/src/FindOpenCL.cmake
index 4491269e..8725612f 100644
--- a/src/FindOpenCL.cmake
+++ b/src/FindOpenCL.cmake
@@ -46,23 +46,17 @@
 #    target_link_libraries(foo ${OPENCL_LIBRARIES})
 #
 #-----------------------
-if( DEFINED ENV{AMDAPPSDKROOT} )
-	set( OPENCL_ROOT $ENV{AMDAPPSDKROOT} CACHE PATH "Environment variable defining the root of OPENCL implementation" )
-elseif( DEFINED ENV{CUDA_PATH} )
-        set( OPENCL_ROOT $ENV{CUDA_PATH} CACHE PATH "Environment variable defining the root of OPENCL implementation" )
-else( )
-	set( OPENCL_ROOT "/usr/lib" CACHE PATH "Environment variable defining the root of OPENCL implementation" )
-endif( )
 
 find_path(OPENCL_INCLUDE_DIRS
-	NAMES OpenCL/cl.h CL/cl.h
+    NAMES OpenCL/cl.h CL/cl.h
     HINTS
-		${OPENCL_ROOT}/include
-		ENV AMDAPPSDKROOT/include
-	PATHS
-		/usr/include
-		/usr/local/include
-	DOC "OpenCL header file path"
+        ${OPENCL_ROOT}/include
+        $ENV{AMDAPPSDKROOT}/include
+        $ENV{CUDA_PATH}/include
+    PATHS
+        /usr/include
+        /usr/local/include
+    DOC "OpenCL header file path"
 )
 mark_as_advanced( OPENCL_INCLUDE_DIRS )
 
@@ -70,23 +64,29 @@ mark_as_advanced( OPENCL_INCLUDE_DIRS )
 get_property( LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS )
 
 if( LIB64 )
-	find_library( OPENCL_LIBRARIES
-		NAMES OpenCL
-		HINTS
+    find_library( OPENCL_LIBRARIES
+        NAMES OpenCL
+        HINTS
             ${OPENCL_ROOT}/lib
-            ENV AMDAPPSDKROOT/lib
-		DOC "OpenCL dynamic library path"
-		PATH_SUFFIXES x86_64 x64
-	)
+            $ENV{AMDAPPSDKROOT}/lib
+            $ENV{CUDA_PATH}/lib
+        DOC "OpenCL dynamic library path"
+        PATH_SUFFIXES x86_64 x64
+        PATHS
+            /usr/lib
+    )
 else( )
-	find_library( OPENCL_LIBRARIES
-		NAMES OpenCL
-		HINTS
+    find_library( OPENCL_LIBRARIES
+        NAMES OpenCL
+        HINTS
             ${OPENCL_ROOT}/lib
-            ENV AMDAPPSDKROOT/lib
-		DOC "OpenCL dynamic library path"
-		PATH_SUFFIXES x86 Win32
-	)
+            $ENV{AMDAPPSDKROOT}/lib
+            $ENV{CUDA_PATH}/lib
+        DOC "OpenCL dynamic library path"
+        PATH_SUFFIXES x86 Win32
+        PATHS
+            /usr/lib
+    )
 endif( )
 mark_as_advanced( OPENCL_LIBRARIES )
 
@@ -94,5 +94,5 @@ include( FindPackageHandleStandardArgs )
 FIND_PACKAGE_HANDLE_STANDARD_ARGS( OPENCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS )
 
 if( NOT OPENCL_FOUND )
-	message( STATUS "FindOpenCL looked for libraries named: OpenCL" )
+    message( STATUS "FindOpenCL looked for libraries named: OpenCL" )
 endif()
diff --git a/src/tests/copyTestDependencies.cmake.in b/src/tests/copyTestDependencies.cmake.in
index 357ac7af..e42ddb80 100644
--- a/src/tests/copyTestDependencies.cmake.in
+++ b/src/tests/copyTestDependencies.cmake.in
@@ -34,13 +34,21 @@ endif( )
 set( depList "" )
 
 if( EXISTS "@ACML_ROOT@" )
-    list( APPEND depList "@ACML_ROOT@/lib" )
+    list( APPEND depList "@ACML_ROOT@/@ACML_SUBDIR@/lib" )
 endif( )
 
 if( EXISTS "@GTEST_ROOT@" )
     list( APPEND depList "@GTEST_ROOT@/lib@SUFFIX_LIB@" )
 endif( )
 
+if( EXISTS "@OPENCL_LIBRARIES@" )
+    get_filename_component( clLibName "@OPENCL_LIBRARIES@" NAME )
+    string( REPLACE ${clLibName} "" clLibDir "@OPENCL_LIBRARIES@" )
+    string( REGEX REPLACE "/+$" "" clLibDir ${clLibDir} )
+
+    list( APPEND depList "${clLibDir}" )
+endif( )
+ 
 if( EXISTS "${testDir}" )
     list( APPEND depList "${testDir}" )
     # On linux, the .so files are not staged with the rest of the executables

From d910be6385f807838217a6d71ceedd292c9fa0f4 Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Tue, 1 Apr 2014 13:52:10 -0500
Subject: [PATCH 43/59] Update to the cmake script to copy dependencies into
 packages. This should make the logic more robust in Windows

---
 src/tests/copyTestDependencies.cmake.in | 39 ++++++++++++++++---------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/src/tests/copyTestDependencies.cmake.in b/src/tests/copyTestDependencies.cmake.in
index e42ddb80..5da7127a 100644
--- a/src/tests/copyTestDependencies.cmake.in
+++ b/src/tests/copyTestDependencies.cmake.in
@@ -18,9 +18,7 @@ endif( )
 
 #    message( fixedTestLocation ": ${fixedTestLocation}" )
 # Get the directory that the test executable resides in; this helps get_prerequisites( ) find dependent libraries
-get_filename_component( testName "${fixedTestLocation}" NAME )
-string( REPLACE ${testName} "" testDir ${fixedTestLocation} )
-string( REGEX REPLACE "/+$" "" testDir ${testDir} )
+get_filename_component( testDir "${fixedTestLocation}" DIRECTORY )
 #    message( testDir ": ${testDir}" )
 
 set( installPath "" )
@@ -33,20 +31,35 @@ endif( )
 # Only search for dependencies that have ROOT defined
 set( depList "" )
 
-if( EXISTS "@ACML_ROOT@" )
-    list( APPEND depList "@ACML_ROOT@/@ACML_SUBDIR@/lib" )
+#This logic assumes that clBLAS CMakeLists.txt has been called
+get_filename_component( acmlDir "@ACML_LIBRARIES@" DIRECTORY )
+
+if( EXISTS "${acmlDir}" )
+    list( APPEND depList "${acmlDir}" )
+#    message( "acmlDir: ${acmlDir}" )
+endif( )
+
+#This logic assumes that FindGTest.cmake has been called
+get_filename_component( gtestDir "@GTEST_LIBRARY@" DIRECTORY )
+get_filename_component( gtestDirDebug "@GTEST_LIBRARY_DEBUG@" DIRECTORY )
+
+if( EXISTS "${gtestDir}" )
+    list( APPEND depList "${gtestDir}" )
+#    message( "gtestDir: ${gtestDir}" )
 endif( )
 
-if( EXISTS "@GTEST_ROOT@" )
-    list( APPEND depList "@GTEST_ROOT@/lib@SUFFIX_LIB@" )
+string( COMPARE NOTEQUAL "${gtestDir}" "${gtestDirDebug}" gtestDiffDirs )
+if( ${gtestDiffDirs} AND EXISTS "${gtestDirDebug}" )
+    list( APPEND depList "${gtestDirDebug}" )
+#    message( "gtestDirDebug: ${gtestDirDebug}" )
 endif( )
 
-if( EXISTS "@OPENCL_LIBRARIES@" )
-    get_filename_component( clLibName "@OPENCL_LIBRARIES@" NAME )
-    string( REPLACE ${clLibName} "" clLibDir "@OPENCL_LIBRARIES@" )
-    string( REGEX REPLACE "/+$" "" clLibDir ${clLibDir} )
+#This logic assumes that FindOpenCL.cmake has been called
+get_filename_component( openclDir "@OPENCL_LIBRARIES@" DIRECTORY )
 
-    list( APPEND depList "${clLibDir}" )
+if( EXISTS "${openclDir}" )
+    list( APPEND depList "${openclDir}" )
+#    message( "openclDir: ${openclDir}" )
 endif( )
  
 if( EXISTS "${testDir}" )
@@ -57,7 +70,7 @@ if( EXISTS "${testDir}" )
     endif( )
 endif( )
 
-# message( STATUS "depList: ${depList}" )
+# message( "depList: ${depList}" )
 
 # This retrieves a list of shared library dependencies from the target; they are not full path names
 # Skip system dependencies and skip recursion

From d71bb2f0a599edc92a6b15b042496b1904e987e8 Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Tue, 1 Apr 2014 15:35:49 -0500
Subject: [PATCH 44/59] Travis fix: The DIRECTORY tag on
 get_filename_component( ) was introduced in cmake 2.8.10.2, which Travis CI
 does not have by default.  Revert to the old name PATH.

Updates to the main README.md file to incorporate google group links, and
updates to the build dependencies section.
---
 README.md                               | 328 ++++++++++++++----------
 src/tests/copyTestDependencies.cmake.in |  10 +-
 2 files changed, 198 insertions(+), 140 deletions(-)

diff --git a/README.md b/README.md
index e3a24d1f..728a3c01 100644
--- a/README.md
+++ b/README.md
@@ -2,158 +2,216 @@ clBLAS
 =====
 [![Build Status](https://travis-ci.org/clMathLibraries/clBLAS.png)](https://travis-ci.org/clMathLibraries/clBLAS)
 
-This repository houses the code for the OpenCL™ BLAS portion of clMath.  The complete set of BLAS level 1, 2 & 3 routines is implemented. Please see <a href="http://www.netlib.org/blas/index.html"> Netlib BLAS </a> for the list of supported routines.  In addition to GPU devices, the library also supports running on CPU devices to facilitate debugging and multicore programming.  <a href="http://developer.amd.com/tools-and-sdks/heterogeneous-computing/amd-accelerated-parallel-processing-math-libraries/">APPML 1.10</a> is the most current generally available pre-packaged binary version of the library available for download for both Linux and Windows platforms.
 
-The primary goal of clBLAS is to make it easier for developers to utilize the inherent performance and power efficiency benefits of heterogeneous computing.  clBLAS interfaces do not hide nor wrap OpenCL interfaces, but rather leaves OpenCL state management to the control of the user to allow for maximum performance and flexibility.  The clBLAS library does generate and enqueue optimized OpenCL kernels, relieving the user from the task of writing, optimizing and maintaining kernel code themselves.
+This repository houses the code for the OpenCL™ BLAS portion of clMath.
+The complete set of BLAS level 1, 2 & 3 routines is implemented. Please
+see Netlib BLAS for the list of supported routines. In addition to GPU
+devices, the library also supports running on CPU devices to facilitate
+debugging and multicore programming. APPML 1.10 is the most current
+generally available pre-packaged binary version of the library available
+for download for both Linux and Windows platforms.
+
+The primary goal of clBLAS is to make it easier for developers to
+utilize the inherent performance and power efficiency benefits of
+heterogeneous computing. clBLAS interfaces do not hide nor wrap OpenCL
+interfaces, but rather leaves OpenCL state management to the control of
+the user to allow for maximum performance and flexibility. The clBLAS
+library does generate and enqueue optimized OpenCL kernels, relieving
+the user from the task of writing, optimizing and maintaining kernel
+code themselves.
 
 ## clBLAS library user documentation
-[Library and API documentation]( http://clmathlibraries.github.io/clBLAS/ ) for developers is available online as a GitHub Pages website
+
+[Library and API documentation][] for developers is available online as
+a GitHub Pages website
+
+### Google Groups
+
+Two mailing lists have been created for the clMath projects:
+
+-   [clmath@googlegroups.com][] - group whose focus is to answer
+    questions on using the library or reporting issues
+
+-   [clmath-developers@googlegroups.com][] - group whose focus is for
+    developers interested in contributing to the library code itself
 
 ## clBLAS Wiki
-The [project wiki](https://github.com/clMathLibraries/clBLAS/wiki) contains helpful documentation, including a [build primer](https://github.com/clMathLibraries/clBLAS/wiki/Build)
+
+The [project wiki][] contains helpful documentation, including a [build
+primer][]
 
 ## Contributing code
-Please refer to and read the [Contributing](CONTRIBUTING.md) document for guidelines on how to contribute code to this open source project
+
+Please refer to and read the [Contributing][] document for guidelines on
+how to contribute code to this open source project. The code in the
+/master branch is considered to be stable, and all pull-requests should
+be made against the /develop branch.
 
 ## License
-The source for clBLAS is licensed under the [Apache License, Version 2.0]( http://www.apache.org/licenses/LICENSE-2.0 )
+
+The source for clBLAS is licensed under the [Apache License, Version
+2.0][]
 
 ## Example
-The simple example below shows how to use clBLAS to compute an OpenCL accelerated SGEMM
-
-```c
-#include <sys/types.h>
-#include <stdio.h>
-
-/* Include the clBLAS header. It includes the appropriate OpenCL headers
- */
-#include <clBLAS.h>
-
-/* This example uses predefined matrices and their characteristics for
- * simplicity purpose.
- */
-
-#define M  4
-#define N  3
-#define K  5
-
-static const cl_float alpha = 10;
-
-static const cl_float A[M*K] = {
-    11, 12, 13, 14, 15,
-    21, 22, 23, 24, 25,
-    31, 32, 33, 34, 35,
-    41, 42, 43, 44, 45,
-};
-static const size_t lda = K;        /* i.e. lda = K */
-
-static const cl_float B[K*N] = {
-    11, 12, 13,
-    21, 22, 23,
-    31, 32, 33,
-    41, 42, 43,
-    51, 52, 53,
-};
-static const size_t ldb = N;        /* i.e. ldb = N */
-
-static const cl_float beta = 20;
-
-static cl_float C[M*N] = {
-    11, 12, 13,
-    21, 22, 23,
-    31, 32, 33,
-    41, 42, 43, 
-};
-static const size_t ldc = N;        /* i.e. ldc = N */
-
-static cl_float result[M*N];
-
-int main( void )
-{
-    cl_int err;
-    cl_platform_id platform = 0;
-    cl_device_id device = 0;
-    cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
-    cl_context ctx = 0;
-    cl_command_queue queue = 0;
-    cl_mem bufA, bufB, bufC;
-    cl_event event = NULL;
-    int ret = 0;
-
-    /* Setup OpenCL environment. */
-    err = clGetPlatformIDs( 1, &platform, NULL );
-    err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL );
-
-    props[1] = (cl_context_properties)platform;
-    ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
-    queue = clCreateCommandQueue( ctx, device, 0, &err );
-
-    /* Setup clBLAS */
-    err = clblasSetup( );
-
-    /* Prepare OpenCL memory objects and place matrices inside them. */
-    bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
-                          NULL, &err );
-    bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
-                          NULL, &err );
-    bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
-                          NULL, &err );
-
-    err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0,
-        M * K * sizeof( *A ), A, 0, NULL, NULL );
-    err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0,
-        K * N * sizeof( *B ), B, 0, NULL, NULL );
-    err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0,
-        M * N * sizeof( *C ), C, 0, NULL, NULL );
-
-    /* Call clBLAS extended function. Perform gemm for the lower right sub-matrices */
-    err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans, 
-							M, N, K,
-							alpha, bufA, 0, lda,
-							bufB, 0, ldb, beta,
-							bufC, 0, ldc,
-							1, &queue, 0, NULL, &event );
-
-    /* Wait for calculations to be finished. */
-    err = clWaitForEvents( 1, &event );
-
-    /* Fetch results of calculations from GPU memory. */
-    err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0,
-                                M * N * sizeof(*result),
-                                result, 0, NULL, NULL );
-
-    /* Release OpenCL memory objects. */
-    clReleaseMemObject( bufC );
-    clReleaseMemObject( bufB );
-    clReleaseMemObject( bufA );
-
-    /* Finalize work with clBLAS */
-    clblasTeardown( );
-
-    /* Release OpenCL working objects. */
-    clReleaseCommandQueue( queue );
-    clReleaseContext( ctx );
-
-    return ret;
-}
-```
+
+The simple example below shows how to use clBLAS to compute an OpenCL
+accelerated SGEMM
+
+    #include <sys/types.h>
+    #include <stdio.h>
+
+    /* Include the clBLAS header. It includes the appropriate OpenCL headers
+     */
+    #include <clBLAS.h>
+
+    /* This example uses predefined matrices and their characteristics for
+     * simplicity purpose.
+     */
+
+    #define M  4
+    #define N  3
+    #define K  5
+
+    static const cl_float alpha = 10;
+
+    static const cl_float A[M*K] = {
+        11, 12, 13, 14, 15,
+        21, 22, 23, 24, 25,
+        31, 32, 33, 34, 35,
+        41, 42, 43, 44, 45,
+    };
+    static const size_t lda = K;        /* i.e. lda = K */
+
+    static const cl_float B[K*N] = {
+        11, 12, 13,
+        21, 22, 23,
+        31, 32, 33,
+        41, 42, 43,
+        51, 52, 53,
+    };
+    static const size_t ldb = N;        /* i.e. ldb = N */
+
+    static const cl_float beta = 20;
+
+    static cl_float C[M*N] = {
+        11, 12, 13,
+        21, 22, 23,
+        31, 32, 33,
+        41, 42, 43, 
+    };
+    static const size_t ldc = N;        /* i.e. ldc = N */
+
+    static cl_float result[M*N];
+
+    int main( void )
+    {
+        cl_int err;
+        cl_platform_id platform = 0;
+        cl_device_id device = 0;
+        cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 };
+        cl_context ctx = 0;
+        cl_command_queue queue = 0;
+        cl_mem bufA, bufB, bufC;
+        cl_event event = NULL;
+        int ret = 0;
+
+        /* Setup OpenCL environment. */
+        err = clGetPlatformIDs( 1, &platform, NULL );
+        err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL );
+
+        props[1] = (cl_context_properties)platform;
+        ctx = clCreateContext( props, 1, &device, NULL, NULL, &err );
+        queue = clCreateCommandQueue( ctx, device, 0, &err );
+
+        /* Setup clBLAS */
+        err = clblasSetup( );
+
+        /* Prepare OpenCL memory objects and place matrices inside them. */
+        bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A),
+                              NULL, &err );
+        bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B),
+                              NULL, &err );
+        bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C),
+                              NULL, &err );
+
+        err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0,
+            M * K * sizeof( *A ), A, 0, NULL, NULL );
+        err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0,
+            K * N * sizeof( *B ), B, 0, NULL, NULL );
+        err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0,
+            M * N * sizeof( *C ), C, 0, NULL, NULL );
+
+        /* Call clBLAS extended function. Perform gemm for the lower right sub-matrices */
+        err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans, 
+                                M, N, K,
+                                alpha, bufA, 0, lda,
+                                bufB, 0, ldb, beta,
+                                bufC, 0, ldc,
+                                1, &queue, 0, NULL, &event );
+
+        /* Wait for calculations to be finished. */
+        err = clWaitForEvents( 1, &event );
+
+        /* Fetch results of calculations from GPU memory. */
+        err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0,
+                                    M * N * sizeof(*result),
+                                    result, 0, NULL, NULL );
+
+        /* Release OpenCL memory objects. */
+        clReleaseMemObject( bufC );
+        clReleaseMemObject( bufB );
+        clReleaseMemObject( bufA );
+
+        /* Finalize work with clBLAS */
+        clblasTeardown( );
+
+        /* Release OpenCL working objects. */
+        clReleaseCommandQueue( queue );
+        clReleaseContext( ctx );
+
+        return ret;
+    }
 
 ## Build dependencies
+
 ### Library for Windows
-*  Windows® 7/8
-*  Visual Studio 2010 SP1, 2012
-*  An OpenCL SDK, such as APP SDK 2.8
-*  Latest CMake
+
+-   Windows® 7/8
+
+-   Visual Studio 2010 SP1, 2012
+
+-   An OpenCL SDK, such as APP SDK 2.9
+
+-   Latest CMake
 
 ### Library for Linux
-*  GCC 4.6 and onwards
-*  An OpenCL SDK, such as APP SDK 2.8
-*  Latest CMake
+
+-   GCC 4.6 and onwards
+
+-   An OpenCL SDK, such as APP SDK 2.9
+
+-   Latest CMake
+
+### Library for Mac OSX
+
+-   Recommended to generate Unix makefiles with cmake
 
 ### Test infrastructure
-* Latest Googletest
-* Latest ACML 
-* Latest Boost
+
+-   Googletest v1.6
+
+-   ACML on windows/linux; Accelerate on Mac OSX
+
+-   Latest Boost
 
 ### Performance infrastructure
-* Python
+
+-   Python
+
+  [Library and API documentation]: http://clmathlibraries.github.io/clBLAS/
+  [clmath@googlegroups.com]: https://groups.google.com/forum/#!forum/clmath
+  [clmath-developers@googlegroups.com]: https://groups.google.com/forum/#!forum/clmath-developers
+  [project wiki]: https://github.com/clMathLibraries/clBLAS/wiki
+  [build primer]: https://github.com/clMathLibraries/clBLAS/wiki/Build
+  [Contributing]: CONTRIBUTING.md
+  [Apache License, Version 2.0]: http://www.apache.org/licenses/LICENSE-2.0
diff --git a/src/tests/copyTestDependencies.cmake.in b/src/tests/copyTestDependencies.cmake.in
index 5da7127a..d52832fb 100644
--- a/src/tests/copyTestDependencies.cmake.in
+++ b/src/tests/copyTestDependencies.cmake.in
@@ -18,7 +18,7 @@ endif( )
 
 #    message( fixedTestLocation ": ${fixedTestLocation}" )
 # Get the directory that the test executable resides in; this helps get_prerequisites( ) find dependent libraries
-get_filename_component( testDir "${fixedTestLocation}" DIRECTORY )
+get_filename_component( testDir "${fixedTestLocation}" PATH )
 #    message( testDir ": ${testDir}" )
 
 set( installPath "" )
@@ -32,7 +32,7 @@ endif( )
 set( depList "" )
 
 #This logic assumes that clBLAS CMakeLists.txt has been called
-get_filename_component( acmlDir "@ACML_LIBRARIES@" DIRECTORY )
+get_filename_component( acmlDir "@ACML_LIBRARIES@" PATH )
 
 if( EXISTS "${acmlDir}" )
     list( APPEND depList "${acmlDir}" )
@@ -40,8 +40,8 @@ if( EXISTS "${acmlDir}" )
 endif( )
 
 #This logic assumes that FindGTest.cmake has been called
-get_filename_component( gtestDir "@GTEST_LIBRARY@" DIRECTORY )
-get_filename_component( gtestDirDebug "@GTEST_LIBRARY_DEBUG@" DIRECTORY )
+get_filename_component( gtestDir "@GTEST_LIBRARY@" PATH )
+get_filename_component( gtestDirDebug "@GTEST_LIBRARY_DEBUG@" PATH )
 
 if( EXISTS "${gtestDir}" )
     list( APPEND depList "${gtestDir}" )
@@ -55,7 +55,7 @@ if( ${gtestDiffDirs} AND EXISTS "${gtestDirDebug}" )
 endif( )
 
 #This logic assumes that FindOpenCL.cmake has been called
-get_filename_component( openclDir "@OPENCL_LIBRARIES@" DIRECTORY )
+get_filename_component( openclDir "@OPENCL_LIBRARIES@" PATH )
 
 if( EXISTS "${openclDir}" )
     list( APPEND depList "${openclDir}" )

From 44234c71768c8eebe0cbe097fa182d43effa7d22 Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Wed, 7 May 2014 17:56:24 -0500
Subject: [PATCH 45/59] A proof-of-concept python wrapper for clBLAS.  Only
 sgemm and the init functions are callable from python.

---
 src/wrappers/python/README.txt   |  61 ++++++++++++++++
 src/wrappers/python/pyclBLAS.pxd |  84 ++++++++++++++++++++++
 src/wrappers/python/pyclBLAS.pyx | 117 +++++++++++++++++++++++++++++++
 src/wrappers/python/setup.py     |  96 +++++++++++++++++++++++++
 4 files changed, 358 insertions(+)
 create mode 100644 src/wrappers/python/README.txt
 create mode 100644 src/wrappers/python/pyclBLAS.pxd
 create mode 100644 src/wrappers/python/pyclBLAS.pyx
 create mode 100644 src/wrappers/python/setup.py

diff --git a/src/wrappers/python/README.txt b/src/wrappers/python/README.txt
new file mode 100644
index 00000000..f77bd666
--- /dev/null
+++ b/src/wrappers/python/README.txt
@@ -0,0 +1,61 @@
+pyclBLAS setup and installation
+(I've been pronouncing it 'pickleBLAS')
+------------------------------------------------------------------------
+A python extension wrapper around clBLAS from https://github.com/clMathLibraries/clBLAS
+
+Dependencies:
+1.  clBLAS from https://github.com/clMathLibraries/clBLAS ( develop branch )
+2.  PyOpenCL from http://mathema.tician.de/software/pyopencl/ ( 2013.2 minimum )
+3.  Cython from http://cython.org/, ( 0.18 minimum )
+4.  OpenCL runtime, such as AMD's catalyst package ( AMD v2.9 SDK tested )
+
+NOTE:  This has only been tested with 32-bit python on windows
+
+NOTE:  Only sgemm has been wrapped as proof-of-concept
+
+Build steps:
+------------------------------------------------------------------------
+1.  First, clone the clBLAS repo from github and make sure to build the 
+'install' step.  This is either 'make install' on linux derivatives or 
+the 'install' project on Visual Studio projects.  This should produce a 
+'package' directory in your build tree that contains ./include, ./libXX & 
+./bin.  
+
+Note:  it is necessary to build 32-bit clBLAS if using 32-bit python,
+and 64-bit clBLAS for 64-bit python.
+
+2.  Install pyopencl.  If your python distribution provides a version
+of pyopencl that is at least 2013.2, then just install it with the
+distribution's package manager, such as pypm, pip or easy_install.  If not, download
+pyopencl yourself and follow its directions to build and install.
+
+3.  Install Cython.  If your python distribution provides a version
+of cython that is at least 0.18, then just install it with the
+distribution's package manager, such as pypm, pip or easy_install.  If not,
+download cython yourself and follow its directions to build and install.
+
+4.  An OpenCL SDK is required to build, which includes OpenCL header files
+and linkable libraries.  One such SDK is the AMD APP SDK, which can be 
+downloaded from http://developer.amd.com/tools-and-sdks/heterogeneous-computing/amd-accelerated-parallel-processing-app-sdk/
+
+5.  Build the pyclBLAS extension.  This is accomplished by running setup.py,
+which acts as a python makefile.  An example install command: 
+'python setup.py --clBlasRoot=F:\code\GitHub\clMathLibraries\bin\clBLAS\develop\vs11x32\package build_ext --inplace'
+
+'python setup.py --help' prints additional command line parameters that extend 
+the traditional distutils options.  After successfully building the extension
+module, a pyclBLAS.pyd file appears.  As shown above, it may be necessary to provide
+the setup makefile with the paths of the clBLAS 'package' directory and the 
+OpenCL SDK directory.  Setup.py does attempt to find the OpenCL SDK through 
+the environment variable AMDAPPSDKROOT or OPENCL_ROOT.
+
+NOTE:  On windows, if using a more recent version of visual studio than 2008, 
+it may be necessary to trick python to using the newer version of your compiler, 
+by creating an environment variable that it expects to exist as such:
+set VS90COMNTOOLS=%VS110COMNTOOLS%
+
+6.  Execute demoBLAS.py file to test the pyclBLAS extention.  
+    
+NOTE: It may be necessary to copy the clBLAS shared library into 
+the same directory as the extention module so that it can find 
+clBLAS at runtime
diff --git a/src/wrappers/python/pyclBLAS.pxd b/src/wrappers/python/pyclBLAS.pxd
new file mode 100644
index 00000000..0dfbfd9a
--- /dev/null
+++ b/src/wrappers/python/pyclBLAS.pxd
@@ -0,0 +1,84 @@
+################################################################################
+ # Copyright 2014 Advanced Micro Devices, Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+################################################################################
+
+# This pxd file defines all the enums and structs that we plan to use from
+# python.  It is used from pyclBLAS.pyx
+
+cdef extern from "clBLAS.h":
+    # These are base OpenCL enumerations that clBLAS uses
+    cdef enum:
+        CL_SUCCESS                      = 0
+        CL_INVALID_VALUE                = -30
+        CL_INVALID_COMMAND_QUEUE        = -36
+        CL_INVALID_CONTEXT              = -34
+        CL_INVALID_MEM_OBJECT           = -38
+        CL_INVALID_DEVICE               = -33
+        CL_INVALID_EVENT_WAIT_LIST      = -57
+        CL_OUT_OF_RESOURCES             = -5
+        CL_OUT_OF_HOST_MEMORY           = -6
+        CL_INVALID_OPERATION            = -59
+        CL_COMPILER_NOT_AVAILABLE       = -3
+        CL_BUILD_PROGRAM_FAILURE        = -11
+
+    cdef enum clblasStatus_:
+        clblasSuccess               = CL_SUCCESS
+        clblasInvalidValue          = CL_INVALID_VALUE
+        clblasInvalidCommandQueue   = CL_INVALID_COMMAND_QUEUE
+        clblasInvalidContext        = CL_INVALID_CONTEXT
+        clblasInvalidMemObject      = CL_INVALID_MEM_OBJECT
+        clblasInvalidDevice         = CL_INVALID_DEVICE
+        clblasInvalidEventWaitList  = CL_INVALID_EVENT_WAIT_LIST
+        clblasOutOfResources        = CL_OUT_OF_RESOURCES
+        clblasOutOfHostMemory       = CL_OUT_OF_HOST_MEMORY
+        clblasInvalidOperation      = CL_INVALID_OPERATION
+        clblasCompilerNotAvailable  = CL_COMPILER_NOT_AVAILABLE
+        clblasBuildProgramFailure   = CL_BUILD_PROGRAM_FAILURE
+        clblasNotImplemented        = -1024
+        clblasNotInitialized        = -1023
+        clblasInvalidMatA
+        clblasInvalidMatB
+        clblasInvalidMatC
+        clblasInvalidVecX
+        clblasInvalidVecY
+        clblasInvalidDim
+        clblasInvalidLeadDimA
+        clblasInvalidLeadDimB
+        clblasInvalidLeadDimC
+        clblasInvalidIncX
+        clblasInvalidIncY
+        clblasInsufficientMemMatA
+        clblasInsufficientMemMatB
+        clblasInsufficientMemMatC
+        clblasInsufficientMemVecX
+        clblasInsufficientMemVecY
+    ctypedef clblasStatus_ clblasStatus
+
+    cdef enum clblasOrder_:  # matrix storage order selector
+        clblasRowMajor             = 0
+        clblasColumnMajor          = 1
+    ctypedef clblasOrder_ clblasOrder  # alias its own enum, not clblasStatus_
+
+    cdef enum clblasTranspose_:  # per-operand transpose selector
+        clblasNoTrans             = 0
+        clblasTrans               = 1
+        clblasConjTrans           = 2
+    ctypedef clblasTranspose_ clblasTranspose  # alias its own enum, not clblasStatus_
+
+    ctypedef unsigned int cl_uint
+    ctypedef float cl_float
+    ctypedef void* cl_mem
+    ctypedef void* cl_command_queue
+    ctypedef void* cl_event
diff --git a/src/wrappers/python/pyclBLAS.pyx b/src/wrappers/python/pyclBLAS.pyx
new file mode 100644
index 00000000..2ede2694
--- /dev/null
+++ b/src/wrappers/python/pyclBLAS.pyx
@@ -0,0 +1,117 @@
+################################################################################
+ # Copyright 2014 Advanced Micro Devices, Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+################################################################################
+
+cimport pyclBLAS
+import pyopencl
+
+# These are prototypes from clBLAS.h that we wish to call from python
+################################################################################
+################################################################################
+cdef extern from "clBLAS.h":
+   clblasStatus clblasGetVersion( cl_uint* major, cl_uint* minor, cl_uint* patch )
+
+   clblasStatus clblasSetup( )
+
+   void clblasTeardown( )
+
+   clblasStatus clblasSgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB,
+                size_t M, size_t N, size_t K, cl_float alpha, const cl_mem A, size_t offA, size_t lda,
+                const cl_mem B, size_t offB, size_t ldb, cl_float beta, cl_mem C, size_t offC, size_t ldc,
+                cl_uint numCommandQueues, cl_command_queue* commandQueues, cl_uint numEventsInWaitList,
+                const cl_event* eventWaitList, cl_event* events)
+
+################################################################################
+################################################################################
+# enums to be accessed from python
+# TODO:  is there a better way to express enums?  I like how pyopencl does it,
+# they have layers of scoped constants cl.mem_flags.READ_ONLY
+# The enums below have global scope
+RowMajor    = pyclBLAS.clblasRowMajor
+ColumnMajor = pyclBLAS.clblasColumnMajor
+NoTrans     = pyclBLAS.clblasNoTrans
+Trans       = pyclBLAS.clblasTrans
+ConjTrans   = pyclBLAS.clblasConjTrans
+
+################################################################################
+################################################################################
+# The following functions are the python callable wrapper implementations
+def Setup( ):
+   result = clblasSetup( )
+   if( result != clblasSuccess ):
+      raise RuntimeError( "clblasSetup( ) failed initialization" )
+   return result
+
+################################################################################
+def Teardown( ):
+   clblasTeardown( )
+   return
+
+################################################################################
+def GetVersion( ):
+   cdef pyclBLAS.cl_uint pyMajor
+   cdef pyclBLAS.cl_uint pyMinor
+   cdef pyclBLAS.cl_uint pyPatch
+   result = clblasGetVersion( &pyMajor, &pyMinor, &pyPatch )
+   if( result != clblasSuccess ):
+      raise RuntimeError( "clblasGetVersion( ) did not return version information" )
+   return pyMajor, pyMinor, pyPatch
+
+################################################################################
+# TODO:  Is there way to template these python callable functions, such that we
+# do not need to make a new function for every supported precision?
+def Sgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB,
+                size_t M, size_t N, size_t K, cl_float alpha, A, size_t offA, size_t lda,
+                B, size_t offB, size_t ldb, cl_float beta, C, size_t offC, size_t ldc,
+                cl_uint numCommandQueues, commandQueues, cl_uint numEventsInWaitList,
+                eventWaitList ):
+
+   # Simplify python wrapper to only handle 1 queue at this time
+   if( numCommandQueues != 1 ):
+      raise IndexError( "pyblasSgemm( ) requires the number of queues to be 1" )
+   cdef int pIntQueue = commandQueues.int_ptr
+   cdef cl_command_queue pcqQueue = <cl_command_queue>pIntQueue
+
+   # This logic does not yet work for numEventsInWaitList > (greater than) 1
+   # Need to figure out how python & pyopencl pass lists of objects
+   cdef int pIntWaitList = 0
+   cdef cl_event* pWaitList = NULL
+   if( numEventsInWaitList > 0 ):
+      if( numEventsInWaitList < 2 ):
+         pIntWaitList = eventWaitList.int_ptr
+         pWaitList = <cl_event*>pIntWaitList
+      else:
+         raise IndexError( "pyblasSgemm( ) requires numEventsInWaitList to be <= 1" )
+
+   # Pyopencl objects contain an int_ptr method to get access to the internally wrapped
+   # OpenCL object pointers
+   cdef cl_event outEvent = NULL
+   cdef int matA = A.int_ptr
+   cdef int matB = B.int_ptr
+   cdef int matC = C.int_ptr
+
+   # Transition execution to clBLAS
+   cdef clblasStatus result = clblasSgemm( order, transA, transB, M, N, K, alpha, <const cl_mem>matA, offA, lda,
+                         <const cl_mem>matB, offB, ldb, beta, <cl_mem>matC, offC, ldc,
+                         numCommandQueues, &pcqQueue, numEventsInWaitList,
+                         pWaitList, &outEvent )
+
+   if( result != clblasSuccess ):
+      raise RuntimeError( "clBLAS sgemm call failed" )
+
+   # Create a pyopencl Event object from the event returned from clBLAS and return
+   # it to the user
+   sgemmEvent = pyopencl.Event.from_int_ptr( <int>outEvent )
+   return sgemmEvent
diff --git a/src/wrappers/python/setup.py b/src/wrappers/python/setup.py
new file mode 100644
index 00000000..b87aa959
--- /dev/null
+++ b/src/wrappers/python/setup.py
@@ -0,0 +1,96 @@
+################################################################################
+ # Copyright 2014 Advanced Micro Devices, Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+################################################################################
+
+from distutils.core import setup
+from distutils.extension import Extension
+from Cython.Distutils import build_ext
+from os import path, environ
+import argparse
+
+def main():
+   parser = argparse.ArgumentParser(description='Set up the pyclBLAS extension module')
+   parser.add_argument('--clRoot',
+     dest='clRoot', default=None,
+     help='Root directory to find the OpenCL SDK, which should contain the include directory')
+   parser.add_argument('--clBlasRoot',
+     dest='clBlasRoot', default=None,
+     help='Root directory to find the clBLAS SDK, which should contain the include directory')
+
+   args, unknown_args = parser.parse_known_args( )
+
+##    print( "recognized args: ", args )
+##    print( "unknown args: ", unknown_args )
+
+   # First check environment variables for clRoot paths
+   clRootPath = None
+   if( environ.get('OPENCL_ROOT') is not None ):
+     clRootPath = environ['OPENCL_ROOT']
+
+   # Special check for environment variable set by AMD Catalyst installer
+   if( clRootPath is None and environ.get( 'AMDAPPSDKROOT' ) is not None ):
+     clRootPath = environ['AMDAPPSDKROOT']
+
+   # If the user specifies a command line option, it trumps the environment variables
+   print( "args.clRoot: ", args.clRoot )
+   if( args.clRoot is not None ):
+     clRootPath = args.clRoot
+
+   if( clRootPath is None ):
+     print( "This setup.py needs to know the root path of an OpenCL installation")
+     print( "Please specify the environment variable OPENCL_ROOT with a path" )
+     print( "Or pass the command line option --clRoot" )
+     exit( )
+
+   # First check environment variables for clBlasRoot paths
+   clBlasRootPath = None
+   if( environ.get('CLBLAS_ROOT') is not None ):
+     clBlasRootPath = environ['CLBLAS_ROOT']
+
+   # If the user specifies a command line option, it trumps the environment variables
+   print( "args.clBlasRoot: ", args.clBlasRoot )
+   if( args.clBlasRoot is not None ):
+     clBlasRootPath = args.clBlasRoot
+
+   if( clBlasRootPath is None ):
+     print( "This setup.py needs to know the root path of the clBLAS installation")
+     print( "Please specify the environment variable CLBLAS_ROOT with a path" )
+     print( "or pass the command line option --clBlasRoot" )
+     exit( )
+
+   module = [
+     Extension( name = 'pyclBLAS',
+               sources = ['pyclBLAS.pyx'],
+               include_dirs = [ path.join( clRootPath, 'include' ),
+                                path.join( clBlasRootPath, 'include' ) ],
+               library_dirs = [ path.join( clBlasRootPath, 'lib', 'import' ) ],
+               libraries=['clBLAS'] )
+   ]
+
+   setup(
+      name = 'pyclBLAS',
+      version = '0.0.1',
+      author = 'Kent Knox',
+      description = 'Python wrapper for clBLAS',
+      license = 'Apache License, Version 2.0',
+      cmdclass = {"build_ext": build_ext},
+      ext_modules = module,
+      script_args = unknown_args
+   )
+
+# This is the start of the execution of the python script
+# Useful for debuggers to step into script
+if __name__ == '__main__':
+    main( )

From eb01ea2da0878508d0a3a18f49920904aa25f173 Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd.com>
Date: Wed, 7 May 2014 18:09:30 -0500
Subject: [PATCH 46/59] Update README.txt

---
 src/wrappers/python/README.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/wrappers/python/README.txt b/src/wrappers/python/README.txt
index f77bd666..0a8a9612 100644
--- a/src/wrappers/python/README.txt
+++ b/src/wrappers/python/README.txt
@@ -53,8 +53,6 @@ NOTE:  On windows, if using a more recent version of visual studio than 2008,
 it may be necessary to trick python to using the newer version of your compiler, 
 by creating an environment variable that it expects to exist as such:
 set VS90COMNTOOLS=%VS110COMNTOOLS%
-
-6.  Execute demoBLAS.py file to test the pyclBLAS extention.  
     
 NOTE: It may be necessary to copy the clBLAS shared library into 
 the same directory as the extention module so that it can find 

From a731a5ca8c9d220fa705e087432dd4ab0f1e91d7 Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Tue, 13 May 2014 10:50:01 -0500
Subject: [PATCH 47/59] Updated setup.py file to add appropriate paths for
 64-bit and Linux

---
 src/wrappers/python/README.txt |  2 +-
 src/wrappers/python/setup.py   | 13 ++++++++++++-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/wrappers/python/README.txt b/src/wrappers/python/README.txt
index 0a8a9612..10c3cf8c 100644
--- a/src/wrappers/python/README.txt
+++ b/src/wrappers/python/README.txt
@@ -9,7 +9,7 @@ Dependencies:
 3.  Cython from http://cython.org/, ( 0.18 minimum )
 4.  OpenCL runtime, such as AMD's catalyst package ( AMD v2.9 SDK tested )
 
-NOTE:  This has only been tested with 32-bit python on windows
+NOTE:  This has been tested with 32-bit python on windows & 64-bit on OpenSUSE
 
 NOTE:  Only sgemm has been wrapped as proof-of-concept
 
diff --git a/src/wrappers/python/setup.py b/src/wrappers/python/setup.py
index b87aa959..7092714c 100644
--- a/src/wrappers/python/setup.py
+++ b/src/wrappers/python/setup.py
@@ -19,6 +19,7 @@
 from Cython.Distutils import build_ext
 from os import path, environ
 import argparse
+import platform
 
 def main():
    parser = argparse.ArgumentParser(description='Set up the pyclBLAS extension module')
@@ -70,12 +71,22 @@ def main():
      print( "or pass the command line option --clBlasRoot" )
      exit( )
 
+   # 64bit and 32bit have different library paths
+   if( platform.architecture( )[0] == '64bit' ):
+     libraryPath = 'lib64'
+   else:
+     libraryPath = 'lib'
+
+   # Windows and linux have different library paths
+   if( platform.system( ) == 'Windows' ):
+     libraryPath = path.join( libraryPath, 'import' )
+
    module = [
      Extension( name = 'pyclBLAS',
                sources = ['pyclBLAS.pyx'],
                include_dirs = [ path.join( clRootPath, 'include' ),
                                 path.join( clBlasRootPath, 'include' ) ],
-               library_dirs = [ path.join( clBlasRootPath, 'lib', 'import' ) ],
+               library_dirs = [ path.join( clBlasRootPath, libraryPath ) ],
                libraries=['clBLAS'] )
    ]
 

From b5bbbc2815e18f5bba47a9a953496e4c97fad627 Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Tue, 13 May 2014 14:22:15 -0500
Subject: [PATCH 48/59] Converting int types to intptr_t types for pyopencl
 integration

---
 src/wrappers/python/pyclBLAS.pxd |  1 +
 src/wrappers/python/pyclBLAS.pyx | 12 ++++++------
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/wrappers/python/pyclBLAS.pxd b/src/wrappers/python/pyclBLAS.pxd
index 0dfbfd9a..b3c4e8ec 100644
--- a/src/wrappers/python/pyclBLAS.pxd
+++ b/src/wrappers/python/pyclBLAS.pxd
@@ -16,6 +16,7 @@
 
 # This pxd file defines all the enums and structs that we plan to use from
 # python.  It is used from pyclBLAS.pyx
+from libc.stdint cimport intptr_t, uintptr_t
 
 cdef extern from "clBLAS.h":
     # These are base OpenCL enumerations that clBLAS uses
diff --git a/src/wrappers/python/pyclBLAS.pyx b/src/wrappers/python/pyclBLAS.pyx
index 2ede2694..6e944c47 100644
--- a/src/wrappers/python/pyclBLAS.pyx
+++ b/src/wrappers/python/pyclBLAS.pyx
@@ -81,12 +81,12 @@ def Sgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB,
    # Simplify python wrapper to only handle 1 queue at this time
    if( numCommandQueues != 1 ):
       raise IndexError( "pyblasSgemm( ) requires the number of queues to be 1" )
-   cdef int pIntQueue = commandQueues.int_ptr
+   cdef intptr_t pIntQueue = commandQueues.int_ptr
    cdef cl_command_queue pcqQueue = <cl_command_queue>pIntQueue
 
    # This logic does not yet work for numEventsInWaitList > (greater than) 1
    # Need to figure out how python & pyopencl pass lists of objects
-   cdef int pIntWaitList = 0
+   cdef intptr_t pIntWaitList = 0
    cdef cl_event* pWaitList = NULL
    if( numEventsInWaitList > 0 ):
       if( numEventsInWaitList < 2 ):
@@ -98,9 +98,9 @@ def Sgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB,
    # Pyopencl objects contain an int_ptr method to get access to the internally wrapped
    # OpenCL object pointers
    cdef cl_event outEvent = NULL
-   cdef int matA = A.int_ptr
-   cdef int matB = B.int_ptr
-   cdef int matC = C.int_ptr
+   cdef intptr_t matA = A.int_ptr
+   cdef intptr_t matB = B.int_ptr
+   cdef intptr_t matC = C.int_ptr
 
    # Transition execution to clBLAS
    cdef clblasStatus result = clblasSgemm( order, transA, transB, M, N, K, alpha, <const cl_mem>matA, offA, lda,
@@ -113,5 +113,5 @@ def Sgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB,
 
    # Create a pyopencl Event object from the event returned from clBLAS and return
    # it to the user
-   sgemmEvent = pyopencl.Event.from_int_ptr( <int>outEvent )
+   sgemmEvent = pyopencl.Event.from_int_ptr( <intptr_t>outEvent )
    return sgemmEvent

From 900b2110d729c7f9b4b4f1b4bc402776dd8319bd Mon Sep 17 00:00:00 2001
From: Kent Knox <kent.knox@amd>
Date: Mon, 19 May 2014 14:13:41 -0500
Subject: [PATCH 49/59] Changes to compile for vs2013/vs12

---
 src/tests/include/BlasBase.h  | 1 +
 src/tests/include/blas-math.h | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/tests/include/BlasBase.h b/src/tests/include/BlasBase.h
index 1901afad..6c13e520 100644
--- a/src/tests/include/BlasBase.h
+++ b/src/tests/include/BlasBase.h
@@ -20,6 +20,7 @@
 
 #include <clBLAS.h>
 #include <common.h>
+#include <algorithm>
 
 #if _MSC_VER
 #pragma warning (disable:4127)
diff --git a/src/tests/include/blas-math.h b/src/tests/include/blas-math.h
index a7e3293e..784c44b6 100644
--- a/src/tests/include/blas-math.h
+++ b/src/tests/include/blas-math.h
@@ -20,10 +20,12 @@
 
 #if defined (_MSC_VER)
 
+#if( _MSC_VER <= 1700 )
 static unsigned long long ROW_NAN = 0x7ff0000000000000LL;
-static unsigned int ROW_NANF = 0x7fc00000;
-
 #define NAN *(reinterpret_cast<double*>(&ROW_NAN))
+#endif
+
+static unsigned int ROW_NANF = 0x7fc00000;
 #define NANF *(reinterpret_cast<float*>(&ROW_NANF))
 
 #else   /* _MSC_VER */

From d8ca5878b5971e608fa8ef63bbee8fcf9684425c Mon Sep 17 00:00:00 2001
From: Timmy <timmy.liu@amd.com>
Date: Thu, 5 Jun 2014 15:26:46 -0500
Subject: [PATCH 50/59] bug fix of hemm and symm hanging with tuning KDB file
 and ssyr2k crashes on Kaveri with tuning KDB file

---
 src/library/blas/generic/solution_seq_make.c |  9 ++++++---
 src/library/blas/gens/syrxk.c                | 21 +++++++++++++++-----
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/src/library/blas/generic/solution_seq_make.c b/src/library/blas/generic/solution_seq_make.c
index 0eee1fd7..8a5e402d 100644
--- a/src/library/blas/generic/solution_seq_make.c
+++ b/src/library/blas/generic/solution_seq_make.c
@@ -1435,9 +1435,12 @@ getStepGranulation(SolutionStep *step)
             }
         }
 
-        status = getGranularityInfo(&step->device, mempat->name,
-                                    step->args.dtype, step->extraFlags,
-                                    (int)MNK, dims, &step->pgran, &time);
+		if( step->funcID != CLBLAS_GEMM2 )
+		{
+			status = getGranularityInfo(&step->device, mempat->name,
+										step->args.dtype, step->extraFlags,
+										(int)MNK, dims, &step->pgran, &time);
+		}
         /*
          * Disable blocking for implementations dealing with cache reads
          * from the global memory
diff --git a/src/library/blas/gens/syrxk.c b/src/library/blas/gens/syrxk.c
index e4a00eec..cf47ba9a 100644
--- a/src/library/blas/gens/syrxk.c
+++ b/src/library/blas/gens/syrxk.c
@@ -21,6 +21,7 @@
 
 #include <string.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <assert.h>
 
 #include <clBLAS.h>
@@ -1219,10 +1220,11 @@ genUpdateGenericDiagTile(
     // type of the vectorized coordinates
     Kstring vctype;
     Kstring constOffs, constShifts, constMasks;
-    unsigned int i, j, nops;
+    unsigned int i, j, nops,size;
     unsigned int maxFetches = 0;
     const char *yname, *xname;
     const char *ldcName;
+	char hexadec[1];
 
     batch = createStmtBatch();
     if (batch == NULL) {
@@ -1253,6 +1255,14 @@ genUpdateGenericDiagTile(
     tifl = (isUpper) ? TILE_ITER_BACKWARD_ROWS :
                        TILE_ITER_BACKWARD_COLS;
     iterInit(&iter, &tileTempC, 1, tifl);
+	nops = 0;
+	while (!iterIsEnd(&iter)) {
+		nops++;
+		size = nops / nrCols;
+		iterIterate(&iter);
+	}
+
+	iterInit(&iter, &tileTempC, 1, tifl);
 
     initTmpResTile(&tileTempC, gset, true);
 
@@ -1316,7 +1326,7 @@ genUpdateGenericDiagTile(
     maxFetches = umin(maxFetches, i);
 
     // declare vectorized coordinates
-    declareDiagUpresIndexedVars(ctx, vctype.buf, "cc", tempRows);
+    declareDiagUpresIndexedVars(ctx, vctype.buf, "cc", size);
 
     /*
      * real y coordinate, offset mask and
@@ -1326,8 +1336,8 @@ genUpdateGenericDiagTile(
                      "unsigned int mask;\n"
                      "int hit;\n");
     if (withBeta) {
-        declareDiagUpresIndexedVars(ctx, typeName, "alphaNew", tempRows);
-        declareDiagUpresIndexedVars(ctx, typeName, "betaNew", tempRows);
+        declareDiagUpresIndexedVars(ctx, typeName, "alphaNew", size);
+        declareDiagUpresIndexedVars(ctx, typeName, "betaNew", size);
     }
 
     // declare tile
@@ -1443,7 +1453,8 @@ genUpdateGenericDiagTile(
             ksprintf(&kstr, "cc%u", i);
         }
         else {
-            ksprintf(&kstr, "cc%u.s%u", i, iter.col);
+			itoa(iter.col, hexadec, 16);
+            ksprintf(&kstr, "cc%u.s%s", i, hexadec);
         }
 
         // prepare multipliers and fetch

From 342cc8f517f6b48af09a269d9e7fda61a2ed3667 Mon Sep 17 00:00:00 2001
From: Timmy <timmy.liu@amd.com>
Date: Thu, 5 Jun 2014 16:53:54 -0500
Subject: [PATCH 51/59] use snprintf instead of itoa to support linux system

---
 src/library/blas/gens/syrxk.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/library/blas/gens/syrxk.c b/src/library/blas/gens/syrxk.c
index cf47ba9a..54574ed2 100644
--- a/src/library/blas/gens/syrxk.c
+++ b/src/library/blas/gens/syrxk.c
@@ -1224,7 +1224,7 @@ genUpdateGenericDiagTile(
     unsigned int maxFetches = 0;
     const char *yname, *xname;
     const char *ldcName;
-	char hexadec[1];
+	char hexadec[2];
 
     batch = createStmtBatch();
     if (batch == NULL) {
@@ -1453,7 +1453,8 @@ genUpdateGenericDiagTile(
             ksprintf(&kstr, "cc%u", i);
         }
         else {
-			itoa(iter.col, hexadec, 16);
+			snprintf(hexadec, sizeof(char)*2, "%x", iter.col);
+			//itoa(iter.col, hexadec, 16);
             ksprintf(&kstr, "cc%u.s%s", i, hexadec);
         }
 

From 23db183cbf6e669da66626996de7c2bdcdee9180 Mon Sep 17 00:00:00 2001
From: Christian Kellner <christian@kellner.me>
Date: Thu, 12 Jun 2014 16:55:28 +0200
Subject: [PATCH 52/59] Add addBuildOpt helper function for building option
 string

addBuildOpt should be called with the build option which is NOT
surrounded by spaces since it will add spaces to separate options
itself, if necessary.
---
 src/library/blas/generic/common.c          | 17 +++++++++++++++++
 src/library/blas/include/clblas-internal.h |  5 +++++
 2 files changed, 22 insertions(+)

diff --git a/src/library/blas/generic/common.c b/src/library/blas/generic/common.c
index de99f72a..e77a4ce1 100644
--- a/src/library/blas/generic/common.c
+++ b/src/library/blas/generic/common.c
@@ -22,6 +22,7 @@
 #include <clkern.h>
 #include <cltypes.h>
 #include <stdio.h>
+#include <ctype.h>
 
 #include "clblas-internal.h"
 
@@ -537,6 +538,22 @@ setupBuildOpts(
     }
 }
 
+void addBuildOpt(
+    char * opts,
+    size_t len,
+    const char * option)
+{
+    size_t l = strlen(opts);
+
+    if (l > 0 && !isspace(opts[l-1]) && l+1 < len) {
+      opts[l] = ' ';
+      opts[l+1]   = '\0';
+    }
+
+    strlcat(opts, option, len);
+}
+
+
 char VISIBILITY_HIDDEN
 *sprintfGranulation(char *buf, const SubproblemDim *dim, int level)
 {
diff --git a/src/library/blas/include/clblas-internal.h b/src/library/blas/include/clblas-internal.h
index 81ab5127..7a9afcdc 100644
--- a/src/library/blas/include/clblas-internal.h
+++ b/src/library/blas/include/clblas-internal.h
@@ -240,6 +240,11 @@ setupBuildOpts(
     cl_device_id devID,
     MemoryPattern *mempat);
 
+void addBuildOpt(
+    char * opts,
+    size_t len,
+    const char * option);
+
 // Internal scatter image API
 
 int

From 8c01bc38fa66318d162f037ce0d15525e00f664e Mon Sep 17 00:00:00 2001
From: Christian Kellner <christian@kellner.me>
Date: Thu, 12 Jun 2014 17:04:08 +0200
Subject: [PATCH 53/59] Replace 'strcat (buildOptStr,...' with
 'addBuildOpt(buildOptStr,...'

This fixes the invalid program options kernel build error on OSX
10.9.3 (cf. issue #37).
---
 src/library/blas/generic/common.c          |  4 +--
 src/library/blas/gens/asum.cpp             |  8 ++---
 src/library/blas/gens/axpy_reg.cpp         |  6 ++--
 src/library/blas/gens/copy_reg.cpp         |  6 ++--
 src/library/blas/gens/dot.cpp              |  6 ++--
 src/library/blas/gens/gbmv.cpp             | 16 +++++-----
 src/library/blas/gens/gemm_cached.cpp      | 36 ++++++++++-----------
 src/library/blas/gens/gemm_tail_cached.cpp | 37 +++++++++++-----------
 src/library/blas/gens/ger_lds.cpp          |  2 +-
 src/library/blas/gens/her2_lds.cpp         |  8 ++---
 src/library/blas/gens/her_lds.cpp          |  8 ++---
 src/library/blas/gens/iamax.cpp            |  8 ++---
 src/library/blas/gens/nrm2.cpp             | 12 +++----
 src/library/blas/gens/reduction.cpp        | 16 +++++-----
 src/library/blas/gens/rotg_reg.cpp         |  4 +--
 src/library/blas/gens/rotm_reg.cpp         |  8 ++---
 src/library/blas/gens/rotmg_reg.cpp        |  2 +-
 src/library/blas/gens/scal_reg.cpp         |  4 +--
 src/library/blas/gens/swap_reg.cpp         |  6 ++--
 src/library/blas/gens/symm_cached.cpp      | 14 ++++----
 src/library/blas/gens/syr2_lds.cpp         |  4 +--
 src/library/blas/gens/syr_lds.cpp          |  4 +--
 src/library/blas/gens/trmv_reg.cpp         | 10 +++---
 src/library/blas/gens/trsv_gemv.cpp        |  4 +--
 src/library/blas/gens/trsv_trtri.cpp       |  6 ++--
 25 files changed, 120 insertions(+), 119 deletions(-)

diff --git a/src/library/blas/generic/common.c b/src/library/blas/generic/common.c
index e77a4ce1..10ec595d 100644
--- a/src/library/blas/generic/common.c
+++ b/src/library/blas/generic/common.c
@@ -527,14 +527,14 @@ setupBuildOpts(
     opts[0] = '\0';
 
 #if !defined NDEBUG
-    strcpy(opts, "-g ");
+    addBuildOpt(opts, BUILD_OPTS_MAXLEN, "-g");
 #endif  /* NDEBUG */
 
     if (target.ident.vendor == VENDOR_NVIDIA &&
         !strcmp(mempat->name, "2-staged cached global memory based "
                               "block trsm")) {
 
-        strcat(opts, "-cl-opt-disable");
+        addBuildOpt(opts, BUILD_OPTS_MAXLEN, "-cl-opt-disable");
     }
 }
 
diff --git a/src/library/blas/gens/asum.cpp b/src/library/blas/gens/asum.cpp
index 3260acbe..06b9f544 100644
--- a/src/library/blas/gens/asum.cpp
+++ b/src/library/blas/gens/asum.cpp
@@ -125,23 +125,23 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
     if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_DOT
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
 	if ( (kargs->dtype == TYPE_COMPLEX_FLOAT) || (kargs->dtype == TYPE_COMPLEX_DOUBLE))
 	{
-		strcat( buildOptStr, " -DCOMPLEX ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX");
 		#ifdef DEBUG_ASUM
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
     if( (kargs->ldb.vector) != 1) {
-        strcat( buildOptStr, " -DINCX_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
     }
     if( (kargs->ldb.vector) < 1) {
-        strcat( buildOptStr, " -DINCX_NEGATIVE ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NEGATIVE");
     }
 	return;
 }
diff --git a/src/library/blas/gens/axpy_reg.cpp b/src/library/blas/gens/axpy_reg.cpp
index 0f8ced01..52aab71f 100644
--- a/src/library/blas/gens/axpy_reg.cpp
+++ b/src/library/blas/gens/axpy_reg.cpp
@@ -125,16 +125,16 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_AXPY
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
 	if( (kargs->ldb.vector) != 1) {
-        strcat( buildOptStr, " -DINCX_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
     }
     if( (kargs->ldc.vector) != 1) {
-        strcat( buildOptStr, " -DINCY_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY");
     }
 
 	return;
diff --git a/src/library/blas/gens/copy_reg.cpp b/src/library/blas/gens/copy_reg.cpp
index d9f70951..ba1ff398 100644
--- a/src/library/blas/gens/copy_reg.cpp
+++ b/src/library/blas/gens/copy_reg.cpp
@@ -125,16 +125,16 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_COPY
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
 	if( (kargs->ldb.vector) != 1) {
-        strcat( buildOptStr, " -DINCX_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
     }
     if( (kargs->ldc.vector) != 1) {
-        strcat( buildOptStr, " -DINCY_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY");
     }
 
 	return;
diff --git a/src/library/blas/gens/dot.cpp b/src/library/blas/gens/dot.cpp
index 3f68221d..ed3e72b8 100644
--- a/src/library/blas/gens/dot.cpp
+++ b/src/library/blas/gens/dot.cpp
@@ -128,16 +128,16 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_DOT
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
     if( (kargs->ldb.vector) != 1) {
-        strcat( buildOptStr, " -DINCX_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
     }
     if( (kargs->ldc.vector) != 1) {
-        strcat( buildOptStr, " -DINCY_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY");
     }
 
 	return;
diff --git a/src/library/blas/gens/gbmv.cpp b/src/library/blas/gens/gbmv.cpp
index 115ffbc0..ab8e5e2a 100644
--- a/src/library/blas/gens/gbmv.cpp
+++ b/src/library/blas/gens/gbmv.cpp
@@ -116,7 +116,7 @@ setBuildOpts(
 
 	if ( (kargs->dtype == TYPE_DOUBLE) || (kargs->dtype == TYPE_COMPLEX_DOUBLE) )
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_GBMV
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
@@ -124,15 +124,15 @@ setBuildOpts(
 
     if( kargs->pigFuncID == CLBLAS_TBMV )
 	{
-		strcat( buildOptStr, " -DTBMV_ONLY ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DTBMV_ONLY");
 		if( kargs->diag == clblasUnit )
 		{
-		    strcat( buildOptStr, " -DUNIT_DIAG ");
+		    addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DUNIT_DIAG");
 		}
 	}
 	if( ((kargs->pigFuncID == CLBLAS_GBMV) || (kargs->pigFuncID == CLBLAS_TBMV)) && (kargs->transA == clblasConjTrans) )
 	{
-	    strcat( buildOptStr, " -DDO_CONJ ");
+	    addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDO_CONJ");
 	}
 
 	if( (kargs->pigFuncID == CLBLAS_SBMV) || (kargs->pigFuncID == CLBLAS_HBMV) )
@@ -141,15 +141,15 @@ setBuildOpts(
 	    isUpper = ( kargs->order == clblasColumnMajor )? !isUpper : isUpper;
 
 	    if( isUpper )
-	            strcat( buildOptStr, " -DGIVEN_SHBMV_UPPER ");
-	    else    strcat( buildOptStr, " -DGIVEN_SHBMV_LOWER ");
+	            addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DGIVEN_SHBMV_UPPER");
+	    else    addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DGIVEN_SHBMV_LOWER");
 
         if(kargs->pigFuncID == CLBLAS_HBMV)
         {
-            strcat( buildOptStr, " -DHBMV_ONLY ");
+            addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHBMV_ONLY");
             if( kargs->order == clblasColumnMajor )  // Since routine calls Row-major, the whole matrix has to be conjugated while loading
             {
-                strcat( buildOptStr, " -DDO_CONJ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDO_CONJ");
             }
         }
 	}
diff --git a/src/library/blas/gens/gemm_cached.cpp b/src/library/blas/gens/gemm_cached.cpp
index 09231f90..5c7c3526 100644
--- a/src/library/blas/gens/gemm_cached.cpp
+++ b/src/library/blas/gens/gemm_cached.cpp
@@ -158,36 +158,36 @@ setBuildOpts(
 
     if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
     {
-        strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
     }
 
     if (isComplexType(kargs->dtype))
     {
-        strcat(buildOptStr, " -DCOMPLEX ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX");
     }
 
     if ((bestSize.useBarrier) == 1)
     {
-	    strcat(buildOptStr, " -DGEMM_NEEDS_BARRIER ");
+	    addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DGEMM_NEEDS_BARRIER");
     }
 
     if (kargs->M % dims->y)
 	{
-		strcat(buildOptStr, " -DM_TAIL_PRESENT ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DM_TAIL_PRESENT");
     }
 
 	if (kargs->N % dims->x)
 	{
-		strcat(buildOptStr, " -DN_TAIL_PRESENT ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DN_TAIL_PRESENT");
 	}
 
     if (kflags & KEXTRA_CONJUGATE_A)
     {
-        strcat( buildOptStr, " -DCONJUGATE_A ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCONJUGATE_A");
     }
     if (kflags & KEXTRA_CONJUGATE_B)
     {
-        strcat( buildOptStr, " -DCONJUGATE_B ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCONJUGATE_B");
     }
 
     switch(kargs->pigFuncID)
@@ -201,46 +201,46 @@ setBuildOpts(
             #endif
             if (kargs->side == clblasLeft)
             {
-                strcat (buildOptStr, " -D__SYMM_LEFT__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LEFT__");
             }
             if (kargs->side == clblasRight)
             {
-                strcat (buildOptStr, " -D__SYMM_RIGHT__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_RIGHT__");
             }
             if (kargs->uplo == clblasLower)
             {
-                strcat(buildOptStr, " -D__SYMM_LOWER__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LOWER__");
             }
             if (kargs->uplo == clblasUpper)
             {
-                strcat(buildOptStr, " -D__SYMM_UPPER__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_UPPER__");
             }
             // Define the order for Legacy sake.
             if (kargs->order == clblasColumnMajor)
             {
-                strcat(buildOptStr, " -D__SYMM_COLMAJOR__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_COLMAJOR__");
             } else {
-                strcat(buildOptStr, " -D__SYMM_ROWMAJOR__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_ROWMAJOR__");
             }
             if ((kargs->pigFuncID == CLBLAS_SYMM_DIAGONAL) || (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL))
             {
-                strcat(buildOptStr, " -D__SYMM_DIAGONAL__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_DIAGONAL__");
             }
             if (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL)
             {
-                strcat(buildOptStr, " -D__HEMM__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__HEMM__");
             }
             break;
 
          case CLBLAS_HERK:
-            strcat( buildOptStr, " -DHERK");
+            addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK");
             if(kargs->uplo == clblasLower)
             {
-                strcat( buildOptStr, " -DHERK_LOWER_TRIANGLE");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK_LOWER_TRIANGLE");
             }
             else if(kargs->uplo == clblasUpper)
             {
-                strcat( buildOptStr, " -DHERK_UPPER_TRIANGLE");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK_UPPER_TRIANGLE");
             }
             break;
 
diff --git a/src/library/blas/gens/gemm_tail_cached.cpp b/src/library/blas/gens/gemm_tail_cached.cpp
index ea792499..ff144af9 100644
--- a/src/library/blas/gens/gemm_tail_cached.cpp
+++ b/src/library/blas/gens/gemm_tail_cached.cpp
@@ -96,10 +96,10 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
     KernelExtraFlags kflags = step->extraFlags;
 
-	strcat(buildOptStr, " -DTAIL_RUN -DM_TAIL_PRESENT -DN_TAIL_PRESENT ");
+	addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DTAIL_RUN -DM_TAIL_PRESENT -DN_TAIL_PRESENT");
     if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
     {
-        strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
         #ifdef DEBUG_GEMM_TAIL
         printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
         #endif
@@ -107,16 +107,16 @@ setBuildOpts(
 
     if (isComplexType(kargs->dtype))
     {
-        strcat(buildOptStr, " -DCOMPLEX ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX");
     }
 
     if (kflags & KEXTRA_CONJUGATE_A)
     {
-        strcat( buildOptStr, " -DCONJUGATE_A ");
-}
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCONJUGATE_A");
+    }
     if (kflags & KEXTRA_CONJUGATE_B)
     {
-        strcat( buildOptStr, " -DCONJUGATE_B ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCONJUGATE_B");
     }
 
 
@@ -127,14 +127,14 @@ setBuildOpts(
             break;
 
         case CLBLAS_HERK:
-            strcat( buildOptStr, " -DHERK");
+            addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK");
             if(kargs->uplo == clblasLower)
             {
-                strcat( buildOptStr, " -DHERK_LOWER_TRIANGLE");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK_LOWER_TRIANGLE");
             }
             else if(kargs->uplo == clblasUpper)
             {
-                strcat( buildOptStr, " -DHERK_UPPER_TRIANGLE");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK_UPPER_TRIANGLE");
             }
             break;
 
@@ -147,33 +147,34 @@ setBuildOpts(
             #endif
             if (kargs->side == clblasLeft)
             {
-                strcat (buildOptStr, " -D__SYMM_LEFT__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LEFT__");
             }
             if (kargs->side == clblasRight)
             {
-                strcat (buildOptStr, " -D__SYMM_RIGHT__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_RIGHT__");
             }
             if (kargs->uplo == clblasLower)
             {
-                strcat(buildOptStr, " -D__SYMM_LOWER__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LOWER__");
             }
             if (kargs->uplo == clblasUpper)
             {
-                strcat(buildOptStr, " -D__SYMM_UPPER__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_UPPER__");
             }
+            // Define the order for Legacy sake.
             if (kargs->order == clblasColumnMajor)
             {
-                strcat(buildOptStr, " -D__SYMM_COLMAJOR__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_COLMAJOR__");
             } else {
-                strcat(buildOptStr, " -D__SYMM_ROWMAJOR__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_ROWMAJOR__");
             }
-            if ((kargs->pigFuncID == CLBLAS_SYMM_DIAGONAL)  || (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL))
+            if ((kargs->pigFuncID == CLBLAS_SYMM_DIAGONAL) || (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL))
             {
-                strcat(buildOptStr, " -D__SYMM_DIAGONAL__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_DIAGONAL__");
             }
             if (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL)
             {
-                strcat(buildOptStr, " -D__HEMM__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__HEMM__");
             }
             break;
 
diff --git a/src/library/blas/gens/ger_lds.cpp b/src/library/blas/gens/ger_lds.cpp
index ebcd577c..f72d1975 100644
--- a/src/library/blas/gens/ger_lds.cpp
+++ b/src/library/blas/gens/ger_lds.cpp
@@ -137,7 +137,7 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 
 		#ifdef DEBUG_GER
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
diff --git a/src/library/blas/gens/her2_lds.cpp b/src/library/blas/gens/her2_lds.cpp
index e724f118..5adda19d 100644
--- a/src/library/blas/gens/her2_lds.cpp
+++ b/src/library/blas/gens/her2_lds.cpp
@@ -139,25 +139,25 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_COMPLEX_DOUBLE )
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_HER2
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
 	if( kargs->order == clblasRowMajor )
 	{
-		strcat( buildOptStr, " -DHER2_ROWMAJOR ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHER2_ROWMAJOR");
 		#ifdef DEBUG_HER2
 		printf("Setting build options ... HERMITIAN2_ROWMAJOR... for row-major support\n");
 		#endif
 	}
     if( kargs->pigFuncID == CLBLAS_HPR2 )
     {
-        strcat( buildOptStr, " -DPACKED ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
     }
 
 	//Build options for syr2_her2.clT to generate HER2 related code.
-	strcat( buildOptStr, " -DHER2_ONLY ");
+	addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHER2_ONLY");
 	return;
 }
 
diff --git a/src/library/blas/gens/her_lds.cpp b/src/library/blas/gens/her_lds.cpp
index 6b489cf5..1a8365f0 100644
--- a/src/library/blas/gens/her_lds.cpp
+++ b/src/library/blas/gens/her_lds.cpp
@@ -139,25 +139,25 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_COMPLEX_DOUBLE )
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_HER
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
 	if( kargs->order == clblasRowMajor )
 	{
-		strcat( buildOptStr, " -DHERMITIAN_ROWMAJOR ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERMITIAN_ROWMAJOR");
 		#ifdef DEBUG_HER
 		printf("Setting build options ... HERMITIAN_ROWMAJOR... for row-major support\n");
 		#endif
 	}
     if( kargs->pigFuncID == CLBLAS_HPR )
     {
-        strcat( buildOptStr, " -DPACKED ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
     }
 
 	//Build options for syr_her.clT to generate HER related code.
-	strcat( buildOptStr, " -DHER_ONLY ");
+	addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHER_ONLY");
 	return;
 }
 
diff --git a/src/library/blas/gens/iamax.cpp b/src/library/blas/gens/iamax.cpp
index bf20afd0..7a5966de 100644
--- a/src/library/blas/gens/iamax.cpp
+++ b/src/library/blas/gens/iamax.cpp
@@ -124,7 +124,7 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_AMAX
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
@@ -132,17 +132,17 @@ setBuildOpts(
 
     if( (kargs->ldb.vector) != 1)
     {
-        strcat( buildOptStr, " -DINCX_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
     }
 
     if( (kargs->ldb.vector) < 1)
     {
-        strcat( buildOptStr, " -DRETURN_ON_INVALID ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DRETURN_ON_INVALID");
     }
 
     if( (kargs->redctnType == REDUCE_MAX_WITH_INDEX_ATOMICS))
     {
-        strcat( buildOptStr, " -DREDUCE_MAX_WITH_INDEX_ATOMICS ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_MAX_WITH_INDEX_ATOMICS");
     }
 
 	return;
diff --git a/src/library/blas/gens/nrm2.cpp b/src/library/blas/gens/nrm2.cpp
index 832f5e41..d898ffbc 100644
--- a/src/library/blas/gens/nrm2.cpp
+++ b/src/library/blas/gens/nrm2.cpp
@@ -128,22 +128,22 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 	}
     if( (kargs->dtype == TYPE_COMPLEX_FLOAT) || (kargs->dtype == TYPE_COMPLEX_DOUBLE) ) {
-        strcat( buildOptStr, " -DCOMPLEX ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX");
     }
     if(kargs->redctnType == REDUCE_BY_HYPOT) {
-            strcat( buildOptStr, "-DUSE_HYPOT ");
+            addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DUSE_HYPOT");
     } else if(kargs->redctnType == REDUCE_BY_SSQ) {
-            strcat( buildOptStr, " -DUSE_SSQ ");
+            addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DUSE_SSQ");
     }
 
     if( (kargs->ldb.vector) != 1) {
-        strcat( buildOptStr, " -DINCX_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
     }
     if( (kargs->ldb.vector) < 1) {
-        strcat( buildOptStr, " -DRETURN_ON_INVALID");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DRETURN_ON_INVALID");
     }
 	return;
 }
diff --git a/src/library/blas/gens/reduction.cpp b/src/library/blas/gens/reduction.cpp
index 1c81c0b7..5c005280 100644
--- a/src/library/blas/gens/reduction.cpp
+++ b/src/library/blas/gens/reduction.cpp
@@ -130,29 +130,29 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 	}
     switch(kargs->redctnType)
     {
-        case REDUCE_BY_SUM:                 strcat( buildOptStr, "-DREDUCE_BY_SUM ");
+        case REDUCE_BY_SUM:                 addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_BY_SUM");
                                             break;
 
-        case REDUCE_BY_MAX:                 strcat( buildOptStr, "-DREDUCE_BY_MAX ");
+        case REDUCE_BY_MAX:                 addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_BY_MAX");
                                             break;
 
-        case REDUCE_BY_MIN:                 strcat( buildOptStr, "-DREDUCE_BY_MIN ");
+        case REDUCE_BY_MIN:                 addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_BY_MIN");
                                             break;
 
-        case REDUCE_MAX_WITH_INDEX:         strcat( buildOptStr, "-DREDUCE_MAX_WITH_INDEX ");
+        case REDUCE_MAX_WITH_INDEX:         addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_MAX_WITH_INDEX");
                                             break;
 
-        case REDUCE_BY_HYPOT:               strcat( buildOptStr, "-DREDUCE_BY_HYPOT ");
+        case REDUCE_BY_HYPOT:               addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_BY_HYPOT");
                                             break;
 
-        case REDUCE_BY_SSQ:                 strcat( buildOptStr, "-DREDUCE_BY_SSQ ");
+        case REDUCE_BY_SSQ:                 addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_BY_SSQ");
                                             break;
 
-        case REDUCE_MAX_WITH_INDEX_ATOMICS: strcat( buildOptStr, "-DREDUCE_MAX_WITH_INDEX_ATOMICS ");
+        case REDUCE_MAX_WITH_INDEX_ATOMICS: addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_MAX_WITH_INDEX_ATOMICS");
                                             break;
 
         default:                            printf("Invalid reduction type!!\n");
diff --git a/src/library/blas/gens/rotg_reg.cpp b/src/library/blas/gens/rotg_reg.cpp
index 0ec1eb0a..4d1ded18 100644
--- a/src/library/blas/gens/rotg_reg.cpp
+++ b/src/library/blas/gens/rotg_reg.cpp
@@ -98,10 +98,10 @@ setBuildOpts(
 	const SolutionStep *step = (const SolutionStep *)args;
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( (kargs->dtype == TYPE_DOUBLE) || (kargs->dtype == TYPE_COMPLEX_DOUBLE) ) {
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 	}
 	if( (kargs->dtype == TYPE_COMPLEX_FLOAT) || (kargs->dtype == TYPE_COMPLEX_DOUBLE) ) {
-	    strcat( buildOptStr, " -DCOMPLEX ");
+	    addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX");
 	}
 
 	return;
diff --git a/src/library/blas/gens/rotm_reg.cpp b/src/library/blas/gens/rotm_reg.cpp
index 2b044192..2b87507e 100644
--- a/src/library/blas/gens/rotm_reg.cpp
+++ b/src/library/blas/gens/rotm_reg.cpp
@@ -121,17 +121,17 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 	}
 	if(kargs->pigFuncID == CLBLAS_ROT)
 	{
-	    strcat( buildOptStr, " -DDO_ROT ");
+	    addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDO_ROT");
 	}
 	if( (kargs->ldb.vector) != 1) {
-        strcat( buildOptStr, " -DINCX_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
     }
     if( (kargs->ldc.vector) != 1) {
-        strcat( buildOptStr, " -DINCY_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY");
     }
 
 	return;
diff --git a/src/library/blas/gens/rotmg_reg.cpp b/src/library/blas/gens/rotmg_reg.cpp
index b256ac6f..be62004e 100644
--- a/src/library/blas/gens/rotmg_reg.cpp
+++ b/src/library/blas/gens/rotmg_reg.cpp
@@ -97,7 +97,7 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 	}
 
 	return;
diff --git a/src/library/blas/gens/scal_reg.cpp b/src/library/blas/gens/scal_reg.cpp
index d82362b1..8b853106 100644
--- a/src/library/blas/gens/scal_reg.cpp
+++ b/src/library/blas/gens/scal_reg.cpp
@@ -125,13 +125,13 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_SCAL
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
 	if( (kargs->ldb.vector) != 1) {
-        strcat( buildOptStr, " -DINCX_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
     }
 
 	return;
diff --git a/src/library/blas/gens/swap_reg.cpp b/src/library/blas/gens/swap_reg.cpp
index 5b44cebe..b75e1004 100644
--- a/src/library/blas/gens/swap_reg.cpp
+++ b/src/library/blas/gens/swap_reg.cpp
@@ -125,16 +125,16 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_SWAP
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
 	if( (kargs->ldb.vector) != 1) {
-        strcat( buildOptStr, " -DINCX_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
     }
     if( (kargs->ldc.vector) != 1) {
-        strcat( buildOptStr, " -DINCY_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY");
     }
 
 	return;
diff --git a/src/library/blas/gens/symm_cached.cpp b/src/library/blas/gens/symm_cached.cpp
index 40011823..0d9ea8d3 100644
--- a/src/library/blas/gens/symm_cached.cpp
+++ b/src/library/blas/gens/symm_cached.cpp
@@ -99,7 +99,7 @@ setBuildOpts(
 
     if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
     {
-        strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
         #ifdef DEBUG_TRMV
         printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
         #endif
@@ -107,23 +107,23 @@ setBuildOpts(
 
 	if (kargs->side == clblasLeft)
 	{
-		strcat(buildOptStr, " -D__SYMM_LEFT__ ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LEFT__");
 	} else {
-		strcat(buildOptStr, " -D__SYMM_RIGHT__ ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_RIGHT__");
 	}
 
 	if (kargs->uplo == clblasUpper)
 	{
-		strcat(buildOptStr, " -D__SYMM_UPPER__ ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_UPPER__");
 	} else {
-		strcat(buildOptStr, " -D__SYMM_LOWER__ ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LOWER__");
 	}
 
 	if (kargs->order == clblasColumnMajor)
 	{
-		strcat(buildOptStr, " -D__SYMM_COLMAJOR__ ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_COLMAJOR__");
 	} else {
-		strcat(buildOptStr, " -D__SYMM_ROWMAJOR__ ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_ROWMAJOR__");
 	}
 
 	strcat(buildOptStr, " -cl-mad-enable ");
diff --git a/src/library/blas/gens/syr2_lds.cpp b/src/library/blas/gens/syr2_lds.cpp
index 4abb4ad9..f5c20cb1 100644
--- a/src/library/blas/gens/syr2_lds.cpp
+++ b/src/library/blas/gens/syr2_lds.cpp
@@ -139,14 +139,14 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE )
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_SYR2
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
     if( kargs->pigFuncID == CLBLAS_SPR2 )
     {
-        strcat( buildOptStr, " -DPACKED ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
     }
 
 	return;
diff --git a/src/library/blas/gens/syr_lds.cpp b/src/library/blas/gens/syr_lds.cpp
index 2379a6b6..16911bb4 100644
--- a/src/library/blas/gens/syr_lds.cpp
+++ b/src/library/blas/gens/syr_lds.cpp
@@ -142,14 +142,14 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE )
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_SYR
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
     if( kargs->pigFuncID == CLBLAS_SPR )
     {
-        strcat( buildOptStr, " -DPACKED ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
     }
 
 	return;
diff --git a/src/library/blas/gens/trmv_reg.cpp b/src/library/blas/gens/trmv_reg.cpp
index 25e750e4..9cacd0f1 100644
--- a/src/library/blas/gens/trmv_reg.cpp
+++ b/src/library/blas/gens/trmv_reg.cpp
@@ -136,28 +136,28 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_TRMV
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
     if( (step->funcID == CLBLAS_HEMV) || (kargs->pigFuncID == CLBLAS_HPMV) || (kargs->pigFuncID == CLBLAS_SPMV) )
 	{
-		strcat( buildOptStr, " -DHEMV_ONLY ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHEMV_ONLY");
 		/*
 		if(kargs->diag == clblasUnit)
 		{
-			strcat( buildOptStr, " -DHEMV_ZERO_DIAG ");
+			addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHEMV_ZERO_DIAG");
 		}
 		*/
 	}
     if ( kargs->pigFuncID == CLBLAS_SPMV )
     {
-        strcat( buildOptStr, " -DSPMV_ONLY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DSPMV_ONLY");
     }
     if( (kargs->pigFuncID == CLBLAS_TPMV) || (kargs->pigFuncID == CLBLAS_HPMV) || (kargs->pigFuncID == CLBLAS_SPMV) )
     {
-        strcat( buildOptStr, " -DPACKED ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
     }
 
 	return;
diff --git a/src/library/blas/gens/trsv_gemv.cpp b/src/library/blas/gens/trsv_gemv.cpp
index 5047bf17..ca73fbe5 100644
--- a/src/library/blas/gens/trsv_gemv.cpp
+++ b/src/library/blas/gens/trsv_gemv.cpp
@@ -128,14 +128,14 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_TRSV_GEMV
 		printf("TRSV GEMV: Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
     if( kargs->pigFuncID == CLBLAS_TPSV)
     {
-        strcat( buildOptStr, " -DPACKED ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
         #ifdef DEBUG_TRSV_GEMV
             printf("TPSV GEMV: Setting build options ... PACKED\n");
         #endif
diff --git a/src/library/blas/gens/trsv_trtri.cpp b/src/library/blas/gens/trsv_trtri.cpp
index 071565ff..0bae0f99 100644
--- a/src/library/blas/gens/trsv_trtri.cpp
+++ b/src/library/blas/gens/trsv_trtri.cpp
@@ -128,21 +128,21 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
     if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
     {
-        strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
         #ifdef DEBUG_TRSV_TRTRI
         printf("TRSV TRTRI: Setting build options ... Double... for DOUBLE PRECISION support\n");
         #endif
     }
     if( kargs->pigFuncID == CLBLAS_TPSV)
     {
-        strcat( buildOptStr, " -DPACKED ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
         #ifdef DEBUG_TRSV_TRTRI
             printf("TPSV TRTRI: Setting build options ... PACKED\n");
         #endif
     }
     if( kargs->pigFuncID == CLBLAS_TBSV)
     {
-        strcat( buildOptStr, " -DBANDED ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DBANDED");
         #ifdef DEBUG_TRSV_TRTRI
         printf("TBSV TRTRI: Setting build options .. BANDED\n");
         #endif

From 9624cccb7421dbdf0d5d2eabe89721d6f394324b Mon Sep 17 00:00:00 2001
From: Christian Kellner <christian@kellner.me>
Date: Thu, 12 Jun 2014 17:55:42 +0200
Subject: [PATCH 54/59] addBuildOpt: use strncat instead of strlcat

Linux does not have strlcat.
---
 src/library/blas/generic/common.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/library/blas/generic/common.c b/src/library/blas/generic/common.c
index 10ec595d..fef08800 100644
--- a/src/library/blas/generic/common.c
+++ b/src/library/blas/generic/common.c
@@ -548,9 +548,10 @@ void addBuildOpt(
     if (l > 0 && !isspace(opts[l-1]) && l+1 < len) {
       opts[l] = ' ';
       opts[l+1]   = '\0';
+      l++;
     }
 
-    strlcat(opts, option, len);
+    strncat(opts, option, len - l - 1);
 }
 
 

From dbe77410f4041d5ca274b1d0ae5dd1929c92e628 Mon Sep 17 00:00:00 2001
From: Christian Kellner <christian@kellner.me>
Date: Fri, 13 Jun 2014 15:51:56 +0200
Subject: [PATCH 55/59] OSX: don't try to call *nrm2 blas calls if incx < 1 or
 N < 1

Because calling nrm2 with negative incx will lead to a crash on
OSX. See issue #37.
---
 src/tests/correctness/blas-lapack.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/src/tests/correctness/blas-lapack.c b/src/tests/correctness/blas-lapack.c
index 4c93104a..d75a5dd8 100644
--- a/src/tests/correctness/blas-lapack.c
+++ b/src/tests/correctness/blas-lapack.c
@@ -840,22 +840,50 @@ int izamax( int n, doublecomplex *x, int incx)
 
 float snrm2( int n, float *x, int incx)
 {
+#ifdef __APPLE__
+    if (n < 1 || incx < 1) {
+        return 0;
+    }
+    return cblas_snrm2(n, x, incx);
+#else
     return snrm2_(&n, x, &incx);
+#endif
 }
 
 double dnrm2( int n, double *x, int incx)
 {
+#ifdef __APPLE__
+    if (n < 1 || incx < 1) {
+        return 0;
+    }
+    return cblas_dnrm2(n, x, incx);
+#else
     return dnrm2_(&n, x, &incx);
+#endif
 }
 
 float scnrm2( int n, complex *x, int incx)
 {
+#ifdef __APPLE__
+    if (n < 1 || incx < 1) {
+        return 0;
+    }
+    return cblas_scnrm2(n, x, incx);
+#else
     return scnrm2_(&n, x, &incx);
+#endif
 }
 
 double dznrm2( int n, doublecomplex *x, int incx)
 {
+#ifdef __APPLE__
+    if (n < 1 || incx < 1) {
+        return 0;
+    }
+    return cblas_dznrm2(n, x, incx);
+#else
     return dznrm2_(&n, x, &incx);
+#endif
 }
 
 float sasum( int n, float *x, int incx)

From 4e2c14c77e5768c5ad407bb8017af964853f812d Mon Sep 17 00:00:00 2001
From: Christian Kellner <christian@kellner.me>
Date: Fri, 13 Jun 2014 15:54:39 +0200
Subject: [PATCH 56/59] OSX: Use cblas interface for *dot blas functions

Using the documented cblas_*dot interface for the *dot BLAS
functions on OSX (using the Accelerate framework) will give the
expected result, in contrast to using the *dot_ functions.
---
 src/tests/correctness/blas-lapack.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/tests/correctness/blas-lapack.c b/src/tests/correctness/blas-lapack.c
index d75a5dd8..2d02137e 100644
--- a/src/tests/correctness/blas-lapack.c
+++ b/src/tests/correctness/blas-lapack.c
@@ -630,12 +630,20 @@ void zdscal( int n, double alpha, doublecomplex *x, int incx)
 
 float sdot( int n, float *x, int incx,  float *y, int incy)
 {
+#ifdef __APPLE__
+    return cblas_sdot(n, x, incx, y, incy);
+#else
     return sdot_(&n, x, &incx, y, &incy);
+#endif
 }
 
 double ddot( int n, double *x, int incx,  double *y, int incy)
 {
+#ifdef __APPLE__
+    return cblas_ddot(n, x, incx, y, incy);
+#else
     return ddot_(&n, x, &incx, y, &incy);
+#endif
 }
 
 complex cdotu( int n, complex *x, int incx, complex *y, int incy)

From 1eb79e7cb01d06486d9fa7e04ea7d7bd87085acf Mon Sep 17 00:00:00 2001
From: Christian Kellner <christian@kellner.me>
Date: Fri, 13 Jun 2014 15:56:27 +0200
Subject: [PATCH 57/59] OSX: Use cblas interface for *asum blas functions

Using cblas_*asum instead of *asum_ will give the expected result
---
 src/tests/correctness/blas-lapack.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/tests/correctness/blas-lapack.c b/src/tests/correctness/blas-lapack.c
index 2d02137e..1f582729 100644
--- a/src/tests/correctness/blas-lapack.c
+++ b/src/tests/correctness/blas-lapack.c
@@ -896,22 +896,38 @@ double dznrm2( int n, doublecomplex *x, int incx)
 
 float sasum( int n, float *x, int incx)
 {
+#ifdef __APPLE__
+    return cblas_sasum(n, x, incx);
+#else
     return sasum_(&n, x, &incx);
+#endif
 }
 
 double dasum( int n, double *x, int incx)
 {
+#ifdef __APPLE__
+    return cblas_dasum(n, x, incx);
+#else
     return dasum_(&n, x, &incx);
+#endif
 }
 
 float scasum( int n, complex *x, int incx)
 {
+#ifdef __APPLE__
+    return cblas_scasum(n, x, incx);
+#else
     return scasum_(&n, x, &incx);
+#endif
 }
 
 double dzasum( int n, doublecomplex *x, int incx)
 {
+#ifdef __APPLE__
+    return cblas_dzasum(n, x, incx);
+#else
     return dzasum_(&n, x, &incx);
+#endif
 }
 
 #endif

From e75d11dc840a010bea5285dc49d67ee8fabc625c Mon Sep 17 00:00:00 2001
From: Christian Kellner <christian@kellner.me>
Date: Fri, 13 Jun 2014 23:27:02 +0200
Subject: [PATCH 58/59] Fix c&p bug in rotmg gen introduced by addBuildOpt
 change

---
 src/library/blas/gens/rotmg_reg.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/library/blas/gens/rotmg_reg.cpp b/src/library/blas/gens/rotmg_reg.cpp
index be62004e..7c333c6f 100644
--- a/src/library/blas/gens/rotmg_reg.cpp
+++ b/src/library/blas/gens/rotmg_reg.cpp
@@ -97,7 +97,7 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 	}
 
 	return;

From c87d8c6fcf774fbd7c514b4340bb40c317837493 Mon Sep 17 00:00:00 2001
From: Christian Kellner <christian@kellner.me>
Date: Tue, 17 Jun 2014 17:22:31 +0200
Subject: [PATCH 59/59] Add a comment to explain the nrm2 workaround on OSX

---
 src/tests/correctness/blas-lapack.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/tests/correctness/blas-lapack.c b/src/tests/correctness/blas-lapack.c
index 1f582729..9687bdf3 100644
--- a/src/tests/correctness/blas-lapack.c
+++ b/src/tests/correctness/blas-lapack.c
@@ -849,6 +849,8 @@ int izamax( int n, doublecomplex *x, int incx)
 float snrm2( int n, float *x, int incx)
 {
 #ifdef __APPLE__
+    //On OSX passing negative values for incx can lead to
+    //a crash, so we catch it here (cf. GitHub issue #37).
     if (n < 1 || incx < 1) {
         return 0;
     }
@@ -861,6 +863,8 @@ float snrm2( int n, float *x, int incx)
 double dnrm2( int n, double *x, int incx)
 {
 #ifdef __APPLE__
+    //On OSX passing negative values for incx can lead to
+    //a crash, so we catch it here (cf. GitHub issue #37).
     if (n < 1 || incx < 1) {
         return 0;
     }
@@ -873,6 +877,8 @@ double dnrm2( int n, double *x, int incx)
 float scnrm2( int n, complex *x, int incx)
 {
 #ifdef __APPLE__
+    //On OSX passing negative values for incx can lead to
+    //a crash, so we catch it here (cf. GitHub issue #37).
     if (n < 1 || incx < 1) {
         return 0;
     }
@@ -885,6 +891,8 @@ float scnrm2( int n, complex *x, int incx)
 double dznrm2( int n, doublecomplex *x, int incx)
 {
 #ifdef __APPLE__
+    //On OSX passing negative values for incx can lead to
+    //a crash, so we catch it here (cf. GitHub issue #37).
     if (n < 1 || incx < 1) {
         return 0;
     }