diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 00000000..2d30e3b3
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,44 @@
+language: cpp
+
+compiler:
+  - gcc
+
+before_install:
+  - sudo apt-get update -qq
+  - sudo apt-get install -qq fglrx opencl-headers libboost-program-options-dev libgtest-dev
+# Uncomment the lines below to help verify that the packages installed above are present
+#  - ls -la /usr/lib/libboost*
+#  - ls -la /usr/include/boost
+#  - ls -la /usr/src/gtest
+
+install:
+  - mkdir -p bin/gTest
+  - cd bin/gTest
+  - cmake -DCMAKE_BUILD_TYPE=Release /usr/src/gtest
+  - make
+  - sudo mv libg* /usr/lib
+
+before_script:
+  - cd ${TRAVIS_BUILD_DIR}
+  - mkdir -p bin/clBLAS
+  - cd bin/clBLAS
+  - cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TEST=OFF -DBUILD_CLIENT=ON ../../src
+
+script: 
+  - make install
+#  - ls -Rla package
+# Run a simple test to validate that the build works, using the CPU device since this runs in a VM
+  - cd package/bin
+  - export LD_LIBRARY_PATH=${TRAVIS_BUILD_DIR}/bin/clBLAS/package/lib64:${LD_LIBRARY_PATH}
+  - ./client --cpu
+
+after_success:
+  - cd ${TRAVIS_BUILD_DIR}/bin/clBLAS
+  - make package
+
+notifications:
+   email:
+     - clmath-developers@googlegroups.com
+   on_success: change
+   on_failure: always
+   
\ No newline at end of file
diff --git a/CHANGELOG b/CHANGELOG
index 9cd3d900..03b9faff 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -243,34 +243,3 @@ For example:
 	./example_sgemm
 		- Run a simple client; one example is provided for each supported main 
 		BLAS function family.
-_______________________________________________________________________________
-(C) 2010-2013 Advanced Micro Devices, Inc. All rights reserved. AMD, the AMD 
-Arrow logo, ATI, the ATI logo, Radeon, FireStream, FireGL, Catalyst, and 
-combinations thereof are trademarks of Advanced Micro Devices, Inc. Microsoft 
-(R), Windows, and Windows Vista (R) are registered trademarks of Microsoft 
-Corporation in the U.S. and/or other jurisdictions. OpenCL and the OpenCL logo 
-are trademarks of Apple Inc. used by permission by Khronos. Other names are for 
-informational purposes only and may be trademarks of their respective owners.
-
-The contents of this document are provided in connection with Advanced Micro 
-Devices, Inc. ("AMD") products. AMD makes no representations or warranties with 
-respect to the accuracy or completeness of the contents of this publication and 
-reserves the right to make changes to specifications and product descriptions 
-at any time without notice. The information contained herein may be of a 
-preliminary or advance nature and is subject to change without notice. No 
-license, whether express, implied, arising by estoppel or otherwise, to any 
-intellectual property rights is granted by this publication. Except as set forth
-in AMD's Standard Terms and Conditions of Sale, AMD assumes no liability 
-whatsoever, and disclaims any express or implied warranty, relating to its 
-products including, but not limited to, the implied warranty of 
-merchantability, fitness for a particular purpose, or infringement of any 
-intellectual property right.
-
-AMD's products are not designed, intended, authorized or warranted for use as 
-components in systems intended for surgical implant into the body, or in other 
-applications intended to support or sustain life, or in any other application 
-in which the failure of AMD's product could create a situation where personal 
-injury, death, or severe property or environmental damage may occur. AMD 
-reserves the right to discontinue or make changes to its products at any time 
-without notice.
-_______________________________________________________________________________
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 61932f1f..0dc5c7e8 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -8,7 +8,7 @@ Firstly, in order to contribute code to this project, a contributor must have a
 * After forking, the contributor [clones their repository](https://help.github.com/articles/create-a-repo) locally on their machine
 * Code is developed and checked into the contributor's repository.  These commits are eventually pushed upstream to their GitHub repository
 * The contributor then issues a [pull-request](https://help.github.com/articles/using-pull-requests) against the **develop** branch of this repository, which is the [git flow](http://nvie.com/posts/a-successful-git-branching-model/) workflow which is well suited for working with GitHub
-    * A [git extention](https://github.com/nvie/gitflow) has been developed to ease the use of the 'git flow' methodology, but requires manual installation by the user.  Refer to the projects wiki
+    * A [git extension](https://github.com/nvie/gitflow) has been developed to ease the use of the 'git flow' methodology, but requires manual installation by the user.  Refer to the project's wiki
 
 At this point, the repository maintainers will be notified by GitHub that a 'pull request' exists pending against their repository.  A code review should be completed within a few days, depending on the scope of submitted code, and the code will either be accepted, rejected or commented on for extra feedback.
 
@@ -32,5 +32,5 @@ guidelines over time
 Pull requests will be reviewed by the set of collaborators that are assigned for the repository.  Pull requests may be accepted, declined or a conversation may start on the pull request thread with feedback.  If the pull request is trivial and all the submission guidelines defined above are honored, the pull request may be accepted without delay.  If the pull request is good, but the guidelines defined above are not followed, the collaborators may leave feedback on the pull request and engage in a conversation with the contributor with what they can do to improve the pull request.  At any time, collaborators may decline a pull request if they decide the contribution is not appropriate for the project, or the feedback from reviewers on a pull request is not being addressed in an appropriate amount of time.
 
 ## Is it possible to become an official collaborator of the repository?
-Yes, we hope to promote trusted members of the community, who have proven themselves to be competent and request to take on the extra responsibility to be official collaborators of the project.  When an individual requests to be an official collaborator, current project collaborators will browse through the history of the requester's prior pull requests and take a vote amongst themselves if the requester should be promoted to collaborator.  These individuals will then have the right to approve/decline pull requests and help shape the path that the project goes.  It is worth noting, that on GitHub everybody has read-only access to the source and that everybody has the ability to issue a pull request to contribute to the project.  The benefit of being a repository collaborator allows you to be able to be able to manage other peoples pull requests.
+Yes, we hope to promote trusted members of the community, who have proven themselves to be competent and request to take on the extra responsibility to be official collaborators of the project.  When an individual requests to be an official collaborator, current project collaborators will browse through the history of the requester's prior pull requests and take a vote amongst themselves if the requester should be promoted to collaborator.  These individuals will then have the right to approve/decline pull requests and help shape the path that the project goes.  It is worth noting, that on GitHub everybody has read-only access to the source and that everybody has the ability to issue a pull request to contribute to the project.  The benefit of being a repository collaborator allows you to be able to manage other people's pull requests.
 
diff --git a/README.md b/README.md
index dfdaa645..5f0338e9 100644
--- a/README.md
+++ b/README.md
@@ -1,78 +1,110 @@
 clBLAS
 =====
+[![Build Status](https://travis-ci.org/clMathLibraries/clBLAS.png)](https://travis-ci.org/clMathLibraries/clBLAS)
+
+
+This repository houses the code for the OpenCL™ BLAS portion of clMath.
+The complete set of BLAS level 1, 2 & 3 routines is implemented. Please
+see Netlib BLAS for the list of supported routines. In addition to GPU
+devices, the library also supports running on CPU devices to facilitate
+debugging and multicore programming. APPML 1.10 is the most current
+generally available pre-packaged binary version of the library available
+for download for both Linux and Windows platforms.
+
+The primary goal of clBLAS is to make it easier for developers to
+utilize the inherent performance and power efficiency benefits of
+heterogeneous computing. clBLAS interfaces do not hide nor wrap OpenCL
+interfaces, but rather leave OpenCL state management to the control of
+the user to allow for maximum performance and flexibility. The clBLAS
+library does generate and enqueue optimized OpenCL kernels, relieving
+the user from the task of writing, optimizing and maintaining kernel
+code themselves.
 
-clMATH is a software library containing FFT and BLAS functions written in OpenCL. In addition to GPU devices, the libraries also support running on CPU devices to facilitate debugging and multicore programming.
+## clBLAS library user documentation
 
-<a href="http://developer.amd.com/tools-and-sdks/heterogeneous-computing/amd-accelerated-parallel-processing-math-libraries/">APPML 1.10</a> is the most current generally available version of the library, and pre-built binaries are available for download on both Linux and Windows platforms.
+[Library and API documentation][] for developers is available online as
+a GitHub Pages website.
 
-This repository houses the code for the OpenCL™ BLAS portion of APPML.  The complete set of BLAS level 1, 2 & 3 routines has been  implemented. Please see <a href="http://www.netlib.org/blas/index.html"> Netlib BLAS </a> for the list of routines.  For more information on supported graphics cards, see the <a href="http://developer.amd.com/tools-and-sdks/heterogeneous-computing/amd-accelerated-parallel-processing-app-sdk/system-requirements-driver-compatibility/">AMD APP System Requirements</a>.
+### Google Groups
 
-The primary goal of clBLAS is to make it easier for developers to utilize the inherent performance and power efficiency benefits of heterogeneous computing.  clBLAS interfaces do not hide nor wrap OpenCL interfaces, but rather leaves OpenCL state management to the control of the user to allow for maximum performance and flexibility.  The clBLAS library does generate and enqueue optimized OpenCL kernels, relieving the user from the task of writing, optimizing and maintaining kernel code themselves.
+Two mailing lists have been created for the clMath projects:
 
-## clBLAS library user documentation
-[Library and API documentation]( http://clmathlibraries.github.io/clBLAS/ ) for developers is available online as a GitHub Pages website
+-   [clmath@googlegroups.com][] - group whose focus is to answer
+    questions on using the library or reporting issues
+
+-   [clmath-developers@googlegroups.com][] - group whose focus is for
+    developers interested in contributing to the library code itself
 
 ## clBLAS Wiki
-The [project wiki](https://github.com/clMathLibraries/clBLAS/wiki) contains helpful documentation, including a [build primer](https://github.com/clMathLibraries/clBLAS/wiki/Build)
+
+The [project wiki][] contains helpful documentation, including a [build
+primer][]
 
 ## Contributing code
-Please refer to and read the [Contributing](CONTRIBUTING.md) document for guidelines on how to contribute code to this open source project
+
+Please refer to and read the [Contributing][] document for guidelines on
+how to contribute code to this open source project. The code in the
+/master branch is considered to be stable, and all pull-requests should
+be made against the /develop branch.
 
 ## License
-The source for clFFT is licensed under the [Apache License, Version 2.0]( http://www.apache.org/licenses/LICENSE-2.0 )
+
+The source for clBLAS is licensed under the [Apache License, Version
+2.0][]
 
 ## Example
-The simple example below shows how to use clBLAS to compute an OpenCL accelerated SGEMM
 
-```c
-#include <sys/types.h>
-#include <stdio.h>
+The simple example below shows how to use clBLAS to compute an OpenCL
+accelerated SGEMM
 
-/* Include the clBLAS header. It includes the appropriate OpenCL headers
+    #include <sys/types.h>
+    #include <stdio.h>
+
+    /* Include the clBLAS header. It includes the appropriate OpenCL headers
  */
-#include <clBLAS.h>
+    #include <clBLAS.h>
 
-/* This example uses predefined matrices and their characteristics for
+    /* This example uses predefined matrices and their characteristics for
  * simplicity purpose.
  */
 
-#define M  4
-#define N  3
-#define K  5
+    #define M  4
+    #define N  3
+    #define K  5
 
-static const cl_float alpha = 10;
+    static const cl_float alpha = 10;
 
-static const cl_float A[M*K] = {
+    static const cl_float A[M*K] = {
     11, 12, 13, 14, 15,
     21, 22, 23, 24, 25,
     31, 32, 33, 34, 35,
     41, 42, 43, 44, 45,
-};
-static const size_t lda = K;        /* i.e. lda = K */
+    };
+    static const size_t lda = K;        /* i.e. lda = K */
 
-static const cl_float B[K*N] = {
+    static const cl_float B[K*N] = {
     11, 12, 13,
     21, 22, 23,
     31, 32, 33,
     41, 42, 43,
     51, 52, 53,
-};
-static const size_t ldb = N;        /* i.e. ldb = N */
+    };
+    static const size_t ldb = N;        /* i.e. ldb = N */
 
-static const cl_float beta = 20;
+    static const cl_float beta = 20;
 
-static cl_float C[M*N] = {
+    static cl_float C[M*N] = {
     11, 12, 13,
     21, 22, 23,
     31, 32, 33,
     41, 42, 43, 
-};
-static const size_t ldc = N;        /* i.e. ldc = N */
+    };
+    static const size_t ldc = N;        /* i.e. ldc = N */
 
-static cl_float result[M*N];
+    static cl_float result[M*N];
 
-int main( void )
-{
+    int main( void )
+    {
     cl_int err;
     cl_platform_id platform = 0;
     cl_device_id device = 0;
@@ -138,25 +170,48 @@ int main( void )
     clReleaseContext( ctx );
 
     return ret;
-}
-```
+    }
 
 ## Build dependencies
+
 ### Library for Windows
-*  Windows® 7/8
-*  Visual Studio 2010 SP1
-*  An OpenCL SDK, such as APP SDK 2.8
-*  Latest CMake
+
+-   Windows® 7/8
+
+-   Visual Studio 2010 SP1, 2012
+
+-   An OpenCL SDK, such as APP SDK 2.9
+
+-   Latest CMake
 
 ### Library for Linux
-*  GCC 4.6 and onwards
-*  An OpenCL SDK, such as APP SDK 2.8
-*  Latest CMake
+
+-   GCC 4.6 and onwards
+
+-   An OpenCL SDK, such as APP SDK 2.9
+
+-   Latest CMake
+
+### Library for Mac OSX
+
+-   Recommended to generate Unix makefiles with cmake
 
 ### Test infrastructure
-* Latest Googletest
-* Latest ACML 
-* Latest Boost
+
+-   Googletest v1.6
+
+-   ACML on windows/linux; Accelerate on Mac OSX
+
+-   Latest Boost
 
 ### Performance infrastructure
-* Python
\ No newline at end of file
+
+-   Python
+
+  [Library and API documentation]: http://clmathlibraries.github.io/clBLAS/
+  [clmath@googlegroups.com]: https://groups.google.com/forum/#!forum/clmath
+  [clmath-developers@googlegroups.com]: https://groups.google.com/forum/#!forum/clmath-developers
+  [project wiki]: https://github.com/clMathLibraries/clBLAS/wiki
+  [build primer]: https://github.com/clMathLibraries/clBLAS/wiki/Build
+  [Contributing]: CONTRIBUTING.md
+  [Apache License, Version 2.0]: http://www.apache.org/licenses/LICENSE-2.0
diff --git a/doc/clBLAS.doxy b/doc/clBLAS.doxy
index 86fbbfc4..afc15ae0 100644
--- a/doc/clBLAS.doxy
+++ b/doc/clBLAS.doxy
@@ -52,7 +52,7 @@ PROJECT_LOGO           =
 # If a relative path is entered, it will be relative to the location 
 # where doxygen was started. If left blank the current directory will be used.
 
-OUTPUT_DIRECTORY       = F:\code\git-svn\clBLAS.head\bin\master\vs10x64.superbuild\docs
+OUTPUT_DIRECTORY       = ..\..\bin\clBLAS.doxy
 
 # If the CREATE_SUBDIRS tag is set to YES, then doxygen will create 
 # 4096 sub-directories (in 2 levels) under the output directory of each output 
@@ -651,17 +651,17 @@ WARN_LOGFILE           =
 # directories like "/usr/src/myproject". Separate the files or directories 
 # with spaces.
 
-INPUT                  = clBLAS.h \
-                         include/cltypes.h \
-                         include/kerngen.h \
-                         include/solver.h \
-                         include/mempat.h \
-                         src/blas/gens/blas_kgen.h \
-                         src/blas/include/clblas-internal.h \
-                         src/blas/include/kernel_extra.h \
-                         src/blas/include/solution_seq.h \
-                         include/granulation.h \
-                         src/tools/ktest/step.h
+INPUT                  = ../src/clBLAS.h \
+                         ../src/include/cltypes.h \
+                         ../src/include/kerngen.h \
+                         ../src/include/solver.h \
+                         ../src/include/mempat.h \
+                         ../src/library/gens/blas_kgen.h \
+                         ../src/library/include/clblas-internal.h \
+                         ../src/library/include/kernel_extra.h \
+                         ../src/library/include/solution_seq.h \
+                         ../src/include/granulation.h \
+                         ../src/library/tools/ktest/step.h
 
 # This tag can be used to specify the character encoding of the source files 
 # that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is 
@@ -721,7 +721,7 @@ EXCLUDE_SYMBOLS        =
 # directories that contain example code fragments that are included (see 
 # the \include command).
 
-EXAMPLE_PATH           = samples
+EXAMPLE_PATH           = ../src/samples
 
 # If the value of the EXAMPLE_PATH tag contains directories, you can use the 
 # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6602b795..41b54ab6 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -14,7 +14,7 @@
 # limitations under the License.
 # ########################################################################
 
-cmake_minimum_required(VERSION 2.6)
+cmake_minimum_required(VERSION 2.8)
 
 #User toggle-able options that can be changed on the command line with -D
 option( BUILD_RUNTIME "Build the BLAS runtime library" ON )
@@ -28,26 +28,39 @@ option( BUILD_KTEST "A command line tool for testing single clBLAS kernel" ON )
 # However, test-correctness can instead use NETLIB as a reference library
 set(CORR_TEST_WITH_ACML ON CACHE BOOL "Use ACML library in correctness tests")
 
-# uncomment these to print compiler invocation lines for nmake files
-# set( CMAKE_START_TEMP_FILE "" )
-# set( CMAKE_END_TEMP_FILE "" )
-# set( CMAKE_VERBOSE_MAKEFILE 1 )
+if( CMAKE_GENERATOR MATCHES "NMake" )
+  option( NMAKE_COMPILE_VERBOSE "Print compile and link strings to the console" OFF )
+  if( NMAKE_COMPILE_VERBOSE )
+    set( CMAKE_START_TEMP_FILE "" )
+    set( CMAKE_END_TEMP_FILE "" )
+    set( CMAKE_VERBOSE_MAKEFILE 1 )
+  endif( )
+endif( )
 
 # If we are on linux, and we wish to link with the netlib BLAS implementation, we need to have a valid fortran compiler
-if( NOT CORR_TEST_WITH_ACML AND NOT WIN32)
+if( NOT CORR_TEST_WITH_ACML AND NOT WIN32 AND NOT APPLE )
   project(clBLAS Fortran C CXX )
 else( )
   project(clBLAS C CXX)
 endif( )
 
 # Define a version for the code
-set( clBLAS_VERSION_MAJOR 2 )
-set( clBLAS_VERSION_MINOR 0 )
-set( clBLAS_VERSION_PATCH 0 )
+if( NOT DEFINED clBLAS_VERSION_MAJOR )
+    set( clBLAS_VERSION_MAJOR 2 )
+endif( )
+
+if( NOT DEFINED clBLAS_VERSION_MINOR )
+    set( clBLAS_VERSION_MINOR 2 )
+endif( )
+
+if( NOT DEFINED clBLAS_VERSION_PATCH )
+    set( clBLAS_VERSION_PATCH 0 )
+endif( )
+
 set( clBLAS_VERSION "${clBLAS_VERSION_MAJOR}.${clBLAS_VERSION_MINOR}.${clBLAS_VERSION_PATCH}")
 
 # Increment this if we break backward compatibility.
-set(clBLAS_SOVERSION 1)
+set( clBLAS_SOVERSION 2 )
 
 # We have custom written Find* modules now in the root source directory
 set( CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${PROJECT_SOURCE_DIR} )
@@ -58,24 +71,34 @@ if( CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT )
 	set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" FORCE )
 endif( )
 
-set( ACMLROOT $ENV{ACMLROOT} CACHE PATH "AMD ACML root path")
-
 if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE Debug CACHE STRING
       "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel."
       FORCE)
 endif()
 
+# These variables are meant to contain strings which should be appended to the installation paths
+# of library and executable binaries, respectively.  They are meant to be user configurable/overridable.  
+set( SUFFIX_LIB_DEFAULT "" )
+set( SUFFIX_BIN_DEFAULT "" )
+
 if(TARGET_PLATFORM EQUAL 32 OR TARGET_PLATFORM EQUAL 64)
     set(TARGET_PLATFORM ${TARGET_PLATFORM} CACHE STRING "Target platform type (32-bit or 64-bit)" FORCE)
+    if( TARGET_PLATFORM EQUAL 64 )
+        set( SUFFIX_LIB_DEFAULT "64" )
+    endif( )
 else()
     if(CMAKE_SIZEOF_VOID_P MATCHES 8)
         set(TARGET_PLATFORM "64" CACHE STRING "Target platform type (32-bit or 64-bit)" FORCE)
+        set( SUFFIX_LIB_DEFAULT "64" )
     else()
         set(TARGET_PLATFORM "32" CACHE STRING "Target platform type (32-bit or 64-bit)" FORCE)
     endif()
 endif()
 
+set( SUFFIX_LIB ${SUFFIX_LIB_DEFAULT} CACHE STRING "String to append to 'lib' install path" )
+set( SUFFIX_BIN ${SUFFIX_BIN_DEFAULT} CACHE STRING "String to append to 'bin' install path" )
+
 if( MSVC_IDE )
     set_property( GLOBAL PROPERTY USE_FOLDERS TRUE )
 endif( )
@@ -98,19 +121,14 @@ endif()
 # TODO: maybe this could be written using the FindBLAS module in the future
 if( BUILD_TEST )
 	if(NOT CORR_TEST_WITH_ACML)
-		if( WIN32 )
+	        if(APPLE)
+			find_library(BLAS_LIBRARIES Accelerate)
+		       	MARK_AS_ADVANCED(BLAS_LIBRARIES)
+		       	message(STATUS "Using Accelerate framework on Mac OS-X")
+	       	else()
 			find_package( Netlib COMPONENTS BLAS REQUIRED )
+              	endif()
 		else( )
-			if( $ENV{REFBLAS_ROOT} )
-				set( REFBLAS_ROOT $ENV{REFBLAS_ROOT} CACHE PATH "NetLib BLAS root path")
-			else( )
-				message(FATAL_ERROR "Cannot find reference BLAS, please set REFBLAS_ROOT environment variable")
-			endif( )
-			
-			# Find reference BLAS implementation
-			include( ${REFBLAS_ROOT}/package/cmake/exportBLAS.cmake )
-		endif( )
-	else( )
 		# Find ACML BLAS implementation
 		# platform dependent ACML subdirectory
 		if (WIN32)
@@ -121,9 +139,10 @@ if( BUILD_TEST )
 
 		find_path(ACML_INCLUDE_DIRS acml.h
 			HINTS
-				$ENV{ACMLROOT}/include
-				${ACMLROOT}/include
-				${ACMLROOT}/${ACML_SUBDIR}/include
+				${ACML_ROOT}/include
+				${ACML_ROOT}/${ACML_SUBDIR}/include
+				$ENV{ACML_ROOT}/include
+                                $ENV{ACML_ROOT}/${ACML_SUBDIR}/include
 		)
 
 		if( ACML_INCLUDE_DIRS )
@@ -132,27 +151,30 @@ if( BUILD_TEST )
 		endif()
 		
 		if( UNIX )
-			find_library(ACML_LIBRARIES acml acml_mp
+			find_library(ACML_LIBRARIES acml_mp
 				HINTS
-					$ENV{ACMLROOT}/lib
-					${ACMLROOT}/lib
-					${ACMLROOT}/${ACML_SUBDIR}/lib
+					${ACML_ROOT}/lib
+					${ACML_ROOT}/${ACML_SUBDIR}/lib
+					$ENV{ACML_ROOT}/lib
+                                        $ENV{ACML_ROOT}/${ACML_SUBDIR}/lib
 			)
 			find_library(_acml_mv_library acml_mv
 				HINTS
-					$ENV{ACMLROOT}/lib
-					${ACMLROOT}/lib
-					${ACMLROOT}/${ACML_SUBDIR}/lib
+					${ACML_ROOT}/lib
+					${ACML_ROOT}/${ACML_SUBDIR}/lib
+					$ENV{ACML_ROOT}/lib
+                                        $ENV{ACML_ROOT}/${ACML_SUBDIR}/lib
 			)
 			mark_as_advanced(_acml_mv_library)
 		endif( )
 		
 		if(WIN32)
-			find_library(ACML_LIBRARIES libacml_dll libacml_mp_dll
+			find_library(ACML_LIBRARIES libacml_mp_dll
 				HINTS
-					$ENV{ACMLROOT}/lib
-					${ACMLROOT}/lib
-					${ACMLROOT}/${ACML_SUBDIR}/lib
+					${ACML_ROOT}/lib
+					${ACML_ROOT}/${ACML_SUBDIR}/lib
+					$ENV{ACML_ROOT}/lib
+                                        $ENV{ACML_ROOT}/${ACML_SUBDIR}/lib
 			)
 		endif( )
 		
@@ -213,16 +235,21 @@ if(CMAKE_COMPILER_IS_GNUCXX)
     if(TARGET_PLATFORM EQUAL 32)
         set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-builtin")
     endif()
-endif()
+elseif( MSVC )
+	# CMake sets a huge stack size (/STACK) in the Windows linker flags, for whatever reason.  Strip it so the linker default is used.
+	string( REGEX REPLACE "/STACK:[0-9]+" "" CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}" )
+	string( REGEX REPLACE "/STACK:[0-9]+" "" CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}" )
+	string( REGEX REPLACE "/STACK:[0-9]+" "" CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS}" ) 
+endif( )
 
 if (WIN32)
     add_definitions(-D_CRT_SECURE_NO_WARNINGS)
-endif()
+endif( )
 
 #TODO:  We should remove this pre-processor define for our 1.8 build; this means removing our deprecated image functions such as calls clCreateImage2D( )
 add_definitions( -DCL_USE_DEPRECATED_OPENCL_1_1_APIS )
 
-configure_file( "${PROJECT_SOURCE_DIR}/version.h.in" "${PROJECT_BINARY_DIR}/include/version.h" )
+configure_file( "${PROJECT_SOURCE_DIR}/clBLAS.version.h.in" "${PROJECT_BINARY_DIR}/include/clBLAS.version.h" )
 
 # configure a header file to pass the CMake version settings to the source, and package the header files in the output archive
 install( FILES 
@@ -230,7 +257,7 @@ install( FILES
 			"clAmdBlas.h"
 			"clAmdBlas.version.h"
 			"clBLAS-complex.h"
-			"${PROJECT_BINARY_DIR}/include/version.h"
+			"${PROJECT_BINARY_DIR}/include/clBLAS.version.h"
 		DESTINATION 
 			"./include" )
 
diff --git a/src/FindOpenCL.cmake b/src/FindOpenCL.cmake
index 1cdc43de..8725612f 100644
--- a/src/FindOpenCL.cmake
+++ b/src/FindOpenCL.cmake
@@ -46,21 +46,17 @@
 #    target_link_libraries(foo ${OPENCL_LIBRARIES})
 #
 #-----------------------
-if( DEFINED ENV{AMDAPPSDKROOT} )
-	set( OPENCL_ROOT $ENV{AMDAPPSDKROOT} CACHE PATH "Environment variable defining the root of OPENCL implementation" )
-else( )
-	set( OPENCL_ROOT "/usr/lib" CACHE PATH "Environment variable defining the root of OPENCL implementation" )
-endif( )
 
 find_path(OPENCL_INCLUDE_DIRS
-	NAMES OpenCL/cl.h CL/cl.h
+    NAMES OpenCL/cl.h CL/cl.h
     HINTS
-		${OPENCL_ROOT}/include
-		ENV AMDAPPSDKROOT/include
-	PATHS
-		/usr/include
-		/usr/local/include
-	DOC "OpenCL header file path"
+        ${OPENCL_ROOT}/include
+        $ENV{AMDAPPSDKROOT}/include
+        $ENV{CUDA_PATH}/include
+    PATHS
+        /usr/include
+        /usr/local/include
+    DOC "OpenCL header file path"
 )
 mark_as_advanced( OPENCL_INCLUDE_DIRS )
 
@@ -68,23 +64,29 @@ mark_as_advanced( OPENCL_INCLUDE_DIRS )
 get_property( LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS )
 
 if( LIB64 )
-	find_library( OPENCL_LIBRARIES
-		NAMES OpenCL
-		HINTS
+    find_library( OPENCL_LIBRARIES
+        NAMES OpenCL
+        HINTS
             ${OPENCL_ROOT}/lib
-            ENV AMDAPPSDKROOT/lib
-		DOC "OpenCL dynamic library path"
-		PATH_SUFFIXES x86_64 x64
-	)
+            $ENV{AMDAPPSDKROOT}/lib
+            $ENV{CUDA_PATH}/lib
+        DOC "OpenCL dynamic library path"
+        PATH_SUFFIXES x86_64 x64
+        PATHS
+            /usr/lib
+    )
 else( )
-	find_library( OPENCL_LIBRARIES
-		NAMES OpenCL
-		HINTS
+    find_library( OPENCL_LIBRARIES
+        NAMES OpenCL
+        HINTS
             ${OPENCL_ROOT}/lib
-            ENV AMDAPPSDKROOT/lib
-		DOC "OpenCL dynamic library path"
-		PATH_SUFFIXES x86
-	)
+            $ENV{AMDAPPSDKROOT}/lib
+            $ENV{CUDA_PATH}/lib
+        DOC "OpenCL dynamic library path"
+        PATH_SUFFIXES x86 Win32
+        PATHS
+            /usr/lib
+    )
 endif( )
 mark_as_advanced( OPENCL_LIBRARIES )
 
@@ -92,5 +94,5 @@ include( FindPackageHandleStandardArgs )
 FIND_PACKAGE_HANDLE_STANDARD_ARGS( OPENCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS )
 
 if( NOT OPENCL_FOUND )
-	message( STATUS "FindOpenCL looked for libraries named: OpenCL" )
+    message( STATUS "FindOpenCL looked for libraries named: OpenCL" )
 endif()
diff --git a/src/clAmdBlas.h b/src/clAmdBlas.h
index 1921473e..c994eccc 100644
--- a/src/clAmdBlas.h
+++ b/src/clAmdBlas.h
@@ -8528,14 +8528,11 @@ clAmdBlasCgemm(
     size_t K,
     FloatComplex alpha,
     const cl_mem A,
-    size_t offA,
     size_t lda,
     const cl_mem B,
-    size_t offB,
     size_t ldb,
     FloatComplex beta,
     cl_mem C,
-    size_t offC,
     size_t ldc,
     cl_uint numCommandQueues,
     cl_command_queue *commandQueues,
diff --git a/src/clBLAS.def b/src/clBLAS.def
index 5111ff2a..0a9f9b6b 100644
--- a/src/clBLAS.def
+++ b/src/clBLAS.def
@@ -1,6 +1,18 @@
-;/***********************************************************************
-;**	Copyright (C) 2010 Advanced Micro Devices, Inc. All Rights Reserved.
-;***********************************************************************/
+;/* ************************************************************************
+; * Copyright 2013 Advanced Micro Devices, Inc.
+; *
+; * Licensed under the Apache License, Version 2.0 (the "License");
+; * you may not use this file except in compliance with the License.
+; * You may obtain a copy of the License at
+; *
+; * http://www.apache.org/licenses/LICENSE-2.0
+; *
+; * Unless required by applicable law or agreed to in writing, software
+; * distributed under the License is distributed on an "AS IS" BASIS,
+; * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; * See the License for the specific language governing permissions and
+; * limitations under the License.
+; * ************************************************************************/
 
 LIBRARY	clBLAS
 
diff --git a/src/clBLAS.h b/src/clBLAS.h
index 6d219c33..7d89b9f6 100644
--- a/src/clBLAS.h
+++ b/src/clBLAS.h
@@ -56,6 +56,10 @@ extern "C" {
  * keeping interfaces familiar to users who know how to use BLAS. All
  * functions accept matrices through buffer objects.
  *
+ * This library is entirely thread-safe with the exception of the following APIs:
+ * clblasSetup and clblasTeardown.
+ * Developers using the library can safely call any BLAS routine from different threads.
+ *
  * @section deprecated
  * This library provided support for the creation of scratch images to achieve better performance
  * on older <a href="http://developer.amd.com/gpu/AMDAPPSDK/Pages/default.aspx">AMD APP SDK's</a>.
diff --git a/src/version.h.in b/src/clBLAS.version.h.in
similarity index 100%
rename from src/version.h.in
rename to src/clBLAS.version.h.in
diff --git a/src/client/CMakeLists.txt b/src/client/CMakeLists.txt
index 5154a313..2ebebf11 100644
--- a/src/client/CMakeLists.txt
+++ b/src/client/CMakeLists.txt
@@ -26,7 +26,11 @@ set(CLIENT_HEADER
     clfunc_xtrmm.hpp
     clfunc_xtrsm.hpp
     clfunc_xsyrk.hpp
-    clfunc_xsyr2k.hpp)
+    clfunc_xsyr2k.hpp
+	clfunc_xhemm.hpp
+	clfunc_xsymm.hpp
+	clfunc_xherk.hpp
+	clfunc_xher2k.hpp)
 
 set(WRAPPER_SRC testPerfWrapper.cpp)
 
@@ -48,21 +52,15 @@ include_directories(
 
 add_executable(client ${CLIENT_SRC} ${CLIENT_HEADER})
 target_link_libraries(client ${Boost_LIBRARIES} ${OPENCL_LIBRARIES} clBLAS)
+set_target_properties( client PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 add_executable(testPerfWrapper ${WRAPPER_SRC})
 target_link_libraries(testPerfWrapper ${Boost_LIBRARIES})
-
-if( TARGET_PLATFORM EQUAL 64 )
-    set( BIN_DIR bin64 )
-    set( LIB_DIR lib64 )
-else()
-    set( BIN_DIR bin32 )
-    set( LIB_DIR lib32 )
-endif()
+set_target_properties( testPerfWrapper PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 # CPack configuration; include the executable into the package
 install( TARGETS client testPerfWrapper
-		RUNTIME DESTINATION ${BIN_DIR}
-		LIBRARY DESTINATION ${LIB_DIR}
-		ARCHIVE DESTINATION ${LIB_DIR}/import
+		RUNTIME DESTINATION bin${SUFFIX_BIN}
+		LIBRARY DESTINATION lib${SUFFIX_LIB}
+		ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
 		)
diff --git a/src/client/clfunc_common.hpp b/src/client/clfunc_common.hpp
index 4876daf5..bda11866 100644
--- a/src/client/clfunc_common.hpp
+++ b/src/client/clfunc_common.hpp
@@ -28,6 +28,11 @@
 #include "dis_warning.h"
 
 #include "clBLAS.h"
+#if defined(__APPLE__) || defined(__MACOSX)
+#include <OpenCL/cl_ext.h>
+#else
+#include <CL/cl_ext.h>
+#endif
 
 template<typename T>
 static T
@@ -243,6 +248,7 @@ class clblasFunc
         OPENCL_V_THROW(err, "creating context");
         queue_ = clCreateCommandQueue(ctx_, device_, 0, &err);
 
+
         timer_id = timer.getUniqueID( "clfunc", 0 );
 
 
@@ -307,13 +313,18 @@ class clblasFunc
     virtual void reset_gpu_write_buffer() = 0;
 	virtual void read_gpu_buffer() = 0;
 	virtual void roundtrip_func() = 0;
+	virtual void roundtrip_func_rect() {} // optional round-trip variant; default does nothing (xGemm overrides it with rectangular buffer transfers)
+	virtual void allochostptr_roundtrip_func() {} // optional; default does nothing (xGemm overrides it using CL_MEM_ALLOC_HOST_PTR buffers)
+	virtual void usehostptr_roundtrip_func() {} // optional; default does nothing (xGemm overrides it using CL_MEM_USE_HOST_PTR buffers)
+	virtual void copyhostptr_roundtrip_func() {} // optional; default does nothing (xGemm overrides it using CL_MEM_COPY_HOST_PTR buffers)
+	virtual void usepersismem_roundtrip_func() {} // optional; default does nothing (xGemm overrides it using CL_MEM_USE_PERSISTENT_MEM_AMD buffers)
 	virtual void roundtrip_setup_buffer(int order_option, int side_option,
                               int uplo_option, int diag_option, int
                               transA_option, int transB_option,
                               size_t M, size_t N, size_t K, size_t lda,
                               size_t ldb, size_t ldc, size_t offA, size_t offBX,
                               size_t offCY, double alpha, double beta) = 0;
-
+	virtual void releaseGPUBuffer_deleteCPUBuffer()=0; // pure virtual: every derived benchmark class must supply its own buffer clean-up
     StatisticalTimer& timer;
     StatisticalTimer::sTimerID timer_id;
 
diff --git a/src/client/clfunc_xgemm.hpp b/src/client/clfunc_xgemm.hpp
index 17223a62..fcd40a79 100644
--- a/src/client/clfunc_xgemm.hpp
+++ b/src/client/clfunc_xgemm.hpp
@@ -62,20 +62,13 @@ class xGemm : public clblasFunc
 
     ~xGemm()
     {
-        delete buffer_.a_;
-        delete buffer_.b_;
-        delete buffer_.c_;
-        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
-                        "releasing buffer A");
-        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_b_),
-                        "releasing buffer B");
-        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
-                        "releasing buffer C");
     }
 
     void call_func()
     {
-        std::cout << "xGemm::call_func\n";
+		timer.Start(timer_id); // time only the library call
+		xGemm_Function(true); // true: xGemm_Function waits for the GEMM event before returning
+		timer.Stop(timer_id);
     }
 
     double gflops()
@@ -420,7 +413,307 @@ class xGemm : public clblasFunc
 
 	void roundtrip_func()
 	{
-		std::cout << "xGemm::roundtrip_func\n";
+	timer.Start(timer_id); // timed region: buffer creation, host-to-device writes, the GEMM call and the read-back of C
+		cl_int err; // NOTE(review): reassigned by each call below but never checked
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                       (buffer_.lda_*buffer_.a_num_vectors_ +
+                                           buffer_.offA_) * sizeof(T),
+                                       NULL, &err);
+
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T),
+                                        NULL, &err);
+
+        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(T),
+                                        NULL, &err);
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE, // blocking write; host data lands at device offset offA_
+                                   buffer_.offA_ * sizeof(T),
+                                   buffer_.lda_ * buffer_.a_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.a_, 0, NULL, NULL);
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                                   buffer_.offB_ * sizeof(T),
+                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.b_, 0, NULL, NULL);
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+                                   buffer_.offC_ * sizeof(T),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                   sizeof(T),
+                                   buffer_.c_, 0, NULL, NULL);
+		xGemm_Function(false); // false: do not wait for the GEMM event here; the blocking read below is enqueued on the same queue
+		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+			                      buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(T),
+								  buffer_.c_, 0, NULL, &event_);
+		clWaitForEvents(1, &event_);
+	timer.Stop(timer_id);
+	}
+	void roundtrip_func_rect()
+	{
+	timer.Start(timer_id);
+		cl_int err;
+		//Round trip using rectangular transfers: host matrices keep their ld*_ row pitch, the device copies are packed
+		size_t a_buffer_origin[3] = {buffer_.offA_*sizeof(T),0,0}; //byte offset, matching the offA_ handed to the GEMM call
+		size_t a_host_origin[3] = {0,0,0};
+		size_t a_region[3] = {buffer_.m_*sizeof(T),buffer_.k_,1}; //NOTE(review): treats A as k_ rows of m_ elements; confirm for transposed A
+		size_t a_buffer_row_pitch=0*sizeof(T);//0: device rows are packed back to back (pitch defaults to region[0])
+		size_t a_buffer_slice_pitch=0;
+		size_t a_host_row_pitch=buffer_.lda_*sizeof(T);
+		size_t a_host_slice_pitch=0;
+
+		size_t b_buffer_origin[3] = {buffer_.offB_*sizeof(T),0,0}; //byte offset, matching the offB_ handed to the GEMM call
+		size_t b_host_origin[3] = {0,0,0};
+		size_t b_region[3] = {buffer_.k_*sizeof(T),buffer_.n_,1};
+		size_t b_buffer_row_pitch=0*sizeof(T);//0: device rows are packed back to back
+		size_t b_buffer_slice_pitch=0;
+		size_t b_host_row_pitch=buffer_.ldb_*sizeof(T);
+		size_t b_host_slice_pitch=0;
+
+		size_t c_buffer_origin[3] = {buffer_.offC_*sizeof(T),0,0}; //byte offset, matching the offC_ handed to the GEMM call
+		size_t c_host_origin[3] = {0,0,0};
+		size_t c_region[3] = {buffer_.m_*sizeof(T),buffer_.n_,1};
+		size_t c_buffer_row_pitch=0*sizeof(T);//0: device rows are packed back to back
+		size_t c_buffer_slice_pitch=0;
+		size_t c_host_row_pitch=buffer_.ldc_*sizeof(T);
+		size_t c_host_slice_pitch=0;
+
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                       (buffer_.k_*buffer_.m_ +
+                                           buffer_.offA_) * sizeof(T),
+                                       NULL, &err);
+
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.k_ * buffer_.n_ +
+                                            buffer_.offB_) * sizeof(T),
+                                        NULL, &err);
+
+        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.m_ * buffer_.n_ +
+                                            buffer_.offC_) * sizeof(T),
+                                        NULL, &err);
+        /*
+		err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+                                   buffer_.offA_ * sizeof(T),
+                                   buffer_.lda_ * buffer_.a_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.a_, 0, NULL, NULL);
+		
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                                   buffer_.offB_ * sizeof(T),
+                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.b_, 0, NULL, NULL);
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+                                   buffer_.offC_ * sizeof(T),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                   sizeof(T),
+                                   buffer_.c_, 0, NULL, NULL);*/
+        err = clEnqueueWriteBufferRect(queue_, buffer_.buf_a_, CL_TRUE, a_buffer_origin, a_host_origin, a_region, a_buffer_row_pitch,
+										a_buffer_slice_pitch, a_host_row_pitch, a_host_slice_pitch, buffer_.a_, 0, NULL, NULL);
+        err = clEnqueueWriteBufferRect(queue_, buffer_.buf_b_, CL_TRUE, b_buffer_origin, b_host_origin, b_region, b_buffer_row_pitch,
+										b_buffer_slice_pitch, b_host_row_pitch, b_host_slice_pitch, buffer_.b_, 0, NULL, NULL);
+        err = clEnqueueWriteBufferRect(queue_, buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch,
+										c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, NULL);
+
+		if(buffer_.trans_a_==clblasNoTrans) //device copies are packed, so the GEMM call gets packed leading dimensions
+		{
+			buffer_.lda_=buffer_.m_;
+		}
+		else
+		{
+			buffer_.lda_=buffer_.k_;
+		}
+		if(buffer_.trans_b_==clblasNoTrans)
+		{
+			buffer_.ldb_=buffer_.k_;
+		}
+		else
+		{
+			buffer_.ldb_=buffer_.n_;
+		}
+		buffer_.ldc_=buffer_.m_; //NOTE(review): lda_/ldb_/ldc_ stay overwritten after this call returns
+		xGemm_Function(false);
+		/*
+		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+			                      buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(T),
+								  buffer_.c_, 0, NULL, &event_);
+		*/
+		err = ::clEnqueueReadBufferRect(queue_, buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch,
+										c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, &event_);
+		clWaitForEvents(1, &event_);
+	timer.Stop(timer_id);
+	}
+	void allochostptr_roundtrip_func()
+	{
+	timer.Start(timer_id);
+		// Timed round trip through CL_MEM_ALLOC_HOST_PTR buffers: map, fill, unmap, run GEMM, map C again and copy it out
+		cl_int err;
+		// Create buffers with CL_MEM_ALLOC_HOST_PTR for zero copy
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+                                       (buffer_.lda_*buffer_.a_num_vectors_ +
+                                           buffer_.offA_) * sizeof(T),
+                                       NULL, &err);
+
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T),
+                                        NULL, &err);
+
+        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(T),
+                                        NULL, &err);
+
+		// map the buffers to pointers at host device
+		T *map_a,*map_b,*map_c;
+		map_a = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0, 
+										  (buffer_.lda_*buffer_.a_num_vectors_ +
+                                           buffer_.offA_) * sizeof(T),
+										   0, NULL, NULL, &err);
+		map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0, 
+										  (buffer_.ldb_*buffer_.b_num_vectors_ +
+                                           buffer_.offB_) * sizeof(T),
+										   0, NULL, NULL, &err);
+	    map_c = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0, 
+										  (buffer_.ldc_*buffer_.c_num_vectors_ +
+                                           buffer_.offC_) * sizeof(T),
+										   0, NULL, NULL, &err);
+		// memcpy the input A, B, C into the mappings at their device offsets (host arrays start at element 0, as in roundtrip_func)
+		memcpy( map_a + buffer_.offA_, buffer_.a_, buffer_.lda_*buffer_.a_num_vectors_ * sizeof( T ) );
+		memcpy( map_b + buffer_.offB_, buffer_.b_, buffer_.ldb_*buffer_.b_num_vectors_ * sizeof( T ) );
+		memcpy( map_c + buffer_.offC_, buffer_.c_, buffer_.ldc_*buffer_.c_num_vectors_ * sizeof( T ) );
+		// unmap the buffers
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_a_, map_a, 0, NULL, NULL);
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL);
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_c_, map_c, 0, NULL, NULL);
+		// calling clBLAS
+		xGemm_Function(false);
+		// map the C buffer again to read output
+	    map_c = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0, 
+										  (buffer_.ldc_*buffer_.c_num_vectors_ +
+                                           buffer_.offC_) * sizeof(T),
+										   0, NULL, NULL, &err);
+		memcpy( buffer_.c_, map_c + buffer_.offC_, buffer_.ldc_*buffer_.c_num_vectors_ * sizeof( T ) ); // copy the result out of the read mapping
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_c_, map_c, 0, NULL, &event_);
+		clWaitForEvents(1, &event_);
+
+	timer.Stop(timer_id);
+	}
+	void usehostptr_roundtrip_func()
+	{
+	timer.Start(timer_id); // timed region: creation of the three CL_MEM_USE_HOST_PTR buffers plus the GEMM call
+		cl_int err; // NOTE(review): written by each clCreateBuffer below but never checked
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+                                       (buffer_.lda_*buffer_.a_num_vectors_ +
+                                           buffer_.offA_) * sizeof(T),
+                                       buffer_.a_, &err);
+
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T),
+                                        buffer_.b_, &err);
+
+        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(T),
+                                        buffer_.c_, &err);
+		xGemm_Function(true); // true: wait for the GEMM event; this variant issues no explicit read-back or map of C
+	timer.Stop(timer_id);
+	}
+	void copyhostptr_roundtrip_func()
+	{
+	timer.Start(timer_id); // timed region: creation of the CL_MEM_COPY_HOST_PTR buffers, the GEMM call and the read-back of C
+		cl_int err; // NOTE(review): reassigned by each call below but never checked
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                       (buffer_.lda_*buffer_.a_num_vectors_ +
+                                           buffer_.offA_) * sizeof(T),
+                                       buffer_.a_, &err);
+
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T),
+                                        buffer_.b_, &err);
+
+        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(T),
+                                        buffer_.c_, &err);
+		xGemm_Function(false); // false: do not wait for the GEMM event here; the blocking read below is enqueued on the same queue
+		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+			                      buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(T),
+								  buffer_.c_, 0, NULL, &event_);
+		clWaitForEvents(1, &event_);
+	timer.Stop(timer_id);
+	}
+	void usepersismem_roundtrip_func()
+	{
+#if defined(CL_MEM_USE_PERSISTENT_MEM_AMD)
+	timer.Start(timer_id);
+		// Timed round trip through CL_MEM_USE_PERSISTENT_MEM_AMD buffers: map, fill, unmap, run GEMM, map C again and copy it out
+		cl_int err;
+
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD,
+                                       (buffer_.lda_*buffer_.a_num_vectors_ +
+                                           buffer_.offA_) * sizeof(T),
+                                       NULL, &err);
+
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T),
+                                        NULL, &err);
+
+        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_USE_PERSISTENT_MEM_AMD,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(T),
+                                        NULL, &err);
+
+		// map the buffers to pointers at host devices
+		T *map_a,*map_b,*map_c;
+		map_a = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0, 
+										  (buffer_.lda_*buffer_.a_num_vectors_ +
+                                           buffer_.offA_) * sizeof(T),
+										   0, NULL, NULL, &err);
+		map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0, 
+										  (buffer_.ldb_*buffer_.b_num_vectors_ +
+                                           buffer_.offB_) * sizeof(T),
+										   0, NULL, NULL, &err);
+	    map_c = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0, 
+										  (buffer_.ldc_*buffer_.c_num_vectors_ +
+                                           buffer_.offC_) * sizeof(T),
+										   0, NULL, NULL, &err);
+		// memcpy the input A, B, C into the mappings at their device offsets (host arrays start at element 0, as in roundtrip_func)
+		memcpy( map_a + buffer_.offA_, buffer_.a_, buffer_.lda_*buffer_.a_num_vectors_ * sizeof( T ) );
+		memcpy( map_b + buffer_.offB_, buffer_.b_, buffer_.ldb_*buffer_.b_num_vectors_ * sizeof( T ) );
+		memcpy( map_c + buffer_.offC_, buffer_.c_, buffer_.ldc_*buffer_.c_num_vectors_ * sizeof( T ) );
+		// unmap the buffers
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_a_, map_a, 0, NULL, NULL);
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL);
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_c_, map_c, 0, NULL, NULL);
+		// calling clBLAS
+		xGemm_Function(false);
+		// map the C buffer again to read output
+	    map_c = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0, 
+										  (buffer_.ldc_*buffer_.c_num_vectors_ +
+                                           buffer_.offC_) * sizeof(T),
+										   0, NULL, NULL, &err);
+		memcpy( buffer_.c_, map_c + buffer_.offC_, buffer_.ldc_*buffer_.c_num_vectors_ * sizeof( T ) ); // copy the result out of the read mapping
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_c_, map_c, 0, NULL, &event_);
+		clWaitForEvents(1, &event_);
+
+	timer.Stop(timer_id);
+#else
+		std::cout<<"CL_MEM_USE_PERSISTENT_MEM_AMD is only supported on AMD hardware"<<std::endl;
+#endif
+
 	}
 	void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
                       int diag_option, int transA_option, int  transB_option,
@@ -659,6 +952,20 @@ class xGemm : public clblasFunc
         buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_ ];
 
     }
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		//Per-iteration clean-up: client.cpp runs a series of test iterations and averages the time,
+		//so host arrays (allocated with new T[]) and device buffers are released here, before the destructor is reached.
+		delete[] buffer_.a_;
+        delete[] buffer_.b_;
+        delete[] buffer_.c_;
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
+                        "releasing buffer A");
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_b_),
+                        "releasing buffer B");
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
+                        "releasing buffer C");
+	}
 
 protected:
     void initialize_scalars(double alpha, double beta)
@@ -669,296 +976,86 @@ class xGemm : public clblasFunc
 
 private:
     xGemmBuffer<T> buffer_;
-
-}; // class xgemm
+	void xGemm_Function(bool flush);
 
 
+}; // class xgemm
 
 template<>
-void
+void 
 xGemm<cl_float>::
-call_func()
+xGemm_Function(bool flush)
 {
-    timer.Start(timer_id);
-
 	clblasSgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
                      buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
                      buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                      buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                      buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
                      buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
-
-	clWaitForEvents(1, &event_);
-    timer.Stop(timer_id);
-}
-
-template<>
-void
-xGemm<cl_float>::
-roundtrip_func()
-{
-    timer.Start(timer_id);
-	cl_int err;
-        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                       (buffer_.lda_*buffer_.a_num_vectors_ +
-                                           buffer_.offA_) * sizeof(cl_float),
-                                       NULL, &err);
-
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
-                                            buffer_.offB_) * sizeof(cl_float),
-                                        NULL, &err);
-
-        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
-                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
-                                            buffer_.offC_) * sizeof(cl_float),
-                                        NULL, &err);
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
-                                   buffer_.offA_ * sizeof(cl_float),
-                                   buffer_.lda_ * buffer_.a_num_vectors_ *
-                                       sizeof(cl_float),
-                                   buffer_.a_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-                                   buffer_.offB_ * sizeof(cl_float),
-                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_float),
-                                   buffer_.b_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
-                                   buffer_.offC_ * sizeof(cl_float),
-                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
-                                   sizeof(cl_float),
-                                   buffer_.c_, 0, NULL, NULL);
-		clblasSgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
-                     buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
-                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
-                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
-                     buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
-                     buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
-		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
-			                      buffer_.offC_ * sizeof(cl_float), buffer_.ldc_ * buffer_.c_num_vectors_ *
-                                       sizeof(cl_float),
-								  buffer_.c_, 0, NULL, &event_);
-	clWaitForEvents(1, &event_);
-	timer.Stop(timer_id);
+	//flush==true if only the kernel time (library call) is timed
+	//flush==false if memory time is also timed
+	if (flush==true)
+	{
+		clWaitForEvents(1, &event_);
+	}
 }
 
-
 template<>
-void
+void 
 xGemm<cl_double>::
-call_func()
+xGemm_Function(bool flush)
 {
-    timer.Start(timer_id);
-
 	clblasDgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
                      buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
                      buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                      buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                      buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
                      buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
-
-    clWaitForEvents(1, &event_);
-    timer.Stop(timer_id);
-}
-
-template<>
-void
-xGemm<cl_double>::
-roundtrip_func()
+	//flush==true if only the kernel time (library call) is timed
+	//flush==false if memory time is also timed
+	if (flush==true)
 	{
-    timer.Start(timer_id);
-	cl_int err;
-	//set up buffer
-        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                       (buffer_.lda_*buffer_.a_num_vectors_ +
-                                           buffer_.offA_) * sizeof(cl_double),
-                                       NULL, &err);
-
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
-                                            buffer_.offB_) * sizeof(cl_double),
-                                        NULL, &err);
-
-        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
-                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
-                                            buffer_.offC_) * sizeof(cl_double),
-                                        NULL, &err);
-		//initialize gpu buffer
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
-                                   buffer_.offA_ * sizeof(cl_double),
-                                   buffer_.lda_ * buffer_.a_num_vectors_ *
-                                       sizeof(cl_double),
-                                   buffer_.a_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-                                   buffer_.offB_ * sizeof(cl_double),
-                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_double),
-                                   buffer_.b_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
-                                   buffer_.offC_ * sizeof(cl_double),
-                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
-                                   sizeof(cl_double),
-                                   buffer_.c_, 0, NULL, NULL);
-		//call_func
-		clblasDgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
-                     buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
-                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
-                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
-                     buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
-                     buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
-		//read gpu buffer
-		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
-			                      buffer_.offC_ * sizeof(cl_double), buffer_.ldc_ * buffer_.c_num_vectors_ *
-                                       sizeof(cl_double),
-								  buffer_.c_, 0, NULL, &event_);
-	clWaitForEvents(1, &event_);
-	timer.Stop(timer_id);
+		clWaitForEvents(1, &event_);
 	}
+}
 
 template<>
-void
+void 
 xGemm<cl_float2>::
-call_func()
+xGemm_Function(bool flush)
 {
-    timer.Start(timer_id);
-
-    clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+	clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
                      buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
                      buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                      buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                      buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
                      buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
-
-	clWaitForEvents(1, &event_);
-    timer.Stop(timer_id);
-}
-
-template<>
-void
-xGemm<cl_float2>::
-roundtrip_func()
+	//flush==true if only the kernel time (library call) is timed
+	//flush==false if memory time is also timed
+	if (flush==true)
 	{
-    timer.Start(timer_id);
-	cl_int err;
-        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                       (buffer_.lda_*buffer_.a_num_vectors_ +
-                                           buffer_.offA_) * sizeof(cl_float2),
-                                       NULL, &err);
-
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
-                                            buffer_.offB_) * sizeof(cl_float2),
-                                        NULL, &err);
-
-        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
-                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
-                                            buffer_.offC_) * sizeof(cl_float2),
-                                        NULL, &err);
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
-                                   buffer_.offA_ * sizeof(cl_float2),
-                                   buffer_.lda_ * buffer_.a_num_vectors_ *
-                                       sizeof(cl_float2),
-                                   buffer_.a_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-                                   buffer_.offB_ * sizeof(cl_float2),
-                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_float2),
-                                   buffer_.b_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
-                                   buffer_.offC_ * sizeof(cl_float2),
-                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
-                                   sizeof(cl_float2),
-                                   buffer_.c_, 0, NULL, NULL);
-		clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
-                     buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
-                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
-                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
-                     buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
-                     buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
-		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
-			                      buffer_.offC_ * sizeof(cl_float2), buffer_.ldc_ * buffer_.c_num_vectors_ *
-                                       sizeof(cl_float2),
-								  buffer_.c_, 0, NULL, &event_);
-	clWaitForEvents(1, &event_);
-	timer.Stop(timer_id);
+		clWaitForEvents(1, &event_);
 	}
+}
 
 template<>
-void
+void 
 xGemm<cl_double2>::
-call_func()
+xGemm_Function(bool flush)
 {
-    timer.Start(timer_id);
-
-    clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+	clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
                      buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
                      buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                      buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                      buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
                      buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
-
-	clWaitForEvents(1, &event_);
-    timer.Stop(timer_id);
-}
-
-template<>
-void
-xGemm<cl_double2>::
-roundtrip_func()
+	//flush==true if only the kernel time (library call) is timed
+	//flush==false if memory time is also timed
+	if (flush==true)
 	{
-    timer.Start(timer_id);
-	cl_int err;
-        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                       (buffer_.lda_*buffer_.a_num_vectors_ +
-                                           buffer_.offA_) * sizeof(cl_double2),
-                                       NULL, &err);
-
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
-                                            buffer_.offB_) * sizeof(cl_double2),
-                                        NULL, &err);
-
-        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
-                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
-                                            buffer_.offC_) * sizeof(cl_double2),
-                                        NULL, &err);
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
-                                   buffer_.offA_ * sizeof(cl_double2),
-                                   buffer_.lda_ * buffer_.a_num_vectors_ *
-                                       sizeof(cl_double2),
-                                   buffer_.a_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-                                   buffer_.offB_ * sizeof(cl_double2),
-                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_double2),
-                                   buffer_.b_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
-                                   buffer_.offC_ * sizeof(cl_double2),
-                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
-                                   sizeof(cl_double2),
-                                   buffer_.c_, 0, NULL, NULL);
-		clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
-                     buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
-                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
-                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
-                     buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
-                     buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
-		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
-			                      buffer_.offC_ * sizeof(cl_double2), buffer_.ldc_ * buffer_.c_num_vectors_ *
-                                       sizeof(cl_double2),
-								  buffer_.c_, 0, NULL, &event_);
-	clWaitForEvents(1, &event_);
-	timer.Stop(timer_id);
+		clWaitForEvents(1, &event_);
 	}
-
+}
 
 template<>
 double
diff --git a/src/client/clfunc_xgemv.hpp b/src/client/clfunc_xgemv.hpp
index 2d1d5b06..cc851094 100644
--- a/src/client/clfunc_xgemv.hpp
+++ b/src/client/clfunc_xgemv.hpp
@@ -286,6 +286,12 @@ class xGemv : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		// Cleanup hook for use between benchmark iterations: client.cpp runs the test several times and averages the time,
+		// so buffers need to be released before the destructor is eventually reached.
+		// TODO: not implemented yet - this stub currently releases nothing.
+	}
 
 protected:
     void initialize_scalars(double alpha, double beta)
diff --git a/src/client/clfunc_xger.hpp b/src/client/clfunc_xger.hpp
index 05899cd7..d2f36dbc 100644
--- a/src/client/clfunc_xger.hpp
+++ b/src/client/clfunc_xger.hpp
@@ -217,6 +217,12 @@ class xGer : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		// Cleanup hook for use between benchmark iterations: client.cpp runs the test several times and averages the time,
+		// so buffers need to be released before the destructor is eventually reached.
+		// TODO: not implemented yet - this stub currently releases nothing.
+	}
 
 protected:
   void initialize_scalars(double alpha, double beta)
diff --git a/src/client/clfunc_xgerc.hpp b/src/client/clfunc_xgerc.hpp
index 829d9380..ed39f797 100644
--- a/src/client/clfunc_xgerc.hpp
+++ b/src/client/clfunc_xgerc.hpp
@@ -98,7 +98,12 @@ class xGerc : public clblasFunc
 		{}
 
   void call_func();
-
+  void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		// Cleanup hook for use between benchmark iterations: client.cpp runs the test several times and averages the time,
+		// so buffers need to be released before the destructor is eventually reached.
+		// TODO: not implemented yet - this stub currently releases nothing.
+	}
 protected:
   void initialize_scalars(double alpha, double beta)
   {
diff --git a/src/client/clfunc_xgeru.hpp b/src/client/clfunc_xgeru.hpp
index 8c7d02c9..dbcecc9e 100644
--- a/src/client/clfunc_xgeru.hpp
+++ b/src/client/clfunc_xgeru.hpp
@@ -94,7 +94,12 @@ class xGeru : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
-
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		// Cleanup hook for use between benchmark iterations: client.cpp runs the test several times and averages the time,
+		// so buffers need to be released before the destructor is eventually reached.
+		// TODO: not implemented yet - this stub currently releases nothing.
+	}
 protected:
 protected:
   void initialize_scalars(double alpha, double beta)
diff --git a/src/client/clfunc_xhemm.hpp b/src/client/clfunc_xhemm.hpp
index 8e46d1e3..9f4047e2 100644
--- a/src/client/clfunc_xhemm.hpp
+++ b/src/client/clfunc_xhemm.hpp
@@ -45,7 +45,7 @@
 template <typename T>
 struct xHemmBuffer
 {
-	clblasOrder order;
+  clblasOrder order;
   clblasSide side;
   clblasUplo uplo;
   size_t M;
@@ -78,22 +78,30 @@ class xHemm : public clblasFunc
 
   ~xHemm()
   {
-    delete buffer.cpuA;
-    delete buffer.cpuB;
-    delete buffer.cpuC;
-    OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
-    OPENCL_V_THROW( clReleaseMemObject(buffer.B), "releasing buffer B");
-    OPENCL_V_THROW( clReleaseMemObject(buffer.C), "releasing buffer C");
   }
 
   double gflops()
   {
-    return (buffer.N*(buffer.N+1))/time_in_ns();
+	  if (buffer.side == clblasLeft)
+	  {
+		return (8*buffer.M*buffer.M*buffer.N)/time_in_ns();
+	  }
+	  else
+	  {
+		return (8*buffer.N*buffer.N*buffer.M)/time_in_ns();
+	  }
   }
 
   std::string gflops_formula()
   {
-    return "M*(M+1)/time";
+	  if (buffer.side == clblasLeft)
+	  {
+		  return "8*M*M*N/time";
+	  }
+	  else
+	  {
+		  return "8*N*N*M/time";
+	  }
   }
 
   void setup_buffer(int order_option, int side_option, int
@@ -106,20 +114,137 @@ class xHemm : public clblasFunc
   void initialize_gpu_buffer();
   void reset_gpu_write_buffer();
   void call_func();
-  	void read_gpu_buffer()
+  void read_gpu_buffer()
 	{
-		//cl_int err;
-		//to-do need to fill up
+		cl_int err;
+		err = clEnqueueReadBuffer(queue_, buffer.C, CL_TRUE,
+			                    buffer.offc * sizeof(T),
+								buffer.ldc*buffer.N*sizeof(T),
+								buffer.cpuC,0,NULL,NULL);
 	}
-	void roundtrip_func()
-	{//to-do need to fill up
+  void roundtrip_func()
+	{
+		std::cout << "xHemm::roundtrip_func" <<std::endl;
 	}
-	void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+  void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
                       int diag_option, int transA_option, int  transB_option,
                       size_t M, size_t N, size_t K, size_t lda, size_t ldb,
-                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      size_t ldc, size_t offA, size_t offB, size_t offC,
                       double alpha, double beta)
-		{}
+	{
+		  initialize_scalars(alpha, beta);
+		  buffer.offa = offA;
+		  buffer.offb = offB;
+		  buffer.offc = offC;
+		  buffer.M = M;
+		  buffer.N = N;
+		  if (order_option == 0)
+		  {
+			buffer.order = clblasRowMajor;
+		  }
+		  else
+		  {
+			buffer.order = clblasColumnMajor;
+		  }
+		  if (uplo_option == 0)
+		  {
+			buffer.uplo = clblasUpper;
+		  }
+		  else
+		  {
+			buffer.uplo = clblasLower;
+		  }
+		  if (side_option == 0)
+		  {
+			  buffer.side = clblasLeft;
+			  buffer.a_num_vectors = M;
+			  if (lda == 0)
+			  {
+				buffer.lda = buffer.M;
+			  }
+			  else if (lda < buffer.M)
+			  {
+				std::cerr << "lda:wrong size\n";
+				exit(1);
+			  }
+			  else
+			  {
+				buffer.lda = lda;
+			  }
+		  }
+		  else
+		  {
+			  buffer.side = clblasRight;
+			  buffer.a_num_vectors = N;
+			  if (lda == 0)
+			  {
+				buffer.lda = buffer.N;
+			  }
+			  else if (lda < buffer.N)
+			  {
+				std::cerr << "lda:wrong size\n";
+				exit(1);
+			  }
+			  else
+			  {
+				buffer.lda = lda;
+			  }
+		  }
+		  /*}
+		  if (lda == 0)
+		  {
+			buffer.lda = buffer.M;
+		  }
+		  else if (lda < buffer.M)
+		  {
+			std::cerr << "lda:wrong size\n";
+			exit(1);
+		  }
+		  else
+		  {
+			buffer.lda = lda;
+		  }*/
+		  if (ldb == 0)
+		  {
+			buffer.ldb = buffer.M;
+		  }
+		  else if (ldb < buffer.M)
+		  {
+			std::cerr << "ldb:wrong size\n";
+			exit(1);
+		  }
+		  else
+		  {
+			buffer.ldb = ldb;
+		  }
+		  if (ldc == 0)
+		  {
+			buffer.ldc = buffer.M;
+		  }
+		  else if (ldc < buffer.M)
+		  {
+			std::cerr << "ldc:wrong size\n";
+			exit(1);
+		  }
+		  else
+		  {
+			buffer.ldc = ldc;
+		  }
+		  buffer.cpuB = new T[buffer.N * buffer.ldb];
+		  buffer.cpuC = new T[buffer.N * buffer.ldc];
+		  buffer.cpuA = new T[buffer.a_num_vectors * buffer.lda];
+	}
+  void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		// Per-iteration cleanup: client.cpp runs the test several times and averages the time, so resources are
+		// released here rather than in the destructor. The host arrays come from new T[], hence delete[].
+		delete[] buffer.cpuA;
+		delete[] buffer.cpuB;
+		delete[] buffer.cpuC;
+		OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
+		OPENCL_V_THROW( clReleaseMemObject(buffer.B), "releasing buffer B");
+		OPENCL_V_THROW( clReleaseMemObject(buffer.C), "releasing buffer C");
+	}
 
 protected:
 protected:
@@ -247,7 +372,7 @@ void xHemm<T>::setup_buffer(int order_option, int side_option, int
                                 buffer.a_num_vectors * buffer.lda*sizeof(T),
                                 NULL, &err);
 
-  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                     buffer.N*buffer.ldb*sizeof(T),
                                     NULL, &err);
   buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
@@ -350,10 +475,12 @@ void xHemm<T>::initialize_gpu_buffer()
                               buffer.a_num_vectors * buffer.lda*sizeof(T),
                               buffer.cpuA, 0, NULL, NULL);
 
-  err = clEnqueueWriteBuffer(queue_, buffer.B, CL_TRUE, 0,
+  err = clEnqueueWriteBuffer(queue_, buffer.B, CL_TRUE,
+	                          buffer.offb * sizeof(T),
                               buffer.ldb*buffer.N*sizeof(T),
                               buffer.cpuB, 0, NULL, NULL);
-  err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE, 0,
+  err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE,
+							  buffer.offc * sizeof(T),
                               buffer.ldc*buffer.N*sizeof(T),
                               buffer.cpuC, 0, NULL, NULL);
 }
@@ -379,6 +506,50 @@ void xHemm<cl_float2>::call_func()
   timer.Stop(timer_id);
 }
 
+template <>
+void xHemm<cl_float2>::roundtrip_func()
+{
+	timer.Start(timer_id);
+	cl_int err;
+	// Timed round trip (create, upload, CHEMM, read back). Each buffer includes room for its element offset, which the transfers and the BLAS call below start at.
+	buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                (buffer.a_num_vectors * buffer.lda + buffer.offa)*sizeof(cl_float2),
+                                NULL, &err);
+
+    buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                    (buffer.N*buffer.ldb + buffer.offb)*sizeof(cl_float2),
+                                    NULL, &err);
+    buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                    (buffer.N*buffer.ldc + buffer.offc)*sizeof(cl_float2),
+                                    NULL, &err);
+	//write gpu buffer (blocking writes of the host arrays)
+	err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+                              buffer.offa * sizeof(cl_float2),
+                              buffer.a_num_vectors * buffer.lda*sizeof(cl_float2),
+                              buffer.cpuA, 0, NULL, NULL);
+
+    err = clEnqueueWriteBuffer(queue_, buffer.B, CL_TRUE,
+	                          buffer.offb * sizeof(cl_float2),
+                              buffer.ldb*buffer.N*sizeof(cl_float2),
+                              buffer.cpuB, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE,
+							  buffer.offc * sizeof(cl_float2),
+                              buffer.ldc*buffer.N*sizeof(cl_float2),
+                              buffer.cpuC, 0, NULL, NULL);
+
+	clblasChemm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
+      buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
+      buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, 1, &queue_,
+      0, NULL,NULL);
+	//read gpu buffer (blocking read of C into the host array; event_ is waited on before the timer stops)
+	err = clEnqueueReadBuffer(queue_, buffer.C, CL_TRUE, 
+							  buffer.offc * sizeof(cl_float2),
+                              buffer.ldc*buffer.N*sizeof(cl_float2),
+                              buffer.cpuC, 0, NULL, &event_);
+	clWaitForEvents(1, &event_);
+	timer.Stop(timer_id);
+
+}
 template <>
 void xHemm<cl_double2>::call_func()
 {
@@ -390,5 +561,48 @@ void xHemm<cl_double2>::call_func()
   clWaitForEvents(1, &event_);
   timer.Stop(timer_id);
 }
+template <>
+void xHemm<cl_double2>::roundtrip_func()
+{
+	timer.Start(timer_id);
+	cl_int err;
+	// Timed round trip (create, upload, ZHEMM, read back). Each buffer includes room for its element offset, which the transfers and the BLAS call below start at.
+	buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                (buffer.a_num_vectors * buffer.lda + buffer.offa)*sizeof(cl_double2),
+                                NULL, &err);
 
+    buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                    (buffer.N*buffer.ldb + buffer.offb)*sizeof(cl_double2),
+                                    NULL, &err);
+    buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                    (buffer.N*buffer.ldc + buffer.offc)*sizeof(cl_double2),
+                                    NULL, &err);
+	//write gpu buffer (blocking writes of the host arrays)
+	err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+                              buffer.offa * sizeof(cl_double2),
+                              buffer.a_num_vectors * buffer.lda*sizeof(cl_double2),
+                              buffer.cpuA, 0, NULL, NULL);
+
+    err = clEnqueueWriteBuffer(queue_, buffer.B, CL_TRUE,
+	                          buffer.offb * sizeof(cl_double2),
+                              buffer.ldb*buffer.N*sizeof(cl_double2),
+                              buffer.cpuB, 0, NULL, NULL);
+    err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE,
+							  buffer.offc * sizeof(cl_double2),
+                              buffer.ldc*buffer.N*sizeof(cl_double2),
+                              buffer.cpuC, 0, NULL, NULL);
+
+	clblasZhemm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
+      buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
+      buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, 1, &queue_,
+      0, NULL,NULL);
+	//read gpu buffer (blocking read of C into the host array; event_ is waited on before the timer stops)
+	err = clEnqueueReadBuffer(queue_, buffer.C, CL_TRUE, 
+							  buffer.offc * sizeof(cl_double2),
+                              buffer.ldc*buffer.N*sizeof(cl_double2),
+                              buffer.cpuC, 0, NULL, &event_);
+	clWaitForEvents(1, &event_);
+	timer.Stop(timer_id);
+
+}
 #endif // ifndef CLBLAS_BENCHMARK_XSYR_HXX__
\ No newline at end of file
diff --git a/src/client/clfunc_xhemv.hpp b/src/client/clfunc_xhemv.hpp
index 570c3fce..6211114c 100644
--- a/src/client/clfunc_xhemv.hpp
+++ b/src/client/clfunc_xhemv.hpp
@@ -95,7 +95,12 @@ class xHemv : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
-
+  	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		// Cleanup hook for use between benchmark iterations: client.cpp runs the test several times and averages the time,
+		// so buffers need to be released before the destructor is eventually reached.
+		// TODO: not implemented yet - this stub currently releases nothing.
+	}
 protected:
 protected:
   void initialize_scalars(double alpha, double beta)
diff --git a/src/client/clfunc_xher.hpp b/src/client/clfunc_xher.hpp
index e624b558..5144b22b 100644
--- a/src/client/clfunc_xher.hpp
+++ b/src/client/clfunc_xher.hpp
@@ -90,7 +90,12 @@ class xHer : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
-
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		// Cleanup hook for use between benchmark iterations: client.cpp runs the test several times and averages the time,
+		// so buffers need to be released before the destructor is eventually reached.
+		// TODO: not implemented yet - this stub currently releases nothing.
+	}
 protected:
 protected:
   void initialize_scalars(double alpha, double beta)
diff --git a/src/client/clfunc_xher2.hpp b/src/client/clfunc_xher2.hpp
index 27d95f34..aec7cc83 100644
--- a/src/client/clfunc_xher2.hpp
+++ b/src/client/clfunc_xher2.hpp
@@ -94,6 +94,12 @@ class xHer2 : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		// Cleanup hook for use between benchmark iterations: client.cpp runs the test several times and averages the time,
+		// so buffers need to be released before the destructor is eventually reached.
+		// TODO: not implemented yet - this stub currently releases nothing.
+	}
 protected:
 protected:
   void initialize_scalars(double alpha, double beta)
diff --git a/src/client/clfunc_xher2k.hpp b/src/client/clfunc_xher2k.hpp
new file mode 100644
index 00000000..15095fa8
--- /dev/null
+++ b/src/client/clfunc_xher2k.hpp
@@ -0,0 +1,676 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XHER2K_HXX__
+#define CLBLAS_BENCHMARK_XHER2K_HXX__
+
+#include "clfunc_common.hpp"
+
+template <typename T>
+struct xHer2kBuffer
+{
+    clblasOrder order_;       // not assigned in the visible xHer2k code, which sets an unqualified order_ instead
+    clblasUplo uplo_;         // clblasUpper when uplo_option == 0, otherwise clblasLower
+    clblasTranspose transA_;  // chosen from transA_option (0: NoTrans, 1: Trans, 2: ConjTrans)
+    size_t N_;                // N argument passed to the her2k call
+    size_t K_;                // K argument passed to the her2k call
+    T alpha_;                 // built with makeScalar<T>(alpha)
+	cl_mem A_;                // device buffer for A
+    size_t offa_;             // offset into A in elements (multiplied by sizeof(T) for transfers)
+    size_t lda_;              // leading dimension of A
+	cl_mem B_;                // device buffer for B
+	size_t offb_;             // offset into B in elements
+	size_t ldb_;              // leading dimension of B
+    T beta_;                  // built with makeScalar<T>(beta); only s[0] is passed to the her2k call
+    cl_mem C_;                // device buffer for C
+    size_t offc_;             // offset into C in elements (multiplied by sizeof(T) for transfers)
+    size_t ldc_;              // leading dimension of C
+	size_t a_num_vectors_;    // A holds lda_ * a_num_vectors_ elements
+	size_t b_num_vectors_;    // B holds ldb_ * b_num_vectors_ elements
+    size_t c_num_vectors_;    // C holds ldc_ * c_num_vectors_ elements (set to N)
+	T* cpuA_;                 // host copy of A, allocated with new T[]
+	T* cpuB_;                 // host copy of B, allocated with new T[]
+	T* cpuC_;                 // host copy of C, allocated with new T[]
+}; // struct buffer
+
+template <typename T>
+class xHer2k : public clblasFunc
+{
+public:
+  xHer2k(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer,  devType)
+  {
+    timer.getUniqueID("clHer2k", 0);
+  }
+
+  ~xHer2k()
+  {
+  }
+
+  double gflops()
+  {
+    return static_cast<double>(8*(buffer_.K_ * buffer_.N_ * buffer_.N_)/time_in_ns()+2*buffer_.N_/time_in_ns());
+  }
+
+  std::string gflops_formula()
+  {
+    return "(8*K*N*N+2*N)/time";
+  }
+
+  void setup_buffer(int order_option, int side_option, int
+                    uplo_option, int diag_option, int transA_option, int
+                    transB_option, size_t M, size_t N, size_t K,
+                    size_t lda, size_t ldb, size_t ldc,size_t offA,
+					          size_t offB, size_t offC, double alpha,
+                    double beta)
+  {
+        DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+        // Records the HER2K problem in buffer_, then allocates the host arrays and the device buffers.
+		initialize_scalars(alpha,beta);
+        // A leading dimension of 0 selects the minimum legal value; one that is too small aborts with exit(1).
+		buffer_.N_ = N;
+		buffer_.K_ = K;
+		buffer_.offa_ = offA;
+		buffer_.offb_ = offB;
+		buffer_.offc_ = offC;
+
+		if (uplo_option == 0)
+        {
+            buffer_.uplo_ = clblasUpper;
+        }
+        else
+        {
+            buffer_.uplo_ = clblasLower;
+        }
+		// ldc must be at least N; this branch used to print the error and carry on with ldc_ unset.
+		if (ldc == 0)
+        {
+            buffer_.ldc_ = N;
+        }
+        else if (ldc < N)
+        {
+            std::cerr << "ldc:wrong size\n";
+            exit(1);
+        }
+        else
+        {
+            buffer_.ldc_ = ldc;
+        }
+		buffer_.c_num_vectors_ = N;
+
+		if (order_option == 0)
+        {
+            order_ = clblasRowMajor;
+            if (transA_option == 0)
+            {
+                buffer_.transA_ = clblasNoTrans;
+                buffer_.a_num_vectors_ = N;
+				buffer_.b_num_vectors_ = N;
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = K;
+                }
+                else if (ldb < K)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = K;
+				buffer_.b_num_vectors_ = K;
+                if (transA_option == 1)
+                {
+                    buffer_.transA_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.transA_ = clblasConjTrans;
+                }
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = N;
+                }
+                else if (ldb < N)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+        }
+        else
+        {
+            order_ = clblasColumnMajor;
+            if (transA_option == 0)
+            {
+                buffer_.a_num_vectors_ = K;
+                buffer_.b_num_vectors_ = K;
+                buffer_.transA_ = clblasNoTrans;
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = N;
+                }
+                else if (ldb < N)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = N;
+                buffer_.b_num_vectors_ = N;
+                if (transA_option == 1)
+                {
+                    buffer_.transA_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.transA_ = clblasConjTrans;
+                }
+
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = K;
+                }
+                else if (ldb < K)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+        }
+
+        buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+		buffer_.cpuB_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
+        buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+
+        cl_int err;
+        buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offa_) * sizeof(T),
+                                        NULL, &err);
+
+	    buffer_.B_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offb_) * sizeof(T),
+                                        NULL, &err);
+
+        buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offc_) * sizeof(T),
+                                        NULL, &err);
+  }
+  void initialize_cpu_buffer()
+  {
+	  // Fills the host arrays A, B and C with random<T>() values scaled by randomScale<T>(), after srand(10).
+	  // B is filled too: it is an input of the her2k call and was previously left uninitialized.
+	  srand(10);
+	  for (size_t i = 0; i < buffer_.a_num_vectors_; ++i)
+		  for (size_t j = 0; j < buffer_.lda_; ++j)
+			  buffer_.cpuA_[i*buffer_.lda_+j] = random<T>(UPPER_BOUND<T>()) /
+                                               randomScale<T>();
+
+	  for (size_t i = 0; i < buffer_.b_num_vectors_; ++i)
+		  for (size_t j = 0; j < buffer_.ldb_; ++j)
+			  buffer_.cpuB_[i*buffer_.ldb_+j] = random<T>(UPPER_BOUND<T>()) /
+                                               randomScale<T>();
+
+	  for (size_t i = 0; i < buffer_.N_; ++i)
+		  for (size_t j = 0; j < buffer_.ldc_; ++j)
+			  buffer_.cpuC_[i*buffer_.ldc_+j] = random<T>(UPPER_BOUND<T>()) /
+                                               randomScale<T>();
+  }
+  void initialize_gpu_buffer()
+  {
+	    // Uploads host A, B and C to their device buffers with blocking writes, each at its own
+	    // element offset. C used to be written at offa_, and B was never uploaded at all.
+	    cl_int err;
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.A_, CL_TRUE, buffer_.offa_ * sizeof(T),
+                                   buffer_.lda_ * buffer_.a_num_vectors_ * sizeof(T),
+                                   buffer_.cpuA_, 0, NULL, NULL);
+        err = clEnqueueWriteBuffer(queue_, buffer_.B_, CL_TRUE, buffer_.offb_ * sizeof(T),
+                                   buffer_.ldb_ * buffer_.b_num_vectors_ * sizeof(T),
+                                   buffer_.cpuB_, 0, NULL, NULL);
+        err = clEnqueueWriteBuffer(queue_, buffer_.C_, CL_TRUE, buffer_.offc_ * sizeof(T),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T),
+                                   buffer_.cpuC_, 0, NULL, NULL);
+  }
+  void reset_gpu_write_buffer()
+  {
+	    cl_int err;
+        // Re-upload host C into device buffer C (blocking write starting at offc_ elements); err is not checked.
+        err = clEnqueueWriteBuffer(queue_, buffer_.C_, CL_TRUE,
+                                   buffer_.offc_ * sizeof(T),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.cpuC_, 0, NULL, NULL);
+  }
+  void call_func();
+  void read_gpu_buffer()
+	{
+		cl_int err; // status of the blocking read below; not checked
+		err = clEnqueueReadBuffer(queue_, buffer_.C_, CL_TRUE, // copies device C (from offc_ elements) into cpuC_
+								  buffer_.offc_*sizeof(T), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(T),
+								  buffer_.cpuC_, 0, NULL, NULL);
+	}
+	void roundtrip_func();
+	void zerocopy_roundtrip_func()
+	{
+		std::cout << "xHer2k::zerocopy_roundtrip_func\n"; // placeholder: only prints its own name (previously said xTrmm)
+	}
+	void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+	{
+        DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+        // Records the HER2K problem in buffer_ and allocates only the host arrays; no device buffers are created here.
+		initialize_scalars(alpha,beta);
+        // A leading dimension of 0 selects the minimum legal value; one that is too small aborts with exit(1).
+		buffer_.N_ = N;
+		buffer_.K_ = K;
+		buffer_.offa_ = offA;
+		buffer_.offb_ = offBX;
+		buffer_.offc_ = offCY;
+
+		if (uplo_option == 0)
+        {
+            buffer_.uplo_ = clblasUpper;
+        }
+        else
+        {
+            buffer_.uplo_ = clblasLower;
+        }
+		// ldc must be at least N; this branch used to print the error and carry on with ldc_ unset.
+		if (ldc == 0)
+        {
+            buffer_.ldc_ = N;
+        }
+        else if (ldc < N)
+        {
+            std::cerr << "ldc:wrong size\n";
+            exit(1);
+        }
+        else
+        {
+            buffer_.ldc_ = ldc;
+        }
+		buffer_.c_num_vectors_ = N;
+
+		if (order_option == 0)
+        {
+            order_ = clblasRowMajor;
+            if (transA_option == 0)
+            {
+                buffer_.transA_ = clblasNoTrans;
+                buffer_.a_num_vectors_ = N;
+				buffer_.b_num_vectors_ = N;
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = K;
+                }
+                else if (ldb < K)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = K;
+				buffer_.b_num_vectors_ = K;
+                if (transA_option == 1)
+                {
+                    buffer_.transA_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.transA_ = clblasConjTrans;
+                }
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = N;
+                }
+                else if (ldb < N)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+        }
+        else
+        {
+            order_ = clblasColumnMajor;
+            if (transA_option == 0)
+            {
+                buffer_.a_num_vectors_ = K;
+                buffer_.b_num_vectors_ = K;
+                buffer_.transA_ = clblasNoTrans;
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = N;
+                }
+                else if (ldb < N)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = N;
+                buffer_.b_num_vectors_ = N;
+                if (transA_option == 1)
+                {
+                    buffer_.transA_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.transA_ = clblasConjTrans;
+                }
+
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = K;
+                }
+                else if (ldb < K)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+        }
+
+        buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+		buffer_.cpuB_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
+        buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+	}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		// Per-iteration cleanup: client.cpp runs the test several times and averages the time, so resources are
+		// released here rather than in the destructor. The host arrays come from new T[], hence delete[].
+		delete[] buffer_.cpuA_;
+		delete[] buffer_.cpuB_;
+		delete[] buffer_.cpuC_;
+		OPENCL_V_THROW( clReleaseMemObject(buffer_.A_), "releasing buffer A");
+		OPENCL_V_THROW( clReleaseMemObject(buffer_.B_), "releasing buffer B");
+		OPENCL_V_THROW( clReleaseMemObject(buffer_.C_), "releasing buffer C");
+	}
+protected:
+protected:
+  void initialize_scalars(double alpha, double beta)
+  {
+      buffer_.alpha_ = makeScalar<T>(alpha);
+      buffer_.beta_ = makeScalar<T>(beta);
+  }
+
+private:
+  xHer2kBuffer<T> buffer_;
+};
+
+template<>
+void 
+xHer2k<cl_float2>::call_func()
+{
+	timer.Start(timer_id); // timed region: enqueue clblasCher2k and wait for its event
+	clblasCher2k(order_, buffer_.uplo_, buffer_.transA_,
+				buffer_.N_, buffer_.K_, buffer_.alpha_,
+				buffer_.A_, buffer_.offa_, buffer_.lda_, 
+				buffer_.B_, buffer_.offb_, buffer_.ldb_,
+				buffer_.beta_.s[0], buffer_.C_, buffer_.offc_, // beta is passed as its first component only
+				buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+template<>
+void 
+xHer2k<cl_float2>::roundtrip_func()
+{
+		timer.Start(timer_id); // timed region: buffer creation, upload, CHER2K and read-back of C
+        cl_int err;
+        buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offa_) * sizeof(cl_float2),
+                                        NULL, &err);
+	    buffer_.B_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offb_) * sizeof(cl_float2),
+                                        NULL, &err);
+        buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offc_) * sizeof(cl_float2),
+                                        NULL, &err);
+		this->initialize_gpu_buffer();
+		clblasCher2k(order_, buffer_.uplo_, buffer_.transA_,
+				buffer_.N_, buffer_.K_, buffer_.alpha_,
+				buffer_.A_, buffer_.offa_, buffer_.lda_, 
+				buffer_.B_, buffer_.offb_, buffer_.ldb_,
+				buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+				buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+		// Read the result back into host C. This used to be a write, which overwrote the device result with host C.
+		err = clEnqueueReadBuffer(queue_, buffer_.C_, CL_TRUE,
+                                   buffer_.offc_ * sizeof(cl_float2),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(cl_float2),
+                                   buffer_.cpuC_, 0, NULL, &event_);
+		clWaitForEvents(1, &event_);
+		timer.Stop(timer_id);
+}
+
+template<>
+void 
+xHer2k<cl_double2>::call_func()
+{
+	timer.Start(timer_id);
+	// Timed region: enqueue clblasZher2k and wait for its event; beta is passed as its first component only.
+	clblasZher2k(order_, buffer_.uplo_, buffer_.transA_,
+				buffer_.N_, buffer_.K_, buffer_.alpha_,
+				buffer_.A_, buffer_.offa_, buffer_.lda_, 
+				buffer_.B_, buffer_.offb_, buffer_.ldb_,
+				buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+				buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
+
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+template<>
+void
+xHer2k<cl_double2>::roundtrip_func()
+{
+		timer.Start(timer_id); // timed region: buffer creation, upload, ZHER2K and read-back of C
+        cl_int err;
+        buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offa_) * sizeof(cl_double2),
+                                        NULL, &err);
+	    buffer_.B_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offb_) * sizeof(cl_double2),
+                                        NULL, &err);
+        buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offc_) * sizeof(cl_double2),
+                                        NULL, &err);
+		this->initialize_gpu_buffer();
+
+	   clblasZher2k(order_, buffer_.uplo_, buffer_.transA_,
+				buffer_.N_, buffer_.K_, buffer_.alpha_,
+				buffer_.A_, buffer_.offa_, buffer_.lda_, 
+				buffer_.B_, buffer_.offb_, buffer_.ldb_,
+				buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+				buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+		// Read the result back into host C. This used to be a write, which overwrote the device result with host C.
+		err = clEnqueueReadBuffer(queue_, buffer_.C_, CL_TRUE,
+                                   buffer_.offc_ * sizeof(cl_double2),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(cl_double2),
+                                   buffer_.cpuC_, 0, NULL, &event_);
+		clWaitForEvents(1, &event_);
+		timer.Stop(timer_id);
+}
+#endif // ifndef CLBLAS_BENCHMARK_XHER2K_HXX__
\ No newline at end of file
diff --git a/src/client/clfunc_xherk.hpp b/src/client/clfunc_xherk.hpp
new file mode 100644
index 00000000..74871a39
--- /dev/null
+++ b/src/client/clfunc_xherk.hpp
@@ -0,0 +1,535 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XHERK_HXX__
+#define CLBLAS_BENCHMARK_XHERK_HXX__
+
+#include "clfunc_common.hpp"
+
+template <typename T>
+struct xHerkBuffer  // parameters plus host/device storage for one xHerk benchmark run
+{
+    clblasOrder order_;      // not referenced by xHerk in this file; the class uses its own order_
+    clblasUplo uplo_;        // clblasUpper or clblasLower, chosen from uplo_option
+    clblasTranspose transA_; // transpose mode of A, chosen from transA_option
+    size_t N_;               // N argument passed to clblas{C,Z}herk
+    size_t K_;               // K argument passed to clblas{C,Z}herk
+    T alpha_;                // built by makeScalar<T>; callers pass alpha_.s[0]
+	cl_mem A_;               // device buffer holding A
+    size_t offa_;            // element offset of A inside A_
+    size_t lda_;             // leading dimension of A, in elements
+    T beta_;                 // built by makeScalar<T>; callers pass beta_.s[0]
+    cl_mem C_;               // device buffer holding C
+    size_t offc_;            // element offset of C inside C_
+    size_t ldc_;             // leading dimension of C, in elements
+	size_t a_num_vectors_;   // number of lda_-long runs stored for A
+    size_t c_num_vectors_;   // number of ldc_-long runs stored for C (set to N)
+	T* cpuA_;                // host array of lda_ * a_num_vectors_ elements
+	T* cpuC_;                // host array of ldc_ * c_num_vectors_ elements
+}; // struct buffer
+
+template <typename T>
+class xHerk : public clblasFunc
+{
+public:
+  xHerk(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer,  devType)
+  {  // asks the timer for a "clHerk" ID; the returned value is not stored here
+    timer.getUniqueID("clHerk", 0);  // NOTE(review): timer_id is presumably set by clblasFunc -- confirm
+  }
+
+  ~xHerk()  // empty: host/device buffers are released by releaseGPUBuffer_deleteCPUBuffer()
+  {
+  }
+
+  double gflops()  // 4*K*N*(N+1) divided by time_in_ns(), using the stored K_ and N_
+  {
+    return static_cast<double>(4*(buffer_.K_ * buffer_.N_ * (buffer_.N_+1))/time_in_ns());
+  }
+
+  std::string gflops_formula()  // textual form of the expression evaluated in gflops()
+  {
+    return "4*K*N*(N+1)/time";
+  }
+
+  // Stores the HERK problem description, allocates host arrays and creates device buffers.
+  void setup_buffer(int order_option, int side_option, int uplo_option,
+                    int diag_option, int transA_option, int transB_option,
+                    size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                    size_t ldc, size_t offA, size_t offB, size_t offC,
+                    double alpha, double beta)
+  {
+        DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+        DUMMY_ARGS_USAGE_2(ldb, offB);
+
+        initialize_scalars(alpha,beta);
+
+        buffer_.N_ = N;
+        buffer_.K_ = K;
+        buffer_.offa_ = offA;
+        buffer_.offc_ = offC;
+
+        if (uplo_option == 0)
+        {
+            buffer_.uplo_ = clblasUpper;
+        }
+        else
+        {
+            buffer_.uplo_ = clblasLower;
+        }
+        // ldc == 0 selects the default N; any other value must be at least N.
+        if (ldc == 0)
+        {
+            buffer_.ldc_ = N;
+        }
+        else if (ldc < N)
+        {
+            std::cerr << "ldc:wrong size\n";
+            exit(1);  // same policy as the lda checks; otherwise ldc_ stays uninitialized
+        }
+        else
+        {
+            buffer_.ldc_ = ldc;
+        }
+        buffer_.c_num_vectors_ = N;
+        // A is stored as a_num_vectors_ runs of lda_ elements; both depend on order and trans.
+        if (order_option == 0)
+        {
+            order_ = clblasRowMajor;
+            if (transA_option == 0)
+            {
+                buffer_.transA_ = clblasNoTrans;
+                buffer_.a_num_vectors_ = N;
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = K;
+                if (transA_option == 1)
+                {
+                    buffer_.transA_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.transA_ = clblasConjTrans;
+                }
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+        }
+        else
+        {
+            order_ = clblasColumnMajor;
+            if (transA_option == 0)
+            {
+                buffer_.a_num_vectors_ = K;
+                buffer_.transA_ = clblasNoTrans;
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = N;
+                if (transA_option == 1)
+                {
+                    buffer_.transA_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.transA_ = clblasConjTrans;
+                }
+
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+        }
+
+        buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+        buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+        // Device buffers are sized to hold the matrix plus its leading element offset.
+        cl_int err;
+        buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offa_) * sizeof(T),
+                                        NULL, &err);
+
+        buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offc_) * sizeof(T),
+                                        NULL, &err);
+  }
+  void initialize_cpu_buffer()
+  {
+    // Seed rand() with a constant before filling the host copies of A and C
+    // (random<T> presumably draws from rand() -- confirm in clfunc_common.hpp).
+    srand(10);
+
+    // A: every element of the lda_ * a_num_vectors_ storage, in index order.
+    const size_t a_count = buffer_.a_num_vectors_ * buffer_.lda_;
+    for (size_t idx = 0; idx < a_count; ++idx)
+    {
+      buffer_.cpuA_[idx] = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+    }
+
+    // C: N_ runs of ldc_ elements each, again in index order.
+    const size_t c_count = buffer_.N_ * buffer_.ldc_;
+    for (size_t idx = 0; idx < c_count; ++idx)
+    {
+      buffer_.cpuC_[idx] = random<T>(UPPER_BOUND<T>()) / randomScale<T>();
+    }
+  }
+  void initialize_gpu_buffer()
+  {
+        // Blocking uploads of host A and C, each placed at its own element offset.
+        cl_int err;
+        err = clEnqueueWriteBuffer(queue_, buffer_.A_, CL_TRUE,
+                                   buffer_.offa_ * sizeof(T),
+                                   buffer_.lda_ * buffer_.a_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.cpuA_, 0, NULL, NULL);
+        // C goes to offc_ (not offa_): C_ is sized, reset and read back with offc_.
+        err = clEnqueueWriteBuffer(queue_, buffer_.C_, CL_TRUE,
+                                   buffer_.offc_ * sizeof(T),
+                                   buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.cpuC_, 0, NULL, NULL);
+  }
+  void reset_gpu_write_buffer()
+  {
+        // Blocking re-upload of the host copy of C into the device buffer,
+        // starting at element offset offc_.
+        const size_t c_offset_bytes = buffer_.offc_ * sizeof(T);
+        const size_t c_size_bytes = buffer_.ldc_ * buffer_.c_num_vectors_ * sizeof(T);
+        cl_int err = clEnqueueWriteBuffer(queue_, buffer_.C_, CL_TRUE,
+                                          c_offset_bytes, c_size_bytes,
+                                          buffer_.cpuC_, 0, NULL, NULL);
+  }
+  void call_func();
+  void read_gpu_buffer()  // blocking download of device C (from offc_) into cpuC_
+  {
+    const size_t c_offset_bytes = buffer_.offc_*sizeof(T);
+    const size_t c_size_bytes = buffer_.ldc_*buffer_.c_num_vectors_*sizeof(T);
+    cl_int err = clEnqueueReadBuffer(queue_, buffer_.C_, CL_TRUE, c_offset_bytes,
+                                     c_size_bytes, buffer_.cpuC_, 0, NULL, NULL);
+  }
+	void roundtrip_func();
+	void zerocopy_roundtrip_func()
+	{	// Placeholder: no zero-copy path here; it only reports that it was called.
+		std::cout << "xHerk::zerocopy_roundtrip_func\n";
+	}
+	void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+                      int diag_option, int transA_option, int  transB_option,
+                      size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+                      size_t ldc, size_t offA, size_t offBX, size_t offCY,
+                      double alpha, double beta)
+	{
+        // Like setup_buffer(), but only allocates host arrays; no device buffers are created.
+        DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+        DUMMY_ARGS_USAGE_2(ldb, offBX);
+        initialize_scalars(alpha,beta);
+
+        buffer_.N_ = N;
+        buffer_.K_ = K;
+        buffer_.offa_ = offA;
+        buffer_.offc_ = offCY;
+
+        if (uplo_option == 0)
+        {
+            buffer_.uplo_ = clblasUpper;
+        }
+        else
+        {
+            buffer_.uplo_ = clblasLower;
+        }
+        // ldc == 0 selects the default N; any other value must be at least N.
+        if (ldc == 0)
+        {
+            buffer_.ldc_ = N;
+        }
+        else if (ldc < N)
+        {
+            std::cerr << "ldc:wrong size\n";
+            exit(1);  // same policy as the lda checks; otherwise ldc_ stays uninitialized
+        }
+        else
+        {
+            buffer_.ldc_ = ldc;
+        }
+        buffer_.c_num_vectors_ = N;
+        // A is stored as a_num_vectors_ runs of lda_ elements; both depend on order and trans.
+        if (order_option == 0)
+        {
+            order_ = clblasRowMajor;
+            if (transA_option == 0)
+            {
+                buffer_.transA_ = clblasNoTrans;
+                buffer_.a_num_vectors_ = N;
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = K;
+                if (transA_option == 1)
+                {
+                    buffer_.transA_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.transA_ = clblasConjTrans;
+                }
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+        }
+        else
+        {
+            order_ = clblasColumnMajor;
+            if (transA_option == 0)
+            {
+                buffer_.a_num_vectors_ = K;
+                buffer_.transA_ = clblasNoTrans;
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = N;
+                if (transA_option == 1)
+                {
+                    buffer_.transA_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.transA_ = clblasConjTrans;
+                }
+
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+        }
+
+        buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+        buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+	}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		// Cleanup for one benchmark iteration (see client.cpp); runs before the destructor.
+		// cpuA_/cpuC_ are allocated with new T[], so they must be freed with delete[].
+		delete[] buffer_.cpuA_;
+		delete[] buffer_.cpuC_;
+		OPENCL_V_THROW( clReleaseMemObject(buffer_.A_), "releasing buffer A");
+		OPENCL_V_THROW( clReleaseMemObject(buffer_.C_), "releasing buffer C");
+	}
+protected:
+protected:
+  void initialize_scalars(double alpha, double beta)
+  {  // converts the double-valued inputs to T via makeScalar<T> and stores them in buffer_
+      buffer_.alpha_ = makeScalar<T>(alpha);
+      buffer_.beta_ = makeScalar<T>(beta);
+  }
+
+private:
+  xHerkBuffer<T> buffer_;
+};
+
+template<>
+void 
+xHerk<cl_float2>::call_func()
+{
+	timer.Start(timer_id);
+	// Timed region: one clblasCherk call; only the .s[0] part of alpha_/beta_ is passed.
+	clblasCherk(order_, buffer_.uplo_, buffer_.transA_,
+				buffer_.N_, buffer_.K_, buffer_.alpha_.s[0],
+				buffer_.A_, buffer_.offa_, buffer_.lda_, 
+				buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+				buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
+	// Wait on the event returned by the call before stopping the timer.
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+template<>
+void 
+xHerk<cl_float2>::roundtrip_func()
+{
+    // Timed round trip: create device buffers, upload inputs, run CHERK, download C.
+    timer.Start(timer_id);
+    cl_int err;
+    buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                    buffer_.offa_) * sizeof(cl_float2),
+                                NULL, &err);
+    buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                    buffer_.offc_) * sizeof(cl_float2),
+                                NULL, &err);
+    this->initialize_gpu_buffer();
+    // No event is requested from the BLAS call; only the .s[0] part of alpha_/beta_ is passed.
+    clblasCherk(order_, buffer_.uplo_, buffer_.transA_,
+                buffer_.N_, buffer_.K_, buffer_.alpha_.s[0],
+                buffer_.A_, buffer_.offa_, buffer_.lda_,
+                buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+                buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+    // Download the computed C into cpuC_ (a write here would overwrite the result).
+    err = clEnqueueReadBuffer(queue_, buffer_.C_, CL_TRUE,
+                              buffer_.offc_ * sizeof(cl_float2),
+                              buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                  sizeof(cl_float2),
+                              buffer_.cpuC_, 0, NULL, &event_);
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+template<>
+void 
+xHerk<cl_double2>::call_func()
+{
+	timer.Start(timer_id);
+	// Timed region: one clblasZherk call; only the .s[0] part of alpha_/beta_ is passed.
+	clblasZherk(order_, buffer_.uplo_, buffer_.transA_,
+				buffer_.N_, buffer_.K_, buffer_.alpha_.s[0],
+				buffer_.A_, buffer_.offa_, buffer_.lda_, 
+				buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+				buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
+	// Wait on the event returned by the call before stopping the timer.
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+
+template<>
+void
+xHerk<cl_double2>::roundtrip_func()
+{
+    // Timed round trip: create device buffers, upload inputs, run ZHERK, download C.
+    timer.Start(timer_id);
+    cl_int err;
+    buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                    buffer_.offa_) * sizeof(cl_double2),
+                                NULL, &err);
+    buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                    buffer_.offc_) * sizeof(cl_double2),
+                                NULL, &err);
+    this->initialize_gpu_buffer();
+    // No event is requested from the BLAS call; only the .s[0] part of alpha_/beta_ is passed.
+    clblasZherk(order_, buffer_.uplo_, buffer_.transA_,
+                buffer_.N_, buffer_.K_, buffer_.alpha_.s[0],
+                buffer_.A_, buffer_.offa_, buffer_.lda_,
+                buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+                buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+    // Download the computed C into cpuC_ (a write here would overwrite the result).
+    err = clEnqueueReadBuffer(queue_, buffer_.C_, CL_TRUE,
+                              buffer_.offc_ * sizeof(cl_double2),
+                              buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                  sizeof(cl_double2),
+                              buffer_.cpuC_, 0, NULL, &event_);
+    clWaitForEvents(1, &event_);
+    timer.Stop(timer_id);
+}
+#endif // ifndef CLBLAS_BENCHMARK_XHERK_HXX__
\ No newline at end of file
diff --git a/src/client/clfunc_xsymm.hpp b/src/client/clfunc_xsymm.hpp
index e9fe9818..a7558e92 100644
--- a/src/client/clfunc_xsymm.hpp
+++ b/src/client/clfunc_xsymm.hpp
@@ -58,12 +58,6 @@ class xSymm : public clblasFunc
 
   ~xSymm()
   {
-    delete buffer.cpuA;
-    delete buffer.cpuB;
-    delete buffer.cpuC;
-    OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
-    OPENCL_V_THROW( clReleaseMemObject(buffer.B), "releasing buffer B");
-    OPENCL_V_THROW( clReleaseMemObject(buffer.C), "releasing buffer C");
   }
 
   double gflops()
@@ -104,6 +98,10 @@ class xSymm : public clblasFunc
 	{
 				std::cout << "xSymm::roundtrip_func\n";
 	}
+	void zerocopy_roundtrip_func()
+	{	// Placeholder: performs no BLAS work; it only prints its own name.
+		std::cout << "xSymm::zerocopy_roundtrip_func\n";
+	}
   void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
                       int diag_option, int transA_option, int  transB_option,
                       size_t M, size_t N, size_t K, size_t lda, size_t ldb,
@@ -212,6 +210,17 @@ class xSymm : public clblasFunc
   buffer.cpuC = new T[buffer.N * buffer.ldc];
   buffer.cpuA = new T[buffer.a_num_vectors * buffer.lda];
   }
+  	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		// Cleanup for one benchmark iteration (see client.cpp); runs before the destructor.
+		// cpuA/cpuC come from new T[] (cpuB presumably likewise), hence delete[].
+		delete[] buffer.cpuA;
+		delete[] buffer.cpuB;
+		delete[] buffer.cpuC;
+		OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
+		OPENCL_V_THROW( clReleaseMemObject(buffer.B), "releasing buffer B");
+		OPENCL_V_THROW( clReleaseMemObject(buffer.C), "releasing buffer C");
+	}
 protected:
   void initialize_scalars(double alpha, double beta)
   {
@@ -337,7 +346,7 @@ void xSymm<T>::setup_buffer(int order_option, int side_option, int
                                 buffer.a_num_vectors * buffer.lda*sizeof(T),
                                 NULL, &err);
 
-  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                     buffer.N*buffer.ldb*sizeof(T),
                                     NULL, &err);
   buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
@@ -423,7 +432,7 @@ void xSymm<cl_float>::roundtrip_func()
                                 buffer.a_num_vectors * buffer.lda*sizeof(cl_float),
                                 NULL, &err);
 
-  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                     buffer.N*buffer.ldb*sizeof(cl_float),
                                     NULL, &err);
   buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
@@ -476,7 +485,7 @@ void xSymm<cl_double>::roundtrip_func()
                                 buffer.a_num_vectors * buffer.lda*sizeof(cl_double),
                                 NULL, &err);
 
-  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                     buffer.N*buffer.ldb*sizeof(cl_double),
                                     NULL, &err);
   buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
@@ -529,7 +538,7 @@ void xSymm<cl_float2>::roundtrip_func()
                                 buffer.a_num_vectors * buffer.lda*sizeof(cl_float2),
                                 NULL, &err);
 
-  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                     buffer.N*buffer.ldb*sizeof(cl_float2),
                                     NULL, &err);
   buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
@@ -577,12 +586,12 @@ void xSymm<cl_double2>::roundtrip_func()
 {
   timer.Start(timer_id);
   //set up buffer
-    cl_int err;
+  cl_int err;
   buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                 buffer.a_num_vectors * buffer.lda*sizeof(cl_double2),
                                 NULL, &err);
 
-  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+  buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
                                     buffer.N*buffer.ldb*sizeof(cl_double2),
                                     NULL, &err);
   buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
diff --git a/src/client/clfunc_xsymv.hpp b/src/client/clfunc_xsymv.hpp
index 625c7ec7..c9285410 100644
--- a/src/client/clfunc_xsymv.hpp
+++ b/src/client/clfunc_xsymv.hpp
@@ -209,6 +209,12 @@ class xSymv : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		// Per-iteration cleanup hook: client.cpp runs several iterations and averages the
+		// time, so buffers should be released here, before the destructor is reached.
+		// TODO: not implemented yet -- this currently releases nothing.
+	}
 protected:
     void initialize_scalars(double alpha, double beta)
     {
diff --git a/src/client/clfunc_xsyr.hpp b/src/client/clfunc_xsyr.hpp
index 172032c9..4c70e69c 100644
--- a/src/client/clfunc_xsyr.hpp
+++ b/src/client/clfunc_xsyr.hpp
@@ -90,6 +90,12 @@ class xSyr : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		// Per-iteration cleanup hook: client.cpp runs several iterations and averages the
+		// time, so buffers should be released here, before the destructor is reached.
+        // TODO: not implemented yet -- this currently releases nothing.
+	}
 
 protected:
 protected:
diff --git a/src/client/clfunc_xsyr2.hpp b/src/client/clfunc_xsyr2.hpp
index 761c6167..9977d08a 100644
--- a/src/client/clfunc_xsyr2.hpp
+++ b/src/client/clfunc_xsyr2.hpp
@@ -94,7 +94,12 @@ class xSyr2 : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
-
+ 	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		// Per-iteration cleanup hook: client.cpp runs several iterations and averages the
+		// time, so buffers should be released here, before the destructor is reached.
+		// TODO: not implemented yet -- this currently releases nothing.
+	}
 protected:
 protected:
   void initialize_scalars(double alpha, double beta)
diff --git a/src/client/clfunc_xsyr2k.hpp b/src/client/clfunc_xsyr2k.hpp
index 4faa3997..ae60f9e0 100644
--- a/src/client/clfunc_xsyr2k.hpp
+++ b/src/client/clfunc_xsyr2k.hpp
@@ -61,15 +61,6 @@ class xSyr2k : public clblasFunc
 
     ~xSyr2k()
     {
-        delete buffer_.a_;
-        delete buffer_.b_;
-        delete buffer_.c_;
-        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
-                        "releasing buffer A");
-        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_b_),
-                        "releasing buffer B");
-        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
-                        "releasing buffer C");
     }
 
     void call_func()
@@ -78,13 +69,12 @@ class xSyr2k : public clblasFunc
 
     double gflops()
     {
-        return 2.0*buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns() +
-            buffer_.n_*(buffer_.n_+1)/time_in_ns();
+        return (2*buffer_.k_*buffer_.n_*buffer_.n_+buffer_.n_)/time_in_ns();  // (2*K*N*N + N) over time_in_ns()
     }
 
     std::string gflops_formula()
     {
-        return "2.0*(M*(M+1)*N+M*(M+1))/time";
+        return "(2*K*N*N+N)/time";  // textual form of the expression evaluated in gflops()
     }
 
     void setup_buffer(int order_option, int side_option, int uplo_option,
@@ -293,7 +283,7 @@ class xSyr2k : public clblasFunc
                                         (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                             buffer_.offB_) * sizeof(T),
                                         NULL, &err);
-        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                         (buffer_.ldc_ * buffer_.c_num_vectors_ +
                                             buffer_.offC_) * sizeof(T),
                                         NULL, &err);
@@ -364,19 +354,232 @@ class xSyr2k : public clblasFunc
     }
 	void read_gpu_buffer()
 	{
-		//cl_int err;
-		//to-do need to fill up
+		cl_int err;  // status of the read below; it is stored but not checked
+		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,  // blocking download of C
+								  buffer_.offC_ * sizeof(T),  // byte offset of C in buf_c_
+								  buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(T),
+								  buffer_.c_, 0, NULL, NULL);  // destination: host array c_
 	}
 	void roundtrip_func()
-	{//to-do need to fill up
+	{	// generic version does nothing; xSyr2k<float>/<double> specializations follow the class
+	}
+	void zerocopy_roundtrip_func()
+	{	// Placeholder: no zero-copy path here; it only reports that it was called.
+		std::cout << "xSyr2k::zerocopy_roundtrip_func\n";
 	}
 	void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
                       int diag_option, int transA_option, int  transB_option,
                       size_t M, size_t N, size_t K, size_t lda, size_t ldb,
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
-		{}
+	{
+        DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+        // Stores the SYR2K problem description and allocates host arrays; no device buffers here.
+        initialize_scalars(alpha, beta);
+
+        buffer_.n_ = N;
+        buffer_.k_ = K;
+        buffer_.offA_ = offA;
+        buffer_.offB_ = offBX;
+        buffer_.offC_ = offCY;
+
+        if (uplo_option == 0)
+        {
+            buffer_.uplo_ = clblasUpper;
+        }
+        else
+        {
+            buffer_.uplo_ = clblasLower;
+        }
+        // ldc == 0 selects the default N; any other value must be at least N.
+        if (ldc == 0)
+        {
+            buffer_.ldc_ = N;
+        }
+        else if (ldc < N)
+        {
+            std::cerr << "ldc:wrong size\n";
+            exit(1);  // same policy as the lda/ldb checks; otherwise ldc_ stays uninitialized
+        }
+        else
+        {
+            buffer_.ldc_ = ldc;
+        }
+        buffer_.c_num_vectors_ = N;
+
+        if (order_option == 0)
+        {
+            order_ = clblasRowMajor;
+            if (transA_option == 0)
+            {
+                buffer_.trans_ = clblasNoTrans;
+                buffer_.a_num_vectors_ = N;
+                buffer_.b_num_vectors_ = N;
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = K;
+                }
+                else if (ldb < K)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = K;
+                buffer_.b_num_vectors_ = K;
+                if (transA_option == 1)
+                {
+                    buffer_.trans_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.trans_ = clblasConjTrans;
+                }
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = N;
+                }
+                else if (ldb < N)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+        }
+        else
+        {
+            order_ = clblasColumnMajor;
+            if (transA_option == 0)
+            {
+                buffer_.a_num_vectors_ = K;
+                buffer_.b_num_vectors_ = K;
+                buffer_.trans_ = clblasNoTrans;
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = N;
+                }
+                else if (ldb < N)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = N;
+                buffer_.b_num_vectors_ = N;
+                if (transA_option == 1)
+                {
+                    buffer_.trans_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.trans_ = clblasConjTrans;
+                }
 
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+
+                if (ldb == 0)
+                {
+                    buffer_.ldb_ = K;
+                }
+                else if (ldb < K)
+                {
+                    std::cerr << "ldb:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.ldb_ = ldb;
+                }
+            }
+        }
+
+        buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+        buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
+        buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+
+	}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		// Cleanup for one benchmark iteration (see client.cpp); runs before the destructor.
+		// a_/b_/c_ are allocated with new T[], so they must be freed with delete[].
+        delete[] buffer_.a_;
+        delete[] buffer_.b_;
+        delete[] buffer_.c_;
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
+                        "releasing buffer A");
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_b_),
+                        "releasing buffer B");
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
+                        "releasing buffer C");
+	}
 protected:
     void initialize_scalars(double alpha, double beta)
     {
@@ -406,6 +609,41 @@ call_func()
     timer.Stop(timer_id);
 }
 
+template<>
+void
+xSyr2k<float>::
+roundtrip_func() // Times a full round trip: buffer creation, initialize_gpu_buffer(), clblasSsyr2k and read-back of C.
+{
+    timer.Start(timer_id);
+	cl_int err; // NOTE(review): overwritten by every call below and never inspected
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(float), // matrix elements plus leading offset
+                                        NULL, &err);
+    buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(float),
+                                        NULL, &err);
+	buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, // C is both input and output of the update
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(float),
+                                        NULL, &err);
+
+	this->initialize_gpu_buffer();
+	clblasSsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_, // return status is not checked
+                      buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+                      buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                      buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                      buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+	err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, // blocking read of C into the host array
+								  buffer_.offC_ * sizeof(float),
+								  buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(float),
+								  buffer_.c_, 0, NULL, &event_);
+    clWaitForEvents(1, &event_); // waits on the read event before the timer is stopped
+    timer.Stop(timer_id);
+}
+
 template<>
 void
 xSyr2k<double>::
@@ -423,6 +661,41 @@ call_func()
     timer.Stop(timer_id);
 }
 
+template<>
+void
+xSyr2k<double>::
+roundtrip_func() // Times a full round trip: buffer creation, initialize_gpu_buffer(), clblasDsyr2k and read-back of C.
+{
+    timer.Start(timer_id);
+	cl_int err; // NOTE(review): overwritten by every call below and never inspected
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(double), // matrix elements plus leading offset
+                                        NULL, &err);
+    buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(double),
+                                        NULL, &err);
+	buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, // C is both input and output of the update
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(double),
+                                        NULL, &err);
+
+	this->initialize_gpu_buffer();
+    clblasDsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_, // return status is not checked
+                      buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+                      buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                      buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                      buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+	err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, // blocking read of C into the host array
+								  buffer_.offC_ * sizeof(double),
+								  buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(double),
+								  buffer_.c_, 0, NULL, &event_);
+    clWaitForEvents(1, &event_); // waits on the read event before the timer is stopped
+    timer.Stop(timer_id);
+}
+
 template<>
 void
 xSyr2k<cl_float2>::
@@ -440,6 +713,56 @@ call_func()
     timer.Stop(timer_id);
 }
 
+template<>
+void
+xSyr2k<cl_float2>::
+roundtrip_func() // Times a full round trip: buffer creation, initialize_gpu_buffer(), clblasCsyr2k and read-back of C.
+{
+    timer.Start(timer_id);
+	cl_int err; // NOTE(review): overwritten by every call below and never inspected
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(cl_float2), // matrix elements plus leading offset
+                                        NULL, &err);
+    buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(cl_float2),
+                                        NULL, &err);
+	buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, // C is both input and output of the update
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(cl_float2),
+                                        NULL, &err);
+	this->initialize_gpu_buffer();
+
+	clblasCsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_, // return status is not checked
+                      buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+                      buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                      buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                      buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+	err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, // blocking read of C into the host array
+								  buffer_.offC_ * sizeof(cl_float2),
+								  buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(cl_float2),
+								  buffer_.c_, 0, NULL, &event_);
+
+	clWaitForEvents(1, &event_); // waits on the read event before the timer is stopped
+    timer.Stop(timer_id);
+}
+
+template<> double
+xSyr2k<cl_float2>::gflops()
+{
+        // Operation count (8*K*N*N + 2*N) divided by time_in_ns(); gflops_formula() prints the same expression.
+        return (2*buffer_.n_ + 8*buffer_.k_*buffer_.n_*buffer_.n_)/time_in_ns();
+}
+
+template<> std::string
+xSyr2k<cl_float2>::gflops_formula()
+{
+        // Human-readable form of the expression evaluated by gflops() for this element type.
+        return std::string("(8*K*N*N+2*N)/time");
+}
+
 template<>
 void
 xSyr2k<cl_double2>::
@@ -457,4 +780,53 @@ call_func()
     timer.Stop(timer_id);
 }
 
+template<>
+void
+xSyr2k<cl_double2>::
+roundtrip_func() // Times a full round trip: buffer creation, initialize_gpu_buffer(), clblasZsyr2k and read-back of C.
+{
+    timer.Start(timer_id);
+	cl_int err; // NOTE(review): overwritten by every call below and never inspected
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(cl_double2), // matrix elements plus leading offset
+                                        NULL, &err);
+    buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(cl_double2),
+                                        NULL, &err);
+	buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, // C is both input and output of the update
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(cl_double2),
+                                        NULL, &err);
+	this->initialize_gpu_buffer();
+    clblasZsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_, // return status is not checked
+                      buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+                      buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+                      buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                      buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+	err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, // blocking read of C into the host array
+								  buffer_.offC_ * sizeof(cl_double2),
+								  buffer_.ldc_ * buffer_.c_num_vectors_ *
+                                       sizeof(cl_double2),
+								  buffer_.c_, 0, NULL, &event_);
+
+	clWaitForEvents(1, &event_); // waits on the read event before the timer is stopped
+    timer.Stop(timer_id);
+}
+
+template<> double
+xSyr2k<cl_double2>::gflops()
+{
+        // Operation count (8*K*N*N + 2*N) divided by time_in_ns(); gflops_formula() prints the same expression.
+        return (2*buffer_.n_ + 8*buffer_.k_*buffer_.n_*buffer_.n_)/time_in_ns();
+}
+
+template<> std::string
+xSyr2k<cl_double2>::gflops_formula()
+{
+        // Human-readable form of the expression evaluated by gflops() for this element type.
+        return std::string("(8*K*N*N+2*N)/time");
+}
+
 #endif // ifndef CLBLAS_BENCHMARK_XSYR2K_HXX__
diff --git a/src/client/clfunc_xsyrk.hpp b/src/client/clfunc_xsyrk.hpp
index 5bfd0e3c..e9b6a7a5 100644
--- a/src/client/clfunc_xsyrk.hpp
+++ b/src/client/clfunc_xsyrk.hpp
@@ -56,13 +56,7 @@ class xSyrk : public clblasFunc
 
     ~xSyrk()
     {
-        delete buffer_.a_;
-        delete buffer_.c_;
-        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
-                        "releasing buffer A");
-        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
-                        "releasing buffer C");
-        }
+    }
 
     void call_func()
     {
@@ -70,13 +64,12 @@ class xSyrk : public clblasFunc
 
     double gflops()
     {
-        return buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns() +
-            buffer_.n_*(buffer_.n_+1)/time_in_ns();
+        return buffer_.n_*(buffer_.n_+1)*buffer_.k_/time_in_ns(); // N*(N+1)*K over time_in_ns(), the count gflops_formula() reports
     }
 
     std::string gflops_formula()
     {
-        return "(N*(N+1)*K+N*(N+1))/time";
+        return "(N*(N+1)*K)/time";
     }
 
     void setup_buffer(int order_option, int side_option, int uplo_option,
@@ -224,7 +217,7 @@ class xSyrk : public clblasFunc
                                             buffer_.offA_) * sizeof(T),
                                         NULL, &err);
 
-        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                         (buffer_.ldc_ * buffer_.c_num_vectors_ +
                                             buffer_.offC_) * sizeof(T),
                                         NULL, &err);
@@ -281,18 +274,168 @@ class xSyrk : public clblasFunc
     }
  	void read_gpu_buffer()
 	{
-		//cl_int err;
-		//to-do need to fill up
+		cl_int err;
+		err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+								  buffer_.offC_*sizeof(T), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(T),
+								  buffer_.c_, 0, NULL, NULL);
 	}
 	void roundtrip_func()
-	{//to-do need to fill up
+	{
+	}
+	void zerocopy_roundtrip_func()
+	{
+		std::cout << "xSyrk::zerocopy_roundtrip_func\n";
 	}
 	void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
                       int diag_option, int transA_option, int  transB_option,
                       size_t M, size_t N, size_t K, size_t lda, size_t ldb,
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
-		{}
+	{ // Fills buffer_ from the given options/sizes and allocates the host arrays for A and C; exits on a bad lda/ldc.
+		DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M); // these parameters are otherwise unused here
+        DUMMY_ARGS_USAGE_2(ldb, offBX);
+
+        initialize_scalars(alpha, beta);
+
+        buffer_.n_ = N;
+        buffer_.k_ = K;
+        buffer_.offA_ = offA;
+        buffer_.offC_ = offCY;
+
+        if (uplo_option == 0)
+        {
+            buffer_.uplo_ = clblasUpper;
+        }
+        else
+        {
+            buffer_.uplo_ = clblasLower;
+        }
+
+        if (ldc == 0) // 0 selects the smallest accepted leading dimension, N
+        {
+            buffer_.ldc_ = N;
+        }
+        else if (ldc < N)
+        {
+            std::cerr << "ldc:wrong size\n";
+            exit(1); // same handling as the lda checks below; continuing would leave ldc_ unset for the allocation
+        }
+        else
+        {
+            buffer_.ldc_ = ldc;
+        }
+        buffer_.c_num_vectors_ = N;
+
+        if (order_option == 0)
+        {
+            order_ = clblasRowMajor;
+            if (transA_option == 0)
+            {
+                buffer_.trans_a_ = clblasNoTrans;
+                buffer_.a_num_vectors_ = N; // row-major, no transpose: N vectors with lda >= K
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = K; // row-major, transposed: K vectors with lda >= N
+                if (transA_option == 1)
+                {
+                    buffer_.trans_a_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.trans_a_ = clblasConjTrans;
+                }
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+        }
+        else
+        {
+            order_ = clblasColumnMajor;
+            if (transA_option == 0)
+            {
+                buffer_.a_num_vectors_ = K; // column-major, no transpose: K vectors with lda >= N
+                buffer_.trans_a_ = clblasNoTrans;
+                if (lda == 0)
+                {
+                    buffer_.lda_ = N;
+                }
+                else if (lda < N)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+            else
+            {
+                buffer_.a_num_vectors_ = N; // column-major, transposed: N vectors with lda >= K
+                if (transA_option == 1)
+                {
+                    buffer_.trans_a_ = clblasTrans;
+                }
+                else if (transA_option == 2)
+                {
+                    buffer_.trans_a_ = clblasConjTrans;
+                }
+
+                if (lda == 0)
+                {
+                    buffer_.lda_ = K;
+                }
+                else if (lda < K)
+                {
+                    std::cerr << "lda:wrong size\n";
+                    exit(1);
+                }
+                else
+                {
+                    buffer_.lda_ = lda;
+                }
+            }
+        }
+
+        buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_]; // host copies; no room is reserved for offA_/offC_
+        buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+	}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		// Frees the host arrays and OpenCL buffers of one test run. Per the original note, client.cpp runs
+		// several iterations and averages the time, so this has to happen per iteration, before the destructor.
+		delete[] buffer_.a_; // a_ and c_ are allocated with new T[...], so the array form is required
+        delete[] buffer_.c_;
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
+                        "releasing buffer A");
+        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
+                        "releasing buffer C");
+	}
 protected:
     void initialize_scalars(double alpha, double beta)
     {
@@ -321,6 +464,35 @@ call_func()
     timer.Stop(timer_id);
 }
 
+template<>
+void
+xSyrk<float>::roundtrip_func() // Times a full round trip: buffer creation, initialize_gpu_buffer(), clblasSsyrk, read-back of C.
+{
+	timer.Start(timer_id);
+
+	cl_int err; // NOTE(review): overwritten by every call below and never inspected
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(float), // matrix elements plus leading offset
+                                        NULL, &err);
+
+    buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, // C is both input and output of the update
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(float),
+                                        NULL, &err);
+	this->initialize_gpu_buffer();
+	clblasSsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_, // return status is not checked
+                     buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+                     buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                     buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+	err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, // blocking read of C into the host array
+								  buffer_.offC_*sizeof(float), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(float),
+								  buffer_.c_, 0, NULL, &event_);
+
+    clWaitForEvents(1, &event_); // waits on the read event before the timer is stopped
+    timer.Stop(timer_id);
+}
+
 template<>
 void
 xSyrk<double>::
@@ -337,6 +509,35 @@ call_func()
     timer.Stop(timer_id);
 }
 
+template<>
+void
+xSyrk<double>::roundtrip_func() // Times a full round trip: buffer creation, initialize_gpu_buffer(), clblasDsyrk, read-back of C.
+{
+	timer.Start(timer_id);
+
+	cl_int err; // NOTE(review): overwritten by every call below and never inspected
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(double), // matrix elements plus leading offset
+                                        NULL, &err);
+
+    buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, // C is both input and output of the update
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(double),
+                                        NULL, &err);
+	this->initialize_gpu_buffer();
+	clblasDsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_, // return status is not checked
+                     buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+                     buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                     buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+	err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, // blocking read of C into the host array
+								  buffer_.offC_*sizeof(double), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(double),
+								  buffer_.c_, 0, NULL, &event_);
+
+    clWaitForEvents(1, &event_); // waits on the read event before the timer is stopped
+    timer.Stop(timer_id);
+}
+
 template<>
 void
 xSyrk<cl_float2>::
@@ -353,6 +554,48 @@ call_func()
     timer.Stop(timer_id);
 }
 
+template<>
+void
+xSyrk<cl_float2>::roundtrip_func() // Times a full round trip: buffer creation, initialize_gpu_buffer(), clblasCsyrk, read-back of C.
+{
+	timer.Start(timer_id);
+
+	cl_int err; // NOTE(review): overwritten by every call below and never inspected
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(cl_float2), // matrix elements plus leading offset
+                                        NULL, &err);
+
+    buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, // C is both input and output of the update
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(cl_float2),
+                                        NULL, &err);
+	this->initialize_gpu_buffer();
+	clblasCsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_, // return status is not checked
+                     buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+                     buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                     buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+	err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, // blocking read of C into the host array
+								  buffer_.offC_*sizeof(cl_float2), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(cl_float2),
+								  buffer_.c_, 0, NULL, &event_);
+
+    clWaitForEvents(1, &event_); // waits on the read event before the timer is stopped
+    timer.Stop(timer_id);
+}
+
+template<> double
+xSyrk<cl_float2>::gflops()
+{
+        // 4*N*(N+1)*K over time_in_ns(): the last factor is k_, as gflops_formula() states, not n_.
+        return 4*buffer_.n_*(buffer_.n_+1)*buffer_.k_/time_in_ns();
+}
+
+template<> std::string
+xSyrk<cl_float2>::gflops_formula()
+{
+        // Human-readable operation-count expression reported for this element type.
+        return std::string("(4*N*(N+1)*K)/time");
+}
 template<>
 void
 xSyrk<cl_double2>::
@@ -369,4 +612,47 @@ call_func()
     timer.Stop(timer_id);
 }
 
+template<>
+void
+xSyrk<cl_double2>::roundtrip_func() // Times a full round trip: buffer creation, initialize_gpu_buffer(), clblasZsyrk, read-back of C.
+{
+	timer.Start(timer_id);
+
+	cl_int err; // NOTE(review): overwritten by every call below and never inspected
+    buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(cl_double2), // matrix elements plus leading offset
+                                        NULL, &err);
+
+    buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE, // C is both input and output of the update
+                                        (buffer_.ldc_ * buffer_.c_num_vectors_ +
+                                            buffer_.offC_) * sizeof(cl_double2),
+                                        NULL, &err);
+	this->initialize_gpu_buffer();
+	clblasZsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_, // return status is not checked
+                     buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+                     buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+                     buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+	err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE, // blocking read of C into the host array
+								  buffer_.offC_*sizeof(cl_double2), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(cl_double2),
+								  buffer_.c_, 0, NULL, &event_);
+
+    clWaitForEvents(1, &event_); // waits on the read event before the timer is stopped
+    timer.Stop(timer_id);
+}
+
+template<> double
+xSyrk<cl_double2>::gflops()
+{
+        // 4*N*(N+1)*K over time_in_ns(): the last factor is k_, as gflops_formula() states, not n_.
+        return 4*buffer_.n_*(buffer_.n_+1)*buffer_.k_/time_in_ns();
+}
+
+template<> std::string
+xSyrk<cl_double2>::gflops_formula()
+{
+        // Human-readable operation-count expression reported for this element type.
+        return std::string("(4*N*(N+1)*K)/time");
+}
+
 #endif // ifndef CLBLAS_BENCHMARK_XSYRK_HXX__
diff --git a/src/client/clfunc_xtrmm.hpp b/src/client/clfunc_xtrmm.hpp
index d47ddfdb..2e05300c 100644
--- a/src/client/clfunc_xtrmm.hpp
+++ b/src/client/clfunc_xtrmm.hpp
@@ -57,12 +57,6 @@ class xTrmm : public clblasFunc
 
     ~xTrmm()
     {
-        delete buffer_.a_;
-        delete buffer_.b_;
-        OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_a_),
-                       "releasing buffer A");
-        OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_b_),
-                       "releasing buffer B");
     }
 
     void call_func()
@@ -238,7 +232,7 @@ class xTrmm : public clblasFunc
                                             buffer_.offA_) * sizeof(T),
                                         NULL, &err);
 
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                         (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                             buffer_.offB_) * sizeof(T),
                                         NULL, &err);
@@ -310,7 +304,11 @@ class xTrmm : public clblasFunc
 	}
 	void roundtrip_func()
 	{
-		std::cout << "xGemm::roundtrip_func\n";
+		std::cout << "xTrmm::roundtrip_func\n";
+	}
+	void zerocopy_roundtrip_func()
+	{
+		std::cout << "xTrmm::zerocopy_roundtrip_func\n";
 	}
 	void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
                       int diag_option, int transA_option, int  transB_option,
@@ -450,6 +448,17 @@ class xTrmm : public clblasFunc
         buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
         buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
 	}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		// Frees the host arrays and OpenCL buffers of one test run. Per the original note, client.cpp runs
+		// several iterations and averages the time, so this has to happen per iteration, before the destructor.
+        delete[] buffer_.a_; // a_ and b_ are allocated with new T[...], so the array form is required
+        delete[] buffer_.b_;
+        OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_a_),
+                       "releasing buffer A");
+        OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_b_),
+                       "releasing buffer B");
+	}
 protected:
     void initialize_scalars(double alpha, double beta)
     {
@@ -493,7 +502,7 @@ roundtrip_func()
                                             buffer_.offA_) * sizeof(cl_float),
                                         NULL, &err);
 
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                         (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                             buffer_.offB_) * sizeof(cl_float),
                                         NULL, &err);
@@ -557,7 +566,7 @@ roundtrip_func()
                                             buffer_.offA_) * sizeof(cl_double),
                                         NULL, &err);
 
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                         (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                             buffer_.offB_) * sizeof(cl_double),
                                         NULL, &err);
@@ -621,7 +630,7 @@ roundtrip_func()
                                             buffer_.offA_) * sizeof(cl_float2),
                                         NULL, &err);
 
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                         (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                             buffer_.offB_) * sizeof(cl_float2),
                                         NULL, &err);
@@ -685,7 +694,7 @@ roundtrip_func()
                                             buffer_.offA_) * sizeof(cl_double2),
                                         NULL, &err);
 
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                         (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                             buffer_.offB_) * sizeof(cl_double2),
                                         NULL, &err);
diff --git a/src/client/clfunc_xtrmv.hpp b/src/client/clfunc_xtrmv.hpp
index 725e9f31..80d5004c 100644
--- a/src/client/clfunc_xtrmv.hpp
+++ b/src/client/clfunc_xtrmv.hpp
@@ -225,6 +225,12 @@ class xTrmv : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		// Per-iteration cleanup hook: client.cpp is noted to run a series of test iterations and average the time,
+		// so host and device buffers need to be released here, before the destructor is eventually reached.
+		// TODO: not implemented for xTrmv yet; this function currently releases nothing.
+	}
 protected:
   void initialize_scalars(double alpha, double beta)
   {
diff --git a/src/client/clfunc_xtrsm.hpp b/src/client/clfunc_xtrsm.hpp
index 8ae85c30..2eb64cfb 100644
--- a/src/client/clfunc_xtrsm.hpp
+++ b/src/client/clfunc_xtrsm.hpp
@@ -22,6 +22,7 @@
 
 #include "clfunc_common.hpp"
 
+
 template <typename T>
 struct xTrsmBuffer
 {
@@ -57,17 +58,13 @@ class xTrsm : public clblasFunc
 
     ~xTrsm()
     {
-        delete buffer_.a_;
-        delete buffer_.b_;
-        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
-                        "releasing buffer A");
-        OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_b_),
-                        "releasing buffer B");
     }
 
     void call_func()
     {
-        std::cout << "xtrsm::call_func\n";
+    timer.Start(timer_id);
+	xTrsm_Function(true);
+    timer.Stop(timer_id);
     }
 
     double gflops()
@@ -237,7 +234,7 @@ class xTrsm : public clblasFunc
                                             buffer_.offA_) * sizeof(T),
                                         NULL, &err);
 
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
                                         (buffer_.ldb_ * buffer_.b_num_vectors_ +
                                             buffer_.offB_) * sizeof(T),
                                          NULL, &err);
@@ -317,7 +314,179 @@ class xTrsm : public clblasFunc
 	}
 	void roundtrip_func()
 	{
-		std::cout << "xtrsm::call_func\n";
+	timer.Start(timer_id);
+	    // Create the device buffers; each is sized for the matrix plus its leading element offset.
+        cl_int err; // NOTE(review): overwritten by every call below and never inspected
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(T),
+                                        NULL, &err);
+
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T),
+                                         NULL, &err);
+		// Blocking writes of the host matrices into the buffers, starting at byte offset off * sizeof(T).
+		err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+                                   buffer_.offA_ * sizeof(T),
+                                   buffer_.lda_ * buffer_.a_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.a_, 0, NULL, NULL);
+
+        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+                                   buffer_.offB_ * sizeof(T),
+                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                       sizeof(T),
+                                   buffer_.b_, 0, NULL, NULL);
+		// Run the type-specific TRSM call.
+		xTrsm_Function(false);
+		// Blocking read of B back into the host array.
+		err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+			                      buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                       sizeof(T),
+								  buffer_.b_, 0, NULL, &event_);
+		clWaitForEvents(1, &event_); // waits on the read event before the timer is stopped
+	timer.Stop(timer_id);
+	}
+	void allochostptr_roundtrip_func()
+	{
+		// Timed round trip with CL_MEM_ALLOC_HOST_PTR buffers that are filled and read back through map/unmap.
+		timer.Start(timer_id);
+		cl_int err; // NOTE(review): overwritten by every call below and never inspected
+		buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+		                                (buffer_.lda_ * buffer_.a_num_vectors_ +
+		                                    buffer_.offA_) * sizeof(T),
+		                                NULL, &err);
+
+		buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+		                                (buffer_.ldb_ * buffer_.b_num_vectors_ +
+		                                    buffer_.offB_) * sizeof(T),
+		                                NULL, &err);
+		// Map each buffer over its own full extent; A must be sized by lda_/a_num_vectors_/offA_, not B's values.
+		T *map_a = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0,
+		                                  (buffer_.lda_ * buffer_.a_num_vectors_ +
+		                                      buffer_.offA_) * sizeof(T),
+		                                  0, NULL, NULL, &err);
+		T *map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0,
+		                                  (buffer_.ldb_ * buffer_.b_num_vectors_ +
+		                                      buffer_.offB_) * sizeof(T),
+		                                  0, NULL, NULL, &err);
+		// Place the host matrices at element offsets offA_/offB_, the placement roundtrip_func() uses for its writes;
+		// the host arrays hold only lda_*a_num_vectors_ and ldb_*b_num_vectors_ elements, so copy no more than that.
+		memcpy( map_a + buffer_.offA_, buffer_.a_, buffer_.lda_*buffer_.a_num_vectors_ * sizeof( T ) );
+		memcpy( map_b + buffer_.offB_, buffer_.b_, buffer_.ldb_*buffer_.b_num_vectors_ * sizeof( T ) );
+		// unmap the buffers before the TRSM call
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_a_, map_a, 0, NULL, NULL);
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL);
+		// run the type-specific TRSM call
+		xTrsm_Function(false);
+		// map the B buffer again (blocking) to read the output
+		map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_READ, 0,
+		                               (buffer_.ldb_ * buffer_.b_num_vectors_ +
+		                                   buffer_.offB_) * sizeof(T),
+		                               0, NULL, NULL, &err);
+		memcpy( buffer_.b_, map_b + buffer_.offB_, buffer_.ldb_*buffer_.b_num_vectors_ * sizeof( T ) ); // device -> host
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL);
+		clWaitForEvents(1, &event_);
+		timer.Stop(timer_id);
+	}
+	void usehostptr_roundtrip_func()
+	{
+	timer.Start(timer_id);
+	    // Create the buffers directly on top of the host arrays (CL_MEM_USE_HOST_PTR); no explicit upload follows.
+        cl_int err; // NOTE(review): overwritten by every call below and never inspected
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(T), // NOTE(review): size adds offA_; confirm a_ is at least this large
+                                        buffer_.a_, &err);
+
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T), // NOTE(review): size adds offB_; confirm b_ is at least this large
+                                         buffer_.b_, &err);
+		// Run the type-specific TRSM call.
+		xTrsm_Function(false);
+		// Blocking read of B back into the host array.
+		err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+			                      buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                       sizeof(T),
+								  buffer_.b_, 0, NULL, &event_);
+		clWaitForEvents(1, &event_); // waits on the read event before the timer is stopped
+	timer.Stop(timer_id);
+	}
+	void copyhostptr_roundtrip_func()
+	{
+	timer.Start(timer_id);
+	    // Create the buffers initialised from the host arrays (CL_MEM_COPY_HOST_PTR); no explicit upload follows.
+        cl_int err; // NOTE(review): overwritten by every call below and never inspected
+        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
+                                            buffer_.offA_) * sizeof(T), // NOTE(review): size adds offA_; confirm a_ is at least this large
+                                        buffer_.a_, &err);
+
+        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
+                                            buffer_.offB_) * sizeof(T), // NOTE(review): size adds offB_; confirm b_ is at least this large
+                                         buffer_.b_, &err);
+		// Run the type-specific TRSM call.
+		xTrsm_Function(false);
+		// Blocking read of B back into the host array.
+		err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+			                      buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ *
+                                       sizeof(T),
+								  buffer_.b_, 0, NULL, &event_);
+	clWaitForEvents(1, &event_); // waits on the read event before the timer is stopped
+	timer.Stop(timer_id);
+	}
+	void usepersismem_roundtrip_func()
+	{
+#if defined(CL_MEM_USE_PERSISTENT_MEM_AMD)
+		// Timed round trip with CL_MEM_USE_PERSISTENT_MEM_AMD buffers that are filled and read back through map/unmap.
+		timer.Start(timer_id);
+		cl_int err; // NOTE(review): overwritten by every call below and never inspected
+		buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD,
+		                                (buffer_.lda_ * buffer_.a_num_vectors_ +
+		                                    buffer_.offA_) * sizeof(T),
+		                                NULL, &err);
+
+		buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_USE_PERSISTENT_MEM_AMD,
+		                                (buffer_.ldb_ * buffer_.b_num_vectors_ +
+		                                    buffer_.offB_) * sizeof(T),
+		                                NULL, &err);
+		// Map each buffer over its own full extent; A must be sized by lda_/a_num_vectors_/offA_, not B's values.
+		T *map_a = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0,
+		                                  (buffer_.lda_ * buffer_.a_num_vectors_ +
+		                                      buffer_.offA_) * sizeof(T),
+		                                  0, NULL, NULL, &err);
+		T *map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0,
+		                                  (buffer_.ldb_ * buffer_.b_num_vectors_ +
+		                                      buffer_.offB_) * sizeof(T),
+		                                  0, NULL, NULL, &err);
+		// Place the host matrices at element offsets offA_/offB_, the placement roundtrip_func() uses for its writes;
+		// the host arrays hold only lda_*a_num_vectors_ and ldb_*b_num_vectors_ elements, so copy no more than that.
+		memcpy( map_a + buffer_.offA_, buffer_.a_, buffer_.lda_*buffer_.a_num_vectors_ * sizeof( T ) );
+		memcpy( map_b + buffer_.offB_, buffer_.b_, buffer_.ldb_*buffer_.b_num_vectors_ * sizeof( T ) );
+		// unmap the buffers before the TRSM call
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_a_, map_a, 0, NULL, NULL);
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL);
+		// run the type-specific TRSM call
+		xTrsm_Function(false);
+		// map the B buffer again (blocking) to read the output
+		map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_READ, 0,
+		                               (buffer_.ldb_ * buffer_.b_num_vectors_ +
+		                                   buffer_.offB_) * sizeof(T),
+		                               0, NULL, NULL, &err);
+		memcpy( buffer_.b_, map_b + buffer_.offB_, buffer_.ldb_*buffer_.b_num_vectors_ * sizeof( T ) ); // device -> host
+		clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL);
+		clWaitForEvents(1, &event_);
+		timer.Stop(timer_id);
+#else
+		std::cout<<"CL_MEM_USE_PERSISTENT_MEM_AMD is only supported on AMD hardware"<<std::endl;
+#endif
+	}
+	void zerocopy_roundtrip_func()
+	{
+		std::cout << "xTrsm::zerocopy_roundtrip_func\n"; // placeholder: only announces the call; names this class, xTrsm
 	}
 	void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
                       int diag_option, int transA_option, int  transB_option,
@@ -456,6 +625,17 @@ class xTrsm : public clblasFunc
         buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
         buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
 	}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		// Frees the host arrays and OpenCL buffers; client.cpp calls this after each timed
+		// iteration, before the destructor runs. a_/b_ come from new T[], so delete[] is required.
+        delete[] buffer_.a_;
+        delete[] buffer_.b_;
+        OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_a_),
+                       "releasing buffer A");
+        OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_b_),
+                       "releasing buffer B");
+	}
 protected:
     void initialize_scalars(double alpha, double beta)
     {
@@ -465,261 +645,79 @@ class xTrsm : public clblasFunc
 
 private:
     xTrsmBuffer<T> buffer_;
+	void xTrsm_Function(bool flush);
 
 }; // class xtrsm
 
 template<>
 void
 xTrsm<cl_float>::
-call_func()
+xTrsm_Function(bool flush)
 {
-    timer.Start(timer_id);
-
     clblasStrsm(order_, buffer_.side_, buffer_.uplo_,
                      buffer_.trans_a_, buffer_.diag_,
                      buffer_.m_, buffer_.n_, buffer_.alpha_,
                      buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                      buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                      1, &queue_, 0, NULL, &event_);
-
-    clWaitForEvents(1, &event_);
-    timer.Stop(timer_id);
-}
-
-template<>
-void
-xTrsm<cl_float>::
-roundtrip_func()
-{
-	timer.Start(timer_id);
-	    //set up buffer
-        cl_int err;
-        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
-                                            buffer_.offA_) * sizeof(cl_float),
-                                        NULL, &err);
-
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
-                                            buffer_.offB_) * sizeof(cl_float),
-                                         NULL, &err);
-		//initialize gpu buffer
-		err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
-                                   buffer_.offA_ * sizeof(cl_float),
-                                   buffer_.lda_ * buffer_.a_num_vectors_ *
-                                       sizeof(cl_float),
-                                   buffer_.a_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-                                   buffer_.offB_ * sizeof(cl_float),
-                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_float),
-                                   buffer_.b_, 0, NULL, NULL);
-		//call func
-		clblasStrsm(order_, buffer_.side_, buffer_.uplo_,
-                     buffer_.trans_a_, buffer_.diag_,
-                     buffer_.m_, buffer_.n_, buffer_.alpha_,
-                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
-                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
-                     1, &queue_, 0, NULL, NULL);
-		//read gpu buffer
-		err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-			                      buffer_.offB_ * sizeof(cl_float), buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_float),
-								  buffer_.b_, 0, NULL, &event_);
-	clWaitForEvents(1, &event_);
-	timer.Stop(timer_id);
+	if(flush==true)
+	{
+		clWaitForEvents(1, &event_);
+	}
 }
 
 template<>
 void
 xTrsm<cl_double>::
-call_func()
+xTrsm_Function(bool flush)
 {
-    timer.Start(timer_id);
-
     clblasDtrsm(order_, buffer_.side_, buffer_.uplo_,
                      buffer_.trans_a_, buffer_.diag_,
                      buffer_.m_, buffer_.n_, buffer_.alpha_,
                      buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                      buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                      1, &queue_, 0, NULL, &event_);
-
-    clWaitForEvents(1, &event_);
-    timer.Stop(timer_id);
-}
-
-template<>
-void
-xTrsm<cl_double>::
-roundtrip_func()
-{
-	timer.Start(timer_id);
-	    //set up buffer
-        cl_int err;
-        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
-                                            buffer_.offA_) * sizeof(cl_double),
-                                        NULL, &err);
-
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
-                                            buffer_.offB_) * sizeof(cl_double),
-                                         NULL, &err);
-		//initialize gpu buffer
-		err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
-                                   buffer_.offA_ * sizeof(cl_double),
-                                   buffer_.lda_ * buffer_.a_num_vectors_ *
-                                       sizeof(cl_double),
-                                   buffer_.a_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-                                   buffer_.offB_ * sizeof(cl_double),
-                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_double),
-                                   buffer_.b_, 0, NULL, NULL);
-		//call func
-		clblasDtrsm(order_, buffer_.side_, buffer_.uplo_,
-                     buffer_.trans_a_, buffer_.diag_,
-                     buffer_.m_, buffer_.n_, buffer_.alpha_,
-                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
-                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
-                     1, &queue_, 0, NULL, NULL);
-		//read gpu buffer
-		err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-			                      buffer_.offB_ * sizeof(cl_double), buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_double),
-								  buffer_.b_, 0, NULL, &event_);
-	clWaitForEvents(1, &event_);
-	timer.Stop(timer_id);
+	if(flush==true)
+	{
+		clWaitForEvents(1, &event_);
+	}
 }
 
 template<>
 void
 xTrsm<cl_float2>::
-call_func()
+xTrsm_Function(bool flush)
 {
-    timer.Start(timer_id);
-
     clblasCtrsm(order_, buffer_.side_, buffer_.uplo_,
                      buffer_.trans_a_, buffer_.diag_,
                      buffer_.m_, buffer_.n_, buffer_.alpha_,
                      buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                      buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                      1, &queue_, 0, NULL, &event_);
-
-  clWaitForEvents(1, &event_);
-  timer.Stop(timer_id);
-}
-
-template<>
-void
-xTrsm<cl_float2>::
-roundtrip_func()
-{
-	timer.Start(timer_id);
-	    //set up buffer
-        cl_int err;
-        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
-                                            buffer_.offA_) * sizeof(cl_float2),
-                                        NULL, &err);
-
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
-                                            buffer_.offB_) * sizeof(cl_float2),
-                                         NULL, &err);
-		//initialize gpu buffer
-		err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
-                                   buffer_.offA_ * sizeof(cl_float2),
-                                   buffer_.lda_ * buffer_.a_num_vectors_ *
-                                       sizeof(cl_float2),
-                                   buffer_.a_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-                                   buffer_.offB_ * sizeof(cl_float2),
-                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_float2),
-                                   buffer_.b_, 0, NULL, NULL);
-		//call func
-		clblasCtrsm(order_, buffer_.side_, buffer_.uplo_,
-                     buffer_.trans_a_, buffer_.diag_,
-                     buffer_.m_, buffer_.n_, buffer_.alpha_,
-                     buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
-                     buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
-                     1, &queue_, 0, NULL, NULL);
-		//read gpu buffer
-		err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-			                      buffer_.offB_ * sizeof(cl_float2), buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_float2),
-								  buffer_.b_, 0, NULL, &event_);
-	clWaitForEvents(1, &event_);
-	timer.Stop(timer_id);
-}
-
-template<>
-void
-xTrsm<cl_double2>::
-call_func()
-{
-  timer.Start(timer_id);
-
-  clblasZtrsm(order_, buffer_.side_, buffer_.uplo_,
-                   buffer_.trans_a_, buffer_.diag_,
-                   buffer_.m_, buffer_.n_, buffer_.alpha_,
-                   buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
-                   buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
-                   1, &queue_, 0, NULL, &event_);
-
-      clWaitForEvents(1, &event_);
-      timer.Stop(timer_id);
+	if(flush==true)
+	{
+		clWaitForEvents(1, &event_);
+	}
 }
 
 template<>
 void
 xTrsm<cl_double2>::
-roundtrip_func()
+xTrsm_Function(bool flush)
 {
-	timer.Start(timer_id);
-	    //set up buffer
-        cl_int err;
-        buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                        (buffer_.lda_ * buffer_.a_num_vectors_ +
-                                            buffer_.offA_) * sizeof(cl_double2),
-                                        NULL, &err);
-
-        buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
-                                        (buffer_.ldb_ * buffer_.b_num_vectors_ +
-                                            buffer_.offB_) * sizeof(cl_double2),
-                                         NULL, &err);
-		//initialize gpu buffer
-		err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
-                                   buffer_.offA_ * sizeof(cl_double2),
-                                   buffer_.lda_ * buffer_.a_num_vectors_ *
-                                       sizeof(cl_double2),
-                                   buffer_.a_, 0, NULL, NULL);
-
-        err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-                                   buffer_.offB_ * sizeof(cl_double2),
-                                   buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_double2),
-                                   buffer_.b_, 0, NULL, NULL);
-		//call func
-		clblasZtrsm(order_, buffer_.side_, buffer_.uplo_,
+    clblasZtrsm(order_, buffer_.side_, buffer_.uplo_,
                      buffer_.trans_a_, buffer_.diag_,
                      buffer_.m_, buffer_.n_, buffer_.alpha_,
                      buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                      buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
-                     1, &queue_, 0, NULL, NULL);
-		//read gpu buffer
-		err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
-			                      buffer_.offB_ * sizeof(cl_double2), buffer_.ldb_ * buffer_.b_num_vectors_ *
-                                       sizeof(cl_double2),
-								  buffer_.b_, 0, NULL, &event_);
-	clWaitForEvents(1, &event_);
-	timer.Stop(timer_id);
+                     1, &queue_, 0, NULL, &event_);
+	if(flush==true)
+	{
+		clWaitForEvents(1, &event_);
+	}
 }
 
+
 template<>
 double
 xTrsm<cl_float2>::
diff --git a/src/client/clfunc_xtrsv.hpp b/src/client/clfunc_xtrsv.hpp
index f0b728ab..4eb0e5b8 100644
--- a/src/client/clfunc_xtrsv.hpp
+++ b/src/client/clfunc_xtrsv.hpp
@@ -218,6 +218,12 @@ class xTrsv : public clblasFunc
                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
                       double alpha, double beta)
 		{}
+	void releaseGPUBuffer_deleteCPUBuffer()
+	{
+		// Hook for freeing per-iteration buffers: client.cpp runs the test repeatedly to compute an average time.
+		// The cleanup has to happen before the destructor is eventually reached.
+		// TODO: not implemented for xTrsv yet; this is currently a no-op.
+	}
 protected:
   void initialize_scalars(double alpha, double beta)
   {
diff --git a/src/client/client.cpp b/src/client/client.cpp
index 8f60a07a..16186095 100644
--- a/src/client/client.cpp
+++ b/src/client/client.cpp
@@ -39,6 +39,8 @@
 #include "clfunc_xhemv.hpp"
 #include "clfunc_xhemm.hpp"
 #include "clfunc_xsymm.hpp"
+#include "clfunc_xherk.hpp"
+#include "clfunc_xher2k.hpp"
 
 namespace po = boost::program_options;
 
@@ -67,6 +69,7 @@ int main(int argc, char *argv[])
   std::string function;
   std::string precision;
   std::string roundtrip;
+  std::string memalloc;
   int side_option;
   int uplo_option;
   int diag_option;
@@ -98,7 +101,8 @@ int main(int argc, char *argv[])
     ( "uplo", po::value<int>( &uplo_option )->default_value(0), "0 = upper, 1 = lower. only used with [list of function families]" )  // xsymv xsyrk xsyr2k xtrsm xtrmm
     ( "diag", po::value<int>( &diag_option )->default_value(0), "0 = unit diagonal, 1 = non unit diagonal. only used with [list of function families]" ) // xtrsm xtrmm
     ( "profile,p", po::value<cl_uint>( &profileCount )->default_value(20), "Time and report the kernel speed (default: profiling off)" )
-	( "roundtrip", po::value<std::string>( &roundtrip )->default_value("noroundtrip"),"calculate the time for round trips")
+	( "roundtrip", po::value<std::string>( &roundtrip )->default_value("noroundtrip"),"including the time of OpenCL memory allocation and transportation; options:roundtrip, noroundtrip(default)")
+	( "memalloc", po::value<std::string>( &memalloc )->default_value("default"),"setting the memory allocation flags for OpenCL; would not take effect if roundtrip time is not measured; options:default(default),alloc_host_ptr,use_host_ptr,copy_host_ptr,use_persistent_mem_amd,rect_mem")
     ;
 
   po::variables_map vm;
@@ -130,6 +134,8 @@ int main(int argc, char *argv[])
       && function != "hemv"
       && function != "hemm"
       && function != "symm"
+	  && function != "herk"
+	  && function != "her2k"
       )
   {
     std::cerr << "Invalid value for --function" << std::endl;
@@ -432,6 +438,30 @@ int main(int argc, char *argv[])
       return -1;
     }
   }
+  else if (function == "herk")
+  {
+    if (precision == "c")
+      my_function = new xHerk<cl_float2>(timer, deviceType);
+    else if (precision == "z")
+      my_function = new xHerk<cl_double2>(timer, deviceType);
+    else
+    {
+      std::cerr << "Unknown her function" << std::endl;
+      return -1;
+    }
+  }
+  else if (function == "her2k")
+  {
+    if (precision == "c")
+      my_function = new xHer2k<cl_float2>(timer, deviceType);
+    else if (precision == "z")
+      my_function = new xHer2k<cl_double2>(timer, deviceType);
+    else
+    {
+      std::cerr << "Unknown her2 function" << std::endl;
+      return -1;
+    }
+  }
   else if (function == "symm")
   {
     if (precision == "s")
@@ -483,8 +513,33 @@ int main(int argc, char *argv[])
     my_function->call_func();
 	my_function->read_gpu_buffer();
     my_function->reset_gpu_write_buffer();*/
-	my_function->roundtrip_func();
-	my_function->reset_gpu_write_buffer();
+	
+	if(memalloc=="default")
+	{
+		my_function->roundtrip_func();
+	}
+	else if (memalloc=="alloc_host_ptr")
+	{
+		my_function->allochostptr_roundtrip_func();
+	}
+	else if (memalloc=="use_host_ptr")
+	{
+		my_function->usehostptr_roundtrip_func();
+	}
+	else if (memalloc=="copy_host_ptr")
+	{
+		my_function->copyhostptr_roundtrip_func();
+	}
+	else if (memalloc=="use_persistent_mem_amd")
+	{
+		my_function->usepersismem_roundtrip_func();
+	}
+	else if (memalloc=="rect_mem")
+	{
+		my_function->roundtrip_func_rect();
+	}
+	//my_function->reset_gpu_write_buffer();
+	my_function->releaseGPUBuffer_deleteCPUBuffer();
   }
 
   if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE )
@@ -512,7 +567,8 @@ int main(int argc, char *argv[])
     my_function->initialize_gpu_buffer();
     my_function->call_func();
 	my_function->read_gpu_buffer();
-    my_function->reset_gpu_write_buffer();
+    //my_function->reset_gpu_write_buffer();
+	my_function->releaseGPUBuffer_deleteCPUBuffer();
   }
 
   if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE )
@@ -525,7 +581,7 @@ int main(int argc, char *argv[])
       std::endl;
   }
   }
-
+  delete my_function;
   return 0;
 }
 
diff --git a/src/include/defbool.h b/src/include/defbool.h
index e90736dd..26caf6af 100644
--- a/src/include/defbool.h
+++ b/src/include/defbool.h
@@ -18,7 +18,7 @@
 #ifndef DEFBOOL_H_
 #define DEFBOOL_H_
 
-#if defined(_MSC_VER) && _MSC_VER <= 1600
+#if defined(_MSC_VER) && _MSC_VER <= 1700
 
 /*
 FIX for windows compilation
@@ -48,10 +48,10 @@ typedef  int  _Bool;
 #endif /* !__cplusplus */
 
 
-#else /* defined(_MSC_VER) && _MSC_VER <= 1600 */
+#else /* defined(_MSC_VER) && _MSC_VER <= 1700 */
 
 #include <stdbool.h>
 
-#endif /* defined(_MSC_VER) && _MSC_VER <= 1600 */
+#endif /* defined(_MSC_VER) && _MSC_VER <= 1700 */
 
 #endif /* DEFBOOL_H_ */
diff --git a/src/include/kern_cache.h b/src/include/kern_cache.h
index b6749c59..af14a855 100644
--- a/src/include/kern_cache.h
+++ b/src/include/kern_cache.h
@@ -55,6 +55,7 @@ typedef struct Kernel {
     void *extra;
     size_t extraSize;
     void (*dtor)(struct Kernel *kern);
+    int noSource;
 } Kernel;
 
 typedef int
diff --git a/src/include/kerngen.h b/src/include/kerngen.h
index dd44b9ea..73ee1912 100644
--- a/src/include/kerngen.h
+++ b/src/include/kerngen.h
@@ -42,6 +42,12 @@
  */
 /*@{*/
 
+#ifdef _MSC_VER
+#define SPREFIX "I"
+#else
+#define SPREFIX "z"
+#endif
+
 #define SUBDIM_UNUSED (size_t)-1
 
 enum {
diff --git a/src/include/trace_malloc.h b/src/include/trace_malloc.h
index 3dfa3152..acc97531 100644
--- a/src/include/trace_malloc.h
+++ b/src/include/trace_malloc.h
@@ -48,7 +48,7 @@ void releaseMallocTrace(void);
 
 static __inline void initMallocTrace(void)
 {
-    /* do noting */
+    /* do nothing */
 }
 
 static __inline void printMallocStatistics(void)
diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt
index 5bc8e2aa..f06282e6 100644
--- a/src/library/CMakeLists.txt
+++ b/src/library/CMakeLists.txt
@@ -277,53 +277,77 @@ if( BLAS_PRINT_BUILD_ERRORS )
     add_definitions( -DPRINT_BUILD_ERRORS )
 endif()
 
-#add_executable(tplgen tools/tplgen/tplgen.cpp)
-if (CMAKE_COMPILER_IS_GNUCXX)
-    include(ExternalProject)
-    ExternalProject_Add(
-        tplgen
-        URL "${CMAKE_SOURCE_DIR}/library/tools/tplgen"
-        INSTALL_COMMAND ""
-    )
-    add_custom_target( GENERATE_CLT
-                   COMMAND ${CMAKE_BINARY_DIR}/library/tplgen-prefix/src/tplgen-build/tplgen -o ../../include/ ${SRC_CL_TEMPLATES}
-                   WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/library/blas/gens/clTemplates
-                )
-    add_dependencies(GENERATE_CLT tplgen)
+include( ExternalProject )
+ExternalProject_Add( tplgen
+    URL "${CMAKE_SOURCE_DIR}/library/tools/tplgen"
+    INSTALL_COMMAND ""
+)
+
+ExternalProject_Get_Property( tplgen binary_dir )
+
+set( tplgenBinaryDir "" )
+if( CMAKE_COMPILER_IS_GNUCXX )
+    set( tplgenBinaryDir ${binary_dir} )
 else()
-    include(ExternalProject)
-    ExternalProject_Add(
-        tplgen
-        URL "${CMAKE_SOURCE_DIR}/library/tools/tplgen"
-        CONFIGURE_COMMAND "${CMAKE_BINARY_DIR}\\library\\tplgen-prefix\\src\\tplgen\\configure.bat"
-        BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Debug
-#        BUILD_COMMAND MSBuild.exe tplgen.sln /m /fl /flp1:logfile=errors.log;errorsonly /flp2:logfile=warnings.log;warningsonly /t:rebuild
-        INSTALL_COMMAND ""
-    )
-    add_custom_target( GENERATE_CLT
-        COMMAND ${CMAKE_BINARY_DIR}\\library\\tplgen-prefix\\src\\tplgen-build\\Debug\\tplgen.exe -o ..\\..\\include ${SRC_CL_TEMPLATES}
-        WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}\\library\\blas\\gens\\clTemplates
-    )
-    add_dependencies(GENERATE_CLT tplgen)
+    set( tplgenBinaryDir "${binary_dir}/${CMAKE_CFG_INTDIR}" )
 endif()
+
+add_custom_target( GENERATE_CLT
+    COMMAND ${tplgenBinaryDir}/tplgen -o ${clBLAS_BINARY_DIR}/include ${SRC_CL_TEMPLATES}
+    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/library/blas/gens/clTemplates
+)
+
+add_dependencies( GENERATE_CLT tplgen )
+
+if( CMAKE_COMPILER_IS_GNUCC )
+    configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/clBLAS.pc.in
+                    ${CMAKE_CURRENT_BINARY_DIR}/clBLAS.pc @ONLY )
+
+    install( FILES ${CMAKE_CURRENT_BINARY_DIR}/clBLAS.pc
+             DESTINATION lib${SUFFIX_LIB}/pkgconfig )
+endif( )
+
 add_library(clBLAS SHARED ${CLBLAS_SOURCES} ${GLOBAL_HEADERS} ${SRC_BLAS_HEADERS} ${SRC_BLAS_GENS_HEADERS})
 add_dependencies(clBLAS GENERATE_CLT)
 set_target_properties(clBLAS PROPERTIES VERSION ${clBLAS_VERSION})
 set_target_properties(clBLAS PROPERTIES SOVERSION ${clBLAS_SOVERSION})
+set_target_properties( clBLAS PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 target_link_libraries(clBLAS ${OPENCL_LIBRARIES} ${MATH_LIBRARY})
 
-if( TARGET_PLATFORM EQUAL 64 )
-	# CPack configuration; include the executable into the package
-	install( TARGETS clBLAS
-			RUNTIME DESTINATION bin64
-			LIBRARY DESTINATION lib64
-			ARCHIVE DESTINATION lib64/import
-			)
-else()
-	# CPack configuration; include the executable into the package
-	install( TARGETS clBLAS
-			RUNTIME DESTINATION bin32
-			LIBRARY DESTINATION lib32
-			ARCHIVE DESTINATION lib32/import
-			)
-endif()
+# CPack configuration; include the executable into the package
+install( TARGETS clBLAS
+		RUNTIME DESTINATION bin${SUFFIX_BIN}
+		LIBRARY DESTINATION lib${SUFFIX_LIB}
+		ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+		)
+
+# For debug builds, include the debug runtimes into the package for testing on non-developer machines
+set( CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS_SKIP true )
+set( CMAKE_INSTALL_DEBUG_LIBRARIES true )
+set( CMAKE_INSTALL_DEBUG_LIBRARIES_ONLY true )
+
+if( WIN32 )
+    set( CLBLAS_RUNTIME_DESTINATION bin${SUFFIX_BIN} )
+else( )
+    set( CLBLAS_RUNTIME_DESTINATION lib${SUFFIX_LIB} )
+endif( )
+
+include( InstallRequiredSystemLibraries )
+
+# Install necessary runtime files for debug builds
+install(    PROGRAMS ${CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS}
+            CONFIGURATIONS Debug
+            DESTINATION ${CLBLAS_RUNTIME_DESTINATION} )
+
+# Install all *.pdb files for debug builds
+install(    DIRECTORY ${PROJECT_BINARY_DIR}/staging/
+            DESTINATION ${CLBLAS_RUNTIME_DESTINATION}
+            OPTIONAL
+            CONFIGURATIONS Debug
+            FILES_MATCHING PATTERN "*.pdb" )
+
+# Install a snapshot of the source as it was for this build; useful for the .pdb's
+install(    DIRECTORY ${PROJECT_SOURCE_DIR}
+            DESTINATION ${CLBLAS_RUNTIME_DESTINATION}
+            OPTIONAL
+            CONFIGURATIONS Debug )
diff --git a/src/library/blas/generic/common.c b/src/library/blas/generic/common.c
index 9e26887d..fef08800 100644
--- a/src/library/blas/generic/common.c
+++ b/src/library/blas/generic/common.c
@@ -22,6 +22,7 @@
 #include <clkern.h>
 #include <cltypes.h>
 #include <stdio.h>
+#include <ctype.h>
 
 #include "clblas-internal.h"
 
@@ -364,6 +365,7 @@ Kernel VISIBILITY_HIDDEN
         kernel->extra = calloc(1, kernel->extraSize);
         *(CLBLASKernExtra*)(kernel->extra) = *extra;
         kernel->dtor = extraDtor;
+        kernel->noSource = 1;
     }
     else {
         putKernel(NULL, kernel);
@@ -491,6 +493,7 @@ Kernel
 #if !defined(KEEP_CLBLAS_KERNEL_SOURCES)
     if (err == CL_SUCCESS) {
         err = dropProgramSource(&kernel->program, context, device);
+        kernel->noSource = 1;
     }
 #endif  /* !DUMP_CLBLAS_KERNELS */
 
@@ -524,17 +527,34 @@ setupBuildOpts(
     opts[0] = '\0';
 
 #if !defined NDEBUG
-    strcpy(opts, "-g ");
+    addBuildOpt(opts, BUILD_OPTS_MAXLEN, "-g");
 #endif  /* NDEBUG */
 
     if (target.ident.vendor == VENDOR_NVIDIA &&
         !strcmp(mempat->name, "2-staged cached global memory based "
                               "block trsm")) {
 
-        strcat(opts, "-cl-opt-disable");
+        addBuildOpt(opts, BUILD_OPTS_MAXLEN, "-cl-opt-disable");
     }
 }
 
+/* Append `option` to the NUL-terminated build-option string `opts`, whose
+ * buffer holds `len` bytes. A single space separator is inserted when needed;
+ * the result is truncated to fit the buffer rather than overflowing it. */
+void addBuildOpt(char * opts, size_t len, const char * option)
+{
+    size_t l = strlen(opts);
+    /* isspace() requires an unsigned char value; a negative plain char is UB. */
+    if (l > 0 && !isspace((unsigned char)opts[l-1]) && l+1 < len) {
+      opts[l] = ' ';
+      opts[l+1]   = '\0';
+      l++;
+    }
+    /* Guard the size_t subtraction so a full buffer cannot wrap the bound. */
+    if (l + 1 < len) strncat(opts, option, len - l - 1);
+}
+
+
 char VISIBILITY_HIDDEN
 *sprintfGranulation(char *buf, const SubproblemDim *dim, int level)
 {
diff --git a/src/library/blas/generic/kdump.c b/src/library/blas/generic/kdump.c
index 5345fc78..a48204a0 100644
--- a/src/library/blas/generic/kdump.c
+++ b/src/library/blas/generic/kdump.c
@@ -17,7 +17,7 @@
 
 #include <stdio.h>
 #include <string.h>
-#include <malloc.h>
+#include <stdlib.h>
 
 #include <cltypes.h>
 #include <clblas-internal.h>
diff --git a/src/library/blas/generic/solution_seq_make.c b/src/library/blas/generic/solution_seq_make.c
index 0eee1fd7..8a5e402d 100644
--- a/src/library/blas/generic/solution_seq_make.c
+++ b/src/library/blas/generic/solution_seq_make.c
@@ -1435,9 +1435,12 @@ getStepGranulation(SolutionStep *step)
             }
         }
 
-        status = getGranularityInfo(&step->device, mempat->name,
-                                    step->args.dtype, step->extraFlags,
-                                    (int)MNK, dims, &step->pgran, &time);
+		if( step->funcID != CLBLAS_GEMM2 )
+		{
+			status = getGranularityInfo(&step->device, mempat->name,
+										step->args.dtype, step->extraFlags,
+										(int)MNK, dims, &step->pgran, &time);
+		}
         /*
          * Disable blocking for implementations dealing with cache reads
          * from the global memory
diff --git a/src/library/blas/gens/asum.cpp b/src/library/blas/gens/asum.cpp
index 3260acbe..06b9f544 100644
--- a/src/library/blas/gens/asum.cpp
+++ b/src/library/blas/gens/asum.cpp
@@ -125,23 +125,23 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
     if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_DOT
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
 	if ( (kargs->dtype == TYPE_COMPLEX_FLOAT) || (kargs->dtype == TYPE_COMPLEX_DOUBLE))
 	{
-		strcat( buildOptStr, " -DCOMPLEX ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX");
 		#ifdef DEBUG_ASUM
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
     if( (kargs->ldb.vector) != 1) {
-        strcat( buildOptStr, " -DINCX_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
     }
     if( (kargs->ldb.vector) < 1) {
-        strcat( buildOptStr, " -DINCX_NEGATIVE ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NEGATIVE");
     }
 	return;
 }
diff --git a/src/library/blas/gens/axpy_reg.cpp b/src/library/blas/gens/axpy_reg.cpp
index 0f8ced01..52aab71f 100644
--- a/src/library/blas/gens/axpy_reg.cpp
+++ b/src/library/blas/gens/axpy_reg.cpp
@@ -125,16 +125,16 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_AXPY
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
 	if( (kargs->ldb.vector) != 1) {
-        strcat( buildOptStr, " -DINCX_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
     }
     if( (kargs->ldc.vector) != 1) {
-        strcat( buildOptStr, " -DINCY_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY");
     }
 
 	return;
diff --git a/src/library/blas/gens/copy_reg.cpp b/src/library/blas/gens/copy_reg.cpp
index d9f70951..ba1ff398 100644
--- a/src/library/blas/gens/copy_reg.cpp
+++ b/src/library/blas/gens/copy_reg.cpp
@@ -125,16 +125,16 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_COPY
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
 	if( (kargs->ldb.vector) != 1) {
-        strcat( buildOptStr, " -DINCX_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
     }
     if( (kargs->ldc.vector) != 1) {
-        strcat( buildOptStr, " -DINCY_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY");
     }
 
 	return;
diff --git a/src/library/blas/gens/dot.cpp b/src/library/blas/gens/dot.cpp
index 3f68221d..ed3e72b8 100644
--- a/src/library/blas/gens/dot.cpp
+++ b/src/library/blas/gens/dot.cpp
@@ -128,16 +128,16 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_DOT
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
     if( (kargs->ldb.vector) != 1) {
-        strcat( buildOptStr, " -DINCX_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
     }
     if( (kargs->ldc.vector) != 1) {
-        strcat( buildOptStr, " -DINCY_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY");
     }
 
 	return;
diff --git a/src/library/blas/gens/gbmv.cpp b/src/library/blas/gens/gbmv.cpp
index 115ffbc0..ab8e5e2a 100644
--- a/src/library/blas/gens/gbmv.cpp
+++ b/src/library/blas/gens/gbmv.cpp
@@ -116,7 +116,7 @@ setBuildOpts(
 
 	if ( (kargs->dtype == TYPE_DOUBLE) || (kargs->dtype == TYPE_COMPLEX_DOUBLE) )
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_GBMV
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
@@ -124,15 +124,15 @@ setBuildOpts(
 
     if( kargs->pigFuncID == CLBLAS_TBMV )
 	{
-		strcat( buildOptStr, " -DTBMV_ONLY ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DTBMV_ONLY");
 		if( kargs->diag == clblasUnit )
 		{
-		    strcat( buildOptStr, " -DUNIT_DIAG ");
+		    addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DUNIT_DIAG");
 		}
 	}
 	if( ((kargs->pigFuncID == CLBLAS_GBMV) || (kargs->pigFuncID == CLBLAS_TBMV)) && (kargs->transA == clblasConjTrans) )
 	{
-	    strcat( buildOptStr, " -DDO_CONJ ");
+	    addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDO_CONJ");
 	}
 
 	if( (kargs->pigFuncID == CLBLAS_SBMV) || (kargs->pigFuncID == CLBLAS_HBMV) )
@@ -141,15 +141,15 @@ setBuildOpts(
 	    isUpper = ( kargs->order == clblasColumnMajor )? !isUpper : isUpper;
 
 	    if( isUpper )
-	            strcat( buildOptStr, " -DGIVEN_SHBMV_UPPER ");
-	    else    strcat( buildOptStr, " -DGIVEN_SHBMV_LOWER ");
+	            addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DGIVEN_SHBMV_UPPER");
+	    else    addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DGIVEN_SHBMV_LOWER");
 
         if(kargs->pigFuncID == CLBLAS_HBMV)
         {
-            strcat( buildOptStr, " -DHBMV_ONLY ");
+            addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHBMV_ONLY");
             if( kargs->order == clblasColumnMajor )  // Since routine calls Row-major, the whole matrix has to be conjugated while loading
             {
-                strcat( buildOptStr, " -DDO_CONJ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDO_CONJ");
             }
         }
 	}
diff --git a/src/library/blas/gens/gemm_cached.cpp b/src/library/blas/gens/gemm_cached.cpp
index 09231f90..5c7c3526 100644
--- a/src/library/blas/gens/gemm_cached.cpp
+++ b/src/library/blas/gens/gemm_cached.cpp
@@ -158,36 +158,36 @@ setBuildOpts(
 
     if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
     {
-        strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
     }
 
     if (isComplexType(kargs->dtype))
     {
-        strcat(buildOptStr, " -DCOMPLEX ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX");
     }
 
     if ((bestSize.useBarrier) == 1)
     {
-	    strcat(buildOptStr, " -DGEMM_NEEDS_BARRIER ");
+	    addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DGEMM_NEEDS_BARRIER");
     }
 
     if (kargs->M % dims->y)
 	{
-		strcat(buildOptStr, " -DM_TAIL_PRESENT ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DM_TAIL_PRESENT");
     }
 
 	if (kargs->N % dims->x)
 	{
-		strcat(buildOptStr, " -DN_TAIL_PRESENT ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DN_TAIL_PRESENT");
 	}
 
     if (kflags & KEXTRA_CONJUGATE_A)
     {
-        strcat( buildOptStr, " -DCONJUGATE_A ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCONJUGATE_A");
     }
     if (kflags & KEXTRA_CONJUGATE_B)
     {
-        strcat( buildOptStr, " -DCONJUGATE_B ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCONJUGATE_B");
     }
 
     switch(kargs->pigFuncID)
@@ -201,46 +201,46 @@ setBuildOpts(
             #endif
             if (kargs->side == clblasLeft)
             {
-                strcat (buildOptStr, " -D__SYMM_LEFT__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LEFT__");
             }
             if (kargs->side == clblasRight)
             {
-                strcat (buildOptStr, " -D__SYMM_RIGHT__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_RIGHT__");
             }
             if (kargs->uplo == clblasLower)
             {
-                strcat(buildOptStr, " -D__SYMM_LOWER__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LOWER__");
             }
             if (kargs->uplo == clblasUpper)
             {
-                strcat(buildOptStr, " -D__SYMM_UPPER__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_UPPER__");
             }
             // Define the order for Legacy sake.
             if (kargs->order == clblasColumnMajor)
             {
-                strcat(buildOptStr, " -D__SYMM_COLMAJOR__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_COLMAJOR__");
             } else {
-                strcat(buildOptStr, " -D__SYMM_ROWMAJOR__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_ROWMAJOR__");
             }
             if ((kargs->pigFuncID == CLBLAS_SYMM_DIAGONAL) || (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL))
             {
-                strcat(buildOptStr, " -D__SYMM_DIAGONAL__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_DIAGONAL__");
             }
             if (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL)
             {
-                strcat(buildOptStr, " -D__HEMM__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__HEMM__");
             }
             break;
 
          case CLBLAS_HERK:
-            strcat( buildOptStr, " -DHERK");
+            addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK");
             if(kargs->uplo == clblasLower)
             {
-                strcat( buildOptStr, " -DHERK_LOWER_TRIANGLE");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK_LOWER_TRIANGLE");
             }
             else if(kargs->uplo == clblasUpper)
             {
-                strcat( buildOptStr, " -DHERK_UPPER_TRIANGLE");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK_UPPER_TRIANGLE");
             }
             break;
 
diff --git a/src/library/blas/gens/gemm_tail_cached.cpp b/src/library/blas/gens/gemm_tail_cached.cpp
index ea792499..ff144af9 100644
--- a/src/library/blas/gens/gemm_tail_cached.cpp
+++ b/src/library/blas/gens/gemm_tail_cached.cpp
@@ -96,10 +96,10 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
     KernelExtraFlags kflags = step->extraFlags;
 
-	strcat(buildOptStr, " -DTAIL_RUN -DM_TAIL_PRESENT -DN_TAIL_PRESENT ");
+	addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DTAIL_RUN -DM_TAIL_PRESENT -DN_TAIL_PRESENT");
     if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
     {
-        strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
         #ifdef DEBUG_GEMM_TAIL
         printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
         #endif
@@ -107,16 +107,16 @@ setBuildOpts(
 
     if (isComplexType(kargs->dtype))
     {
-        strcat(buildOptStr, " -DCOMPLEX ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX");
     }
 
     if (kflags & KEXTRA_CONJUGATE_A)
     {
-        strcat( buildOptStr, " -DCONJUGATE_A ");
-}
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCONJUGATE_A");
+    }
     if (kflags & KEXTRA_CONJUGATE_B)
     {
-        strcat( buildOptStr, " -DCONJUGATE_B ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCONJUGATE_B");
     }
 
 
@@ -127,14 +127,14 @@ setBuildOpts(
             break;
 
         case CLBLAS_HERK:
-            strcat( buildOptStr, " -DHERK");
+            addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK");
             if(kargs->uplo == clblasLower)
             {
-                strcat( buildOptStr, " -DHERK_LOWER_TRIANGLE");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK_LOWER_TRIANGLE");
             }
             else if(kargs->uplo == clblasUpper)
             {
-                strcat( buildOptStr, " -DHERK_UPPER_TRIANGLE");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK_UPPER_TRIANGLE");
             }
             break;
 
@@ -147,33 +147,34 @@ setBuildOpts(
             #endif
             if (kargs->side == clblasLeft)
             {
-                strcat (buildOptStr, " -D__SYMM_LEFT__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LEFT__");
             }
             if (kargs->side == clblasRight)
             {
-                strcat (buildOptStr, " -D__SYMM_RIGHT__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_RIGHT__");
             }
             if (kargs->uplo == clblasLower)
             {
-                strcat(buildOptStr, " -D__SYMM_LOWER__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LOWER__");
             }
             if (kargs->uplo == clblasUpper)
             {
-                strcat(buildOptStr, " -D__SYMM_UPPER__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_UPPER__");
             }
+            // Define the order for Legacy sake.
             if (kargs->order == clblasColumnMajor)
             {
-                strcat(buildOptStr, " -D__SYMM_COLMAJOR__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_COLMAJOR__");
             } else {
-                strcat(buildOptStr, " -D__SYMM_ROWMAJOR__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_ROWMAJOR__");
             }
-            if ((kargs->pigFuncID == CLBLAS_SYMM_DIAGONAL)  || (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL))
+            if ((kargs->pigFuncID == CLBLAS_SYMM_DIAGONAL) || (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL))
             {
-                strcat(buildOptStr, " -D__SYMM_DIAGONAL__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_DIAGONAL__");
             }
             if (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL)
             {
-                strcat(buildOptStr, " -D__HEMM__ ");
+                addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__HEMM__");
             }
             break;
 
diff --git a/src/library/blas/gens/ger_lds.cpp b/src/library/blas/gens/ger_lds.cpp
index b74945ea..f72d1975 100644
--- a/src/library/blas/gens/ger_lds.cpp
+++ b/src/library/blas/gens/ger_lds.cpp
@@ -137,7 +137,7 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 
 		#ifdef DEBUG_GER
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
@@ -282,8 +282,8 @@ generator(
 
 	BH = subdims->y;
 	BW = subdims->x;
-	sprintf( bhStr, "%d", BH );
-	sprintf( bwStr, "%d", BW );
+	sprintf( bhStr, "%" SPREFIX "u", BH );
+	sprintf( bwStr, "%" SPREFIX "u", BW );
 
 	#ifdef DEBUG_GER
     printf("BH = %s\n", bhStr);
diff --git a/src/library/blas/gens/her2_lds.cpp b/src/library/blas/gens/her2_lds.cpp
index a409c1ad..5adda19d 100644
--- a/src/library/blas/gens/her2_lds.cpp
+++ b/src/library/blas/gens/her2_lds.cpp
@@ -139,25 +139,25 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_COMPLEX_DOUBLE )
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_HER2
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
 	if( kargs->order == clblasRowMajor )
 	{
-		strcat( buildOptStr, " -DHER2_ROWMAJOR ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHER2_ROWMAJOR");
 		#ifdef DEBUG_HER2
 		printf("Setting build options ... HERMITIAN2_ROWMAJOR... for row-major support\n");
 		#endif
 	}
     if( kargs->pigFuncID == CLBLAS_HPR2 )
     {
-        strcat( buildOptStr, " -DPACKED ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
     }
 
 	//Build options for syr2_her2.clT to generate HER2 related code.
-	strcat( buildOptStr, " -DHER2_ONLY ");
+	addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHER2_ONLY");
 	return;
 }
 
@@ -301,7 +301,7 @@ generator(
 	}
 	kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD);
 
-	sprintf( targetRows, "%d", TARGETROWS );
+	sprintf( targetRows, "%" SPREFIX "u", TARGETROWS );
 	sprintf( blockSize, "%d", BLOCKSIZE );
 
 	#ifdef DEBUG_HER2
diff --git a/src/library/blas/gens/her_lds.cpp b/src/library/blas/gens/her_lds.cpp
index e174de2d..1a8365f0 100644
--- a/src/library/blas/gens/her_lds.cpp
+++ b/src/library/blas/gens/her_lds.cpp
@@ -139,25 +139,25 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_COMPLEX_DOUBLE )
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_HER
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
 	if( kargs->order == clblasRowMajor )
 	{
-		strcat( buildOptStr, " -DHERMITIAN_ROWMAJOR ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERMITIAN_ROWMAJOR");
 		#ifdef DEBUG_HER
 		printf("Setting build options ... HERMITIAN_ROWMAJOR... for row-major support\n");
 		#endif
 	}
     if( kargs->pigFuncID == CLBLAS_HPR )
     {
-        strcat( buildOptStr, " -DPACKED ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
     }
 
 	//Build options for syr_her.clT to generate HER related code.
-	strcat( buildOptStr, " -DHER_ONLY ");
+	addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHER_ONLY");
 	return;
 }
 
@@ -300,7 +300,7 @@ generator(
 	}
 	kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD);
 
-	sprintf( targetRows, "%d", TARGETROWS );
+	sprintf( targetRows, "%" SPREFIX "u", TARGETROWS );
 	sprintf( blockSize, "%d", BLOCKSIZE );
 
 	#ifdef DEBUG_HER
diff --git a/src/library/blas/gens/iamax.cpp b/src/library/blas/gens/iamax.cpp
index bf20afd0..7a5966de 100644
--- a/src/library/blas/gens/iamax.cpp
+++ b/src/library/blas/gens/iamax.cpp
@@ -124,7 +124,7 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_AMAX
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
@@ -132,17 +132,17 @@ setBuildOpts(
 
     if( (kargs->ldb.vector) != 1)
     {
-        strcat( buildOptStr, " -DINCX_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
     }
 
     if( (kargs->ldb.vector) < 1)
     {
-        strcat( buildOptStr, " -DRETURN_ON_INVALID ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DRETURN_ON_INVALID");
     }
 
     if( (kargs->redctnType == REDUCE_MAX_WITH_INDEX_ATOMICS))
     {
-        strcat( buildOptStr, " -DREDUCE_MAX_WITH_INDEX_ATOMICS ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_MAX_WITH_INDEX_ATOMICS");
     }
 
 	return;
diff --git a/src/library/blas/gens/kprintf.cpp b/src/library/blas/gens/kprintf.cpp
index 54772fa2..d5cbecb8 100644
--- a/src/library/blas/gens/kprintf.cpp
+++ b/src/library/blas/gens/kprintf.cpp
@@ -346,7 +346,7 @@ char* kprintf::mystrtok( char* in, const char* tok)
             bool tokenFound = false;
             for( size_t i=0 ; i <= (strlen(tok) - 1); i++)
             {
-                if ((*strtokPtr == tok[i]))
+                if (*strtokPtr == tok[i])
                 {
                     if ( tok[i] == '(')
                     {
diff --git a/src/library/blas/gens/legacy/tests/CMakeLists.txt b/src/library/blas/gens/legacy/tests/CMakeLists.txt
index 9c5a0f37..fae11cc5 100644
--- a/src/library/blas/gens/legacy/tests/CMakeLists.txt
+++ b/src/library/blas/gens/legacy/tests/CMakeLists.txt
@@ -45,19 +45,11 @@ include_directories(${OPENCL_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_
 
 add_executable(t_blkmul ${SRC_BLKMUL})
 target_link_libraries(t_blkmul ${OPENCL_LIBRARIES})
+set_target_properties( t_blkmul PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
-if( TARGET_PLATFORM EQUAL 64 )
-	# CPack configuration; include the executable into the package
-	install( TARGETS t_blkmul
-			RUNTIME DESTINATION bin64
-			LIBRARY DESTINATION lib64
-			ARCHIVE DESTINATION lib64/import
-			)
-else()
-	# CPack configuration; include the executable into the package
-	install( TARGETS t_blkmul
-			RUNTIME DESTINATION bin32
-			LIBRARY DESTINATION lib32
-			ARCHIVE DESTINATION lib32/import
-			)
-endif()
+# CPack configuration; include the executable into the package
+install( TARGETS t_blkmul
+		RUNTIME DESTINATION bin${SUFFIX_BIN}
+		LIBRARY DESTINATION lib${SUFFIX_LIB}
+		ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+		)
diff --git a/src/library/blas/gens/legacy/tests/t_blkmul.c b/src/library/blas/gens/legacy/tests/t_blkmul.c
index 4983ce0d..590231ee 100644
--- a/src/library/blas/gens/legacy/tests/t_blkmul.c
+++ b/src/library/blas/gens/legacy/tests/t_blkmul.c
@@ -15,7 +15,11 @@
  * ************************************************************************/
 
 
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
 #include <CL/cl.h>
+#endif
 #include <string.h>
 #include <stdlib.h>
 #include <math.h>
diff --git a/src/library/blas/gens/nrm2.cpp b/src/library/blas/gens/nrm2.cpp
index 832f5e41..d898ffbc 100644
--- a/src/library/blas/gens/nrm2.cpp
+++ b/src/library/blas/gens/nrm2.cpp
@@ -128,22 +128,22 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 	}
     if( (kargs->dtype == TYPE_COMPLEX_FLOAT) || (kargs->dtype == TYPE_COMPLEX_DOUBLE) ) {
-        strcat( buildOptStr, " -DCOMPLEX ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX");
     }
     if(kargs->redctnType == REDUCE_BY_HYPOT) {
-            strcat( buildOptStr, "-DUSE_HYPOT ");
+            addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DUSE_HYPOT");
     } else if(kargs->redctnType == REDUCE_BY_SSQ) {
-            strcat( buildOptStr, " -DUSE_SSQ ");
+            addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DUSE_SSQ");
     }
 
     if( (kargs->ldb.vector) != 1) {
-        strcat( buildOptStr, " -DINCX_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
     }
     if( (kargs->ldb.vector) < 1) {
-        strcat( buildOptStr, " -DRETURN_ON_INVALID");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DRETURN_ON_INVALID");
     }
 	return;
 }
diff --git a/src/library/blas/gens/reduction.cpp b/src/library/blas/gens/reduction.cpp
index 1c81c0b7..5c005280 100644
--- a/src/library/blas/gens/reduction.cpp
+++ b/src/library/blas/gens/reduction.cpp
@@ -130,29 +130,29 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 	}
     switch(kargs->redctnType)
     {
-        case REDUCE_BY_SUM:                 strcat( buildOptStr, "-DREDUCE_BY_SUM ");
+        case REDUCE_BY_SUM:                 addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_BY_SUM");
                                             break;
 
-        case REDUCE_BY_MAX:                 strcat( buildOptStr, "-DREDUCE_BY_MAX ");
+        case REDUCE_BY_MAX:                 addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_BY_MAX");
                                             break;
 
-        case REDUCE_BY_MIN:                 strcat( buildOptStr, "-DREDUCE_BY_MIN ");
+        case REDUCE_BY_MIN:                 addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_BY_MIN");
                                             break;
 
-        case REDUCE_MAX_WITH_INDEX:         strcat( buildOptStr, "-DREDUCE_MAX_WITH_INDEX ");
+        case REDUCE_MAX_WITH_INDEX:         addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_MAX_WITH_INDEX");
                                             break;
 
-        case REDUCE_BY_HYPOT:               strcat( buildOptStr, "-DREDUCE_BY_HYPOT ");
+        case REDUCE_BY_HYPOT:               addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_BY_HYPOT");
                                             break;
 
-        case REDUCE_BY_SSQ:                 strcat( buildOptStr, "-DREDUCE_BY_SSQ ");
+        case REDUCE_BY_SSQ:                 addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_BY_SSQ");
                                             break;
 
-        case REDUCE_MAX_WITH_INDEX_ATOMICS: strcat( buildOptStr, "-DREDUCE_MAX_WITH_INDEX_ATOMICS ");
+        case REDUCE_MAX_WITH_INDEX_ATOMICS: addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_MAX_WITH_INDEX_ATOMICS");
                                             break;
 
         default:                            printf("Invalid reduction type!!\n");
diff --git a/src/library/blas/gens/rotg_reg.cpp b/src/library/blas/gens/rotg_reg.cpp
index 0ec1eb0a..4d1ded18 100644
--- a/src/library/blas/gens/rotg_reg.cpp
+++ b/src/library/blas/gens/rotg_reg.cpp
@@ -98,10 +98,10 @@ setBuildOpts(
 	const SolutionStep *step = (const SolutionStep *)args;
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( (kargs->dtype == TYPE_DOUBLE) || (kargs->dtype == TYPE_COMPLEX_DOUBLE) ) {
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 	}
 	if( (kargs->dtype == TYPE_COMPLEX_FLOAT) || (kargs->dtype == TYPE_COMPLEX_DOUBLE) ) {
-	    strcat( buildOptStr, " -DCOMPLEX ");
+	    addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX");
 	}
 
 	return;
diff --git a/src/library/blas/gens/rotm_reg.cpp b/src/library/blas/gens/rotm_reg.cpp
index 2b044192..2b87507e 100644
--- a/src/library/blas/gens/rotm_reg.cpp
+++ b/src/library/blas/gens/rotm_reg.cpp
@@ -121,17 +121,17 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 	}
 	if(kargs->pigFuncID == CLBLAS_ROT)
 	{
-	    strcat( buildOptStr, " -DDO_ROT ");
+	    addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDO_ROT");
 	}
 	if( (kargs->ldb.vector) != 1) {
-        strcat( buildOptStr, " -DINCX_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
     }
     if( (kargs->ldc.vector) != 1) {
-        strcat( buildOptStr, " -DINCY_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY");
     }
 
 	return;
diff --git a/src/library/blas/gens/rotmg_reg.cpp b/src/library/blas/gens/rotmg_reg.cpp
index b256ac6f..7c333c6f 100644
--- a/src/library/blas/gens/rotmg_reg.cpp
+++ b/src/library/blas/gens/rotmg_reg.cpp
@@ -97,7 +97,7 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 	}
 
 	return;
diff --git a/src/library/blas/gens/scal_reg.cpp b/src/library/blas/gens/scal_reg.cpp
index d82362b1..8b853106 100644
--- a/src/library/blas/gens/scal_reg.cpp
+++ b/src/library/blas/gens/scal_reg.cpp
@@ -125,13 +125,13 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_SCAL
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
 	if( (kargs->ldb.vector) != 1) {
-        strcat( buildOptStr, " -DINCX_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
     }
 
 	return;
diff --git a/src/library/blas/gens/swap_reg.cpp b/src/library/blas/gens/swap_reg.cpp
index 5b44cebe..b75e1004 100644
--- a/src/library/blas/gens/swap_reg.cpp
+++ b/src/library/blas/gens/swap_reg.cpp
@@ -125,16 +125,16 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_SWAP
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
 	if( (kargs->ldb.vector) != 1) {
-        strcat( buildOptStr, " -DINCX_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
     }
     if( (kargs->ldc.vector) != 1) {
-        strcat( buildOptStr, " -DINCY_NONUNITY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY");
     }
 
 	return;
diff --git a/src/library/blas/gens/symm_cached.cpp b/src/library/blas/gens/symm_cached.cpp
index cc8c0350..0d9ea8d3 100644
--- a/src/library/blas/gens/symm_cached.cpp
+++ b/src/library/blas/gens/symm_cached.cpp
@@ -99,7 +99,7 @@ setBuildOpts(
 
     if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
     {
-        strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
         #ifdef DEBUG_TRMV
         printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
         #endif
@@ -107,23 +107,23 @@ setBuildOpts(
 
 	if (kargs->side == clblasLeft)
 	{
-		strcat(buildOptStr, " -D__SYMM_LEFT__ ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LEFT__");
 	} else {
-		strcat(buildOptStr, " -D__SYMM_RIGHT__ ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_RIGHT__");
 	}
 
 	if (kargs->uplo == clblasUpper)
 	{
-		strcat(buildOptStr, " -D__SYMM_UPPER__ ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_UPPER__");
 	} else {
-		strcat(buildOptStr, " -D__SYMM_LOWER__ ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LOWER__");
 	}
 
 	if (kargs->order == clblasColumnMajor)
 	{
-		strcat(buildOptStr, " -D__SYMM_COLMAJOR__ ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_COLMAJOR__");
 	} else {
-		strcat(buildOptStr, " -D__SYMM_ROWMAJOR__ ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_ROWMAJOR__");
 	}
 
 	strcat(buildOptStr, " -cl-mad-enable ");
@@ -193,10 +193,10 @@ generator(
 		printf("WARNING: SYMM- generator: subdim and blocksize in-compatible.\n");
 	}
 
-	sprintf(width, "%d", Y);
-	sprintf(itemy, "%lu", ITEMY);
-	sprintf(itemx, "%lu", ITEMX);
-	sprintf(itemy_by_width, "%lu", (size_t) ITEMY/kextra->vecLenA);
+	sprintf(width, "%" SPREFIX "u", Y);
+	sprintf(itemy, "%" SPREFIX "u", ITEMY);
+	sprintf(itemx, "%" SPREFIX "u", ITEMX);
+	sprintf(itemy_by_width, "%" SPREFIX "u", (size_t) ITEMY/kextra->vecLenA);
 
 	kobj.put("%WIDTH", width);
 	kobj.put("%ITEMX", itemx);
diff --git a/src/library/blas/gens/syr2_lds.cpp b/src/library/blas/gens/syr2_lds.cpp
index 9fccb059..f5c20cb1 100644
--- a/src/library/blas/gens/syr2_lds.cpp
+++ b/src/library/blas/gens/syr2_lds.cpp
@@ -139,14 +139,14 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE )
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_SYR2
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
     if( kargs->pigFuncID == CLBLAS_SPR2 )
     {
-        strcat( buildOptStr, " -DPACKED ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
     }
 
 	return;
@@ -308,7 +308,7 @@ generator(
 	}
 	kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD);
 
-	sprintf( targetRows, "%d", TARGETROWS );
+	sprintf( targetRows, "%" SPREFIX "u", TARGETROWS );
 	sprintf( blockSize, "%d", BLOCKSIZE );
 
 	#ifdef DEBUG_SYR2
diff --git a/src/library/blas/gens/syr_lds.cpp b/src/library/blas/gens/syr_lds.cpp
index 0a12ef4e..16911bb4 100644
--- a/src/library/blas/gens/syr_lds.cpp
+++ b/src/library/blas/gens/syr_lds.cpp
@@ -142,14 +142,14 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE )
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_SYR
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
     if( kargs->pigFuncID == CLBLAS_SPR )
     {
-        strcat( buildOptStr, " -DPACKED ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
     }
 
 	return;
@@ -308,7 +308,7 @@ generator(
 	}
 	kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD);
 
-	sprintf( targetRows, "%d", TARGETROWS );
+	sprintf( targetRows, "%" SPREFIX "u", TARGETROWS );
 	sprintf( blockSize, "%d", BLOCKSIZE );
 
 	#ifdef DEBUG_SYR
diff --git a/src/library/blas/gens/syrxk.c b/src/library/blas/gens/syrxk.c
index a0f6a295..54574ed2 100644
--- a/src/library/blas/gens/syrxk.c
+++ b/src/library/blas/gens/syrxk.c
@@ -21,6 +21,7 @@
 
 #include <string.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <assert.h>
 
 #include <clBLAS.h>
@@ -1219,10 +1220,11 @@ genUpdateGenericDiagTile(
     // type of the vectorized coordinates
     Kstring vctype;
     Kstring constOffs, constShifts, constMasks;
-    unsigned int i, j, nops;
+    unsigned int i, j, nops,size;
     unsigned int maxFetches = 0;
     const char *yname, *xname;
     const char *ldcName;
+	char hexadec[2];
 
     batch = createStmtBatch();
     if (batch == NULL) {
@@ -1253,6 +1255,14 @@ genUpdateGenericDiagTile(
     tifl = (isUpper) ? TILE_ITER_BACKWARD_ROWS :
                        TILE_ITER_BACKWARD_COLS;
     iterInit(&iter, &tileTempC, 1, tifl);
+    nops = 0;
+    size = 0;   /* keep size defined even when the iterator starts at its end */
+    while (!iterIsEnd(&iter)) {
+        nops++;
+        size = nops / nrCols;
+        iterIterate(&iter);
+    }
+    iterInit(&iter, &tileTempC, 1, tifl);   /* the counting pass ran iter to its end; start over */
 
     initTmpResTile(&tileTempC, gset, true);
 
@@ -1316,7 +1326,7 @@ genUpdateGenericDiagTile(
     maxFetches = umin(maxFetches, i);
 
     // declare vectorized coordinates
-    declareDiagUpresIndexedVars(ctx, vctype.buf, "cc", tempRows);
+    declareDiagUpresIndexedVars(ctx, vctype.buf, "cc", size);
 
     /*
      * real y coordinate, offset mask and
@@ -1326,8 +1336,8 @@ genUpdateGenericDiagTile(
                      "unsigned int mask;\n"
                      "int hit;\n");
     if (withBeta) {
-        declareDiagUpresIndexedVars(ctx, typeName, "alphaNew", tempRows);
-        declareDiagUpresIndexedVars(ctx, typeName, "betaNew", tempRows);
+        declareDiagUpresIndexedVars(ctx, typeName, "alphaNew", size);
+        declareDiagUpresIndexedVars(ctx, typeName, "betaNew", size);
     }
 
     // declare tile
@@ -1401,7 +1411,7 @@ genUpdateGenericDiagTile(
                                 "cc%u = ((%s)mask &\n"
                                 "       %s) >>\n"
                                 "      %s;\n"
-                                "cc%u = %u - mad24(cc%u, %s, 0);\n",
+                                "cc%u = %u - mad24(cc%u, %s, 0u);\n",
 
                                 iter.row,
                                 (1 << (nrCols - 1)),
@@ -1416,7 +1426,7 @@ genUpdateGenericDiagTile(
                                 "cc%u = ((%s)mask &\n"
                                 "       %s) >>\n"
                                 "      %s;\n"
-                                "cc%u = mad24(cc%u, %s, 0);\n",
+                                "cc%u = mad24(cc%u, %s, 0u);\n",
 
                                 nrRows - 1, iter.row,
                                 i, vctype.buf, constMasks.buf, constShifts.buf,
@@ -1443,7 +1453,9 @@ genUpdateGenericDiagTile(
             ksprintf(&kstr, "cc%u", i);
         }
         else {
-            ksprintf(&kstr, "cc%u.s%u", i, iter.col);
+            /* vector component suffixes are hexadecimal (s0..sf), not decimal */
+            snprintf(hexadec, sizeof(hexadec), "%x", iter.col);
+            ksprintf(&kstr, "cc%u.s%s", i, hexadec);
         }
 
         // prepare multipliers and fetch
diff --git a/src/library/blas/gens/tests/CMakeLists.txt b/src/library/blas/gens/tests/CMakeLists.txt
index f945b1eb..6d10e3fe 100644
--- a/src/library/blas/gens/tests/CMakeLists.txt
+++ b/src/library/blas/gens/tests/CMakeLists.txt
@@ -42,19 +42,11 @@ include_directories(${OPENCL_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_
 
 add_executable(t_tilemul ${SRC_TILEMUL})
 target_link_libraries(t_tilemul ${OPENCL_LIBRARIES})
+set_target_properties( t_tilemul PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
-if( TARGET_PLATFORM EQUAL 64 )
-	# CPack configuration; include the executable into the package
-	install( TARGETS t_tilemul
-			RUNTIME DESTINATION bin64
-			LIBRARY DESTINATION lib64
-			ARCHIVE DESTINATION lib64/import
-			)
-else()
-	# CPack configuration; include the executable into the package
-	install( TARGETS t_tilemul
-			RUNTIME DESTINATION bin32
-			LIBRARY DESTINATION lib32
-			ARCHIVE DESTINATION lib32/import
-			)
-endif()
+# CPack configuration; include the executable into the package
+install( TARGETS t_tilemul
+		RUNTIME DESTINATION bin${SUFFIX_BIN}
+		LIBRARY DESTINATION lib${SUFFIX_LIB}
+		ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+		)
diff --git a/src/library/blas/gens/tests/t_tilemul.c b/src/library/blas/gens/tests/t_tilemul.c
index ba4b49c9..4b4dd803 100644
--- a/src/library/blas/gens/tests/t_tilemul.c
+++ b/src/library/blas/gens/tests/t_tilemul.c
@@ -14,8 +14,11 @@
  * limitations under the License.
  * ************************************************************************/
 
-
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
 #include <CL/cl.h>
+#endif
 #include <string.h>
 #include <stdlib.h>
 #include <assert.h>
diff --git a/src/library/blas/gens/trmv_reg.cpp b/src/library/blas/gens/trmv_reg.cpp
index 28ee1f26..9cacd0f1 100644
--- a/src/library/blas/gens/trmv_reg.cpp
+++ b/src/library/blas/gens/trmv_reg.cpp
@@ -136,28 +136,28 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_TRMV
 		printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
     if( (step->funcID == CLBLAS_HEMV) || (kargs->pigFuncID == CLBLAS_HPMV) || (kargs->pigFuncID == CLBLAS_SPMV) )
 	{
-		strcat( buildOptStr, " -DHEMV_ONLY ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHEMV_ONLY");
 		/*
 		if(kargs->diag == clblasUnit)
 		{
-			strcat( buildOptStr, " -DHEMV_ZERO_DIAG ");
+			addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHEMV_ZERO_DIAG");
 		}
 		*/
 	}
     if ( kargs->pigFuncID == CLBLAS_SPMV )
     {
-        strcat( buildOptStr, " -DSPMV_ONLY ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DSPMV_ONLY");
     }
     if( (kargs->pigFuncID == CLBLAS_TPMV) || (kargs->pigFuncID == CLBLAS_HPMV) || (kargs->pigFuncID == CLBLAS_SPMV) )
     {
-        strcat( buildOptStr, " -DPACKED ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
     }
 
 	return;
@@ -381,8 +381,8 @@ generator(
 	}
 	kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD);
 
-    sprintf( targetRows, "%d", TARGETROWS );
-	sprintf( blockSize, "%d", BLOCKSIZE );
+    sprintf( targetRows, "%" SPREFIX "u", TARGETROWS );
+	sprintf( blockSize, "%" SPREFIX "u", BLOCKSIZE );
 
 	#ifdef DEBUG_TRMV
     printf("TARGET ROWS = %s\n", targetRows);
diff --git a/src/library/blas/gens/trsv_gemv.cpp b/src/library/blas/gens/trsv_gemv.cpp
index 49d5371b..ca73fbe5 100644
--- a/src/library/blas/gens/trsv_gemv.cpp
+++ b/src/library/blas/gens/trsv_gemv.cpp
@@ -128,14 +128,14 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
 	if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
 	{
-		strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+		addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
 		#ifdef DEBUG_TRSV_GEMV
 		printf("TRSV GEMV: Setting build options ... Double... for DOUBLE PRECISION support\n");
 		#endif
 	}
     if( kargs->pigFuncID == CLBLAS_TPSV)
     {
-        strcat( buildOptStr, " -DPACKED ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
         #ifdef DEBUG_TRSV_GEMV
             printf("TPSV GEMV: Setting build options ... PACKED\n");
         #endif
@@ -415,9 +415,9 @@ generator(
 		{
 			return 0;
 		}
-        sprintf( TARGETHEIGHT_S, "%d", TARGETHEIGHT );
+        sprintf( TARGETHEIGHT_S, "%" SPREFIX "u", TARGETHEIGHT );
 	    sprintf( BLOCKSIZE_S, "%d", BLOCKSIZE );
-        sprintf( TRIANGLE_HEIGHT_S, "%d", subdims->y );
+        sprintf( TRIANGLE_HEIGHT_S, "%" SPREFIX "u", subdims->y );
 
 		kobj.put("%TARGET_HEIGHT", TARGETHEIGHT_S);
 		kobj.put("%BLOCKSIZE", BLOCKSIZE_S);
@@ -433,9 +433,9 @@ generator(
 		{
 			return 0;
 		}
-        sprintf( TARGETROWS_S, "%d", TARGETROWS );
-	    sprintf( TARGETWIDTH_S, "%d", TARGETWIDTH );
-        sprintf( NLOOPS_S, "%d", NLOOPS );
+        sprintf( TARGETROWS_S, "%" SPREFIX "u", TARGETROWS );
+	    sprintf( TARGETWIDTH_S, "%" SPREFIX "u", TARGETWIDTH );
+        sprintf( NLOOPS_S, "%" SPREFIX "u", NLOOPS );
 		kobj.put("%TARGET_ROWS", TARGETROWS_S);
 		kobj.put("%TARGET_WIDTH", TARGETWIDTH_S);
 		kobj.put("%NLOOPS", NLOOPS_S);
diff --git a/src/library/blas/gens/trsv_trtri.cpp b/src/library/blas/gens/trsv_trtri.cpp
index 071565ff..0bae0f99 100644
--- a/src/library/blas/gens/trsv_trtri.cpp
+++ b/src/library/blas/gens/trsv_trtri.cpp
@@ -128,21 +128,21 @@ setBuildOpts(
     const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
     if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
     {
-        strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
         #ifdef DEBUG_TRSV_TRTRI
         printf("TRSV TRTRI: Setting build options ... Double... for DOUBLE PRECISION support\n");
         #endif
     }
     if( kargs->pigFuncID == CLBLAS_TPSV)
     {
-        strcat( buildOptStr, " -DPACKED ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
         #ifdef DEBUG_TRSV_TRTRI
             printf("TPSV TRTRI: Setting build options ... PACKED\n");
         #endif
     }
     if( kargs->pigFuncID == CLBLAS_TBSV)
     {
-        strcat( buildOptStr, " -DBANDED ");
+        addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DBANDED");
         #ifdef DEBUG_TRSV_TRTRI
         printf("TBSV TRTRI: Setting build options .. BANDED\n");
         #endif
diff --git a/src/library/blas/include/clblas-internal.h b/src/library/blas/include/clblas-internal.h
index 81ab5127..7a9afcdc 100644
--- a/src/library/blas/include/clblas-internal.h
+++ b/src/library/blas/include/clblas-internal.h
@@ -240,6 +240,11 @@ setupBuildOpts(
     cl_device_id devID,
     MemoryPattern *mempat);
 
+void addBuildOpt(
+    char * opts,
+    size_t len,
+    const char * option);
+
 // Internal scatter image API
 
 int
diff --git a/src/library/blas/init.c b/src/library/blas/init.c
index 5095cb0f..2b257a8e 100644
--- a/src/library/blas/init.c
+++ b/src/library/blas/init.c
@@ -18,7 +18,7 @@
 #include <clBLAS.h>
 #include <toolslib.h>
 #include <kern_cache.h>
-#include <version.h>
+#include <clBLAS.version.h>
 #include <trace_malloc.h>
 
 #include "clblas-internal.h"
diff --git a/src/library/blas/xaxpy.c b/src/library/blas/xaxpy.c
index 7499c414..d57b4c23 100644
--- a/src/library/blas/xaxpy.c
+++ b/src/library/blas/xaxpy.c
@@ -60,11 +60,11 @@ doAxpy(
 
 		// Check wheather enough memory was allocated
 
-		if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
 			printf("Invalid Size for X\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
 			printf("Invalid Size for Y\n");
             return retCode;
 		}
diff --git a/src/library/blas/xcopy.c b/src/library/blas/xcopy.c
index e0ea2a03..8e375976 100644
--- a/src/library/blas/xcopy.c
+++ b/src/library/blas/xcopy.c
@@ -60,11 +60,11 @@ doCopy(
 
 		// Check wheather enough memory was allocated
 
-		if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
 			printf("Invalid Size for X\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
 			printf("Invalid Size for Y\n");
             return retCode;
 		}
diff --git a/src/library/blas/xdot.c b/src/library/blas/xdot.c
index f29cdb6f..67bf4cd2 100644
--- a/src/library/blas/xdot.c
+++ b/src/library/blas/xdot.c
@@ -67,20 +67,20 @@ doDot(
 
 		// Check wheather enough memory was allocated
 
-		if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
 			printf("Invalid Size for X\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
 			printf("Invalid Size for Y\n");
             return retCode;
 		}
 		// Minimum size of scratchBuff is N
-		if (retCode = checkVectorSizes(kargs->dtype, N, scratchBuff, 0, 1, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, scratchBuff, 0, 1, X_VEC_ERRSET))) {
 			printf("Insufficient ScratchBuff\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, 1, dotProduct, offDP, 1, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, 1, dotProduct, offDP, 1, Y_VEC_ERRSET))) {
 			printf("Invalid Size for dotProduct\n");
             return retCode;
 		}
diff --git a/src/library/blas/xgemm2.c b/src/library/blas/xgemm2.c
index 0a5ae436..2bba00ae 100644
--- a/src/library/blas/xgemm2.c
+++ b/src/library/blas/xgemm2.c
@@ -209,18 +209,18 @@ doGemm(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET)) {
+    if ((retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET))) {
         return retCode;
     }
     if (K != 0) {
-        if (retCode = checkMatrixSizes(kargs->dtype, order, transA, M, K, A, offA, lda, A_MAT_ERRSET )) {
+        if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, M, K, A, offA, lda, A_MAT_ERRSET))) {
             return retCode;
         }
-        if (retCode = checkMatrixSizes(kargs->dtype, order, transB, K, N, B, offB, ldb, B_MAT_ERRSET )) {
+        if ((retCode = checkMatrixSizes(kargs->dtype, order, transB, K, N, B, offB, ldb, B_MAT_ERRSET))) {
             return retCode;
         }
     }
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, C, offC, ldc, C_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, C, offC, ldc, C_MAT_ERRSET))) {
             return retCode;
     }
 
diff --git a/src/library/blas/xger.c b/src/library/blas/xger.c
index 92d4b311..c9e9e1c9 100644
--- a/src/library/blas/xger.c
+++ b/src/library/blas/xger.c
@@ -58,23 +58,23 @@ doGer(
 
 		/* Validate arguments */
 
-		if (retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET )) {
+		if ((retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
 			printf("Invalid mem object..\n");
             return retCode;
 		}
 
 		// Check wheather enough memory was allocated
 
-		if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, A, offa, lda, A_MAT_ERRSET )) {
+		if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, A, offa, lda, A_MAT_ERRSET))) {
 
 			printf("Invalid Size for A %d\n",retCode );
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, M, X, offx, incx, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, M, X, offx, incx, X_VEC_ERRSET))) {
 			printf("Invalid Size for X\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
 			printf("Invalid Size for Y\n");
             return retCode;
 		}
diff --git a/src/library/blas/xhemv.c b/src/library/blas/xhemv.c
index 0db6a8f9..21011dd7 100644
--- a/src/library/blas/xhemv.c
+++ b/src/library/blas/xhemv.c
@@ -54,17 +54,17 @@ doHemv(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET )) {
+    if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A,
-                         offA, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N,
+                                    A, offA, lda, A_MAT_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET))) {
         return retCode;
     }
 	if ((commandQueues == NULL) || (numCommandQueues == 0))
diff --git a/src/library/blas/xher.c b/src/library/blas/xher.c
index af36962b..7131057c 100644
--- a/src/library/blas/xher.c
+++ b/src/library/blas/xher.c
@@ -56,16 +56,16 @@ doher(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, X, 0, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET )) {
+    if ((retCode = checkMemObjects(A, X, 0, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET))) {
    		printf("Invalid mem object..\n");
         return retCode;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET))) {
         printf("Invalid Size for A\n");
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET)) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
         printf("Invalid Size for X\n");
         return retCode;
     }
diff --git a/src/library/blas/xher2.c b/src/library/blas/xher2.c
index cb676592..21a8ddcf 100644
--- a/src/library/blas/xher2.c
+++ b/src/library/blas/xher2.c
@@ -59,21 +59,21 @@ doHer2(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET )) {
+    if ((retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
         printf("Invalid mem object..\n");
         return retCode;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET))) {
         printf("Invalid Size for A\n");
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
         printf("Invalid Size for X\n");
         return retCode;
     }
 
-	if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+	if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
         printf("Invalid Size for Y\n");
         return retCode;
     }
diff --git a/src/library/blas/xher2k.c b/src/library/blas/xher2k.c
index 302a648b..4c3d2f2a 100644
--- a/src/library/blas/xher2k.c
+++ b/src/library/blas/xher2k.c
@@ -71,7 +71,7 @@ doHer2k(
     }
 
     // Validate arguments
-    if (retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET )) {
+    if ((retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET))) {
         return retCode;
     }
 
@@ -79,15 +79,15 @@ doHer2k(
         return clblasInvalidValue;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offa, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offa, lda, A_MAT_ERRSET))) {
         return retCode;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, B, offb, ldb, B_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, B, offb, ldb, B_MAT_ERRSET))) {
         return retCode;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, C, offc, ldc, C_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, C, offc, ldc, C_MAT_ERRSET))) {
         return retCode;
     }
 
diff --git a/src/library/blas/xherk.c b/src/library/blas/xherk.c
index 18d1fb4d..b4f409d7 100644
--- a/src/library/blas/xherk.c
+++ b/src/library/blas/xherk.c
@@ -64,7 +64,7 @@ doHerk(
     }
 
     // Validate arguments
-    if (retCode = checkMemObjects(A, C, NULL, false, A_MAT_ERRSET, C_MAT_ERRSET, END_ERRSET )) {
+    if ((retCode = checkMemObjects(A, C, NULL, false, A_MAT_ERRSET, C_MAT_ERRSET, END_ERRSET))) {
         return retCode;
     }
 
@@ -72,11 +72,11 @@ doHerk(
         return clblasInvalidValue;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offA, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offA, lda, A_MAT_ERRSET))) {
         return retCode;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, false, N, N, C, offC, ldc, C_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, C, offC, ldc, C_MAT_ERRSET))) {
         return retCode;
     }
 
diff --git a/src/library/blas/xhpmv.c b/src/library/blas/xhpmv.c
index 991819c4..1f0fe67b 100644
--- a/src/library/blas/xhpmv.c
+++ b/src/library/blas/xhpmv.c
@@ -53,17 +53,17 @@ doHpmv(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(AP, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET)) {
+    if ((retCode = checkMemObjects(AP, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, AP,
-                         offa, 0, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N,
+                                    AP, offa, 0, A_MAT_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
         return retCode;
     }
 	if ((commandQueues == NULL) || (numCommandQueues == 0))
@@ -182,4 +182,4 @@ clblasZhpmv(
     return doHpmv(&kargs, order, uplo, N, AP, offa, X, offx, incx,
                   Y, offy, incy, numCommandQueues, commandQueues,
                   numEventsInWaitList, eventWaitList, events);
-}
\ No newline at end of file
+}
diff --git a/src/library/blas/xrot.c b/src/library/blas/xrot.c
index 7fd981bc..d07ec87d 100644
--- a/src/library/blas/xrot.c
+++ b/src/library/blas/xrot.c
@@ -58,11 +58,11 @@ doRot(
 
 		// Check wheather enough memory was allocated
 
-		if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
 			printf("Invalid Size for X\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
 			printf("Invalid Size for Y\n");
             return retCode;
 		}
diff --git a/src/library/blas/xrotg.c b/src/library/blas/xrotg.c
index fb9c8e1b..e4971480 100644
--- a/src/library/blas/xrotg.c
+++ b/src/library/blas/xrotg.c
@@ -14,10 +14,6 @@
  * limitations under the License.
  * ************************************************************************/
 
-/***********************************************************************
-**  Rotgright (C) 2011 Advanced Micro Devices, Inc. All Rights Reserved.
-***********************************************************************/
-
 //#define DEBUG_ROTG
 
 #include <stdio.h>
@@ -73,21 +69,21 @@ doRotg(
 
 		// Check wheather enough memory was allocated
 
-		if (retCode = checkVectorSizes(kargs->dtype, 1, A, offA, 1, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, 1, A, offA, 1, X_VEC_ERRSET))) {
 			printf("Invalid Size for A\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, 1, B, offB, 1, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, 1, B, offB, 1, Y_VEC_ERRSET))) {
 			printf("Invalid Size for B\n");
             return retCode;
 		}
 
-		if (retCode = checkVectorSizes(cType, 1, C, offC, 1, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(cType, 1, C, offC, 1, X_VEC_ERRSET))) {
 			printf("Invalid Size for C\n");
             return retCode;
 		}
 
-		if (retCode = checkVectorSizes(kargs->dtype, 1, S, offS, 1, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, 1, S, offS, 1, Y_VEC_ERRSET))) {
 			printf("Invalid Size for S\n");
             return retCode;
 		}
diff --git a/src/library/blas/xrotm.c b/src/library/blas/xrotm.c
index fcdfcb08..4130cf5d 100644
--- a/src/library/blas/xrotm.c
+++ b/src/library/blas/xrotm.c
@@ -60,15 +60,15 @@ doRotm(
 
 		// Check wheather enough memory was allocated
 
-		if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
 			printf("Invalid Size for X\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
 			printf("Invalid Size for Y\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, 5, param, offParam, 1, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, 5, param, offParam, 1, Y_VEC_ERRSET))) {
 			printf("Invalid Size for PARAM\n"); // PARAM is of minimum length 5
             return retCode;
 		}
diff --git a/src/library/blas/xrotmg.c b/src/library/blas/xrotmg.c
index b3c22298..e6e48b6d 100644
--- a/src/library/blas/xrotmg.c
+++ b/src/library/blas/xrotmg.c
@@ -14,10 +14,6 @@
  * limitations under the License.
  * ************************************************************************/
 
-/***********************************************************************
-**  Rotgright (C) 2011 Advanced Micro Devices, Inc. All Rights Reserved.
-***********************************************************************/
-
 #include <stdio.h>
 #include <string.h>
 #include <clBLAS.h>
@@ -69,23 +65,23 @@ doRotmg(
 
 		// Check wheather enough memory was allocated
 
-		if (retCode = checkVectorSizes(kargs->dtype, 1, D1, offD1, 1, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, 1, D1, offD1, 1, X_VEC_ERRSET))) {
 			printf("Invalid Size for D1\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, 1, D2, offD2, 1, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, 1, D2, offD2, 1, Y_VEC_ERRSET))) {
 			printf("Invalid Size for D2\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, 1, X1, offX1, 1, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, 1, X1, offX1, 1, X_VEC_ERRSET))) {
 			printf("Invalid Size for X1\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, 1, Y1, offY1, 1, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, 1, Y1, offY1, 1, Y_VEC_ERRSET))) {
 			printf("Invalid Size for Y1\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, 1, param, offParam, 1, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, 1, param, offParam, 1, Y_VEC_ERRSET))) {
 			printf("Invalid Size for PARAM\n");
             return retCode;
 		}
diff --git a/src/library/blas/xscal.c b/src/library/blas/xscal.c
index 6722383a..b2620310 100644
--- a/src/library/blas/xscal.c
+++ b/src/library/blas/xscal.c
@@ -57,7 +57,7 @@ doScal(
 
 		// Check wheather enough memory was allocated
 
-		if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
 			printf("Invalid Size for X\n");
             return retCode;
 		}
diff --git a/src/library/blas/xshbmv.c b/src/library/blas/xshbmv.c
index e0a5087a..94f733da 100644
--- a/src/library/blas/xshbmv.c
+++ b/src/library/blas/xshbmv.c
@@ -68,19 +68,19 @@ doSHbmv(
     }
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET ))
+    if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET)))
     {
         return retCode;
     }
 
-    if (retCode = checkBandedMatrixSizes(kargs->dtype, order, clblasNoTrans,
-                                            N, N, K, 0, A, offa, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkBandedMatrixSizes(kargs->dtype, order, clblasNoTrans,
+                                          N, N, K, 0, A, offa, lda, A_MAT_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET))) {
         return retCode;
     }
 
diff --git a/src/library/blas/xspmv.c b/src/library/blas/xspmv.c
index d522ba84..b40e0269 100644
--- a/src/library/blas/xspmv.c
+++ b/src/library/blas/xspmv.c
@@ -53,17 +53,17 @@ doSpmv(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(AP, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET )) {
+    if ((retCode = checkMemObjects(AP, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, AP,
-                         offa, 0, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N,
+                                    AP, offa, 0, A_MAT_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
         return retCode;
     }
 	if ((commandQueues == NULL) || (numCommandQueues == 0))
@@ -184,4 +184,4 @@ clblasDspmv(
     return doSpmv(&kargs, order, uplo, N, AP, offa, X, offx, incx,
                   Y, offy, incy, numCommandQueues, commandQueues,
                   numEventsInWaitList, eventWaitList, events);
-}
\ No newline at end of file
+}
diff --git a/src/library/blas/xswap.c b/src/library/blas/xswap.c
index 38066186..1d83a5b2 100644
--- a/src/library/blas/xswap.c
+++ b/src/library/blas/xswap.c
@@ -60,11 +60,11 @@ doSwap(
 
 		// Check wheather enough memory was allocated
 
-		if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
 			printf("Invalid Size for X\n");
             return retCode;
 		}
-		if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+		if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
 			printf("Invalid Size for Y\n");
             return retCode;
 		}
diff --git a/src/library/blas/xsymm.c b/src/library/blas/xsymm.c
index e61a33f6..5c87fc6e 100644
--- a/src/library/blas/xsymm.c
+++ b/src/library/blas/xsymm.c
@@ -50,31 +50,31 @@ doSymm(	CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, clblasSide side,
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET )) {
+    if ((retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET))) {
 		printf("SYMM:- Invalid mem object..\n");
         return retCode;
     }
 
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, B, offb, ldb, B_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, B, offb, ldb, B_MAT_ERRSET))) {
 		printf("Invalid Size for B\n");
         return retCode;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, C, offc, ldc, C_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, C, offc, ldc, C_MAT_ERRSET))) {
 		printf("Invalid Size for C\n");
         return retCode;
     }
 	if (side == clblasLeft)
 	{
 		// MxM x MxN
-    	if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, M, A, offa, lda, A_MAT_ERRSET )) {
+    	if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, M, A, offa, lda, A_MAT_ERRSET))) {
 			printf("Invalid Size for A\n");
             return retCode;
     	}
 	} else {
 		// MxN x NxN
-    	if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET )) {
+    	if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET))) {
 			printf("Invalid Size for A\n");
             return retCode;
     	}
diff --git a/src/library/blas/xsymv.c b/src/library/blas/xsymv.c
index 55b23e85..790e8720 100644
--- a/src/library/blas/xsymv.c
+++ b/src/library/blas/xsymv.c
@@ -60,17 +60,17 @@ doSymv(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET)) {
+    if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A,
-                         offA, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N,
+                                    A, offA, lda, A_MAT_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET))) {
         return retCode;
     }
 
diff --git a/src/library/blas/xsyr.c b/src/library/blas/xsyr.c
index d2d1ae7c..9358920f 100644
--- a/src/library/blas/xsyr.c
+++ b/src/library/blas/xsyr.c
@@ -55,7 +55,7 @@ doSyr(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, X, 0, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET )) {
+    if ((retCode = checkMemObjects(A, X, 0, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET))) {
    		printf("Invalid mem object..\n");
         return retCode;
     }
@@ -65,11 +65,11 @@ doSyr(
      * checkMatrixSizes() does not account of "offa" argument.
      * Need to be added.
      */
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET))) {
         printf("Invalid Size for A\n");
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
         printf("Invalid Size for X\n");
         return retCode;
     }
diff --git a/src/library/blas/xsyr2.c b/src/library/blas/xsyr2.c
index 2f0a1856..fddcfbd2 100644
--- a/src/library/blas/xsyr2.c
+++ b/src/library/blas/xsyr2.c
@@ -58,21 +58,21 @@ doSyr2(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET )) {
+    if ((retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
         printf("Invalid mem object..\n");
         return retCode;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET ))) {
         printf("Invalid Size for A\n");
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
         printf("Invalid Size for X\n");
         return retCode;
     }
 
-	if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+	if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
         printf("Invalid Size for Y\n");
         return retCode;
     }
diff --git a/src/library/blas/xsyr2k.c b/src/library/blas/xsyr2k.c
index e99a617b..25ed438c 100644
--- a/src/library/blas/xsyr2k.c
+++ b/src/library/blas/xsyr2k.c
@@ -58,7 +58,7 @@ doSyr2k(
     }
 
     // Validate arguments
-    if (retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET )) {
+    if ((retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET))) {
         return retCode;
     }
 
@@ -66,13 +66,13 @@ doSyr2k(
         return clblasInvalidValue;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, transAB, N, K, A, offA, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, transAB, N, K, A, offA, lda, A_MAT_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkMatrixSizes(kargs->dtype, order, transAB, N, K, B, offB, ldb, B_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, transAB, N, K, B, offB, ldb, B_MAT_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkMatrixSizes(kargs->dtype, order, false, N, N, C, offC, ldc, C_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, false, N, N, C, offC, ldc, C_MAT_ERRSET))) {
         return retCode;
     }
 
diff --git a/src/library/blas/xsyrk.c b/src/library/blas/xsyrk.c
index 4157d5e8..2582830e 100644
--- a/src/library/blas/xsyrk.c
+++ b/src/library/blas/xsyrk.c
@@ -55,7 +55,7 @@ doSyrk(
     }
 
     // Validate arguments
-    if (retCode = checkMemObjects(A, C, NULL, false, A_MAT_ERRSET, C_MAT_ERRSET, END_ERRSET)) {
+    if ((retCode = checkMemObjects(A, C, NULL, false, A_MAT_ERRSET, C_MAT_ERRSET, END_ERRSET))) {
         return retCode;
     }
 
@@ -63,10 +63,10 @@ doSyrk(
         return clblasInvalidValue;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offA, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offA, lda, A_MAT_ERRSET))) {
         return retCode;
     }
-    if (retCode = checkMatrixSizes(kargs->dtype, order, false, N, N, C, offC, ldc, C_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, false, N, N, C, offC, ldc, C_MAT_ERRSET))) {
         return retCode;
     }
 
diff --git a/src/library/blas/xtbmv.c b/src/library/blas/xtbmv.c
index 8f59bc99..b3b0d3b7 100644
--- a/src/library/blas/xtbmv.c
+++ b/src/library/blas/xtbmv.c
@@ -59,20 +59,20 @@ doTbmv(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET )) {
+    if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
 	printf("Invalid mem object..\n");
         return retCode;
     }
 
-    if (retCode = checkBandedMatrixSizes(kargs->dtype, order, trans, N, N, K, 0, A, offa, lda, A_MAT_ERRSET)) {
+    if ((retCode = checkBandedMatrixSizes(kargs->dtype, order, trans, N, N, K, 0, A, offa, lda, A_MAT_ERRSET))) {
 		printf("Invalid Size for A\n");
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET)) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) {
 		printf("Invalid Size for X\n");
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, y, 0, incx, Y_VEC_ERRSET)) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, y, 0, incx, Y_VEC_ERRSET))) {
 		printf("Invalid Size for scratch vector\n");
         return retCode;
     }
diff --git a/src/library/blas/xtrmm.c b/src/library/blas/xtrmm.c
index b7611dae..8aff2079 100644
--- a/src/library/blas/xtrmm.c
+++ b/src/library/blas/xtrmm.c
@@ -55,16 +55,16 @@ doTrmm(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, B, NULL, false, A_MAT_ERRSET, B_MAT_ERRSET, END_ERRSET )) {
+    if ((retCode = checkMemObjects(A, B, NULL, false, A_MAT_ERRSET, B_MAT_ERRSET, END_ERRSET))) {
         return retCode;
     }
     msize = (side == clblasLeft) ? M : N;
-    if (retCode = checkMatrixSizes(kargs->dtype, order, transA, msize, msize, A,
-                         offA, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, msize, msize,
+                                    A, offA, lda, A_MAT_ERRSET ))) {
         return retCode;
     }
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, B,
-                         offB, ldb, B_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N,
+                                    B, offB, ldb, B_MAT_ERRSET ))) {
         return retCode;
     }
 
diff --git a/src/library/blas/xtrmv.c b/src/library/blas/xtrmv.c
index 2f4e2166..145c799f 100644
--- a/src/library/blas/xtrmv.c
+++ b/src/library/blas/xtrmv.c
@@ -57,20 +57,20 @@ doTrmv(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET)) {
+    if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
 	printf("Invalid mem object..\n");
         return retCode;
     }
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, trans, N, N, A, offa, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, trans, N, N, A, offa, lda, A_MAT_ERRSET))) {
 		printf("Invalid Size for A\n");
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) {
 		printf("Invalid Size for X\n");
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, y, 0, incx, Y_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, y, 0, incx, Y_VEC_ERRSET))) {
 		printf("Invalid Size for scratch vector\n");
         return retCode;
     }
diff --git a/src/library/blas/xtrsm.c b/src/library/blas/xtrsm.c
index 9fb5b4af..d2fd7f09 100644
--- a/src/library/blas/xtrsm.c
+++ b/src/library/blas/xtrsm.c
@@ -55,17 +55,17 @@ doTrsm(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, B, NULL, false, A_MAT_ERRSET, B_MAT_ERRSET, END_ERRSET )) {
+    if ((retCode = checkMemObjects(A, B, NULL, false, A_MAT_ERRSET, B_MAT_ERRSET, END_ERRSET ))) {
         return retCode;
     }
     msize = (side == clblasLeft) ? M : N;
 
-    if (retCode = checkMatrixSizes(kargs->dtype, order, transA, msize, msize, A,
-                         offA, lda, A_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, msize, msize,
+                                    A, offA, lda, A_MAT_ERRSET ))) {
         return retCode;
     }
-    if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, B,
-                         offB, ldb, B_MAT_ERRSET )) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N,
+                                    B, offB, ldb, B_MAT_ERRSET ))) {
         return retCode;
     }
 
diff --git a/src/library/blas/xtrsv.c b/src/library/blas/xtrsv.c
index c3342287..1e48349a 100644
--- a/src/library/blas/xtrsv.c
+++ b/src/library/blas/xtrsv.c
@@ -351,7 +351,7 @@ doTrsv(
 
     /* Validate arguments */
 
-    if (retCode = checkMemObjects(A, x, (cl_mem) NULL, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET)) {
+    if ((retCode = checkMemObjects(A, x, (cl_mem) NULL, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET))) {
 		#ifdef DEBUG_TRSV
 		printf("Invalid mem object..\n");
 		#endif
@@ -363,13 +363,13 @@ doTrsv(
  	 * checkMatrixSizes() does not account for "offa" argument.
  	 * Need to pass "offa" when "checkMatrixSizes()" is changed.
 	 */
-    if (retCode = checkMatrixSizes(kargs->dtype, order, trans, N, N, A, offa, lda, A_MAT_ERRSET)) {
+    if ((retCode = checkMatrixSizes(kargs->dtype, order, trans, N, N, A, offa, lda, A_MAT_ERRSET))) {
 		#ifdef DEBUG_TRSV
 		printf("Invalid Size for A\n");
 		#endif
         return retCode;
     }
-    if (retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET )) {
+    if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) {
 		#ifdef DEBUG_TRSV
 		printf("Invalid Size for X\n");
 		#endif
diff --git a/src/library/clBLAS.pc.in b/src/library/clBLAS.pc.in
new file mode 100644
index 00000000..433ca635
--- /dev/null
+++ b/src/library/clBLAS.pc.in
@@ -0,0 +1,12 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}/bin@SUFFIX_BIN@
+includedir=${prefix}/include
+libdir=${prefix}/lib@SUFFIX_LIB@
+
+Name: clBLAS
+Description: Open source OpenCL BLAS library
+Version: @clBLAS_VERSION@
+URL: https://github.com/clMathLibraries/clBLAS
+
+Cflags: -I${includedir}
+Libs: -L${libdir} -lclBLAS
diff --git a/src/library/common/kern_cache.c b/src/library/common/kern_cache.c
index 787d139f..1006e482 100644
--- a/src/library/common/kern_cache.c
+++ b/src/library/common/kern_cache.c
@@ -425,7 +425,9 @@ fullKernelSize(Kernel *kern)
         size += allSizes[i];
     }
 
-    clGetProgramInfo(kern->program, CL_PROGRAM_SOURCE, 0, NULL, &retSize);
+    if (!kern->noSource) {
+        clGetProgramInfo(kern->program, CL_PROGRAM_SOURCE, 0, NULL, &retSize);
+    }
 
     return (size + retSize + sizeof(Kernel) + kern->extraSize);
 }
diff --git a/src/library/common/tests/CMakeLists.txt b/src/library/common/tests/CMakeLists.txt
index 213e0bca..b1e34871 100644
--- a/src/library/common/tests/CMakeLists.txt
+++ b/src/library/common/tests/CMakeLists.txt
@@ -44,22 +44,15 @@ include_directories(${OPENCL_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_
 
 add_executable(t_dblock_kgen ${SRC_DBLOCK_KGEN})
 target_link_libraries(t_dblock_kgen ${OPENCL_LIBRARIES} ${MATH_LIBRARY})
+set_target_properties( t_dblock_kgen PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 add_executable(t_gens_cache ${SRC_GENS_CACHE})
 target_link_libraries(t_gens_cache ${OPENCL_LIBRARIES} ${MATH_LIBRARY})
+set_target_properties( t_gens_cache PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
-if( TARGET_PLATFORM EQUAL 64 )
-	# CPack configuration; include the executable into the package
-	install( TARGETS t_dblock_kgen t_gens_cache
-			RUNTIME DESTINATION bin64
-			LIBRARY DESTINATION lib64
-			ARCHIVE DESTINATION lib64/import
-			)
-else()
-	# CPack configuration; include the executable into the package
-	install( TARGETS t_dblock_kgen t_gens_cache
-			RUNTIME DESTINATION bin32
-			LIBRARY DESTINATION lib32
-			ARCHIVE DESTINATION lib32/import
-			)
-endif()
+# CPack configuration; include the executable into the package
+install( TARGETS t_dblock_kgen t_gens_cache
+		RUNTIME DESTINATION bin${SUFFIX_BIN}
+		LIBRARY DESTINATION lib${SUFFIX_LIB}
+		ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+		)
diff --git a/src/library/common/tests/t_gens_cache.c b/src/library/common/tests/t_gens_cache.c
index 177a25b3..5a2b9823 100644
--- a/src/library/common/tests/t_gens_cache.c
+++ b/src/library/common/tests/t_gens_cache.c
@@ -23,7 +23,11 @@
 #include <stdlib.h>
 #include <string.h>
 
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
 #include <CL/cl.h>
+#endif
 
 #include <kerngen.h>
 #include <kern_cache.h>
diff --git a/src/library/tools/ktest/CMakeLists.txt b/src/library/tools/ktest/CMakeLists.txt
index 34828f0e..2cc8c318 100644
--- a/src/library/tools/ktest/CMakeLists.txt
+++ b/src/library/tools/ktest/CMakeLists.txt
@@ -140,19 +140,11 @@ source_group(\\ FILES ${KTEST_SRC})
 add_executable(make-ktest ${KTEST_SRC} ${KTEST_EXTERNAL_SRC})
 add_dependencies(make-ktest GENERATE_CLT)
 target_link_libraries(make-ktest ${OPENCL_LIBRARIES} ${Boost_LIBRARIES} ${MATH_LIBRARY})
+set_target_properties( make-ktest PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
-if( TARGET_PLATFORM EQUAL 64 )
-	# CPack configuration; include the executable into the package
-	install( TARGETS make-ktest
-			RUNTIME DESTINATION bin64
-			LIBRARY DESTINATION lib64
-			ARCHIVE DESTINATION lib64/import
-			)
-else()
-	# CPack configuration; include the executable into the package
-	install( TARGETS make-ktest
-			RUNTIME DESTINATION bin32
-			LIBRARY DESTINATION lib32
-			ARCHIVE DESTINATION lib32/import
-			)
-endif()
+# CPack configuration; include the executable into the package
+install( TARGETS make-ktest
+		RUNTIME DESTINATION bin${SUFFIX_BIN}
+		LIBRARY DESTINATION lib${SUFFIX_LIB}
+		ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+		)
diff --git a/src/library/tools/ktest/step.h b/src/library/tools/ktest/step.h
index 7148c726..0472e499 100644
--- a/src/library/tools/ktest/step.h
+++ b/src/library/tools/ktest/step.h
@@ -18,7 +18,11 @@
 #ifndef KTEST_PATTERN_H__
 #define KTEST_PATTERN_H__
 
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
 #include <CL/cl.h>
+#endif
 #include <list>
 #include <map>
 #include <string>
diff --git a/src/library/tools/ktest/var.h b/src/library/tools/ktest/var.h
index 0ebb1078..8bab85e6 100644
--- a/src/library/tools/ktest/var.h
+++ b/src/library/tools/ktest/var.h
@@ -18,7 +18,11 @@
 #ifndef KTEST_VAR_H__
 #define KTEST_VAR_H__
 
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
 #include <CL/cl.h>
+#endif
 #include <string>
 
 namespace clMath {
diff --git a/src/library/tools/tplgen/configure.bat b/src/library/tools/tplgen/configure.bat
deleted file mode 100644
index b1f3db68..00000000
--- a/src/library/tools/tplgen/configure.bat
+++ /dev/null
@@ -1,14 +0,0 @@
-del CMakeCache.txt
-cmake -DCMAKE_BUILD_TYPE=debug -G "Visual Studio 10" ..\tplgen
-if NOT ERRORLEVEL 1 goto end
-IF ERRORLEVEL 4 goto try9
-IF ERRORLEVEL 3 goto try9
-IF ERRORLEVEL 2 goto try9
-IF ERRORLEVEL 1 goto try9
-goto end
-
-:try9
-del CMakeCache.txt
-cmake -DCMAKE_BUILD_TYPE=Debug -G "Visual Studio 9 2008" ..\tplgen
-
-:end
\ No newline at end of file
diff --git a/src/library/tools/tplgen/tplgen.cpp b/src/library/tools/tplgen/tplgen.cpp
index 25150aa7..e81ecd2d 100644
--- a/src/library/tools/tplgen/tplgen.cpp
+++ b/src/library/tools/tplgen/tplgen.cpp
@@ -73,7 +73,7 @@ int main( int argc, char *argv[] )
     size_t found;
     string str;
     int startOptions = 1;
-    char *outputPrefix = "";
+    const char *outputPrefix = "";
 
     std::cout << "TPLGEN Running.....\n";
     if (argc < 2)
diff --git a/src/library/tools/tune/CMakeLists.txt b/src/library/tools/tune/CMakeLists.txt
index dbfcce97..2de5bf3b 100644
--- a/src/library/tools/tune/CMakeLists.txt
+++ b/src/library/tools/tune/CMakeLists.txt
@@ -130,7 +130,7 @@ if( BLAS_DEBUG_TOOLS )
 endif()
 
 # Library with functions for time measurement. In Windows they are included automatically
-if(UNIX)
+if(UNIX AND NOT APPLE)
     set(TIME_LIBRARY "rt")
 endif()
 
@@ -138,19 +138,11 @@ endif()
 add_executable(tune ${TOOLS_SRC} ${TOOLS_EXTERNAL_SRC})
 add_dependencies(tune GENERATE_CLT)
 target_link_libraries(tune ${OPENCL_LIBRARIES} ${TIME_LIBRARY} ${MATH_LIBRARY})
+set_target_properties( tune PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
-if( TARGET_PLATFORM EQUAL 64 )
-	# CPack configuration; include the executable into the package
-	install( TARGETS tune
-			RUNTIME DESTINATION bin64
-			LIBRARY DESTINATION lib64
-			ARCHIVE DESTINATION lib64/import
-			)
-else()
-	# CPack configuration; include the executable into the package
-	install( TARGETS tune
-			RUNTIME DESTINATION bin32
-			LIBRARY DESTINATION lib32
-			ARCHIVE DESTINATION lib32/import
-			)
-endif()
+# CPack configuration; include the executable into the package
+install( TARGETS tune
+		RUNTIME DESTINATION bin${SUFFIX_BIN}
+		LIBRARY DESTINATION lib${SUFFIX_LIB}
+		ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+		)
diff --git a/src/library/tools/tune/storage_data.h b/src/library/tools/tune/storage_data.h
index 5efcf5ee..3e72e76b 100644
--- a/src/library/tools/tune/storage_data.h
+++ b/src/library/tools/tune/storage_data.h
@@ -18,13 +18,16 @@
 #ifndef STORAGEDATA_H_
 #define STORAGEDATA_H_
 
-#include <malloc.h>
 #include <math.h>
 #include <string.h>
 #include <stdlib.h>
 #include <stdio.h>
 
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
 #include <CL/cl.h>
+#endif
 
 #include <trace_malloc.h>
 
diff --git a/src/library/tools/tune/storage_io.c b/src/library/tools/tune/storage_io.c
index 4d9dd375..8fd3ec83 100644
--- a/src/library/tools/tune/storage_io.c
+++ b/src/library/tools/tune/storage_io.c
@@ -16,7 +16,6 @@
 
 
 
-#include <malloc.h>
 #include <string.h>
 #include <stdlib.h>
 #include <signal.h>
@@ -25,7 +24,7 @@
 #include "storage_data.h"
 
 #define  SUBDIM_UNUSED_FILE_VALUE 10000
-const char *ENV_FILE_PATH = "AMD_CLBLAS_STORAGE_PATH";
+const char *ENV_FILE_PATH = "CLBLAS_STORAGE_PATH";
 const char *FileID  = "CBS";
 const char *FileExt = "kdb";
 const char *FileExtTmp = "kdb.tmp";
diff --git a/src/library/tools/tune/subdim.c b/src/library/tools/tune/subdim.c
index 37ead334..6eed76f1 100644
--- a/src/library/tools/tune/subdim.c
+++ b/src/library/tools/tune/subdim.c
@@ -364,7 +364,7 @@ nextSubdimElem(SubDimInfo* sd)
 
     // !!! DEBUG
     if (sd->count > 500) {
-        *(int*)0 = 0;
+        abort();
     }
 
     sd->count ++;
diff --git a/src/library/tools/tune/toolslib.c b/src/library/tools/tune/toolslib.c
index 680a2197..fc55b8a4 100644
--- a/src/library/tools/tune/toolslib.c
+++ b/src/library/tools/tune/toolslib.c
@@ -15,7 +15,6 @@
  * ************************************************************************/
 
 
-#include <malloc.h>
 #include <string.h>
 #include <stdlib.h>
 #include <signal.h>
diff --git a/src/library/tools/tune/toolslib.h b/src/library/tools/tune/toolslib.h
index 48c27e62..9e08a9a1 100644
--- a/src/library/tools/tune/toolslib.h
+++ b/src/library/tools/tune/toolslib.h
@@ -18,7 +18,11 @@
 #ifndef TOOLSLIB_H__
 #define TOOLSLIB_H__
 
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
 #include <CL/cl.h>
+#endif
 
 #include <defbool.h>
 #include <devinfo.h>
diff --git a/src/library/tools/tune/tune.c b/src/library/tools/tune/tune.c
index d41e45a3..b6174c4e 100644
--- a/src/library/tools/tune/tune.c
+++ b/src/library/tools/tune/tune.c
@@ -15,13 +15,16 @@
  * ************************************************************************/
 
 
-#include <malloc.h>
 #include <math.h>
 #include <string.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <assert.h>
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
 #include <CL/cl.h>
+#endif
 
 // #include "fileio.h"
 #include "toolslib.h"
@@ -35,6 +38,10 @@
 
 #if defined(_MSC_VER)
 #include "Windows.h"
+#elif defined(__APPLE__)
+#include <stdint.h>
+#include <mach/mach.h>
+#include <mach/mach_time.h>
 #else
 #include "time.h"
 #endif
@@ -83,7 +90,33 @@ getCurrentTime(void)
      }
      return (nano_time_t)count.QuadPart;
 }
-#else /* defined(_MCS_VER) */
+
+#elif defined(__APPLE__)
+
+typedef uint64_t nano_time_t;
+#define NANOTIME_MAX UINT64_MAX
+
+nano_time_t
+conv2nanosec(nano_time_t t)
+{
+    /* Mach timebase, queried once: nanoseconds = ticks * numer / denom. */
+    static mach_timebase_info_data_t timebase_info = {0};
+
+    if (timebase_info.denom == 0) {
+        (void)mach_timebase_info(&timebase_info);
+    }
+
+    /* NOTE: t * numer may overflow 64 bits for very large tick counts. */
+    return (t * timebase_info.numer) / timebase_info.denom;
+}
+
+nano_time_t
+getCurrentTime(void)
+{
+    return mach_absolute_time();    /* Mach absolute-time ticks; conv2nanosec() converts them */
+}
+
+#else
 
 typedef unsigned long nano_time_t;
 #define NANOTIME_MAX (~0UL - 1)
@@ -276,12 +309,12 @@ struct GeneratorInfoRec {
     DeviceInfo          deviceInfos;    // Todo delete this member. Use TargetDevice.
     char                *deviceName;    //
 
-    bool       aFunc[BLAS_FUNCTIONS_NUMBER];
+    bool       aFunc[BLAS_FUNCTIONS_NUMBER];    //  True/false value if the corresponding function should be tuned
     int        aPattern;
-    bool       aDType[TYPE_NUMBER];
+    bool       aDType[TYPE_NUMBER]; //  True false value if the precision should be tuned; s/d/c/z
     int        aFlag;
     int        aCommand;
-    bool       aIsKernel;
+    bool       aIsKernel;   // True/false value to store binary kernels into the kernel database
     int        aMaxparam;
     bool       aExtendedOutput;
     bool       aAll;
@@ -2214,14 +2247,29 @@ generateKernelForOthersFlag( BlasExtraInfo* bExtra,
                             bestParamOther[nDim]->count++;
                     }
                 }
+
+                //  If the user selected that they want to store the kernel binaries to disk,
+                //  and we do not have those binaries, generate them again
                 if (genInfo.aIsKernel && bestParamOther[nDim]->kernel == NULL) {
+                    MatrixInfo mi [DIMARRAYCOUNT];
                     unsigned int func = bFunc->funcNo;
                     unsigned int patt = bPatt->pattNo;
+
+                    //  Initialize resources to generate kernels in genAllKernel
                     initCLBLASExtra(&extra, bExtra);
-                    genAllKernel(&args, extra, bestParamOther[nDim],
-                                 pattern, func, patt);
-                    logKernalGen();
+                    initMatrixInfo( mi, extra.dtype, &genInfo.deviceInfos, bExtra );
+                    initCLBlasKArgDim( &args, mi, extra.flags );
+
+                    genAllKernel(&args, extra, bestParamOther[nDim], pattern, func, patt);
+
+                    //  Free those resources when finished
+                    releaseMemObjAll( mi, bExtra );
+                    destroyMatrixInfo( mi, bExtra );
+
+                    logKernalGen( );
                 }
+
+                //  This stores the kernel binaries to disk
                 saveBestParams(bExtraOther, bestParamOther);
             }
             deleteGParams(bExtraOther, bestParamOther);
@@ -2271,13 +2319,22 @@ createFile(void)
     bool isEnvPattSelected = false;
     unsigned int dev;
 
-    initOpenCl();
+    //  This initializes global genInfo with either the last detected platform, or the
+    //  first AMD platform it finds.  It records the number of devices in that platform.
+    initOpenCl( );
+
     // For each devices
     for (dev = 0; dev < genInfo.numDevices; dev++) {
     	initDevice(dev);
+
+        //  The following creates the .kdb file on disk according to the set environment variable
         writeStorageCache(&genInfo.targetDevice);
-        getContext();
-        configurePattern();
+
+        //  The following creates the OpenCL context and command queue for the first device in the global genInfo struct
+        getContext( );
+
+        //  Does nothing; nop
+        configurePattern( );
 
         // for each function
         for (funcId = 0; funcId < BLAS_FUNCTIONS_NUMBER; funcId++) {
@@ -2345,6 +2402,9 @@ createFile(void)
                     bExtra = &(bPatt->extra[nExtra]);
                     genInfo.last = 0;
 
+                    //  This evaluates whether the current combination of parameters from the given function should be tuned or not.
+                    //  If skipFlags returns 1, then this combination is skipped.
+                    //  This checks for hardcoded combinations which are skipped because of known runtime bugs.
                     if ( skipFlags(bExtra,
                             pattId,
                             funcId,
@@ -2353,6 +2413,7 @@ createFile(void)
                         continue;
                     }
 
+                    //  Similar logic to skipFlags, but this mostly filters out cases that were specified on the command line
                     if (isFilter(bExtra, pattId, funcId)) {
                         continue;
                     }
@@ -2603,12 +2664,17 @@ main(int argc, char*  argv[])
 {
     FILE_PATH = getenv(ENV_FILE_PATH);
 
-    initGeneratorInfoRec();
+    //  This clears and initializes the global GeneratorInfoRec genInfo struct
+    initGeneratorInfoRec( );
     parseArg(argc, argv);
+
+    //  This will:
+    //    - set up the global clblasSolvers for all function families supported within blas, including initializing memory patterns
+    //    - identify all recognized devices in the system
     clblasSetup();
 
     if (!FILE_PATH){
-        printf("The environment variable 'AMD_CLBLAS_STORAGE_PATH' is not defined\n");
+        printf("The environment variable 'CLBLAS_STORAGE_PATH' is not defined\n");
         exit(EXIT_COD_NO_ENVIRONMENT_VARIABLE);
     }
 
diff --git a/src/samples/CMakeLists.pack b/src/samples/CMakeLists.pack
index dbf8e7b4..22e16cca 100644
--- a/src/samples/CMakeLists.pack
+++ b/src/samples/CMakeLists.pack
@@ -1,6 +1,18 @@
-#############################################################################
-## Copyright (C) 2010,2011 Advanced Micro Devices, Inc. All Rights Reserved.
-#############################################################################
+# ########################################################################
+# Copyright 2013 Advanced Micro Devices, Inc.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ########################################################################
 cmake_minimum_required(VERSION 2.6)
 project(clblas.samples)
 
diff --git a/src/samples/CMakeLists.txt b/src/samples/CMakeLists.txt
index ea9e2b5f..c354ba8c 100644
--- a/src/samples/CMakeLists.txt
+++ b/src/samples/CMakeLists.txt
@@ -260,45 +260,24 @@ add_executable(example_sasum ${SASUM_SAMPLE_SRC})
 target_link_libraries(example_sasum ${OPENCL_LIBRARIES} clBLAS)
 set_property( TARGET example_sasum PROPERTY FOLDER "Samples")
 
-if( TARGET_PLATFORM EQUAL 64 )
-	# CPack configuration; include the executable into the package
-	install( TARGETS example_sgemm example_sgemv example_ssymv example_ssyrk
-             example_ssyr2k example_strmm example_strsm 
-		     example_strmv example_strsv example_sger example_cher example_ssyr 
-		     example_ssyr2 example_cherk example_ssymm example_chemm
-		     example_stpmv example_chpmv example_stpsv example_sspmv example_sspr example_chpr
-		     example_sspr2 example_zhpr2 
-		     example_sgbmv example_stbmv example_ssbmv example_chbmv example_stbsv
-		     example_cher2k
-		     example_sswap example_sscal example_csscal example_scopy example_saxpy example_sdot
-		     example_srotg example_srotmg example_srot example_srotm
-		     example_snrm2 example_sasum example_isamax
-
-			 version
-			 RUNTIME DESTINATION bin64
-			 LIBRARY DESTINATION lib64
-			 ARCHIVE DESTINATION lib64/import
-			)
-else()
-	# CPack configuration; include the executable into the package
-	install( TARGETS example_sgemm example_sgemv example_ssymv example_ssyrk
-             example_ssyr2k example_strmm example_strsm 
-		     example_strmv example_strsv example_sger example_cher example_ssyr 
-		     example_ssyr2 example_cherk example_ssymm example_chemm
-		     example_stpmv example_chpmv example_stpsv example_sspmv example_sspr example_chpr
-		     example_sspr2 example_zhpr2 
-		     example_sgbmv example_stbmv example_ssbmv example_chbmv example_stbsv
-		     example_cher2k
-		     example_sswap example_sscal example_csscal example_scopy example_saxpy example_sdot
-		     example_srotg example_srotmg example_srot example_srotm
-		     example_snrm2 example_sasum example_isamax
-
-			 version
-			 RUNTIME DESTINATION bin32
-			 LIBRARY DESTINATION lib32
-			 ARCHIVE DESTINATION lib32/import
-			)
-endif()
+# CPack configuration; include the executable into the package
+install( TARGETS example_sgemm example_sgemv example_ssymv example_ssyrk
+         example_ssyr2k example_strmm example_strsm 
+         example_strmv example_strsv example_sger example_cher example_ssyr 
+         example_ssyr2 example_cherk example_ssymm example_chemm
+         example_stpmv example_chpmv example_stpsv example_sspmv example_sspr example_chpr
+         example_sspr2 example_zhpr2 
+         example_sgbmv example_stbmv example_ssbmv example_chbmv example_stbsv
+         example_cher2k
+         example_sswap example_sscal example_csscal example_scopy example_saxpy example_sdot
+         example_srotg example_srotmg example_srot example_srotm
+         example_snrm2 example_sasum example_isamax
+
+         version
+        RUNTIME DESTINATION bin${SUFFIX_BIN}
+        LIBRARY DESTINATION lib${SUFFIX_LIB}
+        ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+        )
 
 configure_file( "${PROJECT_SOURCE_DIR}/samples/CMakeLists.pack"
 		"${PROJECT_BINARY_DIR}/samples/CMakeLists.txt" COPYONLY )
diff --git a/src/scripts/perf/CMakeLists.txt b/src/scripts/perf/CMakeLists.txt
index 7b71a092..81d33857 100644
--- a/src/scripts/perf/CMakeLists.txt
+++ b/src/scripts/perf/CMakeLists.txt
@@ -21,10 +21,4 @@ set(GRAPHING_SCRIPTS 	measurePerformance.py
 						performanceUtility.py
 						)
 
-if( TARGET_PLATFORM EQUAL 64 )
-    set( BIN_DIR bin64 )
-else()
-    set( BIN_DIR bin32 )
-endif()
-
-install( FILES ${GRAPHING_SCRIPTS} DESTINATION ${BIN_DIR} )
+install( FILES ${GRAPHING_SCRIPTS} DESTINATION bin${SUFFIX_BIN} )
diff --git a/src/scripts/perf/measurePerformance.py b/src/scripts/perf/measurePerformance.py
index 659d4ef6..8559e66d 100644
--- a/src/scripts/perf/measurePerformance.py
+++ b/src/scripts/perf/measurePerformance.py
@@ -42,9 +42,10 @@
 sidevalues = ['left','right']
 uplovalues = ['upper','lower']
 diagvalues = ['unit','nonunit']
-functionvalues = ['gemm', 'trmm', 'trsm', 'syrk', 'syr2k', 'gemv', 'symv' ]
+functionvalues = ['gemm', 'trmm', 'trsm', 'syrk', 'syr2k', 'gemv', 'symv', 'symm', 'hemm', 'herk', 'her2k' ]
 precisionvalues = ['s', 'd', 'c', 'z']
 roundtripvalues = ['roundtrip','noroundtrip','both']
+memallocvalues = ['default','alloc_host_ptr','use_host_ptr','copy_host_ptr','use_persistent_mem_amd']
 
 parser = argparse.ArgumentParser(description='Measure performance of the clblas library')
 parser.add_argument('--device',
@@ -125,6 +126,9 @@
 parser.add_argument('--roundtrip',
     dest='roundtrip', default='noroundtrip',
     help='whether measure the roundtrips or not. choices are ' + str(roundtripvalues) + '. (default noroundtrip); should not be specified when calling ACML')
+parser.add_argument('--memalloc',
+	dest='memalloc', default='default',
+	help='set the flags for OpenCL memory allocation. Choices are ' + str(memallocvalues) + '. (default is default); do not need to set when calling ACML or if roundtrip is not set')
 ini_group = parser.add_mutually_exclusive_group()
 ini_group.add_argument('--createini',
     dest='createIniFilename', default=None, type=argparse.FileType('w'),
@@ -138,6 +142,7 @@
 label = str(args.label)
 roundtrip = str(args.roundtrip)
 library = str(args.library)
+memalloc = str(args.memalloc)
 
 subprocess.call('mkdir perfLog', shell = True)
 logfile = os.path.join('perfLog', (label+'-'+'blasMeasurePerfLog.txt'))
@@ -145,7 +150,6 @@
 def printLog(txt):
     print txt
     log(logfile, txt)
-printLog(roundtrip)
 printLog("=========================MEASURE PERFORMANCE START===========================")
 printLog("Process id of Measure Performance:"+str(os.getpid()))
 
@@ -449,7 +453,8 @@ def executeCommand():
                      '--function', function,
                      '--precision', precision,
                      '-p', '10',
-					 '--roundtrip', roundtrip]
+					 '--roundtrip', roundtrip,
+					 '--memalloc', memalloc]
     else:
         printLog( 'ERROR: unknown library:"' +library+ '" can\'t assemble command')
         quit()
diff --git a/src/tests/BlasBase.cpp b/src/tests/BlasBase.cpp
index 85905994..c012803d 100644
--- a/src/tests/BlasBase.cpp
+++ b/src/tests/BlasBase.cpp
@@ -506,6 +506,8 @@ BlasBase::printEnvInfo(void)
     #else
             std::cout << "(x32)" << std::endl;
     #endif
+#elif defined( __APPLE__ )
+        std::cout << "Apple OS X" << std::endl;
 #else
         std::cout << "Linux" << std::endl;
 #endif
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt
index 1f0e07c4..e0ebedef 100644
--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -242,6 +242,12 @@ source_group(functional  FILES ${SRC_FUNC} ${FUNC_HEADERS})
 # at paramVal = CL_PROGRAM_BINARIES and several devices in the context
 add_definitions( -DTEST_WITH_SINGLE_DEVICE )
 
+# vs11 needs std::tuples compiled with 10 parameters by default
+# NOTE: this assumes that googletest is compiled with the same preprocessor macro; they must match
+if( MSVC11 )
+	add_definitions( "/D_VARIADIC_MAX=10" )
+endif()
+
 # Having problems on build server, compiling gtest headers with -pedantic; disabling detection of long long
 # http://code.google.com/p/googletest/issues/detail?id=334
 if( CMAKE_COMPILER_IS_GNUCXX )
@@ -262,7 +268,9 @@ endif( )
 
 # Library with functions for time measurement. In Windows they are included automatically
 if(UNIX)
-    set(TIME_LIBRARY "rt")
+    if(NOT APPLE)
+        set(TIME_LIBRARY "rt")
+    endif()
     set(THREAD_LIBRARY "pthread")
 endif()
 
@@ -270,11 +278,7 @@ endif()
 # It stitches together a path to a previously built static library, based on our 'make install' logic
 # Search for 64bit libs if FIND_LIBRARY_USE_LIB64_PATHS is set to true in the global environment, 32bit libs else
 get_property( LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS )
-if( LIB64 )
-	set( runtime.library "${CMAKE_INSTALL_PREFIX}/lib64" )
-else( )
-	set( runtime.library "${CMAKE_INSTALL_PREFIX}/lib32" )
-endif( )
+set( runtime.library "${CMAKE_INSTALL_PREFIX}/lib${SUFFIX_LIB}" )
 
 if( WIN32 )
 	set( runtime.library "${runtime.library}/import/clBLAS${CMAKE_STATIC_LIBRARY_SUFFIX}" )
@@ -292,14 +296,17 @@ if( GTEST_FOUND )
 	    
 	    add_executable(test-correctness ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL}
 									    ${CORR_HEADERS} ${TESTS_HEADERS})
+        set_target_properties( test-correctness PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 	    add_executable(test-medium ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL}
 							      ${CORR_HEADERS} ${TESTS_HEADERS})
 	    set_target_properties(test-medium PROPERTIES COMPILE_DEFINITIONS MEDIUM_TESTS)
+        set_target_properties( test-medium PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 	    add_executable(test-short ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL}
 							      ${CORR_HEADERS} ${TESTS_HEADERS})
 	    set_target_properties(test-short PROPERTIES COMPILE_DEFINITIONS SHORT_TESTS)
+        set_target_properties( test-short PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 	    # The build server builds the library with gcc 4.1.2 to support Red Hat 5.5, but the test programs must be built with 
 	    # gcc > 4.3.2 to support ACML.  
@@ -323,16 +330,19 @@ if( GTEST_FOUND )
 
 		add_executable(test-correctness ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL}
 						${CORR_HEADERS} ${TESTS_HEADERS})
+        set_target_properties( test-correctness PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 		add_executable(test-medium ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL}
 					  ${CORR_HEADERS} ${TESTS_HEADERS})
 		set_target_properties( test-medium PROPERTIES COMPILE_DEFINITIONS MEDIUM_TESTS )
+        set_target_properties( test-medium PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 		add_executable(test-short ${SRC_CORR} ${SRC_COMMON} ${SRC_COMMON_REFIMPL}
 					  ${CORR_HEADERS} ${TESTS_HEADERS})
 		set_target_properties( test-short PROPERTIES COMPILE_DEFINITIONS SHORT_TESTS )
+        set_target_properties( test-short PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
-		if( NOT CORR_TEST_WITH_ACML AND NOT WIN32)
+		if( NOT CORR_TEST_WITH_ACML AND NOT WIN32 AND NOT APPLE)
 			set_target_properties( test-correctness PROPERTIES LINKER_LANGUAGE Fortran )
 			set_target_properties( test-medium PROPERTIES LINKER_LANGUAGE Fortran )
 			set_target_properties( test-short PROPERTIES LINKER_LANGUAGE Fortran )
@@ -344,9 +354,9 @@ if( GTEST_FOUND )
 				target_link_libraries(test-medium ${Netlib_LIBRARIES} ${GTEST_LIBRARIES} clBLAS)
 				target_link_libraries(test-short ${Netlib_LIBRARIES} ${GTEST_LIBRARIES} clBLAS)
 			else( )
-				target_link_libraries(test-correctness BLAS_LIBRARIES ${GTEST_LIBRARIES} clBLAS)
-				target_link_libraries(test-medium BLAS_LIBRARIES ${GTEST_LIBRARIES} clBLAS)
-				target_link_libraries(test-short BLAS_LIBRARIES ${GTEST_LIBRARIES} clBLAS)
+				target_link_libraries(test-correctness ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} clBLAS)
+				target_link_libraries(test-medium ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} clBLAS)
+				target_link_libraries(test-short ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} clBLAS)
 			endif( )
 		else( )
 			if( NETLIB_FOUND )
@@ -354,9 +364,9 @@ if( GTEST_FOUND )
 				target_link_libraries(test-medium ${Netlib_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} )
 				target_link_libraries(test-short ${Netlib_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} )
 			else( )
-				target_link_libraries(test-correctness BLAS_LIBRARIES ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} )
-				target_link_libraries(test-medium BLAS_LIBRARIES ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} )
-				target_link_libraries(test-short BLAS_LIBRARIES ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} )
+				target_link_libraries(test-correctness ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} )
+				target_link_libraries(test-medium ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} )
+				target_link_libraries(test-short ${BLAS_LIBRARIES} ${GTEST_LIBRARIES} ${OPENCL_LIBRARIES} ${runtime.library} )
 			endif( )
 		endif( )
     endif( )
@@ -365,22 +375,24 @@ if( GTEST_FOUND )
     set_property( TARGET test-medium PROPERTY FOLDER "Test")
     set_property( TARGET test-short PROPERTY FOLDER "Test")
 
-    if( TARGET_PLATFORM EQUAL 64 )
-	    # CPack configuration; include the executable into the package
-	    install( TARGETS test-correctness test-medium test-short
-			    RUNTIME DESTINATION bin64
-			    LIBRARY DESTINATION lib64
-			    ARCHIVE DESTINATION lib64/import
-			    )
-    else()
-	    # CPack configuration; include the executable into the package
-	    install( TARGETS test-correctness test-medium test-short
-			    RUNTIME DESTINATION bin32
-			    LIBRARY DESTINATION lib32
-			    ARCHIVE DESTINATION lib32/import
-			    )
-    endif()
+    # CPack configuration; include the executable into the package
+    install( TARGETS test-correctness test-medium test-short
+            RUNTIME DESTINATION bin${SUFFIX_BIN}
+            LIBRARY DESTINATION lib${SUFFIX_LIB}
+            ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+            )
+    
+    get_target_property( testLocation test-correctness LOCATION )
+
+    configure_file(
+        "${CMAKE_CURRENT_SOURCE_DIR}/copyTestDependencies.cmake.in"
+        "${CMAKE_CURRENT_BINARY_DIR}/copyTestDependencies.cmake"
+        @ONLY
+    )
 
+    # Register script to run at install time to analyze the executable and copy dependencies into package
+    install( SCRIPT "${CMAKE_CURRENT_BINARY_DIR}/copyTestDependencies.cmake")
+ 
 	if( ACML_FOUND )
 		include_directories(${OPENCL_INCLUDE_DIRS} ${GTEST_INCLUDE_DIRS}
 			${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_DIR}/tests/include  ${clBLAS_SOURCE_DIR}/include)
@@ -391,6 +403,7 @@ if( GTEST_FOUND )
 			${SRC_COMMON_TIMER} ${PERF_HEADERS} ${TESTS_HEADERS}
 			${SRC_COMMON_REFIMPL})
 		target_link_libraries(test-performance ${ACML_LIBRARIES})
+        set_target_properties( test-performance PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 
 
 		if( BUILD_RUNTIME )
@@ -401,21 +414,12 @@ if( GTEST_FOUND )
 
         set_property( TARGET test-performance PROPERTY FOLDER "Test")
 
-		if( TARGET_PLATFORM EQUAL 64 )
-			# CPack configuration; include the executable into the package
-			install( TARGETS test-performance
-					RUNTIME DESTINATION bin64
-					LIBRARY DESTINATION lib64
-					ARCHIVE DESTINATION lib64/import
-					)
-		else()
-			# CPack configuration; include the executable into the package
-			install( TARGETS test-performance
-					RUNTIME DESTINATION bin32
-					LIBRARY DESTINATION lib32
-					ARCHIVE DESTINATION lib32/import
-					)
-		endif()
+        # CPack configuration; include the executable into the package
+        install( TARGETS test-performance
+                RUNTIME DESTINATION bin${SUFFIX_BIN}
+                LIBRARY DESTINATION lib${SUFFIX_LIB}
+                ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+                )
 	endif()
 
 	include_directories(${OPENCL_INCLUDE_DIRS} ${GTEST_INCLUDE_DIRS}
@@ -424,27 +428,19 @@ if( GTEST_FOUND )
 	add_executable(test-functional ${SRC_FUNC} ${SRC_COMMON} ${SRC_COMMON_TIMER}
 								  ${FUNC_HEADERS} ${TESTS_HEADERS})
 								  
+    set_target_properties( test-functional PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
 	if( BUILD_RUNTIME )
-		target_link_libraries(test-functional ${GTEST_LIBRARIES} ${TIME_LIBRARY} clBLAS)
+		target_link_libraries(test-functional ${GTEST_LIBRARIES} ${TIME_LIBRARY} ${THREAD_LIBRARY} clBLAS )
 	else()
 		target_link_libraries(test-functional ${GTEST_LIBRARIES} ${TIME_LIBRARY} ${THREAD_LIBRARY} ${OPENCL_LIBRARIES} ${runtime.library} )
 	endif()
 
     set_property( TARGET test-functional PROPERTY FOLDER "Test")
 
-	if( TARGET_PLATFORM EQUAL 64 )
-		# CPack configuration; include the executable into the package
-		install( TARGETS test-functional
-				RUNTIME DESTINATION bin64
-				LIBRARY DESTINATION lib64
-				ARCHIVE DESTINATION lib64/import
-				)
-	else()
-		# CPack configuration; include the executable into the package
-		install( TARGETS test-functional
-				RUNTIME DESTINATION bin32
-				LIBRARY DESTINATION lib32
-				ARCHIVE DESTINATION lib32/import
-				)
-	endif()
+    # CPack configuration; include the executable into the package
+    install( TARGETS test-functional
+            RUNTIME DESTINATION bin${SUFFIX_BIN}
+            LIBRARY DESTINATION lib${SUFFIX_LIB}
+            ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+            )
 endif()
diff --git a/src/tests/copyTestDependencies.cmake.in b/src/tests/copyTestDependencies.cmake.in
new file mode 100644
index 00000000..d52832fb
--- /dev/null
+++ b/src/tests/copyTestDependencies.cmake.in
@@ -0,0 +1,97 @@
+# Customized install script for the clBLAS test program; analyzes all the shared library dependencies and installs
+# the dependencies into the package
+include( GetPrerequisites )
+
+#    message( testLocation ": @testLocation@" )
+
+# The Microsoft IDE presents a challenge because the full configuration is not known at cmake time,
+# so the configured test location may still contain a literal $(Configuration) placeholder.
+# This logic substitutes the configuration that is actually being installed at install time.
+#
+# CMAKE_INSTALL_CONFIG_NAME is inserted verbatim rather than pattern-matched against a fixed
+# list of names. Regex matching is case-sensitive, so a hard-coded list misses any spelling
+# difference (CMake's own name is "RelWithDebInfo") as well as custom configurations, and an
+# unmatched configuration would leave fixedTestLocation undefined for the rest of this script.
+#
+# When the location contains no placeholder, REPLACE simply copies it unchanged.
+string( REPLACE "\$(Configuration)" "${CMAKE_INSTALL_CONFIG_NAME}" fixedTestLocation "@testLocation@" )
+
+#    message( fixedTestLocation ": ${fixedTestLocation}" )
+# Get the directory that the test executable resides in; this helps get_prerequisites( ) find dependent libraries
+get_filename_component( testDir "${fixedTestLocation}" PATH )
+#    message( testDir ": ${testDir}" )
+
+set( installPath "" )
+if( WIN32 )
+    set( installPath "${CMAKE_INSTALL_PREFIX}/bin@SUFFIX_BIN@" )
+else( )
+    set( installPath "${CMAKE_INSTALL_PREFIX}/lib@SUFFIX_LIB@" )
+endif( )
+
+# Only search for dependencies that have ROOT defined
+set( depList "" )
+
+#This logic assumes that clBLAS CMakeLists.txt has been called
+get_filename_component( acmlDir "@ACML_LIBRARIES@" PATH )
+
+if( EXISTS "${acmlDir}" )
+    list( APPEND depList "${acmlDir}" )
+#    message( "acmlDir: ${acmlDir}" )
+endif( )
+
+#This logic assumes that FindGTest.cmake has been called
+get_filename_component( gtestDir "@GTEST_LIBRARY@" PATH )
+get_filename_component( gtestDirDebug "@GTEST_LIBRARY_DEBUG@" PATH )
+
+if( EXISTS "${gtestDir}" )
+    list( APPEND depList "${gtestDir}" )
+#    message( "gtestDir: ${gtestDir}" )
+endif( )
+
+string( COMPARE NOTEQUAL "${gtestDir}" "${gtestDirDebug}" gtestDiffDirs )
+if( ${gtestDiffDirs} AND EXISTS "${gtestDirDebug}" )
+    list( APPEND depList "${gtestDirDebug}" )
+#    message( "gtestDirDebug: ${gtestDirDebug}" )
+endif( )
+
+#This logic assumes that FindOpenCL.cmake has been called
+get_filename_component( openclDir "@OPENCL_LIBRARIES@" PATH )
+
+if( EXISTS "${openclDir}" )
+    list( APPEND depList "${openclDir}" )
+#    message( "openclDir: ${openclDir}" )
+endif( )
+ 
+if( EXISTS "${testDir}" )
+    list( APPEND depList "${testDir}" )
+    # On linux, the .so files are not staged with the rest of the executables
+    if( UNIX )
+       list( APPEND depList "${testDir}/../library" )
+    endif( )
+endif( )
+
+# message( "depList: ${depList}" )
+
+# This retrieves a list of shared library dependencies from the target; they are not full path names
+# Skip system dependencies (exclude_system = 1) and skip recursion (recurse = 0); paths are quoted so an empty value cannot shift arguments
+get_prerequisites( "${fixedTestLocation}" testDependencies 1 0 "" "${depList}" )
+
+# Loop on queried library dependencies and copy them into package
+foreach( dep ${testDependencies} )
+    # This converts the dependency into a full path, searching the directories collected in depList
+    gp_resolve_item( "${fixedTestLocation}" "${dep}" "" "${depList}" dep_test_path )
+
+    # In linux, the dep_test_path may point to a symbolic link, we also need to copy real file
+    get_filename_component( dep_realpath "${dep_test_path}" REALPATH )
+    get_filename_component( dep_name "${dep_test_path}" NAME )
+    # message( STATUS "depName: ${dep_name}" )
+    # message( STATUS "depFullPath: ${dep_test_path}" )
+    # message( STATUS "dep_realpath: ${dep_realpath}" )
+
+    if( NOT EXISTS "${installPath}/${dep_name}" )
+        file( INSTALL "${dep_test_path}" "${dep_realpath}"
+              USE_SOURCE_PERMISSIONS
+              DESTINATION "${installPath}"
+            )
+    endif( )
+endforeach( )
diff --git a/src/tests/correctness/blas-lapack.c b/src/tests/correctness/blas-lapack.c
index a010b7b8..9687bdf3 100644
--- a/src/tests/correctness/blas-lapack.c
+++ b/src/tests/correctness/blas-lapack.c
@@ -24,6 +24,9 @@
 #if !defined CORR_TEST_WITH_ACML
 
 #include "blas-lapack.h"
+#if defined(__APPLE__)
+#include <Accelerate/Accelerate.h>
+#endif
 
 void
 sgemv(char transa, int m, int n, float alpha, float *a, int lda, float *x, int incx, float beta, float *y, int incy)
@@ -627,20 +630,30 @@ void zdscal( int n, double alpha, doublecomplex *x, int incx)
 
 float sdot( int n, float *x, int incx,  float *y, int incy)
 {
+#ifdef __APPLE__
+    return cblas_sdot(n, x, incx, y, incy);
+#else
     return sdot_(&n, x, &incx, y, &incy);
+#endif
 }
 
 double ddot( int n, double *x, int incx,  double *y, int incy)
 {
+#ifdef __APPLE__
+    return cblas_ddot(n, x, incx, y, incy);
+#else
     return ddot_(&n, x, &incx, y, &incy);
+#endif
 }
 
 complex cdotu( int n, complex *x, int incx, complex *y, int incy)
 {
     complex ans;
 
-    #if defined( _WIN32 ) || defined( _WIN64 )
+#if defined( _WIN32 ) || defined( _WIN64 )
         ans = cdotu_(&n, x, &incx, y, &incy);
+    #elif defined( __APPLE__)
+        cblas_cdotu_sub(n, x, incx, y, incy, &ans);
     #else
         cdotusub_(&n, x, &incx, y, &incy, &ans);
     #endif
@@ -654,6 +667,8 @@ doublecomplex zdotu( int n, doublecomplex *x, int incx,  doublecomplex *y, int i
 
     #if defined( _WIN32 ) || defined( _WIN64 )
         ans = zdotu_(&n, x, &incx, y, &incy);
+    #elif defined(__APPLE__)
+        cblas_zdotu_sub(n, x, incx, y, incy, &ans);
     #else
         zdotusub_(&n, x, &incx, y, &incy, &ans);
     #endif
@@ -667,6 +682,8 @@ complex cdotc( int n, complex *x, int incx, complex *y, int incy)
 
     #if defined( _WIN32 ) || defined( _WIN64 )
         ans = cdotc_(&n, x, &incx, y, &incy);
+    #elif defined(__APPLE__)
+        cblas_cdotc_sub(n, x, incx, y, incy, &ans);
     #else
         cdotcsub_(&n, x, &incx, y, &incy, &ans);
     #endif
@@ -680,6 +697,8 @@ doublecomplex zdotc( int n, doublecomplex *x, int incx,  doublecomplex *y, int i
 
     #if defined( _WIN32 ) || defined( _WIN64 )
         ans = zdotc_(&n, x, &incx, y, &incy);
+    #elif defined(__APPLE__)
+        cblas_zdotc_sub(n, x, incx, y, incy, &ans);
     #else
         zdotcsub_(&n, x, &incx, y, &incy, &ans);
     #endif
@@ -829,42 +848,94 @@ int izamax( int n, doublecomplex *x, int incx)
 
 float snrm2( int n, float *x, int incx)
 {
+#ifdef __APPLE__
+    //On OSX passing negative values for incx can lead to a
+    //crash, so we catch it here (cf. Github issue #37).
+    if (n < 1 || incx < 1) {
+        return 0;
+    }
+    return cblas_snrm2(n, x, incx);
+#else
     return snrm2_(&n, x, &incx);
+#endif
 }
 
 double dnrm2( int n, double *x, int incx)
 {
+#ifdef __APPLE__
+    //On OSX passing negative values for incx can lead to a
+    //crash, so we catch it here (cf. Github issue #37).
+    if (n < 1 || incx < 1) {
+        return 0;
+    }
+    return cblas_dnrm2(n, x, incx);
+#else
     return dnrm2_(&n, x, &incx);
+#endif
 }
 
 float scnrm2( int n, complex *x, int incx)
 {
+#ifdef __APPLE__
+    //On OSX passing negative values for incx can lead to a
+    //crash, so we catch it here (cf. Github issue #37).
+    if (n < 1 || incx < 1) {
+        return 0;
+    }
+    return cblas_scnrm2(n, x, incx);
+#else
     return scnrm2_(&n, x, &incx);
+#endif
 }
 
 double dznrm2( int n, doublecomplex *x, int incx)
 {
+#ifdef __APPLE__
+    //On OSX passing negative values for incx can lead to a
+    //crash, so we catch it here (cf. Github issue #37).
+    if (n < 1 || incx < 1) {
+        return 0;
+    }
+    return cblas_dznrm2(n, x, incx);
+#else
     return dznrm2_(&n, x, &incx);
+#endif
 }
 
 float sasum( int n, float *x, int incx)
 {
+#ifdef __APPLE__
+    return cblas_sasum(n, x, incx);
+#else
     return sasum_(&n, x, &incx);
+#endif
 }
 
 double dasum( int n, double *x, int incx)
 {
+#ifdef __APPLE__
+    return cblas_dasum(n, x, incx);
+#else
     return dasum_(&n, x, &incx);
+#endif
 }
 
 float scasum( int n, complex *x, int incx)
 {
+#ifdef __APPLE__
+    return cblas_scasum(n, x, incx);
+#else
     return scasum_(&n, x, &incx);
+#endif
 }
 
 double dzasum( int n, doublecomplex *x, int incx)
 {
+#ifdef __APPLE__
+    return cblas_dzasum(n, x, incx);
+#else
     return dzasum_(&n, x, &incx);
+#endif
 }
 
 #endif
diff --git a/src/tests/correctness/blas-lapack.h b/src/tests/correctness/blas-lapack.h
index 6dc55ee3..d2db1aa3 100644
--- a/src/tests/correctness/blas-lapack.h
+++ b/src/tests/correctness/blas-lapack.h
@@ -1164,7 +1164,7 @@ void zcopy_(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy);
 float sdot_(int *n, float *x, int *incx, float* y, int *incy);
 double ddot_(int *n, double *x, int *incx, double* y, int *incy);
 
-#if defined( _WIN32 ) || defined( _WIN64 )
+#if defined( _WIN32 ) || defined( _WIN64 ) || defined( __APPLE__)
     complex cdotu_(int *n, complex *x, int *incx, complex* y, int *incy);
     doublecomplex zdotu_(int *n, doublecomplex *x, int *incx, doublecomplex* y, int *incy);
     complex cdotc_(int *n, complex *x, int *incx, complex* y, int *incy);
diff --git a/src/tests/correctness/corr-rotg.cpp b/src/tests/correctness/corr-rotg.cpp
index e26e7cd6..21ef905b 100644
--- a/src/tests/correctness/corr-rotg.cpp
+++ b/src/tests/correctness/corr-rotg.cpp
@@ -14,10 +14,6 @@
  * limitations under the License.
  * ************************************************************************/
 
-/***********************************************************************
-**  Rotgright (C) 2011 Advanced Micro Devices, Inc. All Rights Reserved.
-***********************************************************************/
-
 #include <stdlib.h>             // srand()
 #include <string.h>             // memcpy()
 #include <gtest/gtest.h>
diff --git a/src/tests/correctness/test-correctness.cpp b/src/tests/correctness/test-correctness.cpp
index 950382e9..7a1a0841 100644
--- a/src/tests/correctness/test-correctness.cpp
+++ b/src/tests/correctness/test-correctness.cpp
@@ -205,7 +205,11 @@ const ComplexLong alphaBetaRange[] = {{50,50}, {20,20}};
 const ComplexLong alphaBeta = {10,10};
 const ComplexLong sflagRange[] = {{-1,0}, {0,0}, {1,0}, {-2,0}};
 
+const ComplexLong rotCosMedium = {0, 3};
+const ComplexLong rotSinMedium = {0, 4};
 
+const ComplexLong rotCosShort = {1, 6};
+const ComplexLong rotSinShort = {1, 2};
 
 #ifdef DO_SPL
 
@@ -316,10 +320,10 @@ INSTANTIATE_TEST_CASE_P(ALL_ROTM, ROTM, Combine(
 #ifdef DO_ROT
 #if defined(SHORT_TESTS)
 INSTANTIATE_TEST_CASE_P(Small_ROT, ROT, Combine(
-        Values(1, 5, 10, 20), Values(1, 6), Values(1, -1), Values(1, 6), Values(1, -1), Values(1, 6), Values(1, 2), Values(1)));
+        Values(1, 5, 10, 20), Values(1, 6), Values(1, -1), Values(1, 6), Values(1, -1), Values(rotCosShort), Values(rotSinShort), Values(1)));
 #elif defined(MEDIUM_TESTS)
 INSTANTIATE_TEST_CASE_P(Medium_ROT, ROT, Combine(
-        Values(64,128,256,512), Values(0,3), Values(1, -3, 3, 1), Values(0,3), Values(1, -3, 3, 1), Values(0, 3), Values(0, 4), Values(1)));
+        Values(64,128,256,512), Values(0,3), Values(1, -3, 3, 1), Values(0,3), Values(1, -3, 3, 1), Values(rotCosMedium), Values(rotSinMedium), Values(1)));
 #else
 INSTANTIATE_TEST_CASE_P(ALL_ROT, ROT, Combine(
         ValuesIn(completeRange), ValuesIn(offsetRange), ValuesIn(incs), ValuesIn(offsetRange), ValuesIn(incs),
diff --git a/src/tests/include/BlasBase.h b/src/tests/include/BlasBase.h
index 1901afad..6c13e520 100644
--- a/src/tests/include/BlasBase.h
+++ b/src/tests/include/BlasBase.h
@@ -20,6 +20,7 @@
 
 #include <clBLAS.h>
 #include <common.h>
+#include <algorithm>
 
 #if _MSC_VER
 #pragma warning (disable:4127)
diff --git a/src/tests/include/blas-math.h b/src/tests/include/blas-math.h
index a7e3293e..784c44b6 100644
--- a/src/tests/include/blas-math.h
+++ b/src/tests/include/blas-math.h
@@ -20,10 +20,12 @@
 
 #if defined (_MSC_VER)
 
+#if( _MSC_VER <= 1700 )
 static unsigned long long ROW_NAN = 0x7ff0000000000000LL;
-static unsigned int ROW_NANF = 0x7fc00000;
-
 #define NAN *(reinterpret_cast<double*>(&ROW_NAN))
+#endif
+
+static unsigned int ROW_NANF = 0x7fc00000;
 #define NANF *(reinterpret_cast<float*>(&ROW_NANF))
 
 #else   /* _MSC_VER */
diff --git a/src/tests/include/timer.h b/src/tests/include/timer.h
index 29353ff8..41c8e275 100644
--- a/src/tests/include/timer.h
+++ b/src/tests/include/timer.h
@@ -27,6 +27,12 @@ extern "C" {
 typedef unsigned long long nano_time_t;
 #define NANOTIME_MAX (~0ULL - 1)
 
+#elif defined(__APPLE__)
+#include <stdint.h>
+
+typedef uint64_t nano_time_t;
+#define NANOTIME_MAX (UINT64_MAX - 1)
+
 #else
 
 typedef unsigned long nano_time_t;
diff --git a/src/tests/timer.c b/src/tests/timer.c
index e304f4f5..01844793 100644
--- a/src/tests/timer.c
+++ b/src/tests/timer.c
@@ -79,25 +79,39 @@ sleepTime(nano_time_t time) {
 
 #include <time.h>
 
-nano_time_t
-conv2nanosec(nano_time_t t)
-{
-    /* clock_... functions measure time in nanoseconds */
-    return t;
-}
+#if defined(__APPLE__) && defined(__MACH__)
 
-nano_time_t
-conv2microsec(nano_time_t t)
+#include <assert.h>
+#include <mach/mach.h>
+#include <mach/mach_time.h>
+#include <pthread.h>
+
+// see https://developer.apple.com/library/mac/qa/qa1398/_index.html
+static mach_timebase_info_data_t mtb_;
+
+static void
+init_timebase_conv_(void)
 {
-    return t/1000;
+    kern_return_t err;
+
+    err = mach_timebase_info(&mtb_);
+    assert(err == KERN_SUCCESS);
 }
 
 nano_time_t
-conv2millisec(nano_time_t t)
+getCurrentTime(void)
 {
-    return t/1000000;
+     static pthread_once_t once = PTHREAD_ONCE_INIT;
+     uint64_t              now;
+
+     pthread_once(&once, init_timebase_conv_);
+     now = mach_absolute_time();
+
+     return (now * mtb_.numer) / mtb_.denom;
 }
 
+#else /* ! (_MSC_VER || __APPLE__) */
+
 nano_time_t
 getCurrentTime(void)
 {
@@ -111,6 +125,29 @@ getCurrentTime(void)
     return 0;
 }
 
+#endif
+
+
+nano_time_t
+conv2nanosec(nano_time_t t)
+{
+    /* clock_... functions measure time in nanoseconds */
+    return t;
+}
+
+nano_time_t
+conv2microsec(nano_time_t t)
+{
+    return t/1000;
+}
+
+nano_time_t
+conv2millisec(nano_time_t t)
+{
+    return t/1000000;
+}
+
+
 void
 sleepTime(nano_time_t time) {
     struct timespec t1;
diff --git a/src/wrappers/python/README.txt b/src/wrappers/python/README.txt
new file mode 100644
index 00000000..10c3cf8c
--- /dev/null
+++ b/src/wrappers/python/README.txt
@@ -0,0 +1,59 @@
+pyclBLAS setup and installation
+(I've been pronouncing it 'pickleBLAS')
+------------------------------------------------------------------------
+A python extension wrapper around clBLAS from https://github.com/clMathLibraries/clBLAS
+
+Dependencies:
+1.  clBLAS from https://github.com/clMathLibraries/clBLAS ( develop branch )
+2.  PyOpenCL from http://mathema.tician.de/software/pyopencl/ ( 2013.2 minimum )
+3.  Cython from http://cython.org/, ( 0.18 minimum )
+4.  OpenCL runtime, such as AMD's catalyst package ( AMD v2.9 SDK tested )
+
+NOTE:  This has been tested with 32-bit python on windows & 64-bit on OpenSUSE
+
+NOTE:  Only sgemm has been wrapped as proof-of-concept
+
+Build steps:
+------------------------------------------------------------------------
+1.  First, clone the clBLAS repo from github and make sure to build the 
+'install' step.  This is either 'make install' on linux derivatives or 
+the 'install' project on Visual Studio projects.  This should produce a 
+'package' directory in your build tree that contains ./include, ./libXX & 
+./bin.  
+
+Note:  it is necessary to build 32-bit clBLAS if using 32-bit python,
+and 64-bit clBLAS for 64-bit python.
+
+2.  Install pyopencl.  If your python distribution contains a version 
+of pyopencl that is a minimum of 2013.2, then just install with the 
+distribution's package manager like pypm, pip, easy_install.  If not, download
+pyopencl yourself and follow its directions to build and install.
+
+3.  Install Cython.  If your python distribution contains a version 
+of cython that is a minimum of 0.18, then just install with the
+distribution's package manager like pypm, pip, easy_install.  If not,
+download cython yourself and follow its directions to build and install.
+
+4.  An OpenCL SDK is required to build, which includes OpenCL header files
+and linkable libraries.  One such SDK is the AMD APP SDK, which can be 
+downloaded from http://developer.amd.com/tools-and-sdks/heterogeneous-computing/amd-accelerated-parallel-processing-app-sdk/
+
+5.  Build the pyclBLAS extension.  This is accomplished by running setup.py,
+which acts as a python makefile.  An example install command: 
+'python setup.py --clBlasRoot=F:\code\GitHub\clMathLibraries\bin\clBLAS\develop\vs11x32\package build_ext --inplace'
+
+'python setup.py --help' prints additional command line parameters that extend 
+the traditional distutils options.  After successfully building the extension
+module, a pyclBLAS.pyd file appears.  As shown above, it may be necessary to provide
+the setup makefile with the paths of the clBLAS 'package' directory and the 
+OpenCL SDK directory.  Setup.py does attempt to find the OpenCL SDK through 
+the environment variable AMDAPPSDKROOT or OPENCL_ROOT.
+
+NOTE:  On windows, if using a more recent version of visual studio than 2008, 
+it may be necessary to trick python to using the newer version of your compiler, 
+by creating an environment variable that it expects to exist as such:
+set VS90COMNTOOLS=%VS110COMNTOOLS%
+    
+NOTE: It may be necessary to copy the clBLAS shared library into 
+the same directory as the extension module so that it can find
+clBLAS at runtime
diff --git a/src/wrappers/python/pyclBLAS.pxd b/src/wrappers/python/pyclBLAS.pxd
new file mode 100644
index 00000000..b3c4e8ec
--- /dev/null
+++ b/src/wrappers/python/pyclBLAS.pxd
@@ -0,0 +1,85 @@
+################################################################################
+ # Copyright 2014 Advanced Micro Devices, Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+################################################################################
+
+# This pxd file defines all the enums and structs that we plan to use from
+# python.  It is used from pyclBLAS.pyx
+from libc.stdint cimport intptr_t, uintptr_t
+
+cdef extern from "clBLAS.h":
+    # These are base OpenCL enumerations that clBLAS uses
+    cdef enum:
+        CL_SUCCESS                      = 0
+        CL_INVALID_VALUE                = -30
+        CL_INVALID_COMMAND_QUEUE        = -36
+        CL_INVALID_CONTEXT              = -34
+        CL_INVALID_MEM_OBJECT           = -38
+        CL_INVALID_DEVICE               = -33
+        CL_INVALID_EVENT_WAIT_LIST      = -57
+        CL_OUT_OF_RESOURCES             = -5
+        CL_OUT_OF_HOST_MEMORY           = -6
+        CL_INVALID_OPERATION            = -59
+        CL_COMPILER_NOT_AVAILABLE       = -3
+        CL_BUILD_PROGRAM_FAILURE        = -11
+
+    cdef enum clblasStatus_:
+        clblasSuccess               = CL_SUCCESS
+        clblasInvalidValue          = CL_INVALID_VALUE
+        clblasInvalidCommandQueue   = CL_INVALID_COMMAND_QUEUE
+        clblasInvalidContext        = CL_INVALID_CONTEXT
+        clblasInvalidMemObject      = CL_INVALID_MEM_OBJECT
+        clblasInvalidDevice         = CL_INVALID_DEVICE
+        clblasInvalidEventWaitList  = CL_INVALID_EVENT_WAIT_LIST
+        clblasOutOfResources        = CL_OUT_OF_RESOURCES
+        clblasOutOfHostMemory       = CL_OUT_OF_HOST_MEMORY
+        clblasInvalidOperation      = CL_INVALID_OPERATION
+        clblasCompilerNotAvailable  = CL_COMPILER_NOT_AVAILABLE
+        clblasBuildProgramFailure   = CL_BUILD_PROGRAM_FAILURE
+        clblasNotImplemented        = -1024
+        clblasNotInitialized        = -1023
+        clblasInvalidMatA
+        clblasInvalidMatB
+        clblasInvalidMatC
+        clblasInvalidVecX
+        clblasInvalidVecY
+        clblasInvalidDim
+        clblasInvalidLeadDimA
+        clblasInvalidLeadDimB
+        clblasInvalidLeadDimC
+        clblasInvalidIncX
+        clblasInvalidIncY
+        clblasInsufficientMemMatA
+        clblasInsufficientMemMatB
+        clblasInsufficientMemMatC
+        clblasInsufficientMemVecX
+        clblasInsufficientMemVecY
+    ctypedef clblasStatus_ clblasStatus
+
+    cdef enum clblasOrder_:  # row-major / column-major selector values
+        clblasRowMajor             = 0
+        clblasColumnMajor          = 1
+    ctypedef clblasOrder_ clblasOrder  # alias this enum (not clblasStatus_)
+
+    cdef enum clblasTranspose_:  # transpose selector values
+        clblasNoTrans             = 0
+        clblasTrans               = 1
+        clblasConjTrans           = 2
+    ctypedef clblasTranspose_ clblasTranspose  # alias this enum (not clblasStatus_)
+
+    ctypedef unsigned int cl_uint
+    ctypedef float cl_float
+    ctypedef void* cl_mem
+    ctypedef void* cl_command_queue
+    ctypedef void* cl_event
diff --git a/src/wrappers/python/pyclBLAS.pyx b/src/wrappers/python/pyclBLAS.pyx
new file mode 100644
index 00000000..6e944c47
--- /dev/null
+++ b/src/wrappers/python/pyclBLAS.pyx
@@ -0,0 +1,117 @@
+################################################################################
+ # Copyright 2014 Advanced Micro Devices, Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+################################################################################
+
+cimport pyclBLAS
+import pyopencl
+
+# These are prototypes from clBLAS.h that we wish to call from python
+################################################################################
+################################################################################
+cdef extern from "clBLAS.h":
+   # Library setup / teardown pair
+   clblasStatus clblasSetup( )
+   void clblasTeardown( )
+
+   # Version query; the three components come back through the pointers
+   clblasStatus clblasGetVersion( cl_uint* major, cl_uint* minor, cl_uint* patch )
+
+   # Single precision GEMM; one argument group per line
+   clblasStatus clblasSgemm(
+                clblasOrder order, clblasTranspose transA, clblasTranspose transB,
+                size_t M, size_t N, size_t K,
+                cl_float alpha,
+                const cl_mem A, size_t offA, size_t lda,
+                const cl_mem B, size_t offB, size_t ldb,
+                cl_float beta,
+                cl_mem C, size_t offC, size_t ldc,
+                cl_uint numCommandQueues, cl_command_queue* commandQueues,
+                cl_uint numEventsInWaitList, const cl_event* eventWaitList,
+                cl_event* events )
+
+################################################################################
+################################################################################
+# enums to be accessed from python
+# TODO:  is there a better way to express enums?  I like how pyopencl does it,
+# they have layers of scoped constants cl.mem_flags.READ_ONLY
+# The enums below have global scope
+# Storage-order aliases
+RowMajor, ColumnMajor = pyclBLAS.clblasRowMajor, pyclBLAS.clblasColumnMajor
+# Transpose aliases
+NoTrans, Trans, ConjTrans = ( pyclBLAS.clblasNoTrans,
+                              pyclBLAS.clblasTrans,
+                              pyclBLAS.clblasConjTrans )
+
+################################################################################
+################################################################################
+# The following functions are the python callable wrapper implementations
+# Calls clblasSetup( ) and hands its status back to the caller.  The status is
+# always clblasSuccess on a normal return; any other status is turned into a
+# RuntimeError.
+def Setup( ):
+   status = clblasSetup( )
+   if( status == clblasSuccess ):
+      return status
+   raise RuntimeError( "clblasSetup( ) failed initialization" )
+
+################################################################################
+# Calls clblasTeardown( ); the C function returns void, so the Python caller
+# receives None.
+def Teardown( ):
+   clblasTeardown( )
+
+################################################################################
+# Queries clblasGetVersion( ) and returns the tuple ( major, minor, patch ).
+# Raises RuntimeError when the call does not report clblasSuccess.
+def GetVersion( ):
+   # C locals that clblasGetVersion( ) fills in through their addresses
+   cdef pyclBLAS.cl_uint verMajor, verMinor, verPatch
+   status = clblasGetVersion( &verMajor, &verMinor, &verPatch )
+   if( status == clblasSuccess ):
+      return verMajor, verMinor, verPatch
+   raise RuntimeError( "clblasGetVersion( ) did not return version information" )
+
+################################################################################
+# TODO:  Is there a way to template these python callable functions, such that we
+# do not need to make a new function for every supported precision?
+# Python callable wrapper around clblasSgemm( ).
+#
+# The scalar arguments are forwarded to clblasSgemm( ) unchanged.  A, B, C,
+# commandQueues and eventWaitList must each expose an int_ptr attribute holding
+# the wrapped OpenCL handle as an integer (pyopencl objects, per the comments
+# below).  commandQueues is a single queue object and eventWaitList a single
+# event object, not lists.
+#
+# Raises IndexError when numCommandQueues != 1 or numEventsInWaitList > 1, and
+# RuntimeError when clblasSgemm( ) does not return clblasSuccess.
+# Returns a pyopencl.Event built from the event clblasSgemm( ) hands back.
+def Sgemm( clblasOrder order, clblasTranspose transA, clblasTranspose transB,
+                size_t M, size_t N, size_t K, cl_float alpha, A, size_t offA, size_t lda,
+                B, size_t offB, size_t ldb, cl_float beta, C, size_t offC, size_t ldc,
+                cl_uint numCommandQueues, commandQueues, cl_uint numEventsInWaitList,
+                eventWaitList ):
+
+   # Simplify python wrapper to only handle 1 queue at this time
+   if( numCommandQueues != 1 ):
+      raise IndexError( "pyblasSgemm( ) requires the number of queues to be 1" )
+   cdef intptr_t pIntQueue = commandQueues.int_ptr
+   cdef cl_command_queue pcqQueue = <cl_command_queue>pIntQueue
+
+   # Only 0 or 1 wait events are supported for now; handling more requires
+   # figuring out how python & pyopencl pass lists of objects.
+   # int_ptr is the event handle itself, whereas clblasSgemm( ) takes a pointer
+   # to an array of handles.  The handle is therefore stored in a local and the
+   # address of that local is passed; casting the handle to cl_event* would
+   # make clBLAS read the event object's memory as if it were the array.
+   cdef intptr_t pIntWaitList = 0
+   cdef cl_event waitEvent = NULL
+   cdef cl_event* pWaitList = NULL
+   if( numEventsInWaitList > 0 ):
+      if( numEventsInWaitList < 2 ):
+         pIntWaitList = eventWaitList.int_ptr
+         waitEvent = <cl_event>pIntWaitList
+         pWaitList = &waitEvent
+      else:
+         raise IndexError( "pyblasSgemm( ) requires numEventsInWaitList to be <= 1" )
+
+   # Pyopencl objects contain an int_ptr method to get access to the internally wrapped
+   # OpenCL object pointers
+   cdef cl_event outEvent = NULL
+   cdef intptr_t matA = A.int_ptr
+   cdef intptr_t matB = B.int_ptr
+   cdef intptr_t matC = C.int_ptr
+
+   # Transition execution to clBLAS
+   cdef clblasStatus result = clblasSgemm( order, transA, transB, M, N, K, alpha, <const cl_mem>matA, offA, lda,
+                         <const cl_mem>matB, offB, ldb, beta, <cl_mem>matC, offC, ldc,
+                         numCommandQueues, &pcqQueue, numEventsInWaitList,
+                         pWaitList, &outEvent )
+
+   if( result != clblasSuccess ):
+      raise RuntimeError( "clBLAS sgemm call failed" )
+
+   # Create a pyopencl Event object from the event returned from clBLAS and return
+   # it to the user
+   # NOTE(review): whether from_int_ptr( ) takes an extra reference on the event
+   # depends on the pyopencl version -- confirm the ownership of outEvent
+   sgemmEvent = pyopencl.Event.from_int_ptr( <intptr_t>outEvent )
+   return sgemmEvent
diff --git a/src/wrappers/python/setup.py b/src/wrappers/python/setup.py
new file mode 100644
index 00000000..7092714c
--- /dev/null
+++ b/src/wrappers/python/setup.py
@@ -0,0 +1,107 @@
+################################################################################
+ # Copyright 2014 Advanced Micro Devices, Inc.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+################################################################################
+
+from distutils.core import setup
+from distutils.extension import Extension
+from Cython.Distutils import build_ext
+from os import path, environ
+import argparse
+import platform
+
+def main():
+   parser = argparse.ArgumentParser(description='Set up the pyclBLAS extension module')
+   parser.add_argument('--clRoot',
+     dest='clRoot', default=None,
+     help='Root directory to find the OpenCL SDK, which should contain the include directory')
+   parser.add_argument('--clBlasRoot',
+     dest='clBlasRoot', default=None,
+     help='Root directory to find the clBLAS SDK, which should contain the include directory')
+
+   args, unknown_args = parser.parse_known_args( )
+
+##    print( "recognized args: ", args )
+##    print( "unknown args: ", unknown_args )
+
+   # First check environment variables for clRoot paths
+   clRootPath = None
+   if( environ.get('OPENCL_ROOT') is not None ):
+     clRootPath = environ['OPENCL_ROOT']
+
+   # Special check for environment variable set by AMD Catalyst installer
+   if( clRootPath is None and environ.get( 'AMDAPPSDKROOT' ) is not None ):
+     clRootPath = environ['AMDAPPSDKROOT']
+
+   # If user specifies a command line options, this trumps environment variables
+   print( "args.clRoot: ", args.clRoot )
+   if( args.clRoot is not None ):
+     clRootPath = args.clRoot
+
+   if( clRootPath is None ):
+     print( "This setup.py needs to know the root path of an OpenCL installation")
+     print( "Please specify the environment variable OPENCL_ROOT with a path" )
+     print( "Or pass the command line option --clRoot" )
+     exit( )
+
+   # First check environment variables for clRoot paths
+   clBlasRootPath = None
+   if( environ.get('CLBLAS_ROOT') is not None ):
+     clBlasRootPath = environ['CLBLAS_ROOT']
+
+   # If user specifies a command line options, this trumpts environment variables
+   print( "args.clBlasRoot: ", args.clBlasRoot )
+   if( args.clBlasRoot is not None ):
+     clBlasRootPath = args.clBlasRoot
+
+   if( clBlasRootPath is None ):
+     print( "This setup.py needs to know the root path of the clBLAS installation")
+     print( "Please specify the environment variable CLBLAS_ROOT with a path" )
+     print( "or pass the command line option --clBlasRoot" )
+     exit( )
+
+   # 64bit and 32bit have different library paths
+   if( platform.architecture( )[0] == '64bit' ):
+     libraryPath = 'lib64'
+   else:
+     libraryPath = 'lib'
+
+   # Windows and linux have different library paths
+   if( platform.system( ) == 'Windows' ):
+     libraryPath = path.join( libraryPath, 'import' )
+
+   module = [
+     Extension( name = 'pyclBLAS',
+               sources = ['pyclBLAS.pyx'],
+               include_dirs = [ path.join( clRootPath, 'include' ),
+                                path.join( clBlasRootPath, 'include' ) ],
+               library_dirs = [ path.join( clBlasRootPath, libraryPath ) ],
+               libraries=['clBLAS'] )
+   ]
+
+   setup(
+      name = 'pyclBLAS',
+      version = '0.0.1',
+      author = 'Kent Knox',
+      description = 'Python wrapper for clBLAS',
+      license = 'Apache License, Version 2.0',
+      cmdclass = {"build_ext": build_ext},
+      ext_modules = module,
+      script_args = unknown_args
+   )
+
+# This is the start of the execution of the python script
+# Useful for debuggers to step into script
+if __name__ == '__main__':
+    main( )