diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 00000000..2d30e3b3
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,44 @@
+language: cpp
+
+compiler:
+ - gcc
+
+before_install:
+ - sudo apt-get update -qq
+ - sudo apt-get install -qq fglrx opencl-headers libboost-program-options-dev libgtest-dev
+# Uncomment below to help verify the installs above work
+# - ls -la /usr/lib/libboost*
+# - ls -la /usr/include/boost
+# - ls -la /usr/src/gtest
+
+install:
+ - mkdir -p bin/gTest
+ - cd bin/gTest
+ - cmake -DCMAKE_BUILD_TYPE=Release /usr/src/gtest
+ - make
+ - sudo mv libg* /usr/lib
+
+before_script:
+ - cd ${TRAVIS_BUILD_DIR}
+ - mkdir -p bin/clBLAS
+ - cd bin/clBLAS
+ - cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_TEST=OFF -DBUILD_CLIENT=ON ../../src
+
+script:
+ - make install
+# - ls -Rla package
+# Run a simple test to validate that the build works; CPU device in a VM
+ - cd package/bin
+ - export LD_LIBRARY_PATH=${TRAVIS_BUILD_DIR}/bin/clBLAS/package/lib64:${LD_LIBRARY_PATH}
+ - ./client --cpu
+
+after_success:
+ - cd ${TRAVIS_BUILD_DIR}/bin/clBLAS
+ - make package
+
+notifications:
+ email:
+ - clmath-developers@googlegroups.com
+ on_success: change
+ on_failure: always
+
\ No newline at end of file
diff --git a/CHANGELOG b/CHANGELOG
index 9cd3d900..03b9faff 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -243,34 +243,3 @@ For example:
./example_sgemm
- Run a simple client; one example is provided for each supported main
BLAS function family.
-_______________________________________________________________________________
-(C) 2010-2013 Advanced Micro Devices, Inc. All rights reserved. AMD, the AMD
-Arrow logo, ATI, the ATI logo, Radeon, FireStream, FireGL, Catalyst, and
-combinations thereof are trademarks of Advanced Micro Devices, Inc. Microsoft
-(R), Windows, and Windows Vista (R) are registered trademarks of Microsoft
-Corporation in the U.S. and/or other jurisdictions. OpenCL and the OpenCL logo
-are trademarks of Apple Inc. used by permission by Khronos. Other names are for
-informational purposes only and may be trademarks of their respective owners.
-
-The contents of this document are provided in connection with Advanced Micro
-Devices, Inc. ("AMD") products. AMD makes no representations or warranties with
-respect to the accuracy or completeness of the contents of this publication and
-reserves the right to make changes to specifications and product descriptions
-at any time without notice. The information contained herein may be of a
-preliminary or advance nature and is subject to change without notice. No
-license, whether express, implied, arising by estoppel or otherwise, to any
-intellectual property rights is granted by this publication. Except as set forth
-in AMD's Standard Terms and Conditions of Sale, AMD assumes no liability
-whatsoever, and disclaims any express or implied warranty, relating to its
-products including, but not limited to, the implied warranty of
-merchantability, fitness for a particular purpose, or infringement of any
-intellectual property right.
-
-AMD's products are not designed, intended, authorized or warranted for use as
-components in systems intended for surgical implant into the body, or in other
-applications intended to support or sustain life, or in any other application
-in which the failure of AMD's product could create a situation where personal
-injury, death, or severe property or environmental damage may occur. AMD
-reserves the right to discontinue or make changes to its products at any time
-without notice.
-_______________________________________________________________________________
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 61932f1f..0dc5c7e8 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -8,7 +8,7 @@ Firstly, in order to contribute code to this project, a contributor must have a
* After forking, the contributor [clones their repository](https://help.github.com/articles/create-a-repo) locally on their machine
* Code is developed and checked into the contributor's repository. These commits are eventually pushed upstream to their GitHub repository
* The contributor then issues a [pull-request](https://help.github.com/articles/using-pull-requests) against the **develop** branch of this repository, which is the [git flow](http://nvie.com/posts/a-successful-git-branching-model/) workflow which is well suited for working with GitHub
- * A [git extention](https://github.com/nvie/gitflow) has been developed to ease the use of the 'git flow' methodology, but requires manual installation by the user. Refer to the projects wiki
+ * A [git extension](https://github.com/nvie/gitflow) has been developed to ease the use of the 'git flow' methodology, but requires manual installation by the user. Refer to the projects wiki
At this point, the repository maintainers will be notified by GitHub that a 'pull request' exists pending against their repository. A code review should be completed within a few days, depending on the scope of submitted code, and the code will either be accepted, rejected or commented on for extra feedback.
@@ -32,5 +32,5 @@ guidelines over time
Pull requests will be reviewed by the set of collaborators that are assigned for the repository. Pull requests may be accepted, declined or a conversation may start on the pull request thread with feedback. If the pull request is trivial and all the submission guidelines defined above are honored, the pull request may be accepted without delay. If the pull request is good, but the guidelines defined above are not followed, the collaborators may leave feedback on the pull request and engage in a conversation with the contributor with what they can do to improve the pull request. At any time, collaborators may decline a pull request if they decide the contribution is not appropriate for the project, or the feedback from reviewers on a pull request is not being addressed in an appropriate amount of time.
## Is it possible to become an official collaborator of the repository?
-Yes, we hope to promote trusted members of the community, who have proven themselves to be competent and request to take on the extra responsibility to be official collaborators of the project. When an individual requests to be an official collaborator, current project collaborators will browse through the history of the requester's prior pull requests and take a vote amongst themselves if the requester should be promoted to collaborator. These individuals will then have the right to approve/decline pull requests and help shape the path that the project goes. It is worth noting, that on GitHub everybody has read-only access to the source and that everybody has the ability to issue a pull request to contribute to the project. The benefit of being a repository collaborator allows you to be able to be able to manage other peoples pull requests.
+Yes, we hope to promote trusted members of the community, who have proven themselves to be competent and request to take on the extra responsibility to be official collaborators of the project. When an individual requests to be an official collaborator, current project collaborators will browse through the history of the requester's prior pull requests and take a vote amongst themselves if the requester should be promoted to collaborator. These individuals will then have the right to approve/decline pull requests and help shape the path that the project goes. It is worth noting, that on GitHub everybody has read-only access to the source and that everybody has the ability to issue a pull request to contribute to the project. The benefit of being a repository collaborator allows you to be able to manage other peoples pull requests.
diff --git a/README.md b/README.md
index dfdaa645..5f0338e9 100644
--- a/README.md
+++ b/README.md
@@ -1,78 +1,110 @@
clBLAS
=====
+[![Build Status](https://travis-ci.org/clMathLibraries/clBLAS.png)](https://travis-ci.org/clMathLibraries/clBLAS)
+
+
+This repository houses the code for the OpenCL™ BLAS portion of clMath.
+The complete set of BLAS level 1, 2 & 3 routines is implemented. Please
+see Netlib BLAS for the list of supported routines. In addition to GPU
+devices, the library also supports running on CPU devices to facilitate
+debugging and multicore programming. APPML 1.10 is the most current
+generally available pre-packaged binary version of the library available
+for download for both Linux and Windows platforms.
+
+The primary goal of clBLAS is to make it easier for developers to
+utilize the inherent performance and power efficiency benefits of
+heterogeneous computing. clBLAS interfaces do not hide nor wrap OpenCL
+interfaces, but rather leaves OpenCL state management to the control of
+the user to allow for maximum performance and flexibility. The clBLAS
+library does generate and enqueue optimized OpenCL kernels, relieving
+the user from the task of writing, optimizing and maintaining kernel
+code themselves.
-clMATH is a software library containing FFT and BLAS functions written in OpenCL. In addition to GPU devices, the libraries also support running on CPU devices to facilitate debugging and multicore programming.
+## clBLAS library user documentation
-APPML 1.10 is the most current generally available version of the library, and pre-built binaries are available for download on both Linux and Windows platforms.
+[Library and API documentation][] for developers is available online as
+a GitHub Pages website
-This repository houses the code for the OpenCL™ BLAS portion of APPML. The complete set of BLAS level 1, 2 & 3 routines has been implemented. Please see Netlib BLAS for the list of routines. For more information on supported graphics cards, see the AMD APP System Requirements.
+### Google Groups
-The primary goal of clBLAS is to make it easier for developers to utilize the inherent performance and power efficiency benefits of heterogeneous computing. clBLAS interfaces do not hide nor wrap OpenCL interfaces, but rather leaves OpenCL state management to the control of the user to allow for maximum performance and flexibility. The clBLAS library does generate and enqueue optimized OpenCL kernels, relieving the user from the task of writing, optimizing and maintaining kernel code themselves.
+Two mailing lists have been created for the clMath projects:
-## clBLAS library user documentation
-[Library and API documentation]( http://clmathlibraries.github.io/clBLAS/ ) for developers is available online as a GitHub Pages website
+- [clmath@googlegroups.com][] - group whose focus is to answer
+ questions on using the library or reporting issues
+
+- [clmath-developers@googlegroups.com][] - group whose focus is for
+ developers interested in contributing to the library code itself
## clBLAS Wiki
-The [project wiki](https://github.com/clMathLibraries/clBLAS/wiki) contains helpful documentation, including a [build primer](https://github.com/clMathLibraries/clBLAS/wiki/Build)
+
+The [project wiki][] contains helpful documentation, including a [build
+primer][]
## Contributing code
-Please refer to and read the [Contributing](CONTRIBUTING.md) document for guidelines on how to contribute code to this open source project
+
+Please refer to and read the [Contributing][] document for guidelines on
+how to contribute code to this open source project. The code in the
+/master branch is considered to be stable, and all pull-requests should
+be made against the /develop branch.
## License
-The source for clFFT is licensed under the [Apache License, Version 2.0]( http://www.apache.org/licenses/LICENSE-2.0 )
+
+The source for clBLAS is licensed under the [Apache License, Version
+2.0][]
## Example
-The simple example below shows how to use clBLAS to compute an OpenCL accelerated SGEMM
-```c
-#include
-#include
+The simple example below shows how to use clBLAS to compute an OpenCL
+accelerated SGEMM
-/* Include the clBLAS header. It includes the appropriate OpenCL headers
+ #include
+ #include
+
+ /* Include the clBLAS header. It includes the appropriate OpenCL headers
*/
-#include
+ #include
-/* This example uses predefined matrices and their characteristics for
+ /* This example uses predefined matrices and their characteristics for
* simplicity purpose.
*/
-#define M 4
-#define N 3
-#define K 5
+ #define M 4
+ #define N 3
+ #define K 5
-static const cl_float alpha = 10;
+ static const cl_float alpha = 10;
-static const cl_float A[M*K] = {
+ static const cl_float A[M*K] = {
11, 12, 13, 14, 15,
21, 22, 23, 24, 25,
31, 32, 33, 34, 35,
41, 42, 43, 44, 45,
-};
-static const size_t lda = K; /* i.e. lda = K */
+ };
+ static const size_t lda = K; /* i.e. lda = K */
-static const cl_float B[K*N] = {
+ static const cl_float B[K*N] = {
11, 12, 13,
21, 22, 23,
31, 32, 33,
41, 42, 43,
51, 52, 53,
-};
-static const size_t ldb = N; /* i.e. ldb = N */
+ };
+ static const size_t ldb = N; /* i.e. ldb = N */
-static const cl_float beta = 20;
+ static const cl_float beta = 20;
-static cl_float C[M*N] = {
+ static cl_float C[M*N] = {
11, 12, 13,
21, 22, 23,
31, 32, 33,
41, 42, 43,
-};
-static const size_t ldc = N; /* i.e. ldc = N */
+ };
+ static const size_t ldc = N; /* i.e. ldc = N */
-static cl_float result[M*N];
+ static cl_float result[M*N];
-int main( void )
-{
+ int main( void )
+ {
cl_int err;
cl_platform_id platform = 0;
cl_device_id device = 0;
@@ -138,25 +170,48 @@ int main( void )
clReleaseContext( ctx );
return ret;
-}
-```
+ }
## Build dependencies
+
### Library for Windows
-* Windows® 7/8
-* Visual Studio 2010 SP1
-* An OpenCL SDK, such as APP SDK 2.8
-* Latest CMake
+
+- Windows® 7/8
+
+- Visual Studio 2010 SP1, 2012
+
+- An OpenCL SDK, such as APP SDK 2.9
+
+- Latest CMake
### Library for Linux
-* GCC 4.6 and onwards
-* An OpenCL SDK, such as APP SDK 2.8
-* Latest CMake
+
+- GCC 4.6 and onwards
+
+- An OpenCL SDK, such as APP SDK 2.9
+
+- Latest CMake
+
+### Library for Mac OSX
+
+- Recommended to generate Unix makefiles with cmake
### Test infrastructure
-* Latest Googletest
-* Latest ACML
-* Latest Boost
+
+- Googletest v1.6
+
+- ACML on windows/linux; Accelerate on Mac OSX
+
+- Latest Boost
### Performance infrastructure
-* Python
\ No newline at end of file
+
+- Python
+
+ [Library and API documentation]: http://clmathlibraries.github.io/clBLAS/
+ [clmath@googlegroups.com]: https://groups.google.com/forum/#!forum/clmath
+ [clmath-developers@googlegroups.com]: https://groups.google.com/forum/#!forum/clmath-developers
+ [project wiki]: https://github.com/clMathLibraries/clBLAS/wiki
+ [build primer]: https://github.com/clMathLibraries/clBLAS/wiki/Build
+ [Contributing]: CONTRIBUTING.md
+ [Apache License, Version 2.0]: http://www.apache.org/licenses/LICENSE-2.0
diff --git a/doc/clBLAS.doxy b/doc/clBLAS.doxy
index 86fbbfc4..afc15ae0 100644
--- a/doc/clBLAS.doxy
+++ b/doc/clBLAS.doxy
@@ -52,7 +52,7 @@ PROJECT_LOGO =
# If a relative path is entered, it will be relative to the location
# where doxygen was started. If left blank the current directory will be used.
-OUTPUT_DIRECTORY = F:\code\git-svn\clBLAS.head\bin\master\vs10x64.superbuild\docs
+OUTPUT_DIRECTORY = ..\..\bin\clBLAS.doxy
# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
# 4096 sub-directories (in 2 levels) under the output directory of each output
@@ -651,17 +651,17 @@ WARN_LOGFILE =
# directories like "/usr/src/myproject". Separate the files or directories
# with spaces.
-INPUT = clBLAS.h \
- include/cltypes.h \
- include/kerngen.h \
- include/solver.h \
- include/mempat.h \
- src/blas/gens/blas_kgen.h \
- src/blas/include/clblas-internal.h \
- src/blas/include/kernel_extra.h \
- src/blas/include/solution_seq.h \
- include/granulation.h \
- src/tools/ktest/step.h
+INPUT = ../src/clBLAS.h \
+ ../src/include/cltypes.h \
+ ../src/include/kerngen.h \
+ ../src/include/solver.h \
+ ../src/include/mempat.h \
+ ../src/library/gens/blas_kgen.h \
+ ../src/library/include/clblas-internal.h \
+ ../src/library/include/kernel_extra.h \
+ ../src/library/include/solution_seq.h \
+ ../src/include/granulation.h \
+ ../src/library/tools/ktest/step.h
# This tag can be used to specify the character encoding of the source files
# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
@@ -721,7 +721,7 @@ EXCLUDE_SYMBOLS =
# directories that contain example code fragments that are included (see
# the \include command).
-EXAMPLE_PATH = samples
+EXAMPLE_PATH = ../src/samples
# If the value of the EXAMPLE_PATH tag contains directories, you can use the
# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 6602b795..41b54ab6 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -14,7 +14,7 @@
# limitations under the License.
# ########################################################################
-cmake_minimum_required(VERSION 2.6)
+cmake_minimum_required(VERSION 2.8)
#User toggle-able options that can be changed on the command line with -D
option( BUILD_RUNTIME "Build the BLAS runtime library" ON )
@@ -28,26 +28,39 @@ option( BUILD_KTEST "A command line tool for testing single clBLAS kernel" ON )
# However, test-correctness can instead use NETLIB as a reference library
set(CORR_TEST_WITH_ACML ON CACHE BOOL "Use ACML library in correctness tests")
-# uncomment these to print compiler invocation lines for nmake files
-# set( CMAKE_START_TEMP_FILE "" )
-# set( CMAKE_END_TEMP_FILE "" )
-# set( CMAKE_VERBOSE_MAKEFILE 1 )
+if( CMAKE_GENERATOR MATCHES "NMake" )
+ option( NMAKE_COMPILE_VERBOSE "Print compile and link strings to the console" OFF )
+ if( NMAKE_COMPILE_VERBOSE )
+ set( CMAKE_START_TEMP_FILE "" )
+ set( CMAKE_END_TEMP_FILE "" )
+ set( CMAKE_VERBOSE_MAKEFILE 1 )
+ endif( )
+endif( )
# If we are on linux, and we wish to link with the netlib BLAS implementation, we need to have a valid fortran compiler
-if( NOT CORR_TEST_WITH_ACML AND NOT WIN32)
+if( NOT CORR_TEST_WITH_ACML AND NOT WIN32 AND NOT APPLE )
project(clBLAS Fortran C CXX )
else( )
project(clBLAS C CXX)
endif( )
# Define a version for the code
-set( clBLAS_VERSION_MAJOR 2 )
-set( clBLAS_VERSION_MINOR 0 )
-set( clBLAS_VERSION_PATCH 0 )
+if( NOT DEFINED clBLAS_VERSION_MAJOR )
+ set( clBLAS_VERSION_MAJOR 2 )
+endif( )
+
+if( NOT DEFINED clBLAS_VERSION_MINOR )
+ set( clBLAS_VERSION_MINOR 2 )
+endif( )
+
+if( NOT DEFINED clBLAS_VERSION_PATCH )
+ set( clBLAS_VERSION_PATCH 0 )
+endif( )
+
set( clBLAS_VERSION "${clBLAS_VERSION_MAJOR}.${clBLAS_VERSION_MINOR}.${clBLAS_VERSION_PATCH}")
# Increment this if we break backward compatibility.
-set(clBLAS_SOVERSION 1)
+set( clBLAS_SOVERSION 2 )
# We have custom written Find* modules now in the root source directory
set( CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${PROJECT_SOURCE_DIR} )
@@ -58,24 +71,34 @@ if( CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT )
set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" FORCE )
endif( )
-set( ACMLROOT $ENV{ACMLROOT} CACHE PATH "AMD ACML root path")
-
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Debug CACHE STRING
"Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel."
FORCE)
endif()
+# These variables are meant to contain string which should be appended to the installation paths
+# of library and executable binaries, respectively. They are meant to be user configurable/overridable.
+set( SUFFIX_LIB_DEFAULT "" )
+set( SUFFIX_BIN_DEFAULT "" )
+
if(TARGET_PLATFORM EQUAL 32 OR TARGET_PLATFORM EQUAL 64)
set(TARGET_PLATFORM ${TARGET_PLATFORM} CACHE STRING "Target platform type (32-bit or 64-bit)" FORCE)
+ if( TARGET_PLATFORM EQUAL 64 )
+ set( SUFFIX_LIB_DEFAULT "64" )
+ endif( )
else()
if(CMAKE_SIZEOF_VOID_P MATCHES 8)
set(TARGET_PLATFORM "64" CACHE STRING "Target platform type (32-bit or 64-bit)" FORCE)
+ set( SUFFIX_LIB_DEFAULT "64" )
else()
set(TARGET_PLATFORM "32" CACHE STRING "Target platform type (32-bit or 64-bit)" FORCE)
endif()
endif()
+set( SUFFIX_LIB ${SUFFIX_LIB_DEFAULT} CACHE STRING "String to append to 'lib' install path" )
+set( SUFFIX_BIN ${SUFFIX_BIN_DEFAULT} CACHE STRING "String to append to 'bin' install path" )
+
if( MSVC_IDE )
set_property( GLOBAL PROPERTY USE_FOLDERS TRUE )
endif( )
@@ -98,19 +121,14 @@ endif()
# TODO: maybe this could be written using the FindBLAS module in the future
if( BUILD_TEST )
if(NOT CORR_TEST_WITH_ACML)
- if( WIN32 )
+ if(APPLE)
+ find_library(BLAS_LIBRARIES Accelerate)
+ MARK_AS_ADVANCED(BLAS_LIBRARIES)
+ message(STATUS "Using Accelerate framework on Mac OS-X")
+ else()
find_package( Netlib COMPONENTS BLAS REQUIRED )
+ endif()
else( )
- if( $ENV{REFBLAS_ROOT} )
- set( REFBLAS_ROOT $ENV{REFBLAS_ROOT} CACHE PATH "NetLib BLAS root path")
- else( )
- message(FATAL_ERROR "Cannot find reference BLAS, please set REFBLAS_ROOT environment variable")
- endif( )
-
- # Find reference BLAS implementation
- include( ${REFBLAS_ROOT}/package/cmake/exportBLAS.cmake )
- endif( )
- else( )
# Find ACML BLAS implementation
# platform dependent ACML subdirectory
if (WIN32)
@@ -121,9 +139,10 @@ if( BUILD_TEST )
find_path(ACML_INCLUDE_DIRS acml.h
HINTS
- $ENV{ACMLROOT}/include
- ${ACMLROOT}/include
- ${ACMLROOT}/${ACML_SUBDIR}/include
+ ${ACML_ROOT}/include
+ ${ACML_ROOT}/${ACML_SUBDIR}/include
+ $ENV{ACML_ROOT}/include
+ $ENV{ACML_ROOT}/${ACML_SUBDIR}/include
)
if( ACML_INCLUDE_DIRS )
@@ -132,27 +151,30 @@ if( BUILD_TEST )
endif()
if( UNIX )
- find_library(ACML_LIBRARIES acml acml_mp
+ find_library(ACML_LIBRARIES acml_mp
HINTS
- $ENV{ACMLROOT}/lib
- ${ACMLROOT}/lib
- ${ACMLROOT}/${ACML_SUBDIR}/lib
+ ${ACML_ROOT}/lib
+ ${ACML_ROOT}/${ACML_SUBDIR}/lib
+ $ENV{ACML_ROOT}/lib
+ $ENV{ACML_ROOT}/${ACML_SUBDIR}/lib
)
find_library(_acml_mv_library acml_mv
HINTS
- $ENV{ACMLROOT}/lib
- ${ACMLROOT}/lib
- ${ACMLROOT}/${ACML_SUBDIR}/lib
+ ${ACML_ROOT}/lib
+ ${ACML_ROOT}/${ACML_SUBDIR}/lib
+ $ENV{ACML_ROOT}/lib
+ $ENV{ACML_ROOT}/${ACML_SUBDIR}/lib
)
mark_as_advanced(_acml_mv_library)
endif( )
if(WIN32)
- find_library(ACML_LIBRARIES libacml_dll libacml_mp_dll
+ find_library(ACML_LIBRARIES libacml_mp_dll
HINTS
- $ENV{ACMLROOT}/lib
- ${ACMLROOT}/lib
- ${ACMLROOT}/${ACML_SUBDIR}/lib
+ ${ACML_ROOT}/lib
+ ${ACML_ROOT}/${ACML_SUBDIR}/lib
+ $ENV{ACML_ROOT}/lib
+ $ENV{ACML_ROOT}/${ACML_SUBDIR}/lib
)
endif( )
@@ -213,16 +235,21 @@ if(CMAKE_COMPILER_IS_GNUCXX)
if(TARGET_PLATFORM EQUAL 32)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-builtin")
endif()
-endif()
+elseif( MSVC )
+ # CMake sets huge stack frames for windows, for whatever reason. We go with compiler default.
+ string( REGEX REPLACE "/STACK:[0-9]+" "" CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}" )
+ string( REGEX REPLACE "/STACK:[0-9]+" "" CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS}" )
+ string( REGEX REPLACE "/STACK:[0-9]+" "" CMAKE_MODULE_LINKER_FLAGS "${CMAKE_MODULE_LINKER_FLAGS}" )
+endif( )
if (WIN32)
add_definitions(-D_CRT_SECURE_NO_WARNINGS)
-endif()
+endif( )
#TODO: We should remove this pre-processor define for our 1.8 build; this means removing our deprecated image functions such as calls clCreateImage2D( )
add_definitions( -DCL_USE_DEPRECATED_OPENCL_1_1_APIS )
-configure_file( "${PROJECT_SOURCE_DIR}/version.h.in" "${PROJECT_BINARY_DIR}/include/version.h" )
+configure_file( "${PROJECT_SOURCE_DIR}/clBLAS.version.h.in" "${PROJECT_BINARY_DIR}/include/clBLAS.version.h" )
# configure a header file to pass the CMake version settings to the source, and package the header files in the output archive
install( FILES
@@ -230,7 +257,7 @@ install( FILES
"clAmdBlas.h"
"clAmdBlas.version.h"
"clBLAS-complex.h"
- "${PROJECT_BINARY_DIR}/include/version.h"
+ "${PROJECT_BINARY_DIR}/include/clBLAS.version.h"
DESTINATION
"./include" )
diff --git a/src/FindOpenCL.cmake b/src/FindOpenCL.cmake
index 1cdc43de..8725612f 100644
--- a/src/FindOpenCL.cmake
+++ b/src/FindOpenCL.cmake
@@ -46,21 +46,17 @@
# target_link_libraries(foo ${OPENCL_LIBRARIES})
#
#-----------------------
-if( DEFINED ENV{AMDAPPSDKROOT} )
- set( OPENCL_ROOT $ENV{AMDAPPSDKROOT} CACHE PATH "Environment variable defining the root of OPENCL implementation" )
-else( )
- set( OPENCL_ROOT "/usr/lib" CACHE PATH "Environment variable defining the root of OPENCL implementation" )
-endif( )
find_path(OPENCL_INCLUDE_DIRS
- NAMES OpenCL/cl.h CL/cl.h
+ NAMES OpenCL/cl.h CL/cl.h
HINTS
- ${OPENCL_ROOT}/include
- ENV AMDAPPSDKROOT/include
- PATHS
- /usr/include
- /usr/local/include
- DOC "OpenCL header file path"
+ ${OPENCL_ROOT}/include
+ $ENV{AMDAPPSDKROOT}/include
+ $ENV{CUDA_PATH}/include
+ PATHS
+ /usr/include
+ /usr/local/include
+ DOC "OpenCL header file path"
)
mark_as_advanced( OPENCL_INCLUDE_DIRS )
@@ -68,23 +64,29 @@ mark_as_advanced( OPENCL_INCLUDE_DIRS )
get_property( LIB64 GLOBAL PROPERTY FIND_LIBRARY_USE_LIB64_PATHS )
if( LIB64 )
- find_library( OPENCL_LIBRARIES
- NAMES OpenCL
- HINTS
+ find_library( OPENCL_LIBRARIES
+ NAMES OpenCL
+ HINTS
${OPENCL_ROOT}/lib
- ENV AMDAPPSDKROOT/lib
- DOC "OpenCL dynamic library path"
- PATH_SUFFIXES x86_64 x64
- )
+ $ENV{AMDAPPSDKROOT}/lib
+ $ENV{CUDA_PATH}/lib
+ DOC "OpenCL dynamic library path"
+ PATH_SUFFIXES x86_64 x64
+ PATHS
+ /usr/lib
+ )
else( )
- find_library( OPENCL_LIBRARIES
- NAMES OpenCL
- HINTS
+ find_library( OPENCL_LIBRARIES
+ NAMES OpenCL
+ HINTS
${OPENCL_ROOT}/lib
- ENV AMDAPPSDKROOT/lib
- DOC "OpenCL dynamic library path"
- PATH_SUFFIXES x86
- )
+ $ENV{AMDAPPSDKROOT}/lib
+ $ENV{CUDA_PATH}/lib
+ DOC "OpenCL dynamic library path"
+ PATH_SUFFIXES x86 Win32
+ PATHS
+ /usr/lib
+ )
endif( )
mark_as_advanced( OPENCL_LIBRARIES )
@@ -92,5 +94,5 @@ include( FindPackageHandleStandardArgs )
FIND_PACKAGE_HANDLE_STANDARD_ARGS( OPENCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS )
if( NOT OPENCL_FOUND )
- message( STATUS "FindOpenCL looked for libraries named: OpenCL" )
+ message( STATUS "FindOpenCL looked for libraries named: OpenCL" )
endif()
diff --git a/src/clAmdBlas.h b/src/clAmdBlas.h
index 1921473e..c994eccc 100644
--- a/src/clAmdBlas.h
+++ b/src/clAmdBlas.h
@@ -8528,14 +8528,11 @@ clAmdBlasCgemm(
size_t K,
FloatComplex alpha,
const cl_mem A,
- size_t offA,
size_t lda,
const cl_mem B,
- size_t offB,
size_t ldb,
FloatComplex beta,
cl_mem C,
- size_t offC,
size_t ldc,
cl_uint numCommandQueues,
cl_command_queue *commandQueues,
diff --git a/src/clBLAS.def b/src/clBLAS.def
index 5111ff2a..0a9f9b6b 100644
--- a/src/clBLAS.def
+++ b/src/clBLAS.def
@@ -1,6 +1,18 @@
-;/***********************************************************************
-;** Copyright (C) 2010 Advanced Micro Devices, Inc. All Rights Reserved.
-;***********************************************************************/
+;/* ************************************************************************
+; * Copyright 2013 Advanced Micro Devices, Inc.
+; *
+; * Licensed under the Apache License, Version 2.0 (the "License");
+; * you may not use this file except in compliance with the License.
+; * You may obtain a copy of the License at
+; *
+; * http://www.apache.org/licenses/LICENSE-2.0
+; *
+; * Unless required by applicable law or agreed to in writing, software
+; * distributed under the License is distributed on an "AS IS" BASIS,
+; * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; * See the License for the specific language governing permissions and
+; * limitations under the License.
+; * ************************************************************************/
LIBRARY clBLAS
diff --git a/src/clBLAS.h b/src/clBLAS.h
index 6d219c33..7d89b9f6 100644
--- a/src/clBLAS.h
+++ b/src/clBLAS.h
@@ -56,6 +56,10 @@ extern "C" {
* keeping interfaces familiar to users who know how to use BLAS. All
* functions accept matrices through buffer objects.
*
+ * This library is entirely thread-safe with the exception of the following API :
+ * clblasSetup and clblasTeardown.
+ * Developers using the library can safely using any blas routine from different thread.
+ *
* @section deprecated
* This library provided support for the creation of scratch images to achieve better performance
* on older AMD APP SDK's.
diff --git a/src/version.h.in b/src/clBLAS.version.h.in
similarity index 100%
rename from src/version.h.in
rename to src/clBLAS.version.h.in
diff --git a/src/client/CMakeLists.txt b/src/client/CMakeLists.txt
index 5154a313..2ebebf11 100644
--- a/src/client/CMakeLists.txt
+++ b/src/client/CMakeLists.txt
@@ -26,7 +26,11 @@ set(CLIENT_HEADER
clfunc_xtrmm.hpp
clfunc_xtrsm.hpp
clfunc_xsyrk.hpp
- clfunc_xsyr2k.hpp)
+ clfunc_xsyr2k.hpp
+ clfunc_xhemm.hpp
+ clfunc_xsymm.hpp
+ clfunc_xherk.hpp
+ clfunc_xher2k.hpp)
set(WRAPPER_SRC testPerfWrapper.cpp)
@@ -48,21 +52,15 @@ include_directories(
add_executable(client ${CLIENT_SRC} ${CLIENT_HEADER})
target_link_libraries(client ${Boost_LIBRARIES} ${OPENCL_LIBRARIES} clBLAS)
+set_target_properties( client PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
add_executable(testPerfWrapper ${WRAPPER_SRC})
target_link_libraries(testPerfWrapper ${Boost_LIBRARIES})
-
-if( TARGET_PLATFORM EQUAL 64 )
- set( BIN_DIR bin64 )
- set( LIB_DIR lib64 )
-else()
- set( BIN_DIR bin32 )
- set( LIB_DIR lib32 )
-endif()
+set_target_properties( testPerfWrapper PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
# CPack configuration; include the executable into the package
install( TARGETS client testPerfWrapper
- RUNTIME DESTINATION ${BIN_DIR}
- LIBRARY DESTINATION ${LIB_DIR}
- ARCHIVE DESTINATION ${LIB_DIR}/import
+ RUNTIME DESTINATION bin${SUFFIX_BIN}
+ LIBRARY DESTINATION lib${SUFFIX_LIB}
+ ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
)
diff --git a/src/client/clfunc_common.hpp b/src/client/clfunc_common.hpp
index 4876daf5..bda11866 100644
--- a/src/client/clfunc_common.hpp
+++ b/src/client/clfunc_common.hpp
@@ -28,6 +28,11 @@
#include "dis_warning.h"
#include "clBLAS.h"
+#if defined(__APPLE__) || defined(__MACOSX)
+#include
+#else
+#include
+#endif
template
static T
@@ -243,6 +248,7 @@ class clblasFunc
OPENCL_V_THROW(err, "creating context");
queue_ = clCreateCommandQueue(ctx_, device_, 0, &err);
+
timer_id = timer.getUniqueID( "clfunc", 0 );
@@ -307,13 +313,18 @@ class clblasFunc
virtual void reset_gpu_write_buffer() = 0;
virtual void read_gpu_buffer() = 0;
virtual void roundtrip_func() = 0;
+ virtual void roundtrip_func_rect() {}
+ virtual void allochostptr_roundtrip_func() {}
+ virtual void usehostptr_roundtrip_func() {}
+ virtual void copyhostptr_roundtrip_func() {}
+ virtual void usepersismem_roundtrip_func() {}
virtual void roundtrip_setup_buffer(int order_option, int side_option,
int uplo_option, int diag_option, int
transA_option, int transB_option,
size_t M, size_t N, size_t K, size_t lda,
size_t ldb, size_t ldc, size_t offA, size_t offBX,
size_t offCY, double alpha, double beta) = 0;
-
+ virtual void releaseGPUBuffer_deleteCPUBuffer()=0;
StatisticalTimer& timer;
StatisticalTimer::sTimerID timer_id;
diff --git a/src/client/clfunc_xgemm.hpp b/src/client/clfunc_xgemm.hpp
index 17223a62..fcd40a79 100644
--- a/src/client/clfunc_xgemm.hpp
+++ b/src/client/clfunc_xgemm.hpp
@@ -62,20 +62,13 @@ class xGemm : public clblasFunc
~xGemm()
{
- delete buffer_.a_;
- delete buffer_.b_;
- delete buffer_.c_;
- OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
- "releasing buffer A");
- OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_b_),
- "releasing buffer B");
- OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
- "releasing buffer C");
}
void call_func()
{
- std::cout << "xGemm::call_func\n";
+ timer.Start(timer_id);
+ xGemm_Function(true);
+ timer.Stop(timer_id);
}
double gflops()
@@ -420,7 +413,307 @@ class xGemm : public clblasFunc
void roundtrip_func()
{
- std::cout << "xGemm::roundtrip_func\n";
+ timer.Start(timer_id);
+ cl_int err;
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_*buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ NULL, &err);
+
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ NULL, &err);
+
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(T),
+ NULL, &err);
+ err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+ buffer_.offA_ * sizeof(T),
+ buffer_.lda_ * buffer_.a_num_vectors_ *
+ sizeof(T),
+ buffer_.a_, 0, NULL, NULL);
+
+ err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+ buffer_.offB_ * sizeof(T),
+ buffer_.ldb_ * buffer_.b_num_vectors_ *
+ sizeof(T),
+ buffer_.b_, 0, NULL, NULL);
+
+ err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+ buffer_.offC_ * sizeof(T),
+ buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(T),
+ buffer_.c_, 0, NULL, NULL);
+ xGemm_Function(false);
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+ buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(T),
+ buffer_.c_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+ }
+ void roundtrip_func_rect()
+ {
+ timer.Start(timer_id);
+ cl_int err;
+ //rect
+ size_t a_buffer_origin[3] = {0,0,0};
+ size_t a_host_origin[3] = {0,0,0};
+ size_t a_region[3] = {buffer_.m_*sizeof(T),buffer_.k_,1};
+ size_t a_buffer_row_pitch=0*sizeof(T);//lda
+ size_t a_buffer_slice_pitch=0;
+ size_t a_host_row_pitch=buffer_.lda_*sizeof(T);
+ size_t a_host_slice_pitch=0;
+
+ size_t b_buffer_origin[3] = {0,0,0};
+ size_t b_host_origin[3] = {0,0,0};
+ size_t b_region[3] = {buffer_.k_*sizeof(T),buffer_.n_,1};
+ size_t b_buffer_row_pitch=0*sizeof(T);//ldb
+ size_t b_buffer_slice_pitch=0;
+ size_t b_host_row_pitch=buffer_.ldb_*sizeof(T);
+ size_t b_host_slice_pitch=0;
+
+ size_t c_buffer_origin[3] = {0,0,0};
+ size_t c_host_origin[3] = {0,0,0};
+ size_t c_region[3] = {buffer_.m_*sizeof(T),buffer_.n_,1};
+ size_t c_buffer_row_pitch=0*sizeof(T);//ldc
+ size_t c_buffer_slice_pitch=0;
+ size_t c_host_row_pitch=buffer_.ldc_*sizeof(T);
+ size_t c_host_slice_pitch=0;
+
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.k_*buffer_.m_ +
+ buffer_.offA_) * sizeof(T),
+ NULL, &err);
+
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.k_ * buffer_.n_ +
+ buffer_.offB_) * sizeof(T),
+ NULL, &err);
+
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.m_ * buffer_.n_ +
+ buffer_.offC_) * sizeof(T),
+ NULL, &err);
+ /*
+ err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+ buffer_.offA_ * sizeof(T),
+ buffer_.lda_ * buffer_.a_num_vectors_ *
+ sizeof(T),
+ buffer_.a_, 0, NULL, NULL);
+
+ err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+ buffer_.offB_ * sizeof(T),
+ buffer_.ldb_ * buffer_.b_num_vectors_ *
+ sizeof(T),
+ buffer_.b_, 0, NULL, NULL);
+
+ err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+ buffer_.offC_ * sizeof(T),
+ buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(T),
+ buffer_.c_, 0, NULL, NULL);*/
+ err = clEnqueueWriteBufferRect(queue_, buffer_.buf_a_, CL_TRUE, a_buffer_origin, a_host_origin, a_region, a_buffer_row_pitch,
+ a_buffer_slice_pitch, a_host_row_pitch, a_host_slice_pitch, buffer_.a_, 0, NULL, NULL);
+ err = clEnqueueWriteBufferRect(queue_, buffer_.buf_b_, CL_TRUE, b_buffer_origin, b_host_origin, b_region, b_buffer_row_pitch,
+ b_buffer_slice_pitch, b_host_row_pitch, b_host_slice_pitch, buffer_.b_, 0, NULL, NULL);
+ err = clEnqueueWriteBufferRect(queue_, buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch,
+ c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, NULL);
+
+ if(buffer_.trans_a_==clblasNoTrans)
+ {
+ buffer_.lda_=buffer_.m_;
+ }
+ else
+ {
+ buffer_.lda_=buffer_.k_;
+ }
+ if(buffer_.trans_b_==clblasNoTrans)
+ {
+ buffer_.ldb_=buffer_.k_;
+ }
+ else
+ {
+ buffer_.ldb_=buffer_.n_;
+ }
+ buffer_.ldc_=buffer_.m_;
+ xGemm_Function(false);
+ /*
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+ buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(T),
+ buffer_.c_, 0, NULL, &event_);
+ */
+ err = ::clEnqueueReadBufferRect(queue_, buffer_.buf_c_, CL_TRUE, c_buffer_origin, c_host_origin, c_region, c_buffer_row_pitch,
+ c_buffer_slice_pitch, c_host_row_pitch, c_host_slice_pitch, buffer_.c_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+ }
+ void allochostptr_roundtrip_func()
+ {
+ timer.Start(timer_id);
+
+ cl_int err;
+ // Create buffers with CL_MEM_ALLOC_HOST_PTR for zero copy
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+ (buffer_.lda_*buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ NULL, &err);
+
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ NULL, &err);
+
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(T),
+ NULL, &err);
+
+ // map the buffers to pointers at host device
+ T *map_a,*map_b,*map_c;
+ map_a = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.lda_*buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.ldb_*buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ map_c = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.lda_*buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ // memcpy the input A, B, C to the host pointers
+ memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) );
+ memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
+ memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
+ // unmap the buffers
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_a_, map_a, 0, NULL, NULL);
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL);
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_c_, map_c, 0, NULL, NULL);
+ // calling clBLAS
+ xGemm_Function(false);
+ // map the C buffer again to read output
+ map_c = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0,
+ (buffer_.lda_*buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_c_, map_c, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+
+ timer.Stop(timer_id);
+ }
+ void usehostptr_roundtrip_func()
+ {
+ timer.Start(timer_id);
+ cl_int err;
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+ (buffer_.lda_*buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ buffer_.a_, &err);
+
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ buffer_.b_, &err);
+
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(T),
+ buffer_.c_, &err);
+ xGemm_Function(true);
+ timer.Stop(timer_id);
+ }
+ void copyhostptr_roundtrip_func()
+ {
+ timer.Start(timer_id);
+ cl_int err;
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+ (buffer_.lda_*buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ buffer_.a_, &err);
+
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ buffer_.b_, &err);
+
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(T),
+ buffer_.c_, &err);
+ xGemm_Function(false);
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+ buffer_.offC_ * sizeof(T), buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(T),
+ buffer_.c_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+ }
+ void usepersismem_roundtrip_func()
+ {
+#if defined(CL_MEM_USE_PERSISTENT_MEM_AMD)
+ timer.Start(timer_id);
+
+ cl_int err;
+
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD,
+ (buffer_.lda_*buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ NULL, &err);
+
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ NULL, &err);
+
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_USE_PERSISTENT_MEM_AMD,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(T),
+ NULL, &err);
+
+ // map the buffers to pointers at host devices
+ T *map_a,*map_b,*map_c;
+ map_a = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.lda_*buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.ldb_*buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ map_c = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_c_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.lda_*buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ // memcpy the input A, B, C to the host pointers
+ memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) );
+ memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
+ memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
+ // unmap the buffers
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_a_, map_a, 0, NULL, NULL);
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL);
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_c_, map_c, 0, NULL, NULL);
+ // calling clBLAS
+ xGemm_Function(false);
+ // map the C buffer again to read output
+ map_c = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_c_, CL_TRUE, CL_MAP_READ, 0,
+ (buffer_.lda_*buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ memcpy( map_c, buffer_.c_, ( buffer_.ldc_*buffer_.c_num_vectors_ + buffer_.offC_) * sizeof( T ) );
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_c_, map_c, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+
+ timer.Stop(timer_id);
+#else
+ std::cout<<"CL_MEM_USE_PERSISTENT_MEM_AMD is only supported on AMD hardware"< buffer_;
-
-}; // class xgemm
+ void xGemm_Function(bool flush);
+}; // class xgemm
template<>
-void
+void
xGemm::
-call_func()
+xGemm_Function(bool flush)
{
- timer.Start(timer_id);
-
clblasSgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
-
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
-}
-
-template<>
-void
-xGemm::
-roundtrip_func()
-{
- timer.Start(timer_id);
- cl_int err;
- buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.lda_*buffer_.a_num_vectors_ +
- buffer_.offA_) * sizeof(cl_float),
- NULL, &err);
-
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.ldb_ * buffer_.b_num_vectors_ +
- buffer_.offB_) * sizeof(cl_float),
- NULL, &err);
-
- buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
- (buffer_.ldc_ * buffer_.c_num_vectors_ +
- buffer_.offC_) * sizeof(cl_float),
- NULL, &err);
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
- buffer_.offA_ * sizeof(cl_float),
- buffer_.lda_ * buffer_.a_num_vectors_ *
- sizeof(cl_float),
- buffer_.a_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_float),
- buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_float),
- buffer_.b_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
- buffer_.offC_ * sizeof(cl_float),
- buffer_.ldc_ * buffer_.c_num_vectors_ *
- sizeof(cl_float),
- buffer_.c_, 0, NULL, NULL);
- clblasSgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
- buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
- buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
- buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
- buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
- buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
- err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
- buffer_.offC_ * sizeof(cl_float), buffer_.ldc_ * buffer_.c_num_vectors_ *
- sizeof(cl_float),
- buffer_.c_, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
+ //flush==true if only the kernel time (library call) is timed
+ //flush==false if memory time is also timed
+ if (flush==true)
+ {
+ clWaitForEvents(1, &event_);
+ }
}
-
template<>
-void
+void
xGemm::
-call_func()
+xGemm_Function(bool flush)
{
- timer.Start(timer_id);
-
clblasDgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
-
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
-}
-
-template<>
-void
-xGemm::
-roundtrip_func()
+ //flush==true if only the kernel time (library call) is timed
+ //flush==false if memory time is also timed
+ if (flush==true)
{
- timer.Start(timer_id);
- cl_int err;
- //set up buffer
- buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.lda_*buffer_.a_num_vectors_ +
- buffer_.offA_) * sizeof(cl_double),
- NULL, &err);
-
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.ldb_ * buffer_.b_num_vectors_ +
- buffer_.offB_) * sizeof(cl_double),
- NULL, &err);
-
- buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
- (buffer_.ldc_ * buffer_.c_num_vectors_ +
- buffer_.offC_) * sizeof(cl_double),
- NULL, &err);
- //initialize gpu buffer
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
- buffer_.offA_ * sizeof(cl_double),
- buffer_.lda_ * buffer_.a_num_vectors_ *
- sizeof(cl_double),
- buffer_.a_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_double),
- buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_double),
- buffer_.b_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
- buffer_.offC_ * sizeof(cl_double),
- buffer_.ldc_ * buffer_.c_num_vectors_ *
- sizeof(cl_double),
- buffer_.c_, 0, NULL, NULL);
- //call_func
- clblasDgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
- buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
- buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
- buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
- buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
- buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
- //read gpu buffer
- err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
- buffer_.offC_ * sizeof(cl_double), buffer_.ldc_ * buffer_.c_num_vectors_ *
- sizeof(cl_double),
- buffer_.c_, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
+ clWaitForEvents(1, &event_);
}
+}
template<>
-void
+void
xGemm::
-call_func()
+xGemm_Function(bool flush)
{
- timer.Start(timer_id);
-
- clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+ clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
-
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
-}
-
-template<>
-void
-xGemm::
-roundtrip_func()
+ //flush==true if only the kernel time (library call) is timed
+ //flush==false if memory time is also timed
+ if (flush==true)
{
- timer.Start(timer_id);
- cl_int err;
- buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.lda_*buffer_.a_num_vectors_ +
- buffer_.offA_) * sizeof(cl_float2),
- NULL, &err);
-
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.ldb_ * buffer_.b_num_vectors_ +
- buffer_.offB_) * sizeof(cl_float2),
- NULL, &err);
-
- buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
- (buffer_.ldc_ * buffer_.c_num_vectors_ +
- buffer_.offC_) * sizeof(cl_float2),
- NULL, &err);
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
- buffer_.offA_ * sizeof(cl_float2),
- buffer_.lda_ * buffer_.a_num_vectors_ *
- sizeof(cl_float2),
- buffer_.a_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_float2),
- buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_float2),
- buffer_.b_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
- buffer_.offC_ * sizeof(cl_float2),
- buffer_.ldc_ * buffer_.c_num_vectors_ *
- sizeof(cl_float2),
- buffer_.c_, 0, NULL, NULL);
- clblasCgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
- buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
- buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
- buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
- buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
- buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
- err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
- buffer_.offC_ * sizeof(cl_float2), buffer_.ldc_ * buffer_.c_num_vectors_ *
- sizeof(cl_float2),
- buffer_.c_, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
+ clWaitForEvents(1, &event_);
}
+}
template<>
-void
+void
xGemm::
-call_func()
+xGemm_Function(bool flush)
{
- timer.Start(timer_id);
-
- clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
+ clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
-
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
-}
-
-template<>
-void
-xGemm::
-roundtrip_func()
+ //flush==true if only the kernel time (library call) is timed
+ //flush==false if memory time is also timed
+ if (flush==true)
{
- timer.Start(timer_id);
- cl_int err;
- buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.lda_*buffer_.a_num_vectors_ +
- buffer_.offA_) * sizeof(cl_double2),
- NULL, &err);
-
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.ldb_ * buffer_.b_num_vectors_ +
- buffer_.offB_) * sizeof(cl_double2),
- NULL, &err);
-
- buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
- (buffer_.ldc_ * buffer_.c_num_vectors_ +
- buffer_.offC_) * sizeof(cl_double2),
- NULL, &err);
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
- buffer_.offA_ * sizeof(cl_double2),
- buffer_.lda_ * buffer_.a_num_vectors_ *
- sizeof(cl_double2),
- buffer_.a_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_double2),
- buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_double2),
- buffer_.b_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_c_, CL_TRUE,
- buffer_.offC_ * sizeof(cl_double2),
- buffer_.ldc_ * buffer_.c_num_vectors_ *
- sizeof(cl_double2),
- buffer_.c_, 0, NULL, NULL);
- clblasZgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
- buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
- buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
- buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
- buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
- buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
- err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
- buffer_.offC_ * sizeof(cl_double2), buffer_.ldc_ * buffer_.c_num_vectors_ *
- sizeof(cl_double2),
- buffer_.c_, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
+ clWaitForEvents(1, &event_);
}
-
+}
template<>
double
diff --git a/src/client/clfunc_xgemv.hpp b/src/client/clfunc_xgemv.hpp
index 2d1d5b06..cc851094 100644
--- a/src/client/clfunc_xgemv.hpp
+++ b/src/client/clfunc_xgemv.hpp
@@ -286,6 +286,12 @@ class xGemv : public clblasFunc
size_t ldc, size_t offA, size_t offBX, size_t offCY,
double alpha, double beta)
{}
+ void releaseGPUBuffer_deleteCPUBuffer()
+ {
+ //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+ //need to do this before we eventually hit the destructor
+ //to-do
+ }
protected:
void initialize_scalars(double alpha, double beta)
diff --git a/src/client/clfunc_xger.hpp b/src/client/clfunc_xger.hpp
index 05899cd7..d2f36dbc 100644
--- a/src/client/clfunc_xger.hpp
+++ b/src/client/clfunc_xger.hpp
@@ -217,6 +217,12 @@ class xGer : public clblasFunc
size_t ldc, size_t offA, size_t offBX, size_t offCY,
double alpha, double beta)
{}
+ void releaseGPUBuffer_deleteCPUBuffer()
+ {
+ //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+ //need to do this before we eventually hit the destructor
+ //to-do
+ }
protected:
void initialize_scalars(double alpha, double beta)
diff --git a/src/client/clfunc_xgerc.hpp b/src/client/clfunc_xgerc.hpp
index 829d9380..ed39f797 100644
--- a/src/client/clfunc_xgerc.hpp
+++ b/src/client/clfunc_xgerc.hpp
@@ -98,7 +98,12 @@ class xGerc : public clblasFunc
{}
void call_func();
-
+ void releaseGPUBuffer_deleteCPUBuffer()
+ {
+ //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+ //need to do this before we eventually hit the destructor
+ //to do
+ }
protected:
void initialize_scalars(double alpha, double beta)
{
diff --git a/src/client/clfunc_xgeru.hpp b/src/client/clfunc_xgeru.hpp
index 8c7d02c9..dbcecc9e 100644
--- a/src/client/clfunc_xgeru.hpp
+++ b/src/client/clfunc_xgeru.hpp
@@ -94,7 +94,12 @@ class xGeru : public clblasFunc
size_t ldc, size_t offA, size_t offBX, size_t offCY,
double alpha, double beta)
{}
-
+ void releaseGPUBuffer_deleteCPUBuffer()
+ {
+ //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+ //need to do this before we eventually hit the destructor
+ //to-do
+ }
protected:
protected:
void initialize_scalars(double alpha, double beta)
diff --git a/src/client/clfunc_xhemm.hpp b/src/client/clfunc_xhemm.hpp
index 8e46d1e3..9f4047e2 100644
--- a/src/client/clfunc_xhemm.hpp
+++ b/src/client/clfunc_xhemm.hpp
@@ -45,7 +45,7 @@
template
struct xHemmBuffer
{
- clblasOrder order;
+ clblasOrder order;
clblasSide side;
clblasUplo uplo;
size_t M;
@@ -78,22 +78,30 @@ class xHemm : public clblasFunc
~xHemm()
{
- delete buffer.cpuA;
- delete buffer.cpuB;
- delete buffer.cpuC;
- OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
- OPENCL_V_THROW( clReleaseMemObject(buffer.B), "releasing buffer B");
- OPENCL_V_THROW( clReleaseMemObject(buffer.C), "releasing buffer C");
}
double gflops()
{
- return (buffer.N*(buffer.N+1))/time_in_ns();
+ if (buffer.side == clblasLeft)
+ {
+ return (8*buffer.M*buffer.M*buffer.N)/time_in_ns();
+ }
+ else
+ {
+ return (8*buffer.N*buffer.N*buffer.M)/time_in_ns();
+ }
}
std::string gflops_formula()
{
- return "M*(M+1)/time";
+ if (buffer.side == clblasLeft)
+ {
+ return "8*M*M*N/time";
+ }
+ else
+ {
+ return "8*N*N*M/time";
+ }
}
void setup_buffer(int order_option, int side_option, int
@@ -106,20 +114,137 @@ class xHemm : public clblasFunc
void initialize_gpu_buffer();
void reset_gpu_write_buffer();
void call_func();
- void read_gpu_buffer()
+ void read_gpu_buffer()
{
- //cl_int err;
- //to-do need to fill up
+ cl_int err;
+ err = clEnqueueReadBuffer(queue_, buffer.C, CL_TRUE,
+ buffer.offc * sizeof(T),
+ buffer.ldc*buffer.N*sizeof(T),
+ buffer.cpuC,0,NULL,NULL);
}
- void roundtrip_func()
- {//to-do need to fill up
+ void roundtrip_func()
+ {
+ std::cout << "xHemm::roundtrip_func" <::setup_buffer(int order_option, int side_option, int
buffer.a_num_vectors * buffer.lda*sizeof(T),
NULL, &err);
- buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
buffer.N*buffer.ldb*sizeof(T),
NULL, &err);
buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
@@ -350,10 +475,12 @@ void xHemm::initialize_gpu_buffer()
buffer.a_num_vectors * buffer.lda*sizeof(T),
buffer.cpuA, 0, NULL, NULL);
- err = clEnqueueWriteBuffer(queue_, buffer.B, CL_TRUE, 0,
+ err = clEnqueueWriteBuffer(queue_, buffer.B, CL_TRUE,
+ buffer.offb * sizeof(T),
buffer.ldb*buffer.N*sizeof(T),
buffer.cpuB, 0, NULL, NULL);
- err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE, 0,
+ err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE,
+ buffer.offc * sizeof(T),
buffer.ldc*buffer.N*sizeof(T),
buffer.cpuC, 0, NULL, NULL);
}
@@ -379,6 +506,50 @@ void xHemm::call_func()
timer.Stop(timer_id);
}
+template <>
+void xHemm::roundtrip_func()
+{
+ timer.Start(timer_id);
+ cl_int err;
+ //create buffer
+ buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer.a_num_vectors * buffer.lda*sizeof(cl_float2),
+ NULL, &err);
+
+ buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer.N*buffer.ldb*sizeof(cl_float2),
+ NULL, &err);
+ buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ buffer.N*buffer.ldc*sizeof(cl_float2),
+ NULL, &err);
+ //write gpu buffer
+ err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+ buffer.offa * sizeof(cl_float2),
+ buffer.a_num_vectors * buffer.lda*sizeof(cl_float2),
+ buffer.cpuA, 0, NULL, NULL);
+
+ err = clEnqueueWriteBuffer(queue_, buffer.B, CL_TRUE,
+ buffer.offb * sizeof(cl_float2),
+ buffer.ldb*buffer.N*sizeof(cl_float2),
+ buffer.cpuB, 0, NULL, NULL);
+ err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE,
+ buffer.offc * sizeof(cl_float2),
+ buffer.ldc*buffer.N*sizeof(cl_float2),
+ buffer.cpuC, 0, NULL, NULL);
+
+ clblasChemm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
+ buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
+ buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, 1, &queue_,
+ 0, NULL,NULL);
+ //read gpu buffer
+ err = clEnqueueReadBuffer(queue_, buffer.C, CL_TRUE,
+ buffer.offc * sizeof(cl_float2),
+ buffer.ldc*buffer.N*sizeof(cl_float2),
+ buffer.cpuC, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+
+}
template <>
void xHemm::call_func()
{
@@ -390,5 +561,48 @@ void xHemm::call_func()
clWaitForEvents(1, &event_);
timer.Stop(timer_id);
}
+template <>
+void xHemm::roundtrip_func()
+{
+ timer.Start(timer_id);
+ cl_int err;
+ //create buffer
+ buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer.a_num_vectors * buffer.lda*sizeof(cl_double2),
+ NULL, &err);
+ buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer.N*buffer.ldb*sizeof(cl_double2),
+ NULL, &err);
+ buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ buffer.N*buffer.ldc*sizeof(cl_double2),
+ NULL, &err);
+ //write gpu buffer
+ err = clEnqueueWriteBuffer(queue_, buffer.A, CL_TRUE,
+ buffer.offa * sizeof(cl_double2),
+ buffer.a_num_vectors * buffer.lda*sizeof(cl_double2),
+ buffer.cpuA, 0, NULL, NULL);
+
+ err = clEnqueueWriteBuffer(queue_, buffer.B, CL_TRUE,
+ buffer.offb * sizeof(cl_double2),
+ buffer.ldb*buffer.N*sizeof(cl_double2),
+ buffer.cpuB, 0, NULL, NULL);
+ err = clEnqueueWriteBuffer(queue_, buffer.C, CL_TRUE,
+ buffer.offc * sizeof(cl_double2),
+ buffer.ldc*buffer.N*sizeof(cl_double2),
+ buffer.cpuC, 0, NULL, NULL);
+
+ clblasZhemm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
+ buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
+ buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, 1, &queue_,
+ 0, NULL,NULL);
+ //read gpu buffer
+ err = clEnqueueReadBuffer(queue_, buffer.C, CL_TRUE,
+ buffer.offc * sizeof(cl_double2),
+ buffer.ldc*buffer.N*sizeof(cl_double2),
+ buffer.cpuC, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+
+}
#endif // ifndef CLBLAS_BENCHMARK_XSYR_HXX__
\ No newline at end of file
diff --git a/src/client/clfunc_xhemv.hpp b/src/client/clfunc_xhemv.hpp
index 570c3fce..6211114c 100644
--- a/src/client/clfunc_xhemv.hpp
+++ b/src/client/clfunc_xhemv.hpp
@@ -95,7 +95,12 @@ class xHemv : public clblasFunc
size_t ldc, size_t offA, size_t offBX, size_t offCY,
double alpha, double beta)
{}
-
+ void releaseGPUBuffer_deleteCPUBuffer()
+ {
+ //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+ //need to do this before we eventually hit the destructor
+ //to do
+ }
protected:
protected:
void initialize_scalars(double alpha, double beta)
diff --git a/src/client/clfunc_xher.hpp b/src/client/clfunc_xher.hpp
index e624b558..5144b22b 100644
--- a/src/client/clfunc_xher.hpp
+++ b/src/client/clfunc_xher.hpp
@@ -90,7 +90,12 @@ class xHer : public clblasFunc
size_t ldc, size_t offA, size_t offBX, size_t offCY,
double alpha, double beta)
{}
-
+ void releaseGPUBuffer_deleteCPUBuffer()
+ {
+ //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+ //need to do this before we eventually hit the destructor
+ //to do
+ }
protected:
protected:
void initialize_scalars(double alpha, double beta)
diff --git a/src/client/clfunc_xher2.hpp b/src/client/clfunc_xher2.hpp
index 27d95f34..aec7cc83 100644
--- a/src/client/clfunc_xher2.hpp
+++ b/src/client/clfunc_xher2.hpp
@@ -94,6 +94,12 @@ class xHer2 : public clblasFunc
size_t ldc, size_t offA, size_t offBX, size_t offCY,
double alpha, double beta)
{}
+ void releaseGPUBuffer_deleteCPUBuffer()
+ {
+ //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+ //need to do this before we eventually hit the destructor
+ //to do
+ }
protected:
protected:
void initialize_scalars(double alpha, double beta)
diff --git a/src/client/clfunc_xher2k.hpp b/src/client/clfunc_xher2k.hpp
new file mode 100644
index 00000000..15095fa8
--- /dev/null
+++ b/src/client/clfunc_xher2k.hpp
@@ -0,0 +1,676 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XHER2K_HXX__
+#define CLBLAS_BENCHMARK_XHER2K_HXX__
+
+#include "clfunc_common.hpp"
+
+template
+struct xHer2kBuffer
+{
+ clblasOrder order_;
+ clblasUplo uplo_;
+ clblasTranspose transA_;
+ size_t N_;
+ size_t K_;
+ T alpha_;
+ cl_mem A_;
+ size_t offa_;
+ size_t lda_;
+ cl_mem B_;
+ size_t offb_;
+ size_t ldb_;
+ T beta_;
+ cl_mem C_;
+ size_t offc_;
+ size_t ldc_;
+ size_t a_num_vectors_;
+ size_t b_num_vectors_;
+ size_t c_num_vectors_;
+ T* cpuA_;
+ T* cpuB_;
+ T* cpuC_;
+}; // struct buffer
+
+template
+class xHer2k : public clblasFunc
+{
+public:
+ xHer2k(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer, devType)
+ {
+ timer.getUniqueID("clHer2k", 0);
+ }
+
+ ~xHer2k()
+ {
+ }
+
+ double gflops()
+ {
+ return static_cast(8*(buffer_.K_ * buffer_.N_ * buffer_.N_)/time_in_ns()+2*buffer_.N_/time_in_ns());
+ }
+
+ std::string gflops_formula()
+ {
+ return "(8*K*N*N+2*N)/time";
+ }
+
+ void setup_buffer(int order_option, int side_option, int
+ uplo_option, int diag_option, int transA_option, int
+ transB_option, size_t M, size_t N, size_t K,
+ size_t lda, size_t ldb, size_t ldc,size_t offA,
+ size_t offB, size_t offC, double alpha,
+ double beta)
+ {
+ DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+
+ initialize_scalars(alpha,beta);
+
+ buffer_.N_ = N;
+ buffer_.K_ = K;
+ buffer_.offa_ = offA;
+ buffer_.offb_ = offB;
+ buffer_.offc_ = offC;
+
+ if (uplo_option == 0)
+ {
+ buffer_.uplo_ = clblasUpper;
+ }
+ else
+ {
+ buffer_.uplo_ = clblasLower;
+ }
+
+ if (ldc == 0)
+ {
+ buffer_.ldc_ = N;
+ }
+ else if (ldc < N)
+ {
+ std::cerr << "ldc:wrong size\n";
+ }
+ else
+ {
+ buffer_.ldc_ = ldc;
+ }
+
+ buffer_.c_num_vectors_ = N;
+
+ if (order_option == 0)
+ {
+ order_ = clblasRowMajor;
+ if (transA_option == 0)
+ {
+ buffer_.transA_ = clblasNoTrans;
+ buffer_.a_num_vectors_ = N;
+ buffer_.b_num_vectors_ = N;
+ if (lda == 0)
+ {
+ buffer_.lda_ = K;
+ }
+ else if (lda < K)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+
+ if (ldb == 0)
+ {
+ buffer_.ldb_ = K;
+ }
+ else if (ldb < K)
+ {
+ std::cerr << "ldb:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.ldb_ = ldb;
+ }
+ }
+ else
+ {
+ buffer_.a_num_vectors_ = K;
+ buffer_.b_num_vectors_ = K;
+ if (transA_option == 1)
+ {
+ buffer_.transA_ = clblasTrans;
+ }
+ else if (transA_option == 2)
+ {
+ buffer_.transA_ = clblasConjTrans;
+ }
+ if (lda == 0)
+ {
+ buffer_.lda_ = N;
+ }
+ else if (lda < N)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+
+ if (ldb == 0)
+ {
+ buffer_.ldb_ = N;
+ }
+ else if (ldb < N)
+ {
+ std::cerr << "ldb:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.ldb_ = ldb;
+ }
+ }
+ }
+ else
+ {
+ order_ = clblasColumnMajor;
+ if (transA_option == 0)
+ {
+ buffer_.a_num_vectors_ = K;
+ buffer_.b_num_vectors_ = K;
+ buffer_.transA_ = clblasNoTrans;
+ if (lda == 0)
+ {
+ buffer_.lda_ = N;
+ }
+ else if (lda < N)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+
+ if (ldb == 0)
+ {
+ buffer_.ldb_ = N;
+ }
+ else if (ldb < N)
+ {
+ std::cerr << "ldb:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.ldb_ = ldb;
+ }
+ }
+ else
+ {
+ buffer_.a_num_vectors_ = N;
+ buffer_.b_num_vectors_ = N;
+ if (transA_option == 1)
+ {
+ buffer_.transA_ = clblasTrans;
+ }
+ else if (transA_option == 2)
+ {
+ buffer_.transA_ = clblasConjTrans;
+ }
+
+ if (lda == 0)
+ {
+ buffer_.lda_ = K;
+ }
+ else if (lda < K)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+
+ if (ldb == 0)
+ {
+ buffer_.ldb_ = K;
+ }
+ else if (ldb < K)
+ {
+ std::cerr << "ldb:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.ldb_ = ldb;
+ }
+ }
+ }
+
+ buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+ buffer_.cpuB_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
+ buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+
+ cl_int err;
+ buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offa_) * sizeof(T),
+ NULL, &err);
+
+ buffer_.B_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offb_) * sizeof(T),
+ NULL, &err);
+
+ buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offc_) * sizeof(T),
+ NULL, &err);
+ }
+ void initialize_cpu_buffer()
+ {
+ srand(10);
+ for (size_t i = 0; i < buffer_.a_num_vectors_; ++i)
+ {
+ for (size_t j = 0; j < buffer_.lda_; ++j)
+ {
+ buffer_.cpuA_[i*buffer_.lda_+j] = random(UPPER_BOUND()) /
+ randomScale();
+ }
+ }
+ for (size_t i = 0; i < buffer_.N_; ++i)
+ {
+ for (size_t j = 0; j < buffer_.ldc_; ++j)
+ {
+ buffer_.cpuC_[i*buffer_.ldc_+j] = random(UPPER_BOUND()) /
+ randomScale();
+ }
+ }
+ }
+ void initialize_gpu_buffer()
+ {
+ cl_int err;
+
+ err = clEnqueueWriteBuffer(queue_, buffer_.A_, CL_TRUE,
+ buffer_.offa_ * sizeof(T),
+ buffer_.lda_ * buffer_.a_num_vectors_ *
+ sizeof(T),
+ buffer_.cpuA_, 0, NULL, NULL);
+
+ err = clEnqueueWriteBuffer(queue_, buffer_.C_, CL_TRUE,
+ buffer_.offa_ * sizeof(T),
+ buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(T),
+ buffer_.cpuC_, 0, NULL, NULL);
+ }
+ void reset_gpu_write_buffer()
+ {
+ cl_int err;
+
+ err = clEnqueueWriteBuffer(queue_, buffer_.C_, CL_TRUE,
+ buffer_.offc_ * sizeof(T),
+ buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(T),
+ buffer_.cpuC_, 0, NULL, NULL);
+ }
+ void call_func();
+ void read_gpu_buffer()
+ {
+ cl_int err;
+ err = clEnqueueReadBuffer(queue_, buffer_.C_, CL_TRUE,
+ buffer_.offc_*sizeof(T), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(T),
+ buffer_.cpuC_, 0, NULL, NULL);
+ }
+ void roundtrip_func();
+ void zerocopy_roundtrip_func()
+ {
+ std::cout << "xTrmm::zerocopy_roundtrip_func\n";
+ }
+ void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+ int diag_option, int transA_option, int transB_option,
+ size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+ size_t ldc, size_t offA, size_t offBX, size_t offCY,
+ double alpha, double beta)
+ {
+ DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+
+ initialize_scalars(alpha,beta);
+
+ buffer_.N_ = N;
+ buffer_.K_ = K;
+ buffer_.offa_ = offA;
+ buffer_.offb_ = offBX;
+ buffer_.offc_ = offCY;
+
+ if (uplo_option == 0)
+ {
+ buffer_.uplo_ = clblasUpper;
+ }
+ else
+ {
+ buffer_.uplo_ = clblasLower;
+ }
+
+ if (ldc == 0)
+ {
+ buffer_.ldc_ = N;
+ }
+ else if (ldc < N)
+ {
+ std::cerr << "ldc:wrong size\n";
+ }
+ else
+ {
+ buffer_.ldc_ = ldc;
+ }
+
+ buffer_.c_num_vectors_ = N;
+
+ if (order_option == 0)
+ {
+ order_ = clblasRowMajor;
+ if (transA_option == 0)
+ {
+ buffer_.transA_ = clblasNoTrans;
+ buffer_.a_num_vectors_ = N;
+ buffer_.b_num_vectors_ = N;
+ if (lda == 0)
+ {
+ buffer_.lda_ = K;
+ }
+ else if (lda < K)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+
+ if (ldb == 0)
+ {
+ buffer_.ldb_ = K;
+ }
+ else if (ldb < K)
+ {
+ std::cerr << "ldb:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.ldb_ = ldb;
+ }
+ }
+ else
+ {
+ buffer_.a_num_vectors_ = K;
+ buffer_.b_num_vectors_ = K;
+ if (transA_option == 1)
+ {
+ buffer_.transA_ = clblasTrans;
+ }
+ else if (transA_option == 2)
+ {
+ buffer_.transA_ = clblasConjTrans;
+ }
+ if (lda == 0)
+ {
+ buffer_.lda_ = N;
+ }
+ else if (lda < N)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+
+ if (ldb == 0)
+ {
+ buffer_.ldb_ = N;
+ }
+ else if (ldb < N)
+ {
+ std::cerr << "ldb:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.ldb_ = ldb;
+ }
+ }
+ }
+ else
+ {
+ order_ = clblasColumnMajor;
+ if (transA_option == 0)
+ {
+ buffer_.a_num_vectors_ = K;
+ buffer_.b_num_vectors_ = K;
+ buffer_.transA_ = clblasNoTrans;
+ if (lda == 0)
+ {
+ buffer_.lda_ = N;
+ }
+ else if (lda < N)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+
+ if (ldb == 0)
+ {
+ buffer_.ldb_ = N;
+ }
+ else if (ldb < N)
+ {
+ std::cerr << "ldb:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.ldb_ = ldb;
+ }
+ }
+ else
+ {
+ buffer_.a_num_vectors_ = N;
+ buffer_.b_num_vectors_ = N;
+ if (transA_option == 1)
+ {
+ buffer_.transA_ = clblasTrans;
+ }
+ else if (transA_option == 2)
+ {
+ buffer_.transA_ = clblasConjTrans;
+ }
+
+ if (lda == 0)
+ {
+ buffer_.lda_ = K;
+ }
+ else if (lda < K)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+
+ if (ldb == 0)
+ {
+ buffer_.ldb_ = K;
+ }
+ else if (ldb < K)
+ {
+ std::cerr << "ldb:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.ldb_ = ldb;
+ }
+ }
+ }
+
+ buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+ buffer_.cpuB_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
+ buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+ }
+ void releaseGPUBuffer_deleteCPUBuffer()
+ {
+ //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+ //need to do this before we eventually hit the destructor
+ delete buffer_.cpuA_;
+ delete buffer_.cpuB_;
+ delete buffer_.cpuC_;
+ OPENCL_V_THROW( clReleaseMemObject(buffer_.A_), "releasing buffer A");
+ OPENCL_V_THROW( clReleaseMemObject(buffer_.B_), "releasing buffer B");
+ OPENCL_V_THROW( clReleaseMemObject(buffer_.C_), "releasing buffer C");
+ }
+protected:
+protected:
+ void initialize_scalars(double alpha, double beta)
+ {
+ buffer_.alpha_ = makeScalar(alpha);
+ buffer_.beta_ = makeScalar(beta);
+ }
+
+private:
+ xHer2kBuffer buffer_;
+};
+
+template<>
+void
+xHer2k::call_func()
+{
+ timer.Start(timer_id);
+ clblasCher2k(order_, buffer_.uplo_, buffer_.transA_,
+ buffer_.N_, buffer_.K_, buffer_.alpha_,
+ buffer_.A_, buffer_.offa_, buffer_.lda_,
+ buffer_.B_, buffer_.offb_, buffer_.ldb_,
+ buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
+template<>
+void
+xHer2k::roundtrip_func()
+{
+ timer.Start(timer_id);
+ cl_int err;
+ buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offa_) * sizeof(cl_float2),
+ NULL, &err);
+ buffer_.B_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offb_) * sizeof(cl_float2),
+ NULL, &err);
+ buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offc_) * sizeof(cl_float2),
+ NULL, &err);
+ this->initialize_gpu_buffer();
+ clblasCher2k(order_, buffer_.uplo_, buffer_.transA_,
+ buffer_.N_, buffer_.K_, buffer_.alpha_,
+ buffer_.A_, buffer_.offa_, buffer_.lda_,
+ buffer_.B_, buffer_.offb_, buffer_.ldb_,
+ buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+
+ err = clEnqueueWriteBuffer(queue_, buffer_.C_, CL_TRUE,
+ buffer_.offc_ * sizeof(cl_float2),
+ buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(cl_float2),
+ buffer_.cpuC_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
+template<>
+void
+xHer2k::call_func()
+{
+ timer.Start(timer_id);
+
+ clblasZher2k(order_, buffer_.uplo_, buffer_.transA_,
+ buffer_.N_, buffer_.K_, buffer_.alpha_,
+ buffer_.A_, buffer_.offa_, buffer_.lda_,
+ buffer_.B_, buffer_.offb_, buffer_.ldb_,
+ buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
+
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
+template<>
+void
+xHer2k::roundtrip_func()
+{
+ timer.Start(timer_id);
+ cl_int err;
+ buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offa_) * sizeof(cl_double2),
+ NULL, &err);
+ buffer_.B_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offb_) * sizeof(cl_double2),
+ NULL, &err);
+ buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offc_) * sizeof(cl_double2),
+ NULL, &err);
+ this->initialize_gpu_buffer();
+
+ clblasZher2k(order_, buffer_.uplo_, buffer_.transA_,
+ buffer_.N_, buffer_.K_, buffer_.alpha_,
+ buffer_.A_, buffer_.offa_, buffer_.lda_,
+ buffer_.B_, buffer_.offb_, buffer_.ldb_,
+ buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+
+ err = clEnqueueWriteBuffer(queue_, buffer_.C_, CL_TRUE,
+ buffer_.offc_ * sizeof(cl_double2),
+ buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(cl_double2),
+ buffer_.cpuC_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+#endif // ifndef CLBLAS_BENCHMARK_XSYR_HXX__
\ No newline at end of file
diff --git a/src/client/clfunc_xherk.hpp b/src/client/clfunc_xherk.hpp
new file mode 100644
index 00000000..74871a39
--- /dev/null
+++ b/src/client/clfunc_xherk.hpp
@@ -0,0 +1,535 @@
+/* ************************************************************************
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * ************************************************************************/
+
+
+// $Id
+
+#ifndef CLBLAS_BENCHMARK_XHERK_HXX__
+#define CLBLAS_BENCHMARK_XHERK_HXX__
+
+#include "clfunc_common.hpp"
+
+template
+struct xHerkBuffer
+{
+ clblasOrder order_;
+ clblasUplo uplo_;
+ clblasTranspose transA_;
+ size_t N_;
+ size_t K_;
+ T alpha_;
+ cl_mem A_;
+ size_t offa_;
+ size_t lda_;
+ T beta_;
+ cl_mem C_;
+ size_t offc_;
+ size_t ldc_;
+ size_t a_num_vectors_;
+ size_t c_num_vectors_;
+ T* cpuA_;
+ T* cpuC_;
+}; // struct buffer
+
+template
+class xHerk : public clblasFunc
+{
+public:
+ xHerk(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer, devType)
+ {
+ timer.getUniqueID("clHerk", 0);
+ }
+
+ ~xHerk()
+ {
+ }
+
+ double gflops()
+ {
+ return static_cast(4*(buffer_.K_ * buffer_.N_ * (buffer_.N_+1))/time_in_ns());
+ }
+
+ std::string gflops_formula()
+ {
+ return "4*K*N*(N+1)/time";
+ }
+
+ void setup_buffer(int order_option, int side_option, int
+ uplo_option, int diag_option, int transA_option, int
+ transB_option, size_t M, size_t N, size_t K,
+ size_t lda, size_t ldb, size_t ldc,size_t offA,
+ size_t offB, size_t offC, double alpha,
+ double beta)
+ {
+ DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+ DUMMY_ARGS_USAGE_2(ldb, offB);
+
+ initialize_scalars(alpha,beta);
+
+ buffer_.N_ = N;
+ buffer_.K_ = K;
+ buffer_.offa_ = offA;
+ buffer_.offc_ = offC;
+
+ if (uplo_option == 0)
+ {
+ buffer_.uplo_ = clblasUpper;
+ }
+ else
+ {
+ buffer_.uplo_ = clblasLower;
+ }
+
+ if (ldc == 0)
+ {
+ buffer_.ldc_ = N;
+ }
+ else if (ldc < N)
+ {
+ std::cerr << "ldc:wrong size\n";
+ }
+ else
+ {
+ buffer_.ldc_ = ldc;
+ }
+
+ buffer_.c_num_vectors_ = N;
+
+ if (order_option == 0)
+ {
+ order_ = clblasRowMajor;
+ if (transA_option == 0)
+ {
+ buffer_.transA_ = clblasNoTrans;
+ buffer_.a_num_vectors_ = N;
+ if (lda == 0)
+ {
+ buffer_.lda_ = K;
+ }
+ else if (lda < K)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+ }
+ else
+ {
+ buffer_.a_num_vectors_ = K;
+ if (transA_option == 1)
+ {
+ buffer_.transA_ = clblasTrans;
+ }
+ else if (transA_option == 2)
+ {
+ buffer_.transA_ = clblasConjTrans;
+ }
+ if (lda == 0)
+ {
+ buffer_.lda_ = N;
+ }
+ else if (lda < N)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+ }
+ }
+ else
+ {
+ order_ = clblasColumnMajor;
+ if (transA_option == 0)
+ {
+ buffer_.a_num_vectors_ = K;
+ buffer_.transA_ = clblasNoTrans;
+ if (lda == 0)
+ {
+ buffer_.lda_ = N;
+ }
+ else if (lda < N)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+ }
+ else
+ {
+ buffer_.a_num_vectors_ = N;
+ if (transA_option == 1)
+ {
+ buffer_.transA_ = clblasTrans;
+ }
+ else if (transA_option == 2)
+ {
+ buffer_.transA_ = clblasConjTrans;
+ }
+
+ if (lda == 0)
+ {
+ buffer_.lda_ = K;
+ }
+ else if (lda < K)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+ }
+ }
+
+ buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+ buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+
+ cl_int err;
+ buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offa_) * sizeof(T),
+ NULL, &err);
+
+ buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offc_) * sizeof(T),
+ NULL, &err);
+ }
+ void initialize_cpu_buffer()
+ {
+ srand(10);
+ for (size_t i = 0; i < buffer_.a_num_vectors_; ++i)
+ {
+ for (size_t j = 0; j < buffer_.lda_; ++j)
+ {
+ buffer_.cpuA_[i*buffer_.lda_+j] = random(UPPER_BOUND()) /
+ randomScale();
+ }
+ }
+ for (size_t i = 0; i < buffer_.N_; ++i)
+ {
+ for (size_t j = 0; j < buffer_.ldc_; ++j)
+ {
+ buffer_.cpuC_[i*buffer_.ldc_+j] = random(UPPER_BOUND()) /
+ randomScale();
+ }
+ }
+ }
+ void initialize_gpu_buffer()
+ {
+ cl_int err;
+
+ err = clEnqueueWriteBuffer(queue_, buffer_.A_, CL_TRUE,
+ buffer_.offa_ * sizeof(T),
+ buffer_.lda_ * buffer_.a_num_vectors_ *
+ sizeof(T),
+ buffer_.cpuA_, 0, NULL, NULL);
+
+ err = clEnqueueWriteBuffer(queue_, buffer_.C_, CL_TRUE,
+ buffer_.offa_ * sizeof(T),
+ buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(T),
+ buffer_.cpuC_, 0, NULL, NULL);
+ }
+ void reset_gpu_write_buffer()
+ {
+ cl_int err;
+
+ err = clEnqueueWriteBuffer(queue_, buffer_.C_, CL_TRUE,
+ buffer_.offc_ * sizeof(T),
+ buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(T),
+ buffer_.cpuC_, 0, NULL, NULL);
+ }
+ void call_func();
+ void read_gpu_buffer()
+ {
+ cl_int err;
+ err = clEnqueueReadBuffer(queue_, buffer_.C_, CL_TRUE,
+ buffer_.offc_*sizeof(T), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(T),
+ buffer_.cpuC_, 0, NULL, NULL);
+ }
+ void roundtrip_func();
+ void zerocopy_roundtrip_func()
+ {
+ std::cout << "xTrmm::zerocopy_roundtrip_func\n";
+ }
+ void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
+ int diag_option, int transA_option, int transB_option,
+ size_t M, size_t N, size_t K, size_t lda, size_t ldb,
+ size_t ldc, size_t offA, size_t offBX, size_t offCY,
+ double alpha, double beta)
+ {
+ DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+ DUMMY_ARGS_USAGE_2(ldb, offBX);
+
+ initialize_scalars(alpha,beta);
+
+ buffer_.N_ = N;
+ buffer_.K_ = K;
+ buffer_.offa_ = offA;
+ buffer_.offc_ = offCY;
+
+ if (uplo_option == 0)
+ {
+ buffer_.uplo_ = clblasUpper;
+ }
+ else
+ {
+ buffer_.uplo_ = clblasLower;
+ }
+
+ if (ldc == 0)
+ {
+ buffer_.ldc_ = N;
+ }
+ else if (ldc < N)
+ {
+ std::cerr << "ldc:wrong size\n";
+ }
+ else
+ {
+ buffer_.ldc_ = ldc;
+ }
+
+ buffer_.c_num_vectors_ = N;
+
+ if (order_option == 0)
+ {
+ order_ = clblasRowMajor;
+ if (transA_option == 0)
+ {
+ buffer_.transA_ = clblasNoTrans;
+ buffer_.a_num_vectors_ = N;
+ if (lda == 0)
+ {
+ buffer_.lda_ = K;
+ }
+ else if (lda < K)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+ }
+ else
+ {
+ buffer_.a_num_vectors_ = K;
+ if (transA_option == 1)
+ {
+ buffer_.transA_ = clblasTrans;
+ }
+ else if (transA_option == 2)
+ {
+ buffer_.transA_ = clblasConjTrans;
+ }
+ if (lda == 0)
+ {
+ buffer_.lda_ = N;
+ }
+ else if (lda < N)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+ }
+ }
+ else
+ {
+ order_ = clblasColumnMajor;
+ if (transA_option == 0)
+ {
+ buffer_.a_num_vectors_ = K;
+ buffer_.transA_ = clblasNoTrans;
+ if (lda == 0)
+ {
+ buffer_.lda_ = N;
+ }
+ else if (lda < N)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+ }
+ else
+ {
+ buffer_.a_num_vectors_ = N;
+ if (transA_option == 1)
+ {
+ buffer_.transA_ = clblasTrans;
+ }
+ else if (transA_option == 2)
+ {
+ buffer_.transA_ = clblasConjTrans;
+ }
+
+ if (lda == 0)
+ {
+ buffer_.lda_ = K;
+ }
+ else if (lda < K)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+ }
+ }
+
+ buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+ buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+ }
+ void releaseGPUBuffer_deleteCPUBuffer()
+ {
+ //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+ //need to do this before we eventually hit the destructor
+ delete buffer_.cpuA_;
+ delete buffer_.cpuC_;
+ OPENCL_V_THROW( clReleaseMemObject(buffer_.A_), "releasing buffer A");
+ OPENCL_V_THROW( clReleaseMemObject(buffer_.C_), "releasing buffer C");
+ }
+protected:
+protected:
+ void initialize_scalars(double alpha, double beta)
+ {
+ buffer_.alpha_ = makeScalar(alpha);
+ buffer_.beta_ = makeScalar(beta);
+ }
+
+private:
+ xHerkBuffer buffer_;
+};
+
+template<>
+void
+xHerk::call_func()
+{
+ timer.Start(timer_id);
+
+ clblasCherk(order_, buffer_.uplo_, buffer_.transA_,
+ buffer_.N_, buffer_.K_, buffer_.alpha_.s[0],
+ buffer_.A_, buffer_.offa_, buffer_.lda_,
+ buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
+
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
+template<>
+void
+xHerk::roundtrip_func()
+{
+ timer.Start(timer_id);
+ cl_int err;
+ buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offa_) * sizeof(cl_float2),
+ NULL, &err);
+
+ buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offc_) * sizeof(cl_float2),
+ NULL, &err);
+ this->initialize_gpu_buffer();
+
+ clblasCherk(order_, buffer_.uplo_, buffer_.transA_,
+ buffer_.N_, buffer_.K_, buffer_.alpha_.s[0],
+ buffer_.A_, buffer_.offa_, buffer_.lda_,
+ buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+
+ err = clEnqueueWriteBuffer(queue_, buffer_.C_, CL_TRUE,
+ buffer_.offc_ * sizeof(cl_float2),
+ buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(cl_float2),
+ buffer_.cpuC_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
+template<>
+void
+xHerk::call_func()
+{
+ timer.Start(timer_id);
+
+ clblasZherk(order_, buffer_.uplo_, buffer_.transA_,
+ buffer_.N_, buffer_.K_, buffer_.alpha_.s[0],
+ buffer_.A_, buffer_.offa_, buffer_.lda_,
+ buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, &event_);
+
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
+template<>
+void
+xHerk::roundtrip_func()
+{
+ timer.Start(timer_id);
+ cl_int err;
+ buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offa_) * sizeof(cl_double2),
+ NULL, &err);
+
+ buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offc_) * sizeof(cl_double2),
+ NULL, &err);
+ this->initialize_gpu_buffer();
+
+ clblasZherk(order_, buffer_.uplo_, buffer_.transA_,
+ buffer_.N_, buffer_.K_, buffer_.alpha_.s[0],
+ buffer_.A_, buffer_.offa_, buffer_.lda_,
+ buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+
+ err = clEnqueueWriteBuffer(queue_, buffer_.C_, CL_TRUE,
+ buffer_.offc_ * sizeof(cl_double2),
+ buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(cl_double2),
+ buffer_.cpuC_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+#endif // ifndef CLBLAS_BENCHMARK_XSYR_HXX__
\ No newline at end of file
diff --git a/src/client/clfunc_xsymm.hpp b/src/client/clfunc_xsymm.hpp
index e9fe9818..a7558e92 100644
--- a/src/client/clfunc_xsymm.hpp
+++ b/src/client/clfunc_xsymm.hpp
@@ -58,12 +58,6 @@ class xSymm : public clblasFunc
~xSymm()
{
- delete buffer.cpuA;
- delete buffer.cpuB;
- delete buffer.cpuC;
- OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
- OPENCL_V_THROW( clReleaseMemObject(buffer.B), "releasing buffer B");
- OPENCL_V_THROW( clReleaseMemObject(buffer.C), "releasing buffer C");
}
double gflops()
@@ -104,6 +98,10 @@ class xSymm : public clblasFunc
{
std::cout << "xSymm::roundtrip_func\n";
}
+ void zerocopy_roundtrip_func()
+ {
+ std::cout << "xSymm::zerocopy_roundtrip_func\n";
+ }
void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
int diag_option, int transA_option, int transB_option,
size_t M, size_t N, size_t K, size_t lda, size_t ldb,
@@ -212,6 +210,17 @@ class xSymm : public clblasFunc
buffer.cpuC = new T[buffer.N * buffer.ldc];
buffer.cpuA = new T[buffer.a_num_vectors * buffer.lda];
}
+ void releaseGPUBuffer_deleteCPUBuffer()
+ {
+ //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+ //need to do this before we eventually hit the destructor
+ delete buffer.cpuA;
+ delete buffer.cpuB;
+ delete buffer.cpuC;
+ OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
+ OPENCL_V_THROW( clReleaseMemObject(buffer.B), "releasing buffer B");
+ OPENCL_V_THROW( clReleaseMemObject(buffer.C), "releasing buffer C");
+ }
protected:
void initialize_scalars(double alpha, double beta)
{
@@ -337,7 +346,7 @@ void xSymm::setup_buffer(int order_option, int side_option, int
buffer.a_num_vectors * buffer.lda*sizeof(T),
NULL, &err);
- buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
buffer.N*buffer.ldb*sizeof(T),
NULL, &err);
buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
@@ -423,7 +432,7 @@ void xSymm::roundtrip_func()
buffer.a_num_vectors * buffer.lda*sizeof(cl_float),
NULL, &err);
- buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
buffer.N*buffer.ldb*sizeof(cl_float),
NULL, &err);
buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
@@ -476,7 +485,7 @@ void xSymm::roundtrip_func()
buffer.a_num_vectors * buffer.lda*sizeof(cl_double),
NULL, &err);
- buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
buffer.N*buffer.ldb*sizeof(cl_double),
NULL, &err);
buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
@@ -529,7 +538,7 @@ void xSymm::roundtrip_func()
buffer.a_num_vectors * buffer.lda*sizeof(cl_float2),
NULL, &err);
- buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
buffer.N*buffer.ldb*sizeof(cl_float2),
NULL, &err);
buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
@@ -577,12 +586,12 @@ void xSymm::roundtrip_func()
{
timer.Start(timer_id);
//set up buffer
- cl_int err;
+ cl_int err;
buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
buffer.a_num_vectors * buffer.lda*sizeof(cl_double2),
NULL, &err);
- buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
buffer.N*buffer.ldb*sizeof(cl_double2),
NULL, &err);
buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
diff --git a/src/client/clfunc_xsymv.hpp b/src/client/clfunc_xsymv.hpp
index 625c7ec7..c9285410 100644
--- a/src/client/clfunc_xsymv.hpp
+++ b/src/client/clfunc_xsymv.hpp
@@ -209,6 +209,12 @@ class xSymv : public clblasFunc
size_t ldc, size_t offA, size_t offBX, size_t offCY,
double alpha, double beta)
{}
+ void releaseGPUBuffer_deleteCPUBuffer()
+ {
+ //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+ //need to do this before we eventually hit the destructor
+ //to-do
+ }
protected:
void initialize_scalars(double alpha, double beta)
{
diff --git a/src/client/clfunc_xsyr.hpp b/src/client/clfunc_xsyr.hpp
index 172032c9..4c70e69c 100644
--- a/src/client/clfunc_xsyr.hpp
+++ b/src/client/clfunc_xsyr.hpp
@@ -90,6 +90,12 @@ class xSyr : public clblasFunc
size_t ldc, size_t offA, size_t offBX, size_t offCY,
double alpha, double beta)
{}
+ void releaseGPUBuffer_deleteCPUBuffer()
+ {
+ //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+ //need to do this before we eventually hit the destructor
+ //to-do
+ }
protected:
protected:
diff --git a/src/client/clfunc_xsyr2.hpp b/src/client/clfunc_xsyr2.hpp
index 761c6167..9977d08a 100644
--- a/src/client/clfunc_xsyr2.hpp
+++ b/src/client/clfunc_xsyr2.hpp
@@ -94,7 +94,12 @@ class xSyr2 : public clblasFunc
size_t ldc, size_t offA, size_t offBX, size_t offCY,
double alpha, double beta)
{}
-
+ void releaseGPUBuffer_deleteCPUBuffer()
+ {
+ //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+ //need to do this before we eventually hit the destructor
+ //to-do
+ }
protected:
protected:
void initialize_scalars(double alpha, double beta)
diff --git a/src/client/clfunc_xsyr2k.hpp b/src/client/clfunc_xsyr2k.hpp
index 4faa3997..ae60f9e0 100644
--- a/src/client/clfunc_xsyr2k.hpp
+++ b/src/client/clfunc_xsyr2k.hpp
@@ -61,15 +61,6 @@ class xSyr2k : public clblasFunc
~xSyr2k()
{
- delete buffer_.a_;
- delete buffer_.b_;
- delete buffer_.c_;
- OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
- "releasing buffer A");
- OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_b_),
- "releasing buffer B");
- OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
- "releasing buffer C");
}
void call_func()
@@ -78,13 +69,12 @@ class xSyr2k : public clblasFunc
double gflops()
{
- return 2.0*buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns() +
- buffer_.n_*(buffer_.n_+1)/time_in_ns();
+ return (2*buffer_.k_*buffer_.n_*buffer_.n_+buffer_.n_)/time_in_ns();
}
std::string gflops_formula()
{
- return "2.0*(M*(M+1)*N+M*(M+1))/time";
+ return "(2*K*N*N+N)/time";
}
void setup_buffer(int order_option, int side_option, int uplo_option,
@@ -293,7 +283,7 @@ class xSyr2k : public clblasFunc
(buffer_.ldb_ * buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(T),
NULL, &err);
- buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
(buffer_.ldc_ * buffer_.c_num_vectors_ +
buffer_.offC_) * sizeof(T),
NULL, &err);
@@ -364,19 +354,232 @@ class xSyr2k : public clblasFunc
}
void read_gpu_buffer()
{
- //cl_int err;
- //to-do need to fill up
+ cl_int err;
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+ buffer_.offC_ * sizeof(T),
+ buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(T),
+ buffer_.c_, 0, NULL, NULL);
}
void roundtrip_func()
- {//to-do need to fill up
+ {
+ }
+ void zerocopy_roundtrip_func()
+ {
+ std::cout << "xTrmm::zerocopy_roundtrip_func\n";
}
void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
int diag_option, int transA_option, int transB_option,
size_t M, size_t N, size_t K, size_t lda, size_t ldb,
size_t ldc, size_t offA, size_t offBX, size_t offCY,
double alpha, double beta)
- {}
+ {
+ DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+
+ initialize_scalars(alpha, beta);
+
+ buffer_.n_ = N;
+ buffer_.k_ = K;
+ buffer_.offA_ = offA;
+ buffer_.offB_ = offBX;
+ buffer_.offC_ = offCY;
+
+ if (uplo_option == 0)
+ {
+ buffer_.uplo_ = clblasUpper;
+ }
+ else
+ {
+ buffer_.uplo_ = clblasLower;
+ }
+
+
+ if (ldc == 0)
+ {
+ buffer_.ldc_ = N;
+ }
+ else if (ldc < N)
+ {
+ std::cerr << "ldc:wrong size\n";
+ }
+ else
+ {
+ buffer_.ldc_ = ldc;
+ }
+ buffer_.c_num_vectors_ = N;
+
+ if (order_option == 0)
+ {
+ order_ = clblasRowMajor;
+ if (transA_option == 0)
+ {
+ buffer_.trans_ = clblasNoTrans;
+ buffer_.a_num_vectors_ = N;
+ buffer_.b_num_vectors_ = N;
+ if (lda == 0)
+ {
+ buffer_.lda_ = K;
+ }
+ else if (lda < K)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+ if (ldb == 0)
+ {
+ buffer_.ldb_ = K;
+ }
+ else if (ldb < K)
+ {
+ std::cerr << "ldb:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.ldb_ = ldb;
+ }
+ }
+ else
+ {
+ buffer_.a_num_vectors_ = K;
+ buffer_.b_num_vectors_ = K;
+ if (transA_option == 1)
+ {
+ buffer_.trans_ = clblasTrans;
+ }
+ else if (transA_option == 2)
+ {
+ buffer_.trans_ = clblasConjTrans;
+ }
+ if (lda == 0)
+ {
+ buffer_.lda_ = N;
+ }
+ else if (lda < N)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+ if (ldb == 0)
+ {
+ buffer_.ldb_ = N;
+ }
+ else if (ldb < N)
+ {
+ std::cerr << "ldb:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.ldb_ = ldb;
+ }
+ }
+ }
+ else
+ {
+ order_ = clblasColumnMajor;
+ if (transA_option == 0)
+ {
+ buffer_.a_num_vectors_ = K;
+ buffer_.b_num_vectors_ = K;
+ buffer_.trans_ = clblasNoTrans;
+ if (lda == 0)
+ {
+ buffer_.lda_ = N;
+ }
+ else if (lda < N)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+ if (ldb == 0)
+ {
+ buffer_.ldb_ = N;
+ }
+ else if (ldb < N)
+ {
+ std::cerr << "ldb:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.ldb_ = ldb;
+ }
+ }
+ else
+ {
+ buffer_.a_num_vectors_ = N;
+ buffer_.b_num_vectors_ = N;
+ if (transA_option == 1)
+ {
+ buffer_.trans_ = clblasTrans;
+ }
+ else if (transA_option == 2)
+ {
+ buffer_.trans_ = clblasConjTrans;
+ }
+ if (lda == 0)
+ {
+ buffer_.lda_ = K;
+ }
+ else if (lda < K)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+
+ if (ldb == 0)
+ {
+ buffer_.ldb_ = K;
+ }
+ else if (ldb < K)
+ {
+ std::cerr << "ldb:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.ldb_ = ldb;
+ }
+ }
+ }
+
+ buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+ buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
+ buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+
+ }
+ void releaseGPUBuffer_deleteCPUBuffer()
+ {
+ //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+ //need to do this before we eventually hit the destructor
+ delete buffer_.a_;
+ delete buffer_.b_;
+ delete buffer_.c_;
+ OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
+ "releasing buffer A");
+ OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_b_),
+ "releasing buffer B");
+ OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
+ "releasing buffer C");
+ }
protected:
void initialize_scalars(double alpha, double beta)
{
@@ -406,6 +609,41 @@ call_func()
timer.Stop(timer_id);
}
+template<>
+void
+xSyr2k::
+roundtrip_func()
+{
+ timer.Start(timer_id);
+ cl_int err;
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(float),
+ NULL, &err);
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(float),
+ NULL, &err);
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(float),
+ NULL, &err);
+
+ this->initialize_gpu_buffer();
+ clblasSsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_,
+ buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+ buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+ buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+ buffer_.offC_ * sizeof(float),
+ buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(float),
+ buffer_.c_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
template<>
void
xSyr2k::
@@ -423,6 +661,41 @@ call_func()
timer.Stop(timer_id);
}
+template<>
+void
+xSyr2k::
+roundtrip_func()
+{
+ timer.Start(timer_id);
+ cl_int err;
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(double),
+ NULL, &err);
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(double),
+ NULL, &err);
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(double),
+ NULL, &err);
+
+ this->initialize_gpu_buffer();
+ clblasDsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_,
+ buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+ buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+ buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+ buffer_.offC_ * sizeof(double),
+ buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(double),
+ buffer_.c_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
template<>
void
xSyr2k::
@@ -440,6 +713,56 @@ call_func()
timer.Stop(timer_id);
}
+template<>
+void
+xSyr2k::
+roundtrip_func()
+{
+ timer.Start(timer_id);
+ cl_int err;
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(cl_float2),
+ NULL, &err);
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(cl_float2),
+ NULL, &err);
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(cl_float2),
+ NULL, &err);
+ this->initialize_gpu_buffer();
+
+ clblasCsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_,
+ buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+ buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+ buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+ buffer_.offC_ * sizeof(cl_float2),
+ buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(cl_float2),
+ buffer_.c_, 0, NULL, &event_);
+
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
+template<>
+double
+xSyr2k::gflops()
+{
+ return (8*buffer_.k_*buffer_.n_*buffer_.n_+2*buffer_.n_)/time_in_ns();
+}
+
+template<>
+std::string
+xSyr2k::gflops_formula()
+{
+ return "(8*K*N*N+2*N)/time";
+}
+
template<>
void
xSyr2k::
@@ -457,4 +780,53 @@ call_func()
timer.Stop(timer_id);
}
+template<>
+void
+xSyr2k::
+roundtrip_func()
+{
+ timer.Start(timer_id);
+ cl_int err;
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(cl_double2),
+ NULL, &err);
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(cl_double2),
+ NULL, &err);
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(cl_double2),
+ NULL, &err);
+ this->initialize_gpu_buffer();
+ clblasZsyr2k(order_, buffer_.uplo_, buffer_.trans_, buffer_.n_,
+ buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+ buffer_.lda_, buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
+ buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+ buffer_.offC_ * sizeof(cl_double2),
+ buffer_.ldc_ * buffer_.c_num_vectors_ *
+ sizeof(cl_double2),
+ buffer_.c_, 0, NULL, &event_);
+
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
+template<>
+double
+xSyr2k::gflops()
+{
+ return (8*buffer_.k_*buffer_.n_*buffer_.n_+2*buffer_.n_)/time_in_ns();
+}
+
+template<>
+std::string
+xSyr2k::gflops_formula()
+{
+ return "(8*K*N*N+2*N)/time";
+}
+
#endif // ifndef CLBLAS_BENCHMARK_XSYR2K_HXX__
diff --git a/src/client/clfunc_xsyrk.hpp b/src/client/clfunc_xsyrk.hpp
index 5bfd0e3c..e9b6a7a5 100644
--- a/src/client/clfunc_xsyrk.hpp
+++ b/src/client/clfunc_xsyrk.hpp
@@ -56,13 +56,7 @@ class xSyrk : public clblasFunc
~xSyrk()
{
- delete buffer_.a_;
- delete buffer_.c_;
- OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
- "releasing buffer A");
- OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
- "releasing buffer C");
- }
+ }
void call_func()
{
@@ -70,13 +64,12 @@ class xSyrk : public clblasFunc
double gflops()
{
- return buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns() +
- buffer_.n_*(buffer_.n_+1)/time_in_ns();
+ return buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns();
}
std::string gflops_formula()
{
- return "(N*(N+1)*K+N*(N+1))/time";
+ return "(N*(N+1)*K)/time";
}
void setup_buffer(int order_option, int side_option, int uplo_option,
@@ -224,7 +217,7 @@ class xSyrk : public clblasFunc
buffer_.offA_) * sizeof(T),
NULL, &err);
- buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
(buffer_.ldc_ * buffer_.c_num_vectors_ +
buffer_.offC_) * sizeof(T),
NULL, &err);
@@ -281,18 +274,168 @@ class xSyrk : public clblasFunc
}
void read_gpu_buffer()
{
- //cl_int err;
- //to-do need to fill up
+ cl_int err;
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+ buffer_.offC_*sizeof(T), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(T),
+ buffer_.c_, 0, NULL, NULL);
}
void roundtrip_func()
- {//to-do need to fill up
+ {
+ }
+ void zerocopy_roundtrip_func()
+ {
+ std::cout << "xSyrk::zerocopy_roundtrip_func\n";
}
void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
int diag_option, int transA_option, int transB_option,
size_t M, size_t N, size_t K, size_t lda, size_t ldb,
size_t ldc, size_t offA, size_t offBX, size_t offCY,
double alpha, double beta)
- {}
+ {
+ DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
+ DUMMY_ARGS_USAGE_2(ldb, offBX);
+
+ initialize_scalars(alpha, beta);
+
+ buffer_.n_ = N;
+ buffer_.k_ = K;
+ buffer_.offA_ = offA;
+ buffer_.offC_ = offCY;
+
+ if (uplo_option == 0)
+ {
+ buffer_.uplo_ = clblasUpper;
+ }
+ else
+ {
+ buffer_.uplo_ = clblasLower;
+ }
+
+
+ if (ldc == 0)
+ {
+ buffer_.ldc_ = N;
+ }
+ else if (ldc < N)
+ {
+ std::cerr << "ldc:wrong size\n";
+ }
+ else
+ {
+ buffer_.ldc_ = ldc;
+ }
+ buffer_.c_num_vectors_ = N;
+
+ if (order_option == 0)
+ {
+ order_ = clblasRowMajor;
+ if (transA_option == 0)
+ {
+ buffer_.trans_a_ = clblasNoTrans;
+ buffer_.a_num_vectors_ = N;
+ if (lda == 0)
+ {
+ buffer_.lda_ = K;
+ }
+ else if (lda < K)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+ }
+ else
+ {
+ buffer_.a_num_vectors_ = K;
+ if (transA_option == 1)
+ {
+ buffer_.trans_a_ = clblasTrans;
+ }
+ else if (transA_option == 2)
+ {
+ buffer_.trans_a_ = clblasConjTrans;
+ }
+ if (lda == 0)
+ {
+ buffer_.lda_ = N;
+ }
+ else if (lda < N)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+ }
+ }
+ else
+ {
+ order_ = clblasColumnMajor;
+ if (transA_option == 0)
+ {
+ buffer_.a_num_vectors_ = K;
+ buffer_.trans_a_ = clblasNoTrans;
+ if (lda == 0)
+ {
+ buffer_.lda_ = N;
+ }
+ else if (lda < N)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+ }
+ else
+ {
+ buffer_.a_num_vectors_ = N;
+ if (transA_option == 1)
+ {
+ buffer_.trans_a_ = clblasTrans;
+ }
+ else if (transA_option == 2)
+ {
+ buffer_.trans_a_ = clblasConjTrans;
+ }
+
+ if (lda == 0)
+ {
+ buffer_.lda_ = K;
+ }
+ else if (lda < K)
+ {
+ std::cerr << "lda:wrong size\n";
+ exit(1);
+ }
+ else
+ {
+ buffer_.lda_ = lda;
+ }
+ }
+ }
+
+ buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
+ buffer_.c_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
+ }
+ void releaseGPUBuffer_deleteCPUBuffer()
+ {
+ //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+ //need to do this before we eventually hit the destructor
+ delete buffer_.a_;
+ delete buffer_.c_;
+ OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
+ "releasing buffer A");
+ OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_c_),
+ "releasing buffer C");
+ }
protected:
void initialize_scalars(double alpha, double beta)
{
@@ -321,6 +464,35 @@ call_func()
timer.Stop(timer_id);
}
+template<>
+void
+xSyrk::roundtrip_func()
+{
+ timer.Start(timer_id);
+
+ cl_int err;
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(float),
+ NULL, &err);
+
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(float),
+ NULL, &err);
+ this->initialize_gpu_buffer();
+ clblasSsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_,
+ buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+ buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+ buffer_.offC_*sizeof(float), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(float),
+ buffer_.c_, 0, NULL, &event_);
+
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
template<>
void
xSyrk::
@@ -337,6 +509,35 @@ call_func()
timer.Stop(timer_id);
}
+template<>
+void
+xSyrk::roundtrip_func()
+{
+ timer.Start(timer_id);
+
+ cl_int err;
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(double),
+ NULL, &err);
+
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(double),
+ NULL, &err);
+ this->initialize_gpu_buffer();
+ clblasDsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_,
+ buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+ buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+ buffer_.offC_*sizeof(double), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(double),
+ buffer_.c_, 0, NULL, &event_);
+
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
template<>
void
xSyrk::
@@ -353,6 +554,48 @@ call_func()
timer.Stop(timer_id);
}
+template<>
+void
+xSyrk::roundtrip_func()
+{
+ timer.Start(timer_id);
+
+ cl_int err;
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(cl_float2),
+ NULL, &err);
+
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(cl_float2),
+ NULL, &err);
+ this->initialize_gpu_buffer();
+ clblasCsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_,
+ buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+ buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+ buffer_.offC_*sizeof(cl_float2), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(cl_float2),
+ buffer_.c_, 0, NULL, &event_);
+
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
+template<>
+double
+xSyrk::gflops()
+{
+ return 4*buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns();
+}
+
+template<>
+std::string
+xSyrk::gflops_formula()
+{
+ return "(4*N*(N+1)*K)/time";
+}
template<>
void
xSyrk::
@@ -369,4 +612,47 @@ call_func()
timer.Stop(timer_id);
}
+template<>
+void
+xSyrk::roundtrip_func()
+{
+ timer.Start(timer_id);
+
+ cl_int err;
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(cl_double2),
+ NULL, &err);
+
+ buffer_.buf_c_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldc_ * buffer_.c_num_vectors_ +
+ buffer_.offC_) * sizeof(cl_double2),
+ NULL, &err);
+ this->initialize_gpu_buffer();
+ clblasZsyrk(order_, buffer_.uplo_, buffer_.trans_a_, buffer_.n_,
+ buffer_.k_, buffer_.alpha_, buffer_.buf_a_, buffer_.offA_,
+ buffer_.lda_, buffer_.beta_, buffer_.buf_c_, buffer_.offC_,
+ buffer_.ldc_, 1, &queue_, 0, NULL, NULL);
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_c_, CL_TRUE,
+ buffer_.offC_*sizeof(cl_double2), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(cl_double2),
+ buffer_.c_, 0, NULL, &event_);
+
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+}
+
+template<>
+double
+xSyrk::gflops()
+{
+ return 4*buffer_.n_*(buffer_.n_+1)*buffer_.n_/time_in_ns();
+}
+
+template<>
+std::string
+xSyrk::gflops_formula()
+{
+ return "(4*N*(N+1)*K)/time";
+}
+
#endif // ifndef CLBLAS_BENCHMARK_XSYRK_HXX__
diff --git a/src/client/clfunc_xtrmm.hpp b/src/client/clfunc_xtrmm.hpp
index d47ddfdb..2e05300c 100644
--- a/src/client/clfunc_xtrmm.hpp
+++ b/src/client/clfunc_xtrmm.hpp
@@ -57,12 +57,6 @@ class xTrmm : public clblasFunc
~xTrmm()
{
- delete buffer_.a_;
- delete buffer_.b_;
- OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_a_),
- "releasing buffer A");
- OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_b_),
- "releasing buffer B");
}
void call_func()
@@ -238,7 +232,7 @@ class xTrmm : public clblasFunc
buffer_.offA_) * sizeof(T),
NULL, &err);
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
(buffer_.ldb_ * buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(T),
NULL, &err);
@@ -310,7 +304,11 @@ class xTrmm : public clblasFunc
}
void roundtrip_func()
{
- std::cout << "xGemm::roundtrip_func\n";
+ std::cout << "xTrmm::roundtrip_func\n";
+ }
+ void zerocopy_roundtrip_func()
+ {
+ std::cout << "xTrmm::zerocopy_roundtrip_func\n";
}
void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
int diag_option, int transA_option, int transB_option,
@@ -450,6 +448,17 @@ class xTrmm : public clblasFunc
buffer_.a_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
buffer_.b_ = new T[buffer_.ldb_*buffer_.b_num_vectors_];
}
+ void releaseGPUBuffer_deleteCPUBuffer()
+ {
+ //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+ //need to do this before we eventually hit the destructor
+ delete buffer_.a_;
+ delete buffer_.b_;
+ OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_a_),
+ "releasing buffer A");
+ OPENCL_V_THROW(clReleaseMemObject(buffer_.buf_b_),
+ "releasing buffer B");
+ }
protected:
void initialize_scalars(double alpha, double beta)
{
@@ -493,7 +502,7 @@ roundtrip_func()
buffer_.offA_) * sizeof(cl_float),
NULL, &err);
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
(buffer_.ldb_ * buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(cl_float),
NULL, &err);
@@ -557,7 +566,7 @@ roundtrip_func()
buffer_.offA_) * sizeof(cl_double),
NULL, &err);
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
(buffer_.ldb_ * buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(cl_double),
NULL, &err);
@@ -621,7 +630,7 @@ roundtrip_func()
buffer_.offA_) * sizeof(cl_float2),
NULL, &err);
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
(buffer_.ldb_ * buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(cl_float2),
NULL, &err);
@@ -685,7 +694,7 @@ roundtrip_func()
buffer_.offA_) * sizeof(cl_double2),
NULL, &err);
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
(buffer_.ldb_ * buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(cl_double2),
NULL, &err);
diff --git a/src/client/clfunc_xtrmv.hpp b/src/client/clfunc_xtrmv.hpp
index 725e9f31..80d5004c 100644
--- a/src/client/clfunc_xtrmv.hpp
+++ b/src/client/clfunc_xtrmv.hpp
@@ -225,6 +225,12 @@ class xTrmv : public clblasFunc
size_t ldc, size_t offA, size_t offBX, size_t offCY,
double alpha, double beta)
{}
+ void releaseGPUBuffer_deleteCPUBuffer()
+ {
+ //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+ //need to do this before we eventually hit the destructor
+ //to-do
+ }
protected:
void initialize_scalars(double alpha, double beta)
{
diff --git a/src/client/clfunc_xtrsm.hpp b/src/client/clfunc_xtrsm.hpp
index 8ae85c30..2eb64cfb 100644
--- a/src/client/clfunc_xtrsm.hpp
+++ b/src/client/clfunc_xtrsm.hpp
@@ -22,6 +22,7 @@
#include "clfunc_common.hpp"
+
template
struct xTrsmBuffer
{
@@ -57,17 +58,13 @@ class xTrsm : public clblasFunc
~xTrsm()
{
- delete buffer_.a_;
- delete buffer_.b_;
- OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_a_),
- "releasing buffer A");
- OPENCL_V_THROW( clReleaseMemObject(buffer_.buf_b_),
- "releasing buffer B");
}
void call_func()
{
- std::cout << "xtrsm::call_func\n";
+ timer.Start(timer_id);
+ xTrsm_Function(true);
+ timer.Stop(timer_id);
}
double gflops()
@@ -237,7 +234,7 @@ class xTrsm : public clblasFunc
buffer_.offA_) * sizeof(T),
NULL, &err);
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
(buffer_.ldb_ * buffer_.b_num_vectors_ +
buffer_.offB_) * sizeof(T),
NULL, &err);
@@ -317,7 +314,179 @@ class xTrsm : public clblasFunc
}
void roundtrip_func()
{
- std::cout << "xtrsm::call_func\n";
+ timer.Start(timer_id);
+ //set up buffer
+ cl_int err;
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ NULL, &err);
+
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ NULL, &err);
+ //initialize gpu buffer
+ err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
+ buffer_.offA_ * sizeof(T),
+ buffer_.lda_ * buffer_.a_num_vectors_ *
+ sizeof(T),
+ buffer_.a_, 0, NULL, NULL);
+
+ err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+ buffer_.offB_ * sizeof(T),
+ buffer_.ldb_ * buffer_.b_num_vectors_ *
+ sizeof(T),
+ buffer_.b_, 0, NULL, NULL);
+ //call func
+ xTrsm_Function(false);
+ //read gpu buffer
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+ buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ *
+ sizeof(T),
+ buffer_.b_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+ }
+ void allochostptr_roundtrip_func()
+ {
+ timer.Start(timer_id);
+ //set up buffer
+ cl_int err;
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ NULL, &err);
+
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ NULL, &err);
+ // Map the buffers to pointers at host device
+ T *map_a,*map_b;
+ map_a = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ // memcpy the input A, B to the mapped regions
+ memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) );
+ memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
+ // unmap the buffers
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_a_, map_a, 0, NULL, NULL);
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL);
+ //call func
+ xTrsm_Function(false);
+ // map the B buffer again to read the output
+ map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_READ, 0,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+ }
+ void usehostptr_roundtrip_func()
+ {
+ timer.Start(timer_id);
+ //set up buffer
+ cl_int err;
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ buffer_.a_, &err);
+
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ buffer_.b_, &err);
+ //call func
+ xTrsm_Function(false);
+ //read gpu buffer
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+ buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ *
+ sizeof(T),
+ buffer_.b_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+ }
+ void copyhostptr_roundtrip_func()
+ {
+ timer.Start(timer_id);
+ //set up buffer
+ cl_int err;
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ buffer_.a_, &err);
+
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ buffer_.b_, &err);
+ //call func
+ xTrsm_Function(false);
+ //read gpu buffer
+ err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
+ buffer_.offB_ * sizeof(T), buffer_.ldb_ * buffer_.b_num_vectors_ *
+ sizeof(T),
+ buffer_.b_, 0, NULL, &event_);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+ }
+ void usepersismem_roundtrip_func()
+ {
+#if defined(CL_MEM_USE_PERSISTENT_MEM_AMD)
+ timer.Start(timer_id);
+ //set up buffer
+ cl_int err;
+ buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD,
+ (buffer_.lda_ * buffer_.a_num_vectors_ +
+ buffer_.offA_) * sizeof(T),
+ NULL, &err);
+
+ buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE | CL_MEM_USE_PERSISTENT_MEM_AMD,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ NULL, &err);
+ // Map the buffers to pointers at host device
+ T *map_a,*map_b;
+ map_a = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_a_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_WRITE, 0,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ // memcpy the input A, B to the mapped regions
+ memcpy( map_a, buffer_.a_, ( buffer_.lda_*buffer_.a_num_vectors_ + buffer_.offA_) * sizeof( T ) );
+ memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
+ // unmap the buffers
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_a_, map_a, 0, NULL, NULL);
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL);
+ //call func
+ xTrsm_Function(false);
+ // map the B buffer again to read the output
+ map_b = (T*)clEnqueueMapBuffer(queue_, buffer_.buf_b_, CL_TRUE, CL_MAP_READ, 0,
+ (buffer_.ldb_ * buffer_.b_num_vectors_ +
+ buffer_.offB_) * sizeof(T),
+ 0, NULL, NULL, &err);
+ memcpy( map_b, buffer_.b_, ( buffer_.ldb_*buffer_.b_num_vectors_ + buffer_.offB_) * sizeof( T ) );
+ clEnqueueUnmapMemObject(queue_, buffer_.buf_b_, map_b, 0, NULL, NULL);
+ clWaitForEvents(1, &event_);
+ timer.Stop(timer_id);
+#else
+ std::cout<<"CL_MEM_USE_PERSISTENT_MEM_AMD is only supported on AMD hardware"< buffer_;
+ void xTrsm_Function(bool flush);
}; // class xtrsm
template<>
void
xTrsm::
-call_func()
+xTrsm_Function(bool flush)
{
- timer.Start(timer_id);
-
clblasStrsm(order_, buffer_.side_, buffer_.uplo_,
buffer_.trans_a_, buffer_.diag_,
buffer_.m_, buffer_.n_, buffer_.alpha_,
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
1, &queue_, 0, NULL, &event_);
-
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
-}
-
-template<>
-void
-xTrsm::
-roundtrip_func()
-{
- timer.Start(timer_id);
- //set up buffer
- cl_int err;
- buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.lda_ * buffer_.a_num_vectors_ +
- buffer_.offA_) * sizeof(cl_float),
- NULL, &err);
-
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.ldb_ * buffer_.b_num_vectors_ +
- buffer_.offB_) * sizeof(cl_float),
- NULL, &err);
- //initialize gpu buffer
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
- buffer_.offA_ * sizeof(cl_float),
- buffer_.lda_ * buffer_.a_num_vectors_ *
- sizeof(cl_float),
- buffer_.a_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_float),
- buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_float),
- buffer_.b_, 0, NULL, NULL);
- //call func
- clblasStrsm(order_, buffer_.side_, buffer_.uplo_,
- buffer_.trans_a_, buffer_.diag_,
- buffer_.m_, buffer_.n_, buffer_.alpha_,
- buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
- buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
- 1, &queue_, 0, NULL, NULL);
- //read gpu buffer
- err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_float), buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_float),
- buffer_.b_, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
+ if(flush==true)
+ {
+ clWaitForEvents(1, &event_);
+ }
}
template<>
void
xTrsm::
-call_func()
+xTrsm_Function(bool flush)
{
- timer.Start(timer_id);
-
clblasDtrsm(order_, buffer_.side_, buffer_.uplo_,
buffer_.trans_a_, buffer_.diag_,
buffer_.m_, buffer_.n_, buffer_.alpha_,
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
1, &queue_, 0, NULL, &event_);
-
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
-}
-
-template<>
-void
-xTrsm::
-roundtrip_func()
-{
- timer.Start(timer_id);
- //set up buffer
- cl_int err;
- buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.lda_ * buffer_.a_num_vectors_ +
- buffer_.offA_) * sizeof(cl_double),
- NULL, &err);
-
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.ldb_ * buffer_.b_num_vectors_ +
- buffer_.offB_) * sizeof(cl_double),
- NULL, &err);
- //initialize gpu buffer
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
- buffer_.offA_ * sizeof(cl_double),
- buffer_.lda_ * buffer_.a_num_vectors_ *
- sizeof(cl_double),
- buffer_.a_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_double),
- buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_double),
- buffer_.b_, 0, NULL, NULL);
- //call func
- clblasDtrsm(order_, buffer_.side_, buffer_.uplo_,
- buffer_.trans_a_, buffer_.diag_,
- buffer_.m_, buffer_.n_, buffer_.alpha_,
- buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
- buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
- 1, &queue_, 0, NULL, NULL);
- //read gpu buffer
- err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_double), buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_double),
- buffer_.b_, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
+ if(flush==true)
+ {
+ clWaitForEvents(1, &event_);
+ }
}
template<>
void
xTrsm::
-call_func()
+xTrsm_Function(bool flush)
{
- timer.Start(timer_id);
-
clblasCtrsm(order_, buffer_.side_, buffer_.uplo_,
buffer_.trans_a_, buffer_.diag_,
buffer_.m_, buffer_.n_, buffer_.alpha_,
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
1, &queue_, 0, NULL, &event_);
-
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
-}
-
-template<>
-void
-xTrsm::
-roundtrip_func()
-{
- timer.Start(timer_id);
- //set up buffer
- cl_int err;
- buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.lda_ * buffer_.a_num_vectors_ +
- buffer_.offA_) * sizeof(cl_float2),
- NULL, &err);
-
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.ldb_ * buffer_.b_num_vectors_ +
- buffer_.offB_) * sizeof(cl_float2),
- NULL, &err);
- //initialize gpu buffer
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
- buffer_.offA_ * sizeof(cl_float2),
- buffer_.lda_ * buffer_.a_num_vectors_ *
- sizeof(cl_float2),
- buffer_.a_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_float2),
- buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_float2),
- buffer_.b_, 0, NULL, NULL);
- //call func
- clblasCtrsm(order_, buffer_.side_, buffer_.uplo_,
- buffer_.trans_a_, buffer_.diag_,
- buffer_.m_, buffer_.n_, buffer_.alpha_,
- buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
- buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
- 1, &queue_, 0, NULL, NULL);
- //read gpu buffer
- err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_float2), buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_float2),
- buffer_.b_, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
-}
-
-template<>
-void
-xTrsm::
-call_func()
-{
- timer.Start(timer_id);
-
- clblasZtrsm(order_, buffer_.side_, buffer_.uplo_,
- buffer_.trans_a_, buffer_.diag_,
- buffer_.m_, buffer_.n_, buffer_.alpha_,
- buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
- buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
- 1, &queue_, 0, NULL, &event_);
-
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
+ if(flush==true)
+ {
+ clWaitForEvents(1, &event_);
+ }
}
template<>
void
xTrsm::
-roundtrip_func()
+xTrsm_Function(bool flush)
{
- timer.Start(timer_id);
- //set up buffer
- cl_int err;
- buffer_.buf_a_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.lda_ * buffer_.a_num_vectors_ +
- buffer_.offA_) * sizeof(cl_double2),
- NULL, &err);
-
- buffer_.buf_b_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
- (buffer_.ldb_ * buffer_.b_num_vectors_ +
- buffer_.offB_) * sizeof(cl_double2),
- NULL, &err);
- //initialize gpu buffer
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_a_, CL_TRUE,
- buffer_.offA_ * sizeof(cl_double2),
- buffer_.lda_ * buffer_.a_num_vectors_ *
- sizeof(cl_double2),
- buffer_.a_, 0, NULL, NULL);
-
- err = clEnqueueWriteBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_double2),
- buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_double2),
- buffer_.b_, 0, NULL, NULL);
- //call func
- clblasZtrsm(order_, buffer_.side_, buffer_.uplo_,
+ clblasZtrsm(order_, buffer_.side_, buffer_.uplo_,
buffer_.trans_a_, buffer_.diag_,
buffer_.m_, buffer_.n_, buffer_.alpha_,
buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
- 1, &queue_, 0, NULL, NULL);
- //read gpu buffer
- err = clEnqueueReadBuffer(queue_, buffer_.buf_b_, CL_TRUE,
- buffer_.offB_ * sizeof(cl_double2), buffer_.ldb_ * buffer_.b_num_vectors_ *
- sizeof(cl_double2),
- buffer_.b_, 0, NULL, &event_);
- clWaitForEvents(1, &event_);
- timer.Stop(timer_id);
+ 1, &queue_, 0, NULL, &event_);
+ if(flush==true)
+ {
+ clWaitForEvents(1, &event_);
+ }
}
+
template<>
double
xTrsm::
diff --git a/src/client/clfunc_xtrsv.hpp b/src/client/clfunc_xtrsv.hpp
index f0b728ab..4eb0e5b8 100644
--- a/src/client/clfunc_xtrsv.hpp
+++ b/src/client/clfunc_xtrsv.hpp
@@ -218,6 +218,12 @@ class xTrsv : public clblasFunc
size_t ldc, size_t offA, size_t offBX, size_t offCY,
double alpha, double beta)
{}
+ void releaseGPUBuffer_deleteCPUBuffer()
+ {
+ //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
+ //need to do this before we eventually hit the destructor
+ //to-do
+ }
protected:
void initialize_scalars(double alpha, double beta)
{
diff --git a/src/client/client.cpp b/src/client/client.cpp
index 8f60a07a..16186095 100644
--- a/src/client/client.cpp
+++ b/src/client/client.cpp
@@ -39,6 +39,8 @@
#include "clfunc_xhemv.hpp"
#include "clfunc_xhemm.hpp"
#include "clfunc_xsymm.hpp"
+#include "clfunc_xherk.hpp"
+#include "clfunc_xher2k.hpp"
namespace po = boost::program_options;
@@ -67,6 +69,7 @@ int main(int argc, char *argv[])
std::string function;
std::string precision;
std::string roundtrip;
+ std::string memalloc;
int side_option;
int uplo_option;
int diag_option;
@@ -98,7 +101,8 @@ int main(int argc, char *argv[])
( "uplo", po::value( &uplo_option )->default_value(0), "0 = upper, 1 = lower. only used with [list of function families]" ) // xsymv xsyrk xsyr2k xtrsm xtrmm
( "diag", po::value( &diag_option )->default_value(0), "0 = unit diagonal, 1 = non unit diagonal. only used with [list of function families]" ) // xtrsm xtrmm
( "profile,p", po::value( &profileCount )->default_value(20), "Time and report the kernel speed (default: profiling off)" )
- ( "roundtrip", po::value( &roundtrip )->default_value("noroundtrip"),"calculate the time for round trips")
+ ( "roundtrip", po::value( &roundtrip )->default_value("noroundtrip"),"including the time of OpenCL memory allocation and transportation; options:roundtrip, noroundtrip(default)")
+ ( "memalloc", po::value( &memalloc )->default_value("default"),"setting the memory allocation flags for OpenCL; would not take effect if roundtrip time is not measured; options:default(default),alloc_host_ptr,use_host_ptr,copy_host_ptr,use_persistent_mem_amd,rect_mem")
;
po::variables_map vm;
@@ -130,6 +134,8 @@ int main(int argc, char *argv[])
&& function != "hemv"
&& function != "hemm"
&& function != "symm"
+ && function != "herk"
+ && function != "her2k"
)
{
std::cerr << "Invalid value for --function" << std::endl;
@@ -432,6 +438,30 @@ int main(int argc, char *argv[])
return -1;
}
}
+ else if (function == "herk")
+ {
+ if (precision == "c")
+ my_function = new xHerk(timer, deviceType);
+ else if (precision == "z")
+ my_function = new xHerk(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown her function" << std::endl;
+ return -1;
+ }
+ }
+ else if (function == "her2k")
+ {
+ if (precision == "c")
+ my_function = new xHer2k(timer, deviceType);
+ else if (precision == "z")
+ my_function = new xHer2k(timer, deviceType);
+ else
+ {
+ std::cerr << "Unknown her2 function" << std::endl;
+ return -1;
+ }
+ }
else if (function == "symm")
{
if (precision == "s")
@@ -483,8 +513,33 @@ int main(int argc, char *argv[])
my_function->call_func();
my_function->read_gpu_buffer();
my_function->reset_gpu_write_buffer();*/
- my_function->roundtrip_func();
- my_function->reset_gpu_write_buffer();
+
+ if(memalloc=="default")
+ {
+ my_function->roundtrip_func();
+ }
+ else if (memalloc=="alloc_host_ptr")
+ {
+ my_function->allochostptr_roundtrip_func();
+ }
+ else if (memalloc=="use_host_ptr")
+ {
+ my_function->usehostptr_roundtrip_func();
+ }
+ else if (memalloc=="copy_host_ptr")
+ {
+ my_function->copyhostptr_roundtrip_func();
+ }
+ else if (memalloc=="use_persistent_mem_amd")
+ {
+ my_function->usepersismem_roundtrip_func();
+ }
+ else if (memalloc=="rect_mem")
+ {
+ my_function->roundtrip_func_rect();
+ }
+ //my_function->reset_gpu_write_buffer();
+ my_function->releaseGPUBuffer_deleteCPUBuffer();
}
if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE )
@@ -512,7 +567,8 @@ int main(int argc, char *argv[])
my_function->initialize_gpu_buffer();
my_function->call_func();
my_function->read_gpu_buffer();
- my_function->reset_gpu_write_buffer();
+ //my_function->reset_gpu_write_buffer();
+ my_function->releaseGPUBuffer_deleteCPUBuffer();
}
if( commandQueueFlags & CL_QUEUE_PROFILING_ENABLE )
@@ -525,7 +581,7 @@ int main(int argc, char *argv[])
std::endl;
}
}
-
+ delete my_function;
return 0;
}
diff --git a/src/include/defbool.h b/src/include/defbool.h
index e90736dd..26caf6af 100644
--- a/src/include/defbool.h
+++ b/src/include/defbool.h
@@ -18,7 +18,7 @@
#ifndef DEFBOOL_H_
#define DEFBOOL_H_
-#if defined(_MSC_VER) && _MSC_VER <= 1600
+#if defined(_MSC_VER) && _MSC_VER <= 1700
/*
FIX for windows compilation
@@ -48,10 +48,10 @@ typedef int _Bool;
#endif /* !__cplusplus */
-#else /* defined(_MSC_VER) && _MSC_VER <= 1600 */
+#else /* defined(_MSC_VER) && _MSC_VER <= 1700 */
#include
-#endif /* defined(_MSC_VER) && _MSC_VER <= 1600 */
+#endif /* defined(_MSC_VER) && _MSC_VER <= 1700 */
#endif /* DEFBOOL_H_ */
diff --git a/src/include/kern_cache.h b/src/include/kern_cache.h
index b6749c59..af14a855 100644
--- a/src/include/kern_cache.h
+++ b/src/include/kern_cache.h
@@ -55,6 +55,7 @@ typedef struct Kernel {
void *extra;
size_t extraSize;
void (*dtor)(struct Kernel *kern);
+ int noSource;
} Kernel;
typedef int
diff --git a/src/include/kerngen.h b/src/include/kerngen.h
index dd44b9ea..73ee1912 100644
--- a/src/include/kerngen.h
+++ b/src/include/kerngen.h
@@ -42,6 +42,12 @@
*/
/*@{*/
+#ifdef _MSC_VER
+#define SPREFIX "I"
+#else
+#define SPREFIX "z"
+#endif
+
#define SUBDIM_UNUSED (size_t)-1
enum {
diff --git a/src/include/trace_malloc.h b/src/include/trace_malloc.h
index 3dfa3152..acc97531 100644
--- a/src/include/trace_malloc.h
+++ b/src/include/trace_malloc.h
@@ -48,7 +48,7 @@ void releaseMallocTrace(void);
static __inline void initMallocTrace(void)
{
- /* do noting */
+ /* do nothing */
}
static __inline void printMallocStatistics(void)
diff --git a/src/library/CMakeLists.txt b/src/library/CMakeLists.txt
index 5bc8e2aa..f06282e6 100644
--- a/src/library/CMakeLists.txt
+++ b/src/library/CMakeLists.txt
@@ -277,53 +277,77 @@ if( BLAS_PRINT_BUILD_ERRORS )
add_definitions( -DPRINT_BUILD_ERRORS )
endif()
-#add_executable(tplgen tools/tplgen/tplgen.cpp)
-if (CMAKE_COMPILER_IS_GNUCXX)
- include(ExternalProject)
- ExternalProject_Add(
- tplgen
- URL "${CMAKE_SOURCE_DIR}/library/tools/tplgen"
- INSTALL_COMMAND ""
- )
- add_custom_target( GENERATE_CLT
- COMMAND ${CMAKE_BINARY_DIR}/library/tplgen-prefix/src/tplgen-build/tplgen -o ../../include/ ${SRC_CL_TEMPLATES}
- WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/library/blas/gens/clTemplates
- )
- add_dependencies(GENERATE_CLT tplgen)
+include( ExternalProject )
+ExternalProject_Add( tplgen
+ URL "${CMAKE_SOURCE_DIR}/library/tools/tplgen"
+ INSTALL_COMMAND ""
+)
+
+ExternalProject_Get_Property( tplgen binary_dir )
+
+set( tplgenBinaryDir "" )
+if( CMAKE_COMPILER_IS_GNUCXX )
+ set( tplgenBinaryDir ${binary_dir} )
else()
- include(ExternalProject)
- ExternalProject_Add(
- tplgen
- URL "${CMAKE_SOURCE_DIR}/library/tools/tplgen"
- CONFIGURE_COMMAND "${CMAKE_BINARY_DIR}\\library\\tplgen-prefix\\src\\tplgen\\configure.bat"
- BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Debug
-# BUILD_COMMAND MSBuild.exe tplgen.sln /m /fl /flp1:logfile=errors.log;errorsonly /flp2:logfile=warnings.log;warningsonly /t:rebuild
- INSTALL_COMMAND ""
- )
- add_custom_target( GENERATE_CLT
- COMMAND ${CMAKE_BINARY_DIR}\\library\\tplgen-prefix\\src\\tplgen-build\\Debug\\tplgen.exe -o ..\\..\\include ${SRC_CL_TEMPLATES}
- WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}\\library\\blas\\gens\\clTemplates
- )
- add_dependencies(GENERATE_CLT tplgen)
+ set( tplgenBinaryDir "${binary_dir}/${CMAKE_CFG_INTDIR}" )
endif()
+
+add_custom_target( GENERATE_CLT
+ COMMAND ${tplgenBinaryDir}/tplgen -o ${clBLAS_BINARY_DIR}/include ${SRC_CL_TEMPLATES}
+ WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/library/blas/gens/clTemplates
+)
+
+add_dependencies( GENERATE_CLT tplgen )
+
+if( CMAKE_COMPILER_IS_GNUCC )
+ configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/clBLAS.pc.in
+ ${CMAKE_CURRENT_BINARY_DIR}/clBLAS.pc @ONLY )
+
+ install( FILES ${CMAKE_CURRENT_BINARY_DIR}/clBLAS.pc
+ DESTINATION lib${SUFFIX_LIB}/pkgconfig )
+endif( )
+
add_library(clBLAS SHARED ${CLBLAS_SOURCES} ${GLOBAL_HEADERS} ${SRC_BLAS_HEADERS} ${SRC_BLAS_GENS_HEADERS})
add_dependencies(clBLAS GENERATE_CLT)
set_target_properties(clBLAS PROPERTIES VERSION ${clBLAS_VERSION})
set_target_properties(clBLAS PROPERTIES SOVERSION ${clBLAS_SOVERSION})
+set_target_properties( clBLAS PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
target_link_libraries(clBLAS ${OPENCL_LIBRARIES} ${MATH_LIBRARY})
-if( TARGET_PLATFORM EQUAL 64 )
- # CPack configuration; include the executable into the package
- install( TARGETS clBLAS
- RUNTIME DESTINATION bin64
- LIBRARY DESTINATION lib64
- ARCHIVE DESTINATION lib64/import
- )
-else()
- # CPack configuration; include the executable into the package
- install( TARGETS clBLAS
- RUNTIME DESTINATION bin32
- LIBRARY DESTINATION lib32
- ARCHIVE DESTINATION lib32/import
- )
-endif()
+# CPack configuration; include the executable into the package
+install( TARGETS clBLAS
+ RUNTIME DESTINATION bin${SUFFIX_BIN}
+ LIBRARY DESTINATION lib${SUFFIX_LIB}
+ ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+ )
+
+# For debug builds, include the debug runtimes into the package for testing on non-developer machines
+set( CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS_SKIP true )
+set( CMAKE_INSTALL_DEBUG_LIBRARIES true )
+set( CMAKE_INSTALL_DEBUG_LIBRARIES_ONLY true )
+
+if( WIN32 )
+ set( CLBLAS_RUNTIME_DESTINATION bin${SUFFIX_BIN} )
+else( )
+ set( CLBLAS_RUNTIME_DESTINATION lib${SUFFIX_LIB} )
+endif( )
+
+include( InstallRequiredSystemLibraries )
+
+# Install necessary runtime files for debug builds
+install( PROGRAMS ${CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS}
+ CONFIGURATIONS Debug
+ DESTINATION ${CLBLAS_RUNTIME_DESTINATION} )
+
+# Install all *.pdb files for debug builds
+install( DIRECTORY ${PROJECT_BINARY_DIR}/staging/
+ DESTINATION ${CLBLAS_RUNTIME_DESTINATION}
+ OPTIONAL
+ CONFIGURATIONS Debug
+ FILES_MATCHING PATTERN "*.pdb" )
+
+# Install a snapshot of the source as it was for this build; useful for the .pdb's
+install( DIRECTORY ${PROJECT_SOURCE_DIR}
+ DESTINATION ${CLBLAS_RUNTIME_DESTINATION}
+ OPTIONAL
+ CONFIGURATIONS Debug )
diff --git a/src/library/blas/generic/common.c b/src/library/blas/generic/common.c
index 9e26887d..fef08800 100644
--- a/src/library/blas/generic/common.c
+++ b/src/library/blas/generic/common.c
@@ -22,6 +22,7 @@
#include
#include
#include
+#include
#include "clblas-internal.h"
@@ -364,6 +365,7 @@ Kernel VISIBILITY_HIDDEN
kernel->extra = calloc(1, kernel->extraSize);
*(CLBLASKernExtra*)(kernel->extra) = *extra;
kernel->dtor = extraDtor;
+ kernel->noSource = 1;
}
else {
putKernel(NULL, kernel);
@@ -491,6 +493,7 @@ Kernel
#if !defined(KEEP_CLBLAS_KERNEL_SOURCES)
if (err == CL_SUCCESS) {
err = dropProgramSource(&kernel->program, context, device);
+ kernel->noSource = 1;
}
#endif /* !DUMP_CLBLAS_KERNELS */
@@ -524,17 +527,34 @@ setupBuildOpts(
opts[0] = '\0';
#if !defined NDEBUG
- strcpy(opts, "-g ");
+ addBuildOpt(opts, BUILD_OPTS_MAXLEN, "-g");
#endif /* NDEBUG */
if (target.ident.vendor == VENDOR_NVIDIA &&
!strcmp(mempat->name, "2-staged cached global memory based "
"block trsm")) {
- strcat(opts, "-cl-opt-disable");
+ addBuildOpt(opts, BUILD_OPTS_MAXLEN, "-cl-opt-disable");
}
}
+void addBuildOpt(
+ char * opts,
+ size_t len,
+ const char * option)
+{
+ size_t l = strlen(opts);
+
+ if (l > 0 && !isspace(opts[l-1]) && l+1 < len) {
+ opts[l] = ' ';
+ opts[l+1] = '\0';
+ l++;
+ }
+
+ strncat(opts, option, len - l - 1);
+}
+
+
char VISIBILITY_HIDDEN
*sprintfGranulation(char *buf, const SubproblemDim *dim, int level)
{
diff --git a/src/library/blas/generic/kdump.c b/src/library/blas/generic/kdump.c
index 5345fc78..a48204a0 100644
--- a/src/library/blas/generic/kdump.c
+++ b/src/library/blas/generic/kdump.c
@@ -17,7 +17,7 @@
#include
#include
-#include
+#include
#include
#include
diff --git a/src/library/blas/generic/solution_seq_make.c b/src/library/blas/generic/solution_seq_make.c
index 0eee1fd7..8a5e402d 100644
--- a/src/library/blas/generic/solution_seq_make.c
+++ b/src/library/blas/generic/solution_seq_make.c
@@ -1435,9 +1435,12 @@ getStepGranulation(SolutionStep *step)
}
}
- status = getGranularityInfo(&step->device, mempat->name,
- step->args.dtype, step->extraFlags,
- (int)MNK, dims, &step->pgran, &time);
+ if( step->funcID != CLBLAS_GEMM2 )
+ {
+ status = getGranularityInfo(&step->device, mempat->name,
+ step->args.dtype, step->extraFlags,
+ (int)MNK, dims, &step->pgran, &time);
+ }
/*
* Disable blocking for implementations dealing with cache reads
* from the global memory
diff --git a/src/library/blas/gens/asum.cpp b/src/library/blas/gens/asum.cpp
index 3260acbe..06b9f544 100644
--- a/src/library/blas/gens/asum.cpp
+++ b/src/library/blas/gens/asum.cpp
@@ -125,23 +125,23 @@ setBuildOpts(
const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
{
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
#ifdef DEBUG_DOT
printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
#endif
}
if ( (kargs->dtype == TYPE_COMPLEX_FLOAT) || (kargs->dtype == TYPE_COMPLEX_DOUBLE))
{
- strcat( buildOptStr, " -DCOMPLEX ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX");
#ifdef DEBUG_ASUM
printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
#endif
}
if( (kargs->ldb.vector) != 1) {
- strcat( buildOptStr, " -DINCX_NONUNITY ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
}
if( (kargs->ldb.vector) < 1) {
- strcat( buildOptStr, " -DINCX_NEGATIVE ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NEGATIVE");
}
return;
}
diff --git a/src/library/blas/gens/axpy_reg.cpp b/src/library/blas/gens/axpy_reg.cpp
index 0f8ced01..52aab71f 100644
--- a/src/library/blas/gens/axpy_reg.cpp
+++ b/src/library/blas/gens/axpy_reg.cpp
@@ -125,16 +125,16 @@ setBuildOpts(
const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
{
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
#ifdef DEBUG_AXPY
printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
#endif
}
if( (kargs->ldb.vector) != 1) {
- strcat( buildOptStr, " -DINCX_NONUNITY ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
}
if( (kargs->ldc.vector) != 1) {
- strcat( buildOptStr, " -DINCY_NONUNITY ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY");
}
return;
diff --git a/src/library/blas/gens/copy_reg.cpp b/src/library/blas/gens/copy_reg.cpp
index d9f70951..ba1ff398 100644
--- a/src/library/blas/gens/copy_reg.cpp
+++ b/src/library/blas/gens/copy_reg.cpp
@@ -125,16 +125,16 @@ setBuildOpts(
const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
{
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
#ifdef DEBUG_COPY
printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
#endif
}
if( (kargs->ldb.vector) != 1) {
- strcat( buildOptStr, " -DINCX_NONUNITY ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
}
if( (kargs->ldc.vector) != 1) {
- strcat( buildOptStr, " -DINCY_NONUNITY ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY");
}
return;
diff --git a/src/library/blas/gens/dot.cpp b/src/library/blas/gens/dot.cpp
index 3f68221d..ed3e72b8 100644
--- a/src/library/blas/gens/dot.cpp
+++ b/src/library/blas/gens/dot.cpp
@@ -128,16 +128,16 @@ setBuildOpts(
const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
{
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
#ifdef DEBUG_DOT
printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
#endif
}
if( (kargs->ldb.vector) != 1) {
- strcat( buildOptStr, " -DINCX_NONUNITY ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
}
if( (kargs->ldc.vector) != 1) {
- strcat( buildOptStr, " -DINCY_NONUNITY ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY");
}
return;
diff --git a/src/library/blas/gens/gbmv.cpp b/src/library/blas/gens/gbmv.cpp
index 115ffbc0..ab8e5e2a 100644
--- a/src/library/blas/gens/gbmv.cpp
+++ b/src/library/blas/gens/gbmv.cpp
@@ -116,7 +116,7 @@ setBuildOpts(
if ( (kargs->dtype == TYPE_DOUBLE) || (kargs->dtype == TYPE_COMPLEX_DOUBLE) )
{
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
#ifdef DEBUG_GBMV
printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
#endif
@@ -124,15 +124,15 @@ setBuildOpts(
if( kargs->pigFuncID == CLBLAS_TBMV )
{
- strcat( buildOptStr, " -DTBMV_ONLY ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DTBMV_ONLY");
if( kargs->diag == clblasUnit )
{
- strcat( buildOptStr, " -DUNIT_DIAG ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DUNIT_DIAG");
}
}
if( ((kargs->pigFuncID == CLBLAS_GBMV) || (kargs->pigFuncID == CLBLAS_TBMV)) && (kargs->transA == clblasConjTrans) )
{
- strcat( buildOptStr, " -DDO_CONJ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDO_CONJ");
}
if( (kargs->pigFuncID == CLBLAS_SBMV) || (kargs->pigFuncID == CLBLAS_HBMV) )
@@ -141,15 +141,15 @@ setBuildOpts(
isUpper = ( kargs->order == clblasColumnMajor )? !isUpper : isUpper;
if( isUpper )
- strcat( buildOptStr, " -DGIVEN_SHBMV_UPPER ");
- else strcat( buildOptStr, " -DGIVEN_SHBMV_LOWER ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DGIVEN_SHBMV_UPPER");
+ else addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DGIVEN_SHBMV_LOWER");
if(kargs->pigFuncID == CLBLAS_HBMV)
{
- strcat( buildOptStr, " -DHBMV_ONLY ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHBMV_ONLY");
if( kargs->order == clblasColumnMajor ) // Since routine calls Row-major, the whole matrix has to be conjugated while loading
{
- strcat( buildOptStr, " -DDO_CONJ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDO_CONJ");
}
}
}
diff --git a/src/library/blas/gens/gemm_cached.cpp b/src/library/blas/gens/gemm_cached.cpp
index 09231f90..5c7c3526 100644
--- a/src/library/blas/gens/gemm_cached.cpp
+++ b/src/library/blas/gens/gemm_cached.cpp
@@ -158,36 +158,36 @@ setBuildOpts(
if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
{
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
}
if (isComplexType(kargs->dtype))
{
- strcat(buildOptStr, " -DCOMPLEX ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX");
}
if ((bestSize.useBarrier) == 1)
{
- strcat(buildOptStr, " -DGEMM_NEEDS_BARRIER ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DGEMM_NEEDS_BARRIER");
}
if (kargs->M % dims->y)
{
- strcat(buildOptStr, " -DM_TAIL_PRESENT ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DM_TAIL_PRESENT");
}
if (kargs->N % dims->x)
{
- strcat(buildOptStr, " -DN_TAIL_PRESENT ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DN_TAIL_PRESENT");
}
if (kflags & KEXTRA_CONJUGATE_A)
{
- strcat( buildOptStr, " -DCONJUGATE_A ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCONJUGATE_A");
}
if (kflags & KEXTRA_CONJUGATE_B)
{
- strcat( buildOptStr, " -DCONJUGATE_B ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCONJUGATE_B");
}
switch(kargs->pigFuncID)
@@ -201,46 +201,46 @@ setBuildOpts(
#endif
if (kargs->side == clblasLeft)
{
- strcat (buildOptStr, " -D__SYMM_LEFT__ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LEFT__");
}
if (kargs->side == clblasRight)
{
- strcat (buildOptStr, " -D__SYMM_RIGHT__ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_RIGHT__");
}
if (kargs->uplo == clblasLower)
{
- strcat(buildOptStr, " -D__SYMM_LOWER__ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LOWER__");
}
if (kargs->uplo == clblasUpper)
{
- strcat(buildOptStr, " -D__SYMM_UPPER__ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_UPPER__");
}
// Define the order for Legacy sake.
if (kargs->order == clblasColumnMajor)
{
- strcat(buildOptStr, " -D__SYMM_COLMAJOR__ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_COLMAJOR__");
} else {
- strcat(buildOptStr, " -D__SYMM_ROWMAJOR__ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_ROWMAJOR__");
}
if ((kargs->pigFuncID == CLBLAS_SYMM_DIAGONAL) || (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL))
{
- strcat(buildOptStr, " -D__SYMM_DIAGONAL__ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_DIAGONAL__");
}
if (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL)
{
- strcat(buildOptStr, " -D__HEMM__ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__HEMM__");
}
break;
case CLBLAS_HERK:
- strcat( buildOptStr, " -DHERK");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK");
if(kargs->uplo == clblasLower)
{
- strcat( buildOptStr, " -DHERK_LOWER_TRIANGLE");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK_LOWER_TRIANGLE");
}
else if(kargs->uplo == clblasUpper)
{
- strcat( buildOptStr, " -DHERK_UPPER_TRIANGLE");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK_UPPER_TRIANGLE");
}
break;
diff --git a/src/library/blas/gens/gemm_tail_cached.cpp b/src/library/blas/gens/gemm_tail_cached.cpp
index ea792499..ff144af9 100644
--- a/src/library/blas/gens/gemm_tail_cached.cpp
+++ b/src/library/blas/gens/gemm_tail_cached.cpp
@@ -96,10 +96,10 @@ setBuildOpts(
const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
KernelExtraFlags kflags = step->extraFlags;
- strcat(buildOptStr, " -DTAIL_RUN -DM_TAIL_PRESENT -DN_TAIL_PRESENT ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DTAIL_RUN -DM_TAIL_PRESENT -DN_TAIL_PRESENT");
if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
{
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
#ifdef DEBUG_GEMM_TAIL
printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
#endif
@@ -107,16 +107,16 @@ setBuildOpts(
if (isComplexType(kargs->dtype))
{
- strcat(buildOptStr, " -DCOMPLEX ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX");
}
if (kflags & KEXTRA_CONJUGATE_A)
{
- strcat( buildOptStr, " -DCONJUGATE_A ");
-}
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCONJUGATE_A");
+ }
if (kflags & KEXTRA_CONJUGATE_B)
{
- strcat( buildOptStr, " -DCONJUGATE_B ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCONJUGATE_B");
}
@@ -127,14 +127,14 @@ setBuildOpts(
break;
case CLBLAS_HERK:
- strcat( buildOptStr, " -DHERK");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK");
if(kargs->uplo == clblasLower)
{
- strcat( buildOptStr, " -DHERK_LOWER_TRIANGLE");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK_LOWER_TRIANGLE");
}
else if(kargs->uplo == clblasUpper)
{
- strcat( buildOptStr, " -DHERK_UPPER_TRIANGLE");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERK_UPPER_TRIANGLE");
}
break;
@@ -147,33 +147,34 @@ setBuildOpts(
#endif
if (kargs->side == clblasLeft)
{
- strcat (buildOptStr, " -D__SYMM_LEFT__ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LEFT__");
}
if (kargs->side == clblasRight)
{
- strcat (buildOptStr, " -D__SYMM_RIGHT__ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_RIGHT__");
}
if (kargs->uplo == clblasLower)
{
- strcat(buildOptStr, " -D__SYMM_LOWER__ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LOWER__");
}
if (kargs->uplo == clblasUpper)
{
- strcat(buildOptStr, " -D__SYMM_UPPER__ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_UPPER__");
}
+ // Define the order for Legacy sake.
if (kargs->order == clblasColumnMajor)
{
- strcat(buildOptStr, " -D__SYMM_COLMAJOR__ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_COLMAJOR__");
} else {
- strcat(buildOptStr, " -D__SYMM_ROWMAJOR__ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_ROWMAJOR__");
}
- if ((kargs->pigFuncID == CLBLAS_SYMM_DIAGONAL) || (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL))
+ if ((kargs->pigFuncID == CLBLAS_SYMM_DIAGONAL) || (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL))
{
- strcat(buildOptStr, " -D__SYMM_DIAGONAL__ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_DIAGONAL__");
}
if (kargs->pigFuncID == CLBLAS_HEMM_DIAGONAL)
{
- strcat(buildOptStr, " -D__HEMM__ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__HEMM__");
}
break;
diff --git a/src/library/blas/gens/ger_lds.cpp b/src/library/blas/gens/ger_lds.cpp
index b74945ea..f72d1975 100644
--- a/src/library/blas/gens/ger_lds.cpp
+++ b/src/library/blas/gens/ger_lds.cpp
@@ -137,7 +137,7 @@ setBuildOpts(
const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
{
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
#ifdef DEBUG_GER
printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
@@ -282,8 +282,8 @@ generator(
BH = subdims->y;
BW = subdims->x;
- sprintf( bhStr, "%d", BH );
- sprintf( bwStr, "%d", BW );
+ sprintf( bhStr, "%" SPREFIX "u", BH );
+ sprintf( bwStr, "%" SPREFIX "u", BW );
#ifdef DEBUG_GER
printf("BH = %s\n", bhStr);
diff --git a/src/library/blas/gens/her2_lds.cpp b/src/library/blas/gens/her2_lds.cpp
index a409c1ad..5adda19d 100644
--- a/src/library/blas/gens/her2_lds.cpp
+++ b/src/library/blas/gens/her2_lds.cpp
@@ -139,25 +139,25 @@ setBuildOpts(
const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
if ( kargs->dtype == TYPE_COMPLEX_DOUBLE )
{
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
#ifdef DEBUG_HER2
printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
#endif
}
if( kargs->order == clblasRowMajor )
{
- strcat( buildOptStr, " -DHER2_ROWMAJOR ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHER2_ROWMAJOR");
#ifdef DEBUG_HER2
printf("Setting build options ... HERMITIAN2_ROWMAJOR... for row-major support\n");
#endif
}
if( kargs->pigFuncID == CLBLAS_HPR2 )
{
- strcat( buildOptStr, " -DPACKED ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
}
//Build options for syr2_her2.clT to generate HER2 related code.
- strcat( buildOptStr, " -DHER2_ONLY ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHER2_ONLY");
return;
}
@@ -301,7 +301,7 @@ generator(
}
kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD);
- sprintf( targetRows, "%d", TARGETROWS );
+ sprintf( targetRows, "%" SPREFIX "u", TARGETROWS );
sprintf( blockSize, "%d", BLOCKSIZE );
#ifdef DEBUG_HER2
diff --git a/src/library/blas/gens/her_lds.cpp b/src/library/blas/gens/her_lds.cpp
index e174de2d..1a8365f0 100644
--- a/src/library/blas/gens/her_lds.cpp
+++ b/src/library/blas/gens/her_lds.cpp
@@ -139,25 +139,25 @@ setBuildOpts(
const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
if ( kargs->dtype == TYPE_COMPLEX_DOUBLE )
{
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
#ifdef DEBUG_HER
printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
#endif
}
if( kargs->order == clblasRowMajor )
{
- strcat( buildOptStr, " -DHERMITIAN_ROWMAJOR ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHERMITIAN_ROWMAJOR");
#ifdef DEBUG_HER
printf("Setting build options ... HERMITIAN_ROWMAJOR... for row-major support\n");
#endif
}
if( kargs->pigFuncID == CLBLAS_HPR )
{
- strcat( buildOptStr, " -DPACKED ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
}
//Build options for syr_her.clT to generate HER related code.
- strcat( buildOptStr, " -DHER_ONLY ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHER_ONLY");
return;
}
@@ -300,7 +300,7 @@ generator(
}
kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD);
- sprintf( targetRows, "%d", TARGETROWS );
+ sprintf( targetRows, "%" SPREFIX "u", TARGETROWS );
sprintf( blockSize, "%d", BLOCKSIZE );
#ifdef DEBUG_HER
diff --git a/src/library/blas/gens/iamax.cpp b/src/library/blas/gens/iamax.cpp
index bf20afd0..7a5966de 100644
--- a/src/library/blas/gens/iamax.cpp
+++ b/src/library/blas/gens/iamax.cpp
@@ -124,7 +124,7 @@ setBuildOpts(
const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
{
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
#ifdef DEBUG_AMAX
printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
#endif
@@ -132,17 +132,17 @@ setBuildOpts(
if( (kargs->ldb.vector) != 1)
{
- strcat( buildOptStr, " -DINCX_NONUNITY ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
}
if( (kargs->ldb.vector) < 1)
{
- strcat( buildOptStr, " -DRETURN_ON_INVALID ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DRETURN_ON_INVALID");
}
if( (kargs->redctnType == REDUCE_MAX_WITH_INDEX_ATOMICS))
{
- strcat( buildOptStr, " -DREDUCE_MAX_WITH_INDEX_ATOMICS ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_MAX_WITH_INDEX_ATOMICS");
}
return;
diff --git a/src/library/blas/gens/kprintf.cpp b/src/library/blas/gens/kprintf.cpp
index 54772fa2..d5cbecb8 100644
--- a/src/library/blas/gens/kprintf.cpp
+++ b/src/library/blas/gens/kprintf.cpp
@@ -346,7 +346,7 @@ char* kprintf::mystrtok( char* in, const char* tok)
bool tokenFound = false;
for( size_t i=0 ; i <= (strlen(tok) - 1); i++)
{
- if ((*strtokPtr == tok[i]))
+ if (*strtokPtr == tok[i])
{
if ( tok[i] == '(')
{
diff --git a/src/library/blas/gens/legacy/tests/CMakeLists.txt b/src/library/blas/gens/legacy/tests/CMakeLists.txt
index 9c5a0f37..fae11cc5 100644
--- a/src/library/blas/gens/legacy/tests/CMakeLists.txt
+++ b/src/library/blas/gens/legacy/tests/CMakeLists.txt
@@ -45,19 +45,11 @@ include_directories(${OPENCL_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_
add_executable(t_blkmul ${SRC_BLKMUL})
target_link_libraries(t_blkmul ${OPENCL_LIBRARIES})
+set_target_properties( t_blkmul PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
-if( TARGET_PLATFORM EQUAL 64 )
- # CPack configuration; include the executable into the package
- install( TARGETS t_blkmul
- RUNTIME DESTINATION bin64
- LIBRARY DESTINATION lib64
- ARCHIVE DESTINATION lib64/import
- )
-else()
- # CPack configuration; include the executable into the package
- install( TARGETS t_blkmul
- RUNTIME DESTINATION bin32
- LIBRARY DESTINATION lib32
- ARCHIVE DESTINATION lib32/import
- )
-endif()
+# CPack configuration; include the executable into the package
+install( TARGETS t_blkmul
+ RUNTIME DESTINATION bin${SUFFIX_BIN}
+ LIBRARY DESTINATION lib${SUFFIX_LIB}
+ ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+ )
diff --git a/src/library/blas/gens/legacy/tests/t_blkmul.c b/src/library/blas/gens/legacy/tests/t_blkmul.c
index 4983ce0d..590231ee 100644
--- a/src/library/blas/gens/legacy/tests/t_blkmul.c
+++ b/src/library/blas/gens/legacy/tests/t_blkmul.c
@@ -15,7 +15,11 @@
* ************************************************************************/
+#ifdef __APPLE__
+#include
+#else
#include
+#endif
#include
#include
#include
diff --git a/src/library/blas/gens/nrm2.cpp b/src/library/blas/gens/nrm2.cpp
index 832f5e41..d898ffbc 100644
--- a/src/library/blas/gens/nrm2.cpp
+++ b/src/library/blas/gens/nrm2.cpp
@@ -128,22 +128,22 @@ setBuildOpts(
const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
{
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
}
if( (kargs->dtype == TYPE_COMPLEX_FLOAT) || (kargs->dtype == TYPE_COMPLEX_DOUBLE) ) {
- strcat( buildOptStr, " -DCOMPLEX ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX");
}
if(kargs->redctnType == REDUCE_BY_HYPOT) {
- strcat( buildOptStr, "-DUSE_HYPOT ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DUSE_HYPOT");
} else if(kargs->redctnType == REDUCE_BY_SSQ) {
- strcat( buildOptStr, " -DUSE_SSQ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DUSE_SSQ");
}
if( (kargs->ldb.vector) != 1) {
- strcat( buildOptStr, " -DINCX_NONUNITY ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
}
if( (kargs->ldb.vector) < 1) {
- strcat( buildOptStr, " -DRETURN_ON_INVALID");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DRETURN_ON_INVALID");
}
return;
}
diff --git a/src/library/blas/gens/reduction.cpp b/src/library/blas/gens/reduction.cpp
index 1c81c0b7..5c005280 100644
--- a/src/library/blas/gens/reduction.cpp
+++ b/src/library/blas/gens/reduction.cpp
@@ -130,29 +130,29 @@ setBuildOpts(
const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
{
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
}
switch(kargs->redctnType)
{
- case REDUCE_BY_SUM: strcat( buildOptStr, "-DREDUCE_BY_SUM ");
+ case REDUCE_BY_SUM: addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_BY_SUM");
break;
- case REDUCE_BY_MAX: strcat( buildOptStr, "-DREDUCE_BY_MAX ");
+ case REDUCE_BY_MAX: addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_BY_MAX");
break;
- case REDUCE_BY_MIN: strcat( buildOptStr, "-DREDUCE_BY_MIN ");
+ case REDUCE_BY_MIN: addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_BY_MIN");
break;
- case REDUCE_MAX_WITH_INDEX: strcat( buildOptStr, "-DREDUCE_MAX_WITH_INDEX ");
+ case REDUCE_MAX_WITH_INDEX: addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_MAX_WITH_INDEX");
break;
- case REDUCE_BY_HYPOT: strcat( buildOptStr, "-DREDUCE_BY_HYPOT ");
+ case REDUCE_BY_HYPOT: addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_BY_HYPOT");
break;
- case REDUCE_BY_SSQ: strcat( buildOptStr, "-DREDUCE_BY_SSQ ");
+ case REDUCE_BY_SSQ: addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_BY_SSQ");
break;
- case REDUCE_MAX_WITH_INDEX_ATOMICS: strcat( buildOptStr, "-DREDUCE_MAX_WITH_INDEX_ATOMICS ");
+ case REDUCE_MAX_WITH_INDEX_ATOMICS: addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DREDUCE_MAX_WITH_INDEX_ATOMICS");
break;
default: printf("Invalid reduction type!!\n");
diff --git a/src/library/blas/gens/rotg_reg.cpp b/src/library/blas/gens/rotg_reg.cpp
index 0ec1eb0a..4d1ded18 100644
--- a/src/library/blas/gens/rotg_reg.cpp
+++ b/src/library/blas/gens/rotg_reg.cpp
@@ -98,10 +98,10 @@ setBuildOpts(
const SolutionStep *step = (const SolutionStep *)args;
const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
if ( (kargs->dtype == TYPE_DOUBLE) || (kargs->dtype == TYPE_COMPLEX_DOUBLE) ) {
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
}
if( (kargs->dtype == TYPE_COMPLEX_FLOAT) || (kargs->dtype == TYPE_COMPLEX_DOUBLE) ) {
- strcat( buildOptStr, " -DCOMPLEX ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DCOMPLEX");
}
return;
diff --git a/src/library/blas/gens/rotm_reg.cpp b/src/library/blas/gens/rotm_reg.cpp
index 2b044192..2b87507e 100644
--- a/src/library/blas/gens/rotm_reg.cpp
+++ b/src/library/blas/gens/rotm_reg.cpp
@@ -121,17 +121,17 @@ setBuildOpts(
const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
{
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
}
if(kargs->pigFuncID == CLBLAS_ROT)
{
- strcat( buildOptStr, " -DDO_ROT ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDO_ROT");
}
if( (kargs->ldb.vector) != 1) {
- strcat( buildOptStr, " -DINCX_NONUNITY ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
}
if( (kargs->ldc.vector) != 1) {
- strcat( buildOptStr, " -DINCY_NONUNITY ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY");
}
return;
diff --git a/src/library/blas/gens/rotmg_reg.cpp b/src/library/blas/gens/rotmg_reg.cpp
index b256ac6f..7c333c6f 100644
--- a/src/library/blas/gens/rotmg_reg.cpp
+++ b/src/library/blas/gens/rotmg_reg.cpp
@@ -97,7 +97,7 @@ setBuildOpts(
const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
{
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
}
return;
diff --git a/src/library/blas/gens/scal_reg.cpp b/src/library/blas/gens/scal_reg.cpp
index d82362b1..8b853106 100644
--- a/src/library/blas/gens/scal_reg.cpp
+++ b/src/library/blas/gens/scal_reg.cpp
@@ -125,13 +125,13 @@ setBuildOpts(
const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
{
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
#ifdef DEBUG_SCAL
printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
#endif
}
if( (kargs->ldb.vector) != 1) {
- strcat( buildOptStr, " -DINCX_NONUNITY ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
}
return;
diff --git a/src/library/blas/gens/swap_reg.cpp b/src/library/blas/gens/swap_reg.cpp
index 5b44cebe..b75e1004 100644
--- a/src/library/blas/gens/swap_reg.cpp
+++ b/src/library/blas/gens/swap_reg.cpp
@@ -125,16 +125,16 @@ setBuildOpts(
const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
{
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
#ifdef DEBUG_SWAP
printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
#endif
}
if( (kargs->ldb.vector) != 1) {
- strcat( buildOptStr, " -DINCX_NONUNITY ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCX_NONUNITY");
}
if( (kargs->ldc.vector) != 1) {
- strcat( buildOptStr, " -DINCY_NONUNITY ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DINCY_NONUNITY");
}
return;
diff --git a/src/library/blas/gens/symm_cached.cpp b/src/library/blas/gens/symm_cached.cpp
index cc8c0350..0d9ea8d3 100644
--- a/src/library/blas/gens/symm_cached.cpp
+++ b/src/library/blas/gens/symm_cached.cpp
@@ -99,7 +99,7 @@ setBuildOpts(
if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
{
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
#ifdef DEBUG_TRMV
printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
#endif
@@ -107,23 +107,23 @@ setBuildOpts(
if (kargs->side == clblasLeft)
{
- strcat(buildOptStr, " -D__SYMM_LEFT__ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LEFT__ ");
} else {
- strcat(buildOptStr, " -D__SYMM_RIGHT__ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_RIGHT__");
}
if (kargs->uplo == clblasUpper)
{
- strcat(buildOptStr, " -D__SYMM_UPPER__ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_UPPER__");
} else {
- strcat(buildOptStr, " -D__SYMM_LOWER__ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_LOWER__");
}
if (kargs->order == clblasColumnMajor)
{
- strcat(buildOptStr, " -D__SYMM_COLMAJOR__ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_COLMAJOR__");
} else {
- strcat(buildOptStr, " -D__SYMM_ROWMAJOR__ ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-D__SYMM_ROWMAJOR__");
}
strcat(buildOptStr, " -cl-mad-enable ");
@@ -193,10 +193,10 @@ generator(
printf("WARNING: SYMM- generator: subdim and blocksize in-compatible.\n");
}
- sprintf(width, "%d", Y);
- sprintf(itemy, "%lu", ITEMY);
- sprintf(itemx, "%lu", ITEMX);
- sprintf(itemy_by_width, "%lu", (size_t) ITEMY/kextra->vecLenA);
+ sprintf(width, "%" SPREFIX "u", Y);
+ sprintf(itemy, "%" SPREFIX "u", ITEMY);
+ sprintf(itemx, "%" SPREFIX "u", ITEMX);
+ sprintf(itemy_by_width, "%" SPREFIX "u", (size_t) ITEMY/kextra->vecLenA);
kobj.put("%WIDTH", width);
kobj.put("%ITEMX", itemx);
diff --git a/src/library/blas/gens/syr2_lds.cpp b/src/library/blas/gens/syr2_lds.cpp
index 9fccb059..f5c20cb1 100644
--- a/src/library/blas/gens/syr2_lds.cpp
+++ b/src/library/blas/gens/syr2_lds.cpp
@@ -139,14 +139,14 @@ setBuildOpts(
const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
if ( kargs->dtype == TYPE_DOUBLE )
{
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
#ifdef DEBUG_SYR2
printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
#endif
}
if( kargs->pigFuncID == CLBLAS_SPR2 )
{
- strcat( buildOptStr, " -DPACKED ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
}
return;
@@ -308,7 +308,7 @@ generator(
}
kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD);
- sprintf( targetRows, "%d", TARGETROWS );
+ sprintf( targetRows, "%" SPREFIX "u", TARGETROWS );
sprintf( blockSize, "%d", BLOCKSIZE );
#ifdef DEBUG_SYR2
diff --git a/src/library/blas/gens/syr_lds.cpp b/src/library/blas/gens/syr_lds.cpp
index 0a12ef4e..16911bb4 100644
--- a/src/library/blas/gens/syr_lds.cpp
+++ b/src/library/blas/gens/syr_lds.cpp
@@ -142,14 +142,14 @@ setBuildOpts(
const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
if ( kargs->dtype == TYPE_DOUBLE )
{
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
#ifdef DEBUG_SYR
printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
#endif
}
if( kargs->pigFuncID == CLBLAS_SPR )
{
- strcat( buildOptStr, " -DPACKED ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
}
return;
@@ -308,7 +308,7 @@ generator(
}
kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD, doVLOAD);
- sprintf( targetRows, "%d", TARGETROWS );
+ sprintf( targetRows, "%" SPREFIX "u", TARGETROWS );
sprintf( blockSize, "%d", BLOCKSIZE );
#ifdef DEBUG_SYR
diff --git a/src/library/blas/gens/syrxk.c b/src/library/blas/gens/syrxk.c
index a0f6a295..54574ed2 100644
--- a/src/library/blas/gens/syrxk.c
+++ b/src/library/blas/gens/syrxk.c
@@ -21,6 +21,7 @@
#include
#include
+#include
#include
#include
@@ -1219,10 +1220,11 @@ genUpdateGenericDiagTile(
// type of the vectorized coordinates
Kstring vctype;
Kstring constOffs, constShifts, constMasks;
- unsigned int i, j, nops;
+ unsigned int i, j, nops,size;
unsigned int maxFetches = 0;
const char *yname, *xname;
const char *ldcName;
+ char hexadec[2];
batch = createStmtBatch();
if (batch == NULL) {
@@ -1253,6 +1255,14 @@ genUpdateGenericDiagTile(
tifl = (isUpper) ? TILE_ITER_BACKWARD_ROWS :
TILE_ITER_BACKWARD_COLS;
iterInit(&iter, &tileTempC, 1, tifl);
+ nops = 0;
+ while (!iterIsEnd(&iter)) {
+ nops++;
+ size = nops / nrCols;
+ iterIterate(&iter);
+ }
+
+ iterInit(&iter, &tileTempC, 1, tifl);
initTmpResTile(&tileTempC, gset, true);
@@ -1316,7 +1326,7 @@ genUpdateGenericDiagTile(
maxFetches = umin(maxFetches, i);
// declare vectorized coordinates
- declareDiagUpresIndexedVars(ctx, vctype.buf, "cc", tempRows);
+ declareDiagUpresIndexedVars(ctx, vctype.buf, "cc", size);
/*
* real y coordinate, offset mask and
@@ -1326,8 +1336,8 @@ genUpdateGenericDiagTile(
"unsigned int mask;\n"
"int hit;\n");
if (withBeta) {
- declareDiagUpresIndexedVars(ctx, typeName, "alphaNew", tempRows);
- declareDiagUpresIndexedVars(ctx, typeName, "betaNew", tempRows);
+ declareDiagUpresIndexedVars(ctx, typeName, "alphaNew", size);
+ declareDiagUpresIndexedVars(ctx, typeName, "betaNew", size);
}
// declare tile
@@ -1401,7 +1411,7 @@ genUpdateGenericDiagTile(
"cc%u = ((%s)mask &\n"
" %s) >>\n"
" %s;\n"
- "cc%u = %u - mad24(cc%u, %s, 0);\n",
+ "cc%u = %u - mad24(cc%u, %s, 0u);\n",
iter.row,
(1 << (nrCols - 1)),
@@ -1416,7 +1426,7 @@ genUpdateGenericDiagTile(
"cc%u = ((%s)mask &\n"
" %s) >>\n"
" %s;\n"
- "cc%u = mad24(cc%u, %s, 0);\n",
+ "cc%u = mad24(cc%u, %s, 0u);\n",
nrRows - 1, iter.row,
i, vctype.buf, constMasks.buf, constShifts.buf,
@@ -1443,7 +1453,9 @@ genUpdateGenericDiagTile(
ksprintf(&kstr, "cc%u", i);
}
else {
- ksprintf(&kstr, "cc%u.s%u", i, iter.col);
+ snprintf(hexadec, sizeof(char)*2, "%x", iter.col);
+ //itoa(iter.col, hexadec, 16);
+ ksprintf(&kstr, "cc%u.s%s", i, hexadec);
}
// prepare multipliers and fetch
diff --git a/src/library/blas/gens/tests/CMakeLists.txt b/src/library/blas/gens/tests/CMakeLists.txt
index f945b1eb..6d10e3fe 100644
--- a/src/library/blas/gens/tests/CMakeLists.txt
+++ b/src/library/blas/gens/tests/CMakeLists.txt
@@ -42,19 +42,11 @@ include_directories(${OPENCL_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_
add_executable(t_tilemul ${SRC_TILEMUL})
target_link_libraries(t_tilemul ${OPENCL_LIBRARIES})
+set_target_properties( t_tilemul PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
-if( TARGET_PLATFORM EQUAL 64 )
- # CPack configuration; include the executable into the package
- install( TARGETS t_tilemul
- RUNTIME DESTINATION bin64
- LIBRARY DESTINATION lib64
- ARCHIVE DESTINATION lib64/import
- )
-else()
- # CPack configuration; include the executable into the package
- install( TARGETS t_tilemul
- RUNTIME DESTINATION bin32
- LIBRARY DESTINATION lib32
- ARCHIVE DESTINATION lib32/import
- )
-endif()
+# CPack configuration; include the executable into the package
+install( TARGETS t_tilemul
+ RUNTIME DESTINATION bin${SUFFIX_BIN}
+ LIBRARY DESTINATION lib${SUFFIX_LIB}
+ ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+ )
diff --git a/src/library/blas/gens/tests/t_tilemul.c b/src/library/blas/gens/tests/t_tilemul.c
index ba4b49c9..4b4dd803 100644
--- a/src/library/blas/gens/tests/t_tilemul.c
+++ b/src/library/blas/gens/tests/t_tilemul.c
@@ -14,8 +14,11 @@
* limitations under the License.
* ************************************************************************/
-
+#ifdef __APPLE__
+#include
+#else
#include
+#endif
#include
#include
#include
diff --git a/src/library/blas/gens/trmv_reg.cpp b/src/library/blas/gens/trmv_reg.cpp
index 28ee1f26..9cacd0f1 100644
--- a/src/library/blas/gens/trmv_reg.cpp
+++ b/src/library/blas/gens/trmv_reg.cpp
@@ -136,28 +136,28 @@ setBuildOpts(
const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
{
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
#ifdef DEBUG_TRMV
printf("Setting build options ... Double... for DOUBLE PRECISION support\n");
#endif
}
if( (step->funcID == CLBLAS_HEMV) || (kargs->pigFuncID == CLBLAS_HPMV) || (kargs->pigFuncID == CLBLAS_SPMV) )
{
- strcat( buildOptStr, " -DHEMV_ONLY ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHEMV_ONLY");
/*
if(kargs->diag == clblasUnit)
{
- strcat( buildOptStr, " -DHEMV_ZERO_DIAG ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DHEMV_ZERO_DIAG");
}
*/
}
if ( kargs->pigFuncID == CLBLAS_SPMV )
{
- strcat( buildOptStr, " -DSPMV_ONLY ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DSPMV_ONLY");
}
if( (kargs->pigFuncID == CLBLAS_TPMV) || (kargs->pigFuncID == CLBLAS_HPMV) || (kargs->pigFuncID == CLBLAS_SPMV) )
{
- strcat( buildOptStr, " -DPACKED ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
}
return;
@@ -381,8 +381,8 @@ generator(
}
kprintf kobj( Prefix[extraFlags->dtype], vecLenA, doVLOAD);
- sprintf( targetRows, "%d", TARGETROWS );
- sprintf( blockSize, "%d", BLOCKSIZE );
+ sprintf( targetRows, "%" SPREFIX "u", TARGETROWS );
+ sprintf( blockSize, "%" SPREFIX "u", BLOCKSIZE );
#ifdef DEBUG_TRMV
printf("TARGET ROWS = %s\n", targetRows);
diff --git a/src/library/blas/gens/trsv_gemv.cpp b/src/library/blas/gens/trsv_gemv.cpp
index 49d5371b..ca73fbe5 100644
--- a/src/library/blas/gens/trsv_gemv.cpp
+++ b/src/library/blas/gens/trsv_gemv.cpp
@@ -128,14 +128,14 @@ setBuildOpts(
const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
{
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
#ifdef DEBUG_TRSV_GEMV
printf("TRSV GEMV: Setting build options ... Double... for DOUBLE PRECISION support\n");
#endif
}
if( kargs->pigFuncID == CLBLAS_TPSV)
{
- strcat( buildOptStr, " -DPACKED ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
#ifdef DEBUG_TRSV_GEMV
printf("TPSV GEMV: Setting build options ... PACKED\n");
#endif
@@ -415,9 +415,9 @@ generator(
{
return 0;
}
- sprintf( TARGETHEIGHT_S, "%d", TARGETHEIGHT );
+ sprintf( TARGETHEIGHT_S, "%" SPREFIX "u", TARGETHEIGHT );
sprintf( BLOCKSIZE_S, "%d", BLOCKSIZE );
- sprintf( TRIANGLE_HEIGHT_S, "%d", subdims->y );
+ sprintf( TRIANGLE_HEIGHT_S, "%" SPREFIX "u", subdims->y );
kobj.put("%TARGET_HEIGHT", TARGETHEIGHT_S);
kobj.put("%BLOCKSIZE", BLOCKSIZE_S);
@@ -433,9 +433,9 @@ generator(
{
return 0;
}
- sprintf( TARGETROWS_S, "%d", TARGETROWS );
- sprintf( TARGETWIDTH_S, "%d", TARGETWIDTH );
- sprintf( NLOOPS_S, "%d", NLOOPS );
+ sprintf( TARGETROWS_S, "%" SPREFIX "u", TARGETROWS );
+ sprintf( TARGETWIDTH_S, "%" SPREFIX "u", TARGETWIDTH );
+ sprintf( NLOOPS_S, "%" SPREFIX "u", NLOOPS );
kobj.put("%TARGET_ROWS", TARGETROWS_S);
kobj.put("%TARGET_WIDTH", TARGETWIDTH_S);
kobj.put("%NLOOPS", NLOOPS_S);
diff --git a/src/library/blas/gens/trsv_trtri.cpp b/src/library/blas/gens/trsv_trtri.cpp
index 071565ff..0bae0f99 100644
--- a/src/library/blas/gens/trsv_trtri.cpp
+++ b/src/library/blas/gens/trsv_trtri.cpp
@@ -128,21 +128,21 @@ setBuildOpts(
const CLBlasKargs *kargs = (const CLBlasKargs *)(&step->args);
if ( kargs->dtype == TYPE_DOUBLE || kargs->dtype == TYPE_COMPLEX_DOUBLE)
{
- strcat( buildOptStr, " -DDOUBLE_PRECISION ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DDOUBLE_PRECISION");
#ifdef DEBUG_TRSV_TRTRI
printf("TRSV TRTRI: Setting build options ... Double... for DOUBLE PRECISION support\n");
#endif
}
if( kargs->pigFuncID == CLBLAS_TPSV)
{
- strcat( buildOptStr, " -DPACKED ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DPACKED");
#ifdef DEBUG_TRSV_TRTRI
printf("TPSV TRTRI: Setting build options ... PACKED\n");
#endif
}
if( kargs->pigFuncID == CLBLAS_TBSV)
{
- strcat( buildOptStr, " -DBANDED ");
+ addBuildOpt( buildOptStr, BUILD_OPTS_MAXLEN, "-DBANDED");
#ifdef DEBUG_TRSV_TRTRI
printf("TBSV TRTRI: Setting build options .. BANDED\n");
#endif
diff --git a/src/library/blas/include/clblas-internal.h b/src/library/blas/include/clblas-internal.h
index 81ab5127..7a9afcdc 100644
--- a/src/library/blas/include/clblas-internal.h
+++ b/src/library/blas/include/clblas-internal.h
@@ -240,6 +240,11 @@ setupBuildOpts(
cl_device_id devID,
MemoryPattern *mempat);
+void addBuildOpt(
+ char * opts,
+ size_t len,
+ const char * option);
+
// Internal scatter image API
int
diff --git a/src/library/blas/init.c b/src/library/blas/init.c
index 5095cb0f..2b257a8e 100644
--- a/src/library/blas/init.c
+++ b/src/library/blas/init.c
@@ -18,7 +18,7 @@
#include
#include
#include
-#include
+#include
#include
#include "clblas-internal.h"
diff --git a/src/library/blas/xaxpy.c b/src/library/blas/xaxpy.c
index 7499c414..d57b4c23 100644
--- a/src/library/blas/xaxpy.c
+++ b/src/library/blas/xaxpy.c
@@ -60,11 +60,11 @@ doAxpy(
// Check wheather enough memory was allocated
- if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
printf("Invalid Size for X\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
printf("Invalid Size for Y\n");
return retCode;
}
diff --git a/src/library/blas/xcopy.c b/src/library/blas/xcopy.c
index e0ea2a03..8e375976 100644
--- a/src/library/blas/xcopy.c
+++ b/src/library/blas/xcopy.c
@@ -60,11 +60,11 @@ doCopy(
// Check wheather enough memory was allocated
- if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
printf("Invalid Size for X\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
printf("Invalid Size for Y\n");
return retCode;
}
diff --git a/src/library/blas/xdot.c b/src/library/blas/xdot.c
index f29cdb6f..67bf4cd2 100644
--- a/src/library/blas/xdot.c
+++ b/src/library/blas/xdot.c
@@ -67,20 +67,20 @@ doDot(
// Check wheather enough memory was allocated
- if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
printf("Invalid Size for X\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
printf("Invalid Size for Y\n");
return retCode;
}
// Minimum size of scratchBuff is N
- if (retCode = checkVectorSizes(kargs->dtype, N, scratchBuff, 0, 1, X_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, scratchBuff, 0, 1, X_VEC_ERRSET))) {
printf("Insufficient ScratchBuff\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, 1, dotProduct, offDP, 1, Y_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, 1, dotProduct, offDP, 1, Y_VEC_ERRSET))) {
printf("Invalid Size for dotProduct\n");
return retCode;
}
diff --git a/src/library/blas/xgemm2.c b/src/library/blas/xgemm2.c
index 0a5ae436..2bba00ae 100644
--- a/src/library/blas/xgemm2.c
+++ b/src/library/blas/xgemm2.c
@@ -209,18 +209,18 @@ doGemm(
/* Validate arguments */
- if (retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET)) {
+ if ((retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET))) {
return retCode;
}
if (K != 0) {
- if (retCode = checkMatrixSizes(kargs->dtype, order, transA, M, K, A, offA, lda, A_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, M, K, A, offA, lda, A_MAT_ERRSET))) {
return retCode;
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, transB, K, N, B, offB, ldb, B_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, transB, K, N, B, offB, ldb, B_MAT_ERRSET))) {
return retCode;
}
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, C, offC, ldc, C_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, C, offC, ldc, C_MAT_ERRSET))) {
return retCode;
}
diff --git a/src/library/blas/xger.c b/src/library/blas/xger.c
index 92d4b311..c9e9e1c9 100644
--- a/src/library/blas/xger.c
+++ b/src/library/blas/xger.c
@@ -58,23 +58,23 @@ doGer(
/* Validate arguments */
- if (retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET )) {
+ if ((retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
printf("Invalid mem object..\n");
return retCode;
}
// Check wheather enough memory was allocated
- if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, A, offa, lda, A_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, A, offa, lda, A_MAT_ERRSET))) {
printf("Invalid Size for A %d\n",retCode );
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, M, X, offx, incx, X_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, M, X, offx, incx, X_VEC_ERRSET))) {
printf("Invalid Size for X\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
printf("Invalid Size for Y\n");
return retCode;
}
diff --git a/src/library/blas/xhemv.c b/src/library/blas/xhemv.c
index 0db6a8f9..21011dd7 100644
--- a/src/library/blas/xhemv.c
+++ b/src/library/blas/xhemv.c
@@ -54,17 +54,17 @@ doHemv(
/* Validate arguments */
- if (retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET )) {
+ if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
return retCode;
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A,
- offA, lda, A_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N,
+ A, offA, lda, A_MAT_ERRSET))) {
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) {
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET))) {
return retCode;
}
if ((commandQueues == NULL) || (numCommandQueues == 0))
diff --git a/src/library/blas/xher.c b/src/library/blas/xher.c
index af36962b..7131057c 100644
--- a/src/library/blas/xher.c
+++ b/src/library/blas/xher.c
@@ -56,16 +56,16 @@ doher(
/* Validate arguments */
- if (retCode = checkMemObjects(A, X, 0, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET )) {
+ if ((retCode = checkMemObjects(A, X, 0, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET))) {
printf("Invalid mem object..\n");
return retCode;
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET))) {
printf("Invalid Size for A\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET)) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
printf("Invalid Size for X\n");
return retCode;
}
diff --git a/src/library/blas/xher2.c b/src/library/blas/xher2.c
index cb676592..21a8ddcf 100644
--- a/src/library/blas/xher2.c
+++ b/src/library/blas/xher2.c
@@ -59,21 +59,21 @@ doHer2(
/* Validate arguments */
- if (retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET )) {
+ if ((retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
printf("Invalid mem object..\n");
return retCode;
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET))) {
printf("Invalid Size for A\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
printf("Invalid Size for X\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
printf("Invalid Size for Y\n");
return retCode;
}
diff --git a/src/library/blas/xher2k.c b/src/library/blas/xher2k.c
index 302a648b..4c3d2f2a 100644
--- a/src/library/blas/xher2k.c
+++ b/src/library/blas/xher2k.c
@@ -71,7 +71,7 @@ doHer2k(
}
// Validate arguments
- if (retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET )) {
+ if ((retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET))) {
return retCode;
}
@@ -79,15 +79,15 @@ doHer2k(
return clblasInvalidValue;
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offa, lda, A_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offa, lda, A_MAT_ERRSET))) {
return retCode;
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, B, offb, ldb, B_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, B, offb, ldb, B_MAT_ERRSET))) {
return retCode;
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, C, offc, ldc, C_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, C, offc, ldc, C_MAT_ERRSET))) {
return retCode;
}
diff --git a/src/library/blas/xherk.c b/src/library/blas/xherk.c
index 18d1fb4d..b4f409d7 100644
--- a/src/library/blas/xherk.c
+++ b/src/library/blas/xherk.c
@@ -64,7 +64,7 @@ doHerk(
}
// Validate arguments
- if (retCode = checkMemObjects(A, C, NULL, false, A_MAT_ERRSET, C_MAT_ERRSET, END_ERRSET )) {
+ if ((retCode = checkMemObjects(A, C, NULL, false, A_MAT_ERRSET, C_MAT_ERRSET, END_ERRSET))) {
return retCode;
}
@@ -72,11 +72,11 @@ doHerk(
return clblasInvalidValue;
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offA, lda, A_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offA, lda, A_MAT_ERRSET))) {
return retCode;
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, false, N, N, C, offC, ldc, C_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, false, N, N, C, offC, ldc, C_MAT_ERRSET))) {
return retCode;
}
diff --git a/src/library/blas/xhpmv.c b/src/library/blas/xhpmv.c
index 991819c4..1f0fe67b 100644
--- a/src/library/blas/xhpmv.c
+++ b/src/library/blas/xhpmv.c
@@ -53,17 +53,17 @@ doHpmv(
/* Validate arguments */
- if (retCode = checkMemObjects(AP, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET)) {
+ if ((retCode = checkMemObjects(AP, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
return retCode;
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, AP,
- offa, 0, A_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N,
+ AP, offa, 0, A_MAT_ERRSET))) {
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
return retCode;
}
if ((commandQueues == NULL) || (numCommandQueues == 0))
@@ -182,4 +182,4 @@ clblasZhpmv(
return doHpmv(&kargs, order, uplo, N, AP, offa, X, offx, incx,
Y, offy, incy, numCommandQueues, commandQueues,
numEventsInWaitList, eventWaitList, events);
-}
\ No newline at end of file
+}
diff --git a/src/library/blas/xrot.c b/src/library/blas/xrot.c
index 7fd981bc..d07ec87d 100644
--- a/src/library/blas/xrot.c
+++ b/src/library/blas/xrot.c
@@ -58,11 +58,11 @@ doRot(
// Check wheather enough memory was allocated
- if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
printf("Invalid Size for X\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
printf("Invalid Size for Y\n");
return retCode;
}
diff --git a/src/library/blas/xrotg.c b/src/library/blas/xrotg.c
index fb9c8e1b..e4971480 100644
--- a/src/library/blas/xrotg.c
+++ b/src/library/blas/xrotg.c
@@ -14,10 +14,6 @@
* limitations under the License.
* ************************************************************************/
-/***********************************************************************
-** Rotgright (C) 2011 Advanced Micro Devices, Inc. All Rights Reserved.
-***********************************************************************/
-
//#define DEBUG_ROTG
#include
@@ -73,21 +69,21 @@ doRotg(
// Check wheather enough memory was allocated
- if (retCode = checkVectorSizes(kargs->dtype, 1, A, offA, 1, X_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, 1, A, offA, 1, X_VEC_ERRSET))) {
printf("Invalid Size for A\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, 1, B, offB, 1, Y_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, 1, B, offB, 1, Y_VEC_ERRSET))) {
printf("Invalid Size for B\n");
return retCode;
}
- if (retCode = checkVectorSizes(cType, 1, C, offC, 1, X_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(cType, 1, C, offC, 1, X_VEC_ERRSET))) {
printf("Invalid Size for C\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, 1, S, offS, 1, Y_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, 1, S, offS, 1, Y_VEC_ERRSET))) {
printf("Invalid Size for S\n");
return retCode;
}
diff --git a/src/library/blas/xrotm.c b/src/library/blas/xrotm.c
index fcdfcb08..4130cf5d 100644
--- a/src/library/blas/xrotm.c
+++ b/src/library/blas/xrotm.c
@@ -60,15 +60,15 @@ doRotm(
// Check wheather enough memory was allocated
- if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
printf("Invalid Size for X\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
printf("Invalid Size for Y\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, 5, param, offParam, 1, Y_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, 5, param, offParam, 1, Y_VEC_ERRSET))) {
printf("Invalid Size for PARAM\n"); // PARAM is of minimum length 5
return retCode;
}
diff --git a/src/library/blas/xrotmg.c b/src/library/blas/xrotmg.c
index b3c22298..e6e48b6d 100644
--- a/src/library/blas/xrotmg.c
+++ b/src/library/blas/xrotmg.c
@@ -14,10 +14,6 @@
* limitations under the License.
* ************************************************************************/
-/***********************************************************************
-** Rotgright (C) 2011 Advanced Micro Devices, Inc. All Rights Reserved.
-***********************************************************************/
-
#include
#include
#include
@@ -69,23 +65,23 @@ doRotmg(
// Check wheather enough memory was allocated
- if (retCode = checkVectorSizes(kargs->dtype, 1, D1, offD1, 1, X_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, 1, D1, offD1, 1, X_VEC_ERRSET))) {
printf("Invalid Size for D1\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, 1, D2, offD2, 1, Y_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, 1, D2, offD2, 1, Y_VEC_ERRSET))) {
printf("Invalid Size for D2\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, 1, X1, offX1, 1, X_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, 1, X1, offX1, 1, X_VEC_ERRSET))) {
printf("Invalid Size for X1\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, 1, Y1, offY1, 1, Y_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, 1, Y1, offY1, 1, Y_VEC_ERRSET))) {
printf("Invalid Size for Y1\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, 1, param, offParam, 1, Y_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, 1, param, offParam, 1, Y_VEC_ERRSET))) {
printf("Invalid Size for PARAM\n");
return retCode;
}
diff --git a/src/library/blas/xscal.c b/src/library/blas/xscal.c
index 6722383a..b2620310 100644
--- a/src/library/blas/xscal.c
+++ b/src/library/blas/xscal.c
@@ -57,7 +57,7 @@ doScal(
// Check wheather enough memory was allocated
- if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
printf("Invalid Size for X\n");
return retCode;
}
diff --git a/src/library/blas/xshbmv.c b/src/library/blas/xshbmv.c
index e0a5087a..94f733da 100644
--- a/src/library/blas/xshbmv.c
+++ b/src/library/blas/xshbmv.c
@@ -68,19 +68,19 @@ doSHbmv(
}
/* Validate arguments */
- if (retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET ))
+ if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET)))
{
return retCode;
}
- if (retCode = checkBandedMatrixSizes(kargs->dtype, order, clblasNoTrans,
- N, N, K, 0, A, offa, lda, A_MAT_ERRSET )) {
+ if ((retCode = checkBandedMatrixSizes(kargs->dtype, order, clblasNoTrans,
+ N, N, K, 0, A, offa, lda, A_MAT_ERRSET))) {
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) {
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET))) {
return retCode;
}
diff --git a/src/library/blas/xspmv.c b/src/library/blas/xspmv.c
index d522ba84..b40e0269 100644
--- a/src/library/blas/xspmv.c
+++ b/src/library/blas/xspmv.c
@@ -53,17 +53,17 @@ doSpmv(
/* Validate arguments */
- if (retCode = checkMemObjects(AP, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET )) {
+ if ((retCode = checkMemObjects(AP, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
return retCode;
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, AP,
- offa, 0, A_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N,
+ AP, offa, 0, A_MAT_ERRSET))) {
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
return retCode;
}
if ((commandQueues == NULL) || (numCommandQueues == 0))
@@ -184,4 +184,4 @@ clblasDspmv(
return doSpmv(&kargs, order, uplo, N, AP, offa, X, offx, incx,
Y, offy, incy, numCommandQueues, commandQueues,
numEventsInWaitList, eventWaitList, events);
-}
\ No newline at end of file
+}
diff --git a/src/library/blas/xswap.c b/src/library/blas/xswap.c
index 38066186..1d83a5b2 100644
--- a/src/library/blas/xswap.c
+++ b/src/library/blas/xswap.c
@@ -60,11 +60,11 @@ doSwap(
// Check wheather enough memory was allocated
- if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
printf("Invalid Size for X\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
printf("Invalid Size for Y\n");
return retCode;
}
diff --git a/src/library/blas/xsymm.c b/src/library/blas/xsymm.c
index e61a33f6..5c87fc6e 100644
--- a/src/library/blas/xsymm.c
+++ b/src/library/blas/xsymm.c
@@ -50,31 +50,31 @@ doSymm( CLBlasKargs *kargs, clblasOrder order, clblasUplo uplo, clblasSide side,
/* Validate arguments */
- if (retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET )) {
+ if ((retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET))) {
printf("SYMM:- Invalid mem object..\n");
return retCode;
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, B, offb, ldb, B_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, B, offb, ldb, B_MAT_ERRSET))) {
printf("Invalid Size for B\n");
return retCode;
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, C, offc, ldc, C_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, C, offc, ldc, C_MAT_ERRSET))) {
printf("Invalid Size for C\n");
return retCode;
}
if (side == clblasLeft)
{
// MxM x MxN
- if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, M, A, offa, lda, A_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, M, A, offa, lda, A_MAT_ERRSET))) {
printf("Invalid Size for A\n");
return retCode;
}
} else {
// MxN x NxN
- if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET))) {
printf("Invalid Size for A\n");
return retCode;
}
diff --git a/src/library/blas/xsymv.c b/src/library/blas/xsymv.c
index 55b23e85..790e8720 100644
--- a/src/library/blas/xsymv.c
+++ b/src/library/blas/xsymv.c
@@ -60,17 +60,17 @@ doSymv(
/* Validate arguments */
- if (retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET)) {
+ if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
return retCode;
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A,
- offA, lda, A_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N,
+ A, offA, lda, A_MAT_ERRSET ))) {
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET ))) {
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, y, offy, incy, Y_VEC_ERRSET ))) {
return retCode;
}
diff --git a/src/library/blas/xsyr.c b/src/library/blas/xsyr.c
index d2d1ae7c..9358920f 100644
--- a/src/library/blas/xsyr.c
+++ b/src/library/blas/xsyr.c
@@ -55,7 +55,7 @@ doSyr(
/* Validate arguments */
- if (retCode = checkMemObjects(A, X, 0, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET )) {
+ if ((retCode = checkMemObjects(A, X, 0, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET))) {
printf("Invalid mem object..\n");
return retCode;
}
@@ -65,11 +65,11 @@ doSyr(
* checkMatrixSizes() does not account of "offa" argument.
* Need to be added.
*/
- if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET))) {
printf("Invalid Size for A\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
printf("Invalid Size for X\n");
return retCode;
}
diff --git a/src/library/blas/xsyr2.c b/src/library/blas/xsyr2.c
index 2f0a1856..fddcfbd2 100644
--- a/src/library/blas/xsyr2.c
+++ b/src/library/blas/xsyr2.c
@@ -58,21 +58,21 @@ doSyr2(
/* Validate arguments */
- if (retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET )) {
+ if ((retCode = checkMemObjects(A, X, Y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
printf("Invalid mem object..\n");
return retCode;
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, N, N, A, offa, lda, A_MAT_ERRSET ))) {
printf("Invalid Size for A\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, X, offx, incx, X_VEC_ERRSET))) {
printf("Invalid Size for X\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, Y, offy, incy, Y_VEC_ERRSET))) {
printf("Invalid Size for Y\n");
return retCode;
}
diff --git a/src/library/blas/xsyr2k.c b/src/library/blas/xsyr2k.c
index e99a617b..25ed438c 100644
--- a/src/library/blas/xsyr2k.c
+++ b/src/library/blas/xsyr2k.c
@@ -58,7 +58,7 @@ doSyr2k(
}
// Validate arguments
- if (retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET )) {
+ if ((retCode = checkMemObjects(A, B, C, true, A_MAT_ERRSET, B_MAT_ERRSET, C_MAT_ERRSET))) {
return retCode;
}
@@ -66,13 +66,13 @@ doSyr2k(
return clblasInvalidValue;
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, transAB, N, K, A, offA, lda, A_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, transAB, N, K, A, offA, lda, A_MAT_ERRSET))) {
return retCode;
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, transAB, N, K, B, offB, ldb, B_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, transAB, N, K, B, offB, ldb, B_MAT_ERRSET))) {
return retCode;
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, false, N, N, C, offC, ldc, C_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, false, N, N, C, offC, ldc, C_MAT_ERRSET))) {
return retCode;
}
diff --git a/src/library/blas/xsyrk.c b/src/library/blas/xsyrk.c
index 4157d5e8..2582830e 100644
--- a/src/library/blas/xsyrk.c
+++ b/src/library/blas/xsyrk.c
@@ -55,7 +55,7 @@ doSyrk(
}
// Validate arguments
- if (retCode = checkMemObjects(A, C, NULL, false, A_MAT_ERRSET, C_MAT_ERRSET, END_ERRSET)) {
+ if ((retCode = checkMemObjects(A, C, NULL, false, A_MAT_ERRSET, C_MAT_ERRSET, END_ERRSET))) {
return retCode;
}
@@ -63,10 +63,10 @@ doSyrk(
return clblasInvalidValue;
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offA, lda, A_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, N, K, A, offA, lda, A_MAT_ERRSET))) {
return retCode;
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, false, N, N, C, offC, ldc, C_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, false, N, N, C, offC, ldc, C_MAT_ERRSET))) {
return retCode;
}
diff --git a/src/library/blas/xtbmv.c b/src/library/blas/xtbmv.c
index 8f59bc99..b3b0d3b7 100644
--- a/src/library/blas/xtbmv.c
+++ b/src/library/blas/xtbmv.c
@@ -59,20 +59,20 @@ doTbmv(
/* Validate arguments */
- if (retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET )) {
+ if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
printf("Invalid mem object..\n");
return retCode;
}
- if (retCode = checkBandedMatrixSizes(kargs->dtype, order, trans, N, N, K, 0, A, offa, lda, A_MAT_ERRSET)) {
+ if ((retCode = checkBandedMatrixSizes(kargs->dtype, order, trans, N, N, K, 0, A, offa, lda, A_MAT_ERRSET))) {
printf("Invalid Size for A\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET)) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) {
printf("Invalid Size for X\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, y, 0, incx, Y_VEC_ERRSET)) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, y, 0, incx, Y_VEC_ERRSET))) {
printf("Invalid Size for scratch vector\n");
return retCode;
}
diff --git a/src/library/blas/xtrmm.c b/src/library/blas/xtrmm.c
index b7611dae..8aff2079 100644
--- a/src/library/blas/xtrmm.c
+++ b/src/library/blas/xtrmm.c
@@ -55,16 +55,16 @@ doTrmm(
/* Validate arguments */
- if (retCode = checkMemObjects(A, B, NULL, false, A_MAT_ERRSET, B_MAT_ERRSET, END_ERRSET )) {
+ if ((retCode = checkMemObjects(A, B, NULL, false, A_MAT_ERRSET, B_MAT_ERRSET, END_ERRSET))) {
return retCode;
}
msize = (side == clblasLeft) ? M : N;
- if (retCode = checkMatrixSizes(kargs->dtype, order, transA, msize, msize, A,
- offA, lda, A_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, msize, msize,
+ A, offA, lda, A_MAT_ERRSET ))) {
return retCode;
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, B,
- offB, ldb, B_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N,
+ B, offB, ldb, B_MAT_ERRSET ))) {
return retCode;
}
diff --git a/src/library/blas/xtrmv.c b/src/library/blas/xtrmv.c
index 2f4e2166..145c799f 100644
--- a/src/library/blas/xtrmv.c
+++ b/src/library/blas/xtrmv.c
@@ -57,20 +57,20 @@ doTrmv(
/* Validate arguments */
- if (retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET)) {
+ if ((retCode = checkMemObjects(A, x, y, true, A_MAT_ERRSET, X_VEC_ERRSET, Y_VEC_ERRSET))) {
printf("Invalid mem object..\n");
return retCode;
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, trans, N, N, A, offa, lda, A_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, trans, N, N, A, offa, lda, A_MAT_ERRSET))) {
printf("Invalid Size for A\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) {
printf("Invalid Size for X\n");
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, y, 0, incx, Y_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, y, 0, incx, Y_VEC_ERRSET))) {
printf("Invalid Size for scratch vector\n");
return retCode;
}
diff --git a/src/library/blas/xtrsm.c b/src/library/blas/xtrsm.c
index 9fb5b4af..d2fd7f09 100644
--- a/src/library/blas/xtrsm.c
+++ b/src/library/blas/xtrsm.c
@@ -55,17 +55,17 @@ doTrsm(
/* Validate arguments */
- if (retCode = checkMemObjects(A, B, NULL, false, A_MAT_ERRSET, B_MAT_ERRSET, END_ERRSET )) {
+ if ((retCode = checkMemObjects(A, B, NULL, false, A_MAT_ERRSET, B_MAT_ERRSET, END_ERRSET ))) {
return retCode;
}
msize = (side == clblasLeft) ? M : N;
- if (retCode = checkMatrixSizes(kargs->dtype, order, transA, msize, msize, A,
- offA, lda, A_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, transA, msize, msize,
+ A, offA, lda, A_MAT_ERRSET ))) {
return retCode;
}
- if (retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N, B,
- offB, ldb, B_MAT_ERRSET )) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, clblasNoTrans, M, N,
+ B, offB, ldb, B_MAT_ERRSET ))) {
return retCode;
}
diff --git a/src/library/blas/xtrsv.c b/src/library/blas/xtrsv.c
index c3342287..1e48349a 100644
--- a/src/library/blas/xtrsv.c
+++ b/src/library/blas/xtrsv.c
@@ -351,7 +351,7 @@ doTrsv(
/* Validate arguments */
- if (retCode = checkMemObjects(A, x, (cl_mem) NULL, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET)) {
+ if ((retCode = checkMemObjects(A, x, (cl_mem) NULL, false, A_MAT_ERRSET, X_VEC_ERRSET, END_ERRSET))) {
#ifdef DEBUG_TRSV
printf("Invalid mem object..\n");
#endif
@@ -363,13 +363,13 @@ doTrsv(
* checkMatrixSizes() does not account for "offa" argument.
* Need to pass "offa" when "checkMatrixSizes()" is changed.
*/
- if (retCode = checkMatrixSizes(kargs->dtype, order, trans, N, N, A, offa, lda, A_MAT_ERRSET)) {
+ if ((retCode = checkMatrixSizes(kargs->dtype, order, trans, N, N, A, offa, lda, A_MAT_ERRSET))) {
#ifdef DEBUG_TRSV
printf("Invalid Size for A\n");
#endif
return retCode;
}
- if (retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET )) {
+ if ((retCode = checkVectorSizes(kargs->dtype, N, x, offx, incx, X_VEC_ERRSET))) {
#ifdef DEBUG_TRSV
printf("Invalid Size for X\n");
#endif
diff --git a/src/library/clBLAS.pc.in b/src/library/clBLAS.pc.in
new file mode 100644
index 00000000..433ca635
--- /dev/null
+++ b/src/library/clBLAS.pc.in
@@ -0,0 +1,12 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}/bin@SUFFIX_BIN@
+includedir=${prefix}/include
+libdir=${prefix}/lib@SUFFIX_LIB@
+
+Name: clBLAS
+Description: Open source OpenCL BLAS library
+Version: @clBLAS_VERSION@
+URL: https://github.com/clMathLibraries/clBLAS
+
+Cflags: -I${includedir}
+Libs: -L${libdir} -lclBLAS
diff --git a/src/library/common/kern_cache.c b/src/library/common/kern_cache.c
index 787d139f..1006e482 100644
--- a/src/library/common/kern_cache.c
+++ b/src/library/common/kern_cache.c
@@ -425,7 +425,9 @@ fullKernelSize(Kernel *kern)
size += allSizes[i];
}
- clGetProgramInfo(kern->program, CL_PROGRAM_SOURCE, 0, NULL, &retSize);
+ if (!kern->noSource) {
+ clGetProgramInfo(kern->program, CL_PROGRAM_SOURCE, 0, NULL, &retSize);
+ }
return (size + retSize + sizeof(Kernel) + kern->extraSize);
}
diff --git a/src/library/common/tests/CMakeLists.txt b/src/library/common/tests/CMakeLists.txt
index 213e0bca..b1e34871 100644
--- a/src/library/common/tests/CMakeLists.txt
+++ b/src/library/common/tests/CMakeLists.txt
@@ -44,22 +44,15 @@ include_directories(${OPENCL_INCLUDE_DIRS} ${clBLAS_SOURCE_DIR} ${clBLAS_SOURCE_
add_executable(t_dblock_kgen ${SRC_DBLOCK_KGEN})
target_link_libraries(t_dblock_kgen ${OPENCL_LIBRARIES} ${MATH_LIBRARY})
+set_target_properties( t_dblock_kgen PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
add_executable(t_gens_cache ${SRC_GENS_CACHE})
target_link_libraries(t_gens_cache ${OPENCL_LIBRARIES} ${MATH_LIBRARY})
+set_target_properties( t_gens_cache PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
-if( TARGET_PLATFORM EQUAL 64 )
- # CPack configuration; include the executable into the package
- install( TARGETS t_dblock_kgen t_gens_cache
- RUNTIME DESTINATION bin64
- LIBRARY DESTINATION lib64
- ARCHIVE DESTINATION lib64/import
- )
-else()
- # CPack configuration; include the executable into the package
- install( TARGETS t_dblock_kgen t_gens_cache
- RUNTIME DESTINATION bin32
- LIBRARY DESTINATION lib32
- ARCHIVE DESTINATION lib32/import
- )
-endif()
+# CPack configuration; include the executable into the package
+install( TARGETS t_dblock_kgen t_gens_cache
+ RUNTIME DESTINATION bin${SUFFIX_BIN}
+ LIBRARY DESTINATION lib${SUFFIX_LIB}
+ ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+ )
diff --git a/src/library/common/tests/t_gens_cache.c b/src/library/common/tests/t_gens_cache.c
index 177a25b3..5a2b9823 100644
--- a/src/library/common/tests/t_gens_cache.c
+++ b/src/library/common/tests/t_gens_cache.c
@@ -23,7 +23,11 @@
#include
#include
+#ifdef __APPLE__
+#include
+#else
#include
+#endif
#include
#include
diff --git a/src/library/tools/ktest/CMakeLists.txt b/src/library/tools/ktest/CMakeLists.txt
index 34828f0e..2cc8c318 100644
--- a/src/library/tools/ktest/CMakeLists.txt
+++ b/src/library/tools/ktest/CMakeLists.txt
@@ -140,19 +140,11 @@ source_group(\\ FILES ${KTEST_SRC})
add_executable(make-ktest ${KTEST_SRC} ${KTEST_EXTERNAL_SRC})
add_dependencies(make-ktest GENERATE_CLT)
target_link_libraries(make-ktest ${OPENCL_LIBRARIES} ${Boost_LIBRARIES} ${MATH_LIBRARY})
+set_target_properties( make-ktest PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
-if( TARGET_PLATFORM EQUAL 64 )
- # CPack configuration; include the executable into the package
- install( TARGETS make-ktest
- RUNTIME DESTINATION bin64
- LIBRARY DESTINATION lib64
- ARCHIVE DESTINATION lib64/import
- )
-else()
- # CPack configuration; include the executable into the package
- install( TARGETS make-ktest
- RUNTIME DESTINATION bin32
- LIBRARY DESTINATION lib32
- ARCHIVE DESTINATION lib32/import
- )
-endif()
+# CPack configuration; include the executable into the package
+install( TARGETS make-ktest
+ RUNTIME DESTINATION bin${SUFFIX_BIN}
+ LIBRARY DESTINATION lib${SUFFIX_LIB}
+ ARCHIVE DESTINATION lib${SUFFIX_LIB}/import
+ )
diff --git a/src/library/tools/ktest/step.h b/src/library/tools/ktest/step.h
index 7148c726..0472e499 100644
--- a/src/library/tools/ktest/step.h
+++ b/src/library/tools/ktest/step.h
@@ -18,7 +18,11 @@
#ifndef KTEST_PATTERN_H__
#define KTEST_PATTERN_H__
+#ifdef __APPLE__
+#include
+#else
#include
+#endif
#include
#include