diff --git a/CMakeLists.txt b/CMakeLists.txt
index e67f30c..4fee9bb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,13 +1,90 @@
-cmake_minimum_required (VERSION 3.17 FATAL_ERROR)
+cmake_minimum_required (VERSION 3.21 FATAL_ERROR)
 
 #-------------------------------------------------------------------------------
-# Setup languages to use. Only enable CUDA if GPU's are in use.
+# Setup languages to use. Only enable HIP if GPUs are in use.
 #-------------------------------------------------------------------------------
 project (kronmult LANGUAGES CXX)
 
-option (USE_GPU "Use CUDA for gpu support" OFF)
+option (USE_GPU "Use HIP for gpu support" OFF)
+set(GPU_ARCH "70" CACHE STRING "GPU architecture code for AMD/NVIDIA")
 
 if (USE_GPU)
-  enable_language (CUDA)
+  set(KRONMULT_PLATFORM_AMD 0)
+  set(KRONMULT_PLATFORM_NVCC 0)
+
+  # search for HIP and its libraries
+  if(NOT DEFINED HIP_PATH)
+    if(NOT DEFINED ENV{HIP_PATH})
+      set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed")
+    else()
+      set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed")
+    endif()
+  endif()
+
+  # set HIP_CLANG_PATH for installs in non-standard locations (such as rocm with spack)
+  if (NOT DEFINED HIP_CLANG_PATH)
+    if(NOT DEFINED ENV{HIP_CLANG_PATH})
+      set(HIP_CLANG_PATH "${ROCM_PATH}/llvm/bin" CACHE PATH "Path to HIP clang binaries")
+    else()
+      set(HIP_CLANG_PATH $ENV{HIP_CLANG_PATH} CACHE PATH "Path to HIP clang binaries")
+    endif()
+  endif()
+
+  # note: could probably grab this path directly using hipconfig?
+  if (NOT DEFINED HIP_CLANG_INCLUDE_PATH)
+    if(NOT DEFINED ENV{HIP_CLANG_INCLUDE_PATH})
+      # probably need a better way to get the compiler version; this will produce non-existent paths for non-clang compilers
+      set(HIP_CLANG_INCLUDE_PATH "${HIP_CLANG_PATH}/../lib/clang/${CMAKE_CXX_COMPILER_VERSION}/include" CACHE PATH "Path to HIP clang include directory")
+    else()
+      set(HIP_CLANG_INCLUDE_PATH $ENV{HIP_CLANG_INCLUDE_PATH} CACHE PATH "Path to HIP clang include directory")
+    endif()
+  endif()
+
+  # try to find the hipconfig executable, which can help detect the platform and include dirs
+  find_program(HIPCONFIG_PATH hipconfig HINTS "${HIP_PATH}/bin")
+  if(HIPCONFIG_PATH)
+    execute_process(COMMAND ${HIPCONFIG_PATH} --platform OUTPUT_VARIABLE KRONMULT_HIP_PLATFORM)
+  elseif(DEFINED ENV{HIP_PLATFORM})
+    set(KRONMULT_HIP_PLATFORM "$ENV{HIP_PLATFORM}")
+  else()
+    message(FATAL_ERROR "Could not determine the HIP platform; make sure HIP_PLATFORM is set")
+  endif()
+
+  message(STATUS "HIP platform has been detected as ${KRONMULT_HIP_PLATFORM}")
+  # hip >= 4.2 uses "amd" to identify the AMD platform; hip <= 4.1 used "hcc"
+  if(KRONMULT_HIP_PLATFORM STREQUAL "hcc" OR KRONMULT_HIP_PLATFORM STREQUAL "amd")
+    set(KRONMULT_PLATFORM_AMD 1)
+  # hip <= 4.1 uses "nvcc" to identify NVIDIA platforms; hip >= 4.2 uses "nvidia"
+  elseif(KRONMULT_HIP_PLATFORM STREQUAL "nvcc" OR KRONMULT_HIP_PLATFORM STREQUAL "nvidia")
+    set(KRONMULT_PLATFORM_NVCC 1)
+  endif()
+
+  set(HIP_VERBOSE_BUILD ON CACHE STRING "Verbose compilation for HIP")
+
+  # look for the HIP cmake configs in their various locations
+  list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake" "${HIP_PATH}/lib/cmake" "${ROCM_PATH}")
+  list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/cmake" "${HIP_PATH}/lib/cmake" "${ROCM_PATH}")
+  set(CMAKE_HIP_ARCHITECTURES OFF)
+  find_package(HIP 4.0 REQUIRED)
+  if(KRONMULT_PLATFORM_AMD)
+    enable_language (HIP)
+    set_target_properties(hip-lang::device PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${HIP_PATH}/include)
+  endif()
+
+  message(STATUS "HIP PLATFORM: ${HIP_PLATFORM}")
PLATFORM: ${HIP_PLATFORM}") + message(STATUS "HIP COMPILER: ${HIP_COMPILER}") + message(STATUS "HIP RUNTIME: ${HIP_RUNTIME}") + message(STATUS "HIP Includes: ${HIP_INCLUDE_DIRS}") + message(STATUS "HIP Libraries: ${HIP_LIBRARIES}") + + if(KRONMULT_PLATFORM_NVCC) + enable_language(CUDA) + set (CMAKE_CUDA_STANDARD 14) + set (CMAKE_CUDA_STANDARD_REQUIRED ON) + add_compile_definitions(__HIP_PLATFORM_NVCC__ __HIP_PLATFORM_NVIDIA__) + else() + add_compile_definitions(__HIP_PLATFORM_HCC__ __HIP_PLATFORM_AMD__) + endif() + endif () #------------------------------------------------------------------------------- @@ -25,7 +102,7 @@ # Sanitizer options #------------------------------------------------------------------------------- set (CMAKE_CXX_FLAGS_SANITIZED -fno-omit-frame-pointer) - set (CMAKE_CUDA_FLAGS_SANITIZED -fno-omit-frame-pointer) + set (CMAKE_HIP_FLAGS_SANITIZED -fno-omit-frame-pointer) set (CMAKE_EXE_LINKER_FLAGS_SANITIZED "") set (CMAKE_SHARED_LINKER_FLAGS_SANITIZED "") @@ -36,7 +113,7 @@ if (${SANITIZE_${upper_name}}) set (CMAKE_CXX_FLAGS_SANITIZED "${CMAKE_CXX_FLAGS_SANITIZED} -fsanitize=${name}") - set (CMAKE_CUDA_FLAGS_SANITIZED "${CMAKE_CUDA_FLAGS_SANITIZED} -fsanitize=${name}") + set (CMAKE_HIP_FLAGS_SANITIZED "${CMAKE_HIP_FLAGS_SANITIZED} -fsanitize=${name}") set (CMAKE_EXE_LINKER_FLAGS_SANITIZED "${CMAKE_EXE_LINKER_FLAGS_SANITIZED} -fsanitize=${name}") set (CMAKE_SHARED_LINKER_FLAGS_SANITIZED "${CMAKE_SHARED_LINKER_FLAGS_SANITIZED} -fsanitize=${name}") endif () @@ -49,19 +126,32 @@ register_sanitizer_option (undefined ON) #------------------------------------------------------------------------------- -# Define a macro function to set a targets source files to the CUDA language. +# Define a macro function to set a targets source files to the HIP language. # # Can't use $<$:foo> generator expressions for # target_compile_features # # Need to handle this with an explicit if statement. #------------------------------------------------------------------------------- - macro (target_set_cuda target) + macro (target_set_hip target) if (USE_GPU) - target_compile_features (${target} PUBLIC cuda_std_14) get_target_property (SOURCES ${target} SOURCES) - set_source_files_properties (${SOURCES} PROPERTIES LANGUAGE CUDA) - set_target_properties (${target} PROPERTIES CUDA_ARCHITECTURES 60) + set_source_files_properties (${SOURCES} PROPERTIES LANGUAGE HIP) + + if(KRONMULT_PLATFORM_NVCC) + target_compile_features (${target} PUBLIC cuda_std_14) + set_source_files_properties (${SOURCES} PROPERTIES LANGUAGE CUDA) + set_target_properties (${target} PROPERTIES CUDA_ARCHITECTURES ${GPU_ARCH}) + else() + set_source_files_properties (${SOURCES} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + set_target_properties (${target} PROPERTIES HIP_ARCHITECTURES gfx${GPU_ARCH}) + set_target_properties (${target} PROPERTIES LINKER_LANGUAGE HIP) + #target_link_libraries(${target} PUBLIC hip-lang::device) + endif() + set_target_properties (${target} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + + # HIP_INCLUDE_DIRS is not getting set for nvidia platforms + target_include_directories(${target} SYSTEM PRIVATE ${HIP_INCLUDE_DIRS} ${HIP_PATH}/include) endif () endmacro () @@ -76,7 +166,7 @@ add_library (kron SHARED) # Note can't use generator expressions on the source files since it interfers -# with setting the source property to target CUDA. +# with setting the source property to target HIP. 
 target_sources (kron
                 PRIVATE
@@ -109,17 +199,18 @@
 )
 
 target_compile_features (kron PUBLIC cxx_std_17)
-target_set_cuda (kron)
+target_set_hip (kron)
 
 target_compile_options (kron
   PUBLIC
   $<$<COMPILE_LANGUAGE:CXX>:-Wall -Wextra -Wpedantic>
-  $<$<COMPILE_LANGUAGE:CUDA>:--compiler-options -fPIC --keep-device-functions>
+  $<$<COMPILE_LANGUAGE:CUDA>:--compiler-options -fPIC --keep-device-functions>
+  $<$<COMPILE_LANGUAGE:HIP>:--offload-arch=gfx${GPU_ARCH}>
 )
 
 target_compile_definitions (kron
   PUBLIC
-  $<$<BOOL:${USE_GPU}>:USE_GPU>
+  $<$<BOOL:${USE_GPU}>:USE_GPU>
 )
 
 if (OpenMP_CXX_FOUND)
@@ -140,7 +231,7 @@
                   PRIVATE
                   ${source}
   )
-  target_set_cuda (${target})
+  target_set_hip (${target})
   target_link_libraries (${target} PUBLIC kron)
 
   add_test (NAME ${target}
diff --git a/kgemm_nn_batched.cpp b/kgemm_nn_batched.cpp
index 3ba54fd..96f2a08 100644
--- a/kgemm_nn_batched.cpp
+++ b/kgemm_nn_batched.cpp
@@ -20,7 +20,7 @@ void kgemm_nn_batched( int const mm, int const nn, int const kk,
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kgemm_nn_batched<<< batchCount, nthreads>>>( mm,nn,kk,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kgemm_nn_batched), dim3(batchCount), dim3(nthreads), 0, 0, mm,nn,kk,
                            alpha,
                            Aarray_, ldAarray_,
                            Barray_, ldBarray_,
diff --git a/kgemm_nt_batched.cpp b/kgemm_nt_batched.cpp
index 62c443e..059360b 100644
--- a/kgemm_nt_batched.cpp
+++ b/kgemm_nt_batched.cpp
@@ -19,7 +19,7 @@ void kgemm_nt_batched( int const mm, int const nn, int const kk,
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kgemm_nt_batched<<< batchCount, nthreads >>>( mm,nn,kk,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kgemm_nt_batched), dim3(batchCount), dim3(nthreads), 0, 0, mm,nn,kk,
                            alpha,
                            Aarray_, ldAarray_,
                            Barray_, ldBarray_,
diff --git a/kroncommon.hpp b/kroncommon.hpp
index 750bc57..a5c9df5 100644
--- a/kroncommon.hpp
+++ b/kroncommon.hpp
@@ -4,8 +4,7 @@
 
 #ifdef USE_GPU
-#include <cuda.h>
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #define GLOBAL_FUNCTION __global__
 #define SYNCTHREADS __syncthreads()
 #define SHARED_MEMORY __shared__
diff --git a/kronmult1_batched.cpp b/kronmult1_batched.cpp
index edfb3e0..917172d 100644
--- a/kronmult1_batched.cpp
+++ b/kronmult1_batched.cpp
@@ -13,7 +13,7 @@ void kronmult1_batched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult1_batched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult1_batched( n,
diff --git a/kronmult1_pbatched.cpp b/kronmult1_pbatched.cpp
index afaa266..413191e 100644
--- a/kronmult1_pbatched.cpp
+++ b/kronmult1_pbatched.cpp
@@ -13,7 +13,7 @@ void kronmult1_pbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult1_pbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult1_pbatched( n,
diff --git a/kronmult1_xbatched.cpp b/kronmult1_xbatched.cpp
index 61d5314..14a698b 100644
--- a/kronmult1_xbatched.cpp
+++ b/kronmult1_xbatched.cpp
@@ -14,7 +14,7 @@ void kronmult1_xbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult1_xbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, lda, Xarray_, Yarray_, Warray_,
                            batchCount);
 #else
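Every kernel-launch hunk in this patch follows the same mechanical rewrite: CUDA's kernel<<< grid, block >>>( args ) becomes hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(grid), dim3(block), shmemBytes, stream, args), where the two extra arguments (always 0, 0 here) are the dynamic shared-memory size and the stream. A minimal self-contained sketch of the mapping; the scale kernel is a hypothetical stand-in, not part of this patch:

    #include <hip/hip_runtime.h>

    // Toy kernel standing in for the batched kronmult kernels.
    __global__ void scale( double *x, double alpha, int n )
    {
        int const i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) x[i] *= alpha;
    }

    void launch_scale( double *x, double alpha, int n )
    {
        int const nthreads = 256;
        int const nblocks = (n + nthreads - 1) / nthreads;
        // CUDA: scale<<< nblocks, nthreads >>>( x, alpha, n );
        // HIP:  grid and block sizes become explicit dim3 arguments, followed
        //       by the dynamic shared-memory size (0) and the stream (0 = default).
        hipLaunchKernelGGL(HIP_KERNEL_NAME(scale), dim3(nblocks), dim3(nthreads),
                           0, 0, x, alpha, n);
    }

HIP_KERNEL_NAME is a no-op on the AMD platform; it exists so templated kernel names survive the nvcc code path unchanged.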
diff --git a/kronmult2_batched.cpp b/kronmult2_batched.cpp
index 54a4b24..72ca53c 100644
--- a/kronmult2_batched.cpp
+++ b/kronmult2_batched.cpp
@@ -13,7 +13,7 @@ void kronmult2_batched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult2_batched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult2_batched( n,
diff --git a/kronmult2_pbatched.cpp b/kronmult2_pbatched.cpp
index bcfe204..ace712e 100644
--- a/kronmult2_pbatched.cpp
+++ b/kronmult2_pbatched.cpp
@@ -13,7 +13,7 @@ void kronmult2_pbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult2_pbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult2_pbatched( n,
diff --git a/kronmult2_xbatched.cpp b/kronmult2_xbatched.cpp
index bee62d4..039f244 100644
--- a/kronmult2_xbatched.cpp
+++ b/kronmult2_xbatched.cpp
@@ -14,7 +14,7 @@ void kronmult2_xbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult2_xbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, lda, Xarray_, Yarray_, Warray_,
                            batchCount);
 #else
diff --git a/kronmult3_batched.cpp b/kronmult3_batched.cpp
index 4a7e719..a69e2fb 100644
--- a/kronmult3_batched.cpp
+++ b/kronmult3_batched.cpp
@@ -13,7 +13,7 @@ void kronmult3_batched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult3_batched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult3_batched( n,
diff --git a/kronmult3_pbatched.cpp b/kronmult3_pbatched.cpp
index 822f822..706fbe3 100644
--- a/kronmult3_pbatched.cpp
+++ b/kronmult3_pbatched.cpp
@@ -13,7 +13,7 @@ void kronmult3_pbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult3_pbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult3_pbatched( n,
diff --git a/kronmult3_xbatched.cpp b/kronmult3_xbatched.cpp
index 4b609d5..6f8c785 100644
--- a/kronmult3_xbatched.cpp
+++ b/kronmult3_xbatched.cpp
@@ -14,7 +14,7 @@ void kronmult3_xbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult3_xbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, lda, Xarray_, Yarray_, Warray_,
                            batchCount);
 #else
diff --git a/kronmult4_batched.cpp b/kronmult4_batched.cpp
index e9327ae..059b1a3 100644
--- a/kronmult4_batched.cpp
+++ b/kronmult4_batched.cpp
@@ -13,7 +13,7 @@ void kronmult4_batched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult4_batched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult4_batched( n,
diff --git a/kronmult4_pbatched.cpp b/kronmult4_pbatched.cpp
index fae2eea..0ce2b19 100644
--- a/kronmult4_pbatched.cpp
+++ b/kronmult4_pbatched.cpp
@@ -13,7 +13,7 @@ void kronmult4_pbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult4_pbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult4_pbatched( n,
diff --git a/kronmult4_xbatched.cpp b/kronmult4_xbatched.cpp
index 4148c1a..6c891c5 100644
--- a/kronmult4_xbatched.cpp
+++ b/kronmult4_xbatched.cpp
@@ -14,7 +14,7 @@ void kronmult4_xbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult4_xbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, lda, Xarray_, Yarray_, Warray_,
                            batchCount);
 #else
diff --git a/kronmult5_batched.cpp b/kronmult5_batched.cpp
index 417833d..1889aa1 100644
--- a/kronmult5_batched.cpp
+++ b/kronmult5_batched.cpp
@@ -13,7 +13,7 @@ void kronmult5_batched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult5_batched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult5_batched( n,
diff --git a/kronmult5_pbatched.cpp b/kronmult5_pbatched.cpp
index 5398ff6..ceeff45 100644
--- a/kronmult5_pbatched.cpp
+++ b/kronmult5_pbatched.cpp
@@ -13,7 +13,7 @@ void kronmult5_pbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult5_pbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult5_pbatched( n,
diff --git a/kronmult5_xbatched.cpp b/kronmult5_xbatched.cpp
index f3e14da..23495b4 100644
--- a/kronmult5_xbatched.cpp
+++ b/kronmult5_xbatched.cpp
@@ -14,7 +14,7 @@ void kronmult5_xbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult5_xbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, lda, Xarray_, Yarray_, Warray_,
                            batchCount);
 #else
diff --git a/kronmult6_batched.cpp b/kronmult6_batched.cpp
index 7b3a624..f4f0454 100644
--- a/kronmult6_batched.cpp
+++ b/kronmult6_batched.cpp
@@ -13,7 +13,7 @@ void kronmult6_batched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult6_batched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult6_batched( n,
diff --git a/kronmult6_pbatched.cpp b/kronmult6_pbatched.cpp
index 30e29fd..7ceed69 100644
--- a/kronmult6_pbatched.cpp
+++ b/kronmult6_pbatched.cpp
@@ -13,7 +13,7 @@ void kronmult6_pbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult6_pbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, Xarray_, Yarray_, Warray_, batchCount);
 #else
         kronmult6_pbatched( n,
diff --git a/kronmult6_xbatched.cpp b/kronmult6_xbatched.cpp
index a5ab2f9..66d8825 100644
--- a/kronmult6_xbatched.cpp
+++ b/kronmult6_xbatched.cpp
@@ -14,7 +14,7 @@ void kronmult6_xbatched(
         int constexpr nwarps = 8;
         int constexpr nthreads = nwarps * warpsize;
 
-        kronmult6_xbatched<<< batchCount, nthreads >>>( n,
+        hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                            Aarray_, lda, Xarray_, Yarray_, Warray_,
                            batchCount);
 #else
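hipLaunchKernelGGL returns void, so the test drivers below detect failures the same way the CUDA versions did: they bracket each launch with hipDeviceSynchronize() and compare the returned status against hipSuccess. A bad launch configuration can additionally be picked up through hipGetLastError(). A minimal sketch of that checking pattern; sync_and_check is a hypothetical helper, not part of this patch:

    #include <hip/hip_runtime.h>
    #include <cstdio>

    // Report both launch errors (hipGetLastError) and errors raised while
    // the kernel was executing (hipDeviceSynchronize).
    inline bool sync_and_check()
    {
        hipError_t const launch = hipGetLastError();
        hipError_t const sync = hipDeviceSynchronize();
        if ((launch != hipSuccess) || (sync != hipSuccess)) {
            std::fprintf( stderr, "HIP error: %s\n",
                hipGetErrorString( (launch != hipSuccess) ? launch : sync ) );
            return false;
        }
        return true;
    }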
diff --git a/test_kgemm_nn_batched.cpp b/test_kgemm_nn_batched.cpp
index e83a728..0cfd5be 100644
--- a/test_kgemm_nn_batched.cpp
+++ b/test_kgemm_nn_batched.cpp
@@ -5,7 +5,7 @@
 #include
 
 #ifdef USE_GPU
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #else
 #include
 #include
@@ -18,11 +18,11 @@ static inline
 void host2gpu( void *dest, void *src, size_t nbytes )
 {
 #ifdef USE_GPU
-        cudaError_t istat = cudaMemcpy( dest,
-                                        src,
-                                        nbytes,
-                                        cudaMemcpyHostToDevice );
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipMemcpy( dest,
+                                      src,
+                                      nbytes,
+                                      hipMemcpyHostToDevice );
+        expect( istat == hipSuccess );
 #else
         memcpy( dest, src, nbytes );
 #endif
@@ -32,11 +32,11 @@ static inline
 void gpu2host( void * dest, void * src, size_t nbytes )
 {
 #ifdef USE_GPU
-        cudaError_t istat = cudaMemcpy( dest,
-                                        src,
-                                        nbytes,
-                                        cudaMemcpyDeviceToHost);
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipMemcpy( dest,
+                                      src,
+                                      nbytes,
+                                      hipMemcpyDeviceToHost);
+        expect( istat == hipSuccess );
 #else
         memcpy( dest, src, nbytes );
 #endif
@@ -47,8 +47,8 @@ static inline
 void *myalloc( size_t const nbytes ) {
         void *devPtr = nullptr;
 #ifdef USE_GPU
-        cudaError_t istat = cudaMalloc( &devPtr, nbytes );
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipMalloc( &devPtr, nbytes );
+        expect( istat == hipSuccess );
 #else
         devPtr = malloc( nbytes );
 #endif
@@ -59,8 +59,8 @@ void *myalloc( size_t const nbytes ) {
 static inline
 void myfree( void * devPtr ) {
 #ifdef USE_GPU
-        cudaError_t istat = cudaFree( devPtr);
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipFree( devPtr);
+        expect( istat == hipSuccess );
 #else
         free( devPtr );
 #endif
@@ -328,11 +328,11 @@ T test_kgemm_nn_batched( int const mm,
 
      int const nwarps = min( min(32,mm), min(nn,kk));
      int const nthreads = nwarps * warpsize;
 
-     cudaError_t istat_sync_start = cudaDeviceSynchronize();
-     expect( istat_sync_start == cudaSuccess );
+     hipError_t istat_sync_start = hipDeviceSynchronize();
+     expect( istat_sync_start == hipSuccess );
 
-     kgemm_nn_batched<<< batchCount, nthreads >>>( mm,nn,kk,
+     hipLaunchKernelGGL(HIP_KERNEL_NAME(kgemm_nn_batched), dim3(batchCount), dim3(nthreads), 0, 0, mm,nn,kk,
                 alpha,
                 ddAarray_, dldAarray_,
                 ddBarray_, dldBarray_,
@@ -340,8 +340,8 @@ T test_kgemm_nn_batched( int const mm,
                 ddCarray_, dldCarray_,
                 batchCount);
 
-     cudaError_t istat_sync_end = cudaDeviceSynchronize();
-     expect( istat_sync_end == cudaSuccess );
+     hipError_t istat_sync_end = hipDeviceSynchronize();
+     expect( istat_sync_end == hipSuccess );
     }
 #else
     {
diff --git a/test_kgemm_nt_batched.cpp b/test_kgemm_nt_batched.cpp
index 58c68e5..a126b8d 100644
--- a/test_kgemm_nt_batched.cpp
+++ b/test_kgemm_nt_batched.cpp
@@ -6,7 +6,7 @@
 #include
 
 #ifdef USE_GPU
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #else
 #include
 #include
@@ -19,11 +19,11 @@ static inline
 void host2gpu( void *dest, void *src, size_t nbytes )
 {
 #ifdef USE_GPU
-        cudaError_t istat = cudaMemcpy( dest,
-                                        src,
-                                        nbytes,
-                                        cudaMemcpyHostToDevice );
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipMemcpy( dest,
+                                      src,
+                                      nbytes,
+                                      hipMemcpyHostToDevice );
+        expect( istat == hipSuccess );
 #else
         memcpy( dest, src, nbytes );
 #endif
@@ -33,11 +33,11 @@ static inline
 void gpu2host( void * dest, void * src, size_t nbytes )
 {
 #ifdef USE_GPU
-        cudaError_t istat = cudaMemcpy( dest,
+        hipError_t istat = hipMemcpy( dest,
                                         src,
                                         nbytes,
-                                        cudaMemcpyDeviceToHost);
-        expect( istat == cudaSuccess );
+                                        hipMemcpyDeviceToHost);
+        expect( istat == hipSuccess );
 #else
         memcpy( dest, src, nbytes );
 #endif
@@ -48,8 +48,8 @@ static inline
 void *myalloc( size_t const nbytes ) {
         void *devPtr = nullptr;
 #ifdef USE_GPU
-        cudaError_t istat = cudaMalloc( &devPtr, nbytes );
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipMalloc( &devPtr, nbytes );
+        expect( istat == hipSuccess );
 #else
         devPtr = malloc( nbytes );
 #endif
@@ -60,8 +60,8 @@ void *myalloc( size_t const nbytes ) {
 static inline
 void myfree( void * devPtr ) {
 #ifdef USE_GPU
-        cudaError_t istat = cudaFree( devPtr);
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipFree( devPtr);
+        expect( istat == hipSuccess );
 #else
         free( devPtr );
 #endif
@@ -327,11 +327,11 @@ T test_kgemm_nt_batched( int const mm,
 
      int const nwarps = min( min(32,mm), min(nn,kk));
      int const nthreads = nwarps * warpsize;
 
-     cudaError_t istat_sync_start = cudaDeviceSynchronize();
-     expect( istat_sync_start == cudaSuccess );
+     hipError_t istat_sync_start = hipDeviceSynchronize();
+     expect( istat_sync_start == hipSuccess );
 
-     kgemm_nt_batched<<< batchCount, nthreads >>>( mm,nn,kk,
+     hipLaunchKernelGGL(HIP_KERNEL_NAME(kgemm_nt_batched), dim3(batchCount), dim3(nthreads), 0, 0, mm,nn,kk,
                 alpha,
                 ddAarray_, dldAarray_,
                 ddBarray_, dldBarray_,
@@ -339,8 +339,8 @@ T test_kgemm_nt_batched( int const mm,
                 ddCarray_, dldCarray_,
                 batchCount);
 
-     cudaError_t istat_sync_end = cudaDeviceSynchronize();
-     expect( istat_sync_end == cudaSuccess );
+     hipError_t istat_sync_end = hipDeviceSynchronize();
+     expect( istat_sync_end == hipSuccess );
     }
 #else
     {
diff --git a/test_kronmult6_batched.cpp b/test_kronmult6_batched.cpp
index 6a100b4..609ce79 100644
--- a/test_kronmult6_batched.cpp
+++ b/test_kronmult6_batched.cpp
@@ -13,7 +13,7 @@
 
 #ifdef USE_GPU
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #else
 #include
 #include
@@ -24,11 +24,11 @@ static inline
 void host2gpu( void *dest, void *src, size_t nbytes )
 {
 #ifdef USE_GPU
-        cudaError_t istat = cudaMemcpy( dest,
-                                        src,
-                                        nbytes,
-                                        cudaMemcpyHostToDevice );
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipMemcpy( dest,
+                                      src,
+                                      nbytes,
+                                      hipMemcpyHostToDevice );
+        expect( istat == hipSuccess );
 #else
         memcpy( dest, src, nbytes );
 #endif
@@ -38,11 +38,11 @@ static inline
 void gpu2host( void *dest, void *src, size_t nbytes )
 {
 #ifdef USE_GPU
-        cudaError_t istat = cudaMemcpy( dest,
+        hipError_t istat = hipMemcpy( dest,
                                         src,
                                         nbytes,
-                                        cudaMemcpyDeviceToHost);
-        expect( istat == cudaSuccess );
+                                        hipMemcpyDeviceToHost);
+        expect( istat == hipSuccess );
 #else
         memcpy( dest, src, nbytes );
 #endif
@@ -53,8 +53,8 @@ static inline
 void *myalloc( size_t nbytes ) {
         void *devPtr = nullptr;
 #ifdef USE_GPU
-        cudaError_t istat = cudaMalloc( &devPtr, nbytes );
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipMalloc( &devPtr, nbytes );
+        expect( istat == hipSuccess );
 #else
         devPtr = malloc( nbytes );
 #endif
@@ -65,8 +65,8 @@ void *myalloc( size_t nbytes ) {
 static inline
 void myfree( void * devPtr ) {
 #ifdef USE_GPU
-        cudaError_t istat = cudaFree( devPtr);
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipFree( devPtr);
+        expect( istat == hipSuccess );
 #else
         free( devPtr );
 #endif
@@ -195,42 +195,42 @@ T test_kronmult_batched( int const idim,
        // note the input Zarray will be over-written
        // --------------------------------------------
        switch(idim) {
-       case 1: kronmult1_batched<<< batchCount, nthreads >>>( n,
+       case 1: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dZarray_,
                           dYarray_,
                           dWarray_,
                           batchCount );
                break;
-       case 2: kronmult2_batched<<< batchCount, nthreads >>>( n,
+       case 2: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dZarray_,
                           dYarray_,
                           dWarray_,
                           batchCount );
                break;
-       case 3: kronmult3_batched<<< batchCount, nthreads >>>( n,
+       case 3: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dZarray_,
                           dYarray_,
                           dWarray_,
                           batchCount );
                break;
-       case 4: kronmult4_batched<<< batchCount, nthreads >>>( n,
+       case 4: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dZarray_,
                           dYarray_,
                           dWarray_,
                           batchCount );
                break;
-       case 5: kronmult5_batched<<< batchCount, nthreads >>>( n,
+       case 5: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dZarray_,
                           dYarray_,
                           dWarray_,
                           batchCount );
                break;
-       case 6: kronmult6_batched<<< batchCount, nthreads >>>( n,
+       case 6: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_batched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dZarray_,
                           dYarray_,
@@ -244,8 +244,8 @@ T test_kronmult_batched( int const idim,
 
        // -------------------------------------------
        // note important to wait for kernel to finish
        // -------------------------------------------
-       cudaError_t istat = cudaDeviceSynchronize();
-       expect( istat == cudaSuccess );
+       hipError_t istat = hipDeviceSynchronize();
+       expect( istat == hipSuccess );
        }
 #else
diff --git a/test_kronmult6_pbatched.cpp b/test_kronmult6_pbatched.cpp
index f8bfc1d..62d41c4 100644
--- a/test_kronmult6_pbatched.cpp
+++ b/test_kronmult6_pbatched.cpp
@@ -13,7 +13,7 @@
 
 #ifdef USE_GPU
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #else
 #include
 #include
@@ -24,11 +24,11 @@ static inline
 void host2gpu( void *dest, void *src, size_t nbytes )
 {
 #ifdef USE_GPU
-        cudaError_t istat = cudaMemcpy( dest,
-                                        src,
-                                        nbytes,
-                                        cudaMemcpyHostToDevice );
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipMemcpy( dest,
+                                      src,
+                                      nbytes,
+                                      hipMemcpyHostToDevice );
+        expect( istat == hipSuccess );
 #else
         memcpy( dest, src, nbytes );
 #endif
@@ -38,11 +38,11 @@ static inline
 void gpu2host( void *dest, void *src, size_t nbytes )
 {
 #ifdef USE_GPU
-        cudaError_t istat = cudaMemcpy( dest,
+        hipError_t istat = hipMemcpy( dest,
                                         src,
                                         nbytes,
-                                        cudaMemcpyDeviceToHost);
-        expect( istat == cudaSuccess );
+                                        hipMemcpyDeviceToHost);
+        expect( istat == hipSuccess );
 #else
         memcpy( dest, src, nbytes );
 #endif
@@ -53,8 +53,8 @@ static inline
 void *myalloc( size_t nbytes ) {
         void *devPtr = nullptr;
 #ifdef USE_GPU
-        cudaError_t istat = cudaMalloc( &devPtr, nbytes );
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipMalloc( &devPtr, nbytes );
+        expect( istat == hipSuccess );
 #else
         devPtr = malloc( nbytes );
 #endif
@@ -65,8 +65,8 @@ void *myalloc( size_t nbytes ) {
 static inline
 void myfree( void * devPtr ) {
 #ifdef USE_GPU
-        cudaError_t istat = cudaFree( devPtr);
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipFree( devPtr);
+        expect( istat == hipSuccess );
 #else
         free( devPtr );
 #endif
@@ -269,42 +269,42 @@ T test_kronmult_pbatched( int const idim,
        // note the input Zarray will be over-written
        // --------------------------------------------
        switch(idim) {
-       case 1: kronmult1_pbatched<<< batchCount, nthreads >>>( n,
+       case 1: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dpdZarray_,
                           dpdYarray_,
                           dpdWarray_,
                           batchCount );
                break;
-       case 2: kronmult2_pbatched<<< batchCount, nthreads >>>( n,
+       case 2: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dpdZarray_,
                           dpdYarray_,
                           dpdWarray_,
                           batchCount );
                break;
-       case 3: kronmult3_pbatched<<< batchCount, nthreads >>>( n,
+       case 3: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dpdZarray_,
                           dpdYarray_,
                           dpdWarray_,
                           batchCount );
                break;
-       case 4: kronmult4_pbatched<<< batchCount, nthreads >>>( n,
+       case 4: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dpdZarray_,
                           dpdYarray_,
                           dpdWarray_,
                           batchCount );
                break;
-       case 5: kronmult5_pbatched<<< batchCount, nthreads >>>( n,
+       case 5: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dpdZarray_,
                           dpdYarray_,
                           dpdWarray_,
                           batchCount );
                break;
-       case 6: kronmult6_pbatched<<< batchCount, nthreads >>>( n,
+       case 6: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_pbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAarray_,
                           dpdZarray_,
                           dpdYarray_,
@@ -318,8 +318,8 @@ T test_kronmult_pbatched( int const idim,
 
        // -------------------------------------------
        // note important to wait for kernel to finish
        // -------------------------------------------
-       cudaError_t istat = cudaDeviceSynchronize();
-       expect( istat == cudaSuccess );
+       hipError_t istat = hipDeviceSynchronize();
+       expect( istat == hipSuccess );
        }
 #else
diff --git a/test_kronmult6_xbatched.cpp b/test_kronmult6_xbatched.cpp
index 90a938a..7534b59 100644
--- a/test_kronmult6_xbatched.cpp
+++ b/test_kronmult6_xbatched.cpp
@@ -13,7 +13,7 @@
 
 #ifdef USE_GPU
-#include <cuda_runtime.h>
+#include <hip/hip_runtime.h>
 #else
 #include
 #include
@@ -24,11 +24,11 @@ static inline
 void host2gpu( void *dest, void *src, size_t nbytes )
 {
 #ifdef USE_GPU
-        cudaError_t istat = cudaMemcpy( dest,
-                                        src,
-                                        nbytes,
-                                        cudaMemcpyHostToDevice );
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipMemcpy( dest,
+                                      src,
+                                      nbytes,
+                                      hipMemcpyHostToDevice );
+        expect( istat == hipSuccess );
 #else
         memcpy( dest, src, nbytes );
 #endif
@@ -38,11 +38,11 @@ static inline
 void gpu2host( void *dest, void *src, size_t nbytes )
 {
 #ifdef USE_GPU
-        cudaError_t istat = cudaMemcpy( dest,
+        hipError_t istat = hipMemcpy( dest,
                                         src,
                                         nbytes,
-                                        cudaMemcpyDeviceToHost);
-        expect( istat == cudaSuccess );
+                                        hipMemcpyDeviceToHost);
+        expect( istat == hipSuccess );
 #else
         memcpy( dest, src, nbytes );
 #endif
@@ -53,8 +53,8 @@ static inline
 void *myalloc( size_t nbytes ) {
         void *devPtr = nullptr;
 #ifdef USE_GPU
-        cudaError_t istat = cudaMalloc( &devPtr, nbytes );
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipMalloc( &devPtr, nbytes );
+        expect( istat == hipSuccess );
 #else
         devPtr = malloc( nbytes );
 #endif
@@ -65,8 +65,8 @@ void *myalloc( size_t nbytes ) {
 static inline
 void myfree( void * devPtr ) {
 #ifdef USE_GPU
-        cudaError_t istat = cudaFree( devPtr);
-        expect( istat == cudaSuccess );
+        hipError_t istat = hipFree( devPtr);
+        expect( istat == hipSuccess );
 #else
         free( devPtr );
 #endif
@@ -302,42 +302,42 @@ T test_kronmult_xbatched( int const idim,
        // note the input Zarray will be over-written
        // --------------------------------------------
        switch(idim) {
-       case 1: kronmult1_xbatched<<< batchCount, nthreads >>>( n,
+       case 1: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAparray_, lda,
                           dpdZarray_,
                           dpdYarray_,
                           dpdWarray_,
                           batchCount );
                break;
-       case 2: kronmult2_xbatched<<< batchCount, nthreads >>>( n,
+       case 2: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAparray_, lda,
                           dpdZarray_,
                           dpdYarray_,
                           dpdWarray_,
                           batchCount );
                break;
-       case 3: kronmult3_xbatched<<< batchCount, nthreads >>>( n,
+       case 3: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAparray_, lda,
                           dpdZarray_,
                           dpdYarray_,
                           dpdWarray_,
                           batchCount );
                break;
-       case 4: kronmult4_xbatched<<< batchCount, nthreads >>>( n,
+       case 4: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAparray_, lda,
                           dpdZarray_,
                           dpdYarray_,
                           dpdWarray_,
                           batchCount );
                break;
-       case 5: kronmult5_xbatched<<< batchCount, nthreads >>>( n,
+       case 5: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAparray_, lda,
                           dpdZarray_,
                           dpdYarray_,
                           dpdWarray_,
                           batchCount );
                break;
-       case 6: kronmult6_xbatched<<< batchCount, nthreads >>>( n,
+       case 6: hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_xbatched), dim3(batchCount), dim3(nthreads), 0, 0, n,
                           dAparray_, lda,
                           dpdZarray_,
                           dpdYarray_,
@@ -351,8 +351,8 @@ T test_kronmult_xbatched( int const idim,
 
        // -------------------------------------------
        // note important to wait for kernel to finish
        // -------------------------------------------
-       cudaError_t istat = cudaDeviceSynchronize();
-       expect( istat == cudaSuccess );
+       hipError_t istat = hipDeviceSynchronize();
+       expect( istat == hipSuccess );
        }
 #else
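Throughout these tests, expect() is a project-local check whose definition lies outside this diff. A plausible minimal form with assert-like semantics; this sketch is hypothetical and the repository's actual macro may differ:

    #include <cstdio>
    #include <cstdlib>

    // Abort with location information when a checked condition is false,
    // e.g. expect( istat == hipSuccess );
    #define expect( cond )                                              \
        do {                                                            \
            if (!(cond)) {                                              \
                std::fprintf( stderr, "%s:%d: expect(%s) failed\n",     \
                              __FILE__, __LINE__, #cond );              \
                std::abort();                                           \
            }                                                           \
        } while (0)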