Use the ROCm/HIP device to accelerate certain DPLASMA kernels (#57)
* configure: Add the --with-hip option

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* hip: Configury

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* hip: kernel typedefs

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* hip: update for hip-enabled parsec

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* hip: detect hipblas and rocsolver

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

Conflicts:
	src/CMakeLists.txt

* hip: precision generator rules

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* hip: cleanup unused dyld hipblas functions

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* hip: Update lapack stagein

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* Update for feature/common_gpu parsec branch changes

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* Some conflicting updates between hip and common_gpu need more resolution

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* hip: stream info registration

* hip: potrf on AMD

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* hip:po: Some errors introduced when merging

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* Add HIP to the lookahead gpu gemm

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* Add HIP to zgemm_summa

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* hip: rework of PO and workspaces

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* hip: remove unnecessary hipblas init calls

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* hip:po:errors in ldam asserts

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* hip:po: some of the changes had broken cusolver

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* fix printlogcuda/hip

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* Auto-generate hip stage-in/out functions
Use proper error checks instead of asserts

* hip:zgemm_gpu: don't use hipComplex

* Return the proper PARSEC_HOOK_RETURN_ERROR in GPU error cases

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* Update for the new device mask for incarnations

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* So far only NN gemm can run with HIP

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* Use the correct DPLASMA_HAVE_HIP

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* Remove weight properties from HIP bodies

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* Reorder and uniformize cuda and hip bodies

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* A PARSEC_HAVE_HIP was still present

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* Rework zpotrf_U

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* hip: add NT/TN/TT cases to gemm_summa

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* Update parsec to a version that works with GPUs

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* zpotrf_wrapper: uid and handles don't exist when not using a GPU device

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* Update dtd for hip/cuda specializations for the dtd workspaces

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* Make all gemm_summa the same between hip/cuda

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>

* Use the same controls as parsec for GPU_WITH_CUDA/HIP

* hip: merge error: the device count must be updated in both hip and cuda
builds

* hip: printlog hipblascomplex not compatible with creal

* hip: final cleanup

---------

Signed-off-by: Aurelien Bouteiller <bouteill@icl.utk.edu>
abouteiller authored Aug 9, 2024
1 parent f4dd66c commit edf5be3
Showing 51 changed files with 1,373 additions and 438 deletions.
26 changes: 24 additions & 2 deletions CMakeLists.txt
@@ -16,6 +16,10 @@ set(DPLASMA_VERSION "${DPLASMA_VERSION_MAJOR}.${DPLASMA_VERSION_MINOR}")

############################################################################
# CMake Policies Tuning
if(POLICY CMP0144)
# CMP0144: find_package uses upper-case <PACKAGENAME>_ROOT variables in addition to <PackageName>_ROOT
cmake_policy(SET CMP0144 NEW)
endif(POLICY CMP0144)
set(CMAKE_NO_SYSTEM_FROM_IMPORTED True)

############################################################################
@@ -231,12 +235,30 @@ endif(NOT TARGET PaRSEC::parsec AND NOT TARGET PaRSEC::parsec_ptgpp)

############################################################################
# Resume configuring dplasma
option(DPLASMA_HAVE_CUDA "Use CUDA to accelerate DPLASMA routines" ${PARSEC_HAVE_CUDA})
if(DPLASMA_HAVE_CUDA)
option(DPLASMA_GPU_WITH_CUDA "Use CUDA to accelerate DPLASMA routines" ${PARSEC_HAVE_CUDA})
if(DPLASMA_GPU_WITH_CUDA)
if(NOT PARSEC_HAVE_CUDA)
message(FATAL_ERROR "CUDA support for DPLASMA requested, but detected PaRSEC does not support it")
endif()
message(STATUS "CUDA support for DPLASMA enabled")
if(NOT TARGET CUDA::cusolver)
find_package(CUDAToolkit REQUIRED)
endif(NOT TARGET CUDA::cusolver)
set(DPLASMA_HAVE_CUDA ${PARSEC_HAVE_CUDA} CACHE BOOL "True if DPLASMA provides support for CUDA")
endif()
option(DPLASMA_GPU_WITH_HIP "Use HIP to accelerate DPLASMA routines" ${PARSEC_HAVE_HIP})
if(DPLASMA_GPU_WITH_HIP)
if(NOT PARSEC_HAVE_HIP)
message(FATAL_ERROR "HIP support for DPLASMA requested, but detected PaRSEC does not support it")
endif()
message(STATUS "HIP support for DPLASMA enabled")
# This is kinda ugly but the PATH and HINTS don't get transmitted to sub-dependents
set(CMAKE_SYSTEM_PREFIX_PATH_save ${CMAKE_SYSTEM_PREFIX_PATH})
list(APPEND CMAKE_SYSTEM_PREFIX_PATH /opt/rocm)
find_package(hipblas REQUIRED)
find_package(rocsolver REQUIRED)
set(CMAKE_SYSTEM_PREFIX_PATH ${CMAKE_SYSTEM_PREFIX_PATH_save})
set(DPLASMA_HAVE_HIP ${PARSEC_HAVE_HIP} CACHE BOOL "True if DPLASMA provides support for HIP")
endif()

############################################################################
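For context, a minimal configuration sketch exercising the new option when invoking CMake directly; the /opt/rocm prefix and the install paths are illustrative assumptions, not values taken from this commit:

cmake -S dplasma -B build \
      -DDPLASMA_GPU_WITH_HIP=ON \
      -DCMAKE_PREFIX_PATH="/opt/rocm;$HOME/parsec-install"

If the detected PaRSEC was built without HIP support, configuration stops at the FATAL_ERROR above.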
20 changes: 17 additions & 3 deletions configure
@@ -132,6 +132,9 @@ cat <<EOF
compile kernels optimized for the CUDA SM model x, y and z
where x,y,z are two digit numbers representing a valid CUDA architecture (e.g. 35,37,60) (default=autodetect)
--with-hip[=DIR]
use the AMD ROCm accelerator library [installed in DIR] (default=autodetect)
Some influential environment variables:
CC C compiler command
@@ -296,6 +299,11 @@ while [ "x$1" != x ]; do
--with-cuda-sm-targets) with_cuda_sm_targets=yes; shift;;
--without-cuda-sm-targets) with_cuda_sm_targets=no; shift;;

# ROCm options
--with-hip=*) with_hip="${1#*=}"; shift;;
--with-hip) with_hip=yes; shift;;
--without-hip) with_hip=no; shift;;

# Python options
--with-python=*) with_python="${1#*=}"; shift;;
--with-python) with_python=yes; shift;;
@@ -399,12 +407,12 @@ _EOF
mkdir -p "$NATIVE_DIR" && pushd "$NATIVE_DIR"
rm -rf CMakeCache.txt CMakeFiles

# Disable MPI, CUDA, HWLOC when creating the build-tools
# Disable MPI, GPU, HWLOC when creating the build-tools
local NATIVE_MPI="-DPARSEC_DIST_WITH_MPI=OFF"
local NATIVE_CUDA="-DPARSEC_GPU_WITH_CUDA=OFF"
local NATIVE_GPU="-DPARSEC_GPU_WITH_CUDA=OFF -DPARSEC_GPU_WITH_HIP=OFF"
local NATIVE_HWLOC=""
local NATIVE_COMPILERS="-DSUPPORT_FORTRAN=OFF"
local NATIVE_OPTS="-DBUILD_TESTING=OFF -DBUILD_TOOLS=ON -DBUILD_PARSEC=ON -DCMAKE_INSTALL_PREFIX=$NATIVE_PREFIX $NATIVE_MPI $NATIVE_CUDA $NATIVE_HWLOC $NATIVE_COMPILERS"
local NATIVE_OPTS="-DBUILD_TESTING=OFF -DBUILD_TOOLS=ON -DBUILD_PARSEC=ON -DCMAKE_INSTALL_PREFIX=$NATIVE_PREFIX $NATIVE_MPI $NATIVE_GPU $NATIVE_HWLOC $NATIVE_COMPILERS"

set_cmake_executable #may have been changed in the platform file
echo "CC=\"${NATIVE_CC}\" CFLAGS=\"${NATIVE_CFLAGS}\" CXX=\"${NATIVE_CXX}\" CXXFLAGS=\"${NATIVE_CXXFLAGS}\" LDFLAGS=\"${NATIVE_LDFLAGS}\" ${cmake_executable} -G\"${cmake_generator}\" ${NATIVE_OPTS} ${PARSEC_TOOLCHAIN_OPTIONS} $(for i in "$@"; do printf ' %q' "$i"; done) ${srcdir}"
@@ -621,6 +629,12 @@ x) ;;
*) CMAKE_DEFINES+=" -DCUDA_SM_TARGETS='${with_cuda_sm_targets/,/;}'";;
esac
case x$with_hip in
xno) CMAKE_DEFINES+=" -DPARSEC_GPU_WITH_HIP=OFF -DDPLASMA_GPU_WITH_HIP=OFF";;
xyes) CMAKE_DEFINES+=" -DPARSEC_GPU_WITH_HIP=ON -DDPLASMA_GPU_WITH_HIP=ON";;
x) ;;
*) CMAKE_DEFINES+=" -DPARSEC_GPU_WITH_HIP=ON -DHIP_ROOT=$(printf %q "$with_hip") -DDPLASMA_GPU_WITH_HIP=ON";;
esac
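# Illustrative mapping (not part of this commit): `./configure --with-hip=/opt/rocm`
# takes the last branch above and appends
#     -DPARSEC_GPU_WITH_HIP=ON -DHIP_ROOT=/opt/rocm -DDPLASMA_GPU_WITH_HIP=ON
# to the generated CMake command line; plain `--with-hip` relies on autodetection.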
case x$with_python in
xno) echo >&2 "Python is required. Please provide a path to the python executable."; exit 3;;
11 changes: 6 additions & 5 deletions share/help-dplasma.txt
@@ -1,8 +1,9 @@
-[cu*_alloc_failed]
-There was not enough memory available on a CUDA device
+[gpu_alloc_failed]
+There was not enough memory available on a GPU device
 while trying to allocate a %s handle to manage tasks on
-this device, or another CUDA device on the node. The
+this device, or another GPU device on the node. The
 PaRSEC runtime system may be configured to reserve too
-much memory on CUDA devices. Try reducing the amount of
+much memory on GPU devices. Try reducing the amount of
 reserved memory by setting the PaRSEC MCA parameter
-'device_cuda_memory_use' to a lower value.
+'device_cuda_memory_use' (or similar for the type of
+device) to a lower value.
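A usage note on the message above: PaRSEC MCA parameters are commonly settable through the environment, so lowering the reservation would look like the sketch below; the variable prefix and the 80% value are assumptions to verify against your PaRSEC build.

# Illustrative only: lower the share of device memory PaRSEC reserves.
export PARSEC_MCA_device_cuda_memory_use=80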
25 changes: 23 additions & 2 deletions src/CMakeLists.txt
@@ -28,8 +28,27 @@ if( NOT DPLASMA_HAVE_COMPLEX_H )
list(APPEND EXTRA_SOURCES complex.c)
endif()
if( DPLASMA_HAVE_CUDA )
list(APPEND EXTRA_SOURCES dplasmaaux_cuda.c)
list(APPEND EXTRA_SOURCES dplasmaaux_cuda.c cuda/lapack_cuda_stage_in.c)
endif()
if( DPLASMA_HAVE_HIP )
list(APPEND EXTRA_SOURCES dplasmaaux_hip.c)
FILE(GLOB cuda_sources cuda/[^\\.]*.[ch])
find_package(Perl REQUIRED)
find_program(HIPIFY_PERL_COMMAND NAMES hipify-perl HINTS ${HIP_BIN_INSTALL_DIR} REQUIRED)
foreach(cuda_file ${cuda_sources})
file(RELATIVE_PATH cuda_filename ${CMAKE_CURRENT_SOURCE_DIR}/cuda ${cuda_file})
string(REPLACE cuda hip hip_file ${cuda_filename})
string(PREPEND hip_file "${CMAKE_CURRENT_BINARY_DIR}/hip/")
add_custom_command(OUTPUT ${hip_file}
DEPENDS ${cuda_file} # do not use MAIN_DEPENDENCY, that overrides the default .c.o rule
COMMAND ${CMAKE_COMMAND} -E copy "${cuda_file}" "${hip_file}.prehip"
COMMAND ${PERL_EXECUTABLE} ${HIPIFY_PERL_COMMAND} --inplace --print-stats "${hip_file}"
COMMAND ${PERL_EXECUTABLE} -i -pe "s{(cuda)}{ substr uc hip | (uc \$1 ^ \$1), 0, 3 }egi" "${hip_file}" VERBATIM) # Convert all remaining cuda/CUDA
if(${hip_file} MATCHES [^\\.]*.c) # do not add .h to sources
list(APPEND EXTRA_SOURCES ${hip_file})
endif()
endforeach()
endif( DPLASMA_HAVE_HIP )
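A note on the two Perl passes above: hipify-perl rewrites the CUDA API calls it recognizes, and the follow-up substitution s{(cuda)}{ substr uc hip | (uc \$1 ^ \$1), 0, 3 }egi renames any remaining occurrence of cuda to hip while preserving the case pattern of each match, since uc($1) ^ $1 yields the case mask of the matched text. Illustrative inputs and outputs (assumed identifiers, not taken from this commit):

dplasma_cuda_lapack_stage_in  ->  dplasma_hip_lapack_stage_in
DPLASMA_HAVE_CUDA             ->  DPLASMA_HAVE_HIP
CudaComplex                   ->  HipComplex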

### Generate .c files from .jdf for all required precisions
set(JDF
@@ -236,7 +255,9 @@ target_link_libraries(dplasma
PaRSEC::parsec
LAPACKE::LAPACKE
$<$<BOOL:${DPLASMA_HAVE_CUDA}>:CUDA::cublas>
$<$<BOOL:${DPLASMA_HAVE_CUDA}>:CUDA::cusolver>)
$<$<BOOL:${DPLASMA_HAVE_CUDA}>:CUDA::cusolver>
$<$<BOOL:${DPLASMA_HAVE_HIP}>:roc::hipblas>
$<$<BOOL:${DPLASMA_HAVE_HIP}>:roc::rocsolver>)
set_target_properties(dplasma PROPERTIES VERSION ${DPLASMA_VERSION_MAJOR}.${DPLASMA_VERSION_MINOR}
SOVERSION ${DPLASMA_VERSION_MAJOR})

3 changes: 3 additions & 0 deletions src/cuda/README.md
@@ -0,0 +1,3 @@
This directory contains files that are automatically converted from CUDA to HIP using Hipify.
If your file is not automatically convertible, put it somewhere else.

165 changes: 165 additions & 0 deletions src/cuda/lapack_cuda_stage_in.c
@@ -0,0 +1,165 @@
/*
* Copyright (c) 2020-2024 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*
* $COPYRIGHT
*
*/

#include "dplasma.h"
#include "dplasmajdf_lapack_dtt.h"

#if defined(DPLASMA_HAVE_CUDA)
#include <cuda.h>
#include <parsec/mca/device/cuda/device_cuda.h>

/* Use cudaMemcpy2DAsync or loop with cudaMemcpyAsync for data transfers to device */
#define USE_COPY_2D
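
/*
 * Strategy note: when the source tile sits in LAPACK storage (leading
 * dimension ldd >= mb), the device copy is repacked into a contiguous
 * tile of pitch mb. With USE_COPY_2D this is one strided
 * cudaMemcpy2DAsync per flow; otherwise the fallback below issues one
 * cudaMemcpyAsync per column.
 */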

int
dplasma_cuda_lapack_stage_in(parsec_gpu_task_t *gtask,
uint32_t flow_mask,
parsec_gpu_exec_stream_t *gpu_stream)
{
cudaError_t ret;
parsec_data_copy_t * copy_in;
parsec_data_copy_t * copy_out;
parsec_device_gpu_module_t *in_elem_dev;
parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream;
dplasma_data_collection_t * ddc;
parsec_task_t *task = gtask->ec;
int elem_sz;
int i;
for(i = 0; i < task->task_class->nb_flows; i++){
if(flow_mask & (1U << i)){
copy_in = task->data[i].data_in;
copy_out = task->data[i].data_out;
ddc = (dplasma_data_collection_t*)gtask->flow_dc[i];
assert(ddc != NULL);
elem_sz = parsec_datadist_getsizeoftype(ddc->dc_original->mtype);
in_elem_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get( copy_in->device_index);
if( (in_elem_dev->super.type == PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){
ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private,
copy_in->device_private,
gtask->flow_nb_elts[i],
(in_elem_dev->super.type != PARSEC_DEV_CUDA)?
cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice,
cuda_stream->cuda_stream);
PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } );
}else{

#ifdef USE_COPY_2D
int ldd, nrows, ncols;
ADTT_INFO_internal(copy_in, ddc, &ldd, &nrows, &ncols);
size_t dpitch = ddc->dc_original->mb * elem_sz;
size_t spitch = ldd * elem_sz;
size_t width = nrows * elem_sz;
size_t height = ncols;
/* copy width bytes height times, skipping pitch - width bytes every time */
ret = (cudaError_t)cudaMemcpy2DAsync( copy_out->device_private,
dpitch, /*dst pitch bytes*/
copy_in->device_private,
spitch, /*src pitch bytes*/
width, height,
cudaMemcpyHostToDevice,
cuda_stream->cuda_stream );
PARSEC_CUDA_CHECK_ERROR( "cudaMemcpy2DAsync ", ret, { return PARSEC_ERROR; } );


#else

int ldd, nrows, ncols;
ADTT_INFO_internal(copy_in, ddc, &ldd, &nrows, &ncols);

int j;
for(j=0; j<ncols; j++) {
char*src = ((char*)copy_in->device_private) + j * ldd * elem_sz;
char*dst = ((char*)copy_out->device_private) + j * ddc->dc_original->mb * elem_sz;
ret = cudaMemcpyAsync(dst,
src,
nrows * elem_sz,
cudaMemcpyHostToDevice,
cuda_stream->cuda_stream );
PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } );

}
#endif


}
}
}
return PARSEC_SUCCESS;
}

int
dplasma_cuda_lapack_stage_out(parsec_gpu_task_t *gtask,
uint32_t flow_mask,
parsec_gpu_exec_stream_t *gpu_stream)
{
cudaError_t ret;
parsec_data_copy_t * copy_in;
parsec_data_copy_t * copy_out;
parsec_device_gpu_module_t *out_elem_dev;
parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t*)gpu_stream;
parsec_task_t *task = gtask->ec;
dplasma_data_collection_t * ddc;
int elem_sz;
int i;
for(i = 0; i < task->task_class->nb_flows; i++){
if(flow_mask & (1U << i)){
copy_in = task->data[i].data_out;
copy_out = copy_in->original->device_copies[0];
ddc = (dplasma_data_collection_t*)gtask->flow_dc[i];
assert(ddc != NULL);
elem_sz = parsec_datadist_getsizeoftype(ddc->dc_original->mtype);
out_elem_dev = (parsec_device_gpu_module_t*)parsec_mca_device_get( copy_out->device_index);

if( (out_elem_dev->super.type == PARSEC_DEV_CUDA) || (ddc->dc_original->storage != PARSEC_MATRIX_LAPACK)){
ret = (cudaError_t)cudaMemcpyAsync( copy_out->device_private,
copy_in->device_private,
gtask->flow_nb_elts[i],
out_elem_dev->super.type != PARSEC_DEV_CUDA ?
cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice,
cuda_stream->cuda_stream);
PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } );
}else{

#ifdef USE_COPY_2D
int ldd, nrows, ncols;
ADTT_INFO_internal(copy_out, ddc, &ldd, &nrows, &ncols);
size_t dpitch = ldd * elem_sz;
size_t spitch = ddc->dc_original->mb * elem_sz;
size_t width = nrows * elem_sz;
size_t height = ncols;
/* copy width bytes height times, skipping pitch - width bytes every time */
ret = (cudaError_t)cudaMemcpy2DAsync( copy_out->device_private,
dpitch, /*dst pitch bytes*/
copy_in->device_private,
spitch, /*src pitch bytes*/
width, height,
cudaMemcpyDeviceToHost,
cuda_stream->cuda_stream);
PARSEC_CUDA_CHECK_ERROR( "cudaMemcpy2DAsync ", ret, { return PARSEC_ERROR; } );
#else
int ldd, nrows, ncols;
ADTT_INFO_internal(copy_out, ddc, &ldd, &nrows, &ncols);
int j;
for(j=0; j<ncols; j++) {
char*src = ((char*)copy_in->device_private) + j * ddc->dc_original->mb * elem_sz;
char*dst = ((char*)copy_out->device_private) + j * ldd * elem_sz;
ret = cudaMemcpyAsync(dst,
src,
nrows * elem_sz,
cudaMemcpyDeviceToHost,
cuda_stream->cuda_stream);
PARSEC_CUDA_CHECK_ERROR( "cudaMemcpyAsync ", ret, { return PARSEC_ERROR; } );
}
#endif
}
}
}
return PARSEC_SUCCESS;
}
#endif /* defined(DPLASMA_HAVE_CUDA) */
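
For orientation, a hedged sketch of how these callbacks are typically attached to a GPU task in a JDF body; the parsec_gpu_task_t field names below (stage_in, stage_out, flow_dc) follow the feature/common_gpu PaRSEC branch this commit tracks and should be checked against the installed headers:

/* Sketch only, not part of this commit: route a task's flows through the
 * LAPACK-aware stage functions before the task is queued on a device. */
static void attach_lapack_stages(parsec_gpu_task_t *gpu_task,
                                 dplasma_data_collection_t *ddc)
{
    gpu_task->stage_in   = dplasma_cuda_lapack_stage_in;   /* host -> device repack */
    gpu_task->stage_out  = dplasma_cuda_lapack_stage_out;  /* device -> host repack */
    gpu_task->flow_dc[0] = ddc;  /* data collection for flow 0; real bodies set every flow */
}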
3 changes: 2 additions & 1 deletion src/dplasmaaux.c
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2011-2021 The University of Tennessee and The University
* Copyright (c) 2011-2024 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2013 Inria. All rights reserved.
@@ -109,3 +109,4 @@ dplasma_aux_getGEMMLookahead( parsec_tiled_matrix_t *A )
return dplasma_imax( ceil( alpha ), 2 );
}
}

6 changes: 4 additions & 2 deletions src/dplasmaaux.h
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2011-2021 The University of Tennessee and The University
* Copyright (c) 2011-2024 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
* Copyright (c) 2013 Inria. All rights reserved.
@@ -112,5 +112,7 @@ extern void *dplasma_pcomm;
#if defined(DPLASMA_HAVE_CUDA)
#include "dplasmaaux_cuda.h"
#endif

#if defined(DPLASMA_HAVE_HIP)
#include "dplasmaaux_hip.h"
#endif
#endif /* _DPLASMAAUX_H_INCLUDED */