Add a simple ping-pong test for GPU accelerators

Make a token pass from the CPU to each GPU, and back, a few times, to check a possible bug found by @devreal.

- Part of the DTD interface was not fully ported to HIP.
- Enable the (cuda|hip)_pingpong test in CI.
- Add a PTG GPU pingpong test to compare with the behavior in DTD (work in progress).
- Tests need to import the appropriate GPU-specific header file, as insert_function_internal.h doesn't do it for them anymore.
- Enable the PTG test over CUDA.
- Fix errors in data distribution initialization and some DAG errors in the PTG of the GPU pingpong test.
- Rename files and directories to match the new status of the tests (tests/runtime/cuda is renamed tests/runtime/gpu, and the pingpong tests are named to specify the API and not a particular device name, since they should work on both GPU types).
- Only define the pingpong tests if a suitable compiler is found for the kernels.
- Do a ping-pong-pong test instead of ping-pong, to see how dependencies are tracked on GPU-to-GPU task dependencies.
- Fix the checks of the pingpong test, and add it to Testings.cmake.
- PTG ping-pong test: in order to guide the selection of the best device, the advised data needs to flow from a CPU task, not directly from memory.
- Trying to introduce the gpu_nvidia runner in the CI matrix.
- Add ROCm; create one github_runner-[device].yaml file per device; remove debugging info from CMakeLists.txt.
- Add some infrastructure to make sure CI does the device tests where it should, and issue an error if things cannot be tested (e.g. because the GPUs are down or the compiler/spack is broken).
- Trying to work around the xml2 issue with mesa.

Signed-off-by: George Bosilca <bosilca@icl.utk.edu>

- Integrate gpu_amd/release in the test suite.
- Add support for rocm-smi in check_nb_devices.sh.
- Use a conditional CMake command that depends on the GitHub runner loaded to prepare for testing.
ICLDisco · Nov 6, 2023 · c644262 · c644262
1 parent 92da8b6
commit c644262
Show file tree

Hide file tree

Showing 32 changed files with 850 additions and 38 deletions.
diff --git a/.github/CI/github_runner-cpu.yaml b/.github/CI/github_runner-cpu.yaml
@@ -0,0 +1,27 @@
+spack:
+  definitions:
+  - pkgs:
+    - gcc@12.1.0
+    - git
+    - patch
+    - flex
+    - bison
+    - hwloc
+    - unzip
+    - python@3
+    - py-pip
+    - py-pandas
+    - py-matplotlib
+    - py-tables
+    - py-networkx
+    - py-cython
+    - py-wheel
+    - cmake
+    - ninja
+    - otf2@2.3
+    - openmpi
+
+  view: true
+  specs:
+    - matrix:
+      - [$pkgs]
diff --git a/.github/CI/github_runner-gpu_amd.yaml b/.github/CI/github_runner-gpu_amd.yaml
@@ -0,0 +1,29 @@
+spack:
+  definitions:
+  - pkgs:
+    - gcc@11.3.0
+    - git
+    - hip
+    - patch
+    - flex
+    - bison
+    - libxml2
+    - hwloc
+    - unzip
+    - python@3
+    - py-pip
+    - py-pandas
+    - py-matplotlib
+    - py-tables
+    - py-networkx
+    - py-cython
+    - py-wheel
+    - cmake
+    - ninja
+    - otf2@2.3
+    - openmpi
+
+  view: true
+  specs:
+    - matrix:
+      - [$pkgs]
diff --git a/.github/CI/github_runner.yaml → .github/CI/github_runner-gpu_nvidia.yaml b/.github/CI/github_runner.yaml → .github/CI/github_runner-gpu_nvidia.yaml
@@ -1,8 +1,9 @@
 spack:
   definitions:
   - pkgs:
-    - gcc@12.1.0
+    - gcc@11.3.0
     - git
+    - cuda@12
     - patch
     - flex
     - bison

diff --git a/.github/workflows/build_cmake.yml b/.github/workflows/build_cmake.yml
@@ -19,18 +19,22 @@ env:
 
 jobs:
   debug:
-    runs-on: [self-hosted, Linux]
     strategy:
       fail-fast: false
       matrix:
         build_type : [ Debug ]
         shared_type : [ OFF, ON ]
         profiling : [ ON ]
+        device : [cpu, gpu_nvidia, gpu_amd]
 
-    name: "Type=${{ matrix.build_type }} shared=${{ matrix.shared_type }} profiling=${{matrix.profiling}}"
+    runs-on: ${{matrix.device}}
+
+    name: "Type=${{ matrix.build_type }} device=${{matrix.device}} shared=${{ matrix.shared_type }} profiling=${{matrix.profiling}}"
     env:
       BUILD_DIRECTORY : "${{github.workspace}}/build/${{ matrix.build_type }}/shared_${{matrix.shared_type}}/profile_${{matrix.profiling}}"
       INSTALL_DIRECTORY : "${{github.workspace}}/install/${{ matrix.build_type }}/shared_${{matrix.shared_type}}/profile_${{matrix.profiling}}"
+      RUNNER_ENV : github_runner-${{matrix.device}}
+      DEVICE_ENV : ${{matrix.device}}
       BUILD_CONFIG : >
         -G Ninja
         -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
@@ -40,6 +44,7 @@ jobs:
         -DPARSEC_PROF_TRACE=${{ matrix.profiling }}
         -DMPIEXEC_PREFLAGS='--bind-to;none;--oversubscribe'
         -DCMAKE_INSTALL_PREFIX=$INSTALL_DIRECTORY
+        -DPARSEC_REQUIRE_DEVICE_TEST=${{matrix.device}}
 
     steps:
     - uses: actions/checkout@v2
@@ -104,18 +109,22 @@ jobs:
         path: ${{ env.BUILD_DIRECTORY }}/CMakeFiles/CMakeError.log
   release:
     needs: debug
-    runs-on: [self-hosted, Linux]
     strategy:
       fail-fast: false
       matrix:
         build_type : [ Release ]
         shared_type : [ ON ]
         profiling : [ OFF, ON ]
+        device : [cpu, gpu_nvidia, gpu_amd]
+
+    runs-on: ${{matrix.device}}
 
-    name: "Type=${{ matrix.build_type }} shared=${{ matrix.shared_type }} profiling=${{matrix.profiling}}"
+    name: "Type=${{ matrix.build_type }} device=${{matrix.device}} shared=${{ matrix.shared_type }} profiling=${{matrix.profiling}}"
     env:
       BUILD_DIRECTORY : "${{github.workspace}}/build/${{ matrix.build_type }}/shared_${{matrix.shared_type}}/profile_${{matrix.profiling}}"
       INSTALL_DIRECTORY : "${{github.workspace}}/install/${{ matrix.build_type }}/shared_${{matrix.shared_type}}/profile_${{matrix.profiling}}"
+      RUNNER_ENV : github_runner-${{matrix.device}}
+      DEVICE_ENV : ${{matrix.device}}
       BUILD_CONFIG : >
         -G Ninja
         -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
@@ -124,6 +133,7 @@ jobs:
         -DPARSEC_PROF_TRACE=${{ matrix.profiling }}
         -DMPIEXEC_PREFLAGS='--bind-to;none;--oversubscribe'
         -DCMAKE_INSTALL_PREFIX=$INSTALL_DIRECTORY
+        -DPARSEC_REQUIRE_DEVICE_TEST=${{matrix.device}}
 
     steps:
     - uses: actions/checkout@v2
@@ -159,7 +169,14 @@ jobs:
       # The CMake binaries on the Github Actions machines are (as of this writing) 3.12
       run: |
         source ${{github.workspace}}/.github/CI/spack_setup.sh
-        cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG
+        if [ "${{matrix.device}}" == "gpu_amd" ]; then
+          cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG -DPARSEC_GPU_WITH_HIP=ON -DPARSEC_GPU_WITH_CUDA=OFF
+        elif [ "${{matrix.device}}" == "gpu_nvidia" ]; then
+          cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG -DPARSEC_GPU_WITH_HIP=OFF -DPARSEC_GPU_WITH_CUDA=ON
+        else
+          cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG -DPARSEC_GPU_WITH_HIP=OFF -DPARSEC_GPU_WITH_CUDA=OFF
+        fi
+
 
     - name: Build
       working-directory: ${{ env.BUILD_DIRECTORY }}

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -59,6 +59,9 @@ if(POLICY CMP0098)
   # CMP0098: New in version 3.17, FindFLEX runs flex in directory CMAKE_CURRENT_BINARY_DIR when executing.
   cmake_policy(SET CMP0098 NEW)
 endif(POLICY CMP0098)
+if(POLICY CMP0104 AND NOT CUDA_ARCHITECTURES)
+  set(CUDA_ARCHITECTURES OFF)
+endif()
 
 set(CMAKE_NO_SYSTEM_FROM_IMPORTED True)
 # On OSX only find the Apple frameworks is nothing else is available.
@@ -75,6 +78,11 @@ include(CTest)
 # ccmake tunable parameters
 #####
 
+# CTest related options
+set(PARSEC_REQUIRE_DEVICE_TEST "NONE" CACHE STRING "Make tests fail if specified device support is disabled (default NONE, valid values are HIP or amd, CUDA or nvidia, or NONE or cpu). The intended use is to ensure that device tests are passed in CI, and avoid failing silently if there is no GPU on the target system.")
+set_property(CACHE PARSEC_REQUIRE_DEVICE_TEST PROPERTY STRINGS "NONE" "HIP" "CUDA" "cpu" "gpu_amd" "gpu_nvidia")
+mark_as_advanced(PARSEC_REQUIRE_DEVICE_TEST)
+
 ## Check for the support of additional languages and capabilities
 option(SUPPORT_FORTRAN
        "Enable support for Fortran bindings (default ON)" ON)
@@ -123,6 +131,9 @@ mark_as_advanced(BUILD_PARSEC)
 ### Misc options
 option(BUILD_SHARED_LIBS
     "Build shared libraries" ON)
+if(BUILD_SHARED_LIBS)
+  set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+endif(BUILD_SHARED_LIBS)
 option(BUILD_64bits
   "Build 64 bits mode" ON)
 if(NOT CMAKE_BUILD_TYPE)
@@ -717,16 +728,16 @@ int main(int argc, char *argv[]) {
       if(CMAKE_CUDA_COMPILER)
         enable_language(CUDA)
       endif(CMAKE_CUDA_COMPILER)
+      cmake_pop_check_state()
     endif (CUDAToolkit_FOUND)
     set(PARSEC_HAVE_CU_COMPILER ${CMAKE_CUDA_COMPILER} CACHE BOOL "True if PaRSEC provide support for compiling .cu files")
   endif( PARSEC_GPU_WITH_CUDA )
 
   if( PARSEC_GPU_WITH_HIP )
     # This is kinda ugly but the PATH and HINTS don't get transmitted to sub-dependents
     set(CMAKE_SYSTEM_PREFIX_PATH_save ${CMAKE_SYSTEM_PREFIX_PATH})
-    list(APPEND CMAKE_SYSTEM_PREFIX_PATH /opt/rocm)
-    find_package(HIP 5 QUIET) #quiet because hip-config.cmake is not part of core-cmake and will spam a loud warning when hip/rocm is not installed
-    set(CMAKE_SYSTEM_PREFIX_PATH ${CMAKE_SYSTEM_PREFIX_PATH_save})
+    list(APPEND CMAKE_SYSTEM_PREFIX_PATH $ENV{ROCM_PATH}/lib/cmake)
+    find_package(HIP QUIET) #quiet because hip-config.cmake is not part of core-cmake and will spam a loud warning when hip/rocm is not installed
     if(HIP_FOUND AND PARSEC_HAVE_CUDA)
       # the underlying reason is that the generated ptg code cannot include at the same time
       # cuda_runtime.h and hip_runtime.h, so we need to modify the dev_cuda.h to not expose any
@@ -738,6 +749,8 @@ int main(int argc, char *argv[]) {
       get_target_property(extra_hip_libs hip::host INTERFACE_LINK_LIBRARIES)
       list(APPEND EXTRA_LIBS ${extra_hip_libs})
       set(HIP_NOT_CUDA_FOUND TRUE)
+      enable_language(HIP)
+      set(CMAKE_SYSTEM_PREFIX_PATH ${CMAKE_SYSTEM_PREFIX_PATH_save})
     else()
       set(HIP_NOT_CUDA_FOUND FALSE)
     endif()
@@ -747,8 +760,8 @@ int main(int argc, char *argv[]) {
   if( PARSEC_GPU_WITH_LEVEL_ZERO )
     find_package(level-zero)
     find_package(DPCPP)
-    set(PARSEC_HAVE_LEVEL_ZERO ${LEVEL_ZERO_FOUND} CACHE BOOL "True if PaRSEC provide support for Intel level-zero")
     if (LEVEL_ZERO_FOUND AND PARSEC_HAVE_DPCPP)
+      set(PARSEC_HAVE_LEVEL_ZERO ${LEVEL_ZERO_FOUND} CACHE BOOL "True if PaRSEC provide support for Intel level-zero")
       include_directories("${LEVEL_ZERO_INCLUDE_DIR}/level_zero/")
       set(PARSEC_HAVE_LEVEL_ZERO ${LEVEL_ZERO_FOUND} CACHE BOOL "True if PaRSEC provide support for Intel Level Zero")
       message(STATUS "Found Intel level-zero ${LEVEL_ZERO_VERSION} in -I${LEVEL_ZERO_INCLUDE_DIR} / -L${LEVEL_ZERO_LIBRARY_DIR}")
@@ -939,6 +952,7 @@ add_subdirectory(parsec)
 # Add dependency to Level-Zero if it is enabled
 #
 if(PARSEC_HAVE_LEVEL_ZERO)
+    message(STATUS "parsec depends on ze_loader")
     target_link_libraries(parsec PRIVATE level_zero::ze_loader)
 endif(PARSEC_HAVE_LEVEL_ZERO)
 

diff --git a/parsec/interfaces/dtd/insert_function.c b/parsec/interfaces/dtd/insert_function.c
@@ -41,6 +41,9 @@
 #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
 #include "parsec/mca/device/cuda/device_cuda.h"
 #endif  /* defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) */
+#if defined(PARSEC_HAVE_DEV_HIP_SUPPORT)
+#include "parsec/mca/device/hip/device_hip.h"
+#endif  /* defined(PARSEC_HAVE_DEV_HIP_SUPPORT) */
 
 #include "parsec/mca/mca_repository.h"
 #include "parsec/constants.h"
@@ -1491,9 +1494,8 @@ parsec_dtd_startup(parsec_context_t *context,
         parsec_device_module_t *device = parsec_mca_device_get(_i);
         if( NULL == device ) continue;
         if( !(tp->devices_index_mask & (1 << device->device_index))) continue;  /* not supported */
-        // If CUDA is enabled, let the CUDA device activated for this
-        // taskpool.
-        if( PARSEC_DEV_CUDA == device->type ) continue;
+        // If a GPU is enabled, let the device be activated for this taskpool.
+        if( PARSEC_DEV_IS_GPU(device->type) ) continue;
         if( NULL != device->taskpool_register )
             if( PARSEC_SUCCESS !=
                 device->taskpool_register(device, (parsec_taskpool_t *)tp)) {
@@ -2327,7 +2329,7 @@ static parsec_hook_return_t parsec_dtd_gpu_task_submit(parsec_execution_stream_t
     }
 
     parsec_device_module_t *device = parsec_mca_device_get(dev_index);
-    assert(NULL != device);
+     assert(NULL != device);
     /* We already know the device is a GPU device from the test above */
     gpu_task->stage_in  = parsec_default_gpu_stage_in;
     gpu_task->stage_out = parsec_default_gpu_stage_out;
@@ -2400,7 +2402,7 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp,
     }
 
     incarnations[i].type = device_type;
-    if(PARSEC_DEV_CUDA == device_type) {
+    if(PARSEC_DEV_IS_GPU(device_type)) {
         incarnations[i].hook = parsec_dtd_gpu_task_submit;
         dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)function;
     }
@@ -2998,11 +3000,11 @@ parsec_insert_dtd_task(parsec_task_t *__this_task)
                         FLOW_OF(last_user.task, last_user.flow_index)->flags &= ~RELEASE_OWNERSHIP_SPECIAL;
 
                         if( this_task->super.data[flow_index].data_in != NULL) {
-/* #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) */
+/* #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) || defined(PARSEC_HAVE_DEV_HIP_SUPPORT) */
 /*                            parsec_atomic_lock(&this_task->super.data[flow_index].data_in->original->lock); */
 /* #endif */
                             (void)parsec_atomic_fetch_dec_int32(&this_task->super.data[flow_index].data_in->readers);
-/* #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) */
+/* #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) || defined(PARSEC_HAVE_DEV_HIP_SUPPORT) */
 /*                            parsec_atomic_unlock(&this_task->super.data[flow_index].data_in->original->lock); */
 /* #endif */
                         }
@@ -3287,8 +3289,8 @@ __parsec_dtd_taskpool_create_task(parsec_taskpool_t *tp,
 
             __parsec_chore_t **incarnations = (__parsec_chore_t **)&tc->incarnations;
             (*incarnations)[0].type = device_type;
-            if( device_type == PARSEC_DEV_CUDA ) {
-                /* Special case for CUDA: we need an intermediate */
+            if( PARSEC_DEV_IS_GPU(device_type) ) {
+                /* Special case for GPUs: we need an intermediate */
                 (*incarnations)[0].hook = parsec_dtd_gpu_task_submit;
                 dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)fpointer;
             }

diff --git a/parsec/interfaces/dtd/insert_function_internal.h b/parsec/interfaces/dtd/insert_function_internal.h
@@ -21,10 +21,6 @@
 #include "parsec/execution_stream.h"
 #include "parsec/mca/device/device_gpu.h"
 
-#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
-#include "parsec/mca/device/cuda/device_cuda.h"
-#endif /* PARSEC_HAVE_DEV_CUDA_SUPPORT */
-
 BEGIN_C_DECLS
 
 #define PARSEC_DTD_NB_TASK_CLASSES  25 /*< Max number of task classes allowed */

diff --git a/tests/dsl/dtd/dtd_test_cuda_task_insert.c b/tests/dsl/dtd/dtd_test_cuda_task_insert.c
@@ -4,6 +4,7 @@
 #include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"
 #include "parsec/interfaces/dtd/insert_function_internal.h"
 #include "tests/tests_data.h"
+#include "parsec/mca/device/cuda/device_cuda_internal.h"
 
 #if defined(PARSEC_HAVE_MPI)
 #include <mpi.h>

diff --git a/tests/dsl/dtd/dtd_test_new_tile.c b/tests/dsl/dtd/dtd_test_new_tile.c
@@ -9,6 +9,9 @@
 #include "tests/tests_timing.h"
 #include "parsec/interfaces/dtd/insert_function_internal.h"
 #include "parsec/utils/debug.h"
+#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
+#include "parsec/mca/device/cuda/device_cuda_internal.h"
+#endif
 
 #if defined(PARSEC_HAVE_STRING_H)
 #include <string.h>

diff --git a/tests/runtime/CMakeLists.txt b/tests/runtime/CMakeLists.txt
@@ -1,5 +1,5 @@
 add_subdirectory(scheduling)
-add_Subdirectory(cuda)
+add_Subdirectory(gpu)
 
 if( MPI_C_FOUND )
   parsec_addtest_executable(C multichain)

diff --git a/tests/runtime/Testings.cmake b/tests/runtime/Testings.cmake
@@ -1,2 +1,2 @@
 include(runtime/scheduling/Testings.cmake)
-include(runtime/cuda/Testings.cmake)
+include(runtime/gpu/Testings.cmake)
diff --git a/tests/runtime/cuda/Testings.cmake b/tests/runtime/cuda/Testings.cmake
diff --git a/tests/runtime/cuda/CMakeLists.txt → tests/runtime/gpu/CMakeLists.txt b/tests/runtime/cuda/CMakeLists.txt → tests/runtime/gpu/CMakeLists.txt
@@ -25,4 +25,24 @@ if(PARSEC_HAVE_CUDA)
   parsec_addtest_executable(C testing_get_best_device SOURCES "testing_get_best_device.c")
   target_include_directories(testing_get_best_device PRIVATE $<$<NOT:${PARSEC_BUILD_INPLACE}>:${CMAKE_CURRENT_SOURCE_DIR}>)
   target_ptg_sources(testing_get_best_device PRIVATE "get_best_device_check.jdf")
+
+  if(CMAKE_CUDA_COMPILER)
+    set_source_files_properties(ping_kernel.cu PROPERTIES LANGUAGE CUDA)
+    parsec_addtest_executable(C dtd_pingpong SOURCES dtd_pingpong.c)
+    target_sources(dtd_pingpong PRIVATE ping_kernel.cu)
+
+    parsec_addtest_executable(C ptg_pingpong SOURCES ping_kernel.cu)
+    target_ptg_sources(ptg_pingpong PRIVATE "ptg_pingpong.jdf")
+  endif(CMAKE_CUDA_COMPILER)
 endif(PARSEC_HAVE_CUDA)
+
+if(PARSEC_HAVE_HIP)
+  if(CMAKE_HIP_COMPILER)
+    include(ParsecCompilePTG)
+    set_source_files_properties(ping_kernel.hip.c PROPERTIES LANGUAGE HIP)
+    parsec_addtest_executable(C dtd_pingpong SOURCES dtd_pingpong.c ping_kernel.hip.c)
+
+    parsec_addtest_executable(C ptg_pingpong SOURCES ping_kernel.hip.c)
+    target_ptg_sources(ptg_pingpong PRIVATE "ptg_pingpong.jdf")
+  endif(CMAKE_HIP_COMPILER)
+endif(PARSEC_HAVE_HIP)