Skip to content

Commit

Permalink
Add a simple ping-pong test for GPU accelerators
Browse files Browse the repository at this point in the history
Make a token pass from the CPU to each GPU and back, a few times, to check for a possible bug found by @devreal.

Part of the DTD interface was not fully ported to HIP

Enable (cuda|hip)_pingpong test in CI

Add a PTG GPU pingpong test to compare with the behavior in DTD -- Work in progress

Tests need to import the appropriate GPU-specific header file, as insert_function_internal.h doesn't do it for them anymore

Enable PTG test over CUDA

Fix errors in data distribution initialization and some DAG errors in the PTG of the GPU pingpong test

Rename files and directories to match the new status of the tests (tests/runtime/cuda is renamed to tests/runtime/gpu, and the pingpong tests are named after the API rather than a particular device, since they should work on both GPU types)

Only define the pingpong tests if a suitable compiler is found for the kernels

Do a ping-pong-pong test instead of ping-pong, to see how GPU-to-GPU task dependencies are tracked

Fix the checks of the pingpong test, and add it in the Testings.cmake

PTG ping-pong test: in order to guide the selection of the best device, the advised data needs to flow from a CPU task, not directly from memory.

Trying to introduce the gpu_nvidia runner in the CI matrix

Add ROCm, create one github_runner-[device].yaml file per device; remove debugging info from CMakeLists.txt

Add some infrastructure to make sure CI does the device tests where it should, and issue an error if things cannot be tested (e.g. because the GPUs are down or the compiler/spack is broken)

Trying to work around the xml2 issue with mesa.

Signed-off-by: George Bosilca <bosilca@icl.utk.edu>

Integrate the gpu_amd/release configuration into the test suite

Add support for rocm-smi in check_nb_devices.sh

Use a conditional CMake command, depending on which GitHub runner is loaded, to prepare for testing
  • Loading branch information
therault committed Nov 6, 2023
1 parent 92da8b6 commit c644262
Show file tree
Hide file tree
Showing 32 changed files with 850 additions and 38 deletions.
27 changes: 27 additions & 0 deletions .github/CI/github_runner-cpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Spack environment used by the "cpu" GitHub runner.
spack:
  definitions:
  # Packages installed in the environment; referenced below as $pkgs.
  - pkgs:
    - gcc@12.1.0
    - git
    - patch
    - flex
    - bison
    - hwloc
    - unzip
    - python@3
    - py-pip
    - py-pandas
    - py-matplotlib
    - py-tables
    - py-networkx
    - py-cython
    - py-wheel
    - cmake
    - ninja
    - otf2@2.3
    - openmpi

  view: true
  specs:
  - matrix:
    - [$pkgs]
29 changes: 29 additions & 0 deletions .github/CI/github_runner-gpu_amd.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Spack environment used by the "gpu_amd" GitHub runner (HIP toolchain).
spack:
  definitions:
  # Packages installed in the environment; referenced below as $pkgs.
  - pkgs:
    - gcc@11.3.0
    - git
    - hip
    - patch
    - flex
    - bison
    - libxml2
    - hwloc
    - unzip
    - python@3
    - py-pip
    - py-pandas
    - py-matplotlib
    - py-tables
    - py-networkx
    - py-cython
    - py-wheel
    - cmake
    - ninja
    - otf2@2.3
    - openmpi

  view: true
  specs:
  - matrix:
    - [$pkgs]
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
spack:
definitions:
- pkgs:
- gcc@12.1.0
- gcc@11.3.0
- git
- cuda@12
- patch
- flex
- bison
Expand Down
27 changes: 22 additions & 5 deletions .github/workflows/build_cmake.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,22 @@ env:

jobs:
debug:
runs-on: [self-hosted, Linux]
strategy:
fail-fast: false
matrix:
build_type : [ Debug ]
shared_type : [ OFF, ON ]
profiling : [ ON ]
device : [cpu, gpu_nvidia, gpu_amd]

name: "Type=${{ matrix.build_type }} shared=${{ matrix.shared_type }} profiling=${{matrix.profiling}}"
runs-on: ${{matrix.device}}

name: "Type=${{ matrix.build_type }} device=${{matrix.device}} shared=${{ matrix.shared_type }} profiling=${{matrix.profiling}}"
env:
BUILD_DIRECTORY : "${{github.workspace}}/build/${{ matrix.build_type }}/shared_${{matrix.shared_type}}/profile_${{matrix.profiling}}"
INSTALL_DIRECTORY : "${{github.workspace}}/install/${{ matrix.build_type }}/shared_${{matrix.shared_type}}/profile_${{matrix.profiling}}"
RUNNER_ENV : github_runner-${{matrix.device}}
DEVICE_ENV : ${{matrix.device}}
BUILD_CONFIG : >
-G Ninja
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
Expand All @@ -40,6 +44,7 @@ jobs:
-DPARSEC_PROF_TRACE=${{ matrix.profiling }}
-DMPIEXEC_PREFLAGS='--bind-to;none;--oversubscribe'
-DCMAKE_INSTALL_PREFIX=$INSTALL_DIRECTORY
-DPARSEC_REQUIRE_DEVICE_TEST=${{matrix.device}}
steps:
- uses: actions/checkout@v2
Expand Down Expand Up @@ -104,18 +109,22 @@ jobs:
path: ${{ env.BUILD_DIRECTORY }}/CMakeFiles/CMakeError.log
release:
needs: debug
runs-on: [self-hosted, Linux]
strategy:
fail-fast: false
matrix:
build_type : [ Release ]
shared_type : [ ON ]
profiling : [ OFF, ON ]
device : [cpu, gpu_nvidia, gpu_amd]

runs-on: ${{matrix.device}}

name: "Type=${{ matrix.build_type }} shared=${{ matrix.shared_type }} profiling=${{matrix.profiling}}"
name: "Type=${{ matrix.build_type }} device=${{matrix.device}} shared=${{ matrix.shared_type }} profiling=${{matrix.profiling}}"
env:
BUILD_DIRECTORY : "${{github.workspace}}/build/${{ matrix.build_type }}/shared_${{matrix.shared_type}}/profile_${{matrix.profiling}}"
INSTALL_DIRECTORY : "${{github.workspace}}/install/${{ matrix.build_type }}/shared_${{matrix.shared_type}}/profile_${{matrix.profiling}}"
RUNNER_ENV : github_runner-${{matrix.device}}
DEVICE_ENV : ${{matrix.device}}
BUILD_CONFIG : >
-G Ninja
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
Expand All @@ -124,6 +133,7 @@ jobs:
-DPARSEC_PROF_TRACE=${{ matrix.profiling }}
-DMPIEXEC_PREFLAGS='--bind-to;none;--oversubscribe'
-DCMAKE_INSTALL_PREFIX=$INSTALL_DIRECTORY
-DPARSEC_REQUIRE_DEVICE_TEST=${{matrix.device}}
steps:
- uses: actions/checkout@v2
Expand Down Expand Up @@ -159,7 +169,14 @@ jobs:
# The CMake binaries on the Github Actions machines are (as of this writing) 3.12
run: |
source ${{github.workspace}}/.github/CI/spack_setup.sh
cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG
if [ "${{matrix.device}}" == "gpu_amd" ]; then
cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG -DPARSEC_GPU_WITH_HIP=ON -DPARSEC_GPU_WITH_CUDA=OFF
elif [ "${{matrix.device}}" == "gpu_nvidia" ]; then
cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG -DPARSEC_GPU_WITH_HIP=OFF -DPARSEC_GPU_WITH_CUDA=ON
else
cmake $GITHUB_WORKSPACE -DCMAKE_BUILD_TYPE=$BUILD_TYPE $BUILD_CONFIG -DPARSEC_GPU_WITH_HIP=OFF -DPARSEC_GPU_WITH_CUDA=OFF
fi
- name: Build
working-directory: ${{ env.BUILD_DIRECTORY }}
Expand Down
22 changes: 18 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ if(POLICY CMP0098)
# CMP0098: New in version 3.17, FindFLEX runs flex in directory CMAKE_CURRENT_BINARY_DIR when executing.
cmake_policy(SET CMP0098 NEW)
endif(POLICY CMP0098)
# CMP0104 (CMake >= 3.18) expects every CUDA target to carry a non-empty
# CUDA_ARCHITECTURES property. That property is initialized from the
# CMAKE_CUDA_ARCHITECTURES variable; a plain CUDA_ARCHITECTURES variable is
# never read by CMake. Default it to OFF (no architecture flags are generated,
# the CUDA compiler uses its own default) unless an architecture was already
# selected through the variable or through the CUDAARCHS environment variable.
if(POLICY CMP0104
   AND NOT DEFINED CMAKE_CUDA_ARCHITECTURES
   AND "$ENV{CUDAARCHS}" STREQUAL "")
  set(CMAKE_CUDA_ARCHITECTURES OFF)
endif()

set(CMAKE_NO_SYSTEM_FROM_IMPORTED True)
# On OSX only find the Apple frameworks if nothing else is available.
Expand All @@ -75,6 +78,11 @@ include(CTest)
# ccmake tunable parameters
#####

# CTest related options
# PARSEC_REQUIRE_DEVICE_TEST names the device whose tests must be available.
# The help text lists the same spellings as the STRINGS property below, so that
# the values offered by cmake-gui/ccmake and the documentation agree.
set(PARSEC_REQUIRE_DEVICE_TEST "NONE" CACHE STRING "Make tests fail if specified device support is disabled (default NONE, valid values are HIP or gpu_amd, CUDA or gpu_nvidia, or NONE or cpu). The intended use is to ensure that device tests are passed in CI, and avoid failing silently if there is no GPU on the target system.")
set_property(CACHE PARSEC_REQUIRE_DEVICE_TEST PROPERTY STRINGS "NONE" "HIP" "CUDA" "cpu" "gpu_amd" "gpu_nvidia")
mark_as_advanced(PARSEC_REQUIRE_DEVICE_TEST)

## Check for the support of additional languages and capabilities
option(SUPPORT_FORTRAN
"Enable support for Fortran bindings (default ON)" ON)
Expand Down Expand Up @@ -123,6 +131,9 @@ mark_as_advanced(BUILD_PARSEC)
### Misc options
option(BUILD_SHARED_LIBS
"Build shared libraries" ON)
# A shared build is requested: make position-independent code the default
# for the targets created after this point.
if(BUILD_SHARED_LIBS)
  set(CMAKE_POSITION_INDEPENDENT_CODE ON)
endif()
option(BUILD_64bits
"Build 64 bits mode" ON)
if(NOT CMAKE_BUILD_TYPE)
Expand Down Expand Up @@ -717,16 +728,16 @@ int main(int argc, char *argv[]) {
if(CMAKE_CUDA_COMPILER)
enable_language(CUDA)
endif(CMAKE_CUDA_COMPILER)
cmake_pop_check_state()
endif (CUDAToolkit_FOUND)
set(PARSEC_HAVE_CU_COMPILER ${CMAKE_CUDA_COMPILER} CACHE BOOL "True if PaRSEC provide support for compiling .cu files")
endif( PARSEC_GPU_WITH_CUDA )

if( PARSEC_GPU_WITH_HIP )
# This is kinda ugly but the PATH and HINTS don't get transmitted to sub-dependents
set(CMAKE_SYSTEM_PREFIX_PATH_save ${CMAKE_SYSTEM_PREFIX_PATH})
list(APPEND CMAKE_SYSTEM_PREFIX_PATH /opt/rocm)
find_package(HIP 5 QUIET) #quiet because hip-config.cmake is not part of core-cmake and will spam a loud warning when hip/rocm is not installed
set(CMAKE_SYSTEM_PREFIX_PATH ${CMAKE_SYSTEM_PREFIX_PATH_save})
list(APPEND CMAKE_SYSTEM_PREFIX_PATH $ENV{ROCM_PATH}/lib/cmake)
find_package(HIP QUIET) #quiet because hip-config.cmake is not part of core-cmake and will spam a loud warning when hip/rocm is not installed
if(HIP_FOUND AND PARSEC_HAVE_CUDA)
# the underlying reason is that the generated ptg code cannot include at the same time
# cuda_runtime.h and hip_runtime.h, so we need to modify the dev_cuda.h to not expose any
Expand All @@ -738,6 +749,8 @@ int main(int argc, char *argv[]) {
get_target_property(extra_hip_libs hip::host INTERFACE_LINK_LIBRARIES)
list(APPEND EXTRA_LIBS ${extra_hip_libs})
set(HIP_NOT_CUDA_FOUND TRUE)
enable_language(HIP)
set(CMAKE_SYSTEM_PREFIX_PATH ${CMAKE_SYSTEM_PREFIX_PATH_save})
else()
set(HIP_NOT_CUDA_FOUND FALSE)
endif()
Expand All @@ -747,8 +760,8 @@ int main(int argc, char *argv[]) {
if( PARSEC_GPU_WITH_LEVEL_ZERO )
find_package(level-zero)
find_package(DPCPP)
set(PARSEC_HAVE_LEVEL_ZERO ${LEVEL_ZERO_FOUND} CACHE BOOL "True if PaRSEC provide support for Intel level-zero")
if (LEVEL_ZERO_FOUND AND PARSEC_HAVE_DPCPP)
set(PARSEC_HAVE_LEVEL_ZERO ${LEVEL_ZERO_FOUND} CACHE BOOL "True if PaRSEC provide support for Intel level-zero")
include_directories("${LEVEL_ZERO_INCLUDE_DIR}/level_zero/")
set(PARSEC_HAVE_LEVEL_ZERO ${LEVEL_ZERO_FOUND} CACHE BOOL "True if PaRSEC provide support for Intel Level Zero")
message(STATUS "Found Intel level-zero ${LEVEL_ZERO_VERSION} in -I${LEVEL_ZERO_INCLUDE_DIR} / -L${LEVEL_ZERO_LIBRARY_DIR}")
Expand Down Expand Up @@ -939,6 +952,7 @@ add_subdirectory(parsec)
# Add dependency to Level-Zero if it is enabled
#
if(PARSEC_HAVE_LEVEL_ZERO)
  # Report the extra dependency at configure time, then link the Level-Zero
  # loader privately into the parsec library.
  message(STATUS "parsec depends on ze_loader")
  target_link_libraries(parsec PRIVATE level_zero::ze_loader)
endif()

Expand Down
20 changes: 11 additions & 9 deletions parsec/interfaces/dtd/insert_function.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@
#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
#include "parsec/mca/device/cuda/device_cuda.h"
#endif /* defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) */
#if defined(PARSEC_HAVE_DEV_HIP_SUPPORT)
#include "parsec/mca/device/hip/device_hip.h"
#endif /* defined(PARSEC_HAVE_DEV_HIP_SUPPORT) */

#include "parsec/mca/mca_repository.h"
#include "parsec/constants.h"
Expand Down Expand Up @@ -1491,9 +1494,8 @@ parsec_dtd_startup(parsec_context_t *context,
parsec_device_module_t *device = parsec_mca_device_get(_i);
if( NULL == device ) continue;
if( !(tp->devices_index_mask & (1 << device->device_index))) continue; /* not supported */
// If CUDA is enabled, let the CUDA device activated for this
// taskpool.
if( PARSEC_DEV_CUDA == device->type ) continue;
// If a GPU is enabled, let the device be activated for this taskpool.
if( PARSEC_DEV_IS_GPU(device->type) ) continue;
if( NULL != device->taskpool_register )
if( PARSEC_SUCCESS !=
device->taskpool_register(device, (parsec_taskpool_t *)tp)) {
Expand Down Expand Up @@ -2327,7 +2329,7 @@ static parsec_hook_return_t parsec_dtd_gpu_task_submit(parsec_execution_stream_t
}

parsec_device_module_t *device = parsec_mca_device_get(dev_index);
assert(NULL != device);
assert(NULL != device);
/* We already know the device is a GPU device from the test above */
gpu_task->stage_in = parsec_default_gpu_stage_in;
gpu_task->stage_out = parsec_default_gpu_stage_out;
Expand Down Expand Up @@ -2400,7 +2402,7 @@ int parsec_dtd_task_class_add_chore(parsec_taskpool_t *tp,
}

incarnations[i].type = device_type;
if(PARSEC_DEV_CUDA == device_type) {
if(PARSEC_DEV_IS_GPU(device_type)) {
incarnations[i].hook = parsec_dtd_gpu_task_submit;
dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)function;
}
Expand Down Expand Up @@ -2998,11 +3000,11 @@ parsec_insert_dtd_task(parsec_task_t *__this_task)
FLOW_OF(last_user.task, last_user.flow_index)->flags &= ~RELEASE_OWNERSHIP_SPECIAL;

if( this_task->super.data[flow_index].data_in != NULL) {
/* #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) */
/* #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) || defined(PARSEC_HAVE_DEV_HIP_SUPPORT) */
/* parsec_atomic_lock(&this_task->super.data[flow_index].data_in->original->lock); */
/* #endif */
(void)parsec_atomic_fetch_dec_int32(&this_task->super.data[flow_index].data_in->readers);
/* #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) */
/* #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) || defined(PARSEC_HAVE_DEV_HIP_SUPPORT) */
/* parsec_atomic_unlock(&this_task->super.data[flow_index].data_in->original->lock); */
/* #endif */
}
Expand Down Expand Up @@ -3287,8 +3289,8 @@ __parsec_dtd_taskpool_create_task(parsec_taskpool_t *tp,

__parsec_chore_t **incarnations = (__parsec_chore_t **)&tc->incarnations;
(*incarnations)[0].type = device_type;
if( device_type == PARSEC_DEV_CUDA ) {
/* Special case for CUDA: we need an intermediate */
if( PARSEC_DEV_IS_GPU(device_type) ) {
/* Special case for GPUs: we need an intermediate */
(*incarnations)[0].hook = parsec_dtd_gpu_task_submit;
dtd_tc->gpu_func_ptr = (parsec_advance_task_function_t)fpointer;
}
Expand Down
4 changes: 0 additions & 4 deletions parsec/interfaces/dtd/insert_function_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,6 @@
#include "parsec/execution_stream.h"
#include "parsec/mca/device/device_gpu.h"

#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
#include "parsec/mca/device/cuda/device_cuda.h"
#endif /* PARSEC_HAVE_DEV_CUDA_SUPPORT */

BEGIN_C_DECLS

#define PARSEC_DTD_NB_TASK_CLASSES 25 /*< Max number of task classes allowed */
Expand Down
1 change: 1 addition & 0 deletions tests/dsl/dtd/dtd_test_cuda_task_insert.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "parsec/data_dist/matrix/two_dim_rectangle_cyclic.h"
#include "parsec/interfaces/dtd/insert_function_internal.h"
#include "tests/tests_data.h"
#include "parsec/mca/device/cuda/device_cuda_internal.h"

#if defined(PARSEC_HAVE_MPI)
#include <mpi.h>
Expand Down
3 changes: 3 additions & 0 deletions tests/dsl/dtd/dtd_test_new_tile.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
#include "tests/tests_timing.h"
#include "parsec/interfaces/dtd/insert_function_internal.h"
#include "parsec/utils/debug.h"
#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT)
#include "parsec/mca/device/cuda/device_cuda_internal.h"
#endif

#if defined(PARSEC_HAVE_STRING_H)
#include <string.h>
Expand Down
2 changes: 1 addition & 1 deletion tests/runtime/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
add_subdirectory(scheduling)
add_Subdirectory(cuda)
add_Subdirectory(gpu)

if( MPI_C_FOUND )
parsec_addtest_executable(C multichain)
Expand Down
2 changes: 1 addition & 1 deletion tests/runtime/Testings.cmake
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
include(runtime/scheduling/Testings.cmake)
include(runtime/cuda/Testings.cmake)
include(runtime/gpu/Testings.cmake)
13 changes: 0 additions & 13 deletions tests/runtime/cuda/Testings.cmake

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,24 @@ if(PARSEC_HAVE_CUDA)
parsec_addtest_executable(C testing_get_best_device SOURCES "testing_get_best_device.c")
target_include_directories(testing_get_best_device PRIVATE $<$<NOT:${PARSEC_BUILD_INPLACE}>:${CMAKE_CURRENT_SOURCE_DIR}>)
target_ptg_sources(testing_get_best_device PRIVATE "get_best_device_check.jdf")

# The ping-pong tests include a GPU kernel source (ping_kernel.cu), so they are
# only defined when a CUDA compiler is available.
if(CMAKE_CUDA_COMPILER)
# Compile the kernel source with the CUDA language.
set_source_files_properties(ping_kernel.cu PROPERTIES LANGUAGE CUDA)
# dtd_pingpong: C driver (dtd_pingpong.c) plus the CUDA kernel.
parsec_addtest_executable(C dtd_pingpong SOURCES dtd_pingpong.c)
target_sources(dtd_pingpong PRIVATE ping_kernel.cu)

# ptg_pingpong: the CUDA kernel plus the PTG source ptg_pingpong.jdf.
parsec_addtest_executable(C ptg_pingpong SOURCES ping_kernel.cu)
target_ptg_sources(ptg_pingpong PRIVATE "ptg_pingpong.jdf")
endif(CMAKE_CUDA_COMPILER)
endif(PARSEC_HAVE_CUDA)

# HIP flavor of the GPU ping-pong tests: defined only when PaRSEC has HIP
# support and a HIP compiler is available to build the kernel source.
# NOTE(review): the CUDA branch defines targets with the same names; presumably
# CUDA and HIP support are never enabled together -- confirm.
if(PARSEC_HAVE_HIP AND CMAKE_HIP_COMPILER)
  include(ParsecCompilePTG)

  # The kernel lives in a .c file, so request the HIP language explicitly.
  set_source_files_properties(ping_kernel.hip.c PROPERTIES LANGUAGE HIP)

  # dtd_pingpong: C driver plus the HIP kernel.
  parsec_addtest_executable(C dtd_pingpong SOURCES dtd_pingpong.c ping_kernel.hip.c)

  # ptg_pingpong: the HIP kernel plus the PTG source ptg_pingpong.jdf.
  parsec_addtest_executable(C ptg_pingpong SOURCES ping_kernel.hip.c)
  target_ptg_sources(ptg_pingpong PRIVATE "ptg_pingpong.jdf")
endif()
Loading

0 comments on commit c644262

Please sign in to comment.