Conversation
```cmake
# ---------------------------------------------------------------------
# nccl setup
# ---------------------------------------------------------------------
if(GHEX_USE_NCCL)
    link_libraries("-lnccl")
    # include_directories("")
endif()
```
This is a hack. Add a proper FindNCCL.cmake module.
This can be tested with the icon uenv by manually setting `export LIBRARY_PATH=/user-environment/env/default/lib64:/user-environment/env/default/lib`.
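A minimal sketch of what such a module could look like, assuming the conventional `NCCL_ROOT` hint variable and an `NCCL::nccl` imported target (neither exists in GHEX yet):

```cmake
# FindNCCL.cmake (sketch): locate the NCCL header and library and expose
# them as an imported target. Hint variable and target name are assumptions.
find_path(NCCL_INCLUDE_DIR nccl.h
    HINTS ${NCCL_ROOT} ENV NCCL_ROOT
    PATH_SUFFIXES include)
find_library(NCCL_LIBRARY nccl
    HINTS ${NCCL_ROOT} ENV NCCL_ROOT
    PATH_SUFFIXES lib lib64)

include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_LIBRARY NCCL_INCLUDE_DIR)

if(NCCL_FOUND AND NOT TARGET NCCL::nccl)
    add_library(NCCL::nccl UNKNOWN IMPORTED)
    set_target_properties(NCCL::nccl PROPERTIES
        IMPORTED_LOCATION "${NCCL_LIBRARY}"
        INTERFACE_INCLUDE_DIRECTORIES "${NCCL_INCLUDE_DIR}")
endif()
```

The global `link_libraries("-lnccl")` hack above could then be replaced with a scoped `target_link_libraries(... NCCL::nccl)`.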
```cpp
struct cuda_event {
    cudaEvent_t m_event;
    ghex::util::moved_bit m_moved;

    cuda_event() {
        GHEX_CHECK_CUDA_RESULT(cudaEventCreateWithFlags(&m_event, cudaEventDisableTiming))
    }
    cuda_event(const cuda_event&) = delete;
    cuda_event& operator=(const cuda_event&) = delete;
    cuda_event(cuda_event&& other) = default;
    cuda_event& operator=(cuda_event&&) = default;

    ~cuda_event()
    {
        if (!m_moved)
        {
            GHEX_CHECK_CUDA_RESULT_NO_THROW(cudaEventDestroy(m_event))
        }
    }

    operator bool() const noexcept { return m_moved; }
    operator cudaEvent_t() const noexcept { return m_event; }
    cudaEvent_t& get() noexcept { return m_event; }
    const cudaEvent_t& get() const noexcept { return m_event; }
};
```
```cpp
ghex::util::moved_bit m_moved;
bool m_valid;
communicator_type m_comm;
#ifdef GHEX_USE_NCCL
ncclComm_t m_nccl_comm;
#endif
```
Move this to oomph.
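For reference, the usual way such an NCCL communicator is bootstrapped from an existing MPI communicator (a standard NCCL pattern; `make_nccl_comm` is an assumed helper name, not code from this PR):

```cpp
#include <mpi.h>
#include <nccl.h>

// Rank 0 creates a unique id, broadcasts it over MPI, then every rank
// joins the NCCL communicator with its MPI rank.
ncclComm_t make_nccl_comm(MPI_Comm mpi_comm)
{
    int rank, size;
    MPI_Comm_rank(mpi_comm, &rank);
    MPI_Comm_size(mpi_comm, &size);

    ncclUniqueId id;
    if (rank == 0) ncclGetUniqueId(&id);
    MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, mpi_comm);

    ncclComm_t nccl_comm;
    ncclCommInitRank(&nccl_comm, size, id, rank);
    return nccl_comm;
}
```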
```cpp
private:
    template<typename... Archs, typename... Fields>
    void nccl_exchange_impl(buffer_info_type<Archs, Fields>... buffer_infos) {
        pack_nccl();

        ncclGroupStart();
        post_sends_nccl();
        post_recvs_nccl();
        ncclGroupEnd();

        unpack_nccl();
    }
```
Add a customization point or similar to allow doing this with NCCL?
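For context, a hedged sketch of how the grouped posting could look at the NCCL level; `halo`, `post_exchange`, and the parameter names are assumptions for illustration, not this PR's actual code:

```cpp
#include <cstddef>
#include <vector>
#include <cuda_runtime.h>
#include <nccl.h>

// Illustrative halo descriptor: a device buffer, its byte count, and the peer rank.
struct halo { void* buffer; std::size_t size; int rank; };

// Posting every ncclSend/ncclRecv inside a single group avoids deadlock
// between ranks and lets NCCL fuse all transfers into one launch on the stream.
void post_exchange(const std::vector<halo>& sends, const std::vector<halo>& recvs,
                   ncclComm_t comm, cudaStream_t stream)
{
    ncclGroupStart();
    for (const auto& h : sends)
        ncclSend(h.buffer, h.size, ncclChar, h.rank, comm, stream);
    for (const auto& h : recvs)
        ncclRecv(h.buffer, h.size, ncclChar, h.rank, comm, stream);
    ncclGroupEnd();
}
```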
```cpp
// application data
auto& d = local_domains[0];
ghex::test::util::memory<int> field(d.size()*levels, 0);
#ifndef GHEX_USE_NCCL
```
To do: the tests etc. don't work with NCCL when using host memory. Should we copy to the device when using NCCL, or disallow host memory completely?
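One possible staging approach, as a sketch only (a plain `std::vector` stands in for `ghex::test::util::memory`, and the parameters are assumed):

```cpp
#include <cstddef>
#include <vector>
#include <cuda_runtime.h>

// Mirror the host field in device memory so NCCL only ever sees device
// pointers; copy back afterwards for host-side verification.
void exchange_via_device_copy(std::size_t domain_size, std::size_t levels)
{
    std::vector<int> field(domain_size * levels, 0);
    int* d_field = nullptr;
    cudaMalloc(&d_field, field.size() * sizeof(int));
    cudaMemcpy(d_field, field.data(), field.size() * sizeof(int),
               cudaMemcpyHostToDevice);
    // ... run the exchange on d_field ...
    cudaMemcpy(field.data(), d_field, field.size() * sizeof(int),
               cudaMemcpyDeviceToHost);
    cudaFree(d_field);
}
```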
```cpp
auto h_gpu = co.exchange(patterns(data_gpu));
#ifdef GHEX_USE_NCCL
cudaDeviceSynchronize();
#endif
h_gpu.wait();
```
Introduce another API that allows skipping the blocking wait.
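A minimal sketch of what such a non-blocking completion path could look like, assuming the exchange records its work on a dedicated stream (all names here are illustrative, not part of this PR):

```cpp
#include <cuda_runtime.h>

// Instead of blocking the host in h_gpu.wait(), record an event on the
// stream that carries the exchange's unpack and make the application's
// stream wait on it; the host never blocks.
void chain_streams(cudaStream_t exchange_stream, cudaStream_t app_stream)
{
    cudaEvent_t done;
    cudaEventCreateWithFlags(&done, cudaEventDisableTiming);
    cudaEventRecord(done, exchange_stream);   // fires once prior work completes
    cudaStreamWaitEvent(app_stream, done, 0); // device-side wait only
    cudaEventDestroy(done);                   // resources released after completion
}
```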
I'm keeping this branch around for use with icon4py, but closing this PR by replacing it with a cleaner version.
Opening this just for reference. This first version was just a proof of concept to see what improvement we might see. It hacks NCCL support into `communication_object`. Performance is very good intra-node in icon-exclaim. I haven't tested inter-node performance (which would use libfabric; the icon uenv is still using old versions of NCCL and libfabric, which may not perform so well).

I plan to work on updating this to a cleaner version with an oomph NCCL backend. However, this may need some new customization points etc., as the NCCL communication has to/can be set up a bit differently compared to the MPI/libfabric implementations (pack + send/recv + unpack can all be scheduled in one go).
Based on #184.