Skip to content

Commit 15dc143

Browse files
authored
Merge pull request kokkos#7333 from ndellingwood/master-release-4.4.01
Master release 4.4.01
2 parents 08ceff9 + 5cb2fa3 commit 15dc143

26 files changed

+481
-101
lines changed

.github/workflows/continuous-integration-workflow.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ jobs:
113113
cmake -B builddir \
114114
-DCMAKE_INSTALL_PREFIX=/usr \
115115
${{ matrix.clang-tidy }} \
116+
-DBUILD_SHARED_LIBS=ON \
116117
-Ddesul_ROOT=/usr/desul-install/ \
117118
-DKokkos_ENABLE_DESUL_ATOMICS_EXTERNAL=ON \
118119
-DKokkos_ENABLE_HWLOC=ON \

.github/workflows/releases.yml

Lines changed: 12 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -13,33 +13,26 @@ jobs:
1313
hashes: ${{ steps.hash.outputs.hashes }}
1414
runs-on: ubuntu-latest
1515
steps:
16-
- uses: actions/checkout@v4
16+
- uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4.1.7
1717
- name: Build artifacts
1818
run: |
19-
git archive -o kokkos-${{ github.ref_name }}.zip HEAD
20-
git archive -o kokkos-${{ github.ref_name }}.tar.gz HEAD
19+
git archive --prefix=kokkos-${{ github.ref_name }}/ -o kokkos-${{ github.ref_name }}.zip HEAD
20+
git archive --prefix=kokkos-${{ github.ref_name }}/ -o kokkos-${{ github.ref_name }}.tar.gz HEAD
2121
2222
- name: Generate hashes
2323
shell: bash
2424
id: hash
2525
run: |
2626
# sha256sum generates sha256 hash for all artifacts.
2727
# base64 -w0 encodes to base64 and outputs on a single line.
28-
echo "hashes=$(sha256sum kokkos-${{ github.ref_name }}.zip kokkos-${{ github.ref_name }}.tar.gz | base64 -w0)" >> "$GITHUB_OUTPUT"
28+
sha256sum kokkos-${{ github.ref_name }}.zip kokkos-${{ github.ref_name }}.tar.gz > kokkos-${{ github.ref_name }}-SHA-256.txt
29+
echo "hashes=$(base64 -w0 kokkos-${{ github.ref_name }}-SHA-256.txt)" >> "$GITHUB_OUTPUT"
2930
30-
- name: Upload source code (zip)
31-
uses: actions/upload-artifact@89ef406dd8d7e03cfd12d9e0a4a378f454709029 # v4.3.5
31+
- name: Upload artifacts
32+
uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0
3233
with:
33-
name: kokkos-${{ github.ref_name }}.zip
34-
path: kokkos-${{ github.ref_name }}.zip
35-
if-no-files-found: error
36-
retention-days: 5
37-
38-
- name: Upload source code (tar.gz)
39-
uses: actions/upload-artifact@89ef406dd8d7e03cfd12d9e0a4a378f454709029 # v4.3.5
40-
with:
41-
name: kokkos-${{ github.ref_name }}.tar.gz
42-
path: kokkos-${{ github.ref_name }}.tar.gz
34+
name: release-artifacts
35+
path: kokkos-${{ github.ref_name }}*
4336
if-no-files-found: error
4437
retention-days: 5
4538

@@ -65,19 +58,14 @@ jobs:
6558
runs-on: ubuntu-latest
6659
if: startsWith(github.ref, 'refs/tags/')
6760
steps:
68-
- name: Download kokkos-${{ github.ref_name }}.zip
61+
- name: Download artifacts
6962
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
7063
with:
71-
name: kokkos-${{ github.ref_name }}.zip
72-
73-
- name: Download kokkos-${{ github.ref_name }}.tar.gz
74-
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
75-
with:
76-
name: kokkos-${{ github.ref_name }}.tar.gz
77-
64+
name: release-artifacts
7865
- name: Upload assets
7966
uses: softprops/action-gh-release@c062e08bd532815e2082a85e87e3ef29c3e6d191 # v2.0.8
8067
with:
8168
files: |
8269
kokkos-${{ github.ref_name }}.zip
8370
kokkos-${{ github.ref_name }}.tar.gz
71+
kokkos-${{ github.ref_name }}-SHA-256.txt

.jenkins

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -461,6 +461,7 @@ pipeline {
461461
-DKokkos_ENABLE_CUDA=ON \
462462
-DKokkos_ENABLE_CUDA_LAMBDA=ON \
463463
-DKokkos_ENABLE_LIBDL=OFF \
464+
-DKokkos_ENABLE_OPENMP=ON \
464465
-DKokkos_ENABLE_IMPL_MDSPAN=OFF \
465466
-DKokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC=OFF \
466467
.. && \

CHANGELOG.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,20 @@
11
# CHANGELOG
22

3+
## [4.4.01](https://github.com/kokkos/kokkos/tree/4.4.01)
4+
[Full Changelog](https://github.com/kokkos/kokkos/compare/4.0.00...4.4.01)
5+
6+
### Features:
7+
* Introduce new SequentialHostInit view allocation property [\#7229](https://github.com/kokkos/kokkos/pull/7229)
8+
9+
### Backend and Architecture Enhancements:
10+
11+
#### CUDA:
12+
* Experimental support for unified memory mode (intended for Grace-Hopper etc.) [\#6823](https://github.com/kokkos/kokkos/pull/6823)
13+
14+
### Bug Fixes
15+
* OpenMP: Fix issue related to the visibility of an internal symbol with shared libraries that affected `ScatterView` in particular [\#7284](https://github.com/kokkos/kokkos/pull/7284)
16+
* Fix implicit copy assignment operators in few AVX2 masks being deleted [#7296](https://github.com/kokkos/kokkos/pull/7296)
17+
318
## [4.4.00](https://github.com/kokkos/kokkos/tree/4.4.00)
419
[Full Changelog](https://github.com/kokkos/kokkos/compare/4.3.01...4.4.00)
520

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ ENDIF()
151151

152152
set(Kokkos_VERSION_MAJOR 4)
153153
set(Kokkos_VERSION_MINOR 4)
154-
set(Kokkos_VERSION_PATCH 0)
154+
set(Kokkos_VERSION_PATCH 1)
155155
set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
156156
message(STATUS "Kokkos version: ${Kokkos_VERSION}")
157157
math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")

Makefile.kokkos

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
KOKKOS_VERSION_MAJOR = 4
44
KOKKOS_VERSION_MINOR = 4
5-
KOKKOS_VERSION_PATCH = 0
5+
KOKKOS_VERSION_PATCH = 1
66
KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)
77

88
# Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial

cmake/KokkosCore_config.h.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA // deprecated
3838
#cmakedefine KOKKOS_ENABLE_CUDA_CONSTEXPR
3939
#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC
40+
#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
4041
#cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
4142
#cmakedefine KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS
4243
#cmakedefine KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY

cmake/kokkos_enable_options.cmake

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda
4848
# resolved but we keep the option around a bit longer to be safe.
4949
KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)")
5050
KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler")
51+
KOKKOS_ENABLE_OPTION(IMPL_CUDA_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for CUDA")
52+
5153
KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available" )
5254
KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" )
5355
KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP")
@@ -135,7 +137,7 @@ FUNCTION(check_device_specific_options)
135137
ENDIF()
136138
ENDFUNCTION()
137139

138-
CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC)
140+
CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC IMPL_CUDA_UNIFIED_MEMORY)
139141
CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE)
140142
CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HPX OPTIONS IMPL_HPX_ASYNC_DISPATCH)
141143

containers/unit_tests/TestWithoutInitializing.hpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,17 @@
3737
#endif
3838
///@}
3939

40+
/// Some tests are skipped for unified memory space
41+
#if defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
42+
#define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE \
43+
if constexpr (std::is_same_v<typename TEST_EXECSPACE::memory_space, \
44+
Kokkos::CudaSpace>) \
45+
GTEST_SKIP() << "skipping since unified memory requires additional " \
46+
"fences";
47+
#else
48+
#define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE
49+
#endif
50+
4051
TEST(TEST_CATEGORY, resize_realloc_no_init_dualview) {
4152
using namespace Kokkos::Test::Tools;
4253
listen_tool_events(Config::DisableAll(), Config::EnableKernels());
@@ -657,6 +668,7 @@ TEST(TEST_CATEGORY, create_mirror_no_init_dynamicview) {
657668

658669
TEST(TEST_CATEGORY, create_mirror_view_and_copy_dynamicview) {
659670
GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE
671+
GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE
660672

661673
using namespace Kokkos::Test::Tools;
662674
listen_tool_events(Config::DisableAll(), Config::EnableKernels(),

core/src/Cuda/Kokkos_CudaSpace.cpp

Lines changed: 36 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@
3131
#include <algorithm>
3232
#include <atomic>
3333

34-
//#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
3534
#include <impl/Kokkos_Error.hpp>
3635

3736
#include <impl/Kokkos_Tools.hpp>
@@ -178,6 +177,29 @@ void *impl_allocate_common(const int device_id,
178177
cudaError_t error_code = cudaSuccess;
179178
#ifndef CUDART_VERSION
180179
#error CUDART_VERSION undefined!
180+
#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
181+
// This is intended for Grace-Hopper (and future unified memory architectures)
182+
// The idea is to use host allocator and then advise to keep it in HBM on the
183+
// device, but that requires CUDA 12.2
184+
static_assert(CUDART_VERSION >= 12020,
185+
"CUDA runtime version >=12.2 required when "
186+
"Kokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY is set. "
187+
"Please update your CUDA runtime version or "
188+
"reconfigure with "
189+
"-D Kokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY=OFF");
190+
if (arg_alloc_size) { // cudaMemAdvise_v2 does not work with nullptr
191+
error_code = cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal);
192+
if (error_code == cudaSuccess) {
193+
// One would think cudaMemLocation{device_id,
194+
// cudaMemLocationTypeDevice} would work but it doesn't. I.e. the order of
195+
// members doesn't seem to be defined.
196+
cudaMemLocation loc;
197+
loc.id = device_id;
198+
loc.type = cudaMemLocationTypeDevice;
199+
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemAdvise_v2(
200+
ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, loc));
201+
}
202+
}
181203
#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
182204
if (arg_alloc_size >= memory_threshold_g) {
183205
error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream);
@@ -190,9 +212,13 @@ void *impl_allocate_common(const int device_id,
190212
"Kokkos::Cuda: backend fence after async malloc");
191213
}
192214
}
193-
} else
215+
} else {
216+
error_code = cudaMalloc(&ptr, arg_alloc_size);
217+
}
218+
#else
219+
error_code = cudaMalloc(&ptr, arg_alloc_size);
194220
#endif
195-
{ error_code = cudaMalloc(&ptr, arg_alloc_size); }
221+
196222
if (error_code != cudaSuccess) { // TODO tag as unlikely branch
197223
// This is the only way to clear the last error, which
198224
// we should do here since we're turning it into an
@@ -326,6 +352,9 @@ void CudaSpace::impl_deallocate(
326352
}
327353
#ifndef CUDART_VERSION
328354
#error CUDART_VERSION undefined!
355+
#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
356+
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device));
357+
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
329358
#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
330359
if (arg_alloc_size >= memory_threshold_g) {
331360
Impl::cuda_device_synchronize(
@@ -436,8 +465,12 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,
436465

437466
#include <impl/Kokkos_SharedAlloc_timpl.hpp>
438467

468+
#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
439469
KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(
440470
Kokkos::CudaSpace);
471+
#else
472+
KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(Kokkos::CudaSpace);
473+
#endif
441474
KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(
442475
Kokkos::CudaUVMSpace);
443476
KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(

core/src/Cuda/Kokkos_CudaSpace.hpp

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,19 @@ class CudaSpace {
8888
void* allocate(const char* arg_label, const size_t arg_alloc_size,
8989
const size_t arg_logical_size = 0) const;
9090

91+
#if defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
92+
template <typename ExecutionSpace>
93+
void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const {
94+
return allocate(arg_alloc_size);
95+
}
96+
template <typename ExecutionSpace>
97+
void* allocate(const ExecutionSpace&, const char* arg_label,
98+
const size_t arg_alloc_size,
99+
const size_t arg_logical_size = 0) const {
100+
return allocate(arg_label, arg_alloc_size, arg_logical_size);
101+
}
102+
#endif
103+
91104
/**\brief Deallocate untracked memory in the cuda space */
92105
void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const;
93106
void deallocate(const char* arg_label, void* const arg_alloc_ptr,
@@ -337,7 +350,11 @@ static_assert(
337350
template <>
338351
struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::CudaSpace> {
339352
enum : bool { assignable = false };
340-
enum : bool { accessible = false };
353+
#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
354+
enum : bool{accessible = false};
355+
#else
356+
enum : bool { accessible = true };
357+
#endif
341358
enum : bool { deepcopy = true };
342359
};
343360

@@ -558,8 +575,12 @@ struct DeepCopy<HostSpace, MemSpace, ExecutionSpace,
558575
//----------------------------------------------------------------------------
559576
//----------------------------------------------------------------------------
560577

578+
#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
561579
KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION(
562580
Kokkos::CudaSpace);
581+
#else
582+
KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaSpace);
583+
#endif
563584
KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaUVMSpace);
564585
KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaHostPinnedSpace);
565586

core/src/Cuda/Kokkos_Cuda_Instance.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -607,6 +607,22 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
607607

608608
//----------------------------------
609609

610+
#ifdef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
611+
// Check if unified memory is available
612+
int cuda_result;
613+
cudaDeviceGetAttribute(&cuda_result, cudaDevAttrConcurrentManagedAccess,
614+
cuda_device_id);
615+
if (cuda_result == 0) {
616+
Kokkos::abort(
617+
"Kokkos::Cuda::initialize ERROR: Unified memory is not available on "
618+
"this device\n"
619+
"Please recompile Kokkos with "
620+
"-DKokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY=OFF\n");
621+
}
622+
#endif
623+
624+
//----------------------------------
625+
610626
cudaStream_t singleton_stream;
611627
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id));
612628
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&singleton_stream));
@@ -705,6 +721,10 @@ void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const {
705721
#else
706722
os << "no\n";
707723
#endif
724+
#ifdef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
725+
os << " KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY: ";
726+
os << "yes\n";
727+
#endif
708728

709729
os << "\nCuda Runtime Configuration:\n";
710730

core/src/Kokkos_View.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -571,6 +571,8 @@ inline constexpr Kokkos::ALL_t ALL{};
571571
#pragma omp end declare target
572572
#endif
573573

574+
inline constexpr Kokkos::Impl::SequentialHostInit_t SequentialHostInit{};
575+
574576
inline constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing{};
575577

576578
inline constexpr Kokkos::Impl::AllowPadding_t AllowPadding{};

core/src/OpenMP/Kokkos_OpenMP.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ int OpenMP::impl_thread_pool_size() const noexcept {
113113
}
114114

115115
int OpenMP::impl_max_hardware_threads() noexcept {
116-
return Impl::g_openmp_hardware_max_threads;
116+
return Impl::OpenMPInternal::max_hardware_threads();
117117
}
118118

119119
namespace Impl {

0 commit comments

Comments
 (0)