Skip to content

Commit

Permalink
Cherrypick SWDEV-408046 (#594) (#349)
Browse files Browse the repository at this point in the history
* SWDEV-408046 (#594)

 solves SWDEV-408046

* Clang formatting

---------

Co-authored-by: Yvan Mokwinski <yvan.mokwinski@gmail.com>
Co-authored-by: jsandham <james.sandham@amd.com>
  • Loading branch information
3 people authored Aug 29, 2023
1 parent 2ac2e27 commit aadb6e3
Show file tree
Hide file tree
Showing 28 changed files with 229 additions and 171 deletions.
19 changes: 10 additions & 9 deletions library/src/extra/bsrgemm_device.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*! \file */
/* ************************************************************************
* Copyright (C) 2022 Advanced Micro Devices, Inc. All rights Reserved.
* Copyright (C) 2022-2023 Advanced Micro Devices, Inc. All rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -188,16 +188,16 @@ static __device__ __forceinline__ void insert_pair_rxc(
if(table[hash] == key)
{
// Element already present, add value to exsiting entry
atomicAdd(&data[BLOCKDIM * BLOCKDIM * hash + BLOCKDIM * row + col], val);
rocsparse_atomic_add(&data[BLOCKDIM * BLOCKDIM * hash + BLOCKDIM * row + col], val);
break;
}
else if(table[hash] == empty)
{
// If empty, add element with atomic
if(atomicCAS(&table[hash], empty, key) == empty)
if(rocsparse_atomic_cas(&table[hash], empty, key) == empty)
{
// Add value
atomicAdd(&data[BLOCKDIM * BLOCKDIM * hash + BLOCKDIM * row + col], val);
rocsparse_atomic_add(&data[BLOCKDIM * BLOCKDIM * hash + BLOCKDIM * row + col], val);
break;
}
}
Expand Down Expand Up @@ -1320,9 +1320,10 @@ __device__ void bsrgemm_block_per_row_atomic_multipass_device(rocsparse_directio
}
}

atomicAdd(&data[BLOCKDIM * BLOCKDIM * (col_B - chunk_begin)
+ BLOCKDIM * r + c],
alpha * val_AB);
rocsparse_atomic_add(
&data[BLOCKDIM * BLOCKDIM * (col_B - chunk_begin) + BLOCKDIM * r
+ c],
alpha * val_AB);
}
}
else if(col_B >= chunk_end)
Expand Down Expand Up @@ -1373,7 +1374,7 @@ __device__ void bsrgemm_block_per_row_atomic_multipass_device(rocsparse_directio
val_D = beta * bsr_val_D[block_dim * block_dim * j + block_dim * c + r];
}

atomicAdd(
rocsparse_atomic_add(
&data[BLOCKDIM * BLOCKDIM * (col_D - chunk_begin) + BLOCKDIM * r + c],
val_D);
}
Expand All @@ -1391,7 +1392,7 @@ __device__ void bsrgemm_block_per_row_atomic_multipass_device(rocsparse_directio
{
// Atomically determine the new chunks beginning (minimum column index of B
// that is larger than the current chunks end point)
atomicMin(&next_chunk, min_col);
rocsparse_atomic_min(&next_chunk, min_col);
}

// Wait for all threads to finish
Expand Down
20 changes: 10 additions & 10 deletions library/src/extra/csrgemm_device.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*! \file */
/* ************************************************************************
* Copyright (C) 2019-2022 Advanced Micro Devices, Inc. All rights Reserved.
* Copyright (C) 2019-2023 Advanced Micro Devices, Inc. All rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -346,7 +346,7 @@ static __device__ __forceinline__ bool insert_key(I key, I* __restrict__ table)
else if(table[hash] == -1)
{
// If empty, add element with atomic
if(atomicCAS(&table[hash], -1, key) == -1)
if(rocsparse_atomic_cas<I>(&table[hash], -1, key) == -1)
{
// Increment number of insertions
return true;
Expand Down Expand Up @@ -376,16 +376,16 @@ static __device__ __forceinline__ void
if(table[hash] == key)
{
// Element already present, add value to exsiting entry
atomicAdd(&data[hash], val);
rocsparse_atomic_add(&data[hash], val);
break;
}
else if(table[hash] == empty)
{
// If empty, add element with atomic
if(atomicCAS(&table[hash], empty, key) == empty)
if(rocsparse_atomic_cas<I>(&table[hash], empty, key) == empty)
{
// Add value
atomicAdd(&data[hash], val);
rocsparse_atomic_add(&data[hash], val);
break;
}
}
Expand Down Expand Up @@ -792,7 +792,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL
{
// Atomically determine the new chunks beginning (minimum column index of B
// that is larger than the current chunks end point)
atomicMin(&next_chunk, min_col);
rocsparse_atomic_min(&next_chunk, min_col);
}

// Wait for all threads to finish row nnz operation
Expand All @@ -812,7 +812,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL
if(lid == WFSIZE - 1)
{
// Atomically add this chunks nnz to the total row nnz
atomicAdd(&nnz, chunk_nnz);
rocsparse_atomic_add(&nnz, chunk_nnz);
}

// Wait for atomics to be processed
Expand Down Expand Up @@ -1316,7 +1316,7 @@ __device__ void csrgemm_fill_block_per_row_multipass_device(J n,
table[col_B - chunk_begin] = 1;

// Atomically accumulate the intermediate products
atomicAdd(&data[col_B - chunk_begin], val_A * csr_val_B[k]);
rocsparse_atomic_add(&data[col_B - chunk_begin], val_A * csr_val_B[k]);
}
else if(col_B >= chunk_end)
{
Expand Down Expand Up @@ -1360,7 +1360,7 @@ __device__ void csrgemm_fill_block_per_row_multipass_device(J n,
table[col_D - chunk_begin] = 1;

// Atomically accumulate the entry of D
atomicAdd(&data[col_D - chunk_begin], beta * csr_val_D[j]);
rocsparse_atomic_add(&data[col_D - chunk_begin], beta * csr_val_D[j]);
}
else if(col_D >= chunk_end)
{
Expand All @@ -1382,7 +1382,7 @@ __device__ void csrgemm_fill_block_per_row_multipass_device(J n,
{
// Atomically determine the new chunks beginning (minimum column index of B
// that is larger than the current chunks end point)
atomicMin(&next_chunk, min_col);
rocsparse_atomic_min(&next_chunk, min_col);
}

// Wait for all threads to finish
Expand Down
18 changes: 9 additions & 9 deletions library/src/extra/rocsparse_csrgemm_numeric.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ static __device__ __forceinline__ bool insert_key(I key, I* __restrict__ table)
else if(table[hash] == -1)
{
// If empty, add element with atomic
if(atomicCAS(&table[hash], -1, key) == -1)
if(rocsparse_atomic_cas(&table[hash], -1, key) == -1)
{
// Increment number of insertions
return true;
Expand Down Expand Up @@ -116,8 +116,8 @@ static __device__ __forceinline__ bool
}
else if(table[hash] == -1)
{
atomicCAS(&table[hash], -1, key);
atomicCAS(&local_idxs[hash], -1, local_idx);
rocsparse_atomic_cas(&table[hash], -1, key);
rocsparse_atomic_cas(&local_idxs[hash], -1, local_idx);
return true;
}
else
Expand All @@ -142,16 +142,16 @@ static __device__ __forceinline__ void
if(table[hash] == key)
{
// Element already present, add value to exsiting entry
atomicAdd(&data[hash], val);
rocsparse_atomic_add(&data[hash], val);
break;
}
else if(table[hash] == empty)
{
// If empty, add element with atomic
if(atomicCAS(&table[hash], empty, key) == empty)
if(rocsparse_atomic_cas(&table[hash], empty, key) == empty)
{
// Add value
atomicAdd(&data[hash], val);
rocsparse_atomic_add(&data[hash], val);
break;
}
}
Expand Down Expand Up @@ -647,7 +647,7 @@ __device__ void
table[col_B - chunk_begin] = 1;

// Atomically accumulate the intermediate products
atomicAdd(&data[col_B - chunk_begin], val_A * csr_val_B[k]);
rocsparse_atomic_add(&data[col_B - chunk_begin], val_A * csr_val_B[k]);
}
else if(col_B >= chunk_end)
{
Expand Down Expand Up @@ -691,7 +691,7 @@ __device__ void
table[col_D - chunk_begin] = 1;

// Atomically accumulate the entry of D
atomicAdd(&data[col_D - chunk_begin], beta * csr_val_D[j]);
rocsparse_atomic_add(&data[col_D - chunk_begin], beta * csr_val_D[j]);
}
else if(col_D >= chunk_end)
{
Expand All @@ -713,7 +713,7 @@ __device__ void
{
// Atomically determine the new chunks beginning (minimum column index of B
// that is larger than the current chunks end point)
atomicMin(&next_chunk, min_col);
rocsparse_atomic_min(&next_chunk, min_col);
}

// Wait for all threads to finish
Expand Down
6 changes: 3 additions & 3 deletions library/src/extra/rocsparse_csrgemm_symbolic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ __device__ void
table[col_D - chunk_begin] = 1;

// Atomically accumulate the entry of D
// atomicAdd(&data[col_D - chunk_begin], beta * csr_val_D[j]);
// rocsparse_atomic_add(&data[col_D - chunk_begin], beta * csr_val_D[j]);
}
else if(col_D >= chunk_end)
{
Expand All @@ -220,7 +220,7 @@ __device__ void
{
// Atomically determine the new chunks beginning (minimum column index of B
// that is larger than the current chunks end point)
atomicMin(&next_chunk, min_col);
rocsparse_atomic_min(&next_chunk, min_col);
}

// Wait for all threads to finish
Expand Down Expand Up @@ -574,7 +574,7 @@ static __device__ __forceinline__ bool insert_key(I key, I* __restrict__ table,
else if(table[hash] == empty)
{
// If empty, add element with atomic
if(atomicCAS(&table[hash], empty, key) == empty)
if(rocsparse_atomic_cas(&table[hash], empty, key) == empty)
{
// Increment number of insertions
return true;
Expand Down
63 changes: 56 additions & 7 deletions library/src/include/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -168,22 +168,71 @@ __device__ __forceinline__ double rocsparse_shfl(double var, int src_lane, int w
__device__ __forceinline__ rocsparse_float_complex rocsparse_shfl(rocsparse_float_complex var, int src_lane, int width = warpSize) { return rocsparse_float_complex(__shfl(std::real(var), src_lane, width), __shfl(std::imag(var), src_lane, width)); }
__device__ __forceinline__ rocsparse_double_complex rocsparse_shfl(rocsparse_double_complex var, int src_lane, int width = warpSize) { return rocsparse_double_complex(__shfl(std::real(var), src_lane, width), __shfl(std::imag(var), src_lane, width)); }

__device__ __forceinline__ int64_t atomicMin(int64_t* ptr, int64_t val) { return atomicMin((unsigned long long*)ptr, val); }
__device__ __forceinline__ int64_t atomicMax(int64_t* ptr, int64_t val) { return atomicMax((unsigned long long*)ptr, val); }
__device__ __forceinline__ int64_t atomicAdd(int64_t* ptr, int64_t val) { return atomicAdd((unsigned long long*)ptr, val); }
__device__ __forceinline__ int64_t atomicCAS(int64_t* ptr, int64_t cmp, int64_t val) { return atomicCAS((unsigned long long*)ptr, cmp, val); }
template <typename T>
__device__ __forceinline__ T rocsparse_atomic_min(T * ptr, T val)
{
return atomicMin(ptr,val);
}

template <typename T>
__device__ __forceinline__ T rocsparse_atomic_max(T * ptr, T val)
{
return atomicMax(ptr,val);
}

template <typename T>
__device__ __forceinline__ T rocsparse_atomic_add(T * ptr, T val)
{
return atomicAdd(ptr,val);
}

__device__ __forceinline__ rocsparse_float_complex atomicAdd(rocsparse_float_complex* ptr, rocsparse_float_complex val)
template <typename T>
__device__ __forceinline__ T rocsparse_atomic_cas(T * ptr, T cmp, T val)
{
return atomicCAS(ptr, cmp, val);
}



template <>
__device__ __forceinline__ int64_t rocsparse_atomic_min<int64_t>(int64_t * ptr, int64_t val)
{
return atomicMin((unsigned long long*)ptr, (unsigned long long)val);
}

template <>
__device__ __forceinline__ int64_t rocsparse_atomic_max<int64_t>(int64_t * ptr, int64_t val)
{
return atomicMax((unsigned long long*)ptr, val);
}


template <>
__device__ __forceinline__ int64_t rocsparse_atomic_add<int64_t>(int64_t * ptr, int64_t val)
{
return atomicAdd((unsigned long long*)ptr, val);
}

template <>
__device__ __forceinline__ rocsparse_float_complex rocsparse_atomic_add(rocsparse_float_complex* ptr, rocsparse_float_complex val)
{
return rocsparse_float_complex(atomicAdd((float*)ptr, std::real(val)),
atomicAdd((float*)ptr + 1, std::imag(val)));
}
__device__ __forceinline__ rocsparse_double_complex atomicAdd(rocsparse_double_complex* ptr, rocsparse_double_complex val)

template <>
__device__ __forceinline__ rocsparse_double_complex rocsparse_atomic_add(rocsparse_double_complex* ptr, rocsparse_double_complex val)
{
return rocsparse_double_complex(atomicAdd((double*)ptr, std::real(val)),
atomicAdd((double*)ptr + 1, std::imag(val)));
}

template <>
__device__ __forceinline__ int64_t rocsparse_atomic_cas(int64_t* ptr, int64_t cmp, int64_t val)
{
return atomicCAS((unsigned long long*)ptr, cmp, val);
}

__device__ __forceinline__ bool rocsparse_is_inf(float val){ return (val == std::numeric_limits<float>::infinity()); }
__device__ __forceinline__ bool rocsparse_is_inf(double val){ return (val == std::numeric_limits<double>::infinity()); }
__device__ __forceinline__ bool rocsparse_is_inf(rocsparse_float_complex val){ return (std::real(val) == std::numeric_limits<float>::infinity() || std::imag(val) == std::numeric_limits<float>::infinity()); }
Expand Down Expand Up @@ -955,7 +1004,7 @@ __launch_bounds__(BLOCKSIZE) ROCSPARSE_KERNEL

if(tid == 0)
{
atomicMax(max_nnz, shared[0]);
rocsparse_atomic_max(max_nnz, shared[0]);
}
}

Expand Down
8 changes: 4 additions & 4 deletions library/src/level2/bsrsv_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ static ROCSPARSE_DEVICE_ILF void

if(pivot == true)
{
atomicMin(zero_pivot, row + idx_base);
rocsparse_atomic_min(zero_pivot, row + idx_base);
}
}
}
Expand Down Expand Up @@ -321,7 +321,7 @@ static ROCSPARSE_DEVICE_ILF void

if(pivot == true)
{
atomicMin(zero_pivot, row + idx_base);
rocsparse_atomic_min(zero_pivot, row + idx_base);
}
}
}
Expand Down Expand Up @@ -503,7 +503,7 @@ static ROCSPARSE_DEVICE_ILF void
// Find the minimum pivot, if applicable
if(pivot == true)
{
atomicMin(zero_pivot, row + idx_base);
rocsparse_atomic_min(zero_pivot, row + idx_base);
}
}
}
Expand Down Expand Up @@ -685,7 +685,7 @@ static ROCSPARSE_DEVICE_ILF void
// Find the minimum pivot, if applicable
if(pivot == true)
{
atomicMin(zero_pivot, row + idx_base);
rocsparse_atomic_min(zero_pivot, row + idx_base);
}
}
}
Loading

0 comments on commit aadb6e3

Please sign in to comment.