Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 3 additions & 6 deletions contrib/test_jenkins.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1128,12 +1128,9 @@ run_release_mode_tests() {
# Run nt_buffer_transfer tests
#
run_nt_buffer_transfer_tests() {
if lscpu | grep -q 'AuthenticAMD'
then
build release --enable-gtest --enable-optimizations
echo "==== Running nt_buffer_transfer tests ===="
./test/gtest/gtest --gtest_filter="test_arch.nt_buffer_transfer_*"
fi
build release --enable-gtest --enable-optimizations
echo "==== Running test_arch tests with optimizations ===="
./test/gtest/gtest --gtest_filter="test_arch.*"
}

set_ucx_common_test_env() {
Expand Down
5 changes: 3 additions & 2 deletions src/ucs/arch/bitops.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,10 +121,11 @@ BEGIN_C_DECLS
((sizeof(_n) <= 4) ? __builtin_ctz((uint32_t)(_n)) : __builtin_ctzl(_n))

/* Returns the number of leading 0-bits in _n.
* If _n is 0, the result is undefined
*/
#define ucs_count_leading_zero_bits(_n) \
((sizeof(_n) <= 4) ? __builtin_clz((uint32_t)(_n)) : __builtin_clzl(_n))
((_n) ? ((sizeof(_n) <= 4) ? __builtin_clz((uint32_t)(_n)) : \
__builtin_clzl(_n)) : \
(int)(sizeof(_n) * 8))

/* Returns the number of bits lower than 'bit_index' that are set in 'mask'
* For example: ucs_bitmap2idx(mask=0xF0, idx=6) returns 2
Expand Down
10 changes: 3 additions & 7 deletions src/ucs/arch/x86_64/cpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -1060,13 +1060,12 @@ size_t ucs_x86_nt_src_buffer_transfer(void *dst, const void *src, size_t len)
return len;
}

static UCS_F_ALWAYS_INLINE
void ucs_x86_copy_bytes_le_128(void *dst, const void *src, size_t len)
static UCS_F_ALWAYS_INLINE void
ucs_x86_copy_bytes_le_128(void *dst, const void *src, uint32_t len)
{
#if defined (__LZCNT__)
__m256i y0, y1, y2, y3;
/* Handle lengths that fall usually within eager short range */
switch (_lzcnt_u32(len)) {
switch (ucs_count_leading_zero_bits(len)) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If len is zero, the code expects '32' as output; does ucs_count_leading_zero_bits(0) return 32?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes you're right, should be fixed now

Copy link
Contributor

@arun-chandran-edarath arun-chandran-edarath Feb 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is one issue; _lzcnt_u32(len) is expected to count the leading zeros from a 32bit operand.

_lzcnt_u32(0) should produce 32.

But now ucs_count_leading_zero_bits(len) will use _lzcnt_u64() because 'size_t len' is 8 bytes, and the outputs produced make the switch cases wrong. [_lzcnt_u64(0) gives 64]

It should be ucs_count_leading_zero_bits((uint32_t) len) right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. This is also caught by the test suite; it just was not running in our CI. Fixed by changing the function parameters; please double-check that it makes sense.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks ok to me.

/* 0 */
case 32:
break;
Expand Down Expand Up @@ -1121,9 +1120,6 @@ void ucs_x86_copy_bytes_le_128(void *dst, const void *src, size_t len)
_mm256_storeu_si256(UCS_PTR_BYTE_OFFSET(dst, len - 32), y3);
break;
}
#else
memcpy(dst, src, len);
#endif
}

/* This is an adaptation of the memcpy code from https://github.com/amd/aocl-libmem
Expand Down
68 changes: 32 additions & 36 deletions test/gtest/ucs/arch/test_x86_64.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,63 +64,70 @@ class test_arch : public ucs::test {
return result;
}

void nt_buffer_transfer_test(ucs_arch_memcpy_hint_t hint) {
void nt_buffer_transfer_test(ucs_arch_memcpy_hint_t hint)
{
#ifndef __AVX__
UCS_TEST_SKIP_R("Built without AVX support");
#else
int i, j, result, ret = 0;
char *test_window_src, *test_window_dst, *src, *dst, *dup;
int i, j;
char *src, *dst;
size_t len, total_size, test_window_size, hole_size, align;

align = 64;
test_window_size = 8 * 1024;
hole_size = 2 * align;

auto msg = [&]() {
std::stringstream ss;
ss << "using length=" << len << " src_align=" << i
<< " dst_align=" << j;
return ss.str();
};

/*
* Allocate a hole above and below the test_window_size
* to check for writes beyond the designated area.
*/
total_size = test_window_size + (2 * hole_size);

ret = posix_memalign((void**)&test_window_src, align, total_size);
if (ret) {
goto src_fail;
}
auto alloc_aligned = [&align, &total_size]() {
void *ptr;
return std::unique_ptr<char>(reinterpret_cast<char*>(
!posix_memalign(&ptr, align, total_size) ? ptr : nullptr));
};

ret = posix_memalign((void**)&test_window_dst, align, total_size);
if (ret) {
goto dst_fail;
}
auto test_window_src = alloc_aligned();
auto test_window_dst = alloc_aligned();
auto dup = alloc_aligned();

ret = posix_memalign((void**)&dup, align, total_size);
if (ret) {
goto dup_fail;
}
ASSERT_TRUE(test_window_src);
ASSERT_TRUE(test_window_dst);
ASSERT_TRUE(dup);

src = test_window_src + hole_size;
dst = test_window_dst + hole_size;
src = test_window_src.get() + hole_size;
dst = test_window_dst.get() + hole_size;

/* Initialize the regions with known patterns */
memset(dup, 0x0, total_size);
memset(test_window_src, 0xdeaddead, total_size);
memset(test_window_dst, 0x0, total_size);
memset(dup.get(), 0x0, total_size);
memset(test_window_src.get(), 0xdeaddead, total_size);
memset(test_window_dst.get(), 0x0, total_size);

len = 0;

while (len < test_window_size) {
for (i = 0; i < align; i++) {
for (j = 0; j < align; j++) {
/* Perform the transfer */
ucs_x86_nt_buffer_transfer(dst + i, src + j, len, hint, len);
result = memcmp(src + j, dst + i, len);
EXPECT_EQ(0, result);
ucs_x86_nt_buffer_transfer(dst + i, src + j, len, hint,
len);
ASSERT_EQ(0, memcmp(src + j, dst + i, len)) << msg();

/* reset the copied region back to zero */
memset(dst + i, 0x0, len);

/* check for any modifications in the holes */
result = memcmp(test_window_dst, dup, total_size);
EXPECT_EQ(0, result);
ASSERT_EQ(0, memcmp(test_window_dst.get(), dup.get(),
total_size));
}
}
/* Check for each len for less than 1k sizes
Expand All @@ -132,17 +139,6 @@ class test_arch : public ucs::test {
len += 53;
}
}

free(dup);

dup_fail:
free(test_window_dst);
dst_fail:
free(test_window_src);
src_fail:
if (ret) {
UCS_TEST_ABORT("Failed to allocate memory: " << strerror(ret));
}
#endif
}
};
Expand Down
36 changes: 36 additions & 0 deletions test/gtest/ucs/test_bitops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,42 @@ UCS_TEST_F(test_bitops, is_equal) {
test_bitops::check_bitwise_equality(buffer1, buffer2, indices, 0);
}

/**
 * Check ucs_count_leading_zero_bits() on values of type T.
 *
 * Every value with exactly one bit set is tested, from the least significant
 * bit (expected result: bits - 1) up to the most significant bit (expected
 * result: 0), followed by zero (expected result: the bit width of T).
 *
 * The single-bit pattern is produced in an unsigned 64-bit counter and
 * converted to T afterwards. Shifting a variable of type T directly would
 * left-shift a negative value once the sign bit of a signed T is reached,
 * which is undefined behavior before C++20. The out-of-range conversion used
 * here instead is implementation-defined before C++20 (and well-defined from
 * C++20 on), never undefined.
 */
template<typename T> void test_clz()
{
    constexpr int bits = sizeof(T) * 8;
    uint64_t bit       = 1;

    for (int i = bits - 1; i >= 0; --i, bit <<= 1) {
        const T v = static_cast<T>(bit);
        ASSERT_EQ(i, ucs_count_leading_zero_bits(v))
                << "set bit index=" << (bits - 1 - i);
    }

    /* Zero has no set bit, so every bit counts as a leading zero */
    ASSERT_EQ(bits, ucs_count_leading_zero_bits(static_cast<T>(0)));
}

/* Run the leading-zero-count checks once per integer type of interest */
UCS_TEST_F(test_bitops, clz)
{
    /* Fixed-width unsigned types */
    test_clz<uint32_t>();
    test_clz<uint64_t>();

    /* Fixed-width signed types */
    test_clz<int32_t>();
    test_clz<int64_t>();

    /* Platform-sized types */
    test_clz<size_t>();
    test_clz<ssize_t>();
}

/*
 * Checks the type of the ucs_count_leading_zero_bits() expression rather than
 * its value: every case subtracts one more than the expected count, so the
 * arithmetic result is -1 when the expression is signed.
 */
UCS_TEST_F(test_bitops, clz_type)
{
/* All bits set -> zero leading zeros; 0 - 1 compares below 0 only if the
 * result is signed */
EXPECT_GT(0, ucs_count_leading_zero_bits(~0LLU) - 1);

/* Unsigned arguments, 64-bit then 32-bit, all-ones and zero inputs.
 * NOTE(review): the comparison against UINT32_MAX presumably relies on the
 * -1 result being a 32-bit int that converts to UINT32_MAX; a wider signed
 * result would not compare equal - confirm against the macro definition */
EXPECT_EQ(UINT32_MAX, ucs_count_leading_zero_bits(~0LLU) - 1);
EXPECT_EQ(UINT32_MAX, ucs_count_leading_zero_bits(0LLU) - 65);
EXPECT_EQ(UINT32_MAX, ucs_count_leading_zero_bits(~0U) - 1);
EXPECT_EQ(UINT32_MAX, ucs_count_leading_zero_bits(0U) - 33);

/* Same four cases with signed argument types */
EXPECT_EQ(UINT32_MAX, ucs_count_leading_zero_bits(~0LL) - 1);
EXPECT_EQ(UINT32_MAX, ucs_count_leading_zero_bits(0LL) - 65);
EXPECT_EQ(UINT32_MAX, ucs_count_leading_zero_bits(~0) - 1);
EXPECT_EQ(UINT32_MAX, ucs_count_leading_zero_bits(0) - 33);
}

template<typename Type> void test_mask()
{
Type expected = 0;
Expand Down
Loading