Skip to content

Commit

Permalink
gitbak
Browse files Browse the repository at this point in the history
  • Loading branch information
PingFloyd committed Feb 27, 2025
1 parent 8529d46 commit dc9cfbe
Show file tree
Hide file tree
Showing 13 changed files with 281 additions and 157 deletions.
25 changes: 11 additions & 14 deletions src/algorithm/histogram.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,6 @@ constexpr static uint32_t histogram_csize = 256;
constexpr static void avx512_histogram_u8_1x(uint32_t cnt[256],
const uint8_t *__restrict in,
const size_t inlen) noexcept {
// cryptanalysislib::template memset<uint32_t>(cnt, 0u, 256u);
// const __m512i acc = _mm512_set1_epi32(1);

uint32_t tmp1[16] __attribute__((aligned(64))) = {0};

size_t i = 0;
Expand All @@ -56,7 +53,7 @@ constexpr static void avx512_histogram_u8_1x(uint32_t cnt[256],
}
}

// tailmng
// tail management: count the remaining bytes one at a time
for (; i < inlen; ++i) {
cnt[in[i]]++;
}
Expand Down Expand Up @@ -101,6 +98,7 @@ static void avx512_histogram_u32_v3(uint32_t C[256],

/// using popcnt
/// NOTE: inputs are uint32_t: with values < 2**8
/// TODO: no tail management
/// \param C
/// \param A
/// \param size
Expand All @@ -121,12 +119,8 @@ static void avx512_histogram_u32_v4(uint32_t C[256],


///
static inline
void FA(__m512i& h, __m512i& l, __m512i a, __m512i b, __m512i c) {
//__m512i tmp = _mm512_ternarylogic_epi32(c, b, a, 0x96);
//h = _mm512_ternarylogic_epi32(c, b, a, 0xE8);
//l = tmp;

constexr static inline
void FA(__m512i& h, __m512i& l, __m512i a, __m512i b, __m512i c) noexcept {
l = _mm512_ternarylogic_epi32(c, b, a, 0x96);
h = _mm512_ternarylogic_epi32(l, b, a, 0x8E);
}
Expand Down Expand Up @@ -220,7 +214,13 @@ static void consume_buffer_2(uint8_t* data, size_t N, uint16_t* hist16) {
_mm512_storeu_epi16(hist16 + 32, _mm512_add_epi16(h1, _mm512_loadu_epi16(hist16 + 32)));
}

void hist256_2(uint8_t* ptr, size_t N, uint32_t* histogram) {
/// \param histogram[out]:
/// \param ptr[in]: uint8_t input data
/// \param N[in]: array size
constexpr static
void hist256_2(uint32_t* histogram,
const uint8_t* ptr,
const size_t N) noexcept {
// Scalar loop to align input pointer.
if (N >= 64) {
uint8_t* end = ptr + N;
Expand All @@ -232,7 +232,6 @@ void hist256_2(uint8_t* ptr, size_t N, uint32_t* histogram) {

// Input bytes are binned into buffers; 0 for 0-63, 1 for 64-127, 2 for 128-191, 3 for 192-255.
const size_t bufsize = 1024 * 16;
// = (uint8_t*)_aligned_malloc(bufsize * 4, 64);
uint8_t buffer0[bufsize*4] __attribute__((aligned(64)));
uint8_t *buffer1 = buffer0 + bufsize;
uint8_t *buffer2 = buffer1 + bufsize;
Expand Down Expand Up @@ -300,8 +299,6 @@ void hist256_2(uint8_t* ptr, size_t N, uint32_t* histogram) {
}
}

// _aligned_free(buffer0);

// Scalar loop to deal with any remaining input.
while (N) {
histogram[ptr[--N]] += 1;
Expand Down
24 changes: 12 additions & 12 deletions src/algorithm/zip.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,8 +80,8 @@ static inline void zip_u8(__m256i *__restrict__ out1,
const __m256i b = _mm256_loadu_si256(in2);
const __m256i tmp1 = _mm256_unpacklo_epi8(a, b);
*out2 = _mm256_unpackhi_epi8(a, b);
*out1 = _mm256_permute2x128_si256(tmp1, *out2, 0b1000);
*out2 = _mm256_permute2x128_si256(tmp1, *out2, 0b01);
*out1 = _mm256_permute2x128_si256(tmp1, *out2, 0x20);
*out2 = _mm256_permute2x128_si256(tmp1, *out2, 0x31);
}

/// \param out1 lower part: [x1y1, x2y2, ..., x8y8]
Expand All @@ -96,8 +96,8 @@ static inline void zip_u16(__m256i *__restrict__ out1,
const __m256i b = _mm256_loadu_si256(in2);
const __m256i tmp1 = _mm256_unpacklo_epi16(a, b);
*out2 = _mm256_unpackhi_epi16(a, b);
*out1 = _mm256_permute2x128_si256(tmp1, *out2, 0b1000);
*out2 = _mm256_permute2x128_si256(tmp1, *out2, 0b01);
*out1 = _mm256_permute2x128_si256(tmp1, *out2, 0x20);
*out2 = _mm256_permute2x128_si256(tmp1, *out2, 0x31);
}

/// \param out1 lower part: [x1y1, x2y2, ..., x4y4]
Expand All @@ -112,8 +112,8 @@ static inline void zip_u32(__m256i *__restrict__ out1,
const __m256i b = _mm256_loadu_si256(in2);
const __m256i tmp1 = _mm256_unpacklo_epi32(a, b);
*out2 = _mm256_unpackhi_epi32(a, b);
*out1 = _mm256_permute2x128_si256(tmp1, *out2, 0b1000);
*out2 = _mm256_permute2x128_si256(tmp1, *out2, 0b01);
*out1 = _mm256_permute2x128_si256(tmp1, *out2, 0x20);
*out2 = _mm256_permute2x128_si256(tmp1, *out2, 0x31);
}

/// \param out1 lower part: [x1y1, x2y2]
Expand All @@ -128,8 +128,8 @@ static inline void zip_u64(__m256i *__restrict__ out1,
const __m256i b = _mm256_loadu_si256(in2);
const __m256i tmp1 = _mm256_unpacklo_epi64(a, b);
*out2 = _mm256_unpackhi_epi64(a, b);
*out1 = _mm256_permute2x128_si256(tmp1, *out2, 0b1000);
*out2 = _mm256_permute2x128_si256(tmp1, *out2, 0b01);
*out1 = _mm256_permute2x128_si256(tmp1, *out2, 0x20);
*out2 = _mm256_permute2x128_si256(tmp1, *out2, 0x31);
}

/// \param out
Expand All @@ -140,12 +140,12 @@ static inline void zip_u8(uint16_t *__restrict__ out,
const uint8_t *__restrict__ in1,
const uint8_t *__restrict__ in2,
const size_t n) {
for (size_t i = 0; (i+32) <= n; i += 32) {
zip_u8((__m256i *)out, (__m256i *)(out + 16), (__m256i *)in1, (__m256i *)in2);
in1 += 32; in2 += 32; out += 16;
size_t i = 0;
for (; (i+32) <= n; i += 32) {
zip_u8((__m256i *)(out + i), (__m256i *)(out + 16 + i), (__m256i *)(in1+i), (__m256i *)(in2 + i));
}

for (size_t i = 0; i < n; i++) {
for (; i < n; i++) {
const uint16_t t = (uint16_t)(in1[i]) | (((uint16_t)(in2[i])) << 8u);
out[i] = t;
}
Expand Down
45 changes: 8 additions & 37 deletions src/combination/bit_subset.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

#include <cstdint>

// Generate all all subsets of bits of a given word.
// Generate all subsets of bits of a given word.
//
// E.g., for the word ('.' printed for unset bits)
// ...11.1.
Expand All @@ -19,7 +19,7 @@ template<typename T = uint64_t>
class bit_subset_T {
protected:
T U;// current subset
T V;// the full set
const T V;// the full set

public:
explicit bit_subset_T(T v) : U(0), V(v) { ; }
Expand Down Expand Up @@ -47,32 +47,6 @@ class bit_subset_T {
return U;
}

/// \return
constexpr inline T first(const T v) noexcept {
V = v;
U = 0;
return U;
}

/// \return
constexpr inline T first() noexcept {
first(V);
return U;
}

/// \return
constexpr inline T last(T v) noexcept {
V = v;
U = v;
return U;
}

/// \return
constexpr inline T last() noexcept {
last(V);
return U;
}

/// \return
constexpr inline void set(T u) noexcept {
U = u & V;
Expand Down Expand Up @@ -115,38 +89,35 @@ class bit_subset_T {
}

/// \return
constexpr inline T shift_left() {
constexpr inline T shift_left() noexcept {
U = ((U << 1) + ~V) & V;
return U;
}

/// \return
constexpr inline T shift_left_fill() {
constexpr inline T shift_left_fill() noexcept {
shift_left();
next();
return U;
}

/// \return
constexpr inline T shift_left_blocks() {
constexpr inline T shift_left_blocks() noexcept {
U = (U << 1) & V;
return U;
}

/// \return
constexpr inline T shift_left_blocks_fill() {
constexpr inline T shift_left_blocks_fill() noexcept {
shift_left_blocks();
U |= ((-V + ~V) & V);
return U;
}

/// \return
constexpr inline T rev_gray_code() {
constexpr inline T rev_gray_code() noexcept {
U ^= ((U << 1) + ~V);
U &= V;
return U;
}
};


// -------------------------
};
16 changes: 9 additions & 7 deletions src/combination/colex.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,6 @@ class enumeration_colex {
/// i.e. 1111..100..00 (k high bits set)
/// Must have: 0 <= k <= n <= BITS_PER_LONG
constexpr static inline T last_comb() noexcept {
// Take the word returned by first_comb(k) and shift it up by (n - k) positions.
// NOTE(review): presumably first_comb(k) has the k lowest bits set, which
// would make this equal to ((1UL<<k)-1) << (n - k) — confirm against
// first_comb(), in particular for k == BITS_PER_LONG.
return first_comb(k) << (n - k);
}

Expand Down Expand Up @@ -64,7 +62,7 @@ class enumeration_colex {
///.
/// based on code by Doug Moore / Glenn Rhoads
/// note: might want to use bitscan near end
constexpr static inline T next_colex_comb(T x) noexcept {
constexpr static inline T next(T x) noexcept {
T r = x & -x;// lowest set bit
x += r; // replace lowest block by a one left to it

Expand All @@ -78,22 +76,26 @@ class enumeration_colex {
}

// Inverse of next(T x)
constexpr static inline T prev_colex_comb(T x) noexcept {
/// Inverse of next(T): steps from the given word back to the one before it.
/// \param x[in]: current word
/// \return the word preceding x; a zero result of the inner step is
///         returned unchanged
constexpr static inline T prev(T x) noexcept {
	// run the successor step on the complemented word
	x = next(~x);
	// complement back, except when the step produced zero
	if (0 != x) { x = ~x; }
	return x;
}

public:
constexpr enumeration_colex() noexcept {};

///
constexpr inline T next() noexcept {
const T ret = val;
val = next_colex_comb(val);
val = next(val);
return ret;
}
}

///
constexpr inline T prev() noexcept {
const T ret = val;
val = prev_colex_comb(val);
val = prev(val);
return ret;
}
};
89 changes: 68 additions & 21 deletions src/combination/fibonacci_gray.h
Original file line number Diff line number Diff line change
@@ -1,25 +1,71 @@
#pragma once
#include <cstdint>



// Fibonacci Gray code with binary words.
template<typename T>
/// Fibonacci Gray code with binary words.
/// Example (n = 5)
/// 10000
/// 10001
/// 10101
/// 10100
/// 00100
/// 00101
/// 00001
/// 00000
/// 00010
/// 01010
/// 01000
/// 01001
template<typename T,
const uint32_t n>
class bit_fibgray {
public:
ulong x_; // current Fibonacci word
ulong k_; // aux
ulong fw_, lw_; // first and last Fibonacci word in Gray code
ulong mw_; // max(fw_, lw_)
ulong n_; // Number of bits
private:
T x_; // current Fibonacci word
T k_; // aux
T fw_, lw_; // first and last Fibonacci word in Gray code
T mw_; // max(fw_, lw_)

public:
explicit bit_fibgray(ulong n) {
n_ = n;
/// Converts a binary word into its radix(-2) representation.
/// \param x[in]: word in ordinary binary
/// \return (x + m) ^ m with the alternating mask m = ...10101010
static inline constexpr T bin2neg(T x) noexcept {
	// in radix 2 the mask reads ...10101010
	const T alternating = 0xaaaaaaaaaaaaaaaaUL;
	const T shifted = static_cast<T>(x + alternating);
	return static_cast<T>(shifted ^ alternating);
}

/// Converts a radix(-2) word back into ordinary binary; inverse of bin2neg().
/// Static because it reads no instance state, so it can be called without an
/// object and on const objects.
/// \param x[in]: word in radix(-2)
/// \return (x ^ m) - m with the alternating mask m = ...10101010
constexpr static inline T neg2bin(T x) noexcept {
	// in radix 2 the mask reads ...10101010
	const T m = 0xaaaaaaaaaaaaaaaaUL;
	// undo the two steps of bin2neg() in reverse order
	x ^= m;
	x -= m;
	return x;
}

/// Inverse of gray_code().
/// The returned value contains at each bit position the parity of all bits
/// of the input left from it (incl. itself).
/// \param x[in]: Gray-coded word
/// \return the decoded word
constexpr static inline T inverse_gray_code(T x) noexcept {
	// Fold with shifts 1, 2, 4, ... while the shift is smaller than the width
	// of T (8-bit bytes assumed). A fixed `x >> 32` step would be undefined
	// behaviour for 32-bit or narrower T; for 64-bit words this performs
	// exactly the shifts 1, 2, 4, 8, 16 and 32.
	// note: the individual steps can be reordered at will
	for (unsigned shift = 1u; shift < sizeof(T) * 8u; shift <<= 1u) {
		x ^= static_cast<T>(x >> shift);
	}
	return x;
}


public:
explicit bit_fibgray() noexcept {
fw_ = 0;
for (ulong m=(1UL<<(n-1)); m!=0; m>>=3) fw_ |= m;
for (T m=(1UL<<(n-1)); m!=0; m>>=3) fw_ |= m;
lw_ = fw_ >> 1;
if ( 0==(n&1) ) { ulong t=fw_; fw_=lw_; lw_=t; } // swap first/last
if ( 0==(n&1) ) { T t=fw_; fw_=lw_; lw_=t; } // swap first/last
mw_ = ( lw_>fw_ ? lw_ : fw_ );
x_ = fw_;

Expand All @@ -29,22 +75,23 @@ class bit_fibgray {

~bit_fibgray() { ; }

ulong data() const { return x_; }
constexpr inline T data() const noexcept { return x_; }

// Return next word in Gray code.
// Return ~0 if current word is the last one.
ulong next() {
constexpr T next() noexcept {
if ( x_ == lw_ ) return ~0UL;

ulong s = n_; // shift
T s = n; // shift
while(1) {
--s;
ulong c = 1 | (mw_ >> s); // possible difference for negbin word
ulong i = k_ - c;
ulong x = bin2neg(i);
T c = 1 | (mw_ >> s); // possible difference for negbin word
T i = k_ - c;
T x = bin2neg(i);
x ^= (x>>1);

if ( 0==(x&(x>>1))) { // is_fibrep(x)
// is_fibrep(x)
if ( 0==(x&(x>>1))) {
k_ = i;
x_ = x;
return x;
Expand Down
Loading

0 comments on commit dc9cfbe

Please sign in to comment.