gitbak

FloydZ · Feb 27, 2025 · dc9cfbe · dc9cfbe
1 parent 8529d46
commit dc9cfbe
Show file tree

Hide file tree

Showing 13 changed files with 281 additions and 157 deletions.
diff --git a/src/algorithm/histogram.h b/src/algorithm/histogram.h
@@ -33,9 +33,6 @@ constexpr static uint32_t histogram_csize = 256;
 constexpr static void avx512_histogram_u8_1x(uint32_t cnt[256],
 									  		 const uint8_t *__restrict in,
 									  		 const size_t inlen) noexcept {
-	// cryptanalysislib::template memset<uint32_t>(cnt, 0u, 256u);
-	// const __m512i acc = _mm512_set1_epi32(1);
-
 	uint32_t tmp1[16] __attribute__((aligned(64))) = {0};
 
 	size_t i = 0;
@@ -56,7 +53,7 @@ constexpr static void avx512_histogram_u8_1x(uint32_t cnt[256],
 		}
 	}
 
-	// tailmng
+	// tail management: count the remaining bytes one at a time
 	for (; i < inlen; ++i) {
 		cnt[in[i]]++;
 	}
@@ -101,6 +98,7 @@ static void avx512_histogram_u32_v3(uint32_t C[256],
 
 /// using popcnt
 /// NOTE: inputs are uint32_t: with values < 2**8
+/// TODO: no tail management
 /// \param C
 /// \param A
 /// \param size
@@ -121,12 +119,8 @@ static void avx512_histogram_u32_v4(uint32_t C[256],
 
 
 ///
-static inline 
-void FA(__m512i& h, __m512i& l, __m512i a, __m512i b, __m512i c) {
-    //__m512i tmp = _mm512_ternarylogic_epi32(c, b, a, 0x96);
-    //h = _mm512_ternarylogic_epi32(c, b, a, 0xE8);    
-    //l = tmp;
-
+/// Bitwise full adder on three 512-bit vectors (one carry-save step).
+/// \param h[out]: carry bits, i.e. the bitwise majority of a, b and c
+/// \param l[out]: sum bits, i.e. a ^ b ^ c
+/// \param a[in]: first addend
+/// \param b[in]: second addend
+/// \param c[in]: third addend
+constexpr static inline
+void FA(__m512i& h, __m512i& l, __m512i a, __m512i b, __m512i c) noexcept {
+    // 0x96 is the truth table of the three-input XOR.
     l = _mm512_ternarylogic_epi32(c, b, a, 0x96);
+    // 0x8E is the truth table mapping (l, b, a) to majority(a, b, c): l
+    // already encodes c, so the carry is derived from the fresh sum bits.
     h = _mm512_ternarylogic_epi32(l, b, a, 0x8E);
 }
@@ -220,7 +214,13 @@ static void consume_buffer_2(uint8_t* data, size_t N, uint16_t* hist16) {
     _mm512_storeu_epi16(hist16 + 32, _mm512_add_epi16(h1, _mm512_loadu_epi16(hist16 + 32)));
 }
 
-void hist256_2(uint8_t* ptr, size_t N, uint32_t* histogram) {
+/// \param histogram[out]:
+/// \param ptr[in]: uint8_t input data
+/// \param N[in]: array size
+constexpr static
+void hist256_2(uint32_t* histogram,
+               const uint8_t* ptr,
+               const size_t N) noexcept {
     // Scalar loop to align input pointer.
     if (N >= 64) {
         uint8_t* end = ptr + N;
@@ -232,7 +232,6 @@ void hist256_2(uint8_t* ptr, size_t N, uint32_t* histogram) {
 
     // Input bytes are binned into buffers; 0 for 0-63, 1 for 64-127, 2 for 128-191, 3 for 192-255.
     const size_t bufsize = 1024 * 16;
-    //  = (uint8_t*)_aligned_malloc(bufsize * 4, 64);
     uint8_t  buffer0[bufsize*4] __attribute__((aligned(64)));
     uint8_t *buffer1 = buffer0 + bufsize;
     uint8_t *buffer2 = buffer1 + bufsize;
@@ -300,8 +299,6 @@ void hist256_2(uint8_t* ptr, size_t N, uint32_t* histogram) {
         }
     }
 
-    // _aligned_free(buffer0);
-
     // Scalar loop to deal with any remaining input.
     while (N) {
         histogram[ptr[--N]] += 1;

diff --git a/src/algorithm/zip.h b/src/algorithm/zip.h
@@ -80,8 +80,8 @@ static inline void zip_u8(__m256i *__restrict__ out1,
 	const __m256i b = _mm256_loadu_si256(in2);
 	const __m256i tmp1 = _mm256_unpacklo_epi8(a, b);
 	*out2 = _mm256_unpackhi_epi8(a, b);
-	*out1 = _mm256_permute2x128_si256(tmp1, *out2, 0b1000);
-	*out2 = _mm256_permute2x128_si256(tmp1, *out2, 0b01);
+	*out1 = _mm256_permute2x128_si256(tmp1, *out2, 0x20);
+	*out2 = _mm256_permute2x128_si256(tmp1, *out2, 0x31);
 }
 
 /// \param out1 lower part: [x1y1, x2y2, ..., x8y8]
@@ -96,8 +96,8 @@ static inline void zip_u16(__m256i *__restrict__ out1,
 	const __m256i b = _mm256_loadu_si256(in2);
 	const __m256i tmp1 = _mm256_unpacklo_epi16(a, b);
 	*out2 = _mm256_unpackhi_epi16(a, b);
-	*out1 = _mm256_permute2x128_si256(tmp1, *out2, 0b1000);
-	*out2 = _mm256_permute2x128_si256(tmp1, *out2, 0b01);
+	*out1 = _mm256_permute2x128_si256(tmp1, *out2, 0x20);
+	*out2 = _mm256_permute2x128_si256(tmp1, *out2, 0x31);
 }
 
 /// \param out1 lower part: [x1y1, x2y2, ..., x4y4]
@@ -112,8 +112,8 @@ static inline void zip_u32(__m256i *__restrict__ out1,
 	const __m256i b = _mm256_loadu_si256(in2);
 	const __m256i tmp1 = _mm256_unpacklo_epi32(a, b);
 	*out2 = _mm256_unpackhi_epi32(a, b);
-	*out1 = _mm256_permute2x128_si256(tmp1, *out2, 0b1000);
-	*out2 = _mm256_permute2x128_si256(tmp1, *out2, 0b01);
+	*out1 = _mm256_permute2x128_si256(tmp1, *out2, 0x20);
+	*out2 = _mm256_permute2x128_si256(tmp1, *out2, 0x31);
 }
 
 /// \param out1 lower part: [x1y1, x2y2]
@@ -128,8 +128,8 @@ static inline void zip_u64(__m256i *__restrict__ out1,
 	const __m256i b = _mm256_loadu_si256(in2);
 	const __m256i tmp1 = _mm256_unpacklo_epi64(a, b);
 	*out2 = _mm256_unpackhi_epi64(a, b);
-	*out1 = _mm256_permute2x128_si256(tmp1, *out2, 0b1000);
-	*out2 = _mm256_permute2x128_si256(tmp1, *out2, 0b01);
+	*out1 = _mm256_permute2x128_si256(tmp1, *out2, 0x20);
+	*out2 = _mm256_permute2x128_si256(tmp1, *out2, 0x31);
 }
 
 /// \param out
@@ -140,12 +140,12 @@ static inline void zip_u8(uint16_t *__restrict__ out,
 						  const uint8_t *__restrict__ in1,
 						  const uint8_t *__restrict__ in2,
 						  const size_t n) {
-	for (size_t i = 0; (i+32) <= n; i += 32) {
-		zip_u8((__m256i *)out, (__m256i *)(out + 16), (__m256i *)in1, (__m256i *)in2);
-		in1 += 32; in2 += 32; out += 16;
+	size_t i = 0;
+	for (; (i+32) <= n; i += 32) {
+		zip_u8((__m256i *)(out + i), (__m256i *)(out + 16 + i), (__m256i *)(in1+i), (__m256i *)(in2 + i));
 	}
 
-	for (size_t i = 0; i < n; i++) {
+	for (; i < n; i++) {
 		const uint16_t t = (uint16_t)(in1[i]) | (((uint16_t)(in2[i])) << 8u);
 		out[i] = t;
 	}

diff --git a/src/combination/bit_subset.h b/src/combination/bit_subset.h
@@ -2,7 +2,7 @@
 
 #include <cstdint>
 
-// Generate all all subsets of bits of  a given word.
+// Generate all subsets of bits of a given word.
 //
 // E.g., for the word ('.' printed for unset bits)
 //   ...11.1.
@@ -19,7 +19,7 @@ template<typename T = uint64_t>
 class bit_subset_T {
 protected:
 	T U;// current subset
-	T V;// the full set
+	const T V;// the full set
 
 public:
 	explicit bit_subset_T(T v) : U(0), V(v) { ; }
@@ -47,32 +47,6 @@ class bit_subset_T {
 		return U;
 	}
 
-    /// \return
-	constexpr inline T first(const T v) noexcept {
-		V = v;
-		U = 0;
-		return U;
-	}
-
-    /// \return
-	constexpr inline T first() noexcept {
-		first(V);
-		return U;
-	}
-
-    /// \return
-	constexpr inline T last(T v) noexcept {
-		V = v;
-		U = v;
-		return U;
-	}
-
-    /// \return
-	constexpr inline T last() noexcept {
-		last(V);
-		return U;
-	}
-
     /// \return
 	constexpr inline void set(T u) noexcept {
 		U = u & V;
@@ -115,38 +89,35 @@ class bit_subset_T {
 	}
 
     /// \return
-	constexpr inline T shift_left() {
+	constexpr inline T shift_left() noexcept {
 		U = ((U << 1) + ~V) & V;
 		return U;
 	}
 
     /// \return
-	constexpr inline T shift_left_fill() {
+	constexpr inline T shift_left_fill() noexcept {
 		shift_left();
 		next();
 		return U;
 	}
 
     /// \return
-	constexpr inline T shift_left_blocks() {
+	constexpr inline T shift_left_blocks() noexcept {
 		U = (U << 1) & V;
 		return U;
 	}
 
     /// \return
-	constexpr inline T shift_left_blocks_fill() {
+	constexpr inline T shift_left_blocks_fill() noexcept {
 		shift_left_blocks();
 		U |= ((-V + ~V) & V);
 		return U;
 	}
 
     /// \return
-	constexpr inline T rev_gray_code() {
+	constexpr inline T rev_gray_code() noexcept {
 		U ^= ((U << 1) + ~V);
 		U &= V;
 		return U;
 	}
-};
-
-
-// -------------------------
+};
diff --git a/src/combination/colex.h b/src/combination/colex.h
@@ -33,8 +33,6 @@ class enumeration_colex {
 	/// i.e.  1111..100..00 (k high bits set)
 	/// Must have:  0 <= k <= n <= BITS_PER_LONG
 	constexpr static inline T last_comb() noexcept {
-		//    if ( BITS_PER_LONG == k )  return  ~0UL;
-		//    else return  ((1UL<<k)-1) << (n - k);
 		return first_comb(k) << (n - k);
 	}
 
@@ -64,7 +62,7 @@ class enumeration_colex {
 	///.
 	/// based on code by Doug Moore / Glenn Rhoads
 	/// note: might want to use bitscan near end
-	constexpr static inline T next_colex_comb(T x) noexcept {
+	constexpr static inline T next(T x) noexcept {
 		T r = x & -x;// lowest set bit
 		x += r;          // replace lowest block by a one left to it
 
@@ -78,22 +76,26 @@ class enumeration_colex {
 	}
 
 	// Inverse of next(T x)
-	constexpr static inline T prev_colex_comb(T x) noexcept {
+	/// Inverse step of next(T): runs next(T) on the complement of x and
+	/// complements the result back.
+	/// \param x current combination
+	/// \return ~next(~x), or 0 if next(~x) returned 0
+	constexpr static inline T prev(T x) noexcept {
-		x = next_colex_comb(~x);
+		x = next(~x);
+		// a zero result is returned unchanged rather than complemented to
+		// all ones (presumably 0 marks "no further combination" -- confirm
+		// against next(T))
 		if (0 != x) x = ~x;
 		return x;
 	}
 
 public:
     constexpr enumeration_colex() noexcept {};
+
+	/// Returns the current combination and then replaces `val` by
+	/// next(val).
+	/// \return the value `val` held before the step
     constexpr inline T next() noexcept {
         const T ret = val;
-        val = next_colex_comb(val);
+        val = next(val);
         return ret;
-    } 
+    }
+
+	/// Returns the current combination and then replaces `val` by
+	/// prev(val).
+	/// \return the value `val` held before the step
     constexpr inline T prev() noexcept {
         const T ret = val;
-        val = prev_colex_comb(val);
+        val = prev(val);
         return ret;
     } 
 };
diff --git a/src/combination/fibonacci_gray.h b/src/combination/fibonacci_gray.h
@@ -1,25 +1,71 @@
 #pragma once 
+#include <cstdint>
 
 
-
-// Fibonacci Gray code with binary words.
-template<typename T>
+/// Fibonacci Gray code with binary words.
+/// Example (n = 5)
+/// 10000
+/// 10001
+/// 10101
+/// 10100
+/// 00100
+/// 00101
+/// 00001
+/// 00000
+/// 00010
+/// 01010
+/// 01000
+/// 01001
+template<typename T,
+          const uint32_t n>
 class bit_fibgray {
-public:
-    ulong x_;  // current Fibonacci word
-    ulong k_;  // aux
-    ulong fw_, lw_;  // first and last Fibonacci word in Gray code
-    ulong mw_;  // max(fw_, lw_)
-    ulong n_;   // Number of bits
+private:
+    T x_;  // current Fibonacci word
+    T k_;  // aux
+    T fw_, lw_;  // first and last Fibonacci word in Gray code
+    T mw_;  // max(fw_, lw_)
 
-public:
-    explicit bit_fibgray(ulong n) {
-        n_ = n;
+	// binary --> radix(-2)
+	static inline constexpr T bin2neg(T x) noexcept {
+		// mask in radix 2 is ...10101010
+		const T m = 0xaaaaaaaaaaaaaaaaUL;
+		x += m;
+		x ^= m;
+		return  x;
+	}
+
+	/// radix(-2) --> binary
+	/// inverse of bin2neg()
+	/// \param x word in radix(-2) representation
+	/// \return (x ^ m) - m, with m the mask ...10101010
+	static inline constexpr T neg2bin(T x) noexcept {
+		// mask in radix 2 is ...10101010 (the same mask bin2neg() uses)
+		const T m = 0xaaaaaaaaaaaaaaaaUL;
+		x ^= m;
+		x -= m;
+		return  x;
+	}
 
+	/// inverse of gray_code()
+	/// note: the returned value contains at each bit position
+	/// the parity of all bits of the input left from it (incl. itself)
+	/// \param x Gray code word
+	/// \return x xor-ed with itself shifted right by 1, 2, 4, ... bits
+	constexpr static inline T inverse_gray_code(T x) noexcept {
+		// The doubling shifts stop below the width of T, so the shift count
+		// never reaches the operand width (undefined behaviour for 32 bit
+		// and narrower T). For 64 bit words this performs exactly the
+		// shifts 1, 2, 4, 8, 16 and 32.
+		for (uint32_t s = 1; s < 8u * sizeof(T); s <<= 1u) {
+			x ^= static_cast<T>(x >> s);
+		}
+		return  x;
+	}
+
+
+public:
+    explicit bit_fibgray() noexcept {
         fw_ = 0;
-        for (ulong m=(1UL<<(n-1)); m!=0; m>>=3)  fw_ |= m;
+        for (T m=(1UL<<(n-1)); m!=0; m>>=3)  fw_ |= m;
         lw_ = fw_ >> 1;
-        if ( 0==(n&1) )  { ulong t=fw_; fw_=lw_; lw_=t; }  // swap first/last
+        if ( 0==(n&1) )  { T t=fw_; fw_=lw_; lw_=t; }  // swap first/last
         mw_ = ( lw_>fw_ ? lw_ : fw_ );
         x_ = fw_;
 
@@ -29,22 +75,23 @@ class bit_fibgray {
 
     ~bit_fibgray()  { ; }
 
-    ulong data()  const  { return x_; }
+    constexpr inline T data() const noexcept { return x_; }
 
     // Return next word in Gray code.
     // Return ~0 if current word is the last one.
-    ulong next() {
+    constexpr T next() noexcept {
         if ( x_ == lw_ )  return ~0UL;
 
-        ulong s = n_;  // shift
+        T s = n;  // shift
         while(1) {
             --s;
-            ulong c = 1 | (mw_ >> s);  // possible difference for negbin word
-            ulong i = k_ - c;
-            ulong x = bin2neg(i);
+            T c = 1 | (mw_ >> s);  // possible difference for negbin word
+            T i = k_ - c;
+            T x = bin2neg(i);
             x ^= (x>>1);
 
-            if ( 0==(x&(x>>1))) {  // is_fibrep(x)
+			// is_fibrep(x)
+            if ( 0==(x&(x>>1))) {
                 k_ = i;
                 x_ = x;
                 return x;