diff --git a/RakeDiagSearch/RakeDiagSearch/Makefile b/RakeDiagSearch/RakeDiagSearch/Makefile index cd76cea..13da121 100644 --- a/RakeDiagSearch/RakeDiagSearch/Makefile +++ b/RakeDiagSearch/RakeDiagSearch/Makefile @@ -4,7 +4,7 @@ BOINC_LIB_DIR = $(BOINC_DIR)/lib CXX = g++ -CXXFLAGS += -O3 -ftree-vectorize -std=c++11 -static-libgcc -static-libstdc++ \ +CXXFLAGS += -O3 -g -ftree-vectorize -std=c++11 -static-libgcc -static-libstdc++ \ -I$(BOINC_DIR) \ -I$(BOINC_LIB_DIR) \ -I$(BOINC_API_DIR) \ diff --git a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp index 1dec526..6178344 100644 --- a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp +++ b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp @@ -89,7 +89,7 @@ MovePairSearch::MovePairSearch() // Initialize mask4to1bits lookup table void MovePairSearch::InitMask4to1bits() { -#if defined(__SSE2__) && (!defined(__AVX2__) || defined(DISABLE_PEXT)) +#if defined(__AVX2__) && defined(DISABLE_PEXT) memset(mask4to1bits, 0, sizeof(mask4to1bits)); mask4to1bits[0x0000] = 0; mask4to1bits[0x000f] = 1; @@ -332,11 +332,17 @@ void MovePairSearch::OnSquareGenerated(Square newSquare) { squareA[i][j] = newSquare.Matrix[i][j]; squareA_Mask[i][j] = 1u << newSquare.Matrix[i][j]; + } + } #if defined (__SSE2__) || defined(__ARM_NEON) - squareA_MaskT[j][i] = squareA_Mask[i][j]; -#endif + for (int i = 0; i < Rank - 1; i++) + { + for (int j = 0; j < Rank; j++) + { + squareA_MaskT[j][i] = squareA_Mask[i + 1][j]; } } +#endif // Start the rows permutation MoveRows(); @@ -436,15 +442,8 @@ void MovePairSearch::MoveRows() #ifdef __ARM_NEON // Set the powers of 2 const uint32_t powersOf2[8] = { 1, 2, 4, 8, 16, 32, 64, 128 }; -#ifdef __aarch64__ const uint32x4_t vPowersOf2Lo = vld1q_u32(powersOf2); const uint32x4_t vPowersOf2Hi = vld1q_u32(powersOf2+4); -#else - const uint32x2_t vPowersOf2_1 = vld1_u32(powersOf2); - const uint32x2_t vPowersOf2_2 = vld1_u32(powersOf2+2); - const uint32x2_t vPowersOf2_3 = vld1_u32(powersOf2+4); - const uint32x2_t vPowersOf2_4 = vld1_u32(powersOf2+6); -#endif #endif while (1) @@ -515,8 +514,8 @@ void MovePairSearch::MoveRows() // load bitmasks for columns which will be on diagonals // for performance reasons load this as a row from transposed square // also excluse 0th element, row 0 has fixed position in square - __m256i vCol1 = _mm256_loadu_si256((const __m256i*)&squareA_MaskT[currentRowId][1]); - __m256i vCol2 = _mm256_loadu_si256((const __m256i*)&squareA_MaskT[Rank - 1 - currentRowId][1]); + __m256i vCol1 = _mm256_loadu_si256((const __m256i*)&squareA_MaskT[currentRowId][0]); + __m256i vCol2 = _mm256_loadu_si256((const __m256i*)&squareA_MaskT[Rank - 1 - currentRowId][0]); // AND loaded values with diagnonal masks __m256i vDiagMask1 = _mm256_set1_epi32(diagonalValues1); @@ -555,10 +554,10 @@ void MovePairSearch::MoveRows() // load bitmasks for columns which will be on diagonals // for performance reasons load this as a row from transposed square // also excluse 0th element, row 0 has fixed position in square - __m128i vCol1a = _mm_loadu_si128((const __m128i*)&squareA_MaskT[currentRowId][1]); - __m128i vCol1b = _mm_loadu_si128((const __m128i*)&squareA_MaskT[currentRowId][5]); - __m128i vCol2a = _mm_loadu_si128((const __m128i*)&squareA_MaskT[Rank - 1 - currentRowId][1]); - __m128i vCol2b = _mm_loadu_si128((const __m128i*)&squareA_MaskT[Rank - 1 - currentRowId][5]); + __m128i vCol1a = _mm_loadu_si128((const __m128i*)&squareA_MaskT[currentRowId][0]); + __m128i vCol1b = _mm_loadu_si128((const __m128i*)&squareA_MaskT[currentRowId][4]); + __m128i vCol2a = _mm_loadu_si128((const __m128i*)&squareA_MaskT[Rank - 1 - currentRowId][0]); + __m128i vCol2b = _mm_loadu_si128((const __m128i*)&squareA_MaskT[Rank - 1 - currentRowId][4]); // AND loaded values with diagnonal masks __m128i vDiagMask1 = _mm_set1_epi32(diagonalValues1); @@ -574,26 +573,26 @@ void MovePairSearch::MoveRows() vCol1a = _mm_or_si128(vCol1a, vCol2a); vCol1b = _mm_or_si128(vCol1b, vCol2b); + // Saturate_Int32_To_Int8() + __m128i vColpack = _mm_packs_epi32(vCol1a, vCol1b); + vColpack = _mm_packs_epi16(vColpack, _mm_setzero_si128()); + // check if result is zero - vCol1a = _mm_cmpeq_epi32(vCol1a, _mm_setzero_si128()); - vCol1b = _mm_cmpeq_epi32(vCol1b, _mm_setzero_si128()); + __m128i vColzeros = _mm_cmpeq_epi8(vColpack, _mm_setzero_si128()); + // create mask from vector - // there are 4 bits per result, so we need to extract every 4th one - int mask1 = _mm_movemask_epi8(vCol1a); - int mask2 = _mm_movemask_epi8(vCol1b); - int mask = mask4to1bits[mask1] | (mask4to1bits[mask2] << 4); + int mask = _mm_movemask_epi8(vColzeros); // add one bit for 0th row, and AND result with rowsUsage rowCandidates = (mask << 1) & rowsUsage; #elif defined(__ARM_NEON) -#ifdef __aarch64__ // load bitmasks for columns which will be on diagonals // for performance reasons load this as a row from transposed square // also excluse 0th element, row 0 has fixed position in square - uint32x4_t vCol1a = vld1q_u32((const uint32_t*)&squareA_MaskT[currentRowId][1]); - uint32x4_t vCol1b = vld1q_u32((const uint32_t*)&squareA_MaskT[currentRowId][5]); - uint32x4_t vCol2a = vld1q_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][1]); - uint32x4_t vCol2b = vld1q_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][5]); + uint32x4_t vCol1a = vld1q_u32((const uint32_t*)&squareA_MaskT[currentRowId][0]); + uint32x4_t vCol1b = vld1q_u32((const uint32_t*)&squareA_MaskT[currentRowId][4]); + uint32x4_t vCol2a = vld1q_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][0]); + uint32x4_t vCol2b = vld1q_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][4]); // AND loaded values with diagnonal masks uint32x4_t vDiagMask1 = vdupq_n_u32(diagonalValues1); @@ -615,62 +614,16 @@ void MovePairSearch::MoveRows() // create mask from vector uint32x4_t v = vorrq_u32(vandq_u32(vCol1a, vPowersOf2Lo), vandq_u32(vCol1b, vPowersOf2Hi)); +#ifdef __aarch64__ uint32_t mask = vaddvq_u64(vpaddlq_u32(v)); +#else + uint32x2_t s = vmovn_u64(vpaddlq_u32(v)); + uint32_t mask = s[0] + s[1]; +#endif // add one bit for 0th row, and AND result with rowsUsage rowCandidates = (mask << 1) & rowsUsage; -#else // !__aarch64__ - // load bitmasks for columns which will be on diagonals - // for performance reasons load this as a row from transposed square - // also excluse 0th element, row 0 has fixed position in square - uint32x2_t vCol1a = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][1]); - uint32x2_t vCol1b = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][3]); - uint32x2_t vCol1c = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][5]); - uint32x2_t vCol1d = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][7]); - - uint32x2_t vCol2a = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][1]); - uint32x2_t vCol2b = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][3]); - uint32x2_t vCol2c = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][5]); - uint32x2_t vCol2d = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][7]); - - // AND loaded values with diagnonal masks - uint32x2_t vDiagMask1 = vdup_n_u32(diagonalValues1); - uint32x2_t vDiagMask2 = vdup_n_u32(diagonalValues2); - - vCol1a = vand_u32(vCol1a, vDiagMask1); - vCol1b = vand_u32(vCol1b, vDiagMask1); - vCol1c = vand_u32(vCol1c, vDiagMask1); - vCol1d = vand_u32(vCol1d, vDiagMask1); - - vCol2a = vand_u32(vCol2a, vDiagMask2); - vCol2b = vand_u32(vCol2b, vDiagMask2); - vCol2c = vand_u32(vCol2c, vDiagMask2); - vCol2d = vand_u32(vCol2d, vDiagMask2); - - // non-zero means that number is duplicated, zero means that it is unique - // OR these values together first - vCol1a = vorr_u32(vCol1a, vCol2a); - vCol1b = vorr_u32(vCol1b, vCol2b); - vCol1c = vorr_u32(vCol1c, vCol2c); - vCol1d = vorr_u32(vCol1d, vCol2d); - - // check if result is zero - vCol1a = vceq_u32(vCol1a, vdup_n_u32(0)); - vCol1b = vceq_u32(vCol1b, vdup_n_u32(0)); - vCol1c = vceq_u32(vCol1c, vdup_n_u32(0)); - vCol1d = vceq_u32(vCol1d, vdup_n_u32(0)); - - // create mask from vector - uint32x2_t v = vorr_u32( - vorr_u32(vand_u32(vCol1a, vPowersOf2_1), vand_u32(vCol1b, vPowersOf2_2)), - vorr_u32(vand_u32(vCol1c, vPowersOf2_3), vand_u32(vCol1d, vPowersOf2_4))); - //uint32_t mask = vaddv_u32(v); - uint32_t mask = v[0] + v[1]; - - // add one bit for 0th row, and AND result with rowsUsage - rowCandidates = (mask << 1) & rowsUsage; -#endif -#endif // AVX2/SSE2 +#endif // AVX2/SSE2/NEON } } } diff --git a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h index 91ab4e0..eb49641 100644 --- a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h +++ b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h @@ -43,7 +43,7 @@ class MovePairSearch int squareB[Rank][Rank]; // Generated DLS, the rows inside which will be permuted int squareA_Mask[Rank][Rank]; // Bitmasks for values in squareA #if defined (__SSE2__) || defined(__ARM_NEON) - int squareA_MaskT[Rank][Rank]; // Transposed copy of squareA_Mask + int squareA_MaskT[Rank][Rank - 1]; // Transposed copy of squareA_Mask #endif int rowsHistory[Rank]; // Array of the history of rows usage; rowsHistory[number of the row][value] = 0 | 1, where 0 means the row with the number "value" has been used for the row "number of the row" of the generated square; 1 - the row can be used. int currentSquareRows[Rank]; // Array listing the current rows used in the square. The number of the used row is at the i-th position @@ -67,7 +67,7 @@ class MovePairSearch string moveSearchComponentHeader; // Header preceding the data about the state of the component of rows permutation static const bool isDebug = false; // Flag of displaying debug information -#if defined(__SSE2__) && (!defined(__AVX2__) || defined(DISABLE_PEXT)) +#if defined(__AVX2__) && defined(DISABLE_PEXT) unsigned char mask4to1bits[0x10000]; // Lookup table to map 4 bit packs returned by movemask to 1 bit #endif };