Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SSE/AVX code optimization #2

Open
wants to merge 5 commits into
base: boinc
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion RakeDiagSearch/RakeDiagSearch/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ BOINC_LIB_DIR = $(BOINC_DIR)/lib

CXX = g++

CXXFLAGS += -O3 -ftree-vectorize -std=c++11 -static-libgcc -static-libstdc++ \
CXXFLAGS += -O3 -g -ftree-vectorize -std=c++11 -static-libgcc -static-libstdc++ \
-I$(BOINC_DIR) \
-I$(BOINC_LIB_DIR) \
-I$(BOINC_API_DIR) \
Expand Down
111 changes: 32 additions & 79 deletions RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ MovePairSearch::MovePairSearch()
// Initialize mask4to1bits lookup table
void MovePairSearch::InitMask4to1bits()
{
#if defined(__SSE2__) && (!defined(__AVX2__) || defined(DISABLE_PEXT))
#if defined(__AVX2__) && defined(DISABLE_PEXT)
memset(mask4to1bits, 0, sizeof(mask4to1bits));
mask4to1bits[0x0000] = 0;
mask4to1bits[0x000f] = 1;
Expand Down Expand Up @@ -332,11 +332,17 @@ void MovePairSearch::OnSquareGenerated(Square newSquare)
{
squareA[i][j] = newSquare.Matrix[i][j];
squareA_Mask[i][j] = 1u << newSquare.Matrix[i][j];
}
}
#if defined (__SSE2__) || defined(__ARM_NEON)
squareA_MaskT[j][i] = squareA_Mask[i][j];
#endif
for (int i = 0; i < Rank - 1; i++)
{
for (int j = 0; j < Rank; j++)
{
squareA_MaskT[j][i] = squareA_Mask[i + 1][j];
}
}
#endif

// Start the rows permutation
MoveRows();
Expand Down Expand Up @@ -436,15 +442,8 @@ void MovePairSearch::MoveRows()
#ifdef __ARM_NEON
// Set the powers of 2
const uint32_t powersOf2[8] = { 1, 2, 4, 8, 16, 32, 64, 128 };
#ifdef __aarch64__
const uint32x4_t vPowersOf2Lo = vld1q_u32(powersOf2);
const uint32x4_t vPowersOf2Hi = vld1q_u32(powersOf2+4);
#else
const uint32x2_t vPowersOf2_1 = vld1_u32(powersOf2);
const uint32x2_t vPowersOf2_2 = vld1_u32(powersOf2+2);
const uint32x2_t vPowersOf2_3 = vld1_u32(powersOf2+4);
const uint32x2_t vPowersOf2_4 = vld1_u32(powersOf2+6);
#endif
#endif

while (1)
Expand Down Expand Up @@ -515,8 +514,8 @@ void MovePairSearch::MoveRows()
// load bitmasks for columns which will be on diagonals
// for performance reasons load this as a row from transposed square
// also exclude 0th element, row 0 has fixed position in square
__m256i vCol1 = _mm256_loadu_si256((const __m256i*)&squareA_MaskT[currentRowId][1]);
__m256i vCol2 = _mm256_loadu_si256((const __m256i*)&squareA_MaskT[Rank - 1 - currentRowId][1]);
__m256i vCol1 = _mm256_loadu_si256((const __m256i*)&squareA_MaskT[currentRowId][0]);
__m256i vCol2 = _mm256_loadu_si256((const __m256i*)&squareA_MaskT[Rank - 1 - currentRowId][0]);

// AND loaded values with diagonal masks
__m256i vDiagMask1 = _mm256_set1_epi32(diagonalValues1);
Expand Down Expand Up @@ -555,10 +554,10 @@ void MovePairSearch::MoveRows()
// load bitmasks for columns which will be on diagonals
// for performance reasons load this as a row from transposed square
// also exclude 0th element, row 0 has fixed position in square
__m128i vCol1a = _mm_loadu_si128((const __m128i*)&squareA_MaskT[currentRowId][1]);
__m128i vCol1b = _mm_loadu_si128((const __m128i*)&squareA_MaskT[currentRowId][5]);
__m128i vCol2a = _mm_loadu_si128((const __m128i*)&squareA_MaskT[Rank - 1 - currentRowId][1]);
__m128i vCol2b = _mm_loadu_si128((const __m128i*)&squareA_MaskT[Rank - 1 - currentRowId][5]);
__m128i vCol1a = _mm_loadu_si128((const __m128i*)&squareA_MaskT[currentRowId][0]);
__m128i vCol1b = _mm_loadu_si128((const __m128i*)&squareA_MaskT[currentRowId][4]);
__m128i vCol2a = _mm_loadu_si128((const __m128i*)&squareA_MaskT[Rank - 1 - currentRowId][0]);
__m128i vCol2b = _mm_loadu_si128((const __m128i*)&squareA_MaskT[Rank - 1 - currentRowId][4]);

// AND loaded values with diagonal masks
__m128i vDiagMask1 = _mm_set1_epi32(diagonalValues1);
Expand All @@ -574,26 +573,26 @@ void MovePairSearch::MoveRows()
vCol1a = _mm_or_si128(vCol1a, vCol2a);
vCol1b = _mm_or_si128(vCol1b, vCol2b);

// Saturate_Int32_To_Int8()
__m128i vColpack = _mm_packs_epi32(vCol1a, vCol1b);
vColpack = _mm_packs_epi16(vColpack, _mm_setzero_si128());

// check if result is zero
vCol1a = _mm_cmpeq_epi32(vCol1a, _mm_setzero_si128());
vCol1b = _mm_cmpeq_epi32(vCol1b, _mm_setzero_si128());
__m128i vColzeros = _mm_cmpeq_epi8(vColpack, _mm_setzero_si128());

// create mask from vector
// there are 4 bits per result, so we need to extract every 4th one
int mask1 = _mm_movemask_epi8(vCol1a);
int mask2 = _mm_movemask_epi8(vCol1b);
int mask = mask4to1bits[mask1] | (mask4to1bits[mask2] << 4);
int mask = _mm_movemask_epi8(vColzeros);

// add one bit for 0th row, and AND result with rowsUsage
rowCandidates = (mask << 1) & rowsUsage;
#elif defined(__ARM_NEON)
#ifdef __aarch64__
// load bitmasks for columns which will be on diagonals
// for performance reasons load this as a row from transposed square
// also exclude 0th element, row 0 has fixed position in square
uint32x4_t vCol1a = vld1q_u32((const uint32_t*)&squareA_MaskT[currentRowId][1]);
uint32x4_t vCol1b = vld1q_u32((const uint32_t*)&squareA_MaskT[currentRowId][5]);
uint32x4_t vCol2a = vld1q_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][1]);
uint32x4_t vCol2b = vld1q_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][5]);
uint32x4_t vCol1a = vld1q_u32((const uint32_t*)&squareA_MaskT[currentRowId][0]);
uint32x4_t vCol1b = vld1q_u32((const uint32_t*)&squareA_MaskT[currentRowId][4]);
uint32x4_t vCol2a = vld1q_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][0]);
uint32x4_t vCol2b = vld1q_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][4]);

// AND loaded values with diagonal masks
uint32x4_t vDiagMask1 = vdupq_n_u32(diagonalValues1);
Expand All @@ -615,62 +614,16 @@ void MovePairSearch::MoveRows()

// create mask from vector
uint32x4_t v = vorrq_u32(vandq_u32(vCol1a, vPowersOf2Lo), vandq_u32(vCol1b, vPowersOf2Hi));
#ifdef __aarch64__
uint32_t mask = vaddvq_u64(vpaddlq_u32(v));
#else
uint32x2_t s = vmovn_u64(vpaddlq_u32(v));
uint32_t mask = s[0] + s[1];
#endif

// add one bit for 0th row, and AND result with rowsUsage
rowCandidates = (mask << 1) & rowsUsage;
#else // !__aarch64__
// load bitmasks for columns which will be on diagonals
// for performance reasons load this as a row from transposed square
// also exclude 0th element, row 0 has fixed position in square
uint32x2_t vCol1a = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][1]);
uint32x2_t vCol1b = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][3]);
uint32x2_t vCol1c = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][5]);
uint32x2_t vCol1d = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][7]);

uint32x2_t vCol2a = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][1]);
uint32x2_t vCol2b = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][3]);
uint32x2_t vCol2c = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][5]);
uint32x2_t vCol2d = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][7]);

// AND loaded values with diagonal masks
uint32x2_t vDiagMask1 = vdup_n_u32(diagonalValues1);
uint32x2_t vDiagMask2 = vdup_n_u32(diagonalValues2);

vCol1a = vand_u32(vCol1a, vDiagMask1);
vCol1b = vand_u32(vCol1b, vDiagMask1);
vCol1c = vand_u32(vCol1c, vDiagMask1);
vCol1d = vand_u32(vCol1d, vDiagMask1);

vCol2a = vand_u32(vCol2a, vDiagMask2);
vCol2b = vand_u32(vCol2b, vDiagMask2);
vCol2c = vand_u32(vCol2c, vDiagMask2);
vCol2d = vand_u32(vCol2d, vDiagMask2);

// non-zero means that number is duplicated, zero means that it is unique
// OR these values together first
vCol1a = vorr_u32(vCol1a, vCol2a);
vCol1b = vorr_u32(vCol1b, vCol2b);
vCol1c = vorr_u32(vCol1c, vCol2c);
vCol1d = vorr_u32(vCol1d, vCol2d);

// check if result is zero
vCol1a = vceq_u32(vCol1a, vdup_n_u32(0));
vCol1b = vceq_u32(vCol1b, vdup_n_u32(0));
vCol1c = vceq_u32(vCol1c, vdup_n_u32(0));
vCol1d = vceq_u32(vCol1d, vdup_n_u32(0));

// create mask from vector
uint32x2_t v = vorr_u32(
vorr_u32(vand_u32(vCol1a, vPowersOf2_1), vand_u32(vCol1b, vPowersOf2_2)),
vorr_u32(vand_u32(vCol1c, vPowersOf2_3), vand_u32(vCol1d, vPowersOf2_4)));
//uint32_t mask = vaddv_u32(v);
uint32_t mask = v[0] + v[1];

// add one bit for 0th row, and AND result with rowsUsage
rowCandidates = (mask << 1) & rowsUsage;
#endif
#endif // AVX2/SSE2
#endif // AVX2/SSE2/NEON
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions RakeDiagSearch/RakeDiagSearch/MovePairSearch.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class MovePairSearch
int squareB[Rank][Rank]; // Generated DLS, the rows inside which will be permuted
int squareA_Mask[Rank][Rank]; // Bitmasks for values in squareA
#if defined (__SSE2__) || defined(__ARM_NEON)
int squareA_MaskT[Rank][Rank]; // Transposed copy of squareA_Mask
int squareA_MaskT[Rank][Rank - 1]; // Transposed copy of squareA_Mask
#endif
int rowsHistory[Rank]; // Array of the history of rows usage; rowsHistory[number of the row][value] = 0 | 1, where 0 means the row with the number "value" has been used for the row "number of the row" of the generated square; 1 - the row can be used.
int currentSquareRows[Rank]; // Array listing the current rows used in the square. The number of the used row is at the i-th position
Expand All @@ -67,7 +67,7 @@ class MovePairSearch
string moveSearchComponentHeader; // Header preceding the data about the state of the component of rows permutation
static const bool isDebug = false; // Flag of displaying debug information

#if defined(__SSE2__) && (!defined(__AVX2__) || defined(DISABLE_PEXT))
#if defined(__AVX2__) && defined(DISABLE_PEXT)
unsigned char mask4to1bits[0x10000]; // Lookup table to map 4 bit packs returned by movemask to 1 bit
#endif
};
Expand Down