diff --git a/RakeDiagSearch/RakeDiagSearch/Makefile b/RakeDiagSearch/RakeDiagSearch/Makefile
index cd76cea..13da121 100644
--- a/RakeDiagSearch/RakeDiagSearch/Makefile
+++ b/RakeDiagSearch/RakeDiagSearch/Makefile
@@ -4,7 +4,7 @@ BOINC_LIB_DIR = $(BOINC_DIR)/lib
 
 CXX = g++
 
-CXXFLAGS += -O3 -ftree-vectorize -std=c++11 -static-libgcc -static-libstdc++ \
+CXXFLAGS += -O3 -g -ftree-vectorize -std=c++11 -static-libgcc -static-libstdc++ \
     -I$(BOINC_DIR) \
     -I$(BOINC_LIB_DIR) \
     -I$(BOINC_API_DIR) \
diff --git a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
index 1dec526..6178344 100644
--- a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
+++ b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.cpp
@@ -89,7 +89,7 @@ MovePairSearch::MovePairSearch()
 // Initialize mask4to1bits lookup table
 void MovePairSearch::InitMask4to1bits()
 {
-#if defined(__SSE2__) && (!defined(__AVX2__) || defined(DISABLE_PEXT))
+#if defined(__AVX2__) && defined(DISABLE_PEXT)
   memset(mask4to1bits, 0, sizeof(mask4to1bits));
   mask4to1bits[0x0000] = 0;
   mask4to1bits[0x000f] = 1;
@@ -332,11 +332,17 @@ void MovePairSearch::OnSquareGenerated(Square newSquare)
     {
       squareA[i][j] = newSquare.Matrix[i][j];
       squareA_Mask[i][j] = 1u << newSquare.Matrix[i][j];
+    }
+  }
 #if defined (__SSE2__) || defined(__ARM_NEON)
-      squareA_MaskT[j][i] = squareA_Mask[i][j];
-#endif
+  for (int i = 0; i < Rank - 1; i++)
+  {
+    for (int j = 0; j < Rank; j++)
+    {
+      squareA_MaskT[j][i] = squareA_Mask[i + 1][j];
     }
   }
+#endif
 
   // Start the rows permutation
   MoveRows();
@@ -436,15 +442,8 @@ void MovePairSearch::MoveRows()
 #ifdef __ARM_NEON
   // Set the powers of 2
   const uint32_t powersOf2[8] = { 1, 2, 4, 8, 16, 32, 64, 128 };
-#ifdef __aarch64__
   const uint32x4_t vPowersOf2Lo = vld1q_u32(powersOf2);
   const uint32x4_t vPowersOf2Hi = vld1q_u32(powersOf2+4);
-#else
-  const uint32x2_t vPowersOf2_1 = vld1_u32(powersOf2);
-  const uint32x2_t vPowersOf2_2 = vld1_u32(powersOf2+2);
-  const uint32x2_t vPowersOf2_3 = vld1_u32(powersOf2+4);
-  const uint32x2_t vPowersOf2_4 = vld1_u32(powersOf2+6);
-#endif
 #endif
 
   while (1)
@@ -515,8 +514,8 @@ void MovePairSearch::MoveRows()
             // load bitmasks for columns which will be on diagonals
             // for performance reasons load this as a row from transposed square
             // also excluse 0th element, row 0 has fixed position in square
-            __m256i vCol1 = _mm256_loadu_si256((const __m256i*)&squareA_MaskT[currentRowId][1]);
-            __m256i vCol2 = _mm256_loadu_si256((const __m256i*)&squareA_MaskT[Rank - 1 - currentRowId][1]);
+            __m256i vCol1 = _mm256_loadu_si256((const __m256i*)&squareA_MaskT[currentRowId][0]);
+            __m256i vCol2 = _mm256_loadu_si256((const __m256i*)&squareA_MaskT[Rank - 1 - currentRowId][0]);
 
             // AND loaded values with diagnonal masks
             __m256i vDiagMask1 = _mm256_set1_epi32(diagonalValues1);
@@ -555,10 +554,10 @@ void MovePairSearch::MoveRows()
             // load bitmasks for columns which will be on diagonals
             // for performance reasons load this as a row from transposed square
             // also excluse 0th element, row 0 has fixed position in square
-            __m128i vCol1a = _mm_loadu_si128((const __m128i*)&squareA_MaskT[currentRowId][1]);
-            __m128i vCol1b = _mm_loadu_si128((const __m128i*)&squareA_MaskT[currentRowId][5]);
-            __m128i vCol2a = _mm_loadu_si128((const __m128i*)&squareA_MaskT[Rank - 1 - currentRowId][1]);
-            __m128i vCol2b = _mm_loadu_si128((const __m128i*)&squareA_MaskT[Rank - 1 - currentRowId][5]);
+            __m128i vCol1a = _mm_loadu_si128((const __m128i*)&squareA_MaskT[currentRowId][0]);
+            __m128i vCol1b = _mm_loadu_si128((const __m128i*)&squareA_MaskT[currentRowId][4]);
+            __m128i vCol2a = _mm_loadu_si128((const __m128i*)&squareA_MaskT[Rank - 1 - currentRowId][0]);
+            __m128i vCol2b = _mm_loadu_si128((const __m128i*)&squareA_MaskT[Rank - 1 - currentRowId][4]);
 
             // AND loaded values with diagnonal masks
             __m128i vDiagMask1 = _mm_set1_epi32(diagonalValues1);
@@ -574,26 +573,26 @@ void MovePairSearch::MoveRows()
             vCol1a = _mm_or_si128(vCol1a, vCol2a);
             vCol1b = _mm_or_si128(vCol1b, vCol2b);
 
+            // Saturate_Int32_To_Int8()
+            __m128i vColpack = _mm_packs_epi32(vCol1a, vCol1b);
+            vColpack = _mm_packs_epi16(vColpack, _mm_setzero_si128());
+
             // check if result is zero
-            vCol1a = _mm_cmpeq_epi32(vCol1a, _mm_setzero_si128());
-            vCol1b = _mm_cmpeq_epi32(vCol1b, _mm_setzero_si128());
+            __m128i vColzeros = _mm_cmpeq_epi8(vColpack, _mm_setzero_si128());
+
             // create mask from vector
-            // there are 4 bits per result, so we need to extract every 4th one
-            int mask1 = _mm_movemask_epi8(vCol1a);
-            int mask2 = _mm_movemask_epi8(vCol1b);
-            int mask = mask4to1bits[mask1] | (mask4to1bits[mask2] << 4);
+            int mask = _mm_movemask_epi8(vColzeros);
 
             // add one bit for 0th row, and AND result with rowsUsage
             rowCandidates = (mask << 1) & rowsUsage;
 #elif defined(__ARM_NEON)
-#ifdef __aarch64__
             // load bitmasks for columns which will be on diagonals
             // for performance reasons load this as a row from transposed square
             // also excluse 0th element, row 0 has fixed position in square
-            uint32x4_t vCol1a = vld1q_u32((const uint32_t*)&squareA_MaskT[currentRowId][1]);
-            uint32x4_t vCol1b = vld1q_u32((const uint32_t*)&squareA_MaskT[currentRowId][5]);
-            uint32x4_t vCol2a = vld1q_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][1]);
-            uint32x4_t vCol2b = vld1q_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][5]);
+            uint32x4_t vCol1a = vld1q_u32((const uint32_t*)&squareA_MaskT[currentRowId][0]);
+            uint32x4_t vCol1b = vld1q_u32((const uint32_t*)&squareA_MaskT[currentRowId][4]);
+            uint32x4_t vCol2a = vld1q_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][0]);
+            uint32x4_t vCol2b = vld1q_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][4]);
 
             // AND loaded values with diagnonal masks
             uint32x4_t vDiagMask1 = vdupq_n_u32(diagonalValues1);
@@ -615,62 +614,16 @@ void MovePairSearch::MoveRows()
 
             // create mask from vector
             uint32x4_t v = vorrq_u32(vandq_u32(vCol1a, vPowersOf2Lo), vandq_u32(vCol1b, vPowersOf2Hi));
+#ifdef __aarch64__
             uint32_t mask = vaddvq_u64(vpaddlq_u32(v));
+#else
+            uint32x2_t s = vmovn_u64(vpaddlq_u32(v));
+            uint32_t mask = s[0] + s[1];
+#endif
 
             // add one bit for 0th row, and AND result with rowsUsage
             rowCandidates = (mask << 1) & rowsUsage;
-#else // !__aarch64__
-            // load bitmasks for columns which will be on diagonals
-            // for performance reasons load this as a row from transposed square
-            // also excluse 0th element, row 0 has fixed position in square
-            uint32x2_t vCol1a = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][1]);
-            uint32x2_t vCol1b = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][3]);
-            uint32x2_t vCol1c = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][5]);
-            uint32x2_t vCol1d = vld1_u32((const uint32_t*)&squareA_MaskT[currentRowId][7]);
-            
-            uint32x2_t vCol2a = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][1]);
-            uint32x2_t vCol2b = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][3]);
-            uint32x2_t vCol2c = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][5]);
-            uint32x2_t vCol2d = vld1_u32((const uint32_t*)&squareA_MaskT[Rank - 1 - currentRowId][7]);
-
-            // AND loaded values with diagnonal masks
-            uint32x2_t vDiagMask1 = vdup_n_u32(diagonalValues1);
-            uint32x2_t vDiagMask2 = vdup_n_u32(diagonalValues2);
-
-            vCol1a = vand_u32(vCol1a, vDiagMask1);
-            vCol1b = vand_u32(vCol1b, vDiagMask1);
-            vCol1c = vand_u32(vCol1c, vDiagMask1);
-            vCol1d = vand_u32(vCol1d, vDiagMask1);
-            
-            vCol2a = vand_u32(vCol2a, vDiagMask2);
-            vCol2b = vand_u32(vCol2b, vDiagMask2);
-            vCol2c = vand_u32(vCol2c, vDiagMask2);
-            vCol2d = vand_u32(vCol2d, vDiagMask2);
-
-            // non-zero means that number is duplicated, zero means that it is unique
-            // OR these values together first
-            vCol1a = vorr_u32(vCol1a, vCol2a);
-            vCol1b = vorr_u32(vCol1b, vCol2b);
-            vCol1c = vorr_u32(vCol1c, vCol2c);
-            vCol1d = vorr_u32(vCol1d, vCol2d);
-
-            // check if result is zero
-            vCol1a = vceq_u32(vCol1a, vdup_n_u32(0));
-            vCol1b = vceq_u32(vCol1b, vdup_n_u32(0));
-            vCol1c = vceq_u32(vCol1c, vdup_n_u32(0));
-            vCol1d = vceq_u32(vCol1d, vdup_n_u32(0));
-
-            // create mask from vector
-            uint32x2_t v = vorr_u32(
-              vorr_u32(vand_u32(vCol1a, vPowersOf2_1), vand_u32(vCol1b, vPowersOf2_2)),
-              vorr_u32(vand_u32(vCol1c, vPowersOf2_3), vand_u32(vCol1d, vPowersOf2_4)));
-            //uint32_t mask = vaddv_u32(v);
-            uint32_t mask = v[0] + v[1];
-
-            // add one bit for 0th row, and AND result with rowsUsage
-            rowCandidates = (mask << 1) & rowsUsage;
-#endif
-#endif // AVX2/SSE2
+#endif // AVX2/SSE2/NEON
           }
         }
     }
diff --git a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h
index 91ab4e0..eb49641 100644
--- a/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h
+++ b/RakeDiagSearch/RakeDiagSearch/MovePairSearch.h
@@ -43,7 +43,7 @@ class MovePairSearch
   int squareB[Rank][Rank];          // Generated DLS, the rows inside which will be permuted 
   int squareA_Mask[Rank][Rank];     // Bitmasks for values in squareA
 #if defined (__SSE2__) || defined(__ARM_NEON)
-  int squareA_MaskT[Rank][Rank];    // Transposed copy of squareA_Mask
+  int squareA_MaskT[Rank][Rank - 1];    // Transposed copy of squareA_Mask
 #endif
   int rowsHistory[Rank];      // Array of the history of rows usage; rowsHistory[number of the row][value] = 0 | 1, where 0 means the row with the number "value" has been used for the row "number of the row" of the generated square; 1 - the row can be used.
   int currentSquareRows[Rank];      // Array listing the current rows used in the square. The number of the used row is at the i-th position
@@ -67,7 +67,7 @@ class MovePairSearch
   string moveSearchComponentHeader; // Header preceding the data about the state of the component of rows permutation
   static const bool isDebug = false; // Flag of displaying debug information
 
-#if defined(__SSE2__) && (!defined(__AVX2__) || defined(DISABLE_PEXT))
+#if defined(__AVX2__) && defined(DISABLE_PEXT)
   unsigned char mask4to1bits[0x10000]; // Lookup table to map 4 bit packs returned by movemask to 1 bit
 #endif
 };