@@ -648,10 +648,8 @@ struct alignas(kRequiredVectorAlignment) F14Chunk {
648
648
// //////
649
649
// Tag filtering using NEON intrinsics
650
650
651
- SparseMaskIter tagMatchIter (std::size_t needle) const {
652
- FOLLY_SAFE_DCHECK (needle >= 0x80 && needle < 0x100 , " " );
651
+ SparseMaskIter tagMatchIter (uint8x16_t needleV) const {
653
652
uint8x16_t tagV = vld1q_u8 (&tags_[0 ]);
654
- auto needleV = vdupq_n_u8 (static_cast <uint8_t >(needle));
655
653
auto eqV = vceqq_u8 (tagV, needleV);
656
654
// get info from every byte into the bottom half of every uint16_t
657
655
// by shifting right 4, then round to get it into a 64-bit vector
@@ -676,27 +674,9 @@ struct alignas(kRequiredVectorAlignment) F14Chunk {
676
674
return static_cast <TagVector const *>(static_cast <void const *>(&tags_[0 ]));
677
675
}
678
676
679
- SparseMaskIter tagMatchIter (std::size_t needle) const {
680
- FOLLY_SAFE_DCHECK (needle >= 0x80 && needle < 0x100 , " " );
677
+ SparseMaskIter tagMatchIter (__m128i needleV) const {
681
678
auto tagV = _mm_load_si128 (tagVector ());
682
679
683
- // TRICKY! It may seem strange to have a std::size_t needle and narrow
684
- // it at the last moment, rather than making HashPair::second be a
685
- // uint8_t, but the latter choice sometimes leads to a performance
686
- // problem.
687
- //
688
- // On architectures with SSE2 but not AVX2, _mm_set1_epi8 expands
689
- // to multiple instructions. One of those is a MOVD of either 4 or
690
- // 8 byte width. Only the bottom byte of that move actually affects
691
- // the result, but if a 1-byte needle has been spilled then this will
692
- // be a 4 byte load. GCC 5.5 has been observed to reload needle
693
- // (or perhaps fuse a reload and part of a previous static_cast)
694
- // needle using a MOVZX with a 1 byte load in parallel with the MOVD.
695
- // This combination causes a failure of store-to-load forwarding,
696
- // which has a big performance penalty (60 nanoseconds per find on
697
- // a microbenchmark). Keeping needle >= 4 bytes avoids the problem
698
- // and also happens to result in slightly more compact assembly.
699
- auto needleV = _mm_set1_epi8 (static_cast <uint8_t >(needle));
700
680
auto eqV = _mm_cmpeq_epi8 (tagV, needleV);
701
681
auto mask = _mm_movemask_epi8 (eqV) & kFullMask ;
702
682
return SparseMaskIter{mask};
@@ -1576,19 +1556,53 @@ class F14Table : public Policy {
1576
1556
1577
1557
std::size_t probeDelta (HashPair hp) const { return 2 * hp.second + 1 ; }
1578
1558
1559
+ #if FOLLY_NEON
1560
+
1561
+ // TRICKY! It may seem strange to have a std::size_t needle and narrow
1562
+ // it at the last moment, rather than making HashPair::second be a
1563
+ // uint8_t, but the latter choice sometimes leads to a performance
1564
+ // problem.
1565
+ //
1566
+ // On architectures with SSE2 but not AVX2, _mm_set1_epi8 expands
1567
+ // to multiple instructions. One of those is a MOVD of either 4 or
1568
+ // 8 byte width. Only the bottom byte of that move actually affects
1569
+ // the result, but if a 1-byte needle has been spilled then this will
1570
+ // be a 4 byte load. GCC 5.5 has been observed to reload needle
1571
+ // (or perhaps fuse a reload and part of a previous static_cast)
1572
+ // needle using a MOVZX with a 1 byte load in parallel with the MOVD.
1573
+ // This combination causes a failure of store-to-load forwarding,
1574
+ // which has a big performance penalty (60 nanoseconds per find on
1575
+ // a microbenchmark). Keeping needle >= 4 bytes avoids the problem
1576
+ // and also happens to result in slightly more compact assembly.
1577
+
1578
+ FOLLY_ALWAYS_INLINE uint8x16_t loadNeedleV (std::size_t needle) const {
1579
+ return vdupq_n_u8 (static_cast <uint8_t >(needle));
1580
+ }
1581
+ #elif FOLLY_SSE >= 2
1582
+ FOLLY_ALWAYS_INLINE __m128i loadNeedleV (std::size_t needle) const {
1583
+ return _mm_set1_epi8 (static_cast <uint8_t >(needle));
1584
+ }
1585
+ #else
1586
+ FOLLY_ALWAYS_INLINE std::size_t loadNeedleV (std::size_t needle) const {
1587
+ return needle;
1588
+ }
1589
+ #endif
1590
+
1579
1591
enum class Prefetch { DISABLED, ENABLED };
1580
1592
1581
1593
template <typename K>
1582
1594
FOLLY_ALWAYS_INLINE ItemIter
1583
1595
findImpl (HashPair hp, K const & key, Prefetch prefetch) const {
1596
+ FOLLY_SAFE_DCHECK (hp.second >= 0x80 && hp.second < 0x100 , " " );
1584
1597
std::size_t index = hp.first ;
1585
1598
std::size_t step = probeDelta (hp);
1599
+ auto needleV = loadNeedleV (hp.second );
1586
1600
for (std::size_t tries = 0 ; tries >> chunkShift () == 0 ; ++tries) {
1587
1601
ChunkPtr chunk = chunks_ + moduloByChunkCount (index);
1588
1602
if (prefetch == Prefetch::ENABLED && sizeof (Chunk) > 64 ) {
1589
1603
prefetchAddr (chunk->itemAddr (8 ));
1590
1604
}
1591
- auto hits = chunk->tagMatchIter (hp. second );
1605
+ auto hits = chunk->tagMatchIter (needleV );
1592
1606
while (hits.hasNext ()) {
1593
1607
auto i = hits.next ();
1594
1608
if (FOLLY_LIKELY (this ->keyMatchesItem (key, chunk->item (i)))) {
@@ -1658,13 +1672,14 @@ class F14Table : public Policy {
1658
1672
FOLLY_ALWAYS_INLINE ItemIter findMatching (K const & key, F&& func) const {
1659
1673
auto hp = splitHash (this ->computeKeyHash (key));
1660
1674
std::size_t index = hp.first ;
1675
+ auto needleV = loadNeedleV (hp.second );
1661
1676
std::size_t step = probeDelta (hp);
1662
1677
for (std::size_t tries = 0 ; tries >> chunkShift () == 0 ; ++tries) {
1663
1678
ChunkPtr chunk = chunks_ + moduloByChunkCount (index);
1664
1679
if (sizeof (Chunk) > 64 ) {
1665
1680
prefetchAddr (chunk->itemAddr (8 ));
1666
1681
}
1667
- auto hits = chunk->tagMatchIter (hp. second );
1682
+ auto hits = chunk->tagMatchIter (needleV );
1668
1683
while (hits.hasNext ()) {
1669
1684
auto i = hits.next ();
1670
1685
if (FOLLY_LIKELY (
0 commit comments