Skip to content

Commit 5c712ab

Browse files
Nicoshev authored and facebook-github-bot committed
Hoist needle vectorization
Summary: Vectorization of the needle has been hoisted from tagMatchIter to outside the loops of findImpl and findMatching. Small performance improvements have been observed on both AMD64 and aarch64. AMD64: before P1716284089, after P1716284261. aarch64: before P1716286979, after P1716286963. Reviewed By: ot, ilvokhin Differential Revision: D68460634 fbshipit-source-id: 01d704917cfa2c6da744b316a57049b07dd08e29
1 parent 5b6267b commit 5c712ab

File tree

1 file changed

+39
-24
lines changed
  • third-party/folly/src/folly/container/detail

1 file changed

+39
-24
lines changed

third-party/folly/src/folly/container/detail/F14Table.h

Lines changed: 39 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -648,10 +648,8 @@ struct alignas(kRequiredVectorAlignment) F14Chunk {
648648
////////
649649
// Tag filtering using NEON intrinsics
650650

651-
SparseMaskIter tagMatchIter(std::size_t needle) const {
652-
FOLLY_SAFE_DCHECK(needle >= 0x80 && needle < 0x100, "");
651+
SparseMaskIter tagMatchIter(uint8x16_t needleV) const {
653652
uint8x16_t tagV = vld1q_u8(&tags_[0]);
654-
auto needleV = vdupq_n_u8(static_cast<uint8_t>(needle));
655653
auto eqV = vceqq_u8(tagV, needleV);
656654
// get info from every byte into the bottom half of every uint16_t
657655
// by shifting right 4, then round to get it into a 64-bit vector
@@ -676,27 +674,9 @@ struct alignas(kRequiredVectorAlignment) F14Chunk {
676674
return static_cast<TagVector const*>(static_cast<void const*>(&tags_[0]));
677675
}
678676

679-
SparseMaskIter tagMatchIter(std::size_t needle) const {
680-
FOLLY_SAFE_DCHECK(needle >= 0x80 && needle < 0x100, "");
677+
SparseMaskIter tagMatchIter(__m128i needleV) const {
681678
auto tagV = _mm_load_si128(tagVector());
682679

683-
// TRICKY! It may seem strange to have a std::size_t needle and narrow
684-
// it at the last moment, rather than making HashPair::second be a
685-
// uint8_t, but the latter choice sometimes leads to a performance
686-
// problem.
687-
//
688-
// On architectures with SSE2 but not AVX2, _mm_set1_epi8 expands
689-
// to multiple instructions. One of those is a MOVD of either 4 or
690-
// 8 byte width. Only the bottom byte of that move actually affects
691-
// the result, but if a 1-byte needle has been spilled then this will
692-
// be a 4 byte load. GCC 5.5 has been observed to reload needle
693-
// (or perhaps fuse a reload and part of a previous static_cast)
694-
// needle using a MOVZX with a 1 byte load in parallel with the MOVD.
695-
// This combination causes a failure of store-to-load forwarding,
696-
// which has a big performance penalty (60 nanoseconds per find on
697-
// a microbenchmark). Keeping needle >= 4 bytes avoids the problem
698-
// and also happens to result in slightly more compact assembly.
699-
auto needleV = _mm_set1_epi8(static_cast<uint8_t>(needle));
700680
auto eqV = _mm_cmpeq_epi8(tagV, needleV);
701681
auto mask = _mm_movemask_epi8(eqV) & kFullMask;
702682
return SparseMaskIter{mask};
@@ -1576,19 +1556,53 @@ class F14Table : public Policy {
15761556

15771557
std::size_t probeDelta(HashPair hp) const { return 2 * hp.second + 1; }
15781558

1559+
#if FOLLY_NEON
1560+
1561+
// TRICKY! It may seem strange to have a std::size_t needle and narrow
1562+
// it at the last moment, rather than making HashPair::second be a
1563+
// uint8_t, but the latter choice sometimes leads to a performance
1564+
// problem.
1565+
//
1566+
// On architectures with SSE2 but not AVX2, _mm_set1_epi8 expands
1567+
// to multiple instructions. One of those is a MOVD of either 4 or
1568+
// 8 byte width. Only the bottom byte of that move actually affects
1569+
// the result, but if a 1-byte needle has been spilled then this will
1570+
// be a 4 byte load. GCC 5.5 has been observed to reload needle
1571+
// (or perhaps fuse a reload and part of a previous static_cast)
1572+
// needle using a MOVZX with a 1 byte load in parallel with the MOVD.
1573+
// This combination causes a failure of store-to-load forwarding,
1574+
// which has a big performance penalty (60 nanoseconds per find on
1575+
// a microbenchmark). Keeping needle >= 4 bytes avoids the problem
1576+
// and also happens to result in slightly more compact assembly.
1577+
1578+
FOLLY_ALWAYS_INLINE uint8x16_t loadNeedleV(std::size_t needle) const {
1579+
return vdupq_n_u8(static_cast<uint8_t>(needle));
1580+
}
1581+
#elif FOLLY_SSE >= 2
1582+
FOLLY_ALWAYS_INLINE __m128i loadNeedleV(std::size_t needle) const {
1583+
return _mm_set1_epi8(static_cast<uint8_t>(needle));
1584+
}
1585+
#else
1586+
FOLLY_ALWAYS_INLINE std::size_t loadNeedleV(std::size_t needle) const {
1587+
return needle;
1588+
}
1589+
#endif
1590+
15791591
enum class Prefetch { DISABLED, ENABLED };
15801592

15811593
template <typename K>
15821594
FOLLY_ALWAYS_INLINE ItemIter
15831595
findImpl(HashPair hp, K const& key, Prefetch prefetch) const {
1596+
FOLLY_SAFE_DCHECK(hp.second >= 0x80 && hp.second < 0x100, "");
15841597
std::size_t index = hp.first;
15851598
std::size_t step = probeDelta(hp);
1599+
auto needleV = loadNeedleV(hp.second);
15861600
for (std::size_t tries = 0; tries >> chunkShift() == 0; ++tries) {
15871601
ChunkPtr chunk = chunks_ + moduloByChunkCount(index);
15881602
if (prefetch == Prefetch::ENABLED && sizeof(Chunk) > 64) {
15891603
prefetchAddr(chunk->itemAddr(8));
15901604
}
1591-
auto hits = chunk->tagMatchIter(hp.second);
1605+
auto hits = chunk->tagMatchIter(needleV);
15921606
while (hits.hasNext()) {
15931607
auto i = hits.next();
15941608
if (FOLLY_LIKELY(this->keyMatchesItem(key, chunk->item(i)))) {
@@ -1658,13 +1672,14 @@ class F14Table : public Policy {
16581672
FOLLY_ALWAYS_INLINE ItemIter findMatching(K const& key, F&& func) const {
16591673
auto hp = splitHash(this->computeKeyHash(key));
16601674
std::size_t index = hp.first;
1675+
auto needleV = loadNeedleV(hp.second);
16611676
std::size_t step = probeDelta(hp);
16621677
for (std::size_t tries = 0; tries >> chunkShift() == 0; ++tries) {
16631678
ChunkPtr chunk = chunks_ + moduloByChunkCount(index);
16641679
if (sizeof(Chunk) > 64) {
16651680
prefetchAddr(chunk->itemAddr(8));
16661681
}
1667-
auto hits = chunk->tagMatchIter(hp.second);
1682+
auto hits = chunk->tagMatchIter(needleV);
16681683
while (hits.hasNext()) {
16691684
auto i = hits.next();
16701685
if (FOLLY_LIKELY(

0 commit comments

Comments (0)