-
Notifications
You must be signed in to change notification settings - Fork 349
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Popcnt vectorization #198
base: master
Are you sure you want to change the base?
Popcnt vectorization #198
Changes from 5 commits
88a8a0c
a65f3f4
8983625
40409bc
9748a34
7f72dc4
bd7ad14
91362e6
24ad2ce
db534bb
7fe66dc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Check if the CPU provides fast operations | ||
# for popcount, leftmost and rightmost bit | ||
|
||
set(AVX2 0) | ||
# Check if we are on a Linux system | ||
if(CMAKE_SYSTEM_NAME STREQUAL "Linux") | ||
# Use /proc/cpuinfo to get the information | ||
file(STRINGS "/proc/cpuinfo" _cpuinfo) | ||
if(_cpuinfo MATCHES "(avx2)") | ||
set(AVX2 1) | ||
endif() | ||
elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") | ||
# handle windows | ||
# get_filename_component(_vendor_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;VendorIdentifier]" NAME CACHE) | ||
# get_filename_component(_cpu_id "[HKEY_LOCAL_MACHINE\\Hardware\\Description\\System\\CentralProcessor\\0;Identifier]" NAME CACHE) | ||
elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") | ||
# handle MacOs | ||
execute_process(COMMAND sysctl -n machdep.cpu.features | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi Diego, thanks for your contribution. I'm just testing the code on a Mac equipped with a CPU (i7-4850HQ) which supports AVX2. Surprisingly the command
So maybe just a match on the latter output? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sounds reasonable :) I have run my code on Linux machines only, so I did not face this problem |
||
OUTPUT_VARIABLE _cpuinfo OUTPUT_STRIP_TRAILING_WHITESPACE) | ||
if(_cpuinfo MATCHES "AVX2") | ||
set(AVX2 1) | ||
endif() | ||
endif() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,6 +24,9 @@ | |
#include <stdint.h> // for uint64_t uint32_t declaration | ||
#include <iostream>// for cerr | ||
#include <cassert> | ||
#include <x86intrin.h> // SSE/AVX | ||
#include "ymm_union.hpp" // convenient YMM register wrapper | ||
#include "xmm_union.hpp" // convenient XMM register wrapper | ||
#ifdef __SSE4_2__ | ||
#include <xmmintrin.h> | ||
#endif | ||
|
@@ -237,6 +240,57 @@ struct bits { | |
|
||
// ============= inline - implementations ================ | ||
|
||
#ifdef __AVX2__ | ||
inline uint64_t bits::cnt256(__m256i x){ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please define the methods further up: #ifdef __AVX2__
static uint64_t cnt256(__m256i x);
#endif same for cnt128: #ifdef __SSE4_2__
static uint64_t cnt128(__m128i x);
#endif Also note: sse4_2 -> SSE4_2 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done in latest commit |
||
|
||
// 4-bit universal table, 4-bit mask | ||
static const __m256i MASK4_256 = _mm256_set1_epi8(0x0F); | ||
static const __m256i POPCNT_LOOKUP_4BF_MASK256 = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, | ||
1, 2, 2, 3, 2, 3, 3, 4, | ||
0, 1, 1, 2, 1, 2, 2, 3, | ||
1, 2, 2, 3, 2, 3, 3, 4); | ||
|
||
__m256i low, high, bwcount; | ||
|
||
// byte halves stored in separate YMM registers | ||
low = _mm256_and_si256(MASK4_256, x); | ||
high = _mm256_and_si256(MASK4_256, _mm256_srli_epi16(x, 4)); | ||
|
||
// bytewise population count | ||
bwcount = _mm256_add_epi8(_mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, low), | ||
_mm256_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK256, high)); | ||
|
||
// Computes sum of absolute differences and stores intermediate results at positions 0,4,8 and 12 | ||
__m256i fourSums = _mm256_sad_epu8(bwcount, _mm256_setzero_si256()); | ||
|
||
// Use union to access individual bytes (unsigned integers) | ||
sdsl::YMM_Union<uint8_t> ymm_union; | ||
ymm_union.ymm = fourSums; | ||
return ymm_union.ymm[0] + ymm_union.ymm[4] + ymm_union.ymm[8] + ymm_union.ymm[12]; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. gcc says: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Which line exactly? I am able to build without warnings for some reason |
||
} | ||
#endif | ||
|
||
#ifdef __SSE4_2__ | ||
inline uint64_t bits::cnt128(__m128i x){ | ||
|
||
// 4-bit universal table, 4-bit mask | ||
static const __m128i POPCNT_LOOKUP_4BF_MASK = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4); | ||
static const __m128i MASK4 = _mm_set1_epi8(0x0F); | ||
|
||
__m128i low, high, count; | ||
|
||
low = _mm_and_si128(MASK4, x); | ||
high = _mm_and_si128(MASK4, _mm_srli_epi16(x, 4)); | ||
count = _mm_add_epi8(_mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, low), | ||
_mm_shuffle_epi8(POPCNT_LOOKUP_4BF_MASK, high)); | ||
|
||
// Use union to access individual bytes (unsigned integers) | ||
sdsl::XMM_Union<uint8_t> xmm_union; | ||
xmm_union.sse = _mm_sad_epu8(x, _mm_setzero_si128()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. xmm_union has no member called sse, right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right, copy-paste error |
||
return xmm_union.values[0] + xmm_union.values[4]; | ||
} | ||
#endif | ||
|
||
// see page 11, Knuth TAOCP Vol 4 F1A | ||
inline uint64_t bits::cnt(uint64_t x) | ||
{ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -62,8 +62,30 @@ class uint256_t | |
} | ||
|
||
inline uint16_t popcount() { | ||
#ifdef __AVX2__ // Fastest method: 32 table lookups per clock cycle | ||
sdsl::YMM_Union<uint64_t> ymm_union; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. YMM_Union -> YMM_union There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, will be done |
||
ymm_union[0] = m_lo; | ||
ymm_union[1] = m_mid; | ||
ymm_union[2] = m_high >> 64; | ||
ymm_union[3] = m_high; | ||
return bits::cnt256(ymm_union.ymm); | ||
#endif | ||
|
||
#ifdef __SSE4_2__ // 16 table lookups per clock cycle | ||
sdsl::XMM_Union<uint64_t> xmm_union1; | ||
sdsl::XMM_Union<uint64_t> xmm_union2; | ||
xmm_union1[0] = m_lo; | ||
xmm_union1[1] = m_mid; | ||
xmm_union2[0] = m_high >> 64; | ||
xmm_union2[1] = m_high; | ||
|
||
return bits::cnt128(xmm_union1.xmm) + bits::cnt128(xmm_union2.xmm); | ||
|
||
|
||
#else // byte after byte | ||
return ((uint16_t)bits::cnt(m_lo)) + bits::cnt(m_mid) | ||
+ bits::cnt(m_high>>64) + bits::cnt(m_high); | ||
#endif | ||
} | ||
|
||
inline uint16_t hi() { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
/* sdsl - succinct data structures library | ||
Copyright (C) 2012 Simon Gog | ||
|
||
This program is free software: you can redistribute it and/or modify | ||
it under the terms of the GNU General Public License as published by | ||
the Free Software Foundation, either version 3 of the License, or | ||
(at your option) any later version. | ||
|
||
This program is distributed in the hope that it will be useful, | ||
but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
GNU General Public License for more details. | ||
|
||
You should have received a copy of the GNU General Public License | ||
along with this program. If not, see http://www.gnu.org/licenses/ . | ||
*/ | ||
/*! \file xmm_union.hpp | ||
\brief xmm_union.hpp contains a convenientunion for XMM registers (128-bits). | ||
\author Diego Havenstein | ||
*/ | ||
#ifndef INCLUDED_SDSL_XMMUNION | ||
#define INCLUDED_SDSL_XMMUNION | ||
|
||
namespace sdsl | ||
{ | ||
|
||
#ifdef __SSE4_2__ | ||
template<typename T> | ||
union XMM_union { | ||
__m128i xmm; | ||
T values[16/sizeof(T)]; | ||
}; | ||
#endif | ||
|
||
} // end namespace | ||
|
||
#endif |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
/* sdsl - succinct data structures library | ||
Copyright (C) 2012 Simon Gog | ||
|
||
This program is free software: you can redistribute it and/or modify | ||
it under the terms of the GNU General Public License as published by | ||
the Free Software Foundation, either version 3 of the License, or | ||
(at your option) any later version. | ||
|
||
This program is distributed in the hope that it will be useful, | ||
but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
GNU General Public License for more details. | ||
|
||
You should have received a copy of the GNU General Public License | ||
along with this program. If not, see http://www.gnu.org/licenses/ . | ||
*/ | ||
/*! \file ymm_union.hpp | ||
\brief ymm_union.hpp contains a convenientunion for YMM registers (256-bits). | ||
\author Diego Havenstein | ||
*/ | ||
#ifndef INCLUDED_SDSL_YMMUNION | ||
#define INCLUDED_SDSL_YMMUNION | ||
|
||
namespace sdsl | ||
{ | ||
|
||
#ifdef __AVX2__ | ||
template<typename T> | ||
union YMM_union { | ||
__m256i ymm; | ||
T values[32/sizeof(T)]; | ||
}; | ||
#endif | ||
|
||
} // end namespace | ||
|
||
#endif |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
elseif( CMAKE_COMPILER_IS_CLANGXX ) instead of else()?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, should be better. But I don't know what is the best thing to put in the "else" case then (nothing?)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just output a message that AVX2 is available but only the GCC and CLANG compiler are supported at the moment.