diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 17624d8..e2ec1af 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -24,6 +24,10 @@ version 0.13.0-dev false positives from common human genome repeats. The amount of base pairs that are sampled from the beginning and end is user settable with an option to sample everything. ++ The code for the adapter counting was refactored and the vectorized algorithm + was replaced by an AVX2 version that is used when the CPU supports that + instruction set. This is both faster and far less code than the original + implementation. + Extended the README with a few usage examples. version 0.12.0 diff --git a/src/sequali/_qcmodule.c b/src/sequali/_qcmodule.c index 87f5812..f0cd91d 100644 --- a/src/sequali/_qcmodule.c +++ b/src/sequali/_qcmodule.c @@ -37,10 +37,6 @@ along with Sequali. If not, see #include -#ifdef __SSE2__ -#include "emmintrin.h" -#endif - /* Pointers to types that will be imported/initialized in the module initialization section */ @@ -2153,49 +2149,6 @@ typedef struct AdapterSequenceStruct { bitmask_t found_mask; } AdapterSequence; -/* Because we use NUCLEOTIDE_TO_INDEX we can save the bitmasks in the struct - itself. There are only 5 nucleotides (ACGTN) so this uses 40 bytes. With - init_mask and found_mask costing 8 bytes each the entire struct up to - number of sequences fits on one cache line of 64 bytes. Except for the - sequences pointer, but that is only used in case of a match. That makes - accessing the bitmasks very quick memorywise. */ -typedef struct MachineWordPatternMatcherStruct { - bitmask_t init_mask; - bitmask_t found_mask; - bitmask_t bitmasks[NUC_TABLE_SIZE]; - size_t number_of_sequences; - AdapterSequence *sequences; -} MachineWordPatternMatcher; - -static void -MachineWordPatternMatcher_destroy(MachineWordPatternMatcher *matcher) -{ - PyMem_Free(matcher->sequences); - matcher->sequences = NULL; -} - -#ifdef __SSE2__ -typedef struct AdapterSequenceSSE2Struct { - size_t adapter_index; - size_t adapter_length; - __m128i found_mask; -} AdapterSequenceSSE2; - -typedef struct MachineWordPatternMatcherSSE2Struct { - __m128i init_mask; - __m128i found_mask; - __m128i bitmasks[NUC_TABLE_SIZE]; - size_t number_of_sequences; - AdapterSequenceSSE2 *sequences; -} MachineWordPatternMatcherSSE2; - -static void -MachineWordPatternMatcherSSE2_destroy(MachineWordPatternMatcherSSE2 *matcher) -{ - PyMem_Free(matcher->sequences); -} -#endif - typedef struct AdapterCounterStruct { PyObject_HEAD size_t number_of_adapters; @@ -2204,11 +2157,12 @@ typedef struct AdapterCounterStruct { uint64_t **adapter_counter; PyObject *adapters; size_t number_of_matchers; - MachineWordPatternMatcher *matchers; - size_t number_of_sse2_matchers; -#ifdef __SSE2__ - MachineWordPatternMatcherSSE2 *sse2_matchers; -#endif + bitmask_t *init_masks; + bitmask_t *found_masks; + bitmask_t (*bitmasks)[NUC_TABLE_SIZE]; + /* Same as bitmasks, but better organization for vectorized approach. */ + bitmask_t (*by_four_bitmasks)[NUC_TABLE_SIZE][4]; + AdapterSequence **adapter_sequences; } AdapterCounter; static void @@ -2221,107 +2175,24 @@ AdapterCounter_dealloc(AdapterCounter *self) } } PyMem_Free(self->adapter_counter); - for (size_t i = 0; i < self->number_of_matchers; i++) { - MachineWordPatternMatcher_destroy(self->matchers + i); + if (self->adapter_sequences != NULL) { + for (size_t i = 0; i < self->number_of_matchers; i++) { + PyMem_Free(self->adapter_sequences[i]); + } } - PyMem_Free(self->matchers); + PyMem_Free(self->found_masks); + PyMem_Free(self->init_masks); + PyMem_Free(self->bitmasks); + PyMem_Free(self->by_four_bitmasks); + PyMem_Free(self->adapter_sequences); -#ifdef __SSE2__ - for (size_t i = 0; i < self->number_of_sse2_matchers; i++) { - MachineWordPatternMatcherSSE2_destroy(self->sse2_matchers + i); - } - PyMem_Free(self->sse2_matchers); -#endif PyTypeObject *tp = Py_TYPE((PyObject *)self); PyObject_Free(self); Py_XDECREF((PyObject *)tp); } -#ifdef __SSE2__ -int -AdapterCounter_SSE2_convert(AdapterCounter *self) -{ - self->number_of_sse2_matchers = self->number_of_matchers / 2; - if (self->number_of_sse2_matchers == 0) { - return 0; - } - MachineWordPatternMatcherSSE2 *tmp = PyMem_Malloc( - self->number_of_sse2_matchers * sizeof(MachineWordPatternMatcherSSE2)); - if (tmp == NULL) { - PyErr_NoMemory(); - return -1; - } - self->sse2_matchers = tmp; - memset(self->sse2_matchers, 0, - self->number_of_sse2_matchers * sizeof(MachineWordPatternMatcherSSE2)); - for (size_t i = 0; i < self->number_of_sse2_matchers; i++) { - MachineWordPatternMatcherSSE2 *sse2_matcher = self->sse2_matchers + i; - MachineWordPatternMatcher *normal_matcher1 = self->matchers + (i * 2); - MachineWordPatternMatcher *normal_matcher2 = self->matchers + (i * 2 + 1); - sse2_matcher->init_mask = _mm_set_epi64x(normal_matcher1->init_mask, - normal_matcher2->init_mask); - sse2_matcher->found_mask = _mm_set_epi64x(normal_matcher1->found_mask, - normal_matcher2->found_mask); - for (size_t j = 0; j < NUC_TABLE_SIZE; j++) { - sse2_matcher->bitmasks[j] = _mm_set_epi64x( - normal_matcher1->bitmasks[j], normal_matcher2->bitmasks[j]); - } - sse2_matcher->number_of_sequences = normal_matcher1->number_of_sequences + - normal_matcher2->number_of_sequences; - AdapterSequenceSSE2 *seq_tmp = PyMem_Malloc( - sse2_matcher->number_of_sequences * sizeof(AdapterSequenceSSE2)); - if (seq_tmp == NULL) { - PyErr_NoMemory(); - return -1; - } - sse2_matcher->sequences = seq_tmp; - for (size_t j = 0; j < normal_matcher1->number_of_sequences; j++) { - AdapterSequenceSSE2 *sse2_adapter = sse2_matcher->sequences + j; - AdapterSequence *normal_adapter = normal_matcher1->sequences + j; - sse2_adapter->adapter_index = normal_adapter->adapter_index; - sse2_adapter->adapter_length = normal_adapter->adapter_length; - sse2_adapter->found_mask = - _mm_set_epi64x(normal_adapter->found_mask, 0); - }; - for (size_t j = 0; j < normal_matcher2->number_of_sequences; j++) { - size_t offset = normal_matcher1->number_of_sequences; - AdapterSequenceSSE2 *sse2_adapter = - sse2_matcher->sequences + j + offset; - AdapterSequence *normal_adapter = normal_matcher2->sequences + j; - sse2_adapter->adapter_index = normal_adapter->adapter_index; - sse2_adapter->adapter_length = normal_adapter->adapter_length; - sse2_adapter->found_mask = - _mm_set_epi64x(0, normal_adapter->found_mask); - }; - } - - for (size_t i = 0; i < (self->number_of_sse2_matchers * 2); i++) { - MachineWordPatternMatcher_destroy(self->matchers + i); - } - size_t number_of_remaining_matchers = self->number_of_matchers % 2; - if (number_of_remaining_matchers == 0) { - self->number_of_matchers = 0; - PyMem_FREE(self->matchers); - self->matchers = NULL; - return 0; - } - MachineWordPatternMatcher *matcher_tmp = - PyMem_Malloc(sizeof(MachineWordPatternMatcher)); - if (matcher_tmp == NULL) { - PyErr_NoMemory(); - return -1; - } - memcpy(matcher_tmp, self->matchers + (self->number_of_sse2_matchers * 2), - sizeof(MachineWordPatternMatcher)); - PyMem_FREE(self->matchers); - self->matchers = matcher_tmp; - self->number_of_matchers = 1; - return 0; -} -#endif - static void -populate_bitmask(bitmask_t *bitmask, char *word, size_t word_length) +populate_bitmask(bitmask_t bitmask[NUC_TABLE_SIZE], char *word, size_t word_length) { for (size_t i = 0; i < word_length; i++) { char c = word[i]; @@ -2352,6 +2223,8 @@ AdapterCounter__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs) return NULL; } size_t number_of_adapters = PyTuple_Size(adapters); + size_t number_of_matchers = 1; + size_t matcher_length = 0; if (number_of_adapters < 1) { PyErr_SetString(PyExc_ValueError, "At least one adapter is expected"); goto error; @@ -2380,26 +2253,36 @@ AdapterCounter__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs) MAX_SEQUENCE_SIZE, adapter_length, adapter); goto error; } + if (matcher_length + adapter_length > MAX_SEQUENCE_SIZE) { + matcher_length = adapter_length; + number_of_matchers += 1; + } + else { + matcher_length += adapter_length; + } } self = PyObject_New(AdapterCounter, type); - uint64_t **counter_tmp = - PyMem_Malloc(sizeof(uint64_t *) * number_of_adapters); - if (counter_tmp == NULL) { + self->adapter_counter = PyMem_Calloc(number_of_adapters, sizeof(uint64_t *)); + /* Ensure there is enough space to always do vector loads. */ + size_t matcher_array_size = number_of_matchers + 3; + self->found_masks = PyMem_Calloc(matcher_array_size, sizeof(bitmask_t)); + self->init_masks = PyMem_Calloc(matcher_array_size, sizeof(bitmask_t)); + self->adapter_sequences = + PyMem_Calloc(matcher_array_size, sizeof(AdapterSequence *)); + self->bitmasks = + PyMem_Calloc(matcher_array_size, NUC_TABLE_SIZE * sizeof(bitmask_t)); + self->by_four_bitmasks = PyMem_Calloc( + matcher_array_size / 4, NUC_TABLE_SIZE * 4 * sizeof(bitmask_t)); + if (self->adapter_counter == NULL || self->found_masks == NULL || + self->init_masks == NULL || self->adapter_sequences == NULL || + self->bitmasks == NULL || self->by_four_bitmasks == NULL) { PyErr_NoMemory(); goto error; } - memset(counter_tmp, 0, sizeof(uint64_t *) * number_of_adapters); - self->adapter_counter = counter_tmp; - self->adapters = NULL; - self->matchers = NULL; self->max_length = 0; self->number_of_adapters = number_of_adapters; - self->number_of_matchers = 0; + self->number_of_matchers = number_of_matchers; self->number_of_sequences = 0; - self->number_of_sse2_matchers = 0; -#ifdef __SSE2__ - self->sse2_matchers = NULL; -#endif size_t adapter_index = 0; size_t matcher_index = 0; PyObject *adapter; @@ -2407,22 +2290,10 @@ AdapterCounter__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs) char machine_word[MACHINE_WORD_BITS]; matcher_index = 0; while (adapter_index < number_of_adapters) { - self->number_of_matchers += 1; - MachineWordPatternMatcher *tmp = - PyMem_Realloc(self->matchers, sizeof(MachineWordPatternMatcher) * - self->number_of_matchers); - if (tmp == NULL) { - PyErr_NoMemory(); - goto error; - } - self->matchers = tmp; - memset(self->matchers + matcher_index, 0, - sizeof(MachineWordPatternMatcher)); bitmask_t found_mask = 0; bitmask_t init_mask = 0; size_t adapter_in_word_index = 0; size_t word_index = 0; - MachineWordPatternMatcher *matcher = self->matchers + matcher_index; memset(machine_word, 0, MACHINE_WORD_BITS); while (adapter_index < number_of_adapters) { adapter = PyTuple_GetItem(adapters, adapter_index); @@ -2439,31 +2310,39 @@ AdapterCounter__new__(PyTypeObject *type, PyObject *args, PyObject *kwargs) .adapter_length = adapter_length, .found_mask = 1ULL << (word_index - 1), /* Last character */ }; - AdapterSequence *adapt_tmp = - PyMem_Realloc(matcher->sequences, (adapter_in_word_index + 1) * - sizeof(AdapterSequence)); + AdapterSequence empty_adapter_sequence = {0, 0, 0}; + AdapterSequence *adapt_tmp = PyMem_Realloc( + self->adapter_sequences[matcher_index], + (adapter_in_word_index + 2) * sizeof(AdapterSequence)); if (adapt_tmp == NULL) { PyErr_NoMemory(); goto error; } - matcher->sequences = adapt_tmp; - matcher->sequences[adapter_in_word_index] = adapter_sequence; + self->adapter_sequences[matcher_index] = adapt_tmp; + self->adapter_sequences[matcher_index][adapter_in_word_index] = + adapter_sequence; + self->adapter_sequences[matcher_index][adapter_in_word_index + 1] = + empty_adapter_sequence; found_mask |= adapter_sequence.found_mask; adapter_in_word_index += 1; adapter_index += 1; } - populate_bitmask(matcher->bitmasks, machine_word, word_index); - matcher->found_mask = found_mask; - matcher->init_mask = init_mask; - matcher->number_of_sequences = adapter_in_word_index; + populate_bitmask(self->bitmasks[matcher_index], machine_word, word_index); + self->found_masks[matcher_index] = found_mask; + self->init_masks[matcher_index] = init_mask; matcher_index += 1; } - self->adapters = adapters; -#ifdef __SSE2__ - if (AdapterCounter_SSE2_convert(self) != 0) { - return NULL; + /* Initialize an array for better vectorized loading. Doing it here is + much more efficient than doing it for every string. */ + for (size_t i = 0; i < self->number_of_matchers; i += 4) { + for (size_t j = 0; j < NUC_TABLE_SIZE; j++) { + self->by_four_bitmasks[i / 4][j][0] = self->bitmasks[i][j]; + self->by_four_bitmasks[i / 4][j][1] = self->bitmasks[i + 1][j]; + self->by_four_bitmasks[i / 4][j][2] = self->bitmasks[i + 2][j]; + self->by_four_bitmasks[i / 4][j][3] = self->bitmasks[i + 3][j]; + } } -#endif + self->adapters = adapters; return (PyObject *)self; error: @@ -2494,75 +2373,144 @@ AdapterCounter_resize(AdapterCounter *self, size_t new_size) return 0; } -#ifdef __SSE2__ -static inline int -bitwise_and_nonzero_si128(__m128i vector1, __m128i vector2) -{ - /* There is no way to directly check if an entire vector is set to 0 - so some trickery needs to be done to ascertain if one of the bits is - set. - _mm_movemask_epi8 only catches the most significant bit. So we need to - set that bit. Comparison for larger than 0 does not work since only - signed comparisons are available. So the most significant bit makes - integers smaller than 0. Instead we do a saturated add of 127. - _mm_adds_epu8 works on unsigned integers. So 0b10000000 (128) will - become 255. Also everything above 0 will trigger the last bit to be set. - 0 itself results in 0b01111111 so the most significant bit will not be - set. - The sequence of instructions below is faster than - return (!_mm_test_all_zeros(vector1, vector2)); - which is available in SSE4.1. So there is no value in moving up one - instruction set. */ - __m128i and = _mm_and_si128(vector1, vector2); - __m128i res = _mm_adds_epu8(and, _mm_set1_epi8(127)); - return _mm_movemask_epi8(res); -} - -static inline __m128i -update_adapter_count_array_sse2(size_t position, __m128i R, __m128i already_found, - MachineWordPatternMatcherSSE2 *matcher, - uint64_t **adapter_counter) -{ - size_t number_of_adapters = matcher->number_of_sequences; - for (size_t i = 0; i < number_of_adapters; i++) { - AdapterSequenceSSE2 *adapter = matcher->sequences + i; - __m128i adapter_found_mask = adapter->found_mask; - if (bitwise_and_nonzero_si128(adapter_found_mask, already_found)) { - continue; - } - if (bitwise_and_nonzero_si128(R, adapter_found_mask)) { - size_t found_position = position - adapter->adapter_length + 1; - adapter_counter[adapter->adapter_index][found_position] += 1; - // Make sure we only find the adapter once at the earliest position; - already_found = _mm_or_si128(already_found, adapter_found_mask); - } - } - return already_found; -} -#endif - -static inline uint64_t -update_adapter_count_array(size_t position, uint64_t R, uint64_t already_found, - MachineWordPatternMatcher *matcher, +static inline bitmask_t +update_adapter_count_array(size_t position, bitmask_t match, + bitmask_t already_found, + AdapterSequence *adapter_sequences, uint64_t **adapter_counter) { - size_t number_of_adapters = matcher->number_of_sequences; - for (size_t k = 0; k < number_of_adapters; k++) { - AdapterSequence *adapter = matcher->sequences + k; + size_t adapter_index = 0; + while (true) { + AdapterSequence *adapter = adapter_sequences + adapter_index; + size_t adapter_length = adapter->adapter_length; + if (adapter_length == 0) { + break; + } bitmask_t adapter_found_mask = adapter->found_mask; if (adapter_found_mask & already_found) { + adapter_index += 1; continue; } - if (R & adapter_found_mask) { - size_t found_position = position - adapter->adapter_length + 1; + if (match & adapter_found_mask) { + size_t found_position = position - adapter_length + 1; adapter_counter[adapter->adapter_index][found_position] += 1; - // Make sure we only find the adapter once at the earliest position; already_found |= adapter_found_mask; } + adapter_index += 1; } return already_found; } +static void +find_single_matcher(const uint8_t *sequence, size_t sequence_length, + const bitmask_t *restrict init_masks, + const bitmask_t *restrict found_masks, + const bitmask_t (*bitmasks)[NUC_TABLE_SIZE], + AdapterSequence **adapter_sequences_store, + uint64_t **adapter_counter) +{ + bitmask_t found_mask = found_masks[0]; + bitmask_t init_mask = init_masks[0]; + bitmask_t R = 0; + bitmask_t already_found = 0; + const bitmask_t *bitmask = bitmasks[0]; + AdapterSequence *adapter_sequences = adapter_sequences_store[0]; + for (size_t pos = 0; pos < sequence_length; pos++) { + R <<= 1; + R |= init_mask; + uint8_t index = NUCLEOTIDE_TO_INDEX[sequence[pos]]; + R &= bitmask[index]; + if (R & found_mask) { + already_found = update_adapter_count_array( + pos, R, already_found, adapter_sequences, adapter_counter); + } + } +} + +static void (*find_four_matchers)(const uint8_t *sequence, size_t sequence_length, + const bitmask_t *restrict init_masks, + const bitmask_t *restrict found_masks, + const bitmask_t (*by_four_bitmasks)[4], + AdapterSequence **adapter_sequences_store, + uint64_t **adapter_counter) = NULL; + +#if COMPILER_HAS_TARGETED_DISPATCH && BUILD_IS_X86_64 +__attribute__((__target__("avx2"))) static void +find_four_matchers_avx2(const uint8_t *sequence, size_t sequence_length, + const bitmask_t *restrict init_masks, + const bitmask_t *restrict found_masks, + const bitmask_t (*by_four_bitmasks)[4], + AdapterSequence **adapter_sequences_store, + uint64_t **adapter_counter) +{ + bitmask_t fmask0 = found_masks[0]; + bitmask_t fmask1 = found_masks[1]; + bitmask_t fmask2 = found_masks[2]; + bitmask_t fmask3 = found_masks[3]; + bitmask_t already_found0 = 0; + bitmask_t already_found1 = 0; + bitmask_t already_found2 = 0; + bitmask_t already_found3 = 0; + + __m256i found_mask = _mm256_loadu_si256((const __m256i *)(found_masks)); + __m256i init_mask = _mm256_loadu_si256((const __m256i *)(init_masks)); + + __m256i R = _mm256_setzero_si256(); + const bitmask_t(*bitmask)[4] = by_four_bitmasks; + + for (size_t pos = 0; pos < sequence_length; pos++) { + R = _mm256_slli_epi64(R, 1); + R = _mm256_or_si256(R, init_mask); + uint8_t index = NUCLEOTIDE_TO_INDEX[sequence[pos]]; + R = _mm256_and_si256(R, _mm256_loadu_si256((__m256i *)bitmask[index])); + + __m256i check = _mm256_and_si256(R, found_mask); + /* Adding 0b01111111 (127) to any number higher than 0 sets the bit for + 128. Movemask collects these bits. This way we can test if there is + a 1 across the entire 256-bit vector. */ + int check_int = + _mm256_movemask_epi8(_mm256_adds_epu8(check, _mm256_set1_epi8(127))); + if (check_int) { + bitmask_t Rray[4]; + _mm256_storeu_si256(((__m256i *)Rray), R); + + if (Rray[0] & fmask0) { + already_found0 = update_adapter_count_array( + pos, Rray[0], already_found0, adapter_sequences_store[0], + adapter_counter); + } + if (Rray[1] & fmask1) { + already_found1 = update_adapter_count_array( + pos, Rray[1], already_found1, adapter_sequences_store[1], + adapter_counter); + } + if (Rray[2] & fmask2) { + already_found2 = update_adapter_count_array( + pos, Rray[2], already_found2, adapter_sequences_store[2], + adapter_counter); + } + if (Rray[3] & fmask3) { + already_found3 = update_adapter_count_array( + pos, Rray[3], already_found3, adapter_sequences_store[3], + adapter_counter); + } + } + } +} + +/* Constructor runs at dynamic load time */ +__attribute__((constructor)) static void +find_four_matchers_init_func_ptr(void) +{ + if (__builtin_cpu_supports("avx2")) { + find_four_matchers = find_four_matchers_avx2; + } + else { + find_four_matchers = NULL; + } +} +#endif + static int AdapterCounter_add_meta(AdapterCounter *self, struct FastqMeta *meta) { @@ -2576,142 +2524,29 @@ AdapterCounter_add_meta(AdapterCounter *self, struct FastqMeta *meta) return -1; } } - size_t scalar_matcher_index = 0; - size_t vector_matcher_index = 0; - size_t number_of_scalar_matchers = self->number_of_matchers; - size_t number_of_vector_matchers = self->number_of_sse2_matchers; - while (scalar_matcher_index < number_of_scalar_matchers || - vector_matcher_index < number_of_vector_matchers) { - size_t remaining_scalar_matchers = - number_of_scalar_matchers - scalar_matcher_index; - size_t remaining_vector_matchers = - number_of_vector_matchers - vector_matcher_index; - - if (remaining_vector_matchers == 0 && remaining_scalar_matchers > 0) { - MachineWordPatternMatcher *matcher = - self->matchers + scalar_matcher_index; - bitmask_t found_mask = matcher->found_mask; - bitmask_t init_mask = matcher->init_mask; - bitmask_t R = 0; - bitmask_t *bitmask = matcher->bitmasks; - bitmask_t already_found = 0; - scalar_matcher_index += 1; - for (size_t pos = 0; pos < sequence_length; pos++) { - R <<= 1; - R |= init_mask; - uint8_t index = NUCLEOTIDE_TO_INDEX[sequence[pos]]; - R &= bitmask[index]; - if (R & found_mask) { - already_found = update_adapter_count_array( - pos, R, already_found, matcher, self->adapter_counter); - } - } - } -#ifdef __SSE2__ - else if (remaining_vector_matchers == 1 && remaining_scalar_matchers == 0) { - MachineWordPatternMatcherSSE2 *matcher = - self->sse2_matchers + vector_matcher_index; - __m128i found_mask = matcher->found_mask; - __m128i init_mask = matcher->init_mask; - __m128i R = _mm_setzero_si128(); - __m128i *bitmask = matcher->bitmasks; - __m128i already_found = _mm_setzero_si128(); - vector_matcher_index += 1; - for (size_t pos = 0; pos < sequence_length; pos++) { - R = _mm_slli_epi64(R, 1); - R = _mm_or_si128(R, init_mask); - uint8_t index = NUCLEOTIDE_TO_INDEX[sequence[pos]]; - __m128i mask = bitmask[index]; - R = _mm_and_si128(R, mask); - if (bitwise_and_nonzero_si128(R, found_mask)) { - already_found = update_adapter_count_array_sse2( - pos, R, already_found, matcher, self->adapter_counter); - } - } - /* In the cases below we take advantage of out of order execution - on the CPU by checking two matchers at the same time. Either two - sse2 matchers, or a bitmask_t matcher and a vector matcher. - Shift-AND is a highly dependent chain of actions, meaning there - is no opportunity for the CPU to do two thing simultaneously. By - doing two shift-AND routines at the same time, there are two - independent paths that the CPU can evaluate using out of order - execution. This leads to significant speedups. */ - } - else if (remaining_vector_matchers == 1 && remaining_scalar_matchers == 1) { - MachineWordPatternMatcherSSE2 *vector_matcher = - self->sse2_matchers + vector_matcher_index; - MachineWordPatternMatcher *scalar_matcher = - self->matchers + scalar_matcher_index; - __m128i vector_found_mask = vector_matcher->found_mask; - bitmask_t scalar_found_mask = scalar_matcher->found_mask; - __m128i vector_init_mask = vector_matcher->init_mask; - bitmask_t scalar_init_mask = scalar_matcher->init_mask; - __m128i vector_R = _mm_setzero_si128(); - bitmask_t scalar_R = 0; - __m128i *vector_bitmasks = vector_matcher->bitmasks; - bitmask_t *scalar_bitmasks = scalar_matcher->bitmasks; - __m128i vector_already_found = _mm_setzero_si128(); - bitmask_t scalar_already_found = 0; - vector_matcher_index += 1; - scalar_matcher_index += 1; - for (size_t pos = 0; pos < sequence_length; pos++) { - vector_R = _mm_slli_epi64(vector_R, 1); - scalar_R <<= 1; - vector_R = _mm_or_si128(vector_R, vector_init_mask); - scalar_R |= scalar_init_mask; - uint8_t index = NUCLEOTIDE_TO_INDEX[sequence[pos]]; - scalar_R &= scalar_bitmasks[index]; - __m128i vector_mask = vector_bitmasks[index]; - vector_R = _mm_and_si128(vector_R, vector_mask); - if (bitwise_and_nonzero_si128(vector_R, vector_found_mask)) { - vector_already_found = update_adapter_count_array_sse2( - pos, vector_R, vector_already_found, vector_matcher, - self->adapter_counter); - } - if (scalar_R & scalar_found_mask) { - scalar_already_found = update_adapter_count_array( - pos, scalar_R, scalar_already_found, scalar_matcher, - self->adapter_counter); - } - } - } - else if (remaining_vector_matchers > 1) { - MachineWordPatternMatcherSSE2 *matcher1 = - self->sse2_matchers + vector_matcher_index; - MachineWordPatternMatcherSSE2 *matcher2 = - self->sse2_matchers + vector_matcher_index + 1; - __m128i found_mask1 = matcher1->found_mask; - __m128i found_mask2 = matcher2->found_mask; - __m128i init_mask1 = matcher1->init_mask; - __m128i init_mask2 = matcher2->init_mask; - __m128i R1 = _mm_setzero_si128(); - __m128i R2 = _mm_setzero_si128(); - __m128i *bitmasks1 = matcher1->bitmasks; - __m128i *bitmasks2 = matcher2->bitmasks; - __m128i already_found1 = _mm_setzero_si128(); - __m128i already_found2 = _mm_setzero_si128(); - vector_matcher_index += 2; - for (size_t pos = 0; pos < sequence_length; pos++) { - R1 = _mm_slli_epi64(R1, 1); - R2 = _mm_slli_epi64(R2, 1); - R1 = _mm_or_si128(R1, init_mask1); - R2 = _mm_or_si128(R2, init_mask2); - uint8_t index = NUCLEOTIDE_TO_INDEX[sequence[pos]]; - __m128i mask1 = bitmasks1[index]; - __m128i mask2 = bitmasks2[index]; - R1 = _mm_and_si128(R1, mask1); - R2 = _mm_and_si128(R2, mask2); - if (bitwise_and_nonzero_si128(R1, found_mask1)) { - already_found1 = update_adapter_count_array_sse2( - pos, R1, already_found1, matcher1, self->adapter_counter); - } - if (bitwise_and_nonzero_si128(R2, found_mask2)) { - already_found2 = update_adapter_count_array_sse2( - pos, R2, already_found2, matcher2, self->adapter_counter); - } - } + size_t number_of_matchers = self->number_of_matchers; + size_t matcher_index = 0; + bitmask_t *found_masks = self->found_masks; + bitmask_t *init_masks = self->init_masks; + bitmask_t(*bitmasks)[5] = self->bitmasks; + AdapterSequence **adapter_sequences = self->adapter_sequences; + uint64_t **adapter_count_array = self->adapter_counter; + while (matcher_index < number_of_matchers) { + /* Only run when a vectorized function pointer is initialized */ + if (find_four_matchers && number_of_matchers - matcher_index > 1) { + find_four_matchers( + sequence, sequence_length, init_masks + matcher_index, + found_masks + matcher_index, + self->by_four_bitmasks[matcher_index / 4], + adapter_sequences + matcher_index, adapter_count_array); + matcher_index += 4; + continue; } -#endif + find_single_matcher( + sequence, sequence_length, init_masks + matcher_index, + found_masks + matcher_index, bitmasks + matcher_index, + adapter_sequences + matcher_index, adapter_count_array); + matcher_index += 1; } return 0; }