Skip to content

Commit 0e0c680

Browse files
authored
Merge pull request #153 from rakaly/text
Add optimistic text parsing for 20% improvement
2 parents ebe4999 + 2223ea3 commit 0e0c680

File tree

3 files changed

+129
-38
lines changed

3 files changed

+129
-38
lines changed

src/data.rs

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -44,33 +44,34 @@ const fn create_windows_1252_table() -> [char; 256] {
4444
}
4545

4646
pub(crate) static WINDOWS_1252: [char; 256] = create_windows_1252_table();
47-
pub(crate) const BOUNDARY: u8 = 1;
48-
pub(crate) const WHITESPACE: u8 = 2;
49-
pub(crate) const OPERATOR: u8 = 4;
50-
pub(crate) const COMMENT: u8 = 8;
5147

5248
#[inline]
5349
pub(crate) fn is_boundary(b: u8) -> bool {
54-
CHARACTER_CLASS[usize::from(b)] != 0
50+
boundary(b) != 0
51+
}
52+
53+
#[inline]
54+
pub(crate) fn boundary(b: u8) -> u8 {
55+
CHARACTER_CLASS[usize::from(b)]
5556
}
5657

5758
const fn create_character_class_table() -> [u8; 256] {
5859
let mut table = [0u8; 256];
59-
table[b'\t' as usize] = WHITESPACE;
60-
table[b'\n' as usize] = WHITESPACE;
61-
table[b'\x0b' as usize] = WHITESPACE; // \v
62-
table[b'\x0c' as usize] = WHITESPACE; // \f
63-
table[b'\r' as usize] = WHITESPACE;
64-
table[b' ' as usize] = WHITESPACE;
65-
table[b'!' as usize] = OPERATOR;
66-
table[b'#' as usize] = COMMENT;
67-
table[b'<' as usize] = OPERATOR;
68-
table[b'=' as usize] = OPERATOR;
69-
table[b'>' as usize] = OPERATOR;
70-
table[b'[' as usize] = BOUNDARY;
71-
table[b']' as usize] = BOUNDARY;
72-
table[b'}' as usize] = BOUNDARY;
73-
table[b'{' as usize] = BOUNDARY;
60+
table[b'\t' as usize] = 1;
61+
table[b'\n' as usize] = 1;
62+
table[b'\x0b' as usize] = 1; // \v
63+
table[b'\x0c' as usize] = 1; // \f
64+
table[b'\r' as usize] = 1;
65+
table[b' ' as usize] = 1;
66+
table[b'!' as usize] = 1;
67+
table[b'#' as usize] = 1;
68+
table[b'<' as usize] = 1;
69+
table[b'=' as usize] = 1;
70+
table[b'>' as usize] = 1;
71+
table[b'[' as usize] = 1;
72+
table[b']' as usize] = 1;
73+
table[b'}' as usize] = 1;
74+
table[b'{' as usize] = 1;
7475
table
7576
}
7677

src/text/reader.rs

Lines changed: 88 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ use super::Operator;
22
use crate::{
33
buffer::{BufferError, BufferWindow, BufferWindowBuilder},
44
data::is_boundary,
5-
util::{contains_zero_byte, count_chunk, repeat_byte},
5+
util::{contains_zero_byte, count_chunk, leading_whitespace, repeat_byte},
66
Scalar,
77
};
88
use std::io::Read;
@@ -134,8 +134,7 @@ where
134134
self.buf.position()
135135
}
136136

137-
#[inline]
138-
unsafe fn next_opt(&mut self) -> (Option<Token>, Option<ReaderError>) {
137+
unsafe fn next_opt_fallback(&mut self) -> (Option<Token>, Option<ReaderError>) {
139138
#[derive(Debug)]
140139
enum ParseState {
141140
None,
@@ -155,21 +154,7 @@ where
155154

156155
'inner: loop {
157156
match *ptr {
158-
c @ b' ' | c @ b'\t' => {
159-
ptr = ptr.add(1);
160-
loop {
161-
if ptr == end {
162-
break 'eof (0, 0);
163-
}
164-
165-
if *ptr != c {
166-
break;
167-
}
168-
169-
ptr = ptr.add(1)
170-
}
171-
}
172-
b'\n' | b'\r' | b';' => {
157+
b' ' | b'\t' | b'\n' | b'\r' | b';' => {
173158
ptr = ptr.add(1);
174159
break 'inner;
175160
}
@@ -425,6 +410,91 @@ where
425410
}
426411
}
427412

413+
#[inline]
414+
unsafe fn next_opt(&mut self) -> (Option<Token>, Option<ReaderError>) {
415+
let mut ptr = self.buf.start;
416+
let end = self.buf.end;
417+
418+
if end.offset_from(ptr) < 9 {
419+
return self.next_opt_fallback();
420+
}
421+
422+
// Skip a leading run of tabs/newlines using one 8-byte load
// (observed: 3.4 million newlines followed by an average of 3.3 tabs)
423+
let data = ptr.cast::<u64>().read_unaligned().to_le();
424+
ptr = ptr.add(leading_whitespace(data) as usize);
425+
426+
// Eagerly check for brackets, there'll be millions of them
427+
if *ptr == b'{' {
428+
self.buf.advance_to(ptr.add(1));
429+
return (Some(Token::Open), None);
430+
} else if *ptr == b'}' {
431+
self.buf.advance_to(ptr.add(1));
432+
return (Some(Token::Close), None);
433+
}
434+
// unquoted values are the most frequent type of values in
435+
// text so if we see something that is alphanumeric or a
436+
// dash (for negative numbers) we eagerly attempt to match
437+
// against it. Loop unrolling is used to minimize the number
438+
// of accesses to the boundary lookup table.
439+
else if matches!(*ptr, b'a'..=b'z' | b'0'..=b'9' | b'A'..=b'Z' | b'-') {
440+
let start_ptr = ptr;
441+
let mut opt_ptr = start_ptr.add(1);
442+
while end.offset_from(opt_ptr) > 8 {
443+
for _ in 0..8 {
444+
if is_boundary(*opt_ptr) {
445+
self.buf.advance_to(opt_ptr);
446+
447+
// for space delimited arrays, advance one
448+
if *opt_ptr == b' ' {
449+
self.buf.advance(1);
450+
}
451+
452+
let scalar = self.buf.get(start_ptr..opt_ptr);
453+
return (Some(Token::Unquoted(scalar)), None);
454+
}
455+
opt_ptr = opt_ptr.add(1);
456+
}
457+
}
458+
459+
// optimization failed, fallback to inner parsing loop
460+
} else if *ptr == b'\"' {
461+
let start_ptr = ptr.add(1);
462+
let mut opt_ptr = start_ptr;
463+
let mut escaped = false;
464+
while end.offset_from(opt_ptr) > 8 {
465+
let data = opt_ptr.cast::<u64>().read_unaligned().to_le();
466+
escaped |= contains_zero_byte(data ^ repeat_byte(b'\\'));
467+
468+
// http://0x80.pl/notesen/2023-03-06-swar-find-any.html#faster-swar-procedure
469+
let mask = repeat_byte(0x7f);
470+
let lobits = data & mask;
471+
let x0 = (lobits ^ repeat_byte(b'\"')) + mask;
472+
let t0 = x0 | data;
473+
let t1 = t0 & repeat_byte(0x80);
474+
let t2 = t1 ^ repeat_byte(0x80);
475+
476+
if t2 != 0 {
477+
let quote_ind = t2.trailing_zeros() >> 3;
478+
479+
if !escaped {
480+
opt_ptr = opt_ptr.add(quote_ind as usize);
481+
self.buf.advance_to(opt_ptr.add(1));
482+
let scalar = self.buf.get(start_ptr..opt_ptr);
483+
return (Some(Token::Quoted(scalar)), None);
484+
} else {
485+
break;
486+
}
487+
} else {
488+
opt_ptr = opt_ptr.add(8);
489+
}
490+
}
491+
492+
// optimization failed, fallback to inner parsing loop
493+
}
494+
495+
self.next_opt_fallback()
496+
}
497+
428498
/// Advance a given number of bytes and return them.
429499
///
430500
/// The internal buffer must be large enough to accommodate all bytes.

src/util.rs

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,11 +72,31 @@ pub(crate) const fn count_chunk(value: u64, byte: u8) -> u64 {
7272
sum_usize(bytewise_equal(value, repeat_byte(byte)))
7373
}
7474

75+
#[inline]
76+
pub(crate) fn leading_whitespace(value: u64) -> u32 {
77+
let mask1 = repeat_byte(b'\t');
78+
let mask2 = repeat_byte(b'\n');
79+
let res1 = value ^ mask1;
80+
let res2 = value ^ mask2;
81+
(res1 & res2).trailing_zeros() >> 3
82+
}
83+
7584
#[cfg(test)]
7685
mod tests {
7786
use super::*;
7887
use rstest::*;
7988

89+
#[rstest]
90+
#[case(*b"\t\t\t\t\t\t\t\t", 8)]
91+
#[case(*b"a\t\t\t\t\t\t\t", 0)]
92+
#[case(*b"\t       ", 1)]
93+
#[case(*b"\n\na     ", 2)]
94+
#[case(*b"\n\ta     ", 2)]
95+
fn test_leading_whitespace(#[case] input: [u8; 8], #[case] expected: u32) {
96+
let lhs = u64::from_le_bytes(input);
97+
assert_eq!(leading_whitespace(lhs), expected);
98+
}
99+
80100
#[rstest]
81101
#[case(*b" ", 0)]
82102
#[case(*b" { ", 1)]

0 commit comments

Comments
 (0)