From 2223ea396acc9223ed7dea91e7238641a5e0a95c Mon Sep 17 00:00:00 2001 From: Nick Babcock Date: Sun, 7 Jan 2024 06:39:46 -0600 Subject: [PATCH] Add optimistic text parsing for 20% improvement In the same vein of data-driven optimizations: #111, #112 This commit introduces a fast, happy path parsing routine that takes advantage of how typical save files are laid out and how the text deserializer essentially only calls `read` when it either needs a key or a value. Keys will have leading whitespace (newline followed by tabs), so we consume up to 8 of them at once. For EU4 this covers 100% of keys in about 95% of saves (i.e., it is extremely rare for there to be more than 8 whitespace characters in a row). This happy path hoists the common keys and values (`{`, `}`, `"`, and alphanumeric+dash) so it's more obvious to the compiler and CPU what we're looking for. After all these years, I still can't derive a good function to identify a boundary character within 8 bytes, but I think I found the next best thing: loop unrolling. Speaking of boundary characters, I removed the notion of character classes as they were unused. The happy path for parsing quoted data will now process 8 bytes at once and will ask for forgiveness if an escape character is encountered. The 20% improvement comes from measuring EU4 save deserialization, so the improvement to this individual function is much greater. 
--- src/data.rs | 41 +++++++++--------- src/text/reader.rs | 106 +++++++++++++++++++++++++++++++++++++-------- src/util.rs | 20 +++++++++ 3 files changed, 129 insertions(+), 38 deletions(-) diff --git a/src/data.rs b/src/data.rs index 8e0d3fa..79acfb4 100644 --- a/src/data.rs +++ b/src/data.rs @@ -44,33 +44,34 @@ const fn create_windows_1252_table() -> [char; 256] { } pub(crate) static WINDOWS_1252: [char; 256] = create_windows_1252_table(); -pub(crate) const BOUNDARY: u8 = 1; -pub(crate) const WHITESPACE: u8 = 2; -pub(crate) const OPERATOR: u8 = 4; -pub(crate) const COMMENT: u8 = 8; #[inline] pub(crate) fn is_boundary(b: u8) -> bool { - CHARACTER_CLASS[usize::from(b)] != 0 + boundary(b) != 0 +} + +#[inline] +pub(crate) fn boundary(b: u8) -> u8 { + CHARACTER_CLASS[usize::from(b)] } const fn create_character_class_table() -> [u8; 256] { let mut table = [0u8; 256]; - table[b'\t' as usize] = WHITESPACE; - table[b'\n' as usize] = WHITESPACE; - table[b'\x0b' as usize] = WHITESPACE; // \v - table[b'\x0c' as usize] = WHITESPACE; // \f - table[b'\r' as usize] = WHITESPACE; - table[b' ' as usize] = WHITESPACE; - table[b'!' as usize] = OPERATOR; - table[b'#' as usize] = COMMENT; - table[b'<' as usize] = OPERATOR; - table[b'=' as usize] = OPERATOR; - table[b'>' as usize] = OPERATOR; - table[b'[' as usize] = BOUNDARY; - table[b']' as usize] = BOUNDARY; - table[b'}' as usize] = BOUNDARY; - table[b'{' as usize] = BOUNDARY; + table[b'\t' as usize] = 1; + table[b'\n' as usize] = 1; + table[b'\x0b' as usize] = 1; // \v + table[b'\x0c' as usize] = 1; // \f + table[b'\r' as usize] = 1; + table[b' ' as usize] = 1; + table[b'!' 
as usize] = 1; + table[b'#' as usize] = 1; + table[b'<' as usize] = 1; + table[b'=' as usize] = 1; + table[b'>' as usize] = 1; + table[b'[' as usize] = 1; + table[b']' as usize] = 1; + table[b'}' as usize] = 1; + table[b'{' as usize] = 1; table } diff --git a/src/text/reader.rs b/src/text/reader.rs index 70fba6f..ec16dc5 100644 --- a/src/text/reader.rs +++ b/src/text/reader.rs @@ -2,7 +2,7 @@ use super::Operator; use crate::{ buffer::{BufferError, BufferWindow, BufferWindowBuilder}, data::is_boundary, - util::{contains_zero_byte, count_chunk, repeat_byte}, + util::{contains_zero_byte, count_chunk, leading_whitespace, repeat_byte}, Scalar, }; use std::io::Read; @@ -134,8 +134,7 @@ where self.buf.position() } - #[inline] - unsafe fn next_opt(&mut self) -> (Option, Option) { + unsafe fn next_opt_fallback(&mut self) -> (Option, Option) { #[derive(Debug)] enum ParseState { None, @@ -155,21 +154,7 @@ where 'inner: loop { match *ptr { - c @ b' ' | c @ b'\t' => { - ptr = ptr.add(1); - loop { - if ptr == end { - break 'eof (0, 0); - } - - if *ptr != c { - break; - } - - ptr = ptr.add(1) - } - } - b'\n' | b'\r' | b';' => { + b' ' | b'\t' | b'\n' | b'\r' | b';' => { ptr = ptr.add(1); break 'inner; } @@ -425,6 +410,91 @@ where } } + #[inline] + unsafe fn next_opt(&mut self) -> (Option, Option) { + let mut ptr = self.buf.start; + let end = self.buf.end; + + if end.offset_from(ptr) < 9 { + return self.next_opt_fallback(); + } + + // 3.4 million newlines followed by an average of 3.3 tabs + let data = ptr.cast::().read_unaligned().to_le(); + ptr = ptr.add(leading_whitespace(data) as usize); + + // Eagerly check for brackets, there'll be millions of them + if *ptr == b'{' { + self.buf.advance_to(ptr.add(1)); + return (Some(Token::Open), None); + } else if *ptr == b'}' { + self.buf.advance_to(ptr.add(1)); + return (Some(Token::Close), None); + } + // unquoted values are the most frequent type of values in + // text so if we see something that is alphanumeric or a + // dash (for 
negative numbers) we eagerly attempt to match + // against it. Loop unrolling is used to minimize the number + // of access to the boundary lookup table. + else if matches!(*ptr, b'a'..=b'z' | b'0'..=b'9' | b'A'..=b'Z' | b'-') { + let start_ptr = ptr; + let mut opt_ptr = start_ptr.add(1); + while end.offset_from(opt_ptr) > 8 { + for _ in 0..8 { + if is_boundary(*opt_ptr) { + self.buf.advance_to(opt_ptr); + + // for space delimited arrays, advance one + if *opt_ptr == b' ' { + self.buf.advance(1); + } + + let scalar = self.buf.get(start_ptr..opt_ptr); + return (Some(Token::Unquoted(scalar)), None); + } + opt_ptr = opt_ptr.add(1); + } + } + + // optimization failed, fallback to inner parsing loop + } else if *ptr == b'\"' { + let start_ptr = ptr.add(1); + let mut opt_ptr = start_ptr; + let mut escaped = false; + while end.offset_from(opt_ptr) > 8 { + let data = opt_ptr.cast::().read_unaligned().to_le(); + escaped |= contains_zero_byte(data ^ repeat_byte(b'\\')); + + // http://0x80.pl/notesen/2023-03-06-swar-find-any.html#faster-swar-procedure + let mask = repeat_byte(0x7f); + let lobits = data & mask; + let x0 = (lobits ^ repeat_byte(b'\"')) + mask; + let t0 = x0 | data; + let t1 = t0 & repeat_byte(0x80); + let t2 = t1 ^ repeat_byte(0x80); + + if t2 != 0 { + let quote_ind = t2.trailing_zeros() >> 3; + + if !escaped { + opt_ptr = opt_ptr.add(quote_ind as usize); + self.buf.advance_to(opt_ptr.add(1)); + let scalar = self.buf.get(start_ptr..opt_ptr); + return (Some(Token::Quoted(scalar)), None); + } else { + break; + } + } else { + opt_ptr = opt_ptr.add(8); + } + } + + // optimization failed, fallback to inner parsing loop + } + + self.next_opt_fallback() + } + /// Advance a given number of bytes and return them. /// /// The internal buffer must be large enough to accomodate all bytes. 
diff --git a/src/util.rs b/src/util.rs index f7f1d94..9487c0e 100644 --- a/src/util.rs +++ b/src/util.rs @@ -72,11 +72,31 @@ pub(crate) const fn count_chunk(value: u64, byte: u8) -> u64 { sum_usize(bytewise_equal(value, repeat_byte(byte))) } +#[inline] +pub(crate) fn leading_whitespace(value: u64) -> u32 { + let mask1 = repeat_byte(b'\t'); + let mask2 = repeat_byte(b'\n'); + let res1 = value ^ mask1; + let res2 = value ^ mask2; + (res1 & res2).trailing_zeros() >> 3 +} + #[cfg(test)] mod tests { use super::*; use rstest::*; + #[rstest] + #[case(*b"\t\t\t\t\t\t\t\t", 8)] + #[case(*b"a\t\t\t\t\t\t\t", 0)] + #[case(*b"\t ", 1)] + #[case(*b"\n\na ", 2)] + #[case(*b"\n\ta ", 2)] + fn test_leading_whitespace(#[case] input: [u8; 8], #[case] expected: u32) { + let lhs = u64::from_le_bytes(input); + assert_eq!(leading_whitespace(lhs), expected); + } + #[rstest] #[case(*b" ", 0)] #[case(*b" { ", 1)]