From 2223ea396acc9223ed7dea91e7238641a5e0a95c Mon Sep 17 00:00:00 2001
From: Nick Babcock <nbabcock19@hotmail.com>
Date: Sun, 7 Jan 2024 06:39:46 -0600
Subject: [PATCH] Add optimistic text parsing for 20% improvement

In the same vein as the data-driven optimizations in #111 and #112.

This commit introduces a fast, happy path parsing routine that takes
advantage of how typical save files are laid out and how the text
deserializer essentially only calls `read` when it either needs a key or
a value.

Keys will have leading whitespace (newline followed by tabs), so we
consume up to 8 of them at once. For EU4 this covers 100% of keys
in about 95% of saves (i.e., it is extremely rare for there to be more
than 8 whitespace characters in a row).

This happy path hoists the common keys and values (`{`, `}`, `"`,
and alphanumeric+dash) so it's more obvious to the compiler and CPU what
we're looking for.

After all these years, I still can't derive a good function to identify
a boundary character within 8 bytes, but I think I found the next best
thing: loop unrolling. Speaking of boundary characters, I removed the
notion of character classes as they were unused.

The happy path for parsing quoted data will now process 8 bytes at once
and will ask for forgiveness if an escape character is encountered.

The 20% improvement comes from measuring eu4 save deserialization, so
the improvement to this individual function is much greater.
---
 src/data.rs        |  41 +++++++++---------
 src/text/reader.rs | 106 +++++++++++++++++++++++++++++++++++++--------
 src/util.rs        |  20 +++++++++
 3 files changed, 129 insertions(+), 38 deletions(-)
diff --git a/src/data.rs b/src/data.rs
index 8e0d3fa..79acfb4 100644
--- a/src/data.rs
+++ b/src/data.rs
@@ -44,33 +44,34 @@ const fn create_windows_1252_table() -> [char; 256] {
 }
 
 pub(crate) static WINDOWS_1252: [char; 256] = create_windows_1252_table();
-pub(crate) const BOUNDARY: u8 = 1;
-pub(crate) const WHITESPACE: u8 = 2;
-pub(crate) const OPERATOR: u8 = 4;
-pub(crate) const COMMENT: u8 = 8;
 
 #[inline]
 pub(crate) fn is_boundary(b: u8) -> bool {
-    CHARACTER_CLASS[usize::from(b)] != 0
+    boundary(b) != 0
+}
+
+#[inline]
+pub(crate) fn boundary(b: u8) -> u8 {
+    CHARACTER_CLASS[usize::from(b)]
 }
 
 const fn create_character_class_table() -> [u8; 256] {
     let mut table = [0u8; 256];
-    table[b'\t' as usize] = WHITESPACE;
-    table[b'\n' as usize] = WHITESPACE;
-    table[b'\x0b' as usize] = WHITESPACE; // \v
-    table[b'\x0c' as usize] = WHITESPACE; // \f
-    table[b'\r' as usize] = WHITESPACE;
-    table[b' ' as usize] = WHITESPACE;
-    table[b'!' as usize] = OPERATOR;
-    table[b'#' as usize] = COMMENT;
-    table[b'<' as usize] = OPERATOR;
-    table[b'=' as usize] = OPERATOR;
-    table[b'>' as usize] = OPERATOR;
-    table[b'[' as usize] = BOUNDARY;
-    table[b']' as usize] = BOUNDARY;
-    table[b'}' as usize] = BOUNDARY;
-    table[b'{' as usize] = BOUNDARY;
+    table[b'\t' as usize] = 1;
+    table[b'\n' as usize] = 1;
+    table[b'\x0b' as usize] = 1; // \v
+    table[b'\x0c' as usize] = 1; // \f
+    table[b'\r' as usize] = 1;
+    table[b' ' as usize] = 1;
+    table[b'!' as usize] = 1;
+    table[b'#' as usize] = 1;
+    table[b'<' as usize] = 1;
+    table[b'=' as usize] = 1;
+    table[b'>' as usize] = 1;
+    table[b'[' as usize] = 1;
+    table[b']' as usize] = 1;
+    table[b'}' as usize] = 1;
+    table[b'{' as usize] = 1;
     table
 }
 
diff --git a/src/text/reader.rs b/src/text/reader.rs
index 70fba6f..ec16dc5 100644
--- a/src/text/reader.rs
+++ b/src/text/reader.rs
@@ -2,7 +2,7 @@ use super::Operator;
 use crate::{
     buffer::{BufferError, BufferWindow, BufferWindowBuilder},
     data::is_boundary,
-    util::{contains_zero_byte, count_chunk, repeat_byte},
+    util::{contains_zero_byte, count_chunk, leading_whitespace, repeat_byte},
     Scalar,
 };
 use std::io::Read;
@@ -134,8 +134,7 @@ where
         self.buf.position()
     }
 
-    #[inline]
-    unsafe fn next_opt(&mut self) -> (Option<Token>, Option<ReaderError>) {
+    unsafe fn next_opt_fallback(&mut self) -> (Option<Token>, Option<ReaderError>) {
         #[derive(Debug)]
         enum ParseState {
             None,
@@ -155,21 +154,7 @@ where
 
                     'inner: loop {
                         match *ptr {
-                            c @ b' ' | c @ b'\t' => {
-                                ptr = ptr.add(1);
-                                loop {
-                                    if ptr == end {
-                                        break 'eof (0, 0);
-                                    }
-
-                                    if *ptr != c {
-                                        break;
-                                    }
-
-                                    ptr = ptr.add(1)
-                                }
-                            }
-                            b'\n' | b'\r' | b';' => {
+                            b' ' | b'\t' | b'\n' | b'\r' | b';' => {
                                 ptr = ptr.add(1);
                                 break 'inner;
                             }
@@ -425,6 +410,91 @@ where
         }
     }
 
+    #[inline]
+    unsafe fn next_opt(&mut self) -> (Option<Token>, Option<ReaderError>) {
+        let mut ptr = self.buf.start;
+        let end = self.buf.end;
+
+        if end.offset_from(ptr) < 9 {
+            return self.next_opt_fallback();
+        }
+
+        // 3.4 million newlines followed by an average of 3.3 tabs
+        let data = ptr.cast::<u64>().read_unaligned().to_le();
+        ptr = ptr.add(leading_whitespace(data) as usize);
+
+        // Eagerly check for brackets, there'll be millions of them
+        if *ptr == b'{' {
+            self.buf.advance_to(ptr.add(1));
+            return (Some(Token::Open), None);
+        } else if *ptr == b'}' {
+            self.buf.advance_to(ptr.add(1));
+            return (Some(Token::Close), None);
+        }
+        // unquoted values are the most frequent type of values in
+        // text so if we see something that is alphanumeric or a
+        // dash (for negative numbers) we eagerly attempt to match
+        // against it. Loop unrolling is used to minimize the number
+        // of access to the boundary lookup table.
+        else if matches!(*ptr, b'a'..=b'z' | b'0'..=b'9' | b'A'..=b'Z' | b'-') {
+            let start_ptr = ptr;
+            let mut opt_ptr = start_ptr.add(1);
+            while end.offset_from(opt_ptr) > 8 {
+                for _ in 0..8 {
+                    if is_boundary(*opt_ptr) {
+                        self.buf.advance_to(opt_ptr);
+
+                        // for space delimited arrays, advance one
+                        if *opt_ptr == b' ' {
+                            self.buf.advance(1);
+                        }
+
+                        let scalar = self.buf.get(start_ptr..opt_ptr);
+                        return (Some(Token::Unquoted(scalar)), None);
+                    }
+                    opt_ptr = opt_ptr.add(1);
+                }
+            }
+
+            // optimization failed, fallback to inner parsing loop
+        } else if *ptr == b'\"' {
+            let start_ptr = ptr.add(1);
+            let mut opt_ptr = start_ptr;
+            let mut escaped = false;
+            while end.offset_from(opt_ptr) > 8 {
+                let data = opt_ptr.cast::<u64>().read_unaligned().to_le();
+                escaped |= contains_zero_byte(data ^ repeat_byte(b'\\'));
+
+                // http://0x80.pl/notesen/2023-03-06-swar-find-any.html#faster-swar-procedure
+                let mask = repeat_byte(0x7f);
+                let lobits = data & mask;
+                let x0 = (lobits ^ repeat_byte(b'\"')) + mask;
+                let t0 = x0 | data;
+                let t1 = t0 & repeat_byte(0x80);
+                let t2 = t1 ^ repeat_byte(0x80);
+
+                if t2 != 0 {
+                    let quote_ind = t2.trailing_zeros() >> 3;
+
+                    if !escaped {
+                        opt_ptr = opt_ptr.add(quote_ind as usize);
+                        self.buf.advance_to(opt_ptr.add(1));
+                        let scalar = self.buf.get(start_ptr..opt_ptr);
+                        return (Some(Token::Quoted(scalar)), None);
+                    } else {
+                        break;
+                    }
+                } else {
+                    opt_ptr = opt_ptr.add(8);
+                }
+            }
+
+            // optimization failed, fallback to inner parsing loop
+        }
+
+        self.next_opt_fallback()
+    }
+
     /// Advance a given number of bytes and return them.
     ///
     /// The internal buffer must be large enough to accomodate all bytes.
diff --git a/src/util.rs b/src/util.rs
index f7f1d94..9487c0e 100644
--- a/src/util.rs
+++ b/src/util.rs
@@ -72,11 +72,31 @@ pub(crate) const fn count_chunk(value: u64, byte: u8) -> u64 {
     sum_usize(bytewise_equal(value, repeat_byte(byte)))
 }
 
+#[inline]
+pub(crate) fn leading_whitespace(value: u64) -> u32 {
+    let mask1 = repeat_byte(b'\t');
+    let mask2 = repeat_byte(b'\n');
+    let res1 = value ^ mask1;
+    let res2 = value ^ mask2;
+    (res1 & res2).trailing_zeros() >> 3
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
     use rstest::*;
 
+    #[rstest]
+    #[case(*b"\t\t\t\t\t\t\t\t", 8)]
+    #[case(*b"a\t\t\t\t\t\t\t", 0)]
+    #[case(*b"\t       ", 1)]
+    #[case(*b"\n\na     ", 2)]
+    #[case(*b"\n\ta     ", 2)]
+    fn test_leading_whitespace(#[case] input: [u8; 8], #[case] expected: u32) {
+        let lhs = u64::from_le_bytes(input);
+        assert_eq!(leading_whitespace(lhs), expected);
+    }
+
     #[rstest]
     #[case(*b"        ", 0)]
     #[case(*b"   {    ", 1)]