Merge pull request #146 from rakaly/encoding

Simplify text decoding routines
rakaly · Dec 21, 2023 · 0d314ac · 0d314ac
2 parents 5f5b4cf + 6bed8dd
commit 0d314ac
Show file tree

Hide file tree

Showing 3 changed files with 64 additions and 61 deletions.
diff --git a/src/bin/stats.rs b/src/bin/stats.rs
@@ -19,6 +19,7 @@ struct Stats {
     token: u32,
     rgb: u32,
     i64: u32,
+    frequencies: Vec<u64>,
 }
 
 impl Stats {
@@ -34,8 +35,14 @@ impl Stats {
             BinaryToken::U64(_) => self.u64 += 1,
             BinaryToken::I64(_) => self.i64 += 1,
             BinaryToken::I32(_) => self.i32 += 1,
-            BinaryToken::Quoted(_) => self.quoted += 1,
-            BinaryToken::Unquoted(_) => self.unquoted += 1,
+            BinaryToken::Quoted(x) => {
+                self.frequencies[x.as_bytes().len()] += 1;
+                self.quoted += 1
+            }
+            BinaryToken::Unquoted(x) => {
+                self.frequencies[x.as_bytes().len()] += 1;
+                self.unquoted += 1
+            }
             BinaryToken::F32(_) => self.f32 += 1,
             BinaryToken::F64(_) => self.f64 += 1,
             BinaryToken::Token(_) => self.token += 1,
@@ -182,6 +189,28 @@ impl std::fmt::Display for Stats {
 
         writeln!(f, "total:\t\t{:<8}", total)?;
 
+        let count = self.frequencies.iter().sum::<u64>();
+        let sum = self
+            .frequencies
+            .iter()
+            .enumerate()
+            .map(|(i, x)| (i as u64) * *x)
+            .sum::<u64>();
+        let median_ind = (count + 1) / 2;
+        let mut counter = 0;
+        let mut median = 0;
+        for (i, freq) in self.frequencies.iter().enumerate() {
+            counter += *freq;
+            if counter > median_ind {
+                median = i;
+                break;
+            }
+        }
+
+        writeln!(f, "text count: {}", count)?;
+        writeln!(f, "text average length: {:.2}", sum as f64 / count as f64)?;
+        writeln!(f, "text median length: {:.2}", median)?;
+
         Ok(())
     }
 }
@@ -255,6 +284,9 @@ fn main() -> Result<(), Box<dyn error::Error>> {
     let mut keys = Stats::default();
     let mut values = Stats::default();
     let mut array = Stats::default();
+    keys.frequencies = vec![0; 100];
+    values.frequencies = vec![0; 100];
+    array.frequencies = vec![0; 100];
     let tokens = tape.tokens();
     read_object(&mut keys, &mut values, &mut array, tokens, 0..tokens.len());
     println!("Object key tokens:");

diff --git a/src/data.rs b/src/data.rs
@@ -262,11 +262,6 @@ pub(crate) const WHITESPACE: u8 = 2;
 pub(crate) const OPERATOR: u8 = 4;
 pub(crate) const COMMENT: u8 = 8;
 
-#[inline]
-pub(crate) fn is_whitespace(b: u8) -> bool {
-    b.is_ascii_whitespace()
-}
-
 #[inline]
 pub(crate) fn is_boundary(b: u8) -> bool {
     CHARACTER_CLASS[usize::from(b)] != 0

diff --git a/src/encoding.rs b/src/encoding.rs
@@ -1,7 +1,4 @@
-use crate::{
-    data::is_whitespace, data::WINDOWS_1252, util::contains_zero_byte, util::le_u64,
-    util::repeat_byte,
-};
+use crate::{data::WINDOWS_1252, util::contains_zero_byte, util::le_u64, util::repeat_byte};
 use std::borrow::Cow;
 
 /// An encoding for interpreting byte data as UTF-8 text
@@ -107,61 +104,40 @@ impl Encoding for Utf8Encoding {
     }
 }
 
-#[inline]
-fn trim_trailing_index(d: &[u8]) -> usize {
-    d.iter()
-        .rev()
-        .position(|x| !is_whitespace(*x))
-        .unwrap_or(d.len())
-}
-
-#[inline]
-fn trim_trailing_whitepsace(d: &[u8]) -> &[u8] {
-    &d[..d.len() - trim_trailing_index(d)]
-}
-
-#[inline]
-fn trim_trailing_ascii_whitespace(original_data: &[u8], s: &mut String) {
-    // truncate the string's inner vector to remove any whitespace.
-    // We know this is safe as we only care about ascii whitespace.
-    let ind = trim_trailing_index(original_data);
-    let inner = unsafe { s.as_mut_vec() };
-    inner.truncate(inner.len() - ind);
+// https://github.com/rust-lang/rust/blob/767453eb7ca188e991ac5568c17b984dd4893e77/library/core/src/slice/ascii.rs#L159-L171
+const fn trim_ascii_end(data: &[u8]) -> &[u8] {
+    let mut bytes = data;
+
+    // Note: A pattern matching based approach (instead of indexing) allows
+    // making the function const.
+    while let [rest @ .., last] = bytes {
+        if last.is_ascii_whitespace() {
+            bytes = rest;
+        } else {
+            break;
+        }
+    }
+    bytes
 }
 
 #[inline]
 pub(crate) fn decode_windows1252(d: &[u8]) -> Cow<str> {
-    // Then we iterate through the data in 8 byte chunks and ensure that each chunk
-    // is contained of ascii characters with no escape characters
-    let mut chunk_iter = d.chunks_exact(8);
-    let mut offset = 0;
-    for n in &mut chunk_iter {
-        let wide = le_u64(n);
-        if wide & 0x8080_8080_8080_8080 != 0 || contains_zero_byte(wide ^ repeat_byte(b'\\')) {
-            return Cow::Owned(windows_1252_create(d, offset));
-        }
-
-        offset += 8;
+    let bytes = trim_ascii_end(d);
+    let mut eject = false;
+    for x in bytes {
+        eject |= !x.is_ascii() || *x == b'\\'
     }
 
-    // Same logic as before but instead of operating on 8 bytes at a time, work bytewise
-    let remainder = chunk_iter.remainder();
-    for &byte in remainder {
-        if !byte.is_ascii() || byte == b'\\' {
-            return Cow::Owned(windows_1252_create(d, offset));
-        }
-
-        offset += 1;
+    if eject {
+        return Cow::Owned(windows_1252_create(bytes, 0));
     }
 
-    let d = trim_trailing_whitepsace(d);
-
-    // This is safe as we just checked that the data is ascii and ascii is a subset of utf8
-    debug_assert!(std::str::from_utf8(d).is_ok());
-    let s = unsafe { std::str::from_utf8_unchecked(d) };
+    debug_assert!(std::str::from_utf8(bytes).is_ok());
+    let s = unsafe { std::str::from_utf8_unchecked(bytes) };
     Cow::Borrowed(s)
 }
 
+#[inline(never)]
 fn windows_1252_create(d: &[u8], offset: usize) -> String {
     let (upto, rest) = d.split_at(offset);
 
@@ -175,14 +151,14 @@ fn windows_1252_create(d: &[u8], offset: usize) -> String {
         result.push(WINDOWS_1252[c as usize]);
     }
 
-    trim_trailing_ascii_whitespace(d, &mut result);
     result
 }
 
 #[inline]
 pub(crate) fn decode_utf8(d: &[u8]) -> Cow<str> {
     // Then we iterate through the data in 8 byte chunks and ensure that each chunk
     // has no escape characters
+    let d = trim_ascii_end(d);
     let mut chunk_iter = d.chunks_exact(8);
     let mut offset = 0;
     let mut is_ascii = true;
@@ -207,7 +183,7 @@ pub(crate) fn decode_utf8(d: &[u8]) -> Cow<str> {
         offset += 1;
     }
 
-    let d = trim_trailing_whitepsace(d);
+    let d = trim_ascii_end(d);
 
     // Most of the strings we'll be decoding are ascii, so we have an ascii fast path. If we don't
     // detect any non-ascii characters then we can immediately create the borrowed string without
@@ -234,10 +210,10 @@ fn utf8_create(d: &[u8], offset: usize) -> String {
         result.push(c);
     }
 
-    let mut result = String::from_utf8(result)
-        .unwrap_or_else(|e| String::from_utf8_lossy(&e.into_bytes()).into_owned());
-    trim_trailing_ascii_whitespace(d, &mut result);
-    result
+    match String::from_utf8(result) {
+        Ok(s) => s,
+        Err(e) => String::from_utf8_lossy(&e.into_bytes()).into_owned(),
+    }
 }
 
 #[cfg(test)]