Skip to content

Commit

Permalink
Merge pull request #146 from rakaly/encoding
Browse files Browse the repository at this point in the history
Simplify text decoding routines
  • Loading branch information
nickbabcock authored Dec 21, 2023
2 parents 5f5b4cf + 6bed8dd commit 0d314ac
Show file tree
Hide file tree
Showing 3 changed files with 64 additions and 61 deletions.
36 changes: 34 additions & 2 deletions src/bin/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ struct Stats {
token: u32,
rgb: u32,
i64: u32,
frequencies: Vec<u64>,
}

impl Stats {
Expand All @@ -34,8 +35,14 @@ impl Stats {
BinaryToken::U64(_) => self.u64 += 1,
BinaryToken::I64(_) => self.i64 += 1,
BinaryToken::I32(_) => self.i32 += 1,
BinaryToken::Quoted(_) => self.quoted += 1,
BinaryToken::Unquoted(_) => self.unquoted += 1,
BinaryToken::Quoted(x) => {
self.frequencies[x.as_bytes().len()] += 1;
self.quoted += 1
}
BinaryToken::Unquoted(x) => {
self.frequencies[x.as_bytes().len()] += 1;
self.unquoted += 1
}
BinaryToken::F32(_) => self.f32 += 1,
BinaryToken::F64(_) => self.f64 += 1,
BinaryToken::Token(_) => self.token += 1,
Expand Down Expand Up @@ -182,6 +189,28 @@ impl std::fmt::Display for Stats {

writeln!(f, "total:\t\t{:<8}", total)?;

let count = self.frequencies.iter().sum::<u64>();
let sum = self
.frequencies
.iter()
.enumerate()
.map(|(i, x)| (i as u64) * *x)
.sum::<u64>();
let median_ind = (count + 1) / 2;
let mut counter = 0;
let mut median = 0;
for (i, freq) in self.frequencies.iter().enumerate() {
counter += *freq;
if counter > median_ind {
median = i;
break;
}
}

writeln!(f, "text count: {}", count)?;
writeln!(f, "text average length: {:.2}", sum as f64 / count as f64)?;
writeln!(f, "text median length: {:.2}", median)?;

Ok(())
}
}
Expand Down Expand Up @@ -255,6 +284,9 @@ fn main() -> Result<(), Box<dyn error::Error>> {
let mut keys = Stats::default();
let mut values = Stats::default();
let mut array = Stats::default();
keys.frequencies = vec![0; 100];
values.frequencies = vec![0; 100];
array.frequencies = vec![0; 100];
let tokens = tape.tokens();
read_object(&mut keys, &mut values, &mut array, tokens, 0..tokens.len());
println!("Object key tokens:");
Expand Down
5 changes: 0 additions & 5 deletions src/data.rs
Original file line number Diff line number Diff line change
Expand Up @@ -262,11 +262,6 @@ pub(crate) const WHITESPACE: u8 = 2;
pub(crate) const OPERATOR: u8 = 4;
pub(crate) const COMMENT: u8 = 8;

/// Reports whether `b` is an ASCII whitespace byte.
///
/// The accepted bytes are exactly those recognised by
/// `u8::is_ascii_whitespace`: space, horizontal tab, line feed, form feed
/// and carriage return.
#[inline]
pub(crate) fn is_whitespace(b: u8) -> bool {
    matches!(b, b' ' | b'\t' | b'\n' | b'\x0C' | b'\r')
}

#[inline]
pub(crate) fn is_boundary(b: u8) -> bool {
CHARACTER_CLASS[usize::from(b)] != 0
Expand Down
84 changes: 30 additions & 54 deletions src/encoding.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
use crate::{
data::is_whitespace, data::WINDOWS_1252, util::contains_zero_byte, util::le_u64,
util::repeat_byte,
};
use crate::{data::WINDOWS_1252, util::contains_zero_byte, util::le_u64, util::repeat_byte};
use std::borrow::Cow;

/// An encoding for interpreting byte data as UTF-8 text
Expand Down Expand Up @@ -107,61 +104,40 @@ impl Encoding for Utf8Encoding {
}
}

/// Counts the whitespace bytes at the end of `d`.
///
/// The result is the number of bytes to drop from the tail of `d` to strip
/// its trailing whitespace; it equals `d.len()` when every byte is
/// whitespace (including when `d` is empty).
#[inline]
fn trim_trailing_index(d: &[u8]) -> usize {
    // Walk backwards and stop at the first byte that is not whitespace.
    d.iter()
        .rev()
        .take_while(|&&byte| is_whitespace(byte))
        .count()
}

/// Returns the leading portion of `d` that remains once trailing whitespace
/// has been removed. The returned slice borrows from `d`; nothing is copied.
#[inline]
fn trim_trailing_whitepsace(d: &[u8]) -> &[u8] {
    // `trim_trailing_index` yields how many bytes to drop from the end.
    let kept_len = d.len() - trim_trailing_index(d);
    &d[..kept_len]
}

#[inline]
fn trim_trailing_ascii_whitespace(original_data: &[u8], s: &mut String) {
// truncate the string's inner vector to remove any whitespace.
// We know this is safe as we only care about ascii whitespace.
let ind = trim_trailing_index(original_data);
let inner = unsafe { s.as_mut_vec() };
inner.truncate(inner.len() - ind);
// https://github.com/rust-lang/rust/blob/767453eb7ca188e991ac5568c17b984dd4893e77/library/core/src/slice/ascii.rs#L159-L171
/// Returns `data` with every trailing ASCII whitespace byte removed.
///
/// The result is a sub-slice of `data` that starts at the same position;
/// an input made up solely of whitespace (or an empty input) yields an
/// empty slice.
const fn trim_ascii_end(data: &[u8]) -> &[u8] {
    let mut remaining = data;

    // Slice patterns are used rather than indexing so that the function can
    // remain `const`.
    loop {
        match remaining {
            [head @ .., tail] => {
                if !tail.is_ascii_whitespace() {
                    return remaining;
                }
                // Drop the trailing whitespace byte and keep scanning.
                remaining = head;
            }
            [] => return remaining,
        }
    }
}

#[inline]
pub(crate) fn decode_windows1252(d: &[u8]) -> Cow<str> {
// Then we iterate through the data in 8 byte chunks and ensure that each chunk
// consists of ascii characters with no escape characters
let mut chunk_iter = d.chunks_exact(8);
let mut offset = 0;
for n in &mut chunk_iter {
let wide = le_u64(n);
if wide & 0x8080_8080_8080_8080 != 0 || contains_zero_byte(wide ^ repeat_byte(b'\\')) {
return Cow::Owned(windows_1252_create(d, offset));
}

offset += 8;
let bytes = trim_ascii_end(d);
let mut eject = false;
for x in bytes {
eject |= !x.is_ascii() || *x == b'\\'
}

// Same logic as before but instead of operating on 8 bytes at a time, work bytewise
let remainder = chunk_iter.remainder();
for &byte in remainder {
if !byte.is_ascii() || byte == b'\\' {
return Cow::Owned(windows_1252_create(d, offset));
}

offset += 1;
if eject {
return Cow::Owned(windows_1252_create(bytes, 0));
}

let d = trim_trailing_whitepsace(d);

// This is safe as we just checked that the data is ascii and ascii is a subset of utf8
debug_assert!(std::str::from_utf8(d).is_ok());
let s = unsafe { std::str::from_utf8_unchecked(d) };
debug_assert!(std::str::from_utf8(bytes).is_ok());
let s = unsafe { std::str::from_utf8_unchecked(bytes) };
Cow::Borrowed(s)
}

#[inline(never)]
fn windows_1252_create(d: &[u8], offset: usize) -> String {
let (upto, rest) = d.split_at(offset);

Expand All @@ -175,14 +151,14 @@ fn windows_1252_create(d: &[u8], offset: usize) -> String {
result.push(WINDOWS_1252[c as usize]);
}

trim_trailing_ascii_whitespace(d, &mut result);
result
}

#[inline]
pub(crate) fn decode_utf8(d: &[u8]) -> Cow<str> {
// Then we iterate through the data in 8 byte chunks and ensure that each chunk
// has no escape characters
let d = trim_ascii_end(d);
let mut chunk_iter = d.chunks_exact(8);
let mut offset = 0;
let mut is_ascii = true;
Expand All @@ -207,7 +183,7 @@ pub(crate) fn decode_utf8(d: &[u8]) -> Cow<str> {
offset += 1;
}

let d = trim_trailing_whitepsace(d);
let d = trim_ascii_end(d);

// Most of the strings we'll be decoding are ascii, so we have an ascii fast path. If we don't
// detect any non-ascii characters then we can immediately create the borrowed string without
Expand All @@ -234,10 +210,10 @@ fn utf8_create(d: &[u8], offset: usize) -> String {
result.push(c);
}

let mut result = String::from_utf8(result)
.unwrap_or_else(|e| String::from_utf8_lossy(&e.into_bytes()).into_owned());
trim_trailing_ascii_whitespace(d, &mut result);
result
match String::from_utf8(result) {
Ok(s) => s,
Err(e) => String::from_utf8_lossy(&e.into_bytes()).into_owned(),
}
}

#[cfg(test)]
Expand Down

0 comments on commit 0d314ac

Please sign in to comment.