diff --git a/README.md b/README.md
index c6b1652..1f699fe 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,9 @@ Converters](https://github.com/ParadoxGameConverters) and

 ## Quick Start

-Below is a demonstration on parsing plaintext data using jomini tools.
+Below is a demonstration of deserializing plaintext data using serde.
+Several additional serde-like attributes are used to reconcile the serde
+data model with the structure of these files.

 ```rust
 use jomini::{
@@ -71,9 +73,9 @@ let actual: Model = jomini::text::de::from_windows1252_slice(data)?;
 assert_eq!(actual, expected);
 ```

-## Binary Parsing
+## Binary Deserialization

-Parsing data encoded in the binary format is done in a similar fashion but with a couple extra steps for the caller to supply:
+Deserializing data encoded in the binary format is done in a similar fashion, but with a couple of extra pieces of information for the caller to supply:

 - How text should be decoded (typically Windows-1252 or UTF-8)
 - How rational (floating point) numbers are decoded
@@ -84,7 +86,7 @@ Implementors be warned, not only does each Paradox game have a different binary

 Below is an example that defines a sample binary format and uses a hashmap token lookup.

 ```rust
-use jomini::{BinaryDeserializer, Encoding, JominiDeserialize, Windows1252Encoding};
+use jomini::{Encoding, JominiDeserialize, Windows1252Encoding, binary::BinaryFlavor};
 use std::{borrow::Cow, collections::HashMap};

 #[derive(JominiDeserialize, PartialEq, Debug)]
@@ -116,8 +118,7 @@ let data = [
     0x82, 0x2d, 0x01, 0x00, 0x0f, 0x00, 0x03, 0x00, 0x45, 0x4e, 0x47
 ];
 let mut map = HashMap::new();
 map.insert(0x2d82, "field1");
-let actual: MyStruct = BinaryDeserializer::builder_flavor(BinaryTestFlavor)
-    .deserialize_slice(&data[..], &map)?;
+let actual: MyStruct = BinaryTestFlavor.deserialize_slice(&data[..], &map)?;
 assert_eq!(actual, MyStruct { field1: "ENG".to_string() });
 ```
@@ -126,59 +127,14 @@ without any duplication. One can configure the behavior when a token is
 unknown (ie: fail immediately or try to continue).

-### Ondemand Deserialization
-
-The ondemand deserializer is a one-shot deserialization mode is often faster
-and more memory efficient as it does not parse the input into an intermediate
-tape, and instead deserializes right from the input.
-
-It is instantiated and used similarly to `BinaryDeserializer`
-
-```rust
-use jomini::OndemandBinaryDeserializer;
-// [...snip code from previous example...]
-
-let actual: MyStruct = OndemandBinaryDeserializer::builder_flavor(BinaryTestFlavor)
-    .deserialize_slice(&data[..], &map)?;
-assert_eq!(actual, MyStruct { field1: "ENG".to_string() });
-```
-
-### Direct identifier deserialization with `token` attribute
-
-There may be some performance loss during binary deserialization as
-tokens are resolved to strings via a `TokenResolver` and then matched against the
-string representations of a struct's fields.
-
-We can fix this issue by directly encoding the expected token value into the struct:
-
-```rust
-#[derive(JominiDeserialize, PartialEq, Debug)]
-struct MyStruct {
-    #[jomini(token = 0x2d82)]
-    field1: String,
-}
-
-// Empty token to string resolver
-let map = HashMap::<u16, String>::new();
-
-let actual: MyStruct = BinaryDeserializer::builder_flavor(BinaryTestFlavor)
-    .deserialize_slice(&data[..], &map)?;
-assert_eq!(actual, MyStruct { field1: "ENG".to_string() });
-```
-
-Couple notes:
-
-- This does not obviate need for the token to string resolver as tokens may be used as values.
-- If the `token` attribute is specified on one field on a struct, it must be specified on all fields of that struct.
-
 ## Caveats

-Caller is responsible for:
+Before calling any Jomini API, callers are expected to:

-- Determining the correct format (text or binary) ahead of time
-- Stripping off any header that may be present (eg: `EU4txt` / `EU4bin`)
-- Providing the token resolver for the binary format
-- Providing the conversion to reconcile how, for example, a date may be encoded as an integer in
+- Determine the correct format (text or binary) ahead of time
+- Strip off any header that may be present (e.g. `EU4txt` / `EU4bin`)
+- Provide the token resolver for the binary format
+- Provide the conversion to reconcile how, for example, a date may be encoded as an integer in
   the binary format, but as a string when in plaintext.

 ## The Mid-level API
@@ -199,6 +155,9 @@ for (key, _op, value) in reader.fields() {
 }
 ```

+For an even lower level of parsing, see the respective binary and text
+documentation.
+
 The mid-level API also provides the excellent utility of converting the plaintext
 Clausewitz format to JSON when the `json` feature is enabled.

@@ -211,28 +170,6 @@ let actual = reader.json().to_string()?;
 assert_eq!(actual, r#"{"foo":"bar"}"#);
 ```

-## One Level Lower
-
-At the lowest layer, one can interact with the raw data directly via `TextTape`
-and `BinaryTape`.
-
-```rust
-use jomini::{TextTape, TextToken, Scalar};
-
-let data = b"foo=bar";
-
-assert_eq!(
-    TextTape::from_slice(&data[..])?.tokens(),
-    &[
-        TextToken::Unquoted(Scalar::new(b"foo")),
-        TextToken::Unquoted(Scalar::new(b"bar")),
-    ]
-);
-```
-
-If one will only use `TextTape` and `BinaryTape` then `jomini` can be compiled without default
-features, resulting in a build without dependencies.
-
 ## Write API

 There are two targeted use cases for the write API. One is when a text tape is on hand.
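+
+A minimal sketch of ad-hoc writing, assuming the `TextWriterBuilder` API as
+documented on docs.rs (the exact method names shown here should be checked
+against the current release):
+
+```rust
+use jomini::TextWriterBuilder;
+
+// The writer alternates between expecting a key and a value,
+// inserting `=` and newlines as needed.
+let mut out: Vec<u8> = Vec::new();
+let mut writer = TextWriterBuilder::new().from_writer(&mut out);
+writer.write_unquoted(b"hello")?;
+writer.write_unquoted(b"world")?;
+assert_eq!(&out, b"hello=world");
+```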
diff --git a/benches/jomini_bench.rs b/benches/jomini_bench.rs index acad516..1f72aa3 100644 --- a/benches/jomini_bench.rs +++ b/benches/jomini_bench.rs @@ -3,11 +3,9 @@ use criterion::{ }; use flate2::read::GzDecoder; use jomini::{ - binary::{ - de::OndemandBinaryDeserializerBuilder, BinaryFlavor, BinaryTapeParser, TokenResolver, - }, + binary::{BinaryFlavor, BinaryTapeParser, TokenResolver}, common::Date, - BinaryDeserializer, BinaryTape, Encoding, Scalar, TextTape, Utf8Encoding, Windows1252Encoding, + BinaryTape, Encoding, Scalar, TextTape, Utf8Encoding, Windows1252Encoding, }; use std::{borrow::Cow, io::Read}; @@ -125,15 +123,26 @@ pub fn binary_deserialize_benchmark(c: &mut Criterion) { group.throughput(Throughput::Bytes(data.len() as u64)); group.bench_function("ondemand", |b| { b.iter(|| { - let _res: Gamestate = OndemandBinaryDeserializerBuilder::with_flavor(BinaryTestFlavor) + let _res: Gamestate = BinaryTestFlavor + .deserializer() .deserialize_slice(&data[..], &MyBinaryResolver) .unwrap(); }) }); + group.bench_function("ondemand-reader", |b| { + b.iter(|| { + let _res: Gamestate = BinaryTestFlavor + .deserializer() + .deserialize_reader(&data[..], &MyBinaryResolver) + .unwrap(); + }) + }); group.bench_function("tape", |b| { b.iter(|| { - let _res: Gamestate = BinaryDeserializer::builder_flavor(BinaryTestFlavor) - .deserialize_slice(&data[..], &MyBinaryResolver) + let tape = BinaryTape::from_slice(&data[..]).unwrap(); + let _res: Gamestate = BinaryTestFlavor + .deserializer() + .deserialize_tape(&tape, &MyBinaryResolver) .unwrap(); }) }); diff --git a/fuzz/fuzz_targets/fuzz_binary.rs b/fuzz/fuzz_targets/fuzz_binary.rs index a07cd61..ca8b9f4 100644 --- a/fuzz/fuzz_targets/fuzz_binary.rs +++ b/fuzz/fuzz_targets/fuzz_binary.rs @@ -62,6 +62,17 @@ fuzz_target!(|data: &[u8]| { hash.insert(0x354eu16, "selector"); hash.insert(0x209u16, "localization"); + let mut lexer = jomini::binary::Lexer::new(data); + let mut reader = jomini::binary::TokenReader::builder().buffer_len(100).build(data); + + loop { + match (lexer.read_token(), reader.read()) { + (Ok(t1), Ok(t2)) => assert_eq!(t1, t2), + (Err(e1), Err(e2)) => { break; } + (x, y) => panic!("{:?} {:?}", x, y), + } + } + let mut utape = jomini::BinaryTape::default(); let ures = jomini::binary::BinaryTapeParser.parse_slice_into_tape_unoptimized(&data, &mut utape); diff --git a/fuzz/fuzz_targets/fuzz_text.rs b/fuzz/fuzz_targets/fuzz_text.rs index 595418a..790ba50 100644 --- a/fuzz/fuzz_targets/fuzz_text.rs +++ b/fuzz/fuzz_targets/fuzz_text.rs @@ -98,6 +98,17 @@ where } fuzz_target!(|data: &[u8]| { + let mut reader = jomini::text::TokenReader::new(data); + let mut i = 0; + while let Ok(Some(x)) = reader.next() { + if matches!(x, jomini::text::Token::Open) { + i += 1; + if i % 2 == 1 { + let _ = reader.skip_container(); + } + } + } + let _: Result = jomini::TextTape::from_slice(&data).and_then(|tape| { let tokens = tape.tokens(); for (i, token) in tokens.iter().enumerate() { diff --git a/src/binary/de.rs b/src/binary/de.rs index 315e1f6..5de348a 100644 --- a/src/binary/de.rs +++ b/src/binary/de.rs @@ -1,290 +1,586 @@ -use super::{tokens::*, Rgb}; +use super::{ + lexer::{LexemeId, Lexer}, + LexError, Token, TokenReader, TokenReaderBuilder, +}; use crate::{ binary::{BinaryFlavor, FailedResolveStrategy, TokenResolver}, de::ColorSequence, - util::get_split, - BinaryTape, BinaryToken, DeserializeError, DeserializeErrorKind, Error, ErrorKind, + BinaryTape, BinaryToken, DeserializeError, DeserializeErrorKind, Error, +}; +use serde::de::{ + self, 
Deserialize, DeserializeOwned, DeserializeSeed, MapAccess, SeqAccess, Visitor, }; -use serde::de::{self, Deserialize, DeserializeSeed, MapAccess, SeqAccess, Visitor}; -use std::borrow::Cow; +use std::{borrow::Cow, io::Read}; -#[derive(Debug)] -struct OndemandParser<'data> { - data: &'data [u8], - original_length: usize, +/// Serde deserializer over a streaming binary reader +pub struct BinaryReaderDeserializer<'res, RES, F, R> { + reader: TokenReader, + config: BinaryConfig<'res, RES, F>, +} + +impl<'res, RES: TokenResolver, E: BinaryFlavor, R: Read> BinaryReaderDeserializer<'res, RES, E, R> { + /// Deserialize into provided type + pub fn deserialize(&mut self) -> Result + where + T: DeserializeOwned, + { + T::deserialize(self) + } +} + +impl<'a, 'de, 'res: 'de, RES: TokenResolver, F: BinaryFlavor, R: Read> de::Deserializer<'de> + for &'a mut BinaryReaderDeserializer<'res, RES, F, R> +{ + type Error = Error; + + fn deserialize_any(self, _visitor: V) -> Result + where + V: Visitor<'de>, + { + Err(Error::from(DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from( + "root deserializer can only work with key value pairs", + )), + })) + } + + fn deserialize_map(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + visitor.visit_map(BinaryReaderMap::new(self, true)) + } + + fn deserialize_struct( + self, + _name: &'static str, + _fields: &'static [&'static str], + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + self.deserialize_map(visitor) + } + + serde::forward_to_deserialize_any! { + bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string + bytes byte_buf option unit unit_struct newtype_struct seq tuple + tuple_struct enum ignored_any identifier + } } -impl<'data> OndemandParser<'data> { +struct BinaryReaderMap<'a: 'a, 'res, RES: 'a, F, R> { + de: &'a mut BinaryReaderDeserializer<'res, RES, F, R>, + root: bool, +} + +impl<'a, 'res, RES: 'a, F, R> BinaryReaderMap<'a, 'res, RES, F, R> { + fn new(de: &'a mut BinaryReaderDeserializer<'res, RES, F, R>, root: bool) -> Self { + BinaryReaderMap { de, root } + } +} + +impl<'de, 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor, R: Read> MapAccess<'de> + for BinaryReaderMap<'a, 'res, RES, F, R> +{ + type Error = Error; + #[inline] - pub fn peek(&mut self) -> Option { - self.data - .get(..2) - .map(|head| u16::from_le_bytes([head[0], head[1]])) + fn next_key_seed(&mut self, seed: K) -> Result, Self::Error> + where + K: DeserializeSeed<'de>, + { + let de = unsafe { &mut *(self.de as *mut _) }; + loop { + match self.de.reader.next() { + Ok(Some(Token::Close)) => return Ok(None), + Ok(Some(Token::Open)) => { + let _ = self.de.reader.read(); + } + Ok(Some(token)) => { + return seed + .deserialize(BinaryReaderTokenDeserializer { de, token }) + .map(Some) + } + Ok(None) if self.root => return Ok(None), + Ok(None) => return Err(LexError::Eof.at(self.de.reader.position()).into()), + Err(e) => return Err(e.into()), + } + } } #[inline] - pub fn next(&mut self) -> Option { - let (data, token) = - get_split::<2>(self.data).map(|(head, rest)| (rest, u16::from_le_bytes(head)))?; - self.data = data; - Some(token) + fn next_value_seed(&mut self, seed: V) -> Result + where + V: DeserializeSeed<'de>, + { + let de = unsafe { &mut *(self.de as *mut _) }; + let mut token = self.de.reader.read()?; + if matches!(token, Token::Equal) { + token = self.de.reader.read()?; + } + + seed.deserialize(BinaryReaderTokenDeserializer { de, token }) } +} + +struct BinaryReaderTokenDeserializer<'a, 'res, RES: 'a, F, R> { + de: &'a mut 
BinaryReaderDeserializer<'res, RES, F, R>, + token: Token<'a>, +} +impl<'a, 'res, RES: TokenResolver, F: BinaryFlavor, R> + BinaryReaderTokenDeserializer<'a, 'res, RES, F, R> +where + F: BinaryFlavor, + R: Read, +{ #[inline] - pub fn read(&mut self) -> Result { - self.next().ok_or_else(Error::eof) + fn deser<'de, V>(self, visitor: V) -> Result + where + V: de::Visitor<'de>, + 'res: 'de, + { + match self.token { + Token::U32(x) => visitor.visit_u32(x), + Token::U64(x) => visitor.visit_u64(x), + Token::I32(x) => visitor.visit_i32(x), + Token::Bool(x) => visitor.visit_bool(x), + Token::Quoted(x) | Token::Unquoted(x) => { + match self.de.config.flavor.decode(x.as_bytes()) { + Cow::Borrowed(x) => visitor.visit_str(x), + Cow::Owned(x) => visitor.visit_string(x), + } + } + Token::F32(x) => visitor.visit_f32(self.de.config.flavor.visit_f32(x)), + Token::F64(x) => visitor.visit_f64(self.de.config.flavor.visit_f64(x)), + Token::Rgb(x) => visitor.visit_seq(ColorSequence::new(x)), + Token::I64(x) => visitor.visit_i64(x), + Token::Id(s) => match self.de.config.resolver.resolve(s) { + Some(id) => visitor.visit_borrowed_str(id), + None => match self.de.config.failed_resolve_strategy { + FailedResolveStrategy::Error => Err(Error::from(DeserializeError { + kind: DeserializeErrorKind::UnknownToken { token_id: s }, + })), + FailedResolveStrategy::Stringify => visitor.visit_string(format!("0x{:x}", s)), + FailedResolveStrategy::Ignore => { + visitor.visit_borrowed_str("__internal_identifier_ignore") + } + }, + }, + Token::Close => Err(Error::invalid_syntax( + "did not expect end", + self.de.reader.position(), + )), + Token::Equal => Err(Error::invalid_syntax( + "did not expect equal", + self.de.reader.position(), + )), + Token::Open => visitor.visit_seq(BinaryReaderSeq::new(self.de)), + } } +} + +macro_rules! 
deserialize_scalar { + ($method:ident) => { + #[inline] + fn $method(self, visitor: V) -> Result + where + V: de::Visitor<'de>, + { + self.deser(visitor) + } + }; +} + +impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor, R: Read> de::Deserializer<'de> + for BinaryReaderTokenDeserializer<'a, 'res, RES, F, R> +{ + type Error = Error; + + deserialize_scalar!(deserialize_any); + deserialize_scalar!(deserialize_i8); + deserialize_scalar!(deserialize_i16); + deserialize_scalar!(deserialize_u8); + deserialize_scalar!(deserialize_char); + deserialize_scalar!(deserialize_identifier); + deserialize_scalar!(deserialize_bytes); + deserialize_scalar!(deserialize_byte_buf); #[inline] - pub fn read_string(&mut self) -> Result<&'data [u8], Error> { - let (head, rest) = get_split::<2>(self.data).ok_or_else(Error::eof)?; - let text_len = usize::from(u16::from_le_bytes(head)); - if text_len <= rest.len() { - let (text, rest) = rest.split_at(text_len); - self.data = rest; - Ok(text) + fn deserialize_bool(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if let Token::Bool(x) = &self.token { + visitor.visit_bool(*x) } else { - Err(Error::eof()) + self.deser(visitor) } } #[inline] - pub fn read_bool(&mut self) -> Result { - let (&first, rest) = self.data.split_first().ok_or_else(Error::eof)?; - self.data = rest; - Ok(first != 0) + fn deserialize_u16(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if let Token::Id(x) = &self.token { + visitor.visit_u16(*x) + } else { + self.deser(visitor) + } + } + + #[inline] + fn deserialize_i32(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if let Token::I32(x) = &self.token { + visitor.visit_i32(*x) + } else { + self.deser(visitor) + } + } + + #[inline] + fn deserialize_u32(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if let Token::U32(x) = &self.token { + visitor.visit_u32(*x) + } else { + self.deser(visitor) + } + } + + #[inline] + fn deserialize_u64(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if let Token::U64(x) = &self.token { + visitor.visit_u64(*x) + } else { + self.deser(visitor) + } + } + + #[inline] + fn deserialize_i64(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if let Token::I64(x) = &self.token { + visitor.visit_i64(*x) + } else { + self.deser(visitor) + } } #[inline] - fn read_u32(&mut self) -> Result { - let (head, rest) = get_split::<4>(self.data).ok_or_else(Error::eof)?; - self.data = rest; - Ok(u32::from_le_bytes(head)) + fn deserialize_f32(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if let Token::F32(x) = &self.token { + visitor.visit_f32(self.de.config.flavor.visit_f32(*x)) + } else { + self.deser(visitor) + } } #[inline] - fn read_u64(&mut self) -> Result { - let (head, rest) = get_split::<8>(self.data).ok_or_else(Error::eof)?; - self.data = rest; - Ok(u64::from_le_bytes(head)) + fn deserialize_f64(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if let Token::F64(x) = &self.token { + visitor.visit_f64(self.de.config.flavor.visit_f64(*x)) + } else { + self.deser(visitor) + } } #[inline] - fn read_i64(&mut self) -> Result { - let (head, rest) = get_split::<8>(self.data).ok_or_else(Error::eof)?; - self.data = rest; - Ok(i64::from_le_bytes(head)) + fn deserialize_str(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_string(visitor) } #[inline] - fn read_i32(&mut self) -> Result { - let (head, rest) = get_split::<4>(self.data).ok_or_else(Error::eof)?; - self.data = rest; - Ok(i32::from_le_bytes(head)) 
+ fn deserialize_string(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + match self.token { + Token::Quoted(x) | Token::Unquoted(x) => { + match self.de.config.flavor.decode(x.as_bytes()) { + Cow::Borrowed(x) => visitor.visit_str(x), + Cow::Owned(x) => visitor.visit_string(x), + } + } + _ => self.deser(visitor), + } + } + + #[inline] + fn deserialize_option(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + visitor.visit_some(self) + } + + #[inline] + fn deserialize_unit(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_ignored_any(visitor) + } + + #[inline] + fn deserialize_unit_struct( + self, + _name: &'static str, + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + self.deserialize_ignored_any(visitor) + } + + #[inline] + fn deserialize_newtype_struct( + self, + _name: &'static str, + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + visitor.visit_newtype_struct(self) + } + + #[inline] + fn deserialize_seq(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + match self.token { + Token::Open => { + let mut seq = BinaryReaderSeq::new(self.de); + let result = visitor.visit_seq(&mut seq)?; + if !seq.hit_end { + // For when we are deserializing an array that doesn't read + // the closing token + if !matches!(self.de.reader.read()?, Token::Close) { + return Err(Error::invalid_syntax( + "Expected sequence to be terminated with an end token", + self.de.reader.position(), + )); + } + } + Ok(result) + } + Token::Rgb(x) => visitor.visit_seq(ColorSequence::new(x)), + _ => self.deser(visitor), + } + } + + #[inline] + fn deserialize_tuple(self, _len: usize, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_seq(visitor) + } + + #[inline] + fn deserialize_tuple_struct( + self, + _name: &'static str, + _len: usize, + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + self.deserialize_seq(visitor) } #[inline] - fn read_f32(&mut self) -> Result<[u8; 4], Error> { - let (head, rest) = get_split::<4>(self.data).ok_or_else(Error::eof)?; - self.data = rest; - Ok(head) + fn deserialize_map(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if matches!(self.token, Token::Open) { + visitor.visit_map(BinaryReaderMap::new(self.de, false)) + } else { + self.deser(visitor) + } } #[inline] - fn read_f64(&mut self) -> Result<[u8; 8], Error> { - let (head, rest) = get_split::<8>(self.data).ok_or_else(Error::eof)?; - self.data = rest; - Ok(head) + fn deserialize_struct( + self, + _name: &'static str, + _fields: &'static [&'static str], + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + self.deserialize_map(visitor) } #[inline] - fn skip_value(&mut self, init: u16) -> Result<(), Error> { - match init { - QUOTED_STRING | UNQUOTED_STRING => { - self.read_string()?; - Ok(()) - } - U32 => { - self.read_u32()?; - Ok(()) - } - I32 => { - self.read_i32()?; - Ok(()) - } - U64 => { - self.read_u64()?; - Ok(()) - } - I64 => { - self.read_i64()?; - Ok(()) - } - BOOL => { - self.read_bool()?; - Ok(()) - } - F32 => { - self.read_f32()?; - Ok(()) - } - F64 => { - self.read_f64()?; - Ok(()) - } - OPEN => self.skip_container(), - _ => Ok(()), - } + fn deserialize_enum( + self, + _name: &'static str, + _variants: &'static [&'static str], + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + visitor.visit_enum(BinaryReaderEnum::new(self.de, self.token)) } #[inline] - fn skip_container(&mut self) -> Result<(), Error> { - let mut depth = 1; - while depth != 0 { - match self.read()? 
{ - QUOTED_STRING | UNQUOTED_STRING => { - self.read_string()?; - } - U32 => { - self.read_u32()?; - } - I32 => { - self.read_i32()?; - } - U64 => { - self.read_u64()?; - } - I64 => { - self.read_i64()?; - } - BOOL => { - self.read_bool()?; - } - F32 => { - self.read_f32()?; - } - F64 => { - self.read_f64()?; - } - END => depth -= 1, - OPEN => depth += 1, - _ => {} - } + fn deserialize_ignored_any(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if matches!(self.token, Token::Open) { + self.de.reader.skip_container()?; } - Ok(()) + visitor.visit_unit() } +} - fn read_rgb(&mut self) -> Result { - let start = self.read()?; - let rtoken = self.read()?; - let r = self.read_u32()?; - let gtoken = self.read()?; - let g = self.read_u32()?; - let btoken = self.read()?; - let b = self.read_u32()?; - let next_tok = self.read()?; - let a = match (start, rtoken, gtoken, btoken, next_tok) { - (OPEN, U32, U32, U32, END) => None, - (OPEN, U32, U32, U32, U32) => { - let a = Some(self.read_u32()?); - if self.read()? != END { - return Err(self.invalid_syntax("expected end after rgb alpha")); - } - a - } - _ => return Err(self.invalid_syntax("invalid rgb value")), - }; +struct BinaryReaderSeq<'a: 'a, 'res, RES: 'a, F, R> { + de: &'a mut BinaryReaderDeserializer<'res, RES, F, R>, + hit_end: bool, +} - Ok(Rgb { r, g, b, a }) +impl<'a, 'de: 'a, 'res: 'de, RES: 'a, F, R> BinaryReaderSeq<'a, 'res, RES, F, R> { + fn new(de: &'a mut BinaryReaderDeserializer<'res, RES, F, R>) -> Self { + BinaryReaderSeq { de, hit_end: false } } +} - #[cold] - #[inline(never)] - fn invalid_syntax>(&self, msg: T) -> Error { - Error::new(ErrorKind::InvalidSyntax { - msg: msg.into(), - offset: self.original_length - self.data.len(), - }) +impl<'de, 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor, R: Read> SeqAccess<'de> + for BinaryReaderSeq<'a, 'res, RES, F, R> +{ + type Error = Error; + + fn next_element_seed(&mut self, seed: T) -> Result, Self::Error> + where + T: DeserializeSeed<'de>, + { + let de = unsafe { &mut *(self.de as *mut _) }; + match self.de.reader.read()? 
{ + Token::Close => { + self.hit_end = true; + Ok(None) + } + token => seed + .deserialize(BinaryReaderTokenDeserializer { de, token }) + .map(Some), + } } } -/// On-demand binary deserializer -pub struct OndemandBinaryDeserializer<'data, 'res: 'data, RES, F> { - parser: OndemandParser<'data>, - config: BinaryConfig<'res, RES, F>, +struct BinaryReaderEnum<'a, 'res, RES: 'a, F, R> { + de: &'a mut BinaryReaderDeserializer<'res, RES, F, R>, + token: Token<'a>, } -impl OndemandBinaryDeserializer<'_, '_, (), ()> { - /// Constructs a OndemandBinaryDeserializerBuilder - pub fn builder_flavor(flavor: F) -> OndemandBinaryDeserializerBuilder { - OndemandBinaryDeserializerBuilder::with_flavor(flavor) +impl<'a, 'res, RES: 'a, F, R> BinaryReaderEnum<'a, 'res, RES, F, R> { + fn new(de: &'a mut BinaryReaderDeserializer<'res, RES, F, R>, token: Token<'a>) -> Self { + BinaryReaderEnum { de, token } } } -/// Build a tweaked on-deman binary deserializer -#[derive(Debug)] -pub struct OndemandBinaryDeserializerBuilder { - failed_resolve_strategy: FailedResolveStrategy, - flavor: F, +impl<'de, 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor, R: Read> de::EnumAccess<'de> + for BinaryReaderEnum<'a, 'res, RES, F, R> +{ + type Error = Error; + type Variant = Self; + + fn variant_seed(self, seed: V) -> Result<(V::Value, Self), Self::Error> + where + V: de::DeserializeSeed<'de>, + { + let variant = seed.deserialize(BinaryReaderTokenDeserializer { + de: self.de, + token: self.token, + })?; + Ok((variant, self)) + } } -impl OndemandBinaryDeserializerBuilder -where - F: BinaryFlavor, +impl<'de, 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor, R> de::VariantAccess<'de> + for BinaryReaderEnum<'a, 'res, RES, F, R> { - /// Create a new builder instance - pub fn with_flavor(flavor: F) -> Self { - OndemandBinaryDeserializerBuilder { - failed_resolve_strategy: FailedResolveStrategy::Ignore, - flavor, - } - } + type Error = Error; - /// Set the behavior when a unknown token is encountered - pub fn on_failed_resolve(&mut self, strategy: FailedResolveStrategy) -> &mut Self { - self.failed_resolve_strategy = strategy; - self + fn unit_variant(self) -> Result<(), Self::Error> { + Ok(()) } - /// Convenience method for parsing and building a deserializer - pub fn from_slice<'data, 'res: 'data, RES>( - self, - data: &'data [u8], - resolver: &'res RES, - ) -> OndemandBinaryDeserializer<'data, 'res, RES, F> + fn newtype_variant_seed(self, _seed: T) -> Result where - RES: TokenResolver, + T: DeserializeSeed<'de>, { - let config = BinaryConfig { - resolver, - failed_resolve_strategy: self.failed_resolve_strategy, - flavor: self.flavor, - }; + Err(Error::from(DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from( + "unsupported enum deserialization. Please file issue", + )), + })) + } - OndemandBinaryDeserializer { - parser: OndemandParser { - data, - original_length: data.len(), - }, - config, - } + fn tuple_variant(self, _len: usize, _visitor: V) -> Result + where + V: Visitor<'de>, + { + Err(Error::from(DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from( + "unsupported enum deserialization. 
Please file issue", + )), + })) } - /// Convenience method for parsing and deserializing binary data - pub fn deserialize_slice<'b, 'data, 'res: 'data, RES, T>( + fn struct_variant( self, - data: &'data [u8], - resolver: &'res RES, - ) -> Result + _fields: &'static [&'static str], + _visitor: V, + ) -> Result where - T: Deserialize<'data>, - RES: TokenResolver, + V: Visitor<'de>, { - self.from_slice(data, resolver).deserialize() + Err(Error::from(DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from( + "unsupported enum deserialization. Please file issue", + )), + })) } } +/// On-demand binary deserializer +pub struct OndemandBinaryDeserializer<'data, 'res: 'data, RES, F> { + parser: Lexer<'data>, + config: BinaryConfig<'res, RES, F>, +} + impl<'de, 'res, RES: TokenResolver, E: BinaryFlavor> OndemandBinaryDeserializer<'de, 'res, RES, E> { /// Deserialize into provided type pub fn deserialize(&mut self) -> Result @@ -357,17 +653,18 @@ impl<'de, 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> MapAccess<'de> where K: DeserializeSeed<'de>, { - let token = self.de.parser.next(); - match token { - Some(END) => Ok(None), - None if self.root => Ok(None), - None => Err(Error::eof()), - Some(token) => seed + match self.de.parser.read_id() { + Ok(LexemeId::CLOSE) => Ok(None), + Ok(token) => seed .deserialize(OndemandTokenDeserializer { de: &mut *self.de, token, }) .map(Some), + Err(e) => match e.kind() { + LexError::Eof if self.root => Ok(None), + _ => Err(e.into()), + }, } } @@ -375,9 +672,9 @@ impl<'de, 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> MapAccess<'de> where V: DeserializeSeed<'de>, { - let mut token = self.de.parser.read()?; - if token == EQUAL { - token = self.de.parser.read()?; + let mut token = self.de.parser.read_id()?; + if token == LexemeId::EQUAL { + token = self.de.parser.read_id()?; } seed.deserialize(OndemandTokenDeserializer { @@ -389,7 +686,7 @@ impl<'de, 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> MapAccess<'de> struct OndemandTokenDeserializer<'a, 'de: 'a, 'res: 'de, RES: 'a, F> { de: &'a mut OndemandBinaryDeserializer<'de, 'res, RES, F>, - token: u16, + token: LexemeId, } impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> @@ -404,32 +701,36 @@ where let mut tok = self.token; // Skip empty objects masquerading as keys - while tok == OPEN && matches!(self.de.parser.peek(), Some(END)) { - self.de.parser.read()?; - tok = self.de.parser.read()?; + while tok == LexemeId::OPEN && matches!(self.de.parser.peek_id(), Some(LexemeId::CLOSE)) { + self.de.parser.read_id()?; + tok = self.de.parser.read_id()?; } match tok { - QUOTED_STRING | UNQUOTED_STRING => { + LexemeId::QUOTED | LexemeId::UNQUOTED => { let data = self.de.parser.read_string()?; - match self.de.config.flavor.decode(data) { + match self.de.config.flavor.decode(data.as_bytes()) { Cow::Borrowed(x) => visitor.visit_borrowed_str(x), Cow::Owned(x) => visitor.visit_string(x), } } - U32 => visitor.visit_u32(self.de.parser.read_u32()?), - I32 => visitor.visit_i32(self.de.parser.read_i32()?), - U64 => visitor.visit_u64(self.de.parser.read_u64()?), - I64 => visitor.visit_i64(self.de.parser.read_i64()?), - BOOL => visitor.visit_bool(self.de.parser.read_bool()?), - F32 => visitor.visit_f32(self.de.config.flavor.visit_f32(self.de.parser.read_f32()?)), - F64 => visitor.visit_f64(self.de.config.flavor.visit_f64(self.de.parser.read_f64()?)), - OPEN => visitor.visit_seq(OndemandSeq::new(self.de)), - END | EQUAL => Err(self - .de - .parser - .invalid_syntax("unexpected token 
encountered")), - s => match self.de.config.resolver.resolve(s) { + LexemeId::U32 => visitor.visit_u32(self.de.parser.read_u32()?), + LexemeId::I32 => visitor.visit_i32(self.de.parser.read_i32()?), + LexemeId::U64 => visitor.visit_u64(self.de.parser.read_u64()?), + LexemeId::I64 => visitor.visit_i64(self.de.parser.read_i64()?), + LexemeId::BOOL => visitor.visit_bool(self.de.parser.read_bool()?), + LexemeId::F32 => { + visitor.visit_f32(self.de.config.flavor.visit_f32(self.de.parser.read_f32()?)) + } + LexemeId::F64 => { + visitor.visit_f64(self.de.config.flavor.visit_f64(self.de.parser.read_f64()?)) + } + LexemeId::OPEN => visitor.visit_seq(OndemandSeq::new(self.de)), + LexemeId::CLOSE | LexemeId::EQUAL => Err(Error::invalid_syntax( + "unexpected token encountered", + self.de.parser.position(), + )), + LexemeId(s) => match self.de.config.resolver.resolve(s) { Some(id) => visitor.visit_borrowed_str(id), None => match self.de.config.failed_resolve_strategy { FailedResolveStrategy::Error => Err(Error::from(DeserializeError { @@ -474,10 +775,10 @@ impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::Deserializ where V: Visitor<'de>, { - if self.token == BOOL { + if self.token == LexemeId::BOOL { visitor.visit_bool(self.de.parser.read_bool()?) } else { - Ok(self.deser(visitor)?) + self.deser(visitor) } } @@ -485,10 +786,11 @@ impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::Deserializ where V: Visitor<'de>, { - match self.token { - QUOTED_STRING | UNQUOTED_STRING | U32 | I32 | U64 | I64 | BOOL | F32 | F64 | OPEN - | END | EQUAL => self.deser(visitor), - x => visitor.visit_u16(x), + if self.token.is_id() { + let LexemeId(x) = self.token; + visitor.visit_u16(x) + } else { + self.deser(visitor) } } @@ -496,10 +798,10 @@ impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::Deserializ where V: Visitor<'de>, { - if self.token == I32 { + if self.token == LexemeId::I32 { visitor.visit_i32(self.de.parser.read_i32()?) } else { - Ok(self.deser(visitor)?) + self.deser(visitor) } } @@ -507,10 +809,10 @@ impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::Deserializ where V: Visitor<'de>, { - if self.token == U32 { + if self.token == LexemeId::U32 { visitor.visit_u32(self.de.parser.read_u32()?) } else { - Ok(self.deser(visitor)?) + self.deser(visitor) } } @@ -518,10 +820,10 @@ impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::Deserializ where V: Visitor<'de>, { - if self.token == U64 { + if self.token == LexemeId::U64 { visitor.visit_u64(self.de.parser.read_u64()?) } else { - Ok(self.deser(visitor)?) + self.deser(visitor) } } @@ -529,10 +831,10 @@ impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::Deserializ where V: Visitor<'de>, { - if self.token == I64 { + if self.token == LexemeId::I64 { visitor.visit_i64(self.de.parser.read_i64()?) } else { - Ok(self.deser(visitor)?) + self.deser(visitor) } } @@ -540,10 +842,10 @@ impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::Deserializ where V: Visitor<'de>, { - if self.token == F32 { + if self.token == LexemeId::F32 { visitor.visit_f32(self.de.config.flavor.visit_f32(self.de.parser.read_f32()?)) } else { - Ok(self.deser(visitor)?) 
+ self.deser(visitor) } } @@ -551,10 +853,10 @@ impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::Deserializ where V: Visitor<'de>, { - if self.token == F64 { + if self.token == LexemeId::F64 { visitor.visit_f64(self.de.config.flavor.visit_f64(self.de.parser.read_f64()?)) } else { - Ok(self.deser(visitor)?) + self.deser(visitor) } } @@ -569,14 +871,14 @@ impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::Deserializ where V: Visitor<'de>, { - if self.token == QUOTED_STRING || self.token == UNQUOTED_STRING { + if self.token == LexemeId::QUOTED || self.token == LexemeId::UNQUOTED { let data = self.de.parser.read_string()?; - match self.de.config.flavor.decode(data) { + match self.de.config.flavor.decode(data.as_bytes()) { Cow::Borrowed(x) => visitor.visit_borrowed_str(x), Cow::Owned(x) => visitor.visit_string(x), } } else { - Ok(self.deser(visitor)?) + self.deser(visitor) } } @@ -620,26 +922,26 @@ impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::Deserializ where V: Visitor<'de>, { - if self.token == OPEN { + if self.token == LexemeId::OPEN { let mut seq = OndemandSeq::new(self.de); let result = visitor.visit_seq(&mut seq)?; if !seq.hit_end { // For when we are deserializing an array that doesn't read // the closing token - let ender = self.de.parser.read()?; - if ender != END { - return Err(self - .de - .parser - .invalid_syntax("Expected sequence to be terminated with an end token")); + let ender = self.de.parser.read_id()?; + if ender != LexemeId::CLOSE { + return Err(Error::invalid_syntax( + "Expected sequence to be terminated with an end token", + self.de.parser.position(), + )); } } Ok(result) - } else if self.token == RGB { + } else if self.token == LexemeId::RGB { let rgb = self.de.parser.read_rgb()?; visitor.visit_seq(ColorSequence::new(rgb)) } else { - Ok(self.deser(visitor)?) + self.deser(visitor) } } @@ -666,10 +968,10 @@ impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::Deserializ where V: Visitor<'de>, { - if self.token == OPEN { + if self.token == LexemeId::OPEN { visitor.visit_map(OndemandMap::new(self.de, false)) } else { - Ok(self.deser(visitor)?) 
+ self.deser(visitor) } } @@ -726,8 +1028,8 @@ impl<'de, 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> SeqAccess<'de> where T: DeserializeSeed<'de>, { - let token = self.de.parser.read()?; - if token == END { + let token = self.de.parser.read_id()?; + if token == LexemeId::CLOSE { self.hit_end = true; Ok(None) } else { @@ -742,11 +1044,11 @@ impl<'de, 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> SeqAccess<'de> struct OndemandEnum<'a, 'de: 'a, 'res: 'de, RES: 'a, F> { de: &'a mut OndemandBinaryDeserializer<'de, 'res, RES, F>, - token: u16, + token: LexemeId, } impl<'a, 'de: 'a, 'res: 'de, RES: 'a, F> OndemandEnum<'a, 'de, 'res, RES, F> { - fn new(de: &'a mut OndemandBinaryDeserializer<'de, 'res, RES, F>, token: u16) -> Self { + fn new(de: &'a mut OndemandBinaryDeserializer<'de, 'res, RES, F>, token: LexemeId) -> Self { OndemandEnum { de, token } } } @@ -864,7 +1166,7 @@ impl<'de, 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::VariantAccess< /// map.insert(0x2d83, String::from("field2")); /// /// let builder = BinaryDeserializer::builder_flavor(BinaryTestFlavor); -/// let mut deserializer = builder.from_slice(&data[..], &map)?; +/// let mut deserializer = builder.from_slice(&data[..], &map); /// let a: StructA = deserializer.deserialize()?; /// assert_eq!(a, StructA { /// field1: "ENG".to_string(), @@ -886,7 +1188,6 @@ pub struct BinaryDeserializer<'b, 'data: 'b, 'res: 'data, RES, F> { } enum BinaryDeserializerKind<'data, 'b> { - Owned(BinaryTape<'data>), Borrowed(&'b BinaryTape<'data>), } @@ -895,6 +1196,7 @@ enum BinaryDeserializerKind<'data, 'b> { pub struct BinaryDeserializerBuilder { failed_resolve_strategy: FailedResolveStrategy, flavor: F, + reader_config: TokenReaderBuilder, } impl BinaryDeserializerBuilder @@ -906,6 +1208,7 @@ where BinaryDeserializerBuilder { failed_resolve_strategy: FailedResolveStrategy::Ignore, flavor, + reader_config: TokenReaderBuilder::default(), } } @@ -915,30 +1218,63 @@ where self } - /// Convenience method for parsing and building a deserializer - pub fn from_slice<'b, 'a, 'res: 'a, RES>( + /// Set the reader buffer config (unused for slice deserializations) + pub fn reader_config(&mut self, val: TokenReaderBuilder) -> &mut Self { + self.reader_config = val; + self + } + + /// Create binary deserializer from reader + pub fn from_reader( + self, + reader: R, + resolver: &RES, + ) -> BinaryReaderDeserializer + where + RES: TokenResolver, + { + let reader = self.reader_config.build(reader); + let config = BinaryConfig { + resolver, + failed_resolve_strategy: self.failed_resolve_strategy, + flavor: self.flavor, + }; + + BinaryReaderDeserializer { reader, config } + } + + /// Deserialize value from reader + pub fn deserialize_reader(self, reader: R, resolver: &RES) -> Result + where + T: DeserializeOwned, + RES: TokenResolver, + { + self.from_reader(reader, resolver).deserialize() + } + + /// Create a binary deserializer from a slice + pub fn from_slice<'a, 'res: 'a, RES>( self, data: &'a [u8], resolver: &'res RES, - ) -> Result, Error> + ) -> OndemandBinaryDeserializer<'a, 'res, RES, F> where RES: TokenResolver, { - let tape = BinaryTape::from_slice(data)?; let config = BinaryConfig { resolver, failed_resolve_strategy: self.failed_resolve_strategy, flavor: self.flavor, }; - Ok(BinaryDeserializer { - tape: BinaryDeserializerKind::Owned(tape), + OndemandBinaryDeserializer { + parser: Lexer::new(data), config, - }) + } } - /// Convenience method for parsing and deserializing binary data - pub fn deserialize_slice<'b, 'data, 'res: 'data, 
RES, T>( + /// Deserialize value from slice + pub fn deserialize_slice<'data, 'res: 'data, RES, T>( self, data: &'data [u8], resolver: &'res RES, @@ -947,8 +1283,7 @@ where T: Deserialize<'data>, RES: TokenResolver, { - let deser = self.from_slice(data, resolver)?; - deser.deserialize() + self.from_slice(data, resolver).deserialize() } /// Deserialize the given binary tape @@ -971,6 +1306,19 @@ where config, } } + + /// Deserialize the given binary tape + pub fn deserialize_tape<'data, 'b, 'res: 'data, RES, T>( + self, + tape: &'b BinaryTape<'data>, + resolver: &'res RES, + ) -> Result + where + T: Deserialize<'data>, + RES: TokenResolver, + { + self.from_tape(tape, resolver).deserialize() + } } impl<'b, 'de, 'res, RES: TokenResolver, E: BinaryFlavor> BinaryDeserializer<'b, 'de, 'res, RES, E> { @@ -1025,13 +1373,12 @@ impl<'a, 'b, 'de, 'res, RES: TokenResolver, F: BinaryFlavor> de::Deserializer<'d V: Visitor<'de>, { match &self.tape { - BinaryDeserializerKind::Owned(x) | &BinaryDeserializerKind::Borrowed(x) => visitor - .visit_map(BinaryMap::new( - &self.config, - x.tokens(), - 0, - x.tokens().len(), - )), + &BinaryDeserializerKind::Borrowed(x) => visitor.visit_map(BinaryMap::new( + &self.config, + x.tokens(), + 0, + x.tokens().len(), + )), } } @@ -1556,13 +1903,24 @@ mod tests { T: Deserialize<'a> + PartialEq + std::fmt::Debug, RES: TokenResolver, { - let result = eu4_builder().from_slice(data, resolver)?.deserialize()?; - let ondemand = OndemandBinaryDeserializerBuilder::with_flavor(Eu4Flavor::new()) - .deserialize_slice(data, resolver)?; + let tape = BinaryTape::from_slice(data).unwrap(); + let result = eu4_builder().deserialize_tape(&tape, resolver)?; + let ondemand = eu4_builder().deserialize_slice(data, resolver)?; assert_eq!(result, ondemand); Ok(result) } + fn from_owned<'a, 'res: 'a, RES, T>(data: &'a [u8], resolver: &'res RES) -> Result + where + T: DeserializeOwned + PartialEq + std::fmt::Debug, + RES: TokenResolver, + { + let res = from_slice(data, resolver).unwrap(); + let reader: T = eu4_builder().deserialize_reader(data, resolver).unwrap(); + assert_eq!(reader, res); + Ok(res) + } + #[test] fn test_single_field() { let data = [ @@ -1577,7 +1935,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -1640,7 +1998,7 @@ mod tests { map.insert(0x2d82, String::from("field1")); map.insert(0x284c, String::from("no")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -1661,7 +2019,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!(actual, MyStruct { field1: 89 }); } @@ -1677,7 +2035,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!(actual, MyStruct { field1: 89 }); } @@ -1695,7 +2053,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x326b, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!(actual, MyStruct { field1: 
128 }); } @@ -1713,7 +2071,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x326b, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!(actual, MyStruct { field1: -1 }); } @@ -1729,7 +2087,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!(actual, MyStruct { field1: 0.023 }); } @@ -1747,7 +2105,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!(actual, MyStruct { field1: 1.78732 }); } @@ -1765,7 +2123,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -1788,7 +2146,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -1809,7 +2167,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -1830,7 +2188,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -1858,7 +2216,7 @@ mod tests { } } - let actual: MyStruct = from_slice(&data[..], &NullResolver).unwrap(); + let actual: MyStruct = from_owned(&data[..], &NullResolver).unwrap(); assert_eq!( actual, MyStruct { @@ -1885,7 +2243,7 @@ mod tests { map.insert(0x284c, String::from("yes")); map.insert(0x284b, String::from("no")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -1914,7 +2272,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2ee1, String::from("dlc_enabled")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -1950,7 +2308,7 @@ mod tests { map.insert(0x2ee1, String::from("dlc_enabled")); map.insert(0x2d82, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -1994,7 +2352,7 @@ mod tests { map.insert(0x2ec7, String::from("third")); map.insert(0x2ec8, String::from("fourth")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2037,7 +2395,7 @@ mod tests { map.insert(0x2ec7, String::from("third")); map.insert(0x2ec8, String::from("fourth")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2058,7 +2416,7 @@ mod tests { ]; 
let map: HashMap = HashMap::new(); - let actual: HashMap = from_slice(&data[..], &map).unwrap(); + let actual: HashMap = from_owned(&data[..], &map).unwrap(); assert_eq!(actual.len(), 1); assert_eq!(actual.get(&89), Some(&30)); } @@ -2085,7 +2443,7 @@ mod tests { String::from("1444.11.11"), ); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2116,7 +2474,7 @@ mod tests { String::from(r#"Joe "Captain" Rogers"#), ); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2147,7 +2505,7 @@ mod tests { map.insert(0x00e1, String::from("type")); map.insert(0x28be, String::from("general")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2171,7 +2529,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x00e1, String::from("type")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!(actual, MyStruct { _type: vec![] }); } @@ -2198,7 +2556,7 @@ mod tests { map.insert(0x284c, String::from("yes")); map.insert(0x284b, String::from("no")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2234,7 +2592,7 @@ mod tests { field1: u64, } - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!(actual, MyStruct { field1: 128 }); } @@ -2257,7 +2615,7 @@ mod tests { map.insert(0x2d82, "field1"); map.insert(0x28e3, "second"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2281,8 +2639,7 @@ mod tests { let map: HashMap = HashMap::new(); let mut builder = eu4_builder(); builder.on_failed_resolve(FailedResolveStrategy::Error); - let actual: Result = - builder.from_slice(&data[..], &map).unwrap().deserialize(); + let actual: Result = builder.from_slice(&data[..], &map).deserialize(); assert!(actual.is_err()); } @@ -2316,7 +2673,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, "field1"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2347,7 +2704,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, "field1"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2444,7 +2801,7 @@ mod tests { map.insert(0x1b, "name"); map.insert(0x165, "none"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2471,7 +2828,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, "field1"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2495,7 +2852,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, "field1"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], 
&map).unwrap(); assert_eq!( actual, MyStruct { @@ -2521,7 +2878,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, "field1"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2556,7 +2913,7 @@ mod tests { map.insert(0x2ec9, "savegame_version"); map.insert(0x28e2, "first"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2597,7 +2954,7 @@ mod tests { map.insert(0x2ec9, "savegame_version"); map.insert(0x28e2, "field"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2643,7 +3000,7 @@ mod tests { map.insert(0x2ec9, "savegame_version"); map.insert(0x28e2, "field"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2676,7 +3033,7 @@ mod tests { map.insert(0x2d82, "field1"); map.insert(0x28e3, "second"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2698,7 +3055,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x337f, "campaign_id"); - let actual: Meta = from_slice(&data[..], &map).unwrap(); + let actual: Meta = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, Meta { @@ -2717,7 +3074,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x053a, "color"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2787,7 +3144,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x053a, "color"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { diff --git a/src/binary/flavor.rs b/src/binary/flavor.rs index 79298a0..6f6199f 100644 --- a/src/binary/flavor.rs +++ b/src/binary/flavor.rs @@ -1,3 +1,9 @@ +#[cfg(feature = "derive")] +use crate::{ + binary::{de::BinaryDeserializerBuilder, TokenResolver}, + BinaryDeserializer, Error, +}; + /// Trait customizing decoding values from binary data /// /// How binary data is encoded differs between games and even @@ -8,6 +14,37 @@ pub trait BinaryFlavor: crate::Encoding { /// Decode a f64 from 8 bytes of data fn visit_f64(&self, data: [u8; 8]) -> f64; + + /// Create binary deserializer from this binary flavor + #[cfg(feature = "derive")] + fn deserializer(&self) -> BinaryDeserializerBuilder<&Self> { + BinaryDeserializer::builder_flavor(self) + } + + /// Deserialize value from slice of data with this binary flavor + #[cfg(feature = "derive")] + fn deserialize_slice<'de, 'res: 'de, T, RES>( + &self, + data: &'de [u8], + resolver: &'res RES, + ) -> Result + where + T: serde::de::Deserialize<'de>, + RES: TokenResolver, + { + self.deserializer().deserialize_slice(data, resolver) + } + + /// Deserialize value from stream of data with this binary flavor + #[cfg(feature = "derive")] + fn deserialize_reader(&self, reader: R, resolver: &RES) -> Result + where + T: serde::de::DeserializeOwned, + RES: TokenResolver, + R: std::io::Read, + { + self.deserializer().deserialize_reader(reader, resolver) + } } impl BinaryFlavor for &'_ T { diff --git a/src/binary/lexer.rs 
b/src/binary/lexer.rs
new file mode 100644
index 0000000..bab54a0
--- /dev/null
+++ b/src/binary/lexer.rs
@@ -0,0 +1,778 @@
+use super::Rgb;
+use crate::{util::get_split, Scalar};
+use std::fmt;
+
+/// The ID of the current lexeme
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[repr(transparent)]
+pub struct LexemeId(pub u16);
+
+impl LexemeId {
+    /// A binary '{' (open bracket)
+    pub const OPEN: LexemeId = LexemeId::new(0x0003);
+
+    /// A binary '}' (close bracket)
+    pub const CLOSE: LexemeId = LexemeId::new(0x0004);
+
+    /// A binary '='
+    pub const EQUAL: LexemeId = LexemeId::new(0x0001);
+
+    /// A binary 32 bit unsigned integer
+    pub const U32: LexemeId = LexemeId::new(0x0014);
+
+    /// A binary 64 bit unsigned integer
+    pub const U64: LexemeId = LexemeId::new(0x029c);
+
+    /// A binary 32 bit signed integer
+    pub const I32: LexemeId = LexemeId::new(0x000c);
+
+    /// A binary boolean
+    pub const BOOL: LexemeId = LexemeId::new(0x000e);
+
+    /// A binary string that is typically quoted
+    pub const QUOTED: LexemeId = LexemeId::new(0x000f);
+
+    /// A binary string that is typically without quotes
+    pub const UNQUOTED: LexemeId = LexemeId::new(0x0017);
+
+    /// A binary 32 bit floating point
+    pub const F32: LexemeId = LexemeId::new(0x000d);
+
+    /// A binary 64 bit floating point
+    pub const F64: LexemeId = LexemeId::new(0x0167);
+
+    /// A binary RGB value
+    pub const RGB: LexemeId = LexemeId::new(0x0243);
+
+    /// A binary 64 bit signed integer
+    pub const I64: LexemeId = LexemeId::new(0x0317);
+
+    /// Construct a new [LexemeId] from a 16-bit value
+    #[inline]
+    pub const fn new(x: u16) -> Self {
+        LexemeId(x)
+    }
+
+    /// Identifies if the given ID does not match any of the predefined
+    /// [LexemeId] constants, and thus can be considered an ID token.
+    ///
+    /// ```rust
+    /// use jomini::binary::LexemeId;
+    /// let lid = LexemeId::new(0x1000);
+    /// assert!(lid.is_id());
+    /// ```
+    #[inline]
+    pub const fn is_id(&self) -> bool {
+        !matches!(
+            *self,
+            LexemeId::OPEN
+                | LexemeId::CLOSE
+                | LexemeId::EQUAL
+                | LexemeId::U32
+                | LexemeId::U64
+                | LexemeId::I32
+                | LexemeId::BOOL
+                | LexemeId::QUOTED
+                | LexemeId::UNQUOTED
+                | LexemeId::F32
+                | LexemeId::F64
+                | LexemeId::RGB
+                | LexemeId::I64
+        )
+    }
+}
+
+#[inline]
+pub(crate) fn read_id(data: &[u8]) -> Result<(LexemeId, &[u8]), LexError> {
+    let (head, rest) = get_split::<2>(data).ok_or(LexError::Eof)?;
+    Ok((LexemeId::new(u16::from_le_bytes(head)), rest))
+}
+
+#[inline]
+pub(crate) fn read_string(data: &[u8]) -> Result<(Scalar, &[u8]), LexError> {
+    let (head, rest) = get_split::<2>(data).ok_or(LexError::Eof)?;
+    let text_len = usize::from(u16::from_le_bytes(head));
+    if text_len <= rest.len() {
+        let (text, rest) = rest.split_at(text_len);
+        Ok((Scalar::new(text), rest))
+    } else {
+        Err(LexError::Eof)
+    }
+}
+
+#[inline]
+pub(crate) fn read_bool(data: &[u8]) -> Result<(bool, &[u8]), LexError> {
+    let (&first, rest) = data.split_first().ok_or(LexError::Eof)?;
+    Ok((first != 0, rest))
+}
+
+#[inline]
+pub(crate) fn read_u32(data: &[u8]) -> Result<(u32, &[u8]), LexError> {
+    let (head, rest) = get_split::<4>(data).ok_or(LexError::Eof)?;
+    Ok((u32::from_le_bytes(head), rest))
+}
+
+#[inline]
+pub(crate) fn read_u64(data: &[u8]) -> Result<(u64, &[u8]), LexError> {
+    let (head, rest) = get_split::<8>(data).ok_or(LexError::Eof)?;
+    Ok((u64::from_le_bytes(head), rest))
+}
+
+#[inline]
+pub(crate) fn read_i64(data: &[u8]) -> Result<(i64, &[u8]), LexError> {
+    let (head, rest) = get_split::<8>(data).ok_or(LexError::Eof)?;
+    Ok((i64::from_le_bytes(head), rest))
+}
+
+#[inline]
+pub(crate) fn read_i32(data: &[u8]) -> Result<(i32, &[u8]), LexError> {
+    let (head, rest) = get_split::<4>(data).ok_or(LexError::Eof)?;
+    Ok((i32::from_le_bytes(head), rest))
+}
+
+#[inline]
+pub(crate) fn read_f32(data: &[u8]) -> Result<([u8; 4], &[u8]), LexError> {
+    get_split::<4>(data).ok_or(LexError::Eof)
+}
+
+#[inline]
+pub(crate) fn read_f64(data: &[u8]) -> Result<([u8; 8], &[u8]), LexError> {
+    get_split::<8>(data).ok_or(LexError::Eof)
+}
+
+#[inline]
+pub(crate) fn read_rgb(data: &[u8]) -> Result<(Rgb, &[u8]), LexError> {
+    let (start, data) = read_id(data)?;
+    let (rtoken, data) = read_id(data)?;
+    let (r, data) = read_u32(data)?;
+    let (gtoken, data) = read_id(data)?;
+    let (g, data) = read_u32(data)?;
+    let (btoken, data) = read_id(data)?;
+    let (b, data) = read_u32(data)?;
+    let (next_tok, data) = read_id(data)?;
+    match (start, rtoken, gtoken, btoken, next_tok) {
+        (LexemeId::OPEN, LexemeId::U32, LexemeId::U32, LexemeId::U32, LexemeId::CLOSE) => {
+            Ok((Rgb { r, g, b, a: None }, data))
+        }
+        (LexemeId::OPEN, LexemeId::U32, LexemeId::U32, LexemeId::U32, LexemeId::U32) => {
+            let (a, data) = read_u32(data)?;
+            let (end, data) = read_id(data)?;
+            if end == LexemeId::CLOSE {
+                let a = Some(a);
+                Ok((Rgb { r, g, b, a }, data))
+            } else {
+                Err(LexError::InvalidRgb)
+            }
+        }
+        _ => Err(LexError::InvalidRgb),
+    }
+}
+
+/// Binary token, the raw form of [BinaryToken](crate::binary::BinaryToken)
+///
+/// This is the raw token yielded by the lexer: open and close tokens are not
+/// matched together, nor is any determination made as to whether an open and
+/// close pair represents an array, an object, or both.
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+pub enum Token<'a> {
+    /// '{'
+    Open,
+
+    /// '}'
+    Close,
+
+    /// '='
+    Equal,
+
+    /// 32bit unsigned integer
+    U32(u32),
+
+    /// 64bit unsigned integer
+    U64(u64),
+
+    /// 32bit signed integer
+    I32(i32),
+
+    /// boolean
+    Bool(bool),
+
+    /// quoted text
+    Quoted(Scalar<'a>),
+
+    /// text that is not quoted
+    Unquoted(Scalar<'a>),
+
+    /// 32bits of floating point data
+    F32([u8; 4]),
+
+    /// 64bits of floating point data
+    F64([u8; 8]),
+
+    /// Rgb data
+    Rgb(Rgb),
+
+    /// 64bit signed integer
+    I64(i64),
+
+    /// token id that can be resolved to a string via a
+    /// [TokenResolver](crate::binary::TokenResolver)
+    Id(u16),
+}
+
+#[inline]
+pub(crate) fn read_token(data: &[u8]) -> Result<(Token, &[u8]), LexError> {
+    let (id, data) = read_id(data)?;
+    match id {
+        LexemeId::OPEN => Ok((Token::Open, data)),
+        LexemeId::CLOSE => Ok((Token::Close, data)),
+        LexemeId::EQUAL => Ok((Token::Equal, data)),
+        LexemeId::U32 => read_u32(data).map(|(x, d)| (Token::U32(x), d)),
+        LexemeId::U64 => read_u64(data).map(|(x, d)| (Token::U64(x), d)),
+        LexemeId::I32 => read_i32(data).map(|(x, d)| (Token::I32(x), d)),
+        LexemeId::BOOL => read_bool(data).map(|(x, d)| (Token::Bool(x), d)),
+        LexemeId::QUOTED => read_string(data).map(|(x, d)| (Token::Quoted(x), d)),
+        LexemeId::UNQUOTED => read_string(data).map(|(x, d)| (Token::Unquoted(x), d)),
+        LexemeId::F32 => read_f32(data).map(|(x, d)| (Token::F32(x), d)),
+        LexemeId::F64 => read_f64(data).map(|(x, d)| (Token::F64(x), d)),
+        LexemeId::RGB => read_rgb(data).map(|(x, d)| (Token::Rgb(x), d)),
+        LexemeId::I64 => read_i64(data).map(|(x, d)| (Token::I64(x), d)),
+        LexemeId(id) => Ok((Token::Id(id), data)),
+    }
+}
+
+/// Lexical error type without positional information
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum LexError {
+    /// Data ended too soon
+    Eof,
+
+    /// An invalid RGB block encountered
+    InvalidRgb,
+}
+
+impl std::error::Error for LexError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        None
+    }
+}
+
+impl std::fmt::Display for LexError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match *self {
+            LexError::Eof => write!(f, "unexpected end of file"),
+            LexError::InvalidRgb => write!(f, "invalid rgb data encountered"),
+        }
+    }
+}
+
+impl LexError {
+    #[inline]
+    #[must_use]
+    pub(crate) fn at(self, position: usize) -> LexerError {
+        LexerError {
+            position,
+            kind: self,
+        }
+    }
+}
+
+/// Lexical error type with positional information
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct LexerError {
+    position: usize,
+    kind: LexError,
+}
+
+impl LexerError {
+    /// Return the byte position where the error occurred
+    pub fn position(&self) -> usize {
+        self.position
+    }
+
+    /// Return a reference to the error kind
+    pub fn kind(&self) -> &LexError {
+        &self.kind
+    }
+
+    /// Consume self and return the error kind
+    #[must_use]
+    pub fn into_kind(self) -> LexError {
+        self.kind
+    }
+}
+
+impl std::error::Error for LexerError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        None
+    }
+}
+
+impl std::fmt::Display for LexerError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self.kind {
+            LexError::Eof => write!(f, "not enough data to read at {}", self.position),
+            LexError::InvalidRgb => write!(f, "invalid rgb data encountered at {}", self.position),
+        }
+    }
+}
+
+/// Zero cost binary data scanner.
+///
+/// There are two main ways to drive the lexer.
+/// To see them in action, imagine we want to count the maximum nesting depth.
+///
+/// ```rust
+/// use jomini::binary::{Lexer, Token};
+/// let mut lexer = Lexer::new(&[0x2d, 0x28, 0x01, 0x00, 0x03, 0x00, 0x03, 0x00, 0x04, 0x00, 0x04, 0x00]);
+/// let mut max_depth = 0;
+/// let mut current_depth = 0;
+/// while let Some(token) = lexer.next_token()? {
+///     match token {
+///         Token::Open => {
+///             current_depth += 1;
+///             max_depth = max_depth.max(current_depth);
+///         }
+///         Token::Close => current_depth -= 1,
+///         _ => {}
+///     }
+/// }
+/// assert_eq!(max_depth, 2);
+/// # Ok::<(), jomini::binary::LexerError>(())
+/// ```
+///
+/// The [Lexer::next_token] is an ergonomic way to scan through binary tokens.
+/// The functions prefixed with `read_` denote that more data is expected,
+/// while `next_` allows for the data to finish.
+///
+/// If it is desired to scan through the binary data with zero overhead, one
+/// needs to drive the lexer more thoroughly.
+///
+/// ```rust
+/// use jomini::binary::{Lexer, LexemeId};
+/// let mut lexer = Lexer::new(&[0x2d, 0x28, 0x01, 0x00, 0x03, 0x00, 0x03, 0x00, 0x04, 0x00, 0x04, 0x00]);
+/// let mut max_depth = 0;
+/// let mut current_depth = 0;
+/// while let Some(id) = lexer.next_id()? {
+///     match id {
+///         LexemeId::OPEN => {
+///             current_depth += 1;
+///             max_depth = max_depth.max(current_depth);
+///         }
+///         LexemeId::CLOSE => current_depth -= 1,
+///         LexemeId::U32 => { lexer.read_u32()?; }
+///         LexemeId::I32 => { lexer.read_i32()?; }
+///         LexemeId::BOOL => { lexer.read_bool()?; }
+///         LexemeId::QUOTED | LexemeId::UNQUOTED => { lexer.read_string()?; }
+///         LexemeId::F32 => { lexer.read_f32()?; }
+///         LexemeId::F64 => { lexer.read_f64()?; }
+///         LexemeId::I64 => { lexer.read_i64()?; }
+///         _ => {}
+///     }
+/// }
+/// assert_eq!(max_depth, 2);
+/// # Ok::<(), jomini::binary::LexerError>(())
+/// ```
+///
+/// Only at token boundaries can the `token` functions be intertwined with the
+/// individual lexeme functions.
+///
+/// Errors reported will contain positional information.
+pub struct Lexer<'a> {
+    data: &'a [u8],
+    original_length: usize,
+}
+
+impl<'a> Lexer<'a> {
+    /// Creates a new lexer over the given data
+    #[inline]
+    pub fn new(data: &'a [u8]) -> Self {
+        Self {
+            data,
+            original_length: data.len(),
+        }
+    }
+
+    /// Returns the remaining data that has not yet been processed.
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexemeId};
+    /// let mut lexer = Lexer::new(&[0xd2, 0x28, 0xff]);
+    /// assert_eq!(lexer.read_id().unwrap(), LexemeId::new(0x28d2));
+    /// assert_eq!(lexer.remainder(), &[0xff]);
+    /// ```
+    #[inline]
+    pub fn remainder(&self) -> &'a [u8] {
+        self.data
+    }
+
+    /// Returns how many bytes have been processed by the lexer
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexemeId};
+    /// let mut lexer = Lexer::new(&[0xd2, 0x28, 0xff]);
+    /// assert_eq!(lexer.read_id().unwrap(), LexemeId::new(0x28d2));
+    /// assert_eq!(lexer.position(), 2);
+    /// ```
+    #[inline]
+    pub fn position(&self) -> usize {
+        self.original_length - self.data.len()
+    }
+
+    #[inline]
+    fn err_position(&self, err: LexError) -> LexerError {
+        err.at(self.position())
+    }
+
+    /// Advance the lexer through the next lexeme id, and return it
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexemeId, LexError};
+    /// let mut lexer = Lexer::new(&[0x2d, 0x28]);
+    /// assert_eq!(lexer.read_id(), Ok(LexemeId::new(0x282d)));
+    /// assert_eq!(lexer.read_id().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn read_id(&mut self) -> Result<LexemeId, LexerError> {
+        let (result, rest) = read_id(self.data).map_err(|e| self.err_position(e))?;
+        self.data = rest;
+        Ok(result)
+    }
+
+    /// Attempt to advance through the next [LexemeId], or return `None` if no
+    /// data remains
+    ///
+    /// An EOF error can still be thrown if data is present but not enough
+    /// exists to decode the next [LexemeId]
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexemeId, LexError};
+    /// let mut lexer = Lexer::new(&[0x2d, 0x28]);
+    /// assert_eq!(lexer.next_id(), Ok(Some(LexemeId::new(0x282d))));
+    /// assert_eq!(lexer.next_id(), Ok(None));
+    ///
+    /// let mut lexer = Lexer::new(&[0x2d]);
+    /// assert_eq!(lexer.next_id().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn next_id(&mut self) -> Result<Option<LexemeId>, LexerError> {
+        match read_id(self.data) {
+            Ok((result, rest)) => {
+                self.data = rest;
+                Ok(Some(result))
+            }
+            Err(LexError::Eof) if self.remainder().is_empty() => Ok(None),
+            Err(e) => Err(self.err_position(e)),
+        }
+    }
+
+    /// Assume more tokens exist in the data and read the next one.
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError, Token};
+    /// let mut lexer = Lexer::new(&[0x2d, 0x28]);
+    /// assert_eq!(lexer.read_token(), Ok(Token::Id(0x282d)));
+    /// assert_eq!(lexer.read_token().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn read_token(&mut self) -> Result<Token<'a>, LexerError> {
+        let (result, rest) = read_token(self.data).map_err(|e| self.err_position(e))?;
+        self.data = rest;
+        Ok(result)
+    }
+
+    /// Attempt to advance through the next token or return `None` if no data remains
+    ///
+    /// An EOF error can still be thrown if data is present but not enough
+    /// exists to decode the next token.
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, Token, LexError};
+    /// let mut lexer = Lexer::new(&[0x2d, 0x28]);
+    /// assert_eq!(lexer.next_token(), Ok(Some(Token::Id(0x282d))));
+    /// assert_eq!(lexer.next_token(), Ok(None));
+    ///
+    /// let mut lexer = Lexer::new(&[0x2d]);
+    /// assert_eq!(lexer.next_token().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn next_token(&mut self) -> Result<Option<Token<'a>>, LexerError> {
+        match read_token(self.data) {
+            Ok((result, rest)) => {
+                self.data = rest;
+                Ok(Some(result))
+            }
+            Err(LexError::Eof) if self.remainder().is_empty() => Ok(None),
+            Err(e) => Err(self.err_position(e)),
+        }
+    }
+
+    /// Peek at the next [LexemeId] without advancing the lexer
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError, LexemeId};
+    /// let mut lexer = Lexer::new(&[0x01, 0x00][..]);
+    /// assert_eq!(lexer.peek_id(), Some(LexemeId::EQUAL));
+    /// assert_eq!(lexer.read_id(), Ok(LexemeId::EQUAL));
+    /// assert_eq!(lexer.peek_id(), None);
+    /// ```
+    #[inline]
+    pub fn peek_id(&mut self) -> Option<LexemeId> {
+        self.data
+            .get(..2)
+            .map(|head| LexemeId::new(u16::from_le_bytes([head[0], head[1]])))
+    }
+
+    /// Peek at the next [Token] without advancing the lexer
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError, Token};
+    /// let mut lexer = Lexer::new(&[0x01, 0x00][..]);
+    /// assert_eq!(lexer.peek_token(), Some(Token::Equal));
+    /// assert_eq!(lexer.read_token(), Ok(Token::Equal));
+    /// assert_eq!(lexer.peek_token(), None);
+    /// ```
+    #[inline]
+    pub fn peek_token(&mut self) -> Option<Token<'a>> {
+        read_token(self.data).ok().map(|(t, _)| t)
+    }
+
+    /// Advance the lexer through a length prefixed string
+    ///
+    /// ```rust
+    /// use jomini::{Scalar, binary::{Lexer, LexError}};
+    /// let mut lexer = Lexer::new(&[0x03, 0x00, 0x45, 0x4e, 0x47][..]);
+    /// assert_eq!(lexer.read_string(), Ok(Scalar::new(b"ENG")));
+    /// assert_eq!(lexer.read_string().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn read_string(&mut self) -> Result<Scalar<'a>, LexerError> {
+        let (result, rest) = read_string(self.data).map_err(|e| self.err_position(e))?;
+        self.data = rest;
+        Ok(result)
+    }
+
+    /// Advance the lexer through a boolean
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError};
+    /// let mut lexer = Lexer::new(&[0x01, 0x00][..]);
+    /// assert_eq!(lexer.read_bool(), Ok(true));
+    /// assert_eq!(lexer.read_bool(), Ok(false));
+    /// assert_eq!(lexer.read_bool().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn read_bool(&mut self) -> Result<bool, LexerError> {
+        let (result, rest) = read_bool(self.data).map_err(|e| self.err_position(e))?;
+        self.data = rest;
+        Ok(result)
+    }
+
+    /// Advance the lexer through an unsigned little endian 32 bit integer
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError};
+    /// let mut lexer = Lexer::new(&[0x59, 0x00, 0x00, 0x00][..]);
+    /// assert_eq!(lexer.read_u32(), Ok(89));
+    /// assert_eq!(lexer.read_u32().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn read_u32(&mut self) -> Result<u32, LexerError> {
+        let (result, rest) = read_u32(self.data).map_err(|e| self.err_position(e))?;
+        self.data = rest;
+        Ok(result)
+    }
+
+    /// Advance the lexer through an unsigned little endian 64 bit integer
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError};
+    /// let mut lexer = Lexer::new(&[0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00][..]);
+    /// assert_eq!(lexer.read_u64(), Ok(128));
+    /// assert_eq!(lexer.read_u64().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn read_u64(&mut self) -> Result<u64, LexerError> {
+        let (result, rest) = read_u64(self.data).map_err(|e| self.err_position(e))?;
+        self.data = rest;
+        Ok(result)
+    }
+
+    /// Advance the lexer through a signed little endian 64 bit integer
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError};
+    /// let mut lexer = Lexer::new(&[0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff][..]);
+    /// assert_eq!(lexer.read_i64(), Ok(-1));
+    /// assert_eq!(lexer.read_i64().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn read_i64(&mut self) -> Result<i64, LexerError> {
+        let (result, rest) = read_i64(self.data).map_err(|e| self.err_position(e))?;
+        self.data = rest;
+        Ok(result)
+    }
+
+    /// Advance the lexer through a signed little endian 32 bit integer
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError};
+    /// let mut lexer = Lexer::new(&[0x59, 0x00, 0x00, 0x00][..]);
+    /// assert_eq!(lexer.read_i32(), Ok(89));
+    /// assert_eq!(lexer.read_i32().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn read_i32(&mut self) -> Result<i32, LexerError> {
+        let (result, rest) = read_i32(self.data).map_err(|e| self.err_position(e))?;
+        self.data = rest;
+        Ok(result)
+    }
+
+    /// Advance the lexer through 32 bits of floating point data and return the bytes
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError};
+    /// let data = [0x17, 0x00, 0x00, 0x00];
+    /// let mut lexer = Lexer::new(&data[..]);
+    /// assert_eq!(lexer.read_f32(), Ok(data));
+    /// assert_eq!(lexer.read_f32().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn read_f32(&mut self) -> Result<[u8; 4], LexerError> {
+        let (result, rest) = read_f32(self.data).map_err(|e| self.err_position(e))?;
+        self.data = rest;
+        Ok(result)
+    }
+
+    /// Advance the lexer through 64 bits of floating point data and return the bytes
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError};
+    /// let data = [0xc7, 0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00];
+    /// let mut lexer = Lexer::new(&data[..]);
+    /// assert_eq!(lexer.read_f64(), Ok(data));
+    /// assert_eq!(lexer.read_f64().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn read_f64(&mut self) -> Result<[u8; 8], LexerError> {
+        let (result, rest) = read_f64(self.data).map_err(|e| self.err_position(e))?;
+        self.data = rest;
+        Ok(result)
+    }
+
+    /// Advance the lexer through an rgb value (with optional alpha channel)
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError, Rgb};
+    /// let data = [0x03, 0x00, 0x14, 0x00, 0x6e, 0x00, 0x00, 0x00,
+    ///     0x14, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x14, 0x00,
+    ///     0x1b, 0x00, 0x00, 0x00, 0x04, 0x00];
+    /// let mut lexer = Lexer::new(&data[..]);
+    /// assert_eq!(lexer.read_rgb(), Ok(Rgb { r: 110, g: 27, b: 27, a: None }));
+    /// assert_eq!(lexer.read_rgb().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    pub fn read_rgb(&mut self) -> Result<Rgb, LexerError> {
+        let (result, rest) = read_rgb(self.data).map_err(|e| self.err_position(e))?;
+        self.data = rest;
+        Ok(result)
+    }
+
+    /// Advance a given number of bytes and return them
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError};
+    /// let mut lexer = Lexer::new(b"EU4bin");
+    /// assert_eq!(lexer.read_bytes(6), Ok(&b"EU4bin"[..]));
+    /// assert_eq!(lexer.read_bytes(1).unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn read_bytes(&mut self, bytes: usize) -> Result<&'a [u8], LexerError> {
+        if self.data.len() >= bytes {
+            let (head, rest) = self.data.split_at(bytes);
+            self.data = rest;
+            Ok(head)
+        } else {
+            Err(self.err_position(LexError::Eof))
+        }
+    }
+
+    /// Skip the value denoted by the [LexemeId]. Will skip entire containers.
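+    ///
+    /// A minimal sketch, skipping an empty container (`{` immediately followed
+    /// by `}`):
+    ///
+    /// ```rust
+    /// use jomini::binary::Lexer;
+    /// let mut lexer = Lexer::new(&[0x03, 0x00, 0x04, 0x00]);
+    /// let id = lexer.read_id()?;
+    /// lexer.skip_value(id)?;
+    /// assert!(lexer.remainder().is_empty());
+    /// # Ok::<(), jomini::binary::LexerError>(())
+    /// ```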
+    #[inline]
+    pub fn skip_value(&mut self, id: LexemeId) -> Result<(), LexerError> {
+        match id {
+            LexemeId::QUOTED | LexemeId::UNQUOTED => {
+                self.read_string()?;
+                Ok(())
+            }
+            LexemeId::U32 => {
+                self.read_u32()?;
+                Ok(())
+            }
+            LexemeId::I32 => {
+                self.read_i32()?;
+                Ok(())
+            }
+            LexemeId::U64 => {
+                self.read_u64()?;
+                Ok(())
+            }
+            LexemeId::I64 => {
+                self.read_i64()?;
+                Ok(())
+            }
+            LexemeId::BOOL => {
+                self.read_bool()?;
+                Ok(())
+            }
+            LexemeId::F32 => {
+                self.read_f32()?;
+                Ok(())
+            }
+            LexemeId::F64 => {
+                self.read_f64()?;
+                Ok(())
+            }
+            LexemeId::OPEN => self.skip_container(),
+            _ => Ok(()),
+        }
+    }
+
+    #[inline]
+    fn skip_container(&mut self) -> Result<(), LexerError> {
+        let mut depth = 1;
+        loop {
+            match self.read_id()? {
+                LexemeId::QUOTED | LexemeId::UNQUOTED => {
+                    self.read_string()?;
+                }
+                LexemeId::U32 => {
+                    self.read_u32()?;
+                }
+                LexemeId::I32 => {
+                    self.read_i32()?;
+                }
+                LexemeId::U64 => {
+                    self.read_u64()?;
+                }
+                LexemeId::I64 => {
+                    self.read_i64()?;
+                }
+                LexemeId::BOOL => {
+                    self.read_bool()?;
+                }
+                LexemeId::F32 => {
+                    self.read_f32()?;
+                }
+                LexemeId::F64 => {
+                    self.read_f64()?;
+                }
+                LexemeId::CLOSE => {
+                    depth -= 1;
+                    if depth == 0 {
+                        return Ok(());
+                    }
+                }
+                LexemeId::OPEN => depth += 1,
+                _ => {}
+            }
+        }
+    }
+}
diff --git a/src/binary/mod.rs b/src/binary/mod.rs
index 4c8eb3e..74b59c3 100644
--- a/src/binary/mod.rs
+++ b/src/binary/mod.rs
@@ -1,18 +1,85 @@
 //! Types for parsing clausewitz binary input
 //!
-//! See the top level module documentation for an overview that includes parsing
-//! and deserializing binary data.
+//! Main binary deserialization APIs:
+//! - [BinaryFlavor::deserialize_slice]
+//! - [BinaryFlavor::deserialize_reader]
+//!
+//! If the serde deserialization API is too high level, one can build
+//! abstractions on top of:
+//! - [BinaryTape::from_slice]: Realizes a pseudo AST onto a linear tape.
+//!   Cleans up and normalizes data.
+//! - [TokenReader]: An incremental binary lexer designed for handling large
+//!   saves in a memory efficient manner.
+//! - [Lexer]: The lowest level, a zero cost binary data scanner over a byte
+//!   slice.
+//!
+//! ## Direct identifier deserialization with `token` attribute
+//!
+//! There may be some performance loss during binary deserialization as
+//! tokens are resolved to strings via a `TokenResolver` and then matched
+//! against the string representations of a struct's fields.
+//!
+//! We can fix this issue by directly encoding the expected token value into
+//! the struct:
+//!
+//! ```rust
+//! # #[cfg(feature = "derive")] {
+//! # use jomini::{Encoding, JominiDeserialize, Windows1252Encoding, binary::BinaryFlavor};
+//! # use std::{borrow::Cow, collections::HashMap};
+//! #
+//! # #[derive(Debug, Default)]
+//! # pub struct BinaryTestFlavor;
+//! #
+//! # impl BinaryFlavor for BinaryTestFlavor {
+//! #     fn visit_f32(&self, data: [u8; 4]) -> f32 {
+//! #         f32::from_le_bytes(data)
+//! #     }
+//! #
+//! #     fn visit_f64(&self, data: [u8; 8]) -> f64 {
+//! #         f64::from_le_bytes(data)
+//! #     }
+//! # }
+//! #
+//! # impl Encoding for BinaryTestFlavor {
+//! #     fn decode<'a>(&self, data: &'a [u8]) -> Cow<'a, str> {
+//! #         Windows1252Encoding::decode(data)
+//! #     }
+//! # }
+//! #
+//! # let data = [ 0x82, 0x2d, 0x01, 0x00, 0x0f, 0x00, 0x03, 0x00, 0x45, 0x4e, 0x47 ];
+//! #
+//! #[derive(JominiDeserialize, PartialEq, Debug)]
+//! struct MyStruct {
+//!     #[jomini(token = 0x2d82)]
+//!     field1: String,
+//! }
+//!
+//! // Empty token to string resolver
+//! let map = HashMap::<u16, String>::new();
+//!
+//! let actual: MyStruct = BinaryTestFlavor.deserialize_slice(&data[..], &map)?;
+//! assert_eq!(actual, MyStruct { field1: "ENG".to_string() });
+//! # }
+//! # Ok::<(), Box<dyn std::error::Error>>(())
+//! ```
+//!
+//! A couple of notes:
+//!
+//! - This does not obviate the need for the token to string resolver, as tokens may be used as values.
+//! - If the `token` attribute is specified on one field of a struct, it must be specified on all fields of that struct.
 
 /// binary deserialization
 #[cfg(feature = "derive")]
 pub mod de;
 
 mod flavor;
+mod lexer;
+mod reader;
 mod resolver;
 mod rgb;
 mod tape;
-mod tokens;
 
 pub use self::flavor::BinaryFlavor;
+pub use self::lexer::{LexError, LexemeId, Lexer, LexerError, Token};
+pub use self::reader::{ReaderError, ReaderErrorKind, TokenReader, TokenReaderBuilder};
 pub use self::resolver::{FailedResolveStrategy, TokenResolver};
 pub use self::rgb::*;
 pub use self::tape::{BinaryTape, BinaryTapeParser, BinaryToken};
diff --git a/src/binary/reader.rs b/src/binary/reader.rs
new file mode 100644
index 0000000..8185184
--- /dev/null
+++ b/src/binary/reader.rs
@@ -0,0 +1,430 @@
+use super::{
+    lexer::{read_id, read_string, read_token},
+    LexError, LexemeId, LexerError, Token,
+};
+use crate::buffer::{BufferError, BufferWindow, BufferWindowBuilder, SliceReader};
+use std::{fmt, io::Read};
+
+/// [Lexer](crate::binary::Lexer) that works over a [Read] implementation
+///
+/// Example of computing the max nesting depth using a [TokenReader].
+///
+/// ```rust
+/// use jomini::binary::{TokenReader, Token};
+/// let data = [0x2d, 0x28, 0x01, 0x00, 0x03, 0x00, 0x03, 0x00, 0x04, 0x00, 0x04, 0x00];
+/// let mut reader = TokenReader::new(&data[..]);
+/// let mut max_depth = 0;
+/// let mut current_depth = 0;
+/// while let Some(token) = reader.next()? {
+///     match token {
+///         Token::Open => {
+///             current_depth += 1;
+///             max_depth = max_depth.max(current_depth);
+///         }
+///         Token::Close => current_depth -= 1,
+///         _ => {}
+///     }
+/// }
+/// assert_eq!(max_depth, 2);
+/// # Ok::<(), jomini::binary::ReaderError>(())
+/// ```
+///
+/// Unlike a [BinaryTape](crate::BinaryTape), which will skip ghost objects,
+/// pair open and close tokens together, and recognize if a container is an
+/// object, array, or mixed -- the tokens yielded from a [TokenReader] are not
+/// fully formed. This is a much more raw view of the data that can be used to
+/// construct higher level parsers, melters, and deserializers that operate over
+/// a stream of data.
+///
+/// [TokenReader] operates over a fixed size buffer, so using a
+/// [BufRead](std::io::BufRead) affords no benefits. An error will be returned
+/// for tokens that are impossible to fit within the buffer (eg: if provided
+/// with a 100 byte buffer but there is a binary string that is 101 bytes long).
+#[derive(Debug)]
+pub struct TokenReader<R> {
+    reader: R,
+    buf: BufferWindow,
+}
+
+impl TokenReader<()> {
+    /// Read from a byte slice without memcpy's
+    #[inline]
+    pub fn from_slice(data: &[u8]) -> TokenReader<SliceReader> {
+        TokenReader {
+            reader: SliceReader::new(data),
+            buf: BufferWindow::from_slice(data),
+        }
+    }
+}
+
+impl<R> TokenReader<R>
+where
+    R: Read,
+{
+    /// Convenience method for constructing the default token reader
+    #[inline]
+    pub fn new(reader: R) -> Self {
+        TokenReader::builder().build(reader)
+    }
+
+    /// Returns the byte position of the data stream that has been processed.
+    ///
+    /// ```rust
+    /// use jomini::binary::{TokenReader, Token};
+    /// let mut reader = TokenReader::new(&[0xd2, 0x28, 0xff][..]);
+    /// assert_eq!(reader.read().unwrap(), Token::Id(0x28d2));
+    /// assert_eq!(reader.position(), 2);
+    /// ```
+    #[inline]
+    pub fn position(&self) -> usize {
+        self.buf.position()
+    }
+
+    #[inline]
+    fn next_opt(&mut self) -> (Option<Token>, Option<ReaderError>) {
+        loop {
+            let window =
+                unsafe { std::slice::from_raw_parts(self.buf.start, self.buf.window_len()) };
+            match read_token(window) {
+                Ok((tok, new_data)) => {
+                    self.buf.advance_to(new_data.as_ptr());
+                    return (Some(tok), None);
+                }
+                Err(LexError::Eof) => {}
+                Err(e) => return (None, Some(self.lex_error(e))),
+            }
+
+            match self.buf.fill_buf(&mut self.reader) {
+                Ok(0) if self.buf.window_len() == 0 => return (None, None),
+                Ok(0) => return (None, Some(self.lex_error(LexError::Eof))),
+                Ok(_) => {}
+                Err(e) => return (None, Some(self.buffer_error(e))),
+            }
+        }
+    }
+
+    /// Advance a given number of bytes and return them.
+    ///
+    /// The internal buffer must be large enough to accommodate all bytes.
+    ///
+    /// ```rust
+    /// use jomini::binary::{TokenReader, LexError, ReaderErrorKind};
+    /// let mut reader = TokenReader::new(&b"EU4bin"[..]);
+    /// assert_eq!(reader.read_bytes(6).unwrap(), &b"EU4bin"[..]);
+    /// assert!(matches!(reader.read_bytes(1).unwrap_err().kind(), ReaderErrorKind::Lexer(LexError::Eof)));
+    /// ```
+    #[inline]
+    pub fn read_bytes(&mut self, bytes: usize) -> Result<&[u8], ReaderError> {
+        while self.buf.window_len() < bytes {
+            match self.buf.fill_buf(&mut self.reader) {
+                Ok(0) => return Err(self.lex_error(LexError::Eof)),
+                Ok(_) => {}
+                Err(e) => return Err(self.buffer_error(e)),
+            }
+        }
+
+        let input = unsafe { std::slice::from_raw_parts(self.buf.start, bytes) };
+        self.buf.advance(bytes);
+        Ok(input)
+    }
+
+    /// Advance through the containing block until the closing token is consumed
+    ///
+    /// ```rust
+    /// use jomini::binary::{TokenReader, Token};
+    /// let mut reader = TokenReader::new(&[
+    ///     0xd2, 0x28, 0x01, 0x00, 0x03, 0x00, 0x03, 0x00,
+    ///     0x04, 0x00, 0x04, 0x00, 0xff, 0xff
+    /// ][..]);
+    /// assert_eq!(reader.read().unwrap(), Token::Id(0x28d2));
+    /// assert_eq!(reader.read().unwrap(), Token::Equal);
+    /// assert_eq!(reader.read().unwrap(), Token::Open);
+    /// assert!(reader.skip_container().is_ok());
+    /// assert_eq!(reader.read().unwrap(), Token::Id(0xffff));
+    /// ```
+    #[inline]
+    pub fn skip_container(&mut self) -> Result<(), ReaderError> {
+        let mut depth = 1;
+        loop {
+            while let Ok((id, data)) = read_id(self.buf.window()) {
+                match id {
+                    LexemeId::CLOSE => {
+                        self.buf.advance_to(data.as_ptr());
+                        depth -= 1;
+                        if depth == 0 {
+                            return Ok(());
+                        }
+                    }
+                    LexemeId::OPEN => {
+                        self.buf.advance_to(data.as_ptr());
+                        depth += 1
+                    }
+                    LexemeId::BOOL => match data.get(1..) {
+                        Some(d) => self.buf.advance_to(d.as_ptr()),
+                        None => break,
+                    },
+                    LexemeId::F32 | LexemeId::U32 | LexemeId::I32 => match data.get(4..) {
+                        Some(d) => self.buf.advance_to(d.as_ptr()),
+                        None => break,
+                    },
+                    LexemeId::F64 | LexemeId::I64 | LexemeId::U64 => match data.get(8..) {
+                        Some(d) => self.buf.advance_to(d.as_ptr()),
+                        None => break,
+                    },
+                    LexemeId::QUOTED | LexemeId::UNQUOTED => match read_string(data) {
+                        Ok((_, d)) => self.buf.advance_to(d.as_ptr()),
+                        Err(_) => break,
+                    },
+                    _ => self.buf.advance_to(data.as_ptr()),
+                }
+            }
+
+            match self.buf.fill_buf(&mut self.reader) {
+                Ok(0) => return Err(self.lex_error(LexError::Eof)),
+                Ok(_) => {}
+                Err(e) => return Err(self.buffer_error(e)),
+            }
+        }
+    }
+
+    /// Consume the token reader and return the internal buffer and reader. This
+    /// allows the buffer to be reused.
+    ///
+    /// ```rust
+    /// use jomini::binary::TokenReader;
+    /// let data = b"EU4bin";
+    /// let mut reader = TokenReader::new(&data[..]);
+    /// assert_eq!(reader.read_bytes(6).unwrap(), &data[..]);
+    ///
+    /// let (buf, _) = reader.into_parts();
+    /// let data = b"HOI4bin";
+    /// let mut reader = TokenReader::builder().buffer(buf).build(&data[..]);
+    /// assert_eq!(reader.read_bytes(7).unwrap(), &data[..]);
+    /// ```
+    #[inline]
+    pub fn into_parts(self) -> (Box<[u8]>, R) {
+        (self.buf.buf, self.reader)
+    }
+
+    /// Read the next token in the stream. Will error if not enough data remains
+    /// to decode a token.
+    ///
+    /// ```rust
+    /// use jomini::binary::{TokenReader, Token, ReaderErrorKind, LexError};
+    /// let mut reader = TokenReader::new(&[
+    ///     0xd2, 0x28, 0x01, 0x00, 0x03, 0x00, 0x04, 0x00
+    /// ][..]);
+    /// assert_eq!(reader.read().unwrap(), Token::Id(0x28d2));
+    /// assert_eq!(reader.read().unwrap(), Token::Equal);
+    /// assert_eq!(reader.read().unwrap(), Token::Open);
+    /// assert_eq!(reader.read().unwrap(), Token::Close);
+    /// assert!(matches!(reader.read().unwrap_err().kind(), ReaderErrorKind::Lexer(LexError::Eof)));
+    /// ```
+    #[inline]
+    pub fn read(&mut self) -> Result<Token, ReaderError> {
+        // Workaround for borrow checker :(
+        let s = unsafe { &mut *(self as *mut TokenReader<R>) };
+        match self.next_opt() {
+            (Some(x), _) => Ok(x),
+            (None, None) => Err(s.lex_error(LexError::Eof)),
+            (None, Some(e)) => Err(e),
+        }
+    }
+
+    /// Read a token, returning none when all the data has been consumed
+    ///
+    /// ```rust
+    /// use jomini::binary::{TokenReader, Token};
+    /// let mut reader = TokenReader::new(&[
+    ///     0xd2, 0x28, 0x01, 0x00, 0x03, 0x00, 0x04, 0x00
+    /// ][..]);
+    /// assert_eq!(reader.next().unwrap(), Some(Token::Id(0x28d2)));
+    /// assert_eq!(reader.next().unwrap(), Some(Token::Equal));
+    /// assert_eq!(reader.next().unwrap(), Some(Token::Open));
+    /// assert_eq!(reader.next().unwrap(), Some(Token::Close));
+    /// assert_eq!(reader.next().unwrap(), None);
+    /// ```
+    #[inline]
+    pub fn next(&mut self) -> Result<Option<Token>, ReaderError> {
+        match self.next_opt() {
+            (Some(x), _) => Ok(Some(x)),
+            (None, None) => Ok(None),
+            (None, Some(e)) => Err(e),
+        }
+    }
+
+    #[cold]
+    #[inline(never)]
+    fn buffer_error(&self, e: BufferError) -> ReaderError {
+        ReaderError {
+            position: self.position(),
+            kind: ReaderErrorKind::from(e),
+        }
+    }
+
+    #[cold]
+    #[inline(never)]
+    fn lex_error(&self, e: LexError) -> ReaderError {
+        ReaderError::from(e.at(self.position()))
+    }
+}
+
+impl TokenReader<()> {
+    /// Initializes a default [TokenReaderBuilder]
+    pub fn builder() -> TokenReaderBuilder {
+        TokenReaderBuilder::default()
+    }
+}
+
+/// Creates a binary token reader
+#[derive(Debug, Default)]
+pub struct TokenReaderBuilder {
+    buffer: BufferWindowBuilder,
+}
+
+impl TokenReaderBuilder {
+    /// Set the fixed size buffer to the given buffer
+    #[inline]
+    pub fn buffer(mut self, val: Box<[u8]>) -> TokenReaderBuilder {
+        self.buffer = self.buffer.buffer(val);
+        self
+    }
+
+    /// Set the length of the buffer if no buffer is provided
+    #[inline]
+    pub fn buffer_len(mut self, val: usize) -> TokenReaderBuilder {
+        self.buffer = self.buffer.buffer_len(val);
+        self
+    }
+
+    /// Create a binary token reader around a given reader.
+    #[inline]
+    pub fn build<R>(self, reader: R) -> TokenReader<R> {
+        let buf = self.buffer.build();
+        TokenReader { reader, buf }
+    }
+}
+
+/// The specific binary reader error type.
+#[derive(Debug)]
+pub enum ReaderErrorKind {
+    /// An underlying error from a [Read]er
+    Read(std::io::Error),
+
+    /// The internal buffer does not have enough room to store data for the next
+    /// token
+    BufferFull,
+
+    /// The data is corrupted
+    Lexer(LexError),
+}
+
+/// A binary lexing error over a `Read` implementation
+#[derive(Debug)]
+pub struct ReaderError {
+    position: usize,
+    kind: ReaderErrorKind,
+}
+
+impl ReaderError {
+    /// Return the byte position where the error occurred
+    pub fn position(&self) -> usize {
+        self.position
+    }
+
+    /// Return a reference to the error kind
+    pub fn kind(&self) -> &ReaderErrorKind {
+        &self.kind
+    }
+
+    /// Consume self and return the error kind
+    #[must_use]
+    pub fn into_kind(self) -> ReaderErrorKind {
+        self.kind
+    }
+}
+
+impl std::error::Error for ReaderError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        match &self.kind {
+            ReaderErrorKind::Read(cause) => Some(cause),
+            _ => None,
+        }
+    }
+}
+
+impl std::fmt::Display for ReaderError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match &self.kind {
+            ReaderErrorKind::Read { .. } => {
+                write!(f, "failed to read past position: {}", self.position)
+            }
+            ReaderErrorKind::BufferFull => {
+                write!(f, "max buffer size exceeded at position: {}", self.position)
+            }
+            ReaderErrorKind::Lexer(cause) => {
+                write!(f, "{} at position: {}", cause, self.position)
+            }
+        }
+    }
+}
+
+impl From<LexerError> for ReaderError {
+    fn from(value: LexerError) -> Self {
+        ReaderError {
+            position: value.position(),
+            kind: ReaderErrorKind::Lexer(value.into_kind()),
+        }
+    }
+}
+
+impl From<BufferError> for ReaderErrorKind {
+    fn from(value: BufferError) -> Self {
+        match value {
+            BufferError::Io(x) => ReaderErrorKind::Read(x),
+            BufferError::BufferFull => ReaderErrorKind::BufferFull,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn test_reader(data: &[u8], expected: &[Token]) {
+        fn eq<R>(mut reader: TokenReader<R>, expected: &[Token])
+        where
+            R: Read,
+        {
+            for token in expected {
+                assert_eq!(reader.next().unwrap(), Some(*token));
+            }
+            assert_eq!(reader.next().unwrap(), None);
+        }
+
+        eq(TokenReader::new(data), expected);
+
+        let data_with_header: Vec<_> = b"EU4bin".iter().chain(data).copied().collect();
+        let mut reader = TokenReader::new(data_with_header.as_slice());
+        assert_eq!(reader.read_bytes(6).unwrap(), &b"EU4bin"[..]);
+        eq(reader, expected);
+    }
+
+    #[test]
+    fn test_binary_token_reader() {
+        let data = [0xe1, 0x00, 0x01, 0x00, 0x03, 0x00, 0x04, 0x00];
+        test_reader(
+            &data,
+            &[Token::Id(0x00e1), Token::Equal, Token::Open, Token::Close],
+        );
+    }
+
+    #[test]
+    fn test_not_enough_data() {
+        let mut reader = TokenReader::new(&[0x43][..]);
+        assert!(matches!(
+            reader.read().unwrap_err().kind(),
+            &ReaderErrorKind::Lexer(LexError::Eof)
+        ));
+    }
+}
diff --git a/src/binary/tape.rs b/src/binary/tape.rs
index a34b819..f0d8160 100644
--- a/src/binary/tape.rs
+++ b/src/binary/tape.rs
@@ -1,4 +1,10 @@
-use super::tokens::*;
+use super::{
+    lexer::{
+        read_bool, read_f32, read_f64, read_i32, read_i64, read_id, read_rgb, read_string,
+        read_u32, read_u64,
+    },
+    LexError, LexemeId,
+};
 use crate::{binary::Rgb, copyless::VecHelper, util::get_split, Error, ErrorKind, Scalar};
 
 /// Represents any valid binary value
@@ -151,125 +157,77 @@ impl<'a, 'b> ParserState<'a, 'b> {
     }
 
     #[inline]
-    fn parse_next_id(&mut self, data: &'a [u8]) -> Result<(&'a [u8], u16), Error> {
-        self.parse_next_id_opt(data).ok_or_else(Error::eof)
+    fn parse_next_id(&mut self, data: &'a [u8]) -> Result<(&'a [u8], LexemeId), Error> {
+        read_id(data)
+            .map(|(id, rest)| (rest, id))
+            .map_err(|e| self.err_position(e, data))
     }
 
     #[inline]
     fn parse_u32(&mut self, data: &'a [u8]) -> Result<&'a [u8], Error> {
-        let (head, rest) = get_split::<4>(data).ok_or_else(Error::eof)?;
-        let val = u32::from_le_bytes(head);
-        self.token_tape.alloc().init(BinaryToken::U32(val));
+        let (result, rest) = read_u32(data).map_err(|e| self.err_position(e, data))?;
+        self.token_tape.alloc().init(BinaryToken::U32(result));
         Ok(rest)
     }
 
     #[inline]
     fn parse_u64(&mut self, data: &'a [u8]) -> Result<&'a [u8], Error> {
-        let (head, rest) = get_split::<8>(data).ok_or_else(Error::eof)?;
-        let val = u64::from_le_bytes(head);
-        self.token_tape.alloc().init(BinaryToken::U64(val));
+        let (result, rest) = read_u64(data).map_err(|e| self.err_position(e, data))?;
+        self.token_tape.alloc().init(BinaryToken::U64(result));
         Ok(rest)
     }
 
     #[inline]
     fn parse_i64(&mut self, data: &'a [u8]) -> Result<&'a [u8], Error> {
-        let (head, rest) = get_split::<8>(data).ok_or_else(Error::eof)?;
-        let val = i64::from_le_bytes(head);
-        self.token_tape.alloc().init(BinaryToken::I64(val));
+        let (result, rest) = read_i64(data).map_err(|e| self.err_position(e, data))?;
+        self.token_tape.alloc().init(BinaryToken::I64(result));
         Ok(rest)
     }
 
     #[inline]
     fn parse_i32(&mut self, data: &'a [u8]) -> Result<&'a [u8], Error> {
-        let (head, rest) = get_split::<4>(data).ok_or_else(Error::eof)?;
-        let val = i32::from_le_bytes(head);
-        self.token_tape.alloc().init(BinaryToken::I32(val));
+        let (result, rest) = read_i32(data).map_err(|e| self.err_position(e, data))?;
+        self.token_tape.alloc().init(BinaryToken::I32(result));
         Ok(rest)
     }
 
     #[inline]
     fn parse_f32(&mut self, data: &'a [u8]) -> Result<&'a [u8], Error> {
-        let (head, rest) = get_split::<4>(data).ok_or_else(Error::eof)?;
-        self.token_tape.alloc().init(BinaryToken::F32(head));
+        let (result, rest) = read_f32(data).map_err(|e| self.err_position(e, data))?;
+        self.token_tape.alloc().init(BinaryToken::F32(result));
        Ok(rest)
     }
 
     #[inline]
     fn parse_f64(&mut self, data: &'a [u8]) -> Result<&'a [u8], Error> {
-        let (head, rest) = get_split::<8>(data).ok_or_else(Error::eof)?;
-        self.token_tape.alloc().init(BinaryToken::F64(head));
+        let (result, rest) = read_f64(data).map_err(|e| self.err_position(e, data))?;
+        self.token_tape.alloc().init(BinaryToken::F64(result));
         Ok(rest)
     }
 
     #[inline]
     fn parse_bool(&mut self, data: &'a [u8]) -> Result<&'a [u8], Error> {
-        let val = data.first().map(|&x| x != 0).ok_or_else(Error::eof)?;
-        self.token_tape.alloc().init(BinaryToken::Bool(val));
-        Ok(&data[1..])
+        let (result, rest) = read_bool(data).map_err(|e| self.err_position(e, data))?;
+        self.token_tape.alloc().init(BinaryToken::Bool(result));
+        Ok(rest)
     }
 
     fn parse_rgb(&mut self, data: &'a [u8]) -> Result<&'a [u8], Error> {
-        let data = &data[2..];
-        let (data, r_tok) = self.parse_next_id(data)?;
-        let (r_data, data) = get_split::<4>(data).ok_or_else(Error::eof)?;
-        let r = u32::from_le_bytes(r_data);
-
-        let (data, g_tok) = self.parse_next_id(data)?;
-        let (g_data, data) = get_split::<4>(data).ok_or_else(Error::eof)?;
-        let g = u32::from_le_bytes(g_data);
-
-        let (data, b_tok) = self.parse_next_id(data)?;
-        let (b_data, data) = get_split::<4>(data).ok_or_else(Error::eof)?;
-        let b = u32::from_le_bytes(b_data);
-
-        if r_tok != U32 && g_tok != U32 && b_tok != U32 {
-            return Err(self.invalid_syntax("invalid rgb tokens", data));
-        }
-
-        let (data, next_tok) = self.parse_next_id(data)?;
-
-        let (data, a) = match next_tok {
-            U32 => {
-                let (a_data, data) = get_split::<4>(data).ok_or_else(Error::eof)?;
-                let a = u32::from_le_bytes(a_data);
-                let (data, end_tok) = self.parse_next_id(data)?;
-                if end_tok != END {
-                    return Err(self.invalid_syntax("expected end to follow rgb alpha", data));
-                }
-                (data, Some(a))
-            }
-            END => (data, None),
-            _ => return Err(self.invalid_syntax("invalid rgb end token", data)),
-        };
-
-        let val = Rgb { r, g, b, a };
-        self.token_tape.alloc().init(BinaryToken::Rgb(val));
-        Ok(data)
-    }
-
-    #[inline(always)]
-    fn parse_string_inner(&mut self, data: &'a [u8]) -> Result<(Scalar<'a>, &'a [u8]), Error> {
-        let (head, rest) = get_split::<2>(data).ok_or_else(Error::eof)?;
-        let text_len = usize::from(u16::from_le_bytes(head));
-        if text_len <= rest.len() {
-            let (text, rest) = rest.split_at(text_len);
-            let scalar = Scalar::new(text);
-            Ok((scalar, rest))
-        } else {
-            Err(Error::eof())
-        }
+        let (result, rest) = read_rgb(data).map_err(|e| self.err_position(e, data))?;
+        self.token_tape.alloc().init(BinaryToken::Rgb(result));
+        Ok(rest)
     }
 
     #[inline(always)]
     fn parse_quoted_string(&mut self, data: &'a [u8]) -> Result<&'a [u8], Error> {
-        let (scalar, rest) = self.parse_string_inner(data)?;
+        let (scalar, rest) = read_string(data).map_err(|e| self.err_position(e, data))?;
         self.token_tape.alloc().init(BinaryToken::Quoted(scalar));
         Ok(rest)
     }
 
     #[inline(always)]
     fn parse_unquoted_string(&mut self, data: &'a [u8]) -> Result<&'a [u8], Error> {
-        let (scalar, rest) = self.parse_string_inner(data)?;
+        let (scalar, rest) = read_string(data).map_err(|e| self.err_position(e, data))?;
         self.token_tape.alloc().init(BinaryToken::Unquoted(scalar));
         Ok(rest)
     }
@@ -294,6 +252,8 @@ impl<'a, 'b> ParserState<'a, 'b> {
     }
 
     fn parse(&mut self) -> Result<(), Error> {
+        use super::LexemeId as L;
+
         let mut data = self.data;
         let mut state = ParseState::Key;
@@ -320,25 +280,27 @@ impl<'a, 'b> ParserState<'a, 'b> {
             };
         }
 
-        'outer: while let Some((mut d, mut token_id)) = self.parse_next_id_opt(data) {
+        'outer: while let Some((mut d, token_id)) = self.parse_next_id_opt(data) {
+            let mut token_id = LexemeId(token_id);
+
            // This conditional is purely an optimization to parse an entire
            // `<key>=<value>` in one iteration of the loop, and can be removed
            // or ignored to ease understanding. See PR #111 for a breakdown on
            // field and value frequency.
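+            // As a sketch of that fast path: a token id key (eg: 0x2d82) is
+            // read, then '=', then a primitive value, all within a single loop
+            // iteration before falling through to the state machine below.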
            if ENABLE_OPTIMIZATION && state == ParseState::Key {
-                if token_id > UNQUOTED_STRING || token_id == 0xb {
+                if token_id > L::UNQUOTED || token_id == L(0xb) {
                     // 65-90% of keys are tokens
                     // 5% of these keys are id (0xb)
-                    if token_id != F64 && token_id != U64 {
-                        self.token_tape.alloc().init(BinaryToken::Token(token_id));
+                    if token_id != L::F64 && token_id != L::U64 {
+                        self.token_tape.alloc().init(BinaryToken::Token(token_id.0));
                         let (d2, token_id2) = self.parse_next_id(d)?;
-                        if token_id2 == EQUAL {
+                        if token_id2 == L::EQUAL {
                             let (d3, token_id3) = self.parse_next_id(d2)?;
-                            if token_id3 == I32 {
+                            if token_id3 == L::I32 {
                                 data = self.parse_i32(d3)?;
                                 continue;
-                            } else if token_id3 == OPEN {
+                            } else if token_id3 == L::OPEN {
                                 // We could be looking at a primitive array
                                 // so we should attempt to parse it in one go
                                 let ind = self.token_tape.len();
@@ -357,7 +319,7 @@ impl<'a, 'b> ParserState<'a, 'b> {
                                         let (nd2, x) = self.parse_next_id(nd)?;
                                         if x == $token {
                                             nd = self.$fn(nd2)?;
-                                        } else if x == END {
+                                        } else if x == L::CLOSE {
                                             data = nd2;
                                             let end_idx = self.token_tape.len();
                                             match unsafe {
@@ -392,20 +354,22 @@ impl<'a, 'b> ParserState<'a, 'b> {
                                 }
 
                                 // These three array types cover 99.6% of EU4 arrays
-                                if token_id4 == I32 {
-                                    parse_array_field!(parse_i32, I32);
-                                } else if token_id4 == QUOTED_STRING {
-                                    parse_array_field!(parse_quoted_string, QUOTED_STRING);
-                                } else if token_id4 == F32 {
-                                    parse_array_field!(parse_f32, F32);
-                                } else if (token_id4 > UNQUOTED_STRING
-                                    && token_id4 != F64
-                                    && token_id4 != U64)
-                                    || token_id4 == 0xb
+                                if token_id4 == L::I32 {
+                                    parse_array_field!(parse_i32, L::I32);
+                                } else if token_id4 == L::QUOTED {
+                                    parse_array_field!(parse_quoted_string, L::QUOTED);
+                                } else if token_id4 == L::F32 {
+                                    parse_array_field!(parse_f32, L::F32);
+                                } else if (token_id4 > L::UNQUOTED
+                                    && token_id4 != L::F64
+                                    && token_id4 != L::U64)
+                                    || token_id4 == L(0xb)
                                 {
-                                    self.token_tape.alloc().init(BinaryToken::Token(token_id4));
+                                    self.token_tape
+                                        .alloc()
+                                        .init(BinaryToken::Token(token_id4.0));
                                     let (d4, token_id4) = self.parse_next_id(d4)?;
-                                    if token_id4 == EQUAL {
+                                    if token_id4 == L::EQUAL {
                                         unsafe { self.set_parent_to_object(parent_ind) };
                                         state = ParseState::ObjectValue;
                                         (d, token_id) = self.parse_next_id(d4)?;
@@ -419,10 +383,10 @@ impl<'a, 'b> ParserState<'a, 'b> {
                                     token_id = token_id4;
                                     state = ParseState::OpenFirst;
                                 }
-                            } else if token_id3 == QUOTED_STRING {
+                            } else if token_id3 == L::QUOTED {
                                 data = self.parse_quoted_string(d3)?;
                                 continue;
-                            } else if token_id3 == F32 {
+                            } else if token_id3 == L::F32 {
                                 data = self.parse_f32(d3)?;
                                 continue;
                             } else {
@@ -436,19 +400,19 @@ impl<'a, 'b> ParserState<'a, 'b> {
                             state = ParseState::KeyValueSeparator;
                         }
                     }
-                } else if token_id == END {
+                } else if token_id == L::CLOSE {
                     push_end!();
                     data = d;
                     continue;
-                } else if token_id == QUOTED_STRING {
+                } else if token_id == L::QUOTED {
                     // over 20% of EU4 object keys are quoted strings and they
                     // nearly always are objects
                     let d2 = self.parse_quoted_string(d)?;
                     let (d3, token_id2) = self.parse_next_id(d2)?;
-                    if token_id2 == EQUAL {
+                    if token_id2 == L::EQUAL {
                         let (d4, token_id3) = self.parse_next_id(d3)?;
-                        if token_id3 == OPEN {
+                        if token_id3 == L::OPEN {
                             let ind = self.token_tape.len();
                             self.token_tape.alloc().init(BinaryToken::Array(parent_ind));
                             parent_ind = ind;
@@ -456,18 +420,18 @@ impl<'a, 'b> ParserState<'a, 'b> {
                             (d, token_id) = self.parse_next_id(d4)?;
 
                             // Expect an object that follows a quoted string to start with a token
-                            if token_id > UNQUOTED_STRING && token_id != F64 && token_id != U64 {
-                                self.token_tape.alloc().init(BinaryToken::Token(token_id));
+                            if token_id > L::UNQUOTED && token_id != L::F64 && token_id != L::U64 {
+                                self.token_tape.alloc().init(BinaryToken::Token(token_id.0));
                                 (d, token_id) = self.parse_next_id(d)?;
-                                if token_id == EQUAL {
+                                if token_id == L::EQUAL {
                                     unsafe { self.set_parent_to_object(parent_ind) };
                                     state = ParseState::ObjectValue;
                                     (d, token_id) = self.parse_next_id(d)?;
-                                    if token_id == BOOL {
+                                    if token_id == L::BOOL {
                                         data = self.parse_bool(d)?;
                                         state = ParseState::Key;
                                         continue;
-                                    } else if token_id == QUOTED_STRING {
+                                    } else if token_id == L::QUOTED {
                                         data = self.parse_quoted_string(d)?;
                                         state = ParseState::Key;
                                         continue;
@@ -486,15 +450,15 @@ impl<'a, 'b> ParserState<'a, 'b> {
                         token_id = token_id2;
                         state = ParseState::KeyValueSeparator;
                     }
-                } else if token_id == I32 {
+                } else if token_id == L::I32 {
                     // 8% of Vic3 and EU4 object keys are i32
                     // 96% of i32 keys have an i32 value
                     let d2 = self.parse_i32(d)?;
                     let (d3, token_id2) = self.parse_next_id(d2)?;
-                    if token_id2 == EQUAL {
+                    if token_id2 == L::EQUAL {
                         let (d4, token_id3) = self.parse_next_id(d3)?;
-                        if token_id3 == I32 {
+                        if token_id3 == L::I32 {
                             data = self.parse_i32(d4)?;
                             continue;
                         } else {
@@ -520,15 +484,15 @@ impl<'a, 'b> ParserState<'a, 'b> {
             }
 
             match token_id {
-                U32 => {
+                L::U32 => {
                     data = self.parse_u32(d)?;
                     state = Self::next_state(state);
                 }
-                U64 => {
+                L::U64 => {
                     data = self.parse_u64(d)?;
                     state = Self::next_state(state);
                 }
-                I32 => {
+                L::I32 => {
                     data = self.parse_i32(d)?;
                     state = Self::next_state(state);
 
@@ -536,9 +500,9 @@ impl<'a, 'b> ParserState<'a, 'b> {
                         let mut nd = data;
                        loop {
                            let (nd2, x) = self.parse_next_id(nd)?;
-                            if x == I32 {
+                            if x == L::I32 {
                                nd = self.parse_i32(nd2)?;
-                            } else if x == END {
+                            } else if x == L::CLOSE {
                                push_end!();
                                data = nd2;
                                break;
@@ -550,28 +514,28 @@ impl<'a, 'b> ParserState<'a, 'b> {
                        }
                    }
                }
-                BOOL => {
+                L::BOOL => {
                     data = self.parse_bool(d)?;
                     state = Self::next_state(state);
                 }
-                QUOTED_STRING => {
+                L::QUOTED => {
                     data = self.parse_quoted_string(d)?;
                     state = Self::next_state(state);
                 }
-                UNQUOTED_STRING => {
+                L::UNQUOTED => {
                     data = self.parse_unquoted_string(d)?;
                     state = Self::next_state(state);
                 }
-                F32 => {
+                L::F32 => {
                     data = self.parse_f32(d)?;
                     state = Self::next_state(state);
                 }
-                F64 => {
+                L::F64 => {
                     data = self.parse_f64(d)?;
                     state = Self::next_state(state);
                 }
-                OPEN => {
+                L::OPEN => {
                     if state != ParseState::Key {
                         let ind = self.token_tape.len();
                         self.token_tape.alloc().init(BinaryToken::Array(parent_ind));
@@ -585,12 +549,12 @@ impl<'a, 'b> ParserState<'a, 'b> {
                         // position eg: `a={b=c {} d=1}`. These occur in every
                         // EU4 save, even in 1.34.
                        match self.parse_next_id(d)? {
-                            (nd, END) => data = nd,
+                            (nd, L::CLOSE) => data = nd,
                             _ => return Err(self.empty_object_err(data)),
                         }
                     }
                 }
-                END => {
+                L::CLOSE => {
                     match state {
                         ParseState::KeyValueSeparator => {
                             // `a={b=c 10}`
@@ -608,7 +572,7 @@ impl<'a, 'b> ParserState<'a, 'b> {
                     push_end!();
                     data = d;
                 }
-                EQUAL => {
+                L::EQUAL => {
                     data = d;
                     if state == ParseState::KeyValueSeparator {
                         state = ParseState::ObjectValue;
@@ -657,17 +621,17 @@ impl<'a, 'b> ParserState<'a, 'b> {
                         return Err(self.equal_key_error(data));
                     }
                 }
-                RGB if state == ParseState::ObjectValue => {
+                L::RGB if state == ParseState::ObjectValue => {
                     data = self.parse_rgb(d)?;
                     state = ParseState::Key;
                 }
-                I64 => {
+                L::I64 => {
                     data = self.parse_i64(d)?;
                     state = Self::next_state(state);
                 }
                 x => {
                     data = d;
-                    self.token_tape.alloc().init(BinaryToken::Token(x));
+                    self.token_tape.alloc().init(BinaryToken::Token(x.0));
                     state = Self::next_state(state);
                 }
             }
@@ -718,6 +682,14 @@ impl<'a, 'b> ParserState<'a, 'b> {
         self.token_tape.alloc().init(stashed1);
     }
 
+    #[inline]
+    fn err_position(&self, err: LexError, data: &[u8]) -> Error {
+        match err {
+            LexError::Eof => Error::eof(),
+            LexError::InvalidRgb => Error::invalid_syntax("invalid rgb", self.offset(data)),
+        }
+    }
+
     #[inline(never)]
     #[cold]
     fn equal_key_error(&mut self, data: &[u8]) -> Error {
@@ -771,15 +743,6 @@ impl<'a, 'b> ParserState<'a, 'b> {
             offset: self.offset(data),
         })
     }
-
-    #[inline(never)]
-    #[cold]
-    fn invalid_syntax<T: Into<String>>(&self, msg: T, data: &[u8]) -> Error {
-        Error::new(ErrorKind::InvalidSyntax {
-            msg: msg.into(),
-            offset: self.offset(data),
-        })
-    }
 }
 
 /// Houses the tape of tokens that is extracted from binary data
@@ -1074,7 +1037,7 @@ mod tests {
         data.extend_from_slice(b"schools_initiated");
         data.extend_from_slice(&[0x01, 0x00, 0x0f, 0x00, 0x0b, 0x00]);
        data.extend_from_slice(b"1444.11.11\n");
-        data.extend_from_slice(&END.to_le_bytes());
+        data.extend_from_slice(&LexemeId::CLOSE.0.to_le_bytes());
         let tape = parse(&data[..]).unwrap();
         assert_eq!(
             tape.token_tape,
diff --git a/src/binary/tokens.rs b/src/binary/tokens.rs
deleted file mode 100644
index ebb37c6..0000000
--- a/src/binary/tokens.rs
+++ /dev/null
@@ -1,13 +0,0 @@
-pub(crate) const END: u16 = 0x0004;
-pub(crate) const OPEN: u16 = 0x0003;
-pub(crate) const EQUAL: u16 = 0x0001;
-pub(crate) const U32: u16 = 0x0014;
-pub(crate) const U64: u16 = 0x029c;
-pub(crate) const I32: u16 = 0x000c;
-pub(crate) const BOOL: u16 = 0x000e;
-pub(crate) const QUOTED_STRING: u16 = 0x000f;
-pub(crate) const UNQUOTED_STRING: u16 = 0x0017;
-pub(crate) const F32: u16 = 0x000d;
-pub(crate) const F64: u16 = 0x0167;
-pub(crate) const RGB: u16 = 0x0243;
-pub(crate) const I64: u16 = 0x0317;
diff --git a/src/buffer.rs b/src/buffer.rs
new file mode 100644
index 0000000..45aebea
--- /dev/null
+++ b/src/buffer.rs
@@ -0,0 +1,167 @@
+use crate::Scalar;
+use std::{io::Read, marker::PhantomData, ops::Range};
+
+#[derive(Debug)]
+pub struct BufferWindow {
+    pub buf: Box<[u8]>,
+
+    // start of window into buffer
+    pub start: *const u8,
+
+    // end of window into buffer
+    pub end: *const u8,
+
+    // number of consumed bytes from prior reads
+    pub prior_reads: usize,
+}
+
+pub enum BufferError {
+    Io(std::io::Error),
+    BufferFull,
+}
+
+impl BufferWindow {
+    #[inline]
+    pub fn from_slice(data: &[u8]) -> Self {
+        Self {
+            buf: Box::new([]),
+            start: data.as_ptr(),
+            end: data.as_ptr_range().end,
+            prior_reads: 0,
+        }
+    }
+
+    #[inline]
+    pub fn advance_to(&mut self, ptr: *const u8) {
+        debug_assert!((self.start..=self.end).contains(&ptr));
+        self.start = ptr;
+    }
+
+    #[inline]
+    pub fn advance(&mut self, amt: usize) {
+        let ptr = unsafe { self.start.add(amt) };
+        debug_assert!((self.start..=self.end).contains(&ptr));
+        self.start = ptr;
+    }
+
+    #[inline]
+    pub fn window(&self) -> &[u8] {
+        unsafe { std::slice::from_raw_parts(self.start, self.window_len()) }
+    }
+
+    #[inline]
+    pub fn window_len(&self) -> usize {
+        unsafe { self.end.offset_from(self.start) as usize }
+    }
+
+    #[inline]
+    pub fn position(&self) -> usize {
+        self.prior_reads + self.consumed_data()
+    }
+
+    #[inline]
+    pub fn consumed_data(&self) -> usize {
+        unsafe { self.start.offset_from(self.buf.as_ptr()) as usize }
+    }
+
+    #[inline]
+    pub fn get(&self, range: Range<*const u8>) -> Scalar {
+        debug_assert!(range.start >= self.buf.as_ptr_range().start);
+        debug_assert!(range.end <= self.buf.as_ptr_range().end);
+        let len = unsafe { range.end.offset_from(range.start) as usize };
+        let sl = unsafe { std::slice::from_raw_parts(range.start, len) };
+        Scalar::new(sl)
+    }
+
+    /// This seems similar to `BufRead::fill_buf`, but whereas the `BufRead`
+    /// will only call the underlying read if the buffer is currently empty,
+    /// this function will copy over the bytes that haven't been consumed to the
+    /// start.
+    #[inline]
+    pub fn fill_buf(&mut self, mut reader: impl Read) -> Result<usize, BufferError> {
+        // Copy over the unconsumed bytes to the start of the buffer
+        let carry_over = self.window_len();
+        if carry_over != 0 {
+            if carry_over >= self.buf.len() {
+                return Err(BufferError::BufferFull);
+            }
+            unsafe { self.start.copy_to(self.buf.as_mut_ptr(), carry_over) };
+        }
+
+        self.prior_reads += self.consumed_data();
+        self.start = self.buf.as_ptr();
+        self.end = unsafe { self.buf.as_ptr().add(carry_over) };
+
+        // Have the reader start filling in bytes after unconsumed bytes
+        match reader.read(&mut self.buf[carry_over..]) {
+            Ok(r) => {
+                self.end = unsafe { self.end.add(r) };
+                Ok(r)
+            }
+            Err(e) => Err(BufferError::Io(e)),
+        }
+    }
+}
+
+#[derive(Debug)]
+pub struct BufferWindowBuilder {
+    buffer: Option<Box<[u8]>>,
+    buffer_len: usize,
+}
+
+impl Default for BufferWindowBuilder {
+    fn default() -> Self {
+        // Default buffer size of 32 KiB, same size that flate2 uses.
+        let buffer_len = 32 * 1024;
+        Self {
+            buffer: None,
+            buffer_len,
+        }
+    }
+}
+
+impl BufferWindowBuilder {
+    #[inline]
+    pub fn buffer(mut self, val: Box<[u8]>) -> BufferWindowBuilder {
+        self.buffer = Some(val);
+        self
+    }
+
+    #[inline]
+    pub fn buffer_len(mut self, val: usize) -> BufferWindowBuilder {
+        self.buffer_len = val;
+        self
+    }
+
+    #[inline]
+    pub fn build(self) -> BufferWindow {
+        let init_len = self.buffer_len;
+        let buf = self
+            .buffer
+            .unwrap_or_else(|| vec![0; init_len].into_boxed_slice());
+        let start = buf.as_ptr_range().start;
+        let end = buf.as_ptr_range().start;
+        BufferWindow {
+            buf,
+            start,
+            end,
+            prior_reads: 0,
+        }
+    }
+}
+
+/// A no-op read implementation used for TokenReaders
+#[derive(Debug)]
+pub struct SliceReader<'a>(PhantomData<&'a [u8]>);
+
+impl<'a> SliceReader<'a> {
+    pub(crate) fn new(_data: &'a [u8]) -> Self {
+        SliceReader(PhantomData)
+    }
+}
+
+impl<'a> Read for SliceReader<'a> {
+    fn read(&mut self, _buf: &mut [u8]) -> std::io::Result<usize> {
+        Ok(0)
+    }
+}
diff --git a/src/errors.rs b/src/errors.rs
index 167c132..fd4f00b 100644
--- a/src/errors.rs
+++ b/src/errors.rs
@@ -1,4 +1,8 @@
-use crate::ScalarError;
+use crate::{
+    binary::{LexError, LexerError, ReaderError as BinReaderError},
+    text::ReaderError as TextReaderError,
+    ScalarError,
+};
 use std::fmt;
 
 /// An error that can occur when processing data
@@ -16,6 +20,17 @@ impl Error {
         Self::new(ErrorKind::Eof)
     }
 
+    #[cold]
+    pub(crate) fn invalid_syntax<T>(msg: T, position: usize) -> Error
+    where
+        T: Into<String>,
+    {
+        Self::new(ErrorKind::InvalidSyntax {
+            msg: msg.into(),
+            offset: position,
+        })
+    }
+
     /// Return the specific type of error
     pub fn kind(&self) -> &ErrorKind {
         &self.0
@@ -64,6 +79,10 @@ pub enum ErrorKind {
 
     /// An error occurred when performing IO.
     Io(std::io::Error),
+
+    /// The internal buffer does not have enough room to store data for the next
+    /// token
+    BufferFull,
 }
 
 impl ErrorKind {
@@ -103,6 +122,9 @@ impl std::fmt::Display for Error {
             ),
             ErrorKind::Deserialize(ref err) => write!(f, "deserialize error: {}", err),
             ErrorKind::Io(ref err) => write!(f, "io error: {}", err),
+            ErrorKind::BufferFull => {
+                write!(f, "max buffer size exceeded")
+            },
         }
     }
 }
@@ -113,6 +135,42 @@ impl From<ScalarError> for Error {
     }
 }
 
+impl From<LexerError> for Error {
+    fn from(value: LexerError) -> Self {
+        match value.kind() {
+            LexError::Eof => Error::eof(),
+            _ => Error::new(ErrorKind::InvalidSyntax {
+                msg: format!("{}", value.kind()),
+                offset: value.position(),
+            }),
+        }
+    }
+}
+
+impl From<BinReaderError> for Error {
+    fn from(value: BinReaderError) -> Self {
+        let pos = value.position();
+        match value.into_kind() {
+            crate::binary::ReaderErrorKind::Read(x) => Error::new(ErrorKind::Io(x)),
+            crate::binary::ReaderErrorKind::BufferFull => Error::new(ErrorKind::BufferFull),
+            crate::binary::ReaderErrorKind::Lexer(LexError::Eof) => Error::eof(),
+            crate::binary::ReaderErrorKind::Lexer(LexError::InvalidRgb) => {
+                Error::invalid_syntax("invalid rgb", pos)
+            }
+        }
+    }
+}
+
+impl From<TextReaderError> for Error {
+    fn from(value: TextReaderError) -> Self {
+        match value.into_kind() {
+            crate::text::ReaderErrorKind::Read(x) => Error::new(ErrorKind::Io(x)),
+            crate::text::ReaderErrorKind::BufferFull => Error::new(ErrorKind::BufferFull),
+            crate::text::ReaderErrorKind::Eof => Error::eof(),
+        }
+    }
+}
+
 /// A Serde deserialization error.
 #[derive(Debug, PartialEq)]
 pub struct DeserializeError {
diff --git a/src/lib.rs b/src/lib.rs
index 9e81656..258f3d8 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -14,7 +14,7 @@ Converters](https://github.com/ParadoxGameConverters) and
 ## Features
 
 - ✔ Versatile: Handle both plaintext and binary encoded data
-- ✔ Fast: Parse data at 1 GB/s
+- ✔ Fast: Parse data at over 1 GB/s
 - ✔ Small: Compile with zero dependencies
 - ✔ Safe: Extensively fuzzed against potential malicious input
 - ✔ Ergonomic: Use [serde](https://serde.rs/derive.html)-like macros to have parsing logic automatically implemented
@@ -22,7 +22,9 @@ Converters](https://github.com/ParadoxGameConverters) and
 
 ## Quick Start
 
-Below is a demonstration on parsing plaintext data using jomini tools.
+Below is a demonstration of deserializing plaintext data using serde.
+Several additional serde-like attributes are used to reconcile the serde
+data model with the structure of these files.
 
 ```rust
 # #[cfg(feature = "derive")] {
@@ -72,9 +74,9 @@ assert_eq!(actual, expected);
 # Ok::<(), Box<dyn std::error::Error>>(())
 ```
 
-## Binary Parsing
+## Binary Deserialization
 
-Parsing data encoded in the binary format is done in a similar fashion but with a couple extra steps for the caller to supply:
+Deserializing data encoded in the binary format is done in a similar fashion but with a couple extra steps for the caller to supply:
 
 - How text should be decoded (typically Windows-1252 or UTF-8)
 - How rational (floating point) numbers are decoded
@@ -86,7 +88,7 @@ Below is an example that defines a sample binary format and uses a hashmap token lookup.
 
 ```rust
 # #[cfg(feature = "derive")] {
-use jomini::{BinaryDeserializer, Encoding, JominiDeserialize, Windows1252Encoding};
+use jomini::{Encoding, JominiDeserialize, Windows1252Encoding, binary::BinaryFlavor};
 use std::{borrow::Cow, collections::HashMap};
 
 #[derive(JominiDeserialize, PartialEq, Debug)]
@@ -97,7 +99,7 @@ struct MyStruct {
 #[derive(Debug, Default)]
 pub struct BinaryTestFlavor;
 
-impl jomini::binary::BinaryFlavor for BinaryTestFlavor {
+impl BinaryFlavor for BinaryTestFlavor {
     fn visit_f32(&self, data: [u8; 4]) -> f32 {
         f32::from_le_bytes(data)
     }
@@ -118,8 +120,7 @@ let data = [ 0x82, 0x2d, 0x01, 0x00, 0x0f, 0x00, 0x03, 0x00, 0x45, 0x4e, 0x47 ];
 let mut map = HashMap::new();
 map.insert(0x2d82, "field1");
 
-let actual: MyStruct = BinaryDeserializer::builder_flavor(BinaryTestFlavor)
-    .deserialize_slice(&data[..], &map)?;
+let actual: MyStruct = BinaryTestFlavor.deserialize_slice(&data[..], &map)?;
 assert_eq!(actual, MyStruct { field1: "ENG".to_string() });
 # }
 # Ok::<(), Box<dyn std::error::Error>>(())
 ```
@@ -130,120 +131,14 @@ without any duplication.
 
 One can configure the behavior when a token is unknown (ie: fail immediately or try to continue).
 
-### Ondemand Deserialization
-
-The ondemand deserializer is a one-shot deserialization mode is often faster
-and more memory efficient as it does not parse the input into an intermediate
-tape, and instead deserializes right from the input.
- -It is instantiated and used similarly to `BinaryDeserializer` - -```rust -# #[cfg(feature = "derive")] { -use jomini::OndemandBinaryDeserializer; -# use jomini::{Encoding, JominiDeserialize, Windows1252Encoding}; -# use std::{borrow::Cow, collections::HashMap}; -# -# #[derive(JominiDeserialize, PartialEq, Debug)] -# struct MyStruct { -# field1: String, -# } -# -# #[derive(Debug, Default)] -# pub struct BinaryTestFlavor; -# -# impl jomini::binary::BinaryFlavor for BinaryTestFlavor { -# fn visit_f32(&self, data: [u8; 4]) -> f32 { -# f32::from_le_bytes(data) -# } -# -# fn visit_f64(&self, data: [u8; 8]) -> f64 { -# f64::from_le_bytes(data) -# } -# } -# -# impl Encoding for BinaryTestFlavor { -# fn decode<'a>(&self, data: &'a [u8]) -> Cow<'a, str> { -# Windows1252Encoding::decode(data) -# } -# } -# -# let data = [ 0x82, 0x2d, 0x01, 0x00, 0x0f, 0x00, 0x03, 0x00, 0x45, 0x4e, 0x47 ]; -# -# let mut map = HashMap::new(); -# map.insert(0x2d82, "field1"); -// [...snip code from previous example...] - -let actual: MyStruct = OndemandBinaryDeserializer::builder_flavor(BinaryTestFlavor) - .deserialize_slice(&data[..], &map)?; -assert_eq!(actual, MyStruct { field1: "ENG".to_string() }); -# } -# Ok::<(), Box>(()) -``` - -### Direct identifier deserialization with `token` attribute - -There may be some performance loss during binary deserialization as -tokens are resolved to strings via a `TokenResolver` and then matched against the -string representations of a struct's fields. - -We can fix this issue by directly encoding the expected token value into the struct: - -```rust -# #[cfg(feature = "derive")] { -# use jomini::{Encoding, JominiDeserialize, Windows1252Encoding, BinaryDeserializer}; -# use std::{borrow::Cow, collections::HashMap}; -# -# #[derive(Debug, Default)] -# pub struct BinaryTestFlavor; -# -# impl jomini::binary::BinaryFlavor for BinaryTestFlavor { -# fn visit_f32(&self, data: [u8; 4]) -> f32 { -# f32::from_le_bytes(data) -# } -# -# fn visit_f64(&self, data: [u8; 8]) -> f64 { -# f64::from_le_bytes(data) -# } -# } -# -# impl Encoding for BinaryTestFlavor { -# fn decode<'a>(&self, data: &'a [u8]) -> Cow<'a, str> { -# Windows1252Encoding::decode(data) -# } -# } -# -# let data = [ 0x82, 0x2d, 0x01, 0x00, 0x0f, 0x00, 0x03, 0x00, 0x45, 0x4e, 0x47 ]; -# -#[derive(JominiDeserialize, PartialEq, Debug)] -struct MyStruct { - #[jomini(token = 0x2d82)] - field1: String, -} - -// Empty token to string resolver -let map = HashMap::::new(); - -let actual: MyStruct = BinaryDeserializer::builder_flavor(BinaryTestFlavor) - .deserialize_slice(&data[..], &map)?; -assert_eq!(actual, MyStruct { field1: "ENG".to_string() }); -# } -# Ok::<(), Box>(()) -``` - -Couple notes: - -- This does not obviate need for the token to string resolver as tokens may be used as values. -- If the `token` attribute is specified on one field on a struct, it must be specified on all fields of that struct. - ## Caveats -Caller is responsible for: +Before calling any Jomini API, callers are expected to: -- Determining the correct format (text or binary) ahead of time -- Stripping off any header that may be present (eg: `EU4txt` / `EU4bin`) -- Providing the token resolver for the binary format -- Providing the conversion to reconcile how, for example, a date may be encoded as an integer in +- Determine the correct format (text or binary) ahead of time. 
+- Strip off any header that may be present (eg: `EU4txt` / `EU4bin`)
+- Provide the token resolver for the binary format
+- Provide the conversion to reconcile how, for example, a date may be encoded as an integer in
 the binary format, but as a string when in plaintext.
 
 ## The Mid-level API
@@ -264,6 +159,8 @@ for (key, _op, value) in reader.fields() {
 }
 ```
 
+For an even lower level of parsing, see the respective [binary] and [text] module documentation.
+
 */
 #![cfg_attr(
     feature = "json",
@@ -287,28 +184,6 @@ assert_eq!(actual, r#"{"foo":"bar"}"#);
 "##
 )]
 /*!
-## One Level Lower
-
-At the lowest layer, one can interact with the raw data directly via `TextTape`
-and `BinaryTape`.
-
-```rust
-use jomini::{TextTape, TextToken, Scalar};
-
-let data = b"foo=bar";
-
-assert_eq!(
-    TextTape::from_slice(&data[..])?.tokens(),
-    &[
-        TextToken::Unquoted(Scalar::new(b"foo")),
-        TextToken::Unquoted(Scalar::new(b"bar")),
-    ]
-);
-# Ok::<(), Box<dyn std::error::Error>>(())
-```
-
-If one will only use `TextTape` and `BinaryTape` then `jomini` can be compiled without default
-features, resulting in a build without dependencies.
 
 ## Write API
@@ -349,6 +224,7 @@ assert_eq!(&out, b"hello=world\nfoo=bar");
 */
 #![warn(missing_docs)]
 pub mod binary;
+pub(crate) mod buffer;
 pub mod common;
 mod copyless;
 mod data;
@@ -362,17 +238,17 @@ mod scalar;
 pub mod text;
 pub(crate) mod util;
 
+#[doc(inline)]
 pub use self::binary::{BinaryTape, BinaryToken};
+pub use self::buffer::SliceReader;
 pub use self::encoding::*;
 pub use self::errors::*;
 pub use self::scalar::{Scalar, ScalarError};
+#[doc(inline)]
 pub use self::text::{TextTape, TextToken, TextWriter, TextWriterBuilder};
 
 #[cfg(feature = "derive")]
 #[doc(inline)]
-pub use self::{
-    binary::de::{BinaryDeserializer, OndemandBinaryDeserializer},
-    text::de::TextDeserializer,
-};
+pub use self::{binary::de::BinaryDeserializer, text::de::TextDeserializer};
 
 #[cfg(feature = "derive")]
 pub use jomini_derive::*;
diff --git a/src/text/de.rs b/src/text/de.rs
index 1365495..6ffdea3 100644
--- a/src/text/de.rs
+++ b/src/text/de.rs
@@ -1,13 +1,14 @@
-use super::reader::ValuesIter;
+use super::{dom::ValuesIter, reader::Token, TokenReader};
 use crate::{
     text::{ArrayReader, FieldsIter, ObjectReader, Operator, Reader, ScalarReader, ValueReader},
     DeserializeError, DeserializeErrorKind, Encoding, Error, TextTape, TextToken, Utf8Encoding,
     Windows1252Encoding,
 };
-use serde::de::{self, Deserialize, DeserializeSeed, Visitor};
+use serde::de::{self, Deserialize, DeserializeOwned, DeserializeSeed, Visitor};
 use std::{
     borrow::Cow,
     fmt::{self, Debug},
+    io::Read,
 };
 
 /// Represents the field value that contains an operator
@@ -122,12 +123,600 @@ where
     TextDeserializer::from_windows1252_slice(data)?.deserialize()
 }
 
+/// (**Experimental**) Create a Windows1252 text value from a reader
+///
+/// Considered experimental as it uses a [TokenReader] under the hood, which
+/// uses a different parsing routine geared toward save files.
+pub fn from_windows1252_reader<T, R>(reader: R) -> Result<T, Error>
+where
+    T: DeserializeOwned,
+    R: Read,
+{
+    TextDeserializer::from_windows1252_reader(TokenReader::new(reader)).deserialize()
+}
+
+/// Convenience method for deserializing streaming utf8 data into a Rust value
+pub fn from_utf8_reader<T, R>(reader: R) -> Result<T, Error>
+where
+    T: DeserializeOwned,
+    R: Read,
+{
+    TextDeserializer::from_utf8_reader(TokenReader::new(reader)).deserialize()
+}
+
 /// Convenience method for parsing the given text data and deserializing as utf8 encoded.
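+///
+/// A minimal doctest sketch (the `Model` struct here is illustrative):
+///
+/// ```
+/// use serde::Deserialize;
+///
+/// #[derive(Deserialize, Debug, PartialEq)]
+/// struct Model {
+///     human: bool,
+/// }
+///
+/// let actual: Model = jomini::text::de::from_utf8_slice(b"human=yes")?;
+/// assert_eq!(actual, Model { human: true });
+/// # Ok::<(), Box<dyn std::error::Error>>(())
+/// ```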
pub fn from_utf8_slice<'a, T>(data: &'a [u8]) -> Result<T, Error>
 where
-    T: Deserialize<'a>,
+    T: Deserialize<'a>,
+{
+    TextDeserializer::from_utf8_slice(data)?.deserialize()
+}
+
+/// A serde deserializer over streaming data
+pub struct TextReaderDeserializer<R, E> {
+    reader: TokenReader<R>,
+    encoding: E,
+}
+
+impl<R: Read, E: Encoding> TextReaderDeserializer<R, E> {
+    /// Deserialize into provided type
+    pub fn deserialize<T>(&mut self) -> Result<T, Error>
+    where
+        T: DeserializeOwned,
+    {
+        T::deserialize(self)
+    }
+}
+
+impl<'de, R: Read, E: Encoding> de::Deserializer<'de> for &'_ mut TextReaderDeserializer<R, E> {
+    type Error = Error;
+
+    fn deserialize_any<V>(self, _visitor: V) -> Result<V::Value, Self::Error>
+    where
+        V: Visitor<'de>,
+    {
+        Err(Error::from(DeserializeError {
+            kind: DeserializeErrorKind::Unsupported(String::from(
+                "root deserializer can only work with key value pairs",
+            )),
+        }))
+    }
+
+    fn deserialize_map<V>(self, visitor: V) -> Result<V::Value, Self::Error>
+    where
+        V: Visitor<'de>,
+    {
+        visitor.visit_map(TextReaderMap::new(self, true))
+    }
+
+    fn deserialize_struct<V>(
+        self,
+        _name: &'static str,
+        _fields: &'static [&'static str],
+        visitor: V,
+    ) -> Result<V::Value, Self::Error>
+    where
+        V: Visitor<'de>,
+    {
+        self.deserialize_map(visitor)
+    }
+
+    serde::forward_to_deserialize_any! {
+        bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string
+        bytes byte_buf option unit unit_struct newtype_struct seq tuple
+        tuple_struct enum ignored_any identifier
+    }
+}
+
+struct TextReaderMap<'a, R, E> {
+    de: &'a mut TextReaderDeserializer<R, E>,
+    root: bool,
+}
+
+impl<'a, R, E> TextReaderMap<'a, R, E> {
+    fn new(de: &'a mut TextReaderDeserializer<R, E>, root: bool) -> Self {
+        TextReaderMap { de, root }
+    }
+}
+
+impl<'de, 'a, R: Read, E: Encoding> de::MapAccess<'de> for TextReaderMap<'a, R, E> {
+    type Error = Error;
+
+    #[inline]
+    fn next_key_seed<K>(&mut self, seed: K) -> Result<Option<K::Value>, Self::Error>
+    where
+        K: DeserializeSeed<'de>,
+    {
+        // Borrow checker workaround: create an unchecked reborrow of the
+        // deserializer so a token borrowed from its reader can be handed to
+        // the seed together with the deserializer itself.
+        let de = unsafe { &mut *(self.de as *mut _) };
+        loop {
+            match self.de.reader.next() {
+                Ok(Some(Token::Close)) => return Ok(None),
+                Ok(Some(Token::Open)) => {
+                    let _ = self.de.reader.read()?;
+                }
+                Ok(Some(token)) => {
+                    return seed
+                        .deserialize(TextReaderTokenDeserializer::new(de, token))
+                        .map(Some)
+                }
+                Ok(None) if self.root => return Ok(None),
+                Ok(None) => return Err(self.de.reader.eof_error().into()),
+                Err(e) => return Err(e.into()),
+            }
+        }
+    }
+
+    #[inline]
+    fn next_value_seed<V>(&mut self, seed: V) -> Result<V::Value, Self::Error>
+    where
+        V: DeserializeSeed<'de>,
+    {
+        let de = unsafe { &mut *(self.de as *mut _) };
+        let token = self.de.reader.read()?;
+        let deser = if let Token::Operator(op) = token {
+            let new_token = self.de.reader.read()?;
+            let mut deser = TextReaderTokenDeserializer::new(de, new_token);
+            deser.op = op;
+            deser
+        } else {
+            TextReaderTokenDeserializer::new(de, token)
+        };
+
+        seed.deserialize(deser)
+    }
+}
+
+struct TextReaderTokenDeserializer<'a, R, E> {
+    de: &'a mut TextReaderDeserializer<R, E>,
+    token: Token<'a>,
+    op: Operator,
+}
+
+impl<'a, R, E> TextReaderTokenDeserializer<'a, R, E> {
+    fn new(de: &'a mut TextReaderDeserializer<R, E>, token: Token<'a>) -> Self {
+        Self {
+            de,
+            token,
+            op: Operator::Equal,
+        }
+    }
+}
+
+impl<'a, 'de: 'a, R: Read, E: Encoding> de::Deserializer<'de>
+    for TextReaderTokenDeserializer<'a, R, E>
+{
+    type Error = Error;
+
+    fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error>
+    where
+        V: Visitor<'de>,
+    {
+        match self.token {
+            Token::Open => visitor.visit_seq(TextReaderSeq::new(self.de)),
+            Token::Close => Err(Error::invalid_syntax(
+                "did not expect end",
+                self.de.reader.position(),
+            )),
+            Token::Operator(x) =>
visitor.visit_str(x.symbol()), + Token::Unquoted(s) | Token::Quoted(s) => match self.de.encoding.decode(s.as_bytes()) { + Cow::Borrowed(x) => visitor.visit_str(x), + Cow::Owned(x) => visitor.visit_string(x), + }, + } + } + + fn deserialize_bool(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + match self.token.as_scalar().and_then(|x| x.to_bool().ok()) { + Some(x) => visitor.visit_bool(x), + None => self.deserialize_any(visitor), + } + } + + fn deserialize_i8(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_i64(visitor) + } + + fn deserialize_i16(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_i64(visitor) + } + + fn deserialize_i32(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_i64(visitor) + } + + fn deserialize_i64(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + match self.token.as_scalar().and_then(|x| x.to_i64().ok()) { + Some(x) => visitor.visit_i64(x), + None => self.deserialize_any(visitor), + } + } + + fn deserialize_u8(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_u64(visitor) + } + + fn deserialize_u16(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_u64(visitor) + } + + fn deserialize_u32(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_u64(visitor) + } + + fn deserialize_u64(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + match self.token.as_scalar().and_then(|x| x.to_u64().ok()) { + Some(x) => visitor.visit_u64(x), + None => self.deserialize_any(visitor), + } + } + + fn deserialize_f32(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_f64(visitor) + } + + fn deserialize_f64(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + match self.token.as_scalar().and_then(|x| x.to_f64().ok()) { + Some(x) => visitor.visit_f64(x), + None => self.deserialize_any(visitor), + } + } + + fn deserialize_char(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_any(visitor) + } + + fn deserialize_str(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if let Some(s) = self.token.as_scalar() { + match self.de.encoding.decode(s.as_bytes()) { + Cow::Borrowed(x) => visitor.visit_str(x), + Cow::Owned(x) => visitor.visit_string(x), + } + } else { + self.deserialize_any(visitor) + } + } + + fn deserialize_string(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_str(visitor) + } + + fn deserialize_bytes(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + match self.token.as_scalar() { + Some(s) => visitor.visit_bytes(s.as_bytes()), + None => self.deserialize_any(visitor), + } + } + + fn deserialize_byte_buf(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_bytes(visitor) + } + + fn deserialize_option(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + visitor.visit_some(self) + } + + fn deserialize_unit(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_ignored_any(visitor) + } + + fn deserialize_unit_struct( + self, + _name: &'static str, + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + self.deserialize_ignored_any(visitor) + } + + fn deserialize_newtype_struct( + self, + _name: &'static str, + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + visitor.visit_newtype_struct(self) + } + + fn deserialize_seq(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + let mut seq = 
TextReaderSeq::new(self.de); + let result = visitor.visit_seq(&mut seq)?; + if !seq.hit_end { + // For when we are deserializing an array that doesn't read + // the closing token + if !matches!(self.de.reader.read()?, Token::Close) { + return Err(Error::invalid_syntax( + "Expected sequence to be terminated with an end token", + self.de.reader.position(), + )); + } + } + Ok(result) + } + + fn deserialize_tuple(self, _len: usize, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_seq(visitor) + } + + fn deserialize_tuple_struct( + self, + _name: &'static str, + _len: usize, + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + self.deserialize_seq(visitor) + } + + fn deserialize_map(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if matches!(self.token, Token::Open) { + visitor.visit_map(TextReaderMap::new(self.de, false)) + } else { + self.deserialize_any(visitor) + } + } + + fn deserialize_struct( + self, + name: &'static str, + _fields: &'static [&'static str], + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + if name == "_internal_jomini_property" { + visitor.visit_map(PropertyReaderMap { + de: self.de, + token: self.token, + op: self.op, + state: 0, + }) + } else { + self.deserialize_map(visitor) + } + } + + fn deserialize_enum( + self, + _name: &'static str, + _variants: &'static [&'static str], + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + visitor.visit_enum(TextReaderEnum::new(self.de, self.token)) + } + + fn deserialize_identifier(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_str(visitor) + } + + fn deserialize_ignored_any(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + match self.token { + Token::Open => self.de.reader.skip_container()?, + Token::Unquoted(_) => self.de.reader.skip_unquoted_value()?, + _ => {} + } + visitor.visit_unit() + } +} + +struct TextReaderSeq<'a, R, E> { + de: &'a mut TextReaderDeserializer, + hit_end: bool, +} + +impl<'a, R, E> TextReaderSeq<'a, R, E> { + fn new(de: &'a mut TextReaderDeserializer) -> Self { + TextReaderSeq { de, hit_end: false } + } +} + +impl<'de, 'a, R, E> de::SeqAccess<'de> for TextReaderSeq<'a, R, E> +where + R: Read, + E: Encoding, { - TextDeserializer::from_utf8_slice(data)?.deserialize() + type Error = Error; + + fn next_element_seed(&mut self, seed: T) -> Result, Self::Error> + where + T: DeserializeSeed<'de>, + { + let de = unsafe { &mut *(self.de as *mut _) }; + match self.de.reader.read()? 
{ + Token::Close => { + self.hit_end = true; + Ok(None) + } + token => seed + .deserialize(TextReaderTokenDeserializer::new(de, token)) + .map(Some), + } + } +} + +struct TextReaderEnum<'a, R, E> { + de: &'a mut TextReaderDeserializer, + token: Token<'a>, +} + +impl<'a, R, E> TextReaderEnum<'a, R, E> { + fn new(de: &'a mut TextReaderDeserializer, token: Token<'a>) -> Self { + TextReaderEnum { de, token } + } +} + +impl<'de, 'a, R: Read, E: Encoding> de::EnumAccess<'de> for TextReaderEnum<'a, R, E> { + type Error = Error; + type Variant = Self; + + fn variant_seed(self, seed: V) -> Result<(V::Value, Self), Self::Error> + where + V: de::DeserializeSeed<'de>, + { + let variant = seed.deserialize(TextReaderTokenDeserializer::new(self.de, self.token))?; + Ok((variant, self)) + } +} + +impl<'de, 'a, R: Read, E: Encoding> de::VariantAccess<'de> for TextReaderEnum<'a, R, E> { + type Error = Error; + + fn unit_variant(self) -> Result<(), Self::Error> { + Ok(()) + } + + fn newtype_variant_seed(self, _seed: T) -> Result + where + T: DeserializeSeed<'de>, + { + Err(Error::from(DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from( + "unsupported enum deserialization. Please file issue", + )), + })) + } + + fn tuple_variant(self, _len: usize, _visitor: V) -> Result + where + V: Visitor<'de>, + { + Err(Error::from(DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from( + "unsupported enum deserialization. Please file issue", + )), + })) + } + + fn struct_variant( + self, + _fields: &'static [&'static str], + _visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + Err(Error::from(DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from( + "unsupported enum deserialization. Please file issue", + )), + })) + } +} + +struct PropertyReaderMap<'a, R, E> { + de: &'a mut TextReaderDeserializer, + op: Operator, + token: Token<'a>, + state: usize, +} + +impl<'a, 'de, R, E> de::MapAccess<'de> for PropertyReaderMap<'a, R, E> +where + E: Encoding, + R: Read, +{ + type Error = Error; + + fn next_key_seed(&mut self, seed: K) -> Result, Self::Error> + where + K: DeserializeSeed<'de>, + { + match self.state { + 0 => seed.deserialize(StaticDeserializer("operator")).map(Some), + 1 => seed.deserialize(StaticDeserializer("value")).map(Some), + _ => Ok(None), + } + } + + fn next_value_seed(&mut self, seed: V) -> Result + where + V: DeserializeSeed<'de>, + { + self.state += 1; + if self.state == 1 { + seed.deserialize(OperatorDeserializer(self.op)) + } else { + seed.deserialize(TextReaderTokenDeserializer::new(self.de, self.token)) + } + } } /// A structure to deserialize text data into Rust values. @@ -192,6 +781,37 @@ enum TextDeserializerKind<'a, 'b, E> { Reader { reader: &'b ObjectReader<'a, 'b, E> }, } +impl TextDeserializer<'_, '_, Windows1252Encoding> { + /// (**Experimental**) Create a Windows1252 text deserializer over a reader + /// + /// Considered experimental as it uses a [TokenReader] under the hood, which + /// uses a different parsing routine geared toward save files. 
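+    ///
+    /// A minimal sketch of streaming from a [std::io::Read] source; the file
+    /// name and `Model` struct are illustrative:
+    ///
+    /// ```no_run
+    /// use jomini::{text::TokenReader, TextDeserializer};
+    /// use serde::Deserialize;
+    ///
+    /// #[derive(Deserialize)]
+    /// struct Model {
+    ///     human: bool,
+    /// }
+    ///
+    /// let file = std::fs::File::open("gamestate.txt")?;
+    /// let mut de = TextDeserializer::from_windows1252_reader(TokenReader::new(file));
+    /// let model: Model = de.deserialize()?;
+    /// # Ok::<(), Box<dyn std::error::Error>>(())
+    /// ```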
+    pub fn from_windows1252_reader<R>(
+        reader: TokenReader<R>,
+    ) -> TextReaderDeserializer<R, Windows1252Encoding>
+    where
+        R: Read,
+    {
+        TextReaderDeserializer {
+            reader,
+            encoding: Windows1252Encoding,
+        }
+    }
+}
+
+impl TextDeserializer<'_, '_, Utf8Encoding> {
+    /// Create a UTF8 text deserializer over a reader
+    pub fn from_utf8_reader<R>(reader: TokenReader<R>) -> TextReaderDeserializer<R, Utf8Encoding>
+    where
+        R: Read,
+    {
+        TextReaderDeserializer {
+            reader,
+            encoding: Utf8Encoding,
+        }
+    }
+}
+
 impl<'a, 'b> TextDeserializer<'a, 'b, Windows1252Encoding> {
     /// Convenience method for parsing the given text data and deserializing as windows1252 encoded.
     pub fn from_windows1252_slice(
@@ -1074,12 +1694,12 @@ impl<'de> de::Deserializer<'de> for OperatorDeserializer {
 
 #[cfg(test)]
 mod tests {
-    use crate::common::{Date, DateHour, UniformDate};
-
     use super::*;
+    use crate::common::{Date, DateHour, UniformDate};
     use jomini_derive::JominiDeserialize;
+    use rstest::rstest;
     use serde::{
-        de::{self, Deserializer},
+        de::{self, DeserializeOwned, Deserializer},
         Deserialize,
     };
     use std::fmt;
@@ -1092,6 +1712,24 @@ mod tests {
         Ok(super::from_windows1252_slice(data)?)
     }
 
+    fn from_owned<T>(data: &[u8]) -> T
+    where
+        T: DeserializeOwned + PartialEq + Debug,
+    {
+        let tape = TextTape::from_slice(data).unwrap();
+        let x1: T = TextDeserializer::from_windows1252_tape(&tape)
+            .deserialize()
+            .unwrap();
+        let reader = TokenReader::new(data);
+        let mut des = TextReaderDeserializer {
+            reader,
+            encoding: Windows1252Encoding,
+        };
+        let x2 = T::deserialize(&mut des).unwrap();
+        assert_eq!(x1, x2);
+        x1
+    }
+
     #[test]
     fn test_single_field() {
         let data = b"field1=ENG";
@@ -1101,7 +1739,7 @@ mod tests {
             field1: String,
         }
 
-        let actual: MyStruct = from_slice(&data[..]).unwrap();
+        let actual: MyStruct = from_owned(&data[..]);
         assert_eq!(
             actual,
             MyStruct {
@@ -1150,7 +1788,7 @@ mod tests {
             name: String,
         }
 
-        let actual: MyStruct = from_slice(&data[..]).unwrap();
+        let actual: MyStruct = from_owned(&data[..]);
         assert_eq!(
             actual,
             MyStruct {
@@ -1168,7 +1806,7 @@ mod tests {
             field1: bool,
         }
 
-        let actual: MyStruct = from_slice(&data[..]).unwrap();
+        let actual: MyStruct = from_owned(&data[..]);
         assert_eq!(actual, MyStruct { field1: false });
     }
 
@@ -1181,7 +1819,7 @@ mod tests {
             field1: bool,
         }
 
-        let actual: MyStruct = from_slice(&data[..]).unwrap();
+        let actual: MyStruct = from_owned(&data[..]);
         assert_eq!(actual, MyStruct { field1: true });
     }
 
@@ -1194,7 +1832,7 @@ mod tests {
             field1: u64,
         }
 
-        let actual: MyStruct = from_slice(&data[..]).unwrap();
+        let actual: MyStruct = from_owned(&data[..]);
         assert_eq!(actual, MyStruct { field1: 1000 });
     }
 
@@ -1207,7 +1845,7 @@ mod tests {
             field1: u32,
         }
 
-        let actual: MyStruct = from_slice(&data[..]).unwrap();
+        let actual: MyStruct = from_owned(&data[..]);
         assert_eq!(actual, MyStruct { field1: 1000 });
     }
 
@@ -1220,7 +1858,7 @@ mod tests {
             field1: u8,
         }
 
-        let actual: MyStruct = from_slice(&data[..]).unwrap();
+        let actual: MyStruct = from_owned(&data[..]);
         assert_eq!(actual, MyStruct { field1: 100 });
     }
 
@@ -1233,7 +1871,7 @@ mod tests {
            field1: u16,
         }
 
-        let actual: MyStruct = from_slice(&data[..]).unwrap();
+        let actual: MyStruct = from_owned(&data[..]);
         assert_eq!(actual, MyStruct { field1: 1000 });
     }
 
@@ -1246,7 +1884,7 @@ mod tests {
             field1: i8,
         }
 
-        let actual: MyStruct = from_slice(&data[..]).unwrap();
+        let actual: MyStruct = from_owned(&data[..]);
         assert_eq!(actual, MyStruct { field1: -100 });
     }
 
@@ -1259,7 +1897,7 @@ mod tests {
             field1: i16,
         }
 
-        let actual: MyStruct = from_slice(&data[..]).unwrap();
+        let actual:
MyStruct = from_owned(&data[..]); assert_eq!(actual, MyStruct { field1: -1000 }); } @@ -1272,7 +1910,7 @@ mod tests { field1: i32, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!(actual, MyStruct { field1: -1000 }); } @@ -1285,7 +1923,7 @@ mod tests { field1: i64, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!(actual, MyStruct { field1: -1000 }); } @@ -1298,7 +1936,7 @@ mod tests { field1: f32, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!(actual, MyStruct { field1: -100.535 }); } @@ -1311,7 +1949,7 @@ mod tests { field1: f64, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!(actual, MyStruct { field1: -100.535 }); } @@ -1325,7 +1963,7 @@ mod tests { field2: bool, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1343,7 +1981,7 @@ mod tests { dlc_enabled: Vec, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1376,7 +2014,7 @@ mod tests { name: Option, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1400,7 +2038,7 @@ mod tests { discovered_by: Vec, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1423,7 +2061,7 @@ mod tests { id: u32, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1441,7 +2079,25 @@ mod tests { c: String, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); + assert_eq!( + actual, + MyStruct { + c: String::from("d"), + } + ); + } + + #[test] + fn test_skip_unwanted2() { + let data = b"a={ \"hello\" \"goodbye\" } \r\nc = d\r\ne = f"; + + #[derive(Deserialize, PartialEq, Debug)] + struct MyStruct { + c: String, + } + + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1462,7 +2118,7 @@ mod tests { e: String, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1485,7 +2141,7 @@ mod tests { e: String, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1511,7 +2167,7 @@ mod tests { c: Vec, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1529,7 +2185,7 @@ mod tests { field1: Option, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1542,7 +2198,7 @@ mod tests { fn test_deserialize_hashmap() { let data = b"-1=a\r\n-2=b"; - let actual: HashMap = from_slice(&data[..]).unwrap(); + let actual: HashMap = from_owned(&data[..]); let mut expected = HashMap::new(); expected.insert(-1, String::from("a")); expected.insert(-2, String::from("b")); @@ -1563,7 +2219,7 @@ mod tests { name: String, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); let mut expected 
= HashMap::new();
         expected.insert(
             -1,
@@ -1599,7 +2255,7 @@ mod tests {
             bar: String,
         }
 
-        let actual: MyStruct = from_slice(&data[..]).unwrap();
+        let actual: MyStruct = from_owned(&data[..]);
         assert_eq!(
             actual,
             MyStruct {
@@ -1628,7 +2284,7 @@ mod tests {
             bar: String,
         }
 
-        let actual: MyStruct = from_slice(&data[..]).unwrap();
+        let actual: MyStruct = from_owned(&data[..]);
         assert_eq!(
             actual,
             MyStruct {
@@ -1655,7 +2311,7 @@ mod tests {
             name: String,
         }
 
-        let actual: MyStruct = from_slice(&data[..]).unwrap();
+        let actual: MyStruct = from_owned(&data[..]);
         assert_eq!(
             actual,
             MyStruct {
@@ -1671,17 +2327,22 @@ mod tests {
         );
     }
 
-    #[test]
-    fn test_deserialize_ignore_operator() {
-        let data = b"val > 3 a = b";
-
+    #[rstest]
+    #[case(b"val < 3 a = b")]
+    #[case(b"val <= 3 a = b")]
+    #[case(b"val > 3 a = b")]
+    #[case(b"val >= 3 a = b")]
+    #[case(b"val == 3 a = b")]
+    #[case(b"val != 3 a = b")]
+    #[case(b"val ?= 3 a = b")]
+    fn test_deserialize_ignore_operator(#[case] data: &[u8]) {
         #[derive(Deserialize, PartialEq, Debug)]
         struct MyStruct {
             val: i32,
             a: String,
         }
 
-        let actual: MyStruct = from_slice(&data[..]).unwrap();
+        let actual: MyStruct = from_owned(&data[..]);
         assert_eq!(
             actual,
             MyStruct {
@@ -1719,7 +2380,7 @@ mod tests {
     fn test_deserialize_enum_scalar() {
         let data = b"kind = infantry";
 
-        let actual: MyStruct = from_slice(&data[..]).unwrap();
+        let actual: MyStruct = from_owned(&data[..]);
         assert_eq!(
             actual,
             MyStruct {
@@ -1927,34 +2588,17 @@ mod tests {
         })
     }
 
+    #[rstest]
+    #[case(b"active_idea_groups = { a = 10 }", vec![(String::from("a"), 10)])]
+    #[case(b"active_idea_groups = { }", vec![])]
+    #[case(b"active_idea_groups = { ]=0 defensive_ideas=2 }", vec![(String::from("]"), 0), (String::from("defensive_ideas"), 2)])]
-    #[test]
-    fn test_deserialize_vec_pair() {
-        let data = b"active_idea_groups = { a = 10 }";
-
-        let actual: MyStruct = from_slice(&data[..]).unwrap();
-        assert_eq!(
-            actual,
-            MyStruct {
-                active_idea_groups: vec![(String::from("a"), 10)]
-            }
-        );
-
-        #[derive(Deserialize, Debug, PartialEq)]
-        struct MyStruct {
-            #[serde(default, deserialize_with = "deserialize_vec_pair")]
-            active_idea_groups: Vec<(String, u8)>,
-        }
-    }
-
-    #[test]
-    fn test_deserialize_vec_pair_empty() {
-        let data = b"active_idea_groups = {}";
-
-        let actual: MyStruct = from_slice(&data[..]).unwrap();
+    fn test_deserialize_vec_pair(#[case] input: &[u8], #[case] expected: Vec<(String, u8)>) {
+        let actual: MyStruct = from_owned(input);
         assert_eq!(
             actual,
             MyStruct {
-                active_idea_groups: Vec::new()
+                active_idea_groups: expected
             }
         );
@@ -1969,7 +2613,7 @@ mod tests {
     fn test_deserialize_date_string() {
         let data = b"date=\"1444.11.11\"";
 
-        let actual: MyStruct = from_slice(&data[..]).unwrap();
+        let actual: MyStruct = from_owned(&data[..]);
         assert_eq!(
             actual,
             MyStruct {
@@ -1987,7 +2631,7 @@ mod tests {
     fn test_deserialize_datehour_string() {
         let data = b"date=\"1936.1.1.24\"";
 
-        let actual: MyStruct = from_slice(&data[..]).unwrap();
+        let actual: MyStruct = from_owned(&data[..]);
         assert_eq!(
             actual,
             MyStruct {
@@ -2005,7 +2649,7 @@ mod tests {
     fn test_deserialize_uniform_date() {
         let data = b"date=\"2200.2.30\"";
 
-        let actual: MyStruct = from_slice(&data[..]).unwrap();
+        let actual: MyStruct = from_owned(&data[..]);
         assert_eq!(
             actual,
             MyStruct {
@@ -2023,7 +2667,7 @@ mod tests {
     fn test_deserialize_positive_num() {
         let data = b"pop_happiness = +0.10";
 
-        let actual: MyStruct = from_slice(&data[..]).unwrap();
+        let actual: MyStruct = from_owned(&data[..]);
         assert_eq!(actual, MyStruct {
pop_happiness: 0.1 }); #[derive(Deserialize, Debug, PartialEq)] @@ -2036,7 +2680,7 @@ mod tests { fn test_deserialize_operator() { let data = b"num_cities < 0.10"; - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -2054,7 +2698,7 @@ mod tests { fn test_deserialize_operator2() { let data = b"modifier = { factor = 2 num_communications > 2 }"; - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -2124,7 +2768,7 @@ mod tests { } let data = br#"field1=1 field2=invalid"#; - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -2210,7 +2854,7 @@ mod tests { ), ]); - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { diff --git a/src/text/dom.rs b/src/text/dom.rs new file mode 100644 index 0000000..96fa318 --- /dev/null +++ b/src/text/dom.rs @@ -0,0 +1,1502 @@ +use super::fnv::FnvBuildHasher; +use crate::{ + text::Operator, DeserializeError, DeserializeErrorKind, Encoding, Scalar, TextTape, TextToken, +}; +use std::{ + borrow::Cow, + collections::{hash_map::Entry, HashMap}, +}; + +pub type KeyValue<'data, 'tokens, E> = ( + ScalarReader<'data, E>, + Option, + ValueReader<'data, 'tokens, E>, +); + +pub type KeyValues<'data, 'tokens, E> = (ScalarReader<'data, E>, GroupEntry<'data, 'tokens, E>); + +/// Calculate what index the next value is. This assumes that a header + value +/// are two separate values +#[inline] +fn next_idx_header(tokens: &[TextToken], idx: usize) -> usize { + match tokens[idx] { + TextToken::Array { end, .. } | TextToken::Object { end, .. } => end + 1, + TextToken::Operator(_) | TextToken::MixedContainer => idx + 2, + _ => idx + 1, + } +} + +/// Calculate what index the next value is. This assumes that a header + value +/// is one value +#[inline] +fn next_idx(tokens: &[TextToken], idx: usize) -> usize { + match tokens[idx] { + TextToken::Array { end, .. } | TextToken::Object { end, .. } => end + 1, + TextToken::Operator(_) => next_idx(tokens, idx + 1), + TextToken::Header(_) => next_idx_header(tokens, idx + 1), + _ => idx + 1, + } +} + +#[inline] +fn next_idx_values(tokens: &[TextToken], idx: usize) -> usize { + match tokens[idx] { + TextToken::Array { end, .. } | TextToken::Object { end, .. 
} => end + 1, + _ => idx + 1, + } +} + +#[inline] +fn fields_len(tokens: &[TextToken], start_ind: usize, end_ind: usize) -> usize { + let mut ind = start_ind; + let mut count = 0; + while ind < end_ind { + let key_ind = ind; + if tokens[key_ind] == TextToken::MixedContainer { + return count; + } + + let value_ind = match tokens[key_ind + 1] { + TextToken::Operator(_) => key_ind + 2, + _ => key_ind + 1, + }; + ind = next_idx(tokens, value_ind); + count += 1; + } + + count +} + +#[inline] +pub fn values_len(tokens: &[TextToken], start_ind: usize, end_ind: usize) -> usize { + let mut count = 0; + let mut ind = start_ind; + while ind < end_ind { + ind = next_idx_values(tokens, ind); + count += 1; + } + + count +} + +type OpValue<'data, 'tokens, E> = (Option, ValueReader<'data, 'tokens, E>); + +/// Iterator over values grouped by duplicate keys +/// +/// See [FieldGroupsIter](crate::text::FieldGroupsIter) for a worked example +pub struct GroupEntryIter<'data, 'tokens, 'parent, E> { + index: usize, + parent: &'parent GroupEntry<'data, 'tokens, E>, +} + +impl<'data, 'tokens, 'parent, E> Iterator for GroupEntryIter<'data, 'tokens, 'parent, E> +where + E: Clone, +{ + type Item = (Option, ValueReader<'data, 'tokens, E>); + + fn next(&mut self) -> Option { + match &self.parent { + GroupEntry::One((op, val)) => { + if self.index == 0 { + self.index += 1; + Some((*op, (*val).clone())) + } else { + None + } + } + GroupEntry::Multiple(entries) => { + let result = entries.get(self.index); + self.index += 1; + result.map(|(op, val)| (*op, (*val).clone())) + } + } + } +} + +/// Represents a group of values for duplicate keys +/// +/// May contain one or many values +/// +/// ``` +/// use jomini::TextTape; +/// +/// # fn main() -> Result<(), Box> { +/// let tape = TextTape::from_slice(b"name=a core=b core=c")?; +/// let reader = tape.windows1252_reader(); +/// let mut fields = reader.field_groups(); +/// let first_group = fields.next(); +/// let first_key = first_group.as_ref().map(|(key, _)| key.read_str()); +/// assert_eq!(first_key.as_deref(), Some("name")); +/// let first_values_len = first_group.as_ref().map(|(_, group)| group.len()); +/// assert_eq!(first_values_len, Some(1)); +/// let first_values = first_group.map(|(_, group)| { +/// group.values() +/// .filter_map(|(_op, val)| val.read_string().ok()) +/// .collect() +/// }); +/// assert_eq!(first_values, Some(vec![String::from("a")])); +/// +/// let second_group = fields.next(); +/// let second_key = second_group.as_ref().map(|(key, _)| key.read_str()); +/// assert_eq!(second_key.as_deref(), Some("core")); +/// let second_values = second_group.as_ref().map(|(_, group)| group.len()); +/// assert_eq!(second_values, Some(2)); +/// let second_values = second_group.map(|(_, group)| { +/// group.values() +/// .filter_map(|(_op, val)| val.read_string().ok()) +/// .collect() +/// }); +/// assert_eq!(second_values, Some(vec![String::from("b"), String::from("c")])); +/// # Ok(()) +/// # } +/// ``` +pub enum GroupEntry<'data, 'tokens, E> { + /// Represents that the group is composed of only one value + /// + /// Most fields should only occur once, so this variant is optimized to + /// not require a memory allocation (unlike the `Multiple` variant). 
+ One(OpValue<'data, 'tokens, E>), + + /// Represents that the group is composed of several values + Multiple(Vec>), +} + +impl<'data, 'tokens, E> GroupEntry<'data, 'tokens, E> { + /// Returns an iterator that includes all the values + pub fn values<'parent>(&'parent self) -> GroupEntryIter<'data, 'tokens, 'parent, E> { + GroupEntryIter { + index: 0, + parent: self, + } + } + + /// A group can never be empty so this returns false + pub fn is_empty(&self) -> bool { + false + } + + /// Returns the number of values in the group + pub fn len(&self) -> usize { + match &self { + GroupEntry::One(_) => 1, + GroupEntry::Multiple(x) => x.len(), + } + } +} + +/// All possible text reader variants +#[derive(Debug, Clone)] +pub enum Reader<'data, 'tokens, E> { + /// object reader + Object(ObjectReader<'data, 'tokens, E>), + + /// array reader + Array(ArrayReader<'data, 'tokens, E>), + + /// scalar reader + Scalar(ScalarReader<'data, E>), + + /// value reader + Value(ValueReader<'data, 'tokens, E>), +} + +impl<'data, 'tokens, E> Reader<'data, 'tokens, E> +where + E: Encoding + Clone, +{ + /// Interpret value as a string + #[inline] + pub fn read_str(&self) -> Result, DeserializeError> { + match &self { + Reader::Scalar(x) => Ok(x.read_str()), + Reader::Value(x) => x.read_str(), + _ => Err(DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from("not a scalar")), + }), + } + } + + /// Interpret value as a string + #[inline] + pub fn read_string(&self) -> Result { + match &self { + Reader::Scalar(x) => Ok(x.read_string()), + Reader::Value(x) => x.read_string(), + _ => Err(DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from("not a scalar")), + }), + } + } + + /// Interpret value as a scalar + #[inline] + pub fn read_scalar(&self) -> Result, DeserializeError> { + match &self { + Reader::Scalar(x) => Ok(x.read_scalar()), + Reader::Value(x) => x.read_scalar(), + _ => Err(DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from("not a scalar")), + }), + } + } +} + +/// Iterator over fields of an object grouped by key +/// +/// Since objects can have duplicated keys across fields, this iterator +/// consolidates them such that all values with the same key are grouped +/// together in the order that they appear in the object. Key order is +/// also equivalent, except that already seen keys will be skipped, as +/// those values have already been seen in an earlier group. +/// +/// The process of grouping values together is more expensive than simply +/// iterating the keys in order, so when possible prefer +/// [`ObjectReader::fields()`](crate::text::ObjectReader::fields) over +/// [`ObjectReader::field_groups()`](crate::text::ObjectReader::field_groups). 
+/// +/// These groups can be easily iterated: +/// +/// ``` +/// use jomini::TextTape; +/// +/// # fn main() -> Result<(), Box> { +/// let tape = TextTape::from_slice(b"name=a core=b core=c")?; +/// let reader = tape.windows1252_reader(); +/// for (key, group) in reader.field_groups() { +/// match key.read_str().as_ref() { +/// "name" => assert_eq!(group.len(), 1), +/// "core" => assert_eq!(group.len(), 2), +/// x => panic!("unexpected key: {}", x), +/// } +/// } +/// # Ok(()) +/// # } +/// ``` +/// +/// And picked apart: +/// +/// ``` +/// use jomini::TextTape; +/// +/// # fn main() -> Result<(), Box> { +/// let tape = TextTape::from_slice(b"name=a core=b core=c")?; +/// let reader = tape.windows1252_reader(); +/// let mut fields = reader.field_groups(); +/// let first_group = fields.next(); +/// let first_key = first_group.as_ref().map(|(key, _)| key.read_str()); +/// assert_eq!(first_key.as_deref(), Some("name")); +/// let first_values_len = first_group.as_ref().map(|(_, group)| group.len()); +/// assert_eq!(first_values_len, Some(1)); +/// let first_values = first_group.map(|(_, group)| { +/// group.values() +/// .filter_map(|(_op, val)| val.read_string().ok()) +/// .collect() +/// }); +/// assert_eq!(first_values, Some(vec![String::from("a")])); +/// +/// let second_group = fields.next(); +/// let second_key = second_group.as_ref().map(|(key, _)| key.read_str()); +/// assert_eq!(second_key.as_deref(), Some("core")); +/// let second_values = second_group.as_ref().map(|(_, group)| group.len()); +/// assert_eq!(second_values, Some(2)); +/// let second_values = second_group.map(|(_, group)| { +/// group.values() +/// .filter_map(|(_op, val)| val.read_string().ok()) +/// .collect() +/// }); +/// assert_eq!(second_values, Some(vec![String::from("b"), String::from("c")])); +/// # Ok(()) +/// # } +/// ``` +pub struct FieldGroupsIter<'data, 'tokens, E> { + key_indices: HashMap<&'data [u8], Vec>, FnvBuildHasher>, + fields: FieldsIter<'data, 'tokens, E>, +} + +impl<'data, 'tokens, E> FieldGroupsIter<'data, 'tokens, E> +where + E: Encoding + Clone, +{ + fn new(reader: &ObjectReader<'data, 'tokens, E>) -> Self { + // Using the fnv hasher improved throughput of the eu4 json benchmark + // by over 15%. 
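+        //
+        // A pre-pass buckets, for each key, every (op, value) pair that
+        // appears after the key's first occurrence. Iteration then walks the
+        // fields in their original order and stitches each first occurrence
+        // back onto the front of its group.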
+ let mut key_indices = + HashMap::with_capacity_and_hasher(reader.fields_len(), FnvBuildHasher::default()); + for (key, op, val) in reader.fields() { + let entry = key_indices.entry(key.read_scalar().as_bytes()); + + match entry { + Entry::Vacant(x) => { + x.insert(Vec::with_capacity(0)); + } + Entry::Occupied(mut x) => { + x.get_mut().push((op, val)); + } + } + } + + let fields = reader.fields(); + + FieldGroupsIter { + key_indices, + fields, + } + } + + /// See [the other `remainder` documentation](crate::text::FieldsIter::remainder) + pub fn remainder(&self) -> ArrayReader<'data, 'tokens, E> { + self.fields.remainder() + } +} + +impl<'data, 'tokens, E> Iterator for FieldGroupsIter<'data, 'tokens, E> +where + E: Encoding + Clone, +{ + type Item = KeyValues<'data, 'tokens, E>; + + fn next(&mut self) -> Option { + loop { + let (key, op, value) = self.fields.next()?; + + if let Some((_key, mut entries)) = + self.key_indices.remove_entry(key.read_scalar().as_bytes()) + { + if entries.is_empty() { + return Some((key, GroupEntry::One((op, value)))); + } else { + entries.insert(0, (op, value)); + return Some((key, GroupEntry::Multiple(entries))); + } + } + } + } + + fn size_hint(&self) -> (usize, Option) { + (self.key_indices.len(), None) + } +} + +/// Iterator over fields of an object in the order that they appear +/// +/// Since objects can have duplicated keys across fields, this iterator +/// may yield items that have duplicate keys. +/// +/// Fields can be easily iterated: +/// +/// ``` +/// use jomini::TextTape; +/// +/// # fn main() -> Result<(), Box> { +/// let tape = TextTape::from_slice(b"name=a core=b core=c")?; +/// let reader = tape.windows1252_reader(); +/// let (names, cores) = reader +/// .fields() +/// .fold((0, 0), |(names, cores), (key, _op, _value)| { +/// match key.read_str().as_ref() { +/// "name" => (names + 1, cores), +/// "core" => (names, cores + 1), +/// x => panic!("unexpected key: {}", x), +/// } +/// }); +/// assert_eq!((1, 2), (names, cores)); +/// # Ok(()) +/// # } +/// ``` +/// +/// And picked apart: +/// +/// ``` +/// use jomini::TextTape; +/// +/// # fn main() -> Result<(), Box> { +/// let tape = TextTape::from_slice(b"name=a core=b core=c")?; +/// let reader = tape.windows1252_reader(); +/// let mut fields = reader.fields(); +/// let (first_key, _op, first_val) = fields.next().unwrap(); +/// assert_eq!(first_key.read_str(), "name"); +/// assert_eq!(first_val.read_str().ok().as_deref(), Some("a")); +/// # Ok(()) +/// # } +/// ``` +pub struct FieldsIter<'data, 'tokens, E> { + token_ind: usize, + end_ind: usize, + tokens: &'tokens [TextToken<'data>], + encoding: E, +} + +impl<'data, 'tokens, E> FieldsIter<'data, 'tokens, E> +where + E: Encoding + Clone, +{ + fn new(reader: &ObjectReader<'data, 'tokens, E>) -> Self { + FieldsIter { + token_ind: reader.start_ind, + end_ind: reader.end_ind, + tokens: reader.tokens, + encoding: reader.encoding.clone(), + } + } + + /// Returns the remaining values from an object if the container is an + /// object that transitions into an array. + pub fn remainder(&self) -> ArrayReader<'data, 'tokens, E> { + let start = self + .tokens + .get(self.token_ind) + .map(|x| match x { + TextToken::MixedContainer => self.token_ind + 1, + TextToken::End(y) => { + if let Some(TextToken::Array { .. 
}) = self.tokens.get(*y) { + *y + 1 + } else { + self.token_ind + } + } + _ => self.token_ind, + }) + .unwrap_or(self.end_ind); + + ArrayReader { + start_ind: start, + end_ind: self.end_ind, + encoding: self.encoding.clone(), + tokens: self.tokens, + } + } +} + +impl<'data, 'tokens, E> Iterator for FieldsIter<'data, 'tokens, E> +where + E: Encoding + Clone, +{ + type Item = KeyValue<'data, 'tokens, E>; + + fn next(&mut self) -> Option { + if self.token_ind >= self.end_ind { + return None; + } + + let key_ind = self.token_ind; + let token = self.tokens[key_ind].clone(); + let key_scalar = match token { + TextToken::Quoted(x) + | TextToken::Unquoted(x) + | TextToken::Parameter(x) + | TextToken::UndefinedParameter(x) => x, + TextToken::MixedContainer => { + return None; + } + _ => { + // this is a broken invariant, so we safely recover by saying the object + // has no more fields + debug_assert!(false, "All keys should be scalars, not {:?}", &token); + return None; + } + }; + + let key_reader = ScalarReader { + scalar: key_scalar, + token, + encoding: self.encoding.clone(), + }; + + let (op, value_ind) = match self.tokens[key_ind + 1] { + TextToken::Operator(x) => (Some(x), key_ind + 2), + _ => (None, key_ind + 1), + }; + + let value_reader = ValueReader { + value_ind, + tokens: self.tokens, + encoding: self.encoding.clone(), + }; + self.token_ind = next_idx(self.tokens, value_ind); + Some((key_reader, op, value_reader)) + } + + fn size_hint(&self) -> (usize, Option) { + let len = fields_len(self.tokens, self.token_ind, self.end_ind); + (len, None) + } +} + +/// A reader for objects +#[derive(Debug, Clone)] +pub struct ObjectReader<'data, 'tokens, E> { + start_ind: usize, + end_ind: usize, + tokens: &'tokens [TextToken<'data>], + encoding: E, +} + +impl<'data, 'tokens, E> ObjectReader<'data, 'tokens, E> +where + E: Encoding + Clone, +{ + /// Create a new object reader from parsed data with encoded strings + pub fn new(tape: &'tokens TextTape<'data>, encoding: E) -> Self { + let tokens = tape.tokens(); + ObjectReader { + tokens, + end_ind: tokens.len(), + start_ind: 0, + encoding, + } + } + + /// Return the number of tokens contained within the object + /// + /// ``` + /// use jomini::TextTape; + /// + /// # fn main() -> Result<(), Box> { + /// let tape = TextTape::from_slice(b"obj={1} foo=bar")?; + /// let reader = tape.windows1252_reader(); + /// assert_eq!(reader.tokens_len(), 6); + /// # Ok(()) + /// # } + /// ``` + pub fn tokens_len(&self) -> usize { + self.end_ind - self.start_ind + } + + /// Deserialize from the object reader + /// + /// ``` + /// use jomini::TextTape; + /// use serde::Deserialize; + /// + /// # fn main() -> Result<(), Box> { + /// #[derive(Debug, Clone, Deserialize, PartialEq)] + /// pub struct Obj { + /// foo: String, + /// } + /// + /// let tape = TextTape::from_slice(b"obj={foo=bar}")?; + /// let reader = tape.windows1252_reader(); + /// let mut fields = reader.fields(); + /// let (_, _, obj_value) = fields.next().unwrap(); + /// let obj_reader = obj_value.read_object().unwrap(); + /// let result: Obj = obj_reader.deserialize().unwrap(); + /// assert_eq!(result, Obj { foo: "bar".to_string() }); + /// # Ok(()) + /// # } + /// ``` + #[cfg(feature = "derive")] + pub fn deserialize(&self) -> Result + where + T: serde::Deserialize<'data>, + { + T::deserialize(&crate::TextDeserializer::from_reader(self)) + } + + /// Return the number of key value pairs that the object contains. 
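+    ///
+    /// A small example in the style of the other doctests in this module
+    /// (the input is illustrative):
+    ///
+    /// ```
+    /// use jomini::TextTape;
+    ///
+    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
+    /// let tape = TextTape::from_slice(b"a=b c=d")?;
+    /// let reader = tape.windows1252_reader();
+    /// assert_eq!(reader.fields_len(), 2);
+    /// # Ok(())
+    /// # }
+    /// ```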
+ pub fn fields_len(&self) -> usize { + fields_len(self.tokens, self.start_ind, self.end_ind) + } + + /// Iterator over fields as they appear in the object + /// + /// See [FieldsIter](crate::text::FieldsIter) for a worked example + #[inline] + pub fn fields(&self) -> FieldsIter<'data, 'tokens, E> { + FieldsIter::new(self) + } + + /// Iterator over fields that are grouped by key + /// + /// See [FieldGroupsIter](crate::text::FieldGroupsIter) for a worked example + #[inline] + pub fn field_groups(&self) -> FieldGroupsIter<'data, 'tokens, E> { + FieldGroupsIter::new(self) + } +} + +/// A text reader that wraps an underlying scalar value +#[derive(Debug, Clone)] +pub struct ScalarReader<'data, E> { + scalar: Scalar<'data>, + token: TextToken<'data>, + encoding: E, +} + +impl<'data, E> ScalarReader<'data, E> +where + E: Encoding, +{ + /// Decode the data with a given string encoding + #[inline] + pub fn read_str(&self) -> Cow<'data, str> { + self.encoding.decode(self.scalar.as_bytes()) + } + + /// Decode the data with a given string encoding + #[inline] + pub fn read_string(&self) -> String { + self.encoding.decode(self.scalar.as_bytes()).into_owned() + } + + /// Return the underlying scalar + #[inline] + pub fn read_scalar(&self) -> Scalar<'data> { + self.scalar + } + + /// Return the token that the reader is abstracting + #[inline] + pub fn token(&self) -> &TextToken<'data> { + &self.token + } +} + +/// A text reader for a text value +#[derive(Debug, Clone)] +pub struct ValueReader<'data, 'tokens, E> { + value_ind: usize, + tokens: &'tokens [TextToken<'data>], + encoding: E, +} + +impl<'data, 'tokens, E> ValueReader<'data, 'tokens, E> { + /// Return the token that the reader is abstracting + #[inline] + pub fn token(&self) -> &TextToken<'data> { + &self.tokens[self.value_ind] + } + + #[cfg(feature = "derive")] + pub(crate) fn next(&mut self) -> Option<&TextToken<'data>> { + self.value_ind += 1; + self.tokens.get(self.value_ind) + } +} + +impl<'data, 'tokens, E> Encoding for ValueReader<'data, 'tokens, E> +where + E: Encoding, +{ + #[inline] + fn decode<'a>(&self, data: &'a [u8]) -> Cow<'a, str> { + self.encoding.decode(data) + } +} + +impl<'data, 'tokens, E> ValueReader<'data, 'tokens, E> +where + E: Encoding + Clone, +{ + fn raw_str(&self) -> Option> { + match self.tokens[self.value_ind] { + TextToken::Header(s) + | TextToken::Unquoted(s) + | TextToken::Quoted(s) + | TextToken::Parameter(s) + | TextToken::UndefinedParameter(s) => Some(self.encoding.decode(s.as_bytes())), + TextToken::Operator(s) => Some(Cow::Borrowed(s.symbol())), + _ => None, + } + } + + /// Interpret the current value as string + #[inline] + pub fn read_str(&self) -> Result, DeserializeError> { + self.raw_str().ok_or_else(|| DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from("not a string")), + }) + } + + /// Interpret the current value as string + #[inline] + pub fn read_string(&self) -> Result { + self.raw_str() + .map(String::from) + .ok_or_else(|| DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from("not a string")), + }) + } + + /// Interpret the current value as a scalar + #[inline] + pub fn read_scalar(&self) -> Result, DeserializeError> { + self.tokens[self.value_ind] + .as_scalar() + .ok_or_else(|| DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from("not a scalar")), + }) + } + + /// Interpret the current value as an object + #[inline] + pub fn read_object(&self) -> Result, DeserializeError> { + match self.tokens[self.value_ind] { + 
TextToken::Object { end, .. } => Ok(ObjectReader { + tokens: self.tokens, + start_ind: self.value_ind + 1, + end_ind: end, + encoding: self.encoding.clone(), + }), + + TextToken::Array { end, .. } => Ok(ObjectReader { + tokens: self.tokens, + start_ind: end, + end_ind: end, + encoding: self.encoding.clone(), + }), + + _ => Err(DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from("not an object")), + }), + } + } + + /// Interpret the current value as an array + #[inline] + pub fn read_array(&self) -> Result, DeserializeError> { + match self.tokens[self.value_ind] { + TextToken::Object { end, mixed: true } => { + let mut start_ind = self.value_ind + 1; + while self.tokens.get(start_ind) != Some(&TextToken::MixedContainer) { + start_ind = next_idx(self.tokens, start_ind); + } + + Ok(ArrayReader { + tokens: self.tokens, + start_ind: start_ind + 1, + end_ind: end, + encoding: self.encoding.clone(), + }) + } + TextToken::Array { end, .. } | TextToken::Object { end, .. } => Ok(ArrayReader { + tokens: self.tokens, + start_ind: self.value_ind + 1, + end_ind: end, + encoding: self.encoding.clone(), + }), + + // A header can be seen as a two element array + TextToken::Header(_) => Ok(ArrayReader { + tokens: self.tokens, + start_ind: self.value_ind, + end_ind: next_idx(self.tokens, self.value_ind + 1), + encoding: self.encoding.clone(), + }), + + _ => Err(DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from("not an array")), + }), + } + } + + /// Return the number of tokens the value encompases + /// + /// ``` + /// use jomini::TextTape; + /// + /// # fn main() -> Result<(), Box> { + /// let tape = TextTape::from_slice(b"obj={1 {foo=bar} 3}")?; + /// let reader = tape.windows1252_reader(); + /// let mut fields = reader.fields(); + /// let (_, _, first_value) = fields.next().unwrap(); + /// assert_eq!(first_value.tokens_len(), 6); + /// # Ok(()) + /// # } + /// ``` + #[inline] + pub fn tokens_len(&self) -> usize { + match self.tokens[self.value_ind] { + TextToken::Array { end, .. } | TextToken::Object { end, .. 
} => { + end - self.value_ind - 1 + } + _ => 1, + } + } +} + +/// An iterator over the values of an array +/// +/// ``` +/// use jomini::TextTape; +/// +/// # fn main() -> Result<(), Box> { +/// let tape = TextTape::from_slice(b"cores={a b}")?; +/// let reader = tape.windows1252_reader(); +/// +/// let mut all_cores = Vec::new(); +/// for (key, _op, value) in reader.fields() { +/// assert_eq!(key.read_str(), "cores"); +/// let cores = value.read_array()?; +/// assert_eq!(cores.len(), 2); +/// for value in cores.values() { +/// all_cores.push(value.read_string()?); +/// } +/// } +/// assert_eq!(all_cores, vec![String::from("a"), String::from("b")]); +/// # Ok(()) +/// # } +/// ``` +pub struct ValuesIter<'data, 'tokens, E> { + token_ind: usize, + end_ind: usize, + tokens: &'tokens [TextToken<'data>], + encoding: E, +} + +impl<'data, 'tokens, E> ValuesIter<'data, 'tokens, E> +where + E: Encoding + Clone, +{ + fn new(reader: &ArrayReader<'data, 'tokens, E>) -> Self { + ValuesIter { + token_ind: reader.start_ind, + end_ind: reader.end_ind, + tokens: reader.tokens, + encoding: reader.encoding.clone(), + } + } +} + +impl<'data, 'tokens, E> Iterator for ValuesIter<'data, 'tokens, E> +where + E: Encoding + Clone, +{ + type Item = ValueReader<'data, 'tokens, E>; + + fn next(&mut self) -> Option { + if self.token_ind < self.end_ind { + let value_ind = self.token_ind; + self.token_ind = next_idx_values(self.tokens, self.token_ind); + Some(ValueReader { + value_ind, + tokens: self.tokens, + encoding: self.encoding.clone(), + }) + } else { + None + } + } + + fn size_hint(&self) -> (usize, Option) { + let len = values_len(self.tokens, self.token_ind, self.end_ind); + (len, Some(len)) + } +} + +/// A text reader for sequences of values +#[derive(Debug, Clone)] +pub struct ArrayReader<'data, 'tokens, E> { + start_ind: usize, + end_ind: usize, + tokens: &'tokens [TextToken<'data>], + encoding: E, +} + +impl<'data, 'tokens, E> ArrayReader<'data, 'tokens, E> +where + E: Encoding + Clone, +{ + /// Iterator over values of an array + /// + /// See [ValuesIter](crate::text::ValuesIter) for a worked example + #[inline] + pub fn values(&self) -> ValuesIter<'data, 'tokens, E> { + ValuesIter::new(self) + } + + /// Returns if the array is empty + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Return the number of values in the array + #[inline] + pub fn len(&self) -> usize { + values_len(self.tokens, self.start_ind, self.end_ind) + } + + /// Return the number of tokens contained within the object + /// + /// ``` + /// use jomini::TextTape; + /// + /// # fn main() -> Result<(), Box> { + /// let tape = TextTape::from_slice(b"obj={1 {foo=bar} 3}")?; + /// let reader = tape.windows1252_reader(); + /// let mut fields = reader.fields(); + /// let (_, _, first_value) = fields.next().unwrap(); + /// let array = first_value.read_array()?; + /// assert_eq!(array.tokens_len(), 6); + /// # Ok(()) + /// # } + /// ``` + #[inline] + pub fn tokens_len(&self) -> usize { + self.end_ind - self.start_ind + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn read_value(value: ValueReader) + where + E: crate::Encoding + Clone, + { + match value.token() { + TextToken::Object { .. } => { + iterate_object(value.read_object().unwrap()); + iterate_array(value.read_array().unwrap()); + } + TextToken::Array { .. 
} => { + iterate_object(value.read_object().unwrap()); + iterate_array(value.read_array().unwrap()); + } + TextToken::End(_) => panic!("end!?"), + TextToken::Operator(_) => {} + TextToken::MixedContainer => {} + TextToken::Unquoted(_) + | TextToken::Quoted(_) + | TextToken::Header(_) + | TextToken::Parameter(_) + | TextToken::UndefinedParameter(_) => { + let _ = value.read_str().unwrap(); + } + } + } + + fn iterate_array(reader: ArrayReader) + where + E: crate::Encoding + Clone, + { + for value in reader.values() { + read_value(value); + } + } + + fn iterate_object(reader: ObjectReader) + where + E: crate::Encoding + Clone, + { + for (_key, group) in reader.field_groups() { + for (_op, value) in group.values() { + read_value(value); + } + } + + let mut fields = reader.fields(); + for (key, _op, value) in fields.by_ref() { + let _ = key.read_str(); + read_value(value); + } + } + + #[test] + fn simple_text_reader_text() { + let data = b"foo=bar"; + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + assert_eq!(reader.fields_len(), 1); + + let mut iter = reader.fields(); + let (key, _op, value) = iter.next().unwrap(); + assert_eq!(key.read_string(), String::from("foo")); + assert_eq!(value.read_string().unwrap(), String::from("bar")); + + assert!(iter.next().is_none()); + } + + #[test] + fn simple_text_reader_obj() { + let data = b"foo={bar=qux}"; + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + + let mut iter = reader.fields(); + let (key, _op, value) = iter.next().unwrap(); + assert_eq!(key.read_string(), String::from("foo")); + + let nested = value.read_object().unwrap(); + let mut nested_iter = nested.fields(); + let (key2, _op, value2) = nested_iter.next().unwrap(); + assert_eq!(key2.read_string(), String::from("bar")); + assert_eq!(value2.read_string().unwrap(), String::from("qux")); + assert!(nested_iter.next().is_none()); + assert!(iter.next().is_none()); + } + + #[test] + fn simple_text_reader_array() { + let data = b"foo={bar qux}"; + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + + let mut iter = reader.fields(); + let (key, _op, value) = iter.next().unwrap(); + assert_eq!(key.read_string(), String::from("foo")); + + let nested = value.read_array().unwrap(); + let mut values = nested.values(); + assert_eq!(nested.len(), 2); + let value1 = values.next().unwrap().read_string().unwrap(); + let value2 = values.next().unwrap().read_string().unwrap(); + + assert!(values.next().is_none()); + assert_eq!(value1, String::from("bar")); + assert_eq!(value2, String::from("qux")); + } + + #[test] + fn text_reader_read_fields() { + let data = b"name=aaa name=bbb core=123 core=456 name=ccc name=ddd"; + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + + let mut field_groups = reader.field_groups(); + let (key, values) = field_groups.next().unwrap(); + assert_eq!(key.read_string(), String::from("name")); + + let values = values.values().collect::>(); + assert_eq!(values.len(), 4); + assert_eq!(values[0].1.read_string().unwrap(), String::from("aaa")); + assert_eq!(values[1].1.read_string().unwrap(), String::from("bbb")); + assert_eq!(values[2].1.read_string().unwrap(), String::from("ccc")); + assert_eq!(values[3].1.read_string().unwrap(), String::from("ddd")); + + let (key, values) = field_groups.next().unwrap(); + assert_eq!(key.read_string(), String::from("core")); + + let values = values.values().collect::>(); + 
assert_eq!(values.len(), 2); + assert_eq!(values[0].1.read_string().unwrap(), String::from("123")); + assert_eq!(values[1].1.read_string().unwrap(), String::from("456")); + } + + #[test] + fn text_reader_read_fields_nested() { + let data = + b"army={name=aaa unit={name=bbb} unit={name=ccc}} army={name=ddd unit={name=eee}}"; + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + let mut field_groups = reader.field_groups(); + + let (key, army_values) = field_groups.next().unwrap(); + assert_eq!(key.read_string(), String::from("army")); + assert_eq!(army_values.len(), 2); + + let army_values = army_values.values().collect::>(); + let aaa = army_values[0].1.read_object().unwrap(); + let mut aaa_groups = aaa.field_groups(); + assert_eq!(aaa.fields_len(), 3); + + let (key, values) = aaa_groups.next().unwrap(); + assert_eq!(key.read_string(), String::from("name")); + assert_eq!(values.len(), 1); + assert_eq!( + values.values().nth(0).unwrap().1.read_string().unwrap(), + String::from("aaa") + ); + + let (key, values) = aaa_groups.next().unwrap(); + assert_eq!(key.read_string(), String::from("unit")); + assert_eq!(values.len(), 2); + + let bbb = values.values().nth(0).unwrap().1.read_object().unwrap(); + let mut bbb_fields = bbb.fields(); + let (key, _, value) = bbb_fields.next().unwrap(); + assert_eq!(key.read_string(), String::from("name")); + assert_eq!(value.read_string().unwrap(), String::from("bbb")); + + let ccc = values.values().nth(1).unwrap().1.read_object().unwrap(); + let mut ccc_fields = ccc.fields(); + let (key, _, value) = ccc_fields.next().unwrap(); + assert_eq!(key.read_string(), String::from("name")); + assert_eq!(value.read_string().unwrap(), String::from("ccc")); + + let ddd = army_values[1].1.read_object().unwrap(); + assert_eq!(ddd.fields_len(), 2); + + let mut ddd_groups = ddd.field_groups(); + let (key, values) = ddd_groups.next().unwrap(); + assert_eq!(key.read_string(), String::from("name")); + assert_eq!(values.len(), 1); + assert_eq!( + values.values().nth(0).unwrap().1.read_string().unwrap(), + String::from("ddd") + ); + + let (key, values) = ddd_groups.next().unwrap(); + assert_eq!(key.read_string(), String::from("unit")); + assert_eq!(values.len(), 1); + + let eee = values.values().nth(0).unwrap().1.read_object().unwrap(); + let mut eee_fields = eee.fields(); + let (key, _, value) = eee_fields.next().unwrap(); + assert_eq!(key.read_string(), String::from("name")); + assert_eq!(value.read_string().unwrap(), String::from("eee")); + } + + #[test] + fn text_reader_read_fields_consume() { + let data = b"name=aaa name=bbb core=123 name=ccc name=ddd"; + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + let mut count = 0; + for (_key, entries) in reader.field_groups() { + for (_i, (_op, value)) in entries.values().enumerate() { + count += value.read_scalar().map(|_| 1).unwrap_or(0); + } + } + + assert_eq!(count, 5); + } + + #[test] + fn text_reader_mixed_object_1() { + let data = b"levels={10 0=1 0=2}"; + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + + assert_eq!(reader.fields_len(), 1); + let mut iter = reader.fields(); + let (key, _op, value) = iter.next().unwrap(); + assert_eq!(key.read_string(), String::from("levels")); + + let nested = value.read_array().unwrap(); + assert_eq!(nested.len(), 8); + + assert_eq!( + nested.values().nth(3).unwrap().token(), + &TextToken::Operator(Operator::Equal) + ); + assert_eq!( + 
nested.values().nth(6).unwrap().token(), + &TextToken::Operator(Operator::Equal) + ); + + let values = nested + .values() + .filter(|x| x.token() != &TextToken::MixedContainer) + .map(|x| x.read_string().unwrap()) + .collect::>(); + + assert_eq!( + values.as_slice(), + &[ + String::from("10"), + String::from("0"), + String::from("="), + String::from("1"), + String::from("0"), + String::from("="), + String::from("2"), + ] + ); + } + + #[test] + fn text_reader_mixed_object_2() { + let data = br#"brittany_area = { #5 + color = { 118 99 151 } + 169 170 171 172 4384 + }"#; + + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + let mut iter = reader.fields(); + let (key, _op, value) = iter.next().unwrap(); + assert_eq!(key.read_str(), "brittany_area"); + + let mut keys = vec![]; + let brittany = value.read_object().unwrap(); + let mut fields = brittany.fields(); + while let Some((key, _op, _value)) = fields.next() { + keys.push(key.read_str()) + } + + assert_eq!(keys, vec![String::from("color")]); + let trailer = fields.remainder(); + assert_eq!(trailer.len(), 5); + assert_eq!(trailer.values().next().unwrap().read_str().unwrap(), "169"); + + let nested = value.read_array().unwrap(); + assert_eq!(nested.len(), 5); + + let mut values = nested.values(); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"169")) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"170")) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"171")) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"172")) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"4384")) + ); + assert!(values.next().is_none()); + } + + #[test] + fn text_reader_mixed_object_3() { + let data = br#"brittany_area = { #5 + color = { 118 99 151 } + color = { 118 99 151 } + 169 170 171 172 4384 + }"#; + + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + let (_key, _op, brittany) = reader.fields().next().unwrap(); + let brittany_reader = brittany.read_object().unwrap(); + + let mut fields = brittany_reader.fields(); + let (lower_bound, upper_bound) = fields.size_hint(); + assert_eq!(lower_bound, brittany_reader.fields_len()); + assert_eq!(lower_bound, 2); + assert!(upper_bound.is_none() || upper_bound == Some(7)); + + let _ = fields.next(); + let (lower_bound, upper_bound) = fields.size_hint(); + assert_eq!(lower_bound, 1); + assert!(upper_bound.is_none() || upper_bound == Some(6)); + + let mut groups = brittany_reader.field_groups(); + let (lower_bound, upper_bound) = groups.size_hint(); + assert_eq!(lower_bound, 1); + assert!(upper_bound.is_none() || upper_bound == Some(6)); + + let _ = groups.next(); + let (lower_bound, upper_bound) = groups.size_hint(); + assert_eq!(lower_bound, 0); + assert!(upper_bound.is_none() || upper_bound == Some(5)); + } + + #[test] + fn text_reader_mixed_object_4() { + let data = br#"levels={a=b 10 c=d 20}"#; + + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + + assert_eq!(reader.fields_len(), 1); + let mut iter = reader.fields(); + let (key, _op, value) = iter.next().unwrap(); + assert_eq!(key.read_string(), String::from("levels")); + + let nested = value.read_array().unwrap(); + assert_eq!(nested.len(), 5); + + let mut values = nested.values(); + assert_eq!( + values.next().unwrap().token(), + 
&TextToken::Unquoted(Scalar::new(b"10")) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"c")) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Operator(Operator::Equal) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"d")) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"20")) + ); + assert!(values.next().is_none()); + } + + #[test] + fn text_reader_mixed_object_5() { + let data = br#"brittany_area = { #5 + color = { 118 99 151 } + 169 170 171 172 4384 + }"#; + + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + let mut iter = reader.fields(); + let (key, _op, value) = iter.next().unwrap(); + assert_eq!(key.read_str(), "brittany_area"); + + let brittany = value.read_object().unwrap(); + let mut field_groups = brittany.field_groups(); + field_groups.next().unwrap(); + assert!(field_groups.next().is_none()); + + let trailer = field_groups.remainder(); + + let mut values = trailer.values(); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"169")) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"170")) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"171")) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"172")) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"4384")) + ); + assert!(values.next().is_none()); + } + + #[test] + fn text_reader_empty_container() { + let data = b"active_idea_groups={ }"; + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + let mut iter = reader.fields(); + let (key, _op, value) = iter.next().unwrap(); + assert_eq!(key.read_str(), "active_idea_groups"); + + let empty_array = value.read_array().unwrap(); + assert_eq!(0, empty_array.len()); + assert!(empty_array.values().next().is_none()); + + let empty_object = value.read_object().unwrap(); + let mut empty_object_iter = empty_object.fields(); + assert_eq!(0, empty_object.fields_len()); + assert!(empty_object_iter.next().is_none()); + } + + #[test] + fn text_reader_header() { + let data = b"color = rgb { 10 20 30 }"; + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + let mut iter = reader.fields(); + let (key, _op, value) = iter.next().unwrap(); + assert_eq!(key.read_str(), "color"); + + let header_array = value.read_array().unwrap(); + let mut values = header_array.values(); + let rgb = values.next().unwrap(); + assert_eq!(rgb.read_str().unwrap(), "rgb"); + + let vals = values.next().unwrap(); + let s = vals.read_array().unwrap(); + let svals = s.values(); + + let colors = svals + .map(|x| x.read_scalar().unwrap()) + .map(|x| x.to_u64().unwrap()) + .collect::>(); + + assert_eq!(colors, vec![10, 20, 30]); + } + + #[test] + fn reader_crash1() { + let data = b"a=r{}"; + let tape = TextTape::from_slice(data).unwrap(); + iterate_object(tape.windows1252_reader()); + } + + #[test] + fn text_reader_object_fields() { + let data = b"a{b=}"; + if let Ok(tape) = TextTape::from_slice(data) { + let reader = tape.windows1252_reader(); + iterate_object(reader); + } + } + + #[test] + fn text_reader_object_fields_op2() { + let data = b"a{}b>{}"; + if let Ok(tape) = TextTape::from_slice(data) { + let reader = tape.windows1252_reader(); + iterate_object(reader); + } + } + + 
#[test]
+    fn text_reader_object_fields_dupe() {
+        let data = b"a{b=c d=E d}";
+        if let Ok(tape) = TextTape::from_slice(data) {
+            let reader = tape.windows1252_reader();
+            iterate_object(reader);
+        }
+    }
+
+    #[test]
+    fn text_reader_object_fields_header() {
+        let data = b"a{}b>r{}";
+        if let Ok(tape) = TextTape::from_slice(data) {
+            let reader = tape.windows1252_reader();
+            iterate_object(reader);
+        }
+    }
+
+    #[test]
+    fn text_reader_object_fields_dupe2() {
+        let data = b"a{b=c d b}";
+        if let Ok(tape) = TextTape::from_slice(data) {
+            let reader = tape.windows1252_reader();
+            iterate_object(reader);
+        }
+    }
+
+    #[test]
+    fn text_reader_regression() {
+        let data = b"a={b{}=2}";
+        if let Ok(tape) = TextTape::from_slice(data) {
+            let reader = tape.windows1252_reader();
+            iterate_object(reader);
+        }
+    }
+
+    #[test]
+    fn text_reader_regression2() {
+        let data = b"r={c=d=@{y=u}";
+        if let Ok(tape) = TextTape::from_slice(data) {
+            let reader = tape.windows1252_reader();
+            iterate_object(reader);
+        }
+    }
+
+    #[test]
+    fn text_reader_regression3() {
+        let data = b"a={{t c=d = b}}";
+        if let Ok(tape) = TextTape::from_slice(data) {
+            let reader = tape.windows1252_reader();
+            iterate_object(reader);
+        }
+    }
+
+    // #[test]
+    // fn text_reader_regression4() {
+    //     let data = include_bytes!("/home/nick/projects/jomini/fuzz/artifacts/fuzz_text/crash-a14643c9a89c0f4ab665815c99a07b15de3544a5");
+    //     // let data = b"a={{ b c == == = d e=f}}";
+    //     if let Ok(tape) = TextTape::from_slice(data) {
+    //         let reader = tape.windows1252_reader();
+    //         iterate_object(reader);
+    //     }
+    // }
+}
diff --git a/src/text/mod.rs b/src/text/mod.rs
index 4f1f1b9..f17c392 100644
--- a/src/text/mod.rs
+++ b/src/text/mod.rs
@@ -1,14 +1,29 @@
 //! Types for parsing Clausewitz plaintext input
 //!
-//! See the top level module documentation for an overview that includes parsing
-//! and deserializing text.
+//! Main text deserialization APIs:
+//! - [TextDeserializer::from_utf8_slice](crate::text::de::TextDeserializer::from_utf8_slice):
+//!   Deserialize game and save files from a slice of data.
+//! - [TextDeserializer::from_utf8_reader](crate::text::de::TextDeserializer::from_utf8_reader):
+//!   (**experimental**) a much more memory-efficient deserializer that is geared
+//!   towards deserializing large models like those found in save files.
 //!
-//! For more examples of the mid-level DOM-like API, see [FieldGroupsIter],
-//! [FieldsIter], and [ValuesIter]
+//! If the serde deserialization API is too high level, one can build
+//! abstractions on top of:
+//! - [TextTape::from_slice]: Realizes a pseudo AST onto
+//!   a linear tape. Cleans up and normalizes data.
+//! - [TokenReader]: (**experimental**) an incremental text lexer
+//!   designed for handling large saves in a memory efficient manner.
+//!
+//! Some additional APIs are available to make working with a [TextTape] more
+//! ergonomic for DOM-like use cases.
+//! - [FieldGroupsIter]
+//! - [FieldsIter]
+//! - [ValuesIter]
 
 /// text deserialization
 #[cfg(feature = "derive")]
 pub mod de;
+mod dom;
 mod fnv;
 mod operator;
 mod reader;
@@ -18,10 +33,11 @@ mod writer;
 #[cfg(feature = "derive")]
 #[doc(inline)]
 pub use self::de::Property;
-pub use self::operator::*;
-pub use self::reader::{
+pub use self::dom::{
     ArrayReader, FieldGroupsIter, FieldsIter, GroupEntry, GroupEntryIter, ObjectReader, Reader,
     ScalarReader, ValueReader, ValuesIter,
 };
+pub use self::operator::*;
 pub use self::tape::{TextTape, TextTapeParser, TextToken};
 pub use self::writer::*;
+pub use reader::{ReaderError, ReaderErrorKind, Token, TokenReader, TokenReaderBuilder};
diff --git a/src/text/reader.rs b/src/text/reader.rs
index 1db0514..3e24ef8 100644
--- a/src/text/reader.rs
+++ b/src/text/reader.rs
@@ -1,1503 +1,1078 @@
-use super::fnv::FnvBuildHasher;
+use super::Operator;
 use crate::{
-    text::Operator, DeserializeError, DeserializeErrorKind, Encoding, Error, Scalar, TextTape,
-    TextToken,
+    buffer::{BufferError, BufferWindow, BufferWindowBuilder, SliceReader},
+    data::is_boundary,
+    util::{contains_zero_byte, count_chunk, repeat_byte},
+    Scalar,
 };
-use std::{
-    borrow::Cow,
-    collections::{hash_map::Entry, HashMap},
-};
-
-pub type KeyValue<'data, 'tokens, E> = (
-    ScalarReader<'data, E>,
-    Option<Operator>,
-    ValueReader<'data, 'tokens, E>,
-);
-
-pub type KeyValues<'data, 'tokens, E> = (ScalarReader<'data, E>, GroupEntry<'data, 'tokens, E>);
-
-/// Calculate what index the next value is. This assumes that a header + value
-/// are two separate values
-#[inline]
-fn next_idx_header(tokens: &[TextToken], idx: usize) -> usize {
-    match tokens[idx] {
-        TextToken::Array { end, .. } | TextToken::Object { end, .. } => end + 1,
-        TextToken::Operator(_) | TextToken::MixedContainer => idx + 2,
-        _ => idx + 1,
-    }
-}
+use std::io::Read;
 
-/// Calculate what index the next value is. This assumes that a header + value
-/// is one value
-#[inline]
-fn next_idx(tokens: &[TextToken], idx: usize) -> usize {
-    match tokens[idx] {
-        TextToken::Array { end, .. } | TextToken::Object { end, .. } => end + 1,
-        TextToken::Operator(_) => next_idx(tokens, idx + 1),
-        TextToken::Header(_) => next_idx_header(tokens, idx + 1),
-        _ => idx + 1,
-    }
-}
+/// Text token, the raw form of [TextToken](crate::text::TextToken)
+///
+/// These raw tokens are yielded as-is: open and close tokens are not matched
+/// up, nor is a determination made whether an open and close pair represents
+/// an array, an object, or both.
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+pub enum Token<'a> {
+    /// '{' or '['
+    Open,
 
-#[inline]
-fn next_idx_values(tokens: &[TextToken], idx: usize) -> usize {
-    match tokens[idx] {
-        TextToken::Array { end, .. } | TextToken::Object { end, .. } => end + 1,
-        _ => idx + 1,
-    }
-}
+    /// '}' or ']'
+    Close,
 
-#[inline]
-fn fields_len(tokens: &[TextToken], start_ind: usize, end_ind: usize) -> usize {
-    let mut ind = start_ind;
-    let mut count = 0;
-    while ind < end_ind {
-        let key_ind = ind;
-        if tokens[key_ind] == TextToken::MixedContainer {
-            return count;
-        }
+    /// An operator (eg: `foo=bar`)
+    Operator(Operator),
 
-        let value_ind = match tokens[key_ind + 1] {
-            TextToken::Operator(_) => key_ind + 2,
-            _ => key_ind + 1,
-        };
-        ind = next_idx(tokens, value_ind);
-        count += 1;
-    }
+    /// value that is not surrounded by quotes
+    Unquoted(Scalar<'a>),
 
-    count
+    /// value that is quoted
+    Quoted(Scalar<'a>),
 }
 
-#[inline]
-pub fn values_len(tokens: &[TextToken], start_ind: usize, end_ind: usize) -> usize {
-    let mut count = 0;
-    let mut ind = start_ind;
-    while ind < end_ind {
-        ind = next_idx_values(tokens, ind);
-        count += 1;
+impl<'a> Token<'a> {
+    /// Return the token as a scalar
+    #[inline]
+    pub fn as_scalar(&self) -> Option<Scalar<'a>> {
+        match self {
+            Token::Quoted(s) | Token::Unquoted(s) => Some(*s),
+            _ => None,
+        }
     }
-
-    count
 }
 
-type OpValue<'data, 'tokens, E> = (Option<Operator>, ValueReader<'data, 'tokens, E>);
-
-/// Iterator over values grouped by duplicate keys
-///
-/// See [FieldGroupsIter](crate::text::FieldGroupsIter) for a worked example
-pub struct GroupEntryIter<'data, 'tokens, 'parent, E> {
-    index: usize,
-    parent: &'parent GroupEntry<'data, 'tokens, E>,
+#[derive(Debug)]
+enum Utf8Bom {
+    Unknown,
+    NotPresent,
+    Present,
 }
 
-impl<'data, 'tokens, 'parent, E> Iterator for GroupEntryIter<'data, 'tokens, 'parent, E>
-where
-    E: Clone,
-{
-    type Item = (Option<Operator>, ValueReader<'data, 'tokens, E>);
-
-    fn next(&mut self) -> Option<Self::Item> {
-        match &self.parent {
-            GroupEntry::One((op, val)) => {
-                if self.index == 0 {
-                    self.index += 1;
-                    Some((*op, (*val).clone()))
-                } else {
-                    None
-                }
-            }
-            GroupEntry::Multiple(entries) => {
-                let result = entries.get(self.index);
-                self.index += 1;
-                result.map(|(op, val)| (*op, (*val).clone()))
-            }
-        }
-    }
-}
-
-/// Represents a group of values for duplicate keys
+/// Scan a [Read] implementation for text [Token]s
 ///
-/// May contain one or many values
+/// Example of computing the max nesting depth using a [TokenReader].
 ///
+/// ```rust
+/// use jomini::text::{TokenReader, Token};
+/// let data = b"foo={{ id=3 } {} { id = 4 }}";
+/// let mut reader = TokenReader::new(&data[..]);
+/// let mut max_depth = 0;
+/// let mut current_depth = 0;
+/// while let Some(token) = reader.next()? {
+///     match token {
+///         Token::Open => {
+///             current_depth += 1;
+///             max_depth = max_depth.max(current_depth);
+///         }
+///         Token::Close => current_depth -= 1,
+///         _ => {}
+///     }
+/// }
+/// assert_eq!(max_depth, 2);
+/// # Ok::<(), jomini::text::ReaderError>(())
+/// ```
 ///
-/// use jomini::TextTape;
-///
-/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
-/// let tape = TextTape::from_slice(b"name=a core=b core=c")?;
-/// let reader = tape.windows1252_reader();
-/// let mut fields = reader.field_groups();
-/// let first_group = fields.next();
-/// let first_key = first_group.as_ref().map(|(key, _)| key.read_str());
-/// assert_eq!(first_key.as_deref(), Some("name"));
-/// let first_values_len = first_group.as_ref().map(|(_, group)| group.len());
-/// assert_eq!(first_values_len, Some(1));
-/// let first_values = first_group.map(|(_, group)| {
-///     group.values()
-///         .filter_map(|(_op, val)| val.read_string().ok())
-///         .collect()
-/// });
-/// assert_eq!(first_values, Some(vec![String::from("a")]));
+/// Unlike a [TextTape](crate::TextTape), which will skip ghost objects, pair
+/// open and close tokens together, and recognize if a container is an object,
+/// array, or mixed -- the tokens yielded from a [TokenReader] are not fully
+/// formed. This is a much more raw view of the data that can be used to
+/// construct higher level parsers and deserializers that operate over a stream
+/// of data.
 ///
-/// let second_group = fields.next();
-/// let second_key = second_group.as_ref().map(|(key, _)| key.read_str());
-/// assert_eq!(second_key.as_deref(), Some("core"));
-/// let second_values = second_group.as_ref().map(|(_, group)| group.len());
-/// assert_eq!(second_values, Some(2));
-/// let second_values = second_group.map(|(_, group)| {
-///     group.values()
-///         .filter_map(|(_op, val)| val.read_string().ok())
-///         .collect()
-/// });
-/// assert_eq!(second_values, Some(vec![String::from("b"), String::from("c")]));
-/// # Ok(())
-/// # }
-/// ```
-pub enum GroupEntry<'data, 'tokens, E> {
-    /// Represents that the group is composed of only one value
-    ///
-    /// Most fields should only occur once, so this variant is optimized to
-    /// not require a memory allocation (unlike the `Multiple` variant).
-    One(OpValue<'data, 'tokens, E>),
-
-    /// Represents that the group is composed of several values
-    Multiple(Vec<OpValue<'data, 'tokens, E>>),
+/// The [TokenReader] is considered **experimental**, as it uses a different
+/// parsing algorithm geared towards parsing large save files. Ergonomic
+/// equivalents for more esoteric game syntax (like parameter definitions) have
+/// not yet been finalized. Game files can still be parsed with the experimental
+/// APIs, but these APIs may change in the future based on feedback. Since the
+/// binary format is not used for game files, the
+/// [binary::TokenReader](crate::binary::TokenReader) is not considered
+/// experimental.
+///
+/// [TokenReader] operates over a fixed size buffer, so using a
+/// [BufRead](std::io::BufRead) affords no benefits. An error will be returned
+/// for tokens that are impossible to fit within the buffer (eg: if provided
+/// with a 100 byte buffer but there is a string that is 101 bytes long).
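
Because the lexer refuses any token that cannot fit in its window, a caller streaming from a file generally wants a buffer sized for the longest expected quoted string. Below is a minimal sketch of wiring a `TokenReader` to an arbitrary `Read` source through the builder; the `Cursor` input and the 4096-byte buffer length are illustrative assumptions, not values the library prescribes.

```rust
use jomini::text::{Token, TokenReader};

fn main() -> Result<(), jomini::text::ReaderError> {
    // Any std::io::Read source works; a Cursor stands in for a save file here.
    let file = std::io::Cursor::new(&b"name=\"Rajas of India\""[..]);

    // A larger window lets longer quoted strings fit between buffer refills.
    let mut reader = TokenReader::builder().buffer_len(4096).build(file);

    // Count quoted values as they stream by.
    let mut quoted = 0;
    while let Some(token) = reader.next()? {
        if matches!(token, Token::Quoted(_)) {
            quoted += 1;
        }
    }
    assert_eq!(quoted, 1);
    Ok(())
}
```

The same builder accepts a previously allocated buffer via `buffer`, which pairs with `into_parts` to recycle the allocation across multiple files.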
+#[derive(Debug)]
+pub struct TokenReader<R> {
+    reader: R,
+    buf: BufferWindow,
+    utf8: Utf8Bom,
 }
 
-impl<'data, 'tokens, E> GroupEntry<'data, 'tokens, E> {
-    /// Returns an iterator that includes all the values
-    pub fn values<'parent>(&'parent self) -> GroupEntryIter<'data, 'tokens, 'parent, E> {
-        GroupEntryIter {
-            index: 0,
-            parent: self,
-        }
-    }
-
-    /// A group can never be empty so this returns false
-    pub fn is_empty(&self) -> bool {
-        false
-    }
-
-    /// Returns the number of values in the group
-    pub fn len(&self) -> usize {
-        match &self {
-            GroupEntry::One(_) => 1,
-            GroupEntry::Multiple(x) => x.len(),
+impl TokenReader<()> {
+    /// Read from a byte slice without memcpy's
+    #[inline]
+    pub fn from_slice(data: &[u8]) -> TokenReader<SliceReader<'_>> {
+        TokenReader {
+            reader: SliceReader::new(data),
+            buf: BufferWindow::from_slice(data),
+            utf8: Utf8Bom::Unknown,
         }
     }
 }
 
-/// All possible text reader variants
-#[derive(Debug, Clone)]
-pub enum Reader<'data, 'tokens, E> {
-    /// object reader
-    Object(ObjectReader<'data, 'tokens, E>),
-
-    /// array reader
-    Array(ArrayReader<'data, 'tokens, E>),
-
-    /// scalar reader
-    Scalar(ScalarReader<'data, E>),
-
-    /// value reader
-    Value(ValueReader<'data, 'tokens, E>),
-}
-
-impl<'data, 'tokens, E> Reader<'data, 'tokens, E>
+impl<R> TokenReader<R>
 where
-    E: Encoding + Clone,
+    R: Read,
 {
-    /// Interpret value as a string
+    /// Create a new text reader
     #[inline]
-    pub fn read_str(&self) -> Result<Cow<'data, str>, DeserializeError> {
-        match &self {
-            Reader::Scalar(x) => Ok(x.read_str()),
-            Reader::Value(x) => x.read_str(),
-            _ => Err(DeserializeError {
-                kind: DeserializeErrorKind::Unsupported(String::from("not a scalar")),
-            }),
-        }
+    pub fn new(reader: R) -> Self {
+        TokenReader::builder().build(reader)
     }
 
-    /// Interpret value as a string
+    /// Returns the byte position of the data stream that has been processed.
+    ///
+    /// ```rust
+    /// use jomini::{Scalar, text::{TokenReader, Token}};
+    /// let mut reader = TokenReader::new(&b"date=1444.11.11"[..]);
+    /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"date")));
+    /// assert_eq!(reader.position(), 4);
+    /// ```
     #[inline]
-    pub fn read_string(&self) -> Result<String, DeserializeError> {
-        match &self {
-            Reader::Scalar(x) => Ok(x.read_string()),
-            Reader::Value(x) => x.read_string(),
-            _ => Err(DeserializeError {
-                kind: DeserializeErrorKind::Unsupported(String::from("not a scalar")),
-            }),
-        }
+    pub fn position(&self) -> usize {
+        self.buf.position()
     }
 
-    /// Interpret value as a scalar
     #[inline]
-    pub fn read_scalar(&self) -> Result<Scalar<'data>, DeserializeError> {
-        match &self {
-            Reader::Scalar(x) => Ok(x.read_scalar()),
-            Reader::Value(x) => x.read_scalar(),
-            _ => Err(DeserializeError {
-                kind: DeserializeErrorKind::Unsupported(String::from("not a scalar")),
-            }),
+    unsafe fn next_opt(&mut self) -> (Option<Token>, Option<ReaderError>) {
+        #[derive(Debug)]
+        enum ParseState {
+            None,
+            Quote,
+            Unquoted,
         }
-    }
-}
 
-/// Iterator over fields of an object grouped by key
-///
-/// Since objects can have duplicated keys across fields, this iterator
-/// consolidates them such that all values with the same key are grouped
-/// together in the order that they appear in the object. Key order is
-/// also equivalent, except that already seen keys will be skipped, as
-/// those values have already been seen in an earlier group.
-/// -/// The process of grouping values together is more expensive than simply -/// iterating the keys in order, so when possible prefer -/// [`ObjectReader::fields()`](crate::text::ObjectReader::fields) over -/// [`ObjectReader::field_groups()`](crate::text::ObjectReader::field_groups). -/// -/// These groups can be easily iterated: -/// -/// ``` -/// use jomini::TextTape; -/// -/// # fn main() -> Result<(), Box> { -/// let tape = TextTape::from_slice(b"name=a core=b core=c")?; -/// let reader = tape.windows1252_reader(); -/// for (key, group) in reader.field_groups() { -/// match key.read_str().as_ref() { -/// "name" => assert_eq!(group.len(), 1), -/// "core" => assert_eq!(group.len(), 2), -/// x => panic!("unexpected key: {}", x), -/// } -/// } -/// # Ok(()) -/// # } -/// ``` -/// -/// And picked apart: -/// -/// ``` -/// use jomini::TextTape; -/// -/// # fn main() -> Result<(), Box> { -/// let tape = TextTape::from_slice(b"name=a core=b core=c")?; -/// let reader = tape.windows1252_reader(); -/// let mut fields = reader.field_groups(); -/// let first_group = fields.next(); -/// let first_key = first_group.as_ref().map(|(key, _)| key.read_str()); -/// assert_eq!(first_key.as_deref(), Some("name")); -/// let first_values_len = first_group.as_ref().map(|(_, group)| group.len()); -/// assert_eq!(first_values_len, Some(1)); -/// let first_values = first_group.map(|(_, group)| { -/// group.values() -/// .filter_map(|(_op, val)| val.read_string().ok()) -/// .collect() -/// }); -/// assert_eq!(first_values, Some(vec![String::from("a")])); -/// -/// let second_group = fields.next(); -/// let second_key = second_group.as_ref().map(|(key, _)| key.read_str()); -/// assert_eq!(second_key.as_deref(), Some("core")); -/// let second_values = second_group.as_ref().map(|(_, group)| group.len()); -/// assert_eq!(second_values, Some(2)); -/// let second_values = second_group.map(|(_, group)| { -/// group.values() -/// .filter_map(|(_op, val)| val.read_string().ok()) -/// .collect() -/// }); -/// assert_eq!(second_values, Some(vec![String::from("b"), String::from("c")])); -/// # Ok(()) -/// # } -/// ``` -pub struct FieldGroupsIter<'data, 'tokens, E> { - key_indices: HashMap<&'data [u8], Vec>, FnvBuildHasher>, - fields: FieldsIter<'data, 'tokens, E>, -} + let mut state = ParseState::None; + let mut ptr = self.buf.start; + loop { + let end = self.buf.end; + let (carry_over, offset) = match state { + ParseState::None => 'eof: loop { + if ptr == end { + break (0, 0); + } -impl<'data, 'tokens, E> FieldGroupsIter<'data, 'tokens, E> -where - E: Encoding + Clone, -{ - fn new(reader: &ObjectReader<'data, 'tokens, E>) -> Self { - // Using the fnv hasher improved throughput of the eu4 json benchmark - // by over 15%. 
- let mut key_indices = - HashMap::with_capacity_and_hasher(reader.fields_len(), FnvBuildHasher::default()); - for (key, op, val) in reader.fields() { - let entry = key_indices.entry(key.read_scalar().as_bytes()); - - match entry { - Entry::Vacant(x) => { - x.insert(Vec::with_capacity(0)); + 'inner: loop { + match *ptr { + c @ b' ' | c @ b'\t' => { + ptr = ptr.add(1); + loop { + if ptr == end { + break 'eof (0, 0); + } + + if *ptr != c { + break; + } + + ptr = ptr.add(1) + } + } + b'\n' | b'\r' | b';' => { + ptr = ptr.add(1); + break 'inner; + } + b'#' => { + let start_ptr = ptr; + ptr = ptr.add(1); + loop { + if ptr == end { + let carry_over = end.offset_from(start_ptr) as usize; + break 'eof (carry_over, 0); + } + + if *ptr == b'\n' { + break; + } + + ptr = ptr.add(1) + } + } + b'{' => { + self.buf.advance_to(ptr.add(1)); + return (Some(Token::Open), None); + } + b'}' => { + self.buf.advance_to(ptr.add(1)); + return (Some(Token::Close), None); + } + b'"' => { + ptr = ptr.add(1); + let start_ptr = ptr; + loop { + if ptr == end { + state = ParseState::Quote; + let carry_over = end.offset_from(start_ptr) as usize; + break 'eof (carry_over, carry_over); + } + + if *ptr == b'\\' { + let advance = end.offset_from(ptr).min(2); + ptr = ptr.offset(advance); + if ptr == end { + state = ParseState::Quote; + let carry_over = end.offset_from(start_ptr) as usize; + break 'eof (carry_over, carry_over.max(2) - 2); + } + } else if *ptr != b'"' { + ptr = ptr.add(1); + } else { + self.buf.advance_to(ptr.add(1)); + let scalar = self.buf.get(start_ptr..ptr); + return (Some(Token::Quoted(scalar)), None); + } + } + } + b'@' => { + let start_ptr = ptr; + ptr = ptr.add(1); + if ptr == end { + break 'eof (1, 0); + } + + if *ptr == b'[' { + ptr = ptr.add(1); + loop { + if ptr == end { + let carry_over = end.offset_from(start_ptr) as usize; + break 'eof (carry_over, 0); + } else if *ptr == b']' { + ptr = ptr.add(1); + self.buf.advance_to(ptr); + let scalar = self.buf.get(start_ptr..ptr); + return (Some(Token::Unquoted(scalar)), None); + } else { + ptr = ptr.add(1); + } + } + } else { + loop { + if ptr == end { + let carry_over = end.offset_from(start_ptr) as usize; + state = ParseState::Unquoted; + break 'eof (carry_over, carry_over); + } else if !is_boundary(*ptr) { + ptr = ptr.add(1); + } else { + self.buf.advance_to(ptr); + let scalar = self.buf.get(start_ptr..ptr); + return (Some(Token::Unquoted(scalar)), None); + } + } + } + } + b'=' => { + ptr = ptr.add(1); + if ptr == end { + break 'eof (1, 0); + } + + if *ptr != b'=' { + self.buf.advance_to(ptr); + return (Some(Token::Operator(Operator::Equal)), None); + } else { + self.buf.advance_to(ptr.add(1)); + return (Some(Token::Operator(Operator::Exact)), None); + } + } + b'<' => { + ptr = ptr.add(1); + if ptr == end { + break 'eof (1, 0); + } + + if *ptr != b'=' { + self.buf.advance_to(ptr); + return (Some(Token::Operator(Operator::LessThan)), None); + } else { + self.buf.advance_to(ptr.add(1)); + return (Some(Token::Operator(Operator::LessThanEqual)), None); + } + } + b'!' => { + ptr = ptr.add(1); + if ptr == end { + break 'eof (1, 0); + } + + if *ptr == b'=' { + ptr = ptr.add(1); + } + + self.buf.advance_to(ptr); + return (Some(Token::Operator(Operator::NotEqual)), None); + } + b'?' 
=> { + ptr = ptr.add(1); + if ptr == end { + break 'eof (1, 0); + } + + if *ptr == b'=' { + ptr = ptr.add(1); + } + + self.buf.advance_to(ptr); + return (Some(Token::Operator(Operator::Exists)), None); + } + b'>' => { + ptr = ptr.add(1); + if ptr == end { + break 'eof (1, 0); + } + + if *ptr != b'=' { + self.buf.advance_to(ptr); + return (Some(Token::Operator(Operator::GreaterThan)), None); + } else { + self.buf.advance_to(ptr.add(1)); + return ( + Some(Token::Operator(Operator::GreaterThanEqual)), + None, + ); + } + } + b'\xef' if matches!(self.utf8, Utf8Bom::Unknown) => { + match self.buf.window().get(..3) { + Some([0xef, 0xbb, 0xbf]) => { + self.utf8 = Utf8Bom::Present; + ptr = ptr.add(3); + break 'inner; + } + Some(_) => self.utf8 = Utf8Bom::NotPresent, + None => break 'eof (self.buf.window_len(), 0), + } + } + _ => { + let start_ptr = ptr; + ptr = ptr.add(1); + loop { + if ptr == end { + state = ParseState::Unquoted; + let carry_over = end.offset_from(start_ptr) as usize; + break 'eof (carry_over, carry_over); + } else if !is_boundary(*ptr) { + ptr = ptr.add(1); + } else { + self.buf.advance_to(ptr); + let scalar = self.buf.get(start_ptr..ptr); + return (Some(Token::Unquoted(scalar)), None); + } + } + } + } + } + }, + ParseState::Quote { .. } => { + while ptr < end { + if *ptr == b'\\' { + let advance = end.offset_from(ptr).min(2); + ptr = ptr.offset(advance); + } else if *ptr != b'"' { + ptr = ptr.add(1); + } else { + self.buf.advance_to(ptr.add(1)); + let scalar = self.buf.get(self.buf.buf.as_ptr()..ptr); + return (Some(Token::Quoted(scalar)), None); + } + } + + // buffer or prior read too small + (self.buf.window_len(), self.buf.window_len()) } - Entry::Occupied(mut x) => { - x.get_mut().push((op, val)); + ParseState::Unquoted { .. } => { + while ptr < end { + if !is_boundary(*ptr) { + ptr = ptr.add(1); + } else { + self.buf.advance_to(ptr); + let scalar = self.buf.get(self.buf.buf.as_ptr()..ptr); + return (Some(Token::Unquoted(scalar)), None); + } + } + + // buffer or prior read too small + (self.buf.window_len(), self.buf.window_len()) } + }; + + self.buf.advance_to(self.buf.end.sub(carry_over)); + match self.buf.fill_buf(&mut self.reader) { + Ok(0) => match state { + ParseState::None => { + // if we carried over data that isn't a comment, we + // should have made forward progress. + if carry_over == 0 || *self.buf.start == b'#' { + return (None, None); + } else { + return (None, Some(self.eof_error())); + } + } + ParseState::Quote { .. } => return (None, Some(self.eof_error())), + ParseState::Unquoted { .. 
} => {
+                        let scalar = std::slice::from_raw_parts(self.buf.start, carry_over);
+                        self.buf.advance_to(self.buf.end);
+                        return (Some(Token::Unquoted(Scalar::new(scalar))), None);
+                    }
+                },
+                Ok(_) => ptr = self.buf.start.add(offset),
+                Err(e) => return (None, Some(self.buffer_error(e))),
+            }
+        }
+    }
 
-        let fields = reader.fields();
-
-        FieldGroupsIter {
-            key_indices,
-            fields,
-        }
-    }
-
-    /// See [the other `remainder` documentation](crate::text::FieldsIter::remainder)
-    pub fn remainder(&self) -> ArrayReader<'data, 'tokens, E> {
-        self.fields.remainder()
-    }
-}
-
-impl<'data, 'tokens, E> Iterator for FieldGroupsIter<'data, 'tokens, E>
-where
-    E: Encoding + Clone,
-{
-    type Item = KeyValues<'data, 'tokens, E>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        loop {
-            let (key, op, value) = self.fields.next()?;
-
-            if let Some((_key, mut entries)) =
-                self.key_indices.remove_entry(key.read_scalar().as_bytes())
-            {
-                if entries.is_empty() {
-                    return Some((key, GroupEntry::One((op, value))));
-                } else {
-                    entries.insert(0, (op, value));
-                    return Some((key, GroupEntry::Multiple(entries)));
-                }
-            }
-        }
-    }
-
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        (self.key_indices.len(), None)
-    }
-}
+    /// Advance a given number of bytes and return them.
+    ///
+    /// The internal buffer must be large enough to accommodate all bytes.
+    ///
+    /// ```rust
+    /// use jomini::text::{TokenReader, ReaderErrorKind};
+    /// let mut reader = TokenReader::new(&b"EU4txt"[..]);
+    /// assert_eq!(reader.read_bytes(6).unwrap(), &b"EU4txt"[..]);
+    /// assert!(matches!(reader.read_bytes(1).unwrap_err().kind(), ReaderErrorKind::Eof));
+    /// ```
+    #[inline]
+    pub fn read_bytes(&mut self, bytes: usize) -> Result<&[u8], ReaderError> {
+        while self.buf.window_len() < bytes {
+            match self.buf.fill_buf(&mut self.reader) {
+                Ok(0) => return Err(self.eof_error()),
+                Ok(_) => {}
+                Err(e) => return Err(self.buffer_error(e)),
+            }
+        }
+
+        let input = unsafe { std::slice::from_raw_parts(self.buf.start, bytes) };
+        self.buf.advance(bytes);
+        Ok(input)
+    }
 
-/// Iterator over fields of an object in the order that they appear
-///
-/// Since objects can have duplicated keys across fields, this iterator
-/// may yield items that have duplicate keys.
-///
-/// Fields can be easily iterated:
-///
-/// ```
-/// use jomini::TextTape;
-///
-/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
-/// let tape = TextTape::from_slice(b"name=a core=b core=c")?;
-/// let reader = tape.windows1252_reader();
-/// let (names, cores) = reader
-///     .fields()
-///     .fold((0, 0), |(names, cores), (key, _op, _value)| {
-///         match key.read_str().as_ref() {
-///             "name" => (names + 1, cores),
-///             "core" => (names, cores + 1),
-///             x => panic!("unexpected key: {}", x),
-///         }
-///     });
-/// assert_eq!((1, 2), (names, cores));
-/// # Ok(())
-/// # }
-/// ```
-///
-/// And picked apart:
-///
-/// ```
-/// use jomini::TextTape;
-///
-/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
-/// let tape = TextTape::from_slice(b"name=a core=b core=c")?;
-/// let reader = tape.windows1252_reader();
-/// let mut fields = reader.fields();
-/// let (first_key, _op, first_val) = fields.next().unwrap();
-/// assert_eq!(first_key.read_str(), "name");
-/// assert_eq!(first_val.read_str().ok().as_deref(), Some("a"));
-/// # Ok(())
-/// # }
-/// ```
-pub struct FieldsIter<'data, 'tokens, E> {
-    token_ind: usize,
-    end_ind: usize,
-    tokens: &'tokens [TextToken<'data>],
-    encoding: E,
-}
+    /// Advance through the containing block until the closing token is consumed
+    ///
+    /// ```rust
+    /// use jomini::{Scalar, text::{TokenReader, Token, Operator}};
+    /// let mut reader = TokenReader::new(&b"foo={{bar={}}} qux=1"[..]);
+    /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"foo")));
+    /// assert_eq!(reader.read().unwrap(), Token::Operator(Operator::Equal));
+    /// assert_eq!(reader.read().unwrap(), Token::Open);
+    /// assert!(reader.skip_container().is_ok());
+    /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"qux")));
+    /// assert_eq!(reader.read().unwrap(), Token::Operator(Operator::Equal));
+    /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"1")));
+    /// ```
+    #[inline]
+    pub fn skip_container(&mut self) -> Result<(), ReaderError> {
+        enum SkipState {
+            None,
+            Quote,
+            Comment,
+        }
+
+        let mut state = SkipState::None;
+        let mut depth = 1;
+        let mut ptr = self.buf.start;
+        loop {
+            let end = self.buf.end;
+            unsafe {
+                'refill: loop {
+                    match state {
+                        SkipState::None => 'new_state: loop {
+                            while end.offset_from(ptr) > 8 {
+                                // process 8 bytes at a time, which reduced
+                                // latency of this function in EU4 saves by 50%
+                                // (a 7% reduction overall).
+                                let data = ptr.cast::<u64>().read_unaligned();
+                                let has_quote = contains_zero_byte(data ^ repeat_byte(b'"'));
+                                let has_comment = contains_zero_byte(data ^ repeat_byte(b'#'));
+                                if has_quote || has_comment {
+                                    break;
+                                }
+
+                                let has_close = contains_zero_byte(data ^ repeat_byte(b'}'));
+                                let closes = if has_close {
+                                    count_chunk(data, b'}') as i32
+                                } else {
+                                    0
+                                };
+
+                                let new_depth = depth - closes;
+                                if new_depth < 1 {
+                                    break;
+                                }
+                                depth = new_depth;
+
+                                let has_open = contains_zero_byte(data ^ repeat_byte(b'{'));
+                                let opens = if has_open {
+                                    count_chunk(data, b'{') as i32
+                                } else {
+                                    0
+                                };
+
+                                depth += opens;
+                                ptr = ptr.add(8);
+                            }
+
+                            if ptr == end {
+                                break 'refill;
+                            }
+
+                            let val = *ptr;
+                            ptr = ptr.add(1);
+                            match val {
+                                b'{' => depth += 1,
+                                b'}' => {
+                                    depth -= 1;
+                                    if depth == 0 {
+                                        self.buf.advance_to(ptr);
+                                        return Ok(());
+                                    }
+                                }
+                                b'"' => {
+                                    state = SkipState::Quote;
+                                    break 'new_state;
+                                }
+                                b'#' => {
+                                    state = SkipState::Comment;
+                                    break 'new_state;
+                                }
+                                _ => {}
+                            }
+                        },
+                        SkipState::Quote => loop {
+                            if ptr == end {
+                                break 'refill;
+                            }
+
+                            if *ptr == b'\\' {
+                                if end.offset_from(ptr) <= 2 {
+                                    break 'refill;
+                                }
+                                ptr = ptr.add(2);
+                            } else if *ptr != b'"' {
+                                ptr = ptr.add(1);
+                            } else {
+                                ptr = ptr.add(1);
+                                state = SkipState::None;
+                                break;
+                            }
+                        },
+                        SkipState::Comment => loop {
+                            if ptr == end {
+                                break 'refill;
+                            }
+
+                            if *ptr == b'\n' {
+                                ptr = ptr.add(1);
+                                state = SkipState::None;
+                                break;
+                            }
+
+                            ptr = ptr.add(1)
+                        },
+                    }
+                }
+            }
+
+            self.buf.advance_to(ptr);
+            match self.buf.fill_buf(&mut self.reader) {
+                Ok(0) => return Err(self.eof_error()),
+                Err(e) => return Err(self.buffer_error(e)),
+                Ok(_) => ptr = self.buf.start,
+            }
+        }
+    }
 
-impl<'data, 'tokens, E> FieldsIter<'data, 'tokens, E>
-where
-    E: Encoding + Clone,
-{
-    fn new(reader: &ObjectReader<'data, 'tokens, E>) -> Self {
-        FieldsIter {
-            token_ind: reader.start_ind,
-            end_ind: reader.end_ind,
-            tokens: reader.tokens,
-            encoding: reader.encoding.clone(),
-        }
-    }
-
-    /// Returns the remaining values from an object if the container is an
-    /// object that transitions into an array.
-    pub fn remainder(&self) -> ArrayReader<'data, 'tokens, E> {
-        let start = self
-            .tokens
-            .get(self.token_ind)
-            .map(|x| match x {
-                TextToken::MixedContainer => self.token_ind + 1,
-                TextToken::End(y) => {
-                    if let Some(TextToken::Array { .. }) = self.tokens.get(*y) {
-                        *y + 1
-                    } else {
-                        self.token_ind
-                    }
-                }
-                _ => self.token_ind,
-            })
-            .unwrap_or(self.end_ind);
-
-        ArrayReader {
-            start_ind: start,
-            end_ind: self.end_ind,
-            encoding: self.encoding.clone(),
-            tokens: self.tokens,
-        }
-    }
-}
+    /// Skip any trailing data associated with the unquoted value. Useful for
+    /// skipping an unquoted value that may be serving as a header.
+    ///
+    /// In the below example the `rgb { 1 2 3 }` will first be parsed as an
+    /// unquoted `rgb`, but the `{ 1 2 3 }` needs to be skipped as well, as it
+    /// is tied to `rgb`.
+    ///
+    /// ```rust
+    /// use jomini::{Scalar, text::{TokenReader, Token, Operator}};
+    /// let mut reader = TokenReader::new(&b"color = rgb { 1 2 3 } foo=bar"[..]);
+    /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"color")));
+    /// assert_eq!(reader.read().unwrap(), Token::Operator(Operator::Equal));
+    /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"rgb")));
+    /// assert!(reader.skip_unquoted_value().is_ok());
+    /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"foo")));
+    /// assert_eq!(reader.read().unwrap(), Token::Operator(Operator::Equal));
+    /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"bar")));
+    /// ```
+    #[inline]
+    pub fn skip_unquoted_value(&mut self) -> Result<(), ReaderError> {
+        loop {
+            unsafe {
+                let mut ptr = self.buf.start;
+                let end = self.buf.end;
+
+                if end.offset_from(ptr) >= 4 {
+                    let word = ptr.cast::<u32>().read_unaligned().to_le();
+
+                    // 50% of EU4 values are followed by this whitespace sequence
+                    if word == 0x0909090A {
+                        // \n\t\t\t
+                        ptr = ptr.add(4);
+                    }
+                }
+
+                while ptr < end {
+                    match *ptr {
+                        b'{' => {
+                            self.buf.advance_to(ptr.add(1));
+                            return self.skip_container();
+                        }
+                        b' ' | b'\t' | b'\n' | b'\r' | b';' => {
+                            ptr = ptr.add(1);
+                        }
+                        _ => return Ok(()),
+                    }
+                }
+
+                self.buf.advance_to(end);
+                match self.buf.fill_buf(&mut self.reader) {
+                    Ok(0) => return Ok(()),
+                    Err(e) => return Err(self.buffer_error(e)),
+                    Ok(_) => {}
+                }
+            }
+        }
+    }
 
-impl<'data, 'tokens, E> Iterator for FieldsIter<'data, 'tokens, E>
-where
-    E: Encoding + Clone,
-{
-    type Item = KeyValue<'data, 'tokens, E>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        if self.token_ind >= self.end_ind {
-            return None;
-        }
-
-        let key_ind = self.token_ind;
-        let token = self.tokens[key_ind].clone();
-        let key_scalar = match token {
-            TextToken::Quoted(x)
-            | TextToken::Unquoted(x)
-            | TextToken::Parameter(x)
-            | TextToken::UndefinedParameter(x) => x,
-            TextToken::MixedContainer => {
-                return None;
-            }
-            _ => {
-                // this is a broken invariant, so we safely recover by saying the object
-                // has no more fields
-                debug_assert!(false, "All keys should be scalars, not {:?}", &token);
-                return None;
-            }
-        };
-
-        let key_reader = ScalarReader {
-            scalar: key_scalar,
-            token,
-            encoding: self.encoding.clone(),
-        };
-
-        let (op, value_ind) = match self.tokens[key_ind + 1] {
-            TextToken::Operator(x) => (Some(x), key_ind + 2),
-            _ => (None, key_ind + 1),
-        };
-
-        let value_reader = ValueReader {
-            value_ind,
-            tokens: self.tokens,
-            encoding: self.encoding.clone(),
-        };
-        self.token_ind = next_idx(self.tokens, value_ind);
-        Some((key_reader, op, value_reader))
-    }
-
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        let len = fields_len(self.tokens, self.token_ind, self.end_ind);
-        (len, None)
-    }
-}
-
-/// A reader for objects
-#[derive(Debug, Clone)]
-pub struct ObjectReader<'data, 'tokens, E> {
-    start_ind: usize,
-    end_ind: usize,
-    tokens: &'tokens [TextToken<'data>],
-    encoding: E,
-}
-
-impl<'data, 'tokens, E> ObjectReader<'data, 'tokens, E>
-where
-    E: Encoding + Clone,
-{
-    /// Create a new object reader from parsed data with encoded strings
-    pub fn new(tape: &'tokens TextTape<'data>, encoding: E) -> Self {
-        let tokens = tape.tokens();
-        ObjectReader {
-            tokens,
-            end_ind: tokens.len(),
-            start_ind: 0,
-            encoding,
-        }
-    }
-
-    /// Return the number of tokens contained within the object
+    /// Consume the token reader and return the internal buffer and reader. This
+    /// allows the buffer to be reused.
/// - /// ``` - /// use jomini::TextTape; + /// ```rust + /// use jomini::text::{TokenReader}; + /// let data = b"EU4txt"; + /// let mut reader = TokenReader::new(&data[..]); + /// assert_eq!(reader.read_bytes(6).unwrap(), &data[..]); /// - /// # fn main() -> Result<(), Box> { - /// let tape = TextTape::from_slice(b"obj={1} foo=bar")?; - /// let reader = tape.windows1252_reader(); - /// assert_eq!(reader.tokens_len(), 6); - /// # Ok(()) - /// # } + /// let (buf, _) = reader.into_parts(); + /// let data = b"HOI4txt"; + /// let mut reader = TokenReader::builder().buffer(buf).build(&data[..]); + /// assert_eq!(reader.read_bytes(7).unwrap(), &data[..]); /// ``` - pub fn tokens_len(&self) -> usize { - self.end_ind - self.start_ind + #[inline] + pub fn into_parts(self) -> (Box<[u8]>, R) { + (self.buf.buf, self.reader) } - /// Deserialize from the object reader + /// Read the next token in the stream. Will error if not enough data remains + /// to decode a token. /// + /// ```rust + /// use jomini::{Scalar, text::{TokenReader, Token, ReaderErrorKind, Operator}}; + /// let mut reader = TokenReader::new(&b"date=1444.11.11"[..]); + /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"date"))); + /// assert_eq!(reader.read().unwrap(), Token::Operator(Operator::Equal)); + /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"1444.11.11"))); + /// assert!(matches!(reader.read().unwrap_err().kind(), ReaderErrorKind::Eof)); /// ``` - /// use jomini::TextTape; - /// use serde::Deserialize; - /// - /// # fn main() -> Result<(), Box> { - /// #[derive(Debug, Clone, Deserialize, PartialEq)] - /// pub struct Obj { - /// foo: String, - /// } - /// - /// let tape = TextTape::from_slice(b"obj={foo=bar}")?; - /// let reader = tape.windows1252_reader(); - /// let mut fields = reader.fields(); - /// let (_, _, obj_value) = fields.next().unwrap(); - /// let obj_reader = obj_value.read_object().unwrap(); - /// let result: Obj = obj_reader.deserialize().unwrap(); - /// assert_eq!(result, Obj { foo: "bar".to_string() }); - /// # Ok(()) - /// # } - /// ``` - #[cfg(feature = "derive")] - pub fn deserialize(&self) -> Result - where - T: serde::Deserialize<'data>, - { - T::deserialize(&crate::TextDeserializer::from_reader(self)) - } - - /// Return the number of key value pairs that the object contains. 
-    pub fn fields_len(&self) -> usize {
-        fields_len(self.tokens, self.start_ind, self.end_ind)
-    }
-
-    /// Iterator over fields as they appear in the object
-    ///
-    /// See [FieldsIter](crate::text::FieldsIter) for a worked example
     #[inline]
-    pub fn fields(&self) -> FieldsIter<'data, 'tokens, E> {
-        FieldsIter::new(self)
+    pub fn read(&mut self) -> Result<Token, ReaderError> {
+        // Workaround for borrow checker :(
+        let s = unsafe { &mut *(self as *mut TokenReader<R>) };
+        match unsafe { self.next_opt() } {
+            (Some(x), _) => Ok(x),
+            (None, None) => Err(s.eof_error()),
+            (None, Some(e)) => Err(e),
+        }
     }
 
-    /// Iterator over fields that are grouped by key
+    /// Read a token, returning `None` when all the data has been consumed
     ///
-    /// See [FieldGroupsIter](crate::text::FieldGroupsIter) for a worked example
-    #[inline]
-    pub fn field_groups(&self) -> FieldGroupsIter<'data, 'tokens, E> {
-        FieldGroupsIter::new(self)
-    }
-}
-
-/// A text reader that wraps an underlying scalar value
-#[derive(Debug, Clone)]
-pub struct ScalarReader<'data, E> {
-    scalar: Scalar<'data>,
-    token: TextToken<'data>,
-    encoding: E,
-}
-
-impl<'data, E> ScalarReader<'data, E>
-where
-    E: Encoding,
-{
-    /// Decode the data with a given string encoding
-    #[inline]
-    pub fn read_str(&self) -> Cow<'data, str> {
-        self.encoding.decode(self.scalar.as_bytes())
-    }
-
-    /// Decode the data with a given string encoding
+    /// ```rust
+    /// use jomini::{Scalar, text::{TokenReader, Token, Operator}};
+    /// let mut reader = TokenReader::new(&b"date=1444.11.11"[..]);
+    /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"date")));
+    /// assert_eq!(reader.read().unwrap(), Token::Operator(Operator::Equal));
+    /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"1444.11.11")));
+    /// assert_eq!(reader.next().unwrap(), None);
+    /// ```
     #[inline]
-    pub fn read_string(&self) -> String {
-        self.encoding.decode(self.scalar.as_bytes()).into_owned()
+    pub fn next(&mut self) -> Result<Option<Token>, ReaderError> {
+        match unsafe { self.next_opt() } {
+            (Some(x), _) => Ok(Some(x)),
+            (None, None) => Ok(None),
+            (None, Some(e)) => Err(e),
+        }
     }
 
-    /// Return the underlying scalar
-    #[inline]
-    pub fn read_scalar(&self) -> Scalar<'data> {
-        self.scalar
+    #[cold]
+    #[inline(never)]
+    pub(crate) fn eof_error(&self) -> ReaderError {
+        ReaderError {
+            position: self.position(),
+            kind: ReaderErrorKind::Eof,
+        }
     }
 
-    /// Return the token that the reader is abstracting
-    #[inline]
-    pub fn token(&self) -> &TextToken<'data> {
-        &self.token
+    #[cold]
+    #[inline(always)]
+    fn buffer_error(&self, e: BufferError) -> ReaderError {
+        ReaderError {
+            position: self.position(),
+            kind: ReaderErrorKind::from(e),
+        }
     }
 }
 
-/// A text reader for a text value
-#[derive(Debug, Clone)]
-pub struct ValueReader<'data, 'tokens, E> {
-    value_ind: usize,
-    tokens: &'tokens [TextToken<'data>],
-    encoding: E,
-}
-
-impl<'data, 'tokens, E> ValueReader<'data, 'tokens, E> {
-    /// Return the token that the reader is abstracting
-    #[inline]
-    pub fn token(&self) -> &TextToken<'data> {
-        &self.tokens[self.value_ind]
-    }
-
-    #[cfg(feature = "derive")]
-    pub(crate) fn next(&mut self) -> Option<&TextToken<'data>> {
-        self.value_ind += 1;
-        self.tokens.get(self.value_ind)
+impl TokenReader<()> {
+    /// Initializes a default [TokenReaderBuilder]
+    pub fn builder() -> TokenReaderBuilder {
+        TokenReaderBuilder::default()
     }
 }
 
-impl<'data, 'tokens, E> Encoding for ValueReader<'data, 'tokens, E>
-where
-    E: Encoding,
-{
-    #[inline]
-    fn decode<'a>(&self, data: &'a [u8]) -> Cow<'a, str> {
-        self.encoding.decode(data)
-    }
+/// Creates a text token reader
+#[derive(Debug, Default)]
+pub struct TokenReaderBuilder {
+    buffer: BufferWindowBuilder,
 }
 
-impl<'data, 'tokens, E> ValueReader<'data, 'tokens, E>
-where
-    E: Encoding + Clone,
-{
-    fn raw_str(&self) -> Option<Cow<'data, str>> {
-        match self.tokens[self.value_ind] {
-            TextToken::Header(s)
-            | TextToken::Unquoted(s)
-            | TextToken::Quoted(s)
-            | TextToken::Parameter(s)
-            | TextToken::UndefinedParameter(s) => Some(self.encoding.decode(s.as_bytes())),
-            TextToken::Operator(s) => Some(Cow::Borrowed(s.symbol())),
-            _ => None,
-        }
-    }
-
-    /// Interpret the current value as string
-    #[inline]
-    pub fn read_str(&self) -> Result<Cow<'data, str>, DeserializeError> {
-        self.raw_str().ok_or_else(|| DeserializeError {
-            kind: DeserializeErrorKind::Unsupported(String::from("not a string")),
-        })
-    }
-
-    /// Interpret the current value as string
+impl TokenReaderBuilder {
+    /// Set the fixed size buffer to the given buffer
     #[inline]
-    pub fn read_string(&self) -> Result<String, DeserializeError> {
-        self.raw_str()
-            .map(String::from)
-            .ok_or_else(|| DeserializeError {
-                kind: DeserializeErrorKind::Unsupported(String::from("not a string")),
-            })
+    pub fn buffer(mut self, val: Box<[u8]>) -> TokenReaderBuilder {
+        self.buffer = self.buffer.buffer(val);
+        self
     }
 
-    /// Interpret the current value as a scalar
+    /// Set the length of the buffer if no buffer is provided
     #[inline]
-    pub fn read_scalar(&self) -> Result<Scalar<'data>, DeserializeError> {
-        self.tokens[self.value_ind]
-            .as_scalar()
-            .ok_or_else(|| DeserializeError {
-                kind: DeserializeErrorKind::Unsupported(String::from("not a scalar")),
-            })
+    pub fn buffer_len(mut self, val: usize) -> TokenReaderBuilder {
+        self.buffer = self.buffer.buffer_len(val);
+        self
    }
 
-    /// Interpret the current value as an object
+    /// Create a text token reader around a given reader.
     #[inline]
-    pub fn read_object(&self) -> Result<ObjectReader<'data, 'tokens, E>, DeserializeError> {
-        match self.tokens[self.value_ind] {
-            TextToken::Object { end, .. } => Ok(ObjectReader {
-                tokens: self.tokens,
-                start_ind: self.value_ind + 1,
-                end_ind: end,
-                encoding: self.encoding.clone(),
-            }),
-
-            TextToken::Array { end, .. } => Ok(ObjectReader {
-                tokens: self.tokens,
-                start_ind: end,
-                end_ind: end,
-                encoding: self.encoding.clone(),
-            }),
-
-            _ => Err(DeserializeError {
-                kind: DeserializeErrorKind::Unsupported(String::from("not an object")),
-            }),
-        }
-    }
-
-    /// Interpret the current value as an array
-    #[inline]
-    pub fn read_array(&self) -> Result<ArrayReader<'data, 'tokens, E>, DeserializeError> {
-        match self.tokens[self.value_ind] {
-            TextToken::Object { end, mixed: true } => {
-                let mut start_ind = self.value_ind + 1;
-                while self.tokens.get(start_ind) != Some(&TextToken::MixedContainer) {
-                    start_ind = next_idx(self.tokens, start_ind);
-                }
-
-                Ok(ArrayReader {
-                    tokens: self.tokens,
-                    start_ind: start_ind + 1,
-                    end_ind: end,
-                    encoding: self.encoding.clone(),
-                })
-            }
-            TextToken::Array { end, .. } | TextToken::Object { end, .. } => Ok(ArrayReader {
-                tokens: self.tokens,
-                start_ind: self.value_ind + 1,
-                end_ind: end,
-                encoding: self.encoding.clone(),
-            }),
-
-            // A header can be seen as a two element array
-            TextToken::Header(_) => Ok(ArrayReader {
-                tokens: self.tokens,
-                start_ind: self.value_ind,
-                end_ind: next_idx(self.tokens, self.value_ind + 1),
-                encoding: self.encoding.clone(),
-            }),
-
-            _ => Err(DeserializeError {
-                kind: DeserializeErrorKind::Unsupported(String::from("not an array")),
-            }),
-        }
-    }
-
-    /// Return the number of tokens the value encompasses
-    ///
-    /// ```
-    /// use jomini::TextTape;
-    ///
-    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
-    /// let tape = TextTape::from_slice(b"obj={1 {foo=bar} 3}")?;
-    /// let reader = tape.windows1252_reader();
-    /// let mut fields = reader.fields();
-    /// let (_, _, first_value) = fields.next().unwrap();
-    /// assert_eq!(first_value.tokens_len(), 6);
-    /// # Ok(())
-    /// # }
-    /// ```
-    #[inline]
-    pub fn tokens_len(&self) -> usize {
-        match self.tokens[self.value_ind] {
-            TextToken::Array { end, .. } | TextToken::Object { end, .. } => {
-                end - self.value_ind - 1
-            }
-            _ => 1,
        }
    }
}

-/// An iterator over the values of an array
-///
-/// ```
-/// use jomini::TextTape;
-///
-/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
-/// let tape = TextTape::from_slice(b"cores={a b}")?;
-/// let reader = tape.windows1252_reader();
-///
-/// let mut all_cores = Vec::new();
-/// for (key, _op, value) in reader.fields() {
-///     assert_eq!(key.read_str(), "cores");
-///     let cores = value.read_array()?;
-///     assert_eq!(cores.len(), 2);
-///     for value in cores.values() {
-///         all_cores.push(value.read_string()?);
-///     }
-/// }
-/// assert_eq!(all_cores, vec![String::from("a"), String::from("b")]);
-/// # Ok(())
-/// # }
-/// ```
-pub struct ValuesIter<'data, 'tokens, E> {
-    token_ind: usize,
-    end_ind: usize,
-    tokens: &'tokens [TextToken<'data>],
-    encoding: E,
-}
+/// The specific text reader error type.
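
The error kind that follows separates a recoverable shortfall in the underlying source from malformed or oversized input. Below is a small sketch of dispatching on the variants; it assumes, per the lexer behavior described earlier, that an unterminated quote surfaces as the `Eof` variant.

```rust
use jomini::text::{ReaderErrorKind, TokenReader};

fn main() {
    // An unterminated quote forces the lexer to hit the end of input mid-token.
    let mut reader = TokenReader::new(&b"name=\"unterminated"[..]);
    loop {
        match reader.next() {
            Ok(Some(_)) => continue,
            Ok(None) => break,
            Err(e) => {
                // position() reports how many bytes were processed before the error.
                match e.kind() {
                    ReaderErrorKind::Eof => eprintln!("early eof at byte {}", e.position()),
                    ReaderErrorKind::BufferFull => eprintln!("token larger than the buffer"),
                    ReaderErrorKind::Read(io) => eprintln!("underlying read failed: {io}"),
                }
                break;
            }
        }
    }
}
```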
+#[derive(Debug)]
+pub enum ReaderErrorKind {
+    /// An underlying error from a [Read]er
+    Read(std::io::Error),
 
-impl<'data, 'tokens, E> ValuesIter<'data, 'tokens, E>
-where
-    E: Encoding + Clone,
-{
-    fn new(reader: &ArrayReader<'data, 'tokens, E>) -> Self {
-        ValuesIter {
-            token_ind: reader.start_ind,
-            end_ind: reader.end_ind,
-            tokens: reader.tokens,
-            encoding: reader.encoding.clone(),
-        }
-    }
+    /// The internal buffer does not have enough room to store data for the next
+    /// token
+    BufferFull,
+
+    /// An early end of the data encountered
+    Eof,
 }
 
-impl<'data, 'tokens, E> Iterator for ValuesIter<'data, 'tokens, E>
-where
-    E: Encoding + Clone,
-{
-    type Item = ValueReader<'data, 'tokens, E>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        if self.token_ind < self.end_ind {
-            let value_ind = self.token_ind;
-            self.token_ind = next_idx_values(self.tokens, self.token_ind);
-            Some(ValueReader {
-                value_ind,
-                tokens: self.tokens,
-                encoding: self.encoding.clone(),
-            })
-        } else {
-            None
+impl From<BufferError> for ReaderErrorKind {
+    #[inline]
+    fn from(value: BufferError) -> Self {
+        match value {
+            BufferError::Io(x) => ReaderErrorKind::Read(x),
+            BufferError::BufferFull => ReaderErrorKind::BufferFull,
         }
     }
-
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        let len = values_len(self.tokens, self.token_ind, self.end_ind);
-        (len, Some(len))
-    }
 }
 
-/// A text reader for sequences of values
-#[derive(Debug, Clone)]
-pub struct ArrayReader<'data, 'tokens, E> {
-    start_ind: usize,
-    end_ind: usize,
-    tokens: &'tokens [TextToken<'data>],
-    encoding: E,
+/// A text lexing error over a `Read` implementation
+#[derive(Debug)]
+pub struct ReaderError {
+    position: usize,
+    kind: ReaderErrorKind,
 }
 
-impl<'data, 'tokens, E> ArrayReader<'data, 'tokens, E>
-where
-    E: Encoding + Clone,
-{
-    /// Iterator over values of an array
-    ///
-    /// See [ValuesIter](crate::text::ValuesIter) for a worked example
-    #[inline]
-    pub fn values(&self) -> ValuesIter<'data, 'tokens, E> {
-        ValuesIter::new(self)
-    }
-
-    /// Returns if the array is empty
-    pub fn is_empty(&self) -> bool {
-        self.len() == 0
+impl ReaderError {
+    /// Return the byte position where the error occurred
+    pub fn position(&self) -> usize {
+        self.position
     }
 
-    /// Return the number of values in the array
-    #[inline]
-    pub fn len(&self) -> usize {
-        values_len(self.tokens, self.start_ind, self.end_ind)
+    /// Return a reference to the error kind
+    pub fn kind(&self) -> &ReaderErrorKind {
+        &self.kind
     }
 
-    /// Return the number of tokens contained within the object
-    ///
-    /// ```
-    /// use jomini::TextTape;
-    ///
-    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
-    /// let tape = TextTape::from_slice(b"obj={1 {foo=bar} 3}")?;
-    /// let reader = tape.windows1252_reader();
-    /// let mut fields = reader.fields();
-    /// let (_, _, first_value) = fields.next().unwrap();
-    /// let array = first_value.read_array()?;
-    /// assert_eq!(array.tokens_len(), 6);
-    /// # Ok(())
-    /// # }
-    /// ```
-    #[inline]
-    pub fn tokens_len(&self) -> usize {
-        self.end_ind - self.start_ind
+    /// Consume self and return the error kind
+    #[must_use]
+    pub fn into_kind(self) -> ReaderErrorKind {
+        self.kind
     }
 }
 
 #[cfg(test)]
-mod tests {
+mod test {
     use super::*;
-
-    fn read_value<E>(value: ValueReader<E>)
-    where
-        E: crate::Encoding + Clone,
-    {
-        match value.token() {
-            TextToken::Object { .. } => {
-                iterate_object(value.read_object().unwrap());
-                iterate_array(value.read_array().unwrap());
-            }
-            TextToken::Array { .. 
-                iterate_object(value.read_object().unwrap());
-                iterate_array(value.read_array().unwrap());
-            }
-            TextToken::End(_) => panic!("end!?"),
-            TextToken::Operator(_) => {}
-            TextToken::MixedContainer => {}
-            TextToken::Unquoted(_)
-            | TextToken::Quoted(_)
-            | TextToken::Header(_)
-            | TextToken::Parameter(_)
-            | TextToken::UndefinedParameter(_) => {
-                let _ = value.read_str().unwrap();
-            }
-        }
-    }
-
-    fn iterate_array<E>(reader: ArrayReader<E>)
-    where
-        E: crate::Encoding + Clone,
-    {
-        for value in reader.values() {
-            read_value(value);
-        }
-    }
-
-    fn iterate_object<E>(reader: ObjectReader<E>)
-    where
-        E: crate::Encoding + Clone,
-    {
-        for (_key, group) in reader.field_groups() {
-            for (_op, value) in group.values() {
-                read_value(value);
-            }
-        }
-
-        let mut fields = reader.fields();
-        for (key, _op, value) in fields.by_ref() {
-            let _ = key.read_str();
-            read_value(value);
-        }
-    }
-
-    #[test]
-    fn simple_text_reader_text() {
-        let data = b"foo=bar";
-        let tape = TextTape::from_slice(data).unwrap();
-        let reader = tape.windows1252_reader();
-        assert_eq!(reader.fields_len(), 1);
-
-        let mut iter = reader.fields();
-        let (key, _op, value) = iter.next().unwrap();
-        assert_eq!(key.read_string(), String::from("foo"));
-        assert_eq!(value.read_string().unwrap(), String::from("bar"));
-
-        assert!(iter.next().is_none());
-    }
-
-    #[test]
-    fn simple_text_reader_obj() {
-        let data = b"foo={bar=qux}";
-        let tape = TextTape::from_slice(data).unwrap();
-        let reader = tape.windows1252_reader();
-
-        let mut iter = reader.fields();
-        let (key, _op, value) = iter.next().unwrap();
-        assert_eq!(key.read_string(), String::from("foo"));
-
-        let nested = value.read_object().unwrap();
-        let mut nested_iter = nested.fields();
-        let (key2, _op, value2) = nested_iter.next().unwrap();
-        assert_eq!(key2.read_string(), String::from("bar"));
-        assert_eq!(value2.read_string().unwrap(), String::from("qux"));
-        assert!(nested_iter.next().is_none());
-        assert!(iter.next().is_none());
-    }
-
-    #[test]
-    fn simple_text_reader_array() {
-        let data = b"foo={bar qux}";
-        let tape = TextTape::from_slice(data).unwrap();
-        let reader = tape.windows1252_reader();
-
-        let mut iter = reader.fields();
-        let (key, _op, value) = iter.next().unwrap();
-        assert_eq!(key.read_string(), String::from("foo"));
-
-        let nested = value.read_array().unwrap();
-        let mut values = nested.values();
-        assert_eq!(nested.len(), 2);
-        let value1 = values.next().unwrap().read_string().unwrap();
-        let value2 = values.next().unwrap().read_string().unwrap();
-
-        assert!(values.next().is_none());
-        assert_eq!(value1, String::from("bar"));
-        assert_eq!(value2, String::from("qux"));
-    }
-
-    #[test]
-    fn text_reader_read_fields() {
-        let data = b"name=aaa name=bbb core=123 core=456 name=ccc name=ddd";
-        let tape = TextTape::from_slice(data).unwrap();
-        let reader = tape.windows1252_reader();
-
-        let mut field_groups = reader.field_groups();
-        let (key, values) = field_groups.next().unwrap();
-        assert_eq!(key.read_string(), String::from("name"));
-
-        let values = values.values().collect::<Vec<_>>();
-        assert_eq!(values.len(), 4);
-        assert_eq!(values[0].1.read_string().unwrap(), String::from("aaa"));
-        assert_eq!(values[1].1.read_string().unwrap(), String::from("bbb"));
-        assert_eq!(values[2].1.read_string().unwrap(), String::from("ccc"));
-        assert_eq!(values[3].1.read_string().unwrap(), String::from("ddd"));
-
-        let (key, values) = field_groups.next().unwrap();
-        assert_eq!(key.read_string(), String::from("core"));
-
-        let values = values.values().collect::<Vec<_>>();
-        assert_eq!(values.len(), 2);
-        assert_eq!(values[0].1.read_string().unwrap(), String::from("123"));
-        assert_eq!(values[1].1.read_string().unwrap(), String::from("456"));
-    }
-
-    #[test]
-    fn text_reader_read_fields_nested() {
-        let data =
-            b"army={name=aaa unit={name=bbb} unit={name=ccc}} army={name=ddd unit={name=eee}}";
-        let tape = TextTape::from_slice(data).unwrap();
-        let reader = tape.windows1252_reader();
-        let mut field_groups = reader.field_groups();
-
-        let (key, army_values) = field_groups.next().unwrap();
-        assert_eq!(key.read_string(), String::from("army"));
-        assert_eq!(army_values.len(), 2);
-
-        let army_values = army_values.values().collect::<Vec<_>>();
-        let aaa = army_values[0].1.read_object().unwrap();
-        let mut aaa_groups = aaa.field_groups();
-        assert_eq!(aaa.fields_len(), 3);
-
-        let (key, values) = aaa_groups.next().unwrap();
-        assert_eq!(key.read_string(), String::from("name"));
-        assert_eq!(values.len(), 1);
-        assert_eq!(
-            values.values().nth(0).unwrap().1.read_string().unwrap(),
-            String::from("aaa")
-        );
-
-        let (key, values) = aaa_groups.next().unwrap();
-        assert_eq!(key.read_string(), String::from("unit"));
-        assert_eq!(values.len(), 2);
-
-        let bbb = values.values().nth(0).unwrap().1.read_object().unwrap();
-        let mut bbb_fields = bbb.fields();
-        let (key, _, value) = bbb_fields.next().unwrap();
-        assert_eq!(key.read_string(), String::from("name"));
-        assert_eq!(value.read_string().unwrap(), String::from("bbb"));
-
-        let ccc = values.values().nth(1).unwrap().1.read_object().unwrap();
-        let mut ccc_fields = ccc.fields();
-        let (key, _, value) = ccc_fields.next().unwrap();
-        assert_eq!(key.read_string(), String::from("name"));
-        assert_eq!(value.read_string().unwrap(), String::from("ccc"));
-
-        let ddd = army_values[1].1.read_object().unwrap();
-        assert_eq!(ddd.fields_len(), 2);
-
-        let mut ddd_groups = ddd.field_groups();
-        let (key, values) = ddd_groups.next().unwrap();
-        assert_eq!(key.read_string(), String::from("name"));
-        assert_eq!(values.len(), 1);
+    use rstest::*;
+
+    #[rstest]
+    #[case(b"\"hello world\"")]
+    #[case(b" \"hello world\"")]
+    #[case(b"  \"hello world\"")]
+    #[case(b"\t\"hello world\"")]
+    #[case(b"\t\t\"hello world\"")]
+    #[case(b"\r\n\"hello world\"")]
+    #[case(b"\r\n\r\n\"hello world\"")]
+    #[case(b"\n\"hello world\"")]
+    #[case(b"\n\n\"hello world\"")]
+    #[case(b" ; \"hello world\"")]
+    #[case(b" # good morning\n \"hello world\"")]
+    #[case(b" # good morning\r\n \"hello world\"")]
+    fn test_whitespace_quoted_scalar(#[case] input: &[u8]) {
+        let mut reader = TokenReader::new(input);
         assert_eq!(
-            values.values().nth(0).unwrap().1.read_string().unwrap(),
-            String::from("ddd")
+            reader.read().unwrap(),
+            Token::Quoted(Scalar::new(b"hello world"))
         );
-
-        let (key, values) = ddd_groups.next().unwrap();
-        assert_eq!(key.read_string(), String::from("unit"));
-        assert_eq!(values.len(), 1);
-
-        let eee = values.values().nth(0).unwrap().1.read_object().unwrap();
-        let mut eee_fields = eee.fields();
-        let (key, _, value) = eee_fields.next().unwrap();
-        assert_eq!(key.read_string(), String::from("name"));
-        assert_eq!(value.read_string().unwrap(), String::from("eee"));
-    }
-
-    #[test]
-    fn text_reader_read_fields_consume() {
-        let data = b"name=aaa name=bbb core=123 name=ccc name=ddd";
-        let tape = TextTape::from_slice(data).unwrap();
-        let reader = tape.windows1252_reader();
-        let mut count = 0;
-        for (_key, entries) in reader.field_groups() {
-            for (_i, (_op, value)) in entries.values().enumerate() {
-                count += value.read_scalar().map(|_| 
1).unwrap_or(0); - } + assert!(reader.read().is_err()); + } + + #[rstest] + #[case(b" a=b ", &[ + Token::Unquoted(Scalar::new(b"a")), + Token::Operator(Operator::Equal), + Token::Unquoted(Scalar::new(b"b")), + ])] + #[case(b" open={1 2}", &[ + Token::Unquoted(Scalar::new(b"open")), + Token::Operator(Operator::Equal), + Token::Open, + Token::Unquoted(Scalar::new(b"1")), + Token::Unquoted(Scalar::new(b"2")), + Token::Close, + ])] + #[case(b"field1=-100.535 ", &[ + Token::Unquoted(Scalar::new(b"field1")), + Token::Operator(Operator::Equal), + Token::Unquoted(Scalar::new(b"-100.535")), + ])] + #[case(b"field1=-100.535", &[ + Token::Unquoted(Scalar::new(b"field1")), + Token::Operator(Operator::Equal), + Token::Unquoted(Scalar::new(b"-100.535")), + ])] + #[case(b"dlc_enabled={\n\t\"Cop\"\n\t\"WoN\"\n\t\"RP\"\n\t\"AoW\"\n\t\"ED\"\n}", &[ + Token::Unquoted(Scalar::new(b"dlc_enabled")), + Token::Operator(Operator::Equal), + Token::Open, + Token::Quoted(Scalar::new(b"Cop")), + Token::Quoted(Scalar::new(b"WoN")), + Token::Quoted(Scalar::new(b"RP")), + Token::Quoted(Scalar::new(b"AoW")), + Token::Quoted(Scalar::new(b"ED")), + Token::Close, + ])] + #[case(br#""foo"="bar" "3"="1444.11.11""#, &[ + Token::Quoted(Scalar::new(b"foo")), + Token::Operator(Operator::Equal), + Token::Quoted(Scalar::new(b"bar")), + Token::Quoted(Scalar::new(b"3")), + Token::Operator(Operator::Equal), + Token::Quoted(Scalar::new(b"1444.11.11")), + ])] + #[case(br#""foo"="bar"3="1444.11.11""#, &[ + Token::Quoted(Scalar::new(b"foo")), + Token::Operator(Operator::Equal), + Token::Quoted(Scalar::new(b"bar")), + Token::Unquoted(Scalar::new(b"3")), + Token::Operator(Operator::Equal), + Token::Quoted(Scalar::new(b"1444.11.11")), + ])] + #[case(br#"custom_name="THE !@#$%^&*( '\"LEGION\"')""#, &[ + Token::Unquoted(Scalar::new(b"custom_name")), + Token::Operator(Operator::Equal), + Token::Quoted(Scalar::new(br#"THE !@#$%^&*( '\"LEGION\"')"#)), + ])] + // Preventative measures to ensure we don't regress on imperator color codes + #[case(b"custom_name=\"ab \x15D ( ID: 691 )\x15!\"", &[ + Token::Unquoted(Scalar::new(b"custom_name")), + Token::Operator(Operator::Equal), + Token::Quoted(Scalar::new(b"ab \x15D ( ID: 691 )\x15!")), + ])] + // test_no_equal_object_event + #[case(b"foo{bar=qux}", &[ + Token::Unquoted(Scalar::new(b"foo")), + Token::Open, + Token::Unquoted(Scalar::new(b"bar")), + Token::Operator(Operator::Equal), + Token::Unquoted(Scalar::new(b"qux")), + Token::Close, + ])] + // test_array_of_objects + #[case(b"stats={{id=0 type=general} {id=1 type=admiral}}", &[ + Token::Unquoted(Scalar::new(b"stats")), + Token::Operator(Operator::Equal), + Token::Open, + Token::Open, + Token::Unquoted(Scalar::new(b"id")), + Token::Operator(Operator::Equal), + Token::Unquoted(Scalar::new(b"0")), + Token::Unquoted(Scalar::new(b"type")), + Token::Operator(Operator::Equal), + Token::Unquoted(Scalar::new(b"general")), + Token::Close, + Token::Open, + Token::Unquoted(Scalar::new(b"id")), + Token::Operator(Operator::Equal), + Token::Unquoted(Scalar::new(b"1")), + Token::Unquoted(Scalar::new(b"type")), + Token::Operator(Operator::Equal), + Token::Unquoted(Scalar::new(b"admiral")), + Token::Close, + Token::Close, + ])] + // test_no_ws_comment + #[case(b"foo=abc#def\nbar=qux", &[ + Token::Unquoted(Scalar::new(b"foo")), + Token::Operator(Operator::Equal), + Token::Unquoted(Scalar::new(b"abc")), + Token::Unquoted(Scalar::new(b"bar")), + Token::Operator(Operator::Equal), + Token::Unquoted(Scalar::new(b"qux")), + ])] + // test_bom + #[case(b"\xef\xbb\xbf#hello", 
&[])]
+    // test_period_in_identifiers
+    #[case(b"flavor_tur.8=yes", &[
+        Token::Unquoted(Scalar::new(b"flavor_tur.8")),
+        Token::Operator(Operator::Equal),
+        Token::Unquoted(Scalar::new(b"yes")),
+    ])]
+    // test_dashed_identifiers: from stellaris saves
+    #[case(b"dashed-identifier=yes", &[
+        Token::Unquoted(Scalar::new(b"dashed-identifier")),
+        Token::Operator(Operator::Equal),
+        Token::Unquoted(Scalar::new(b"yes")),
+    ])]
+    // test_colon_values
+    #[case(b"province_id = event_target:agenda_province", &[
+        Token::Unquoted(Scalar::new(b"province_id")),
+        Token::Operator(Operator::Equal),
+        Token::Unquoted(Scalar::new(b"event_target:agenda_province")),
+    ])]
+    // test_parameter_syntax_with_values
+    // the new syntax to pass parameters to script values is explained in
+    // stellaris: common/script_values/00_script_values.txt
+    #[case(b"mult = value:job_weights_research_modifier|JOB|head_researcher|", &[
+        Token::Unquoted(Scalar::new(b"mult")),
+        Token::Operator(Operator::Equal),
+        Token::Unquoted(Scalar::new(
+            b"value:job_weights_research_modifier|JOB|head_researcher|"
+        )),
+    ])]
+    // test_variables
+    #[case(b"@planet_standard_scale = 11", &[
+        Token::Unquoted(Scalar::new(b"@planet_standard_scale")),
+        Token::Operator(Operator::Equal),
+        Token::Unquoted(Scalar::new(b"11")),
+    ])]
+    // test_variables_value
+    #[case(b"window_name = @default_window_name", &[
+        Token::Unquoted(Scalar::new(b"window_name")),
+        Token::Operator(Operator::Equal),
+        Token::Unquoted(Scalar::new(b"@default_window_name")),
+    ])]
+    // test_interpolated_variable
+    #[case(b"position = { @[1-leopard_x] @leopard_y }", &[
+        Token::Unquoted(Scalar::new(b"position")),
+        Token::Operator(Operator::Equal),
+        Token::Open,
+        Token::Unquoted(Scalar::new(b"@[1-leopard_x]")),
+        Token::Unquoted(Scalar::new(b"@leopard_y")),
+        Token::Close,
+    ])]
+    // test_unquoted_non_ascii: more vic2 shenanigans
+    #[case(b"jean_jaur\xe8s = bar ", &[
+        Token::Unquoted(Scalar::new(b"jean_jaur\xe8s")),
+        Token::Operator(Operator::Equal),
+        Token::Unquoted(Scalar::new(b"bar")),
+    ])]
+    // test_skip_semicolon
+    #[case(b"value=\"win\"; a=b", &[
+        Token::Unquoted(Scalar::new(b"value")),
+        Token::Operator(Operator::Equal),
+        Token::Quoted(Scalar::new(b"win")),
+        Token::Unquoted(Scalar::new(b"a")),
+        Token::Operator(Operator::Equal),
+        Token::Unquoted(Scalar::new(b"b")),
+    ])]
+    fn test_input(#[case] input: &[u8], #[case] expected: &[Token]) {
+        let mut reader = TokenReader::new(input);
+        for (i, e) in expected.iter().enumerate() {
+            assert_eq!(*e, reader.read().unwrap(), "failure at token idx: {}", i);
         }
-        assert_eq!(count, 5);
-    }
-
-    #[test]
-    fn text_reader_mixed_object_1() {
-        let data = b"levels={10 0=1 0=2}";
-        let tape = TextTape::from_slice(data).unwrap();
-        let reader = tape.windows1252_reader();
-
-        assert_eq!(reader.fields_len(), 1);
-        let mut iter = reader.fields();
-        let (key, _op, value) = iter.next().unwrap();
-        assert_eq!(key.read_string(), String::from("levels"));
-
-        let nested = value.read_array().unwrap();
-        assert_eq!(nested.len(), 8);
-
-        assert_eq!(
-            nested.values().nth(3).unwrap().token(),
-            &TextToken::Operator(Operator::Equal)
-        );
-        assert_eq!(
-            nested.values().nth(6).unwrap().token(),
-            &TextToken::Operator(Operator::Equal)
-        );
-
-        let values = nested
-            .values()
-            .filter(|x| x.token() != &TextToken::MixedContainer)
-            .map(|x| x.read_string().unwrap())
-            .collect::<Vec<_>>();
-
-        assert_eq!(
-            values.as_slice(),
-            &[
-                String::from("10"),
-                String::from("0"),
-                String::from("="),
-                String::from("1"),
-                String::from("0"),
-                
String::from("="), - String::from("2"), - ] - ); - } - - #[test] - fn text_reader_mixed_object_2() { - let data = br#"brittany_area = { #5 - color = { 118 99 151 } - 169 170 171 172 4384 - }"#; - - let tape = TextTape::from_slice(data).unwrap(); - let reader = tape.windows1252_reader(); - let mut iter = reader.fields(); - let (key, _op, value) = iter.next().unwrap(); - assert_eq!(key.read_str(), "brittany_area"); - - let mut keys = vec![]; - let brittany = value.read_object().unwrap(); - let mut fields = brittany.fields(); - while let Some((key, _op, _value)) = fields.next() { - keys.push(key.read_str()) - } - - assert_eq!(keys, vec![String::from("color")]); - let trailer = fields.remainder(); - assert_eq!(trailer.len(), 5); - assert_eq!(trailer.values().next().unwrap().read_str().unwrap(), "169"); - - let nested = value.read_array().unwrap(); - assert_eq!(nested.len(), 5); - - let mut values = nested.values(); - assert_eq!( - values.next().unwrap().token(), - &TextToken::Unquoted(Scalar::new(b"169")) - ); - assert_eq!( - values.next().unwrap().token(), - &TextToken::Unquoted(Scalar::new(b"170")) - ); - assert_eq!( - values.next().unwrap().token(), - &TextToken::Unquoted(Scalar::new(b"171")) - ); - assert_eq!( - values.next().unwrap().token(), - &TextToken::Unquoted(Scalar::new(b"172")) - ); - assert_eq!( - values.next().unwrap().token(), - &TextToken::Unquoted(Scalar::new(b"4384")) - ); - assert!(values.next().is_none()); - } - - #[test] - fn text_reader_mixed_object_3() { - let data = br#"brittany_area = { #5 - color = { 118 99 151 } - color = { 118 99 151 } - 169 170 171 172 4384 - }"#; - - let tape = TextTape::from_slice(data).unwrap(); - let reader = tape.windows1252_reader(); - let (_key, _op, brittany) = reader.fields().next().unwrap(); - let brittany_reader = brittany.read_object().unwrap(); - - let mut fields = brittany_reader.fields(); - let (lower_bound, upper_bound) = fields.size_hint(); - assert_eq!(lower_bound, brittany_reader.fields_len()); - assert_eq!(lower_bound, 2); - assert!(upper_bound.is_none() || upper_bound == Some(7)); - - let _ = fields.next(); - let (lower_bound, upper_bound) = fields.size_hint(); - assert_eq!(lower_bound, 1); - assert!(upper_bound.is_none() || upper_bound == Some(6)); - - let mut groups = brittany_reader.field_groups(); - let (lower_bound, upper_bound) = groups.size_hint(); - assert_eq!(lower_bound, 1); - assert!(upper_bound.is_none() || upper_bound == Some(6)); - - let _ = groups.next(); - let (lower_bound, upper_bound) = groups.size_hint(); - assert_eq!(lower_bound, 0); - assert!(upper_bound.is_none() || upper_bound == Some(5)); - } - - #[test] - fn text_reader_mixed_object_4() { - let data = br#"levels={a=b 10 c=d 20}"#; - - let tape = TextTape::from_slice(data).unwrap(); - let reader = tape.windows1252_reader(); - - assert_eq!(reader.fields_len(), 1); - let mut iter = reader.fields(); - let (key, _op, value) = iter.next().unwrap(); - assert_eq!(key.read_string(), String::from("levels")); - - let nested = value.read_array().unwrap(); - assert_eq!(nested.len(), 5); - - let mut values = nested.values(); - assert_eq!( - values.next().unwrap().token(), - &TextToken::Unquoted(Scalar::new(b"10")) - ); - assert_eq!( - values.next().unwrap().token(), - &TextToken::Unquoted(Scalar::new(b"c")) - ); - assert_eq!( - values.next().unwrap().token(), - &TextToken::Operator(Operator::Equal) - ); - assert_eq!( - values.next().unwrap().token(), - &TextToken::Unquoted(Scalar::new(b"d")) - ); - assert_eq!( - values.next().unwrap().token(), - 
&TextToken::Unquoted(Scalar::new(b"20"))
-        );
-        assert!(values.next().is_none());
-    }
-
-    #[test]
-    fn text_reader_mixed_object_5() {
-        let data = br#"brittany_area = { #5
-            color = { 118 99 151 }
-            169 170 171 172 4384
-        }"#;
-
-        let tape = TextTape::from_slice(data).unwrap();
-        let reader = tape.windows1252_reader();
-        let mut iter = reader.fields();
-        let (key, _op, value) = iter.next().unwrap();
-        assert_eq!(key.read_str(), "brittany_area");
-
-        let brittany = value.read_object().unwrap();
-        let mut field_groups = brittany.field_groups();
-        field_groups.next().unwrap();
-        assert!(field_groups.next().is_none());
-
-        let trailer = field_groups.remainder();
-
-        let mut values = trailer.values();
-        assert_eq!(
-            values.next().unwrap().token(),
-            &TextToken::Unquoted(Scalar::new(b"169"))
-        );
-        assert_eq!(
-            values.next().unwrap().token(),
-            &TextToken::Unquoted(Scalar::new(b"170"))
-        );
-        assert_eq!(
-            values.next().unwrap().token(),
-            &TextToken::Unquoted(Scalar::new(b"171"))
-        );
-        assert_eq!(
-            values.next().unwrap().token(),
-            &TextToken::Unquoted(Scalar::new(b"172"))
-        );
-        assert_eq!(
-            values.next().unwrap().token(),
-            &TextToken::Unquoted(Scalar::new(b"4384"))
-        );
-        assert!(values.next().is_none());
-    }
-
-    #[test]
-    fn text_reader_empty_container() {
-        let data = b"active_idea_groups={ }";
-        let tape = TextTape::from_slice(data).unwrap();
-        let reader = tape.windows1252_reader();
-        let mut iter = reader.fields();
-        let (key, _op, value) = iter.next().unwrap();
-        assert_eq!(key.read_str(), "active_idea_groups");
-
-        let empty_array = value.read_array().unwrap();
-        assert_eq!(0, empty_array.len());
-        assert!(empty_array.values().next().is_none());
-
-        let empty_object = value.read_object().unwrap();
-        let mut empty_object_iter = empty_object.fields();
-        assert_eq!(0, empty_object.fields_len());
-        assert!(empty_object_iter.next().is_none());
-    }
-
-    #[test]
-    fn text_reader_header() {
-        let data = b"color = rgb { 10 20 30 }";
-        let tape = TextTape::from_slice(data).unwrap();
-        let reader = tape.windows1252_reader();
-        let mut iter = reader.fields();
-        let (key, _op, value) = iter.next().unwrap();
-        assert_eq!(key.read_str(), "color");
-
-        let header_array = value.read_array().unwrap();
-        let mut values = header_array.values();
-        let rgb = values.next().unwrap();
-        assert_eq!(rgb.read_str().unwrap(), "rgb");
-
-        let vals = values.next().unwrap();
-        let s = vals.read_array().unwrap();
-        let svals = s.values();
-
-        let colors = svals
-            .map(|x| x.read_scalar().unwrap())
-            .map(|x| x.to_u64().unwrap())
-            .collect::<Vec<_>>();
-
-        assert_eq!(colors, vec![10, 20, 30]);
-    }
-
-    #[test]
-    fn reader_crash1() {
-        let data = b"a=r{}";
-        let tape = TextTape::from_slice(data).unwrap();
-        iterate_object(tape.windows1252_reader());
-    }
-
-    #[test]
-    fn text_reader_object_fields() {
-        let data = b"a{b=}";
-        if let Ok(tape) = TextTape::from_slice(data) {
-            let reader = tape.windows1252_reader();
-            iterate_object(reader);
-        }
-    }
-
-    #[test]
-    fn text_reader_object_fields_op2() {
-        let data = b"a{}b>{}";
-        if let Ok(tape) = TextTape::from_slice(data) {
-            let reader = tape.windows1252_reader();
-            iterate_object(reader);
-        }
-    }
-
-    #[test]
-    fn text_reader_object_fields_dupe() {
-        let data = b"a{b=c d=E d}";
-        if let Ok(tape) = TextTape::from_slice(data) {
-            let reader = tape.windows1252_reader();
-            iterate_object(reader);
-        }
-    }
-
-    #[test]
-    fn text_reader_object_fields_header() {
-        let data = b"a{}b>r{}";
-        if let Ok(tape) = TextTape::from_slice(data) {
-            let reader = tape.windows1252_reader();
-            
iterate_object(reader); - } - } - - #[test] - fn text_reader_object_fields_dupe2() { - let data = b"a{b=c d b}"; - if let Ok(tape) = TextTape::from_slice(data) { - let reader = tape.windows1252_reader(); - iterate_object(reader); - } - } + reader.read().unwrap_err(); + } + + #[rstest] + #[case(b" hello= butIsaytoYou", &[ + Token::Unquoted(Scalar::new(b"hello")), + Token::Operator(Operator::Equal), + Token::Unquoted(Scalar::new(b"butIsaytoYou")), + ])] + #[case(b" \"lovely\"= \"who is it\"", &[ + Token::Quoted(Scalar::new(b"lovely")), + Token::Operator(Operator::Equal), + Token::Quoted(Scalar::new(b"who is it")), + ])] + #[case(br#" "name"= "\"jolly\" john""#, &[ + Token::Quoted(Scalar::new(b"name")), + Token::Operator(Operator::Equal), + Token::Quoted(Scalar::new(br#"\"jolly\" john"#)), + ])] + fn test_refill(#[case] input: &[u8], #[case] expected: &[Token]) { + let min_buffer_size = expected + .iter() + .filter_map(|x| match x { + Token::Unquoted(s) => Some(s.as_bytes().len()), + Token::Quoted(s) => Some(s.as_bytes().len()), + _ => None, + }) + .max() + .unwrap() + + 1; + + for i in min_buffer_size..min_buffer_size + 10 { + let mut reader = TokenReader::builder().buffer_len(i).build(input); + for e in expected.iter() { + assert_eq!(*e, reader.read().unwrap()); + } - #[test] - fn text_reader_regression() { - let data = b"a={b{}=2}"; - if let Ok(tape) = TextTape::from_slice(data) { - let reader = tape.windows1252_reader(); - iterate_object(reader); + assert!(reader.read().is_err()); } } - #[test] - fn text_reader_regression2() { - let data = b"r={c=d=@{y=u}"; - if let Ok(tape) = TextTape::from_slice(data) { - let reader = tape.windows1252_reader(); - iterate_object(reader); + #[rstest] + #[case(b"a=b c=d } done")] + #[case(br#"a=alongervalue c=d } done"#)] + #[case(br#"a="a long quoted value" c=d } done"#)] + #[case(br#"a="a long \"quoted value\" with escapes" c=d } done"#)] + #[case(br#"a={"an object" { "nested array" }} c=d } done"#)] + fn test_skip_container(#[case] input: &[u8]) { + for i in 8..16 { + let mut reader = TokenReader::builder().buffer_len(i).build(input); + reader.skip_container().unwrap(); + + assert_eq!( + reader.read().unwrap(), + Token::Unquoted(Scalar::new(b"done")) + ); } } - #[test] - fn text_reader_regression3() { - let data = b"a={{t c=d = b}}"; - if let Ok(tape) = TextTape::from_slice(data) { - let reader = tape.windows1252_reader(); - iterate_object(reader); - } + #[rstest] + #[case(b"\"\\")] + fn test_crash_regression(#[case] input: &[u8]) { + let mut reader = TokenReader::new(input); + while let Ok(Some(_)) = reader.next() {} } - - // #[test] - // fn text_reader_regression4() { - // let data = include_bytes!("/home/nick/projects/jomini/fuzz/artifacts/fuzz_text/crash-a14643c9a89c0f4ab665815c99a07b15de3544a5"); - // // let data = b"a={{ b c == == = d e=f}}"; - // if let Ok(tape) = TextTape::from_slice(data) { - // let reader = tape.windows1252_reader(); - // iterate_object(reader); - // } - // } } diff --git a/src/util.rs b/src/util.rs index b112b30..f7f1d94 100644 --- a/src/util.rs +++ b/src/util.rs @@ -45,11 +45,50 @@ pub(crate) fn contains_zero_byte(x: u64) -> bool { x.wrapping_sub(LO_U64) & !x & HI_U64 != 0 } +/// https://github.com/llogiq/bytecount/blob/934ea0ef4338f00c797500b10c39f03b3cfc1692/src/integer_simd.rs#L21-L27 +#[inline] +const fn bytewise_equal(lhs: u64, rhs: u64) -> u64 { + let lo = u64::MAX / 0xFF; + let hi = lo << 7; + + let x = lhs ^ rhs; + !((((x & !hi) + !hi) | x) >> 7) & lo +} + +#[inline] +const fn sum_usize(values: u64) -> u64 { + let 
every_other_byte_lo = u64::MAX / 0xFFFF;
+    let every_other_byte = every_other_byte_lo * 0xFF;
+
+    // Pairwise reduction to avoid overflow on next step.
+    let pair_sum: u64 = (values & every_other_byte) + ((values >> 8) & every_other_byte);
+
+    // Multiplication results in top two bytes holding sum.
+    pair_sum.wrapping_mul(every_other_byte_lo) >> ((core::mem::size_of::<u64>() - 2) * 8)
+}
+
+#[inline]
+pub(crate) const fn count_chunk(value: u64, byte: u8) -> u64 {
+    sum_usize(bytewise_equal(value, repeat_byte(byte)))
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
     use rstest::*;
 
+    #[rstest]
+    #[case(*b"        ", 0)]
+    #[case(*b"   {    ", 1)]
+    #[case(*b" {  {   ", 2)]
+    #[case(*b"{ { {   ", 3)]
+    #[case(*b"{{{{{{{{", 8)]
+    fn test_count_chunk(#[case] input: [u8; 8], #[case] expected: u64) {
+        let lhs = u64::from_le_bytes(input);
+        let rhs = repeat_byte(b'{');
+        assert_eq!(sum_usize(bytewise_equal(lhs, rhs)), expected);
+    }
+
     #[rstest]
     #[case(*b"14441111", Some(14441111))]
     #[case(*b"14440101", Some(14440101))]