From 59f3159e623f69564758a3a3cbc037479403f9f3 Mon Sep 17 00:00:00 2001
From: Nick Babcock
Date: Wed, 20 Dec 2023 21:22:22 -0600
Subject: [PATCH] Introduce incremental read and deserialize for text & binary (#139)

Previously, jomini APIs required all data to be in memory. This is less than
ideal when working with large save files (100 MB+). This PR introduces a host
of APIs that work off `Read` implementations. For example, here is how we
could find the binary max nesting depth from stdin:

```rust
use std::{error, io};

fn main() -> Result<(), Box<dyn error::Error>> {
    let stdin = io::stdin().lock();
    let mut reader = jomini::binary::TokenReader::new(stdin);
    let mut current_depth = 0;
    let mut max_depth = 0;
    while let Some(token) = reader.next()? {
        match token {
            jomini::binary::Token::Open => {
                current_depth += 1;
                max_depth = max_depth.max(current_depth);
            },
            jomini::binary::Token::Close => current_depth -= 1,
            _ => {}
        }
    }
    println!("{}", max_depth);
    Ok(())
}
```

APIs for deserialization are similar and are denoted with `from_reader`.
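For instance, a minimal sketch of deserializing binary data from a reader,
reusing the `MyStruct`, `BinaryTestFlavor`, and token `map` definitions from
the README example later in this diff (the file name is hypothetical):

```rust
let file = std::fs::File::open("gamestate.bin")?;
let actual: MyStruct = BinaryTestFlavor
    .deserializer()
    .deserialize_reader(file, &map)?;
```

Any `Read` implementation works here, so the same call accepts a
decompression stream or a byte slice.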
Results when deserializing EU4 saves on x86:

- Text saves: 33% decrease in latency, 80% reduction in peak memory usage
- Binary saves: 10% increase in latency, 50% reduction in peak memory usage

The reason why binary saves saw a smaller impact is that the binary
deserializer was already smart enough to incrementally parse the buffered
data instead of needing to parse it to a tape first like the text
deserializer. The reduction in memory usage is expected to be even more
pronounced (90%+) for other games with smaller models.

It is a shame that using the binary incremental deserialization API came with
a small performance cost for EU4 saves. I believe part of this is due to how
DEFLATE works. [I previously wrote an
article](https://nickb.dev/blog/deflate-yourself-for-faster-rust-zips/) on
how inflating from a byte slice was 20% faster than from a stream. The
inflate stream implementation has a lot more `memcpy` due to it needing
["access to 32KiB of the previously decompressed
data"](https://docs.rs/miniz_oxide/0.7.1/miniz_oxide/inflate/core/fn.decompress.html),
which it has to juggle. In the end, a 50% reduction in peak memory seemed
worth it.

In the docs, incremental text APIs are marked as experimental, as they use a
different parsing algorithm that is geared more towards save files. I have
not yet fleshed out ergonomic equivalents for more esoteric game syntax (like
parameter definitions). Game files can still be parsed with the experimental
APIs, but these APIs may change in the future based on feedback.

As part of this PR, the incrementally deserializing binary implementation has
been promoted to handle all buffered data, and no longer will a `from_slice`
function parse the data to a tape in an intermediate step.

The new incremental APIs are not totally symmetrical. The binary format has a
`Lexer` that is a zero cost scanner over a slice of data. There is no such
equivalent for the text data, as I don't think it is conducive to being used
to construct the higher level reading abstractions with good performance. It
is no problem for the binary implementations to start reading a string again
if it crosses a read boundary, as string operations are independent of the
string's length. Contrast that with text, where if one starts over, each byte
of the string would be looked at again (and the same goes for any
whitespace).

This can be worked around by owning data or communicating state, but this
complication doesn't seem worth it over bundling everything inside a
`TokenReader` that keeps state local and returns borrowed data (a sketch of
the text `TokenReader` follows the list below).

There are some aspects of the resulting API I don't love:

- Now there are two "token" types for each format (eg: `TextToken` and
  `text::Token`). I don't like how they have the same semantic name. In
  hindsight, I wish I had named the elements of a tape something like
  `TapeNode` (simdjson calls them `tape_ref`). This is something I may
  consider if 1.0 is ever reached.
- I'm deliberating on whether `TokenReader` is the best name or if
  `BinaryReader` and `TextReader` are more appropriate.
- I don't like the thought of maintaining another deserializer
  implementation, but I don't see a way around that.
- There are a few places in the code where I felt the need to circumvent the
  borrow checker, as readers are essentially mutable lending iterators, which
  makes it difficult to use a token with any other API. I would love to solve
  this.

```rust
// This makes me sad
let de = unsafe { &mut *(self.de as *mut _) };
```
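And here is the text `TokenReader` sketch promised above: counting top-level
containers from stdin while skipping over their contents. This assumes the
text reader wraps any `Read` implementation the same way the binary reader
does; `next` and `skip_container` mirror their use in the fuzz targets in
this diff.

```rust
use std::{error, io};

fn main() -> Result<(), Box<dyn error::Error>> {
    let stdin = io::stdin().lock();
    let mut reader = jomini::text::TokenReader::new(stdin);
    let mut containers = 0;
    while let Some(token) = reader.next()? {
        if matches!(token, jomini::text::Token::Open) {
            containers += 1;
            // Skip the entire container body (through its matching close)
            // in one call instead of walking every nested token.
            reader.skip_container()?;
        }
    }
    println!("{} top-level containers", containers);
    Ok(())
}
```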
---
 README.md                        |   93 +-
 benches/jomini_bench.rs          |   23 +-
 fuzz/fuzz_targets/fuzz_binary.rs |   11 +
 fuzz/fuzz_targets/fuzz_text.rs   |   11 +
 src/binary/de.rs                 | 1031 ++++++++-----
 src/binary/flavor.rs             |   37 +
 src/binary/lexer.rs              |  778 ++++++++++
 src/binary/mod.rs                |   73 +-
 src/binary/reader.rs             |  430 ++++++
 src/binary/tape.rs               |  231 ++-
 src/binary/tokens.rs             |   13 -
 src/buffer.rs                    |  167 +++
 src/errors.rs                    |   60 +-
 src/lib.rs                       |  166 +--
 src/text/de.rs                   |  792 +++++++++-
 src/text/dom.rs                  | 1502 +++++++++++++++++++
 src/text/mod.rs                  |   28 +-
 src/text/reader.rs               | 2323 ++++++++++++------------
 src/util.rs                      |   39 +
 19 files changed, 5636 insertions(+), 2172 deletions(-)
 create mode 100644 src/binary/lexer.rs
 create mode 100644 src/binary/reader.rs
 delete mode 100644 src/binary/tokens.rs
 create mode 100644 src/buffer.rs
 create mode 100644 src/text/dom.rs

diff --git a/README.md b/README.md
index c6b1652..1f699fe 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,9 @@ Converters](https://github.com/ParadoxGameConverters) and
 
 ## Quick Start
 
-Below is a demonstration on parsing plaintext data using jomini tools.
+Below is a demonstration of deserializing plaintext data using serde.
+Several additional serde-like attributes are used to reconcile the serde
+data model with the structure of these files.
 
 ```rust
 use jomini::{
@@ -71,9 +73,9 @@ let actual: Model = jomini::text::de::from_windows1252_slice(data)?;
 assert_eq!(actual, expected);
 ```
 
-## Binary Parsing
+## Binary Deserialization
 
-Parsing data encoded in the binary format is done in a similar fashion but with a couple extra steps for the caller to supply:
+Deserializing data encoded in the binary format is done in a similar fashion but with a couple extra steps for the caller to supply:
 
 - How text should be decoded (typically Windows-1252 or UTF-8)
 - How rational (floating point) numbers are decoded
@@ -84,7 +86,7 @@ Implementors be warned, not only does each Paradox game have a different binary
 
 Below is an example that defines a sample binary format and uses a hashmap token lookup.
 
 ```rust
-use jomini::{BinaryDeserializer, Encoding, JominiDeserialize, Windows1252Encoding};
+use jomini::{Encoding, JominiDeserialize, Windows1252Encoding, binary::BinaryFlavor};
 use std::{borrow::Cow, collections::HashMap};
 
 #[derive(JominiDeserialize, PartialEq, Debug)]
@@ -116,8 +118,7 @@ let data = [
    0x82, 0x2d, 0x01, 0x00, 0x0f, 0x00, 0x03, 0x00, 0x45, 0x4e, 0x47 ];
 let mut map = HashMap::new();
 map.insert(0x2d82, "field1");
-let actual: MyStruct = BinaryDeserializer::builder_flavor(BinaryTestFlavor)
-    .deserialize_slice(&data[..], &map)?;
+let actual: MyStruct = BinaryTestFlavor.deserialize_slice(&data[..], &map)?;
 assert_eq!(actual, MyStruct { field1: "ENG".to_string() });
 ```
 
@@ -126,59 +127,14 @@ without any duplication.
 One can configure the behavior when a token is unknown (ie: fail
 immediately or try to continue).
 
-### Ondemand Deserialization
-
-The ondemand deserializer is a one-shot deserialization mode is often faster
-and more memory efficient as it does not parse the input into an intermediate
-tape, and instead deserializes right from the input.
-
-It is instantiated and used similarly to `BinaryDeserializer`
-
-```rust
-use jomini::OndemandBinaryDeserializer;
-// [...snip code from previous example...]
-
-let actual: MyStruct = OndemandBinaryDeserializer::builder_flavor(BinaryTestFlavor)
-    .deserialize_slice(&data[..], &map)?;
-assert_eq!(actual, MyStruct { field1: "ENG".to_string() });
-```
-
-### Direct identifier deserialization with `token` attribute
-
-There may be some performance loss during binary deserialization as
-tokens are resolved to strings via a `TokenResolver` and then matched against the
-string representations of a struct's fields.
-
-We can fix this issue by directly encoding the expected token value into the struct:
-
-```rust
-#[derive(JominiDeserialize, PartialEq, Debug)]
-struct MyStruct {
-    #[jomini(token = 0x2d82)]
-    field1: String,
-}
-
-// Empty token to string resolver
-let map = HashMap::<u16, String>::new();
-
-let actual: MyStruct = BinaryDeserializer::builder_flavor(BinaryTestFlavor)
-    .deserialize_slice(&data[..], &map)?;
-assert_eq!(actual, MyStruct { field1: "ENG".to_string() });
-```
-
-Couple notes:
-
-- This does not obviate need for the token to string resolver as tokens may be used as values.
-- If the `token` attribute is specified on one field on a struct, it must be specified on all fields of that struct.
-
 ## Caveats
 
-Caller is responsible for:
+Before calling any Jomini API, callers are expected to:
 
-- Determining the correct format (text or binary) ahead of time
-- Stripping off any header that may be present (eg: `EU4txt` / `EU4bin`)
-- Providing the token resolver for the binary format
-- Providing the conversion to reconcile how, for example, a date may be encoded as an integer in
+- Determine the correct format (text or binary) ahead of time.
+- Strip off any header that may be present (eg: `EU4txt` / `EU4bin`)
+- Provide the token resolver for the binary format
+- Provide the conversion to reconcile how, for example, a date may be encoded as an integer in
   the binary format, but as a string when in plaintext.
 
 ## The Mid-level API
 
@@ -199,6 +155,9 @@ for (key, _op, value) in reader.fields() {
 }
 
+For an even lower level of parsing, see the respective binary and text
+documentation.
+
 The mid-level API also provides the excellent utility of converting the
 plaintext Clausewitz format to JSON when the `json` feature is enabled.
@@ -211,28 +170,6 @@ let actual = reader.json().to_string()?; assert_eq!(actual, r#"{"foo":"bar"}"#); ``` -## One Level Lower - -At the lowest layer, one can interact with the raw data directly via `TextTape` -and `BinaryTape`. - -```rust -use jomini::{TextTape, TextToken, Scalar}; - -let data = b"foo=bar"; - -assert_eq!( - TextTape::from_slice(&data[..])?.tokens(), - &[ - TextToken::Unquoted(Scalar::new(b"foo")), - TextToken::Unquoted(Scalar::new(b"bar")), - ] -); -``` - -If one will only use `TextTape` and `BinaryTape` then `jomini` can be compiled without default -features, resulting in a build without dependencies. - ## Write API There are two targeted use cases for the write API. One is when a text tape is on hand. diff --git a/benches/jomini_bench.rs b/benches/jomini_bench.rs index acad516..1f72aa3 100644 --- a/benches/jomini_bench.rs +++ b/benches/jomini_bench.rs @@ -3,11 +3,9 @@ use criterion::{ }; use flate2::read::GzDecoder; use jomini::{ - binary::{ - de::OndemandBinaryDeserializerBuilder, BinaryFlavor, BinaryTapeParser, TokenResolver, - }, + binary::{BinaryFlavor, BinaryTapeParser, TokenResolver}, common::Date, - BinaryDeserializer, BinaryTape, Encoding, Scalar, TextTape, Utf8Encoding, Windows1252Encoding, + BinaryTape, Encoding, Scalar, TextTape, Utf8Encoding, Windows1252Encoding, }; use std::{borrow::Cow, io::Read}; @@ -125,15 +123,26 @@ pub fn binary_deserialize_benchmark(c: &mut Criterion) { group.throughput(Throughput::Bytes(data.len() as u64)); group.bench_function("ondemand", |b| { b.iter(|| { - let _res: Gamestate = OndemandBinaryDeserializerBuilder::with_flavor(BinaryTestFlavor) + let _res: Gamestate = BinaryTestFlavor + .deserializer() .deserialize_slice(&data[..], &MyBinaryResolver) .unwrap(); }) }); + group.bench_function("ondemand-reader", |b| { + b.iter(|| { + let _res: Gamestate = BinaryTestFlavor + .deserializer() + .deserialize_reader(&data[..], &MyBinaryResolver) + .unwrap(); + }) + }); group.bench_function("tape", |b| { b.iter(|| { - let _res: Gamestate = BinaryDeserializer::builder_flavor(BinaryTestFlavor) - .deserialize_slice(&data[..], &MyBinaryResolver) + let tape = BinaryTape::from_slice(&data[..]).unwrap(); + let _res: Gamestate = BinaryTestFlavor + .deserializer() + .deserialize_tape(&tape, &MyBinaryResolver) .unwrap(); }) }); diff --git a/fuzz/fuzz_targets/fuzz_binary.rs b/fuzz/fuzz_targets/fuzz_binary.rs index a07cd61..ca8b9f4 100644 --- a/fuzz/fuzz_targets/fuzz_binary.rs +++ b/fuzz/fuzz_targets/fuzz_binary.rs @@ -62,6 +62,17 @@ fuzz_target!(|data: &[u8]| { hash.insert(0x354eu16, "selector"); hash.insert(0x209u16, "localization"); + let mut lexer = jomini::binary::Lexer::new(data); + let mut reader = jomini::binary::TokenReader::builder().buffer_len(100).build(data); + + loop { + match (lexer.read_token(), reader.read()) { + (Ok(t1), Ok(t2)) => assert_eq!(t1, t2), + (Err(e1), Err(e2)) => { break; } + (x, y) => panic!("{:?} {:?}", x, y), + } + } + let mut utape = jomini::BinaryTape::default(); let ures = jomini::binary::BinaryTapeParser.parse_slice_into_tape_unoptimized(&data, &mut utape); diff --git a/fuzz/fuzz_targets/fuzz_text.rs b/fuzz/fuzz_targets/fuzz_text.rs index 595418a..790ba50 100644 --- a/fuzz/fuzz_targets/fuzz_text.rs +++ b/fuzz/fuzz_targets/fuzz_text.rs @@ -98,6 +98,17 @@ where } fuzz_target!(|data: &[u8]| { + let mut reader = jomini::text::TokenReader::new(data); + let mut i = 0; + while let Ok(Some(x)) = reader.next() { + if matches!(x, jomini::text::Token::Open) { + i += 1; + if i % 2 == 1 { + let _ = reader.skip_container(); + } + } 
+ } + let _: Result = jomini::TextTape::from_slice(&data).and_then(|tape| { let tokens = tape.tokens(); for (i, token) in tokens.iter().enumerate() { diff --git a/src/binary/de.rs b/src/binary/de.rs index 315e1f6..5de348a 100644 --- a/src/binary/de.rs +++ b/src/binary/de.rs @@ -1,290 +1,586 @@ -use super::{tokens::*, Rgb}; +use super::{ + lexer::{LexemeId, Lexer}, + LexError, Token, TokenReader, TokenReaderBuilder, +}; use crate::{ binary::{BinaryFlavor, FailedResolveStrategy, TokenResolver}, de::ColorSequence, - util::get_split, - BinaryTape, BinaryToken, DeserializeError, DeserializeErrorKind, Error, ErrorKind, + BinaryTape, BinaryToken, DeserializeError, DeserializeErrorKind, Error, +}; +use serde::de::{ + self, Deserialize, DeserializeOwned, DeserializeSeed, MapAccess, SeqAccess, Visitor, }; -use serde::de::{self, Deserialize, DeserializeSeed, MapAccess, SeqAccess, Visitor}; -use std::borrow::Cow; +use std::{borrow::Cow, io::Read}; -#[derive(Debug)] -struct OndemandParser<'data> { - data: &'data [u8], - original_length: usize, +/// Serde deserializer over a streaming binary reader +pub struct BinaryReaderDeserializer<'res, RES, F, R> { + reader: TokenReader, + config: BinaryConfig<'res, RES, F>, +} + +impl<'res, RES: TokenResolver, E: BinaryFlavor, R: Read> BinaryReaderDeserializer<'res, RES, E, R> { + /// Deserialize into provided type + pub fn deserialize(&mut self) -> Result + where + T: DeserializeOwned, + { + T::deserialize(self) + } +} + +impl<'a, 'de, 'res: 'de, RES: TokenResolver, F: BinaryFlavor, R: Read> de::Deserializer<'de> + for &'a mut BinaryReaderDeserializer<'res, RES, F, R> +{ + type Error = Error; + + fn deserialize_any(self, _visitor: V) -> Result + where + V: Visitor<'de>, + { + Err(Error::from(DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from( + "root deserializer can only work with key value pairs", + )), + })) + } + + fn deserialize_map(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + visitor.visit_map(BinaryReaderMap::new(self, true)) + } + + fn deserialize_struct( + self, + _name: &'static str, + _fields: &'static [&'static str], + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + self.deserialize_map(visitor) + } + + serde::forward_to_deserialize_any! 
{ + bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string + bytes byte_buf option unit unit_struct newtype_struct seq tuple + tuple_struct enum ignored_any identifier + } } -impl<'data> OndemandParser<'data> { +struct BinaryReaderMap<'a: 'a, 'res, RES: 'a, F, R> { + de: &'a mut BinaryReaderDeserializer<'res, RES, F, R>, + root: bool, +} + +impl<'a, 'res, RES: 'a, F, R> BinaryReaderMap<'a, 'res, RES, F, R> { + fn new(de: &'a mut BinaryReaderDeserializer<'res, RES, F, R>, root: bool) -> Self { + BinaryReaderMap { de, root } + } +} + +impl<'de, 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor, R: Read> MapAccess<'de> + for BinaryReaderMap<'a, 'res, RES, F, R> +{ + type Error = Error; + #[inline] - pub fn peek(&mut self) -> Option { - self.data - .get(..2) - .map(|head| u16::from_le_bytes([head[0], head[1]])) + fn next_key_seed(&mut self, seed: K) -> Result, Self::Error> + where + K: DeserializeSeed<'de>, + { + let de = unsafe { &mut *(self.de as *mut _) }; + loop { + match self.de.reader.next() { + Ok(Some(Token::Close)) => return Ok(None), + Ok(Some(Token::Open)) => { + let _ = self.de.reader.read(); + } + Ok(Some(token)) => { + return seed + .deserialize(BinaryReaderTokenDeserializer { de, token }) + .map(Some) + } + Ok(None) if self.root => return Ok(None), + Ok(None) => return Err(LexError::Eof.at(self.de.reader.position()).into()), + Err(e) => return Err(e.into()), + } + } } #[inline] - pub fn next(&mut self) -> Option { - let (data, token) = - get_split::<2>(self.data).map(|(head, rest)| (rest, u16::from_le_bytes(head)))?; - self.data = data; - Some(token) + fn next_value_seed(&mut self, seed: V) -> Result + where + V: DeserializeSeed<'de>, + { + let de = unsafe { &mut *(self.de as *mut _) }; + let mut token = self.de.reader.read()?; + if matches!(token, Token::Equal) { + token = self.de.reader.read()?; + } + + seed.deserialize(BinaryReaderTokenDeserializer { de, token }) } +} + +struct BinaryReaderTokenDeserializer<'a, 'res, RES: 'a, F, R> { + de: &'a mut BinaryReaderDeserializer<'res, RES, F, R>, + token: Token<'a>, +} +impl<'a, 'res, RES: TokenResolver, F: BinaryFlavor, R> + BinaryReaderTokenDeserializer<'a, 'res, RES, F, R> +where + F: BinaryFlavor, + R: Read, +{ #[inline] - pub fn read(&mut self) -> Result { - self.next().ok_or_else(Error::eof) + fn deser<'de, V>(self, visitor: V) -> Result + where + V: de::Visitor<'de>, + 'res: 'de, + { + match self.token { + Token::U32(x) => visitor.visit_u32(x), + Token::U64(x) => visitor.visit_u64(x), + Token::I32(x) => visitor.visit_i32(x), + Token::Bool(x) => visitor.visit_bool(x), + Token::Quoted(x) | Token::Unquoted(x) => { + match self.de.config.flavor.decode(x.as_bytes()) { + Cow::Borrowed(x) => visitor.visit_str(x), + Cow::Owned(x) => visitor.visit_string(x), + } + } + Token::F32(x) => visitor.visit_f32(self.de.config.flavor.visit_f32(x)), + Token::F64(x) => visitor.visit_f64(self.de.config.flavor.visit_f64(x)), + Token::Rgb(x) => visitor.visit_seq(ColorSequence::new(x)), + Token::I64(x) => visitor.visit_i64(x), + Token::Id(s) => match self.de.config.resolver.resolve(s) { + Some(id) => visitor.visit_borrowed_str(id), + None => match self.de.config.failed_resolve_strategy { + FailedResolveStrategy::Error => Err(Error::from(DeserializeError { + kind: DeserializeErrorKind::UnknownToken { token_id: s }, + })), + FailedResolveStrategy::Stringify => visitor.visit_string(format!("0x{:x}", s)), + FailedResolveStrategy::Ignore => { + visitor.visit_borrowed_str("__internal_identifier_ignore") + } + }, + }, + Token::Close => 
Err(Error::invalid_syntax( + "did not expect end", + self.de.reader.position(), + )), + Token::Equal => Err(Error::invalid_syntax( + "did not expect equal", + self.de.reader.position(), + )), + Token::Open => visitor.visit_seq(BinaryReaderSeq::new(self.de)), + } } +} + +macro_rules! deserialize_scalar { + ($method:ident) => { + #[inline] + fn $method(self, visitor: V) -> Result + where + V: de::Visitor<'de>, + { + self.deser(visitor) + } + }; +} + +impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor, R: Read> de::Deserializer<'de> + for BinaryReaderTokenDeserializer<'a, 'res, RES, F, R> +{ + type Error = Error; + + deserialize_scalar!(deserialize_any); + deserialize_scalar!(deserialize_i8); + deserialize_scalar!(deserialize_i16); + deserialize_scalar!(deserialize_u8); + deserialize_scalar!(deserialize_char); + deserialize_scalar!(deserialize_identifier); + deserialize_scalar!(deserialize_bytes); + deserialize_scalar!(deserialize_byte_buf); #[inline] - pub fn read_string(&mut self) -> Result<&'data [u8], Error> { - let (head, rest) = get_split::<2>(self.data).ok_or_else(Error::eof)?; - let text_len = usize::from(u16::from_le_bytes(head)); - if text_len <= rest.len() { - let (text, rest) = rest.split_at(text_len); - self.data = rest; - Ok(text) + fn deserialize_bool(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if let Token::Bool(x) = &self.token { + visitor.visit_bool(*x) } else { - Err(Error::eof()) + self.deser(visitor) } } #[inline] - pub fn read_bool(&mut self) -> Result { - let (&first, rest) = self.data.split_first().ok_or_else(Error::eof)?; - self.data = rest; - Ok(first != 0) + fn deserialize_u16(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if let Token::Id(x) = &self.token { + visitor.visit_u16(*x) + } else { + self.deser(visitor) + } + } + + #[inline] + fn deserialize_i32(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if let Token::I32(x) = &self.token { + visitor.visit_i32(*x) + } else { + self.deser(visitor) + } + } + + #[inline] + fn deserialize_u32(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if let Token::U32(x) = &self.token { + visitor.visit_u32(*x) + } else { + self.deser(visitor) + } + } + + #[inline] + fn deserialize_u64(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if let Token::U64(x) = &self.token { + visitor.visit_u64(*x) + } else { + self.deser(visitor) + } + } + + #[inline] + fn deserialize_i64(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if let Token::I64(x) = &self.token { + visitor.visit_i64(*x) + } else { + self.deser(visitor) + } } #[inline] - fn read_u32(&mut self) -> Result { - let (head, rest) = get_split::<4>(self.data).ok_or_else(Error::eof)?; - self.data = rest; - Ok(u32::from_le_bytes(head)) + fn deserialize_f32(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if let Token::F32(x) = &self.token { + visitor.visit_f32(self.de.config.flavor.visit_f32(*x)) + } else { + self.deser(visitor) + } } #[inline] - fn read_u64(&mut self) -> Result { - let (head, rest) = get_split::<8>(self.data).ok_or_else(Error::eof)?; - self.data = rest; - Ok(u64::from_le_bytes(head)) + fn deserialize_f64(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if let Token::F64(x) = &self.token { + visitor.visit_f64(self.de.config.flavor.visit_f64(*x)) + } else { + self.deser(visitor) + } } #[inline] - fn read_i64(&mut self) -> Result { - let (head, rest) = get_split::<8>(self.data).ok_or_else(Error::eof)?; - self.data = rest; - Ok(i64::from_le_bytes(head)) + 
fn deserialize_str(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_string(visitor) } #[inline] - fn read_i32(&mut self) -> Result { - let (head, rest) = get_split::<4>(self.data).ok_or_else(Error::eof)?; - self.data = rest; - Ok(i32::from_le_bytes(head)) + fn deserialize_string(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + match self.token { + Token::Quoted(x) | Token::Unquoted(x) => { + match self.de.config.flavor.decode(x.as_bytes()) { + Cow::Borrowed(x) => visitor.visit_str(x), + Cow::Owned(x) => visitor.visit_string(x), + } + } + _ => self.deser(visitor), + } + } + + #[inline] + fn deserialize_option(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + visitor.visit_some(self) + } + + #[inline] + fn deserialize_unit(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_ignored_any(visitor) + } + + #[inline] + fn deserialize_unit_struct( + self, + _name: &'static str, + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + self.deserialize_ignored_any(visitor) + } + + #[inline] + fn deserialize_newtype_struct( + self, + _name: &'static str, + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + visitor.visit_newtype_struct(self) + } + + #[inline] + fn deserialize_seq(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + match self.token { + Token::Open => { + let mut seq = BinaryReaderSeq::new(self.de); + let result = visitor.visit_seq(&mut seq)?; + if !seq.hit_end { + // For when we are deserializing an array that doesn't read + // the closing token + if !matches!(self.de.reader.read()?, Token::Close) { + return Err(Error::invalid_syntax( + "Expected sequence to be terminated with an end token", + self.de.reader.position(), + )); + } + } + Ok(result) + } + Token::Rgb(x) => visitor.visit_seq(ColorSequence::new(x)), + _ => self.deser(visitor), + } + } + + #[inline] + fn deserialize_tuple(self, _len: usize, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_seq(visitor) + } + + #[inline] + fn deserialize_tuple_struct( + self, + _name: &'static str, + _len: usize, + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + self.deserialize_seq(visitor) } #[inline] - fn read_f32(&mut self) -> Result<[u8; 4], Error> { - let (head, rest) = get_split::<4>(self.data).ok_or_else(Error::eof)?; - self.data = rest; - Ok(head) + fn deserialize_map(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if matches!(self.token, Token::Open) { + visitor.visit_map(BinaryReaderMap::new(self.de, false)) + } else { + self.deser(visitor) + } } #[inline] - fn read_f64(&mut self) -> Result<[u8; 8], Error> { - let (head, rest) = get_split::<8>(self.data).ok_or_else(Error::eof)?; - self.data = rest; - Ok(head) + fn deserialize_struct( + self, + _name: &'static str, + _fields: &'static [&'static str], + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + self.deserialize_map(visitor) } #[inline] - fn skip_value(&mut self, init: u16) -> Result<(), Error> { - match init { - QUOTED_STRING | UNQUOTED_STRING => { - self.read_string()?; - Ok(()) - } - U32 => { - self.read_u32()?; - Ok(()) - } - I32 => { - self.read_i32()?; - Ok(()) - } - U64 => { - self.read_u64()?; - Ok(()) - } - I64 => { - self.read_i64()?; - Ok(()) - } - BOOL => { - self.read_bool()?; - Ok(()) - } - F32 => { - self.read_f32()?; - Ok(()) - } - F64 => { - self.read_f64()?; - Ok(()) - } - OPEN => self.skip_container(), - _ => Ok(()), - } + fn deserialize_enum( + self, + _name: &'static str, + _variants: &'static [&'static 
str], + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + visitor.visit_enum(BinaryReaderEnum::new(self.de, self.token)) } #[inline] - fn skip_container(&mut self) -> Result<(), Error> { - let mut depth = 1; - while depth != 0 { - match self.read()? { - QUOTED_STRING | UNQUOTED_STRING => { - self.read_string()?; - } - U32 => { - self.read_u32()?; - } - I32 => { - self.read_i32()?; - } - U64 => { - self.read_u64()?; - } - I64 => { - self.read_i64()?; - } - BOOL => { - self.read_bool()?; - } - F32 => { - self.read_f32()?; - } - F64 => { - self.read_f64()?; - } - END => depth -= 1, - OPEN => depth += 1, - _ => {} - } + fn deserialize_ignored_any(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if matches!(self.token, Token::Open) { + self.de.reader.skip_container()?; } - Ok(()) + visitor.visit_unit() } +} - fn read_rgb(&mut self) -> Result { - let start = self.read()?; - let rtoken = self.read()?; - let r = self.read_u32()?; - let gtoken = self.read()?; - let g = self.read_u32()?; - let btoken = self.read()?; - let b = self.read_u32()?; - let next_tok = self.read()?; - let a = match (start, rtoken, gtoken, btoken, next_tok) { - (OPEN, U32, U32, U32, END) => None, - (OPEN, U32, U32, U32, U32) => { - let a = Some(self.read_u32()?); - if self.read()? != END { - return Err(self.invalid_syntax("expected end after rgb alpha")); - } - a - } - _ => return Err(self.invalid_syntax("invalid rgb value")), - }; +struct BinaryReaderSeq<'a: 'a, 'res, RES: 'a, F, R> { + de: &'a mut BinaryReaderDeserializer<'res, RES, F, R>, + hit_end: bool, +} - Ok(Rgb { r, g, b, a }) +impl<'a, 'de: 'a, 'res: 'de, RES: 'a, F, R> BinaryReaderSeq<'a, 'res, RES, F, R> { + fn new(de: &'a mut BinaryReaderDeserializer<'res, RES, F, R>) -> Self { + BinaryReaderSeq { de, hit_end: false } } +} - #[cold] - #[inline(never)] - fn invalid_syntax>(&self, msg: T) -> Error { - Error::new(ErrorKind::InvalidSyntax { - msg: msg.into(), - offset: self.original_length - self.data.len(), - }) +impl<'de, 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor, R: Read> SeqAccess<'de> + for BinaryReaderSeq<'a, 'res, RES, F, R> +{ + type Error = Error; + + fn next_element_seed(&mut self, seed: T) -> Result, Self::Error> + where + T: DeserializeSeed<'de>, + { + let de = unsafe { &mut *(self.de as *mut _) }; + match self.de.reader.read()? 
{ + Token::Close => { + self.hit_end = true; + Ok(None) + } + token => seed + .deserialize(BinaryReaderTokenDeserializer { de, token }) + .map(Some), + } } } -/// On-demand binary deserializer -pub struct OndemandBinaryDeserializer<'data, 'res: 'data, RES, F> { - parser: OndemandParser<'data>, - config: BinaryConfig<'res, RES, F>, +struct BinaryReaderEnum<'a, 'res, RES: 'a, F, R> { + de: &'a mut BinaryReaderDeserializer<'res, RES, F, R>, + token: Token<'a>, } -impl OndemandBinaryDeserializer<'_, '_, (), ()> { - /// Constructs a OndemandBinaryDeserializerBuilder - pub fn builder_flavor(flavor: F) -> OndemandBinaryDeserializerBuilder { - OndemandBinaryDeserializerBuilder::with_flavor(flavor) +impl<'a, 'res, RES: 'a, F, R> BinaryReaderEnum<'a, 'res, RES, F, R> { + fn new(de: &'a mut BinaryReaderDeserializer<'res, RES, F, R>, token: Token<'a>) -> Self { + BinaryReaderEnum { de, token } } } -/// Build a tweaked on-deman binary deserializer -#[derive(Debug)] -pub struct OndemandBinaryDeserializerBuilder { - failed_resolve_strategy: FailedResolveStrategy, - flavor: F, +impl<'de, 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor, R: Read> de::EnumAccess<'de> + for BinaryReaderEnum<'a, 'res, RES, F, R> +{ + type Error = Error; + type Variant = Self; + + fn variant_seed(self, seed: V) -> Result<(V::Value, Self), Self::Error> + where + V: de::DeserializeSeed<'de>, + { + let variant = seed.deserialize(BinaryReaderTokenDeserializer { + de: self.de, + token: self.token, + })?; + Ok((variant, self)) + } } -impl OndemandBinaryDeserializerBuilder -where - F: BinaryFlavor, +impl<'de, 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor, R> de::VariantAccess<'de> + for BinaryReaderEnum<'a, 'res, RES, F, R> { - /// Create a new builder instance - pub fn with_flavor(flavor: F) -> Self { - OndemandBinaryDeserializerBuilder { - failed_resolve_strategy: FailedResolveStrategy::Ignore, - flavor, - } - } + type Error = Error; - /// Set the behavior when a unknown token is encountered - pub fn on_failed_resolve(&mut self, strategy: FailedResolveStrategy) -> &mut Self { - self.failed_resolve_strategy = strategy; - self + fn unit_variant(self) -> Result<(), Self::Error> { + Ok(()) } - /// Convenience method for parsing and building a deserializer - pub fn from_slice<'data, 'res: 'data, RES>( - self, - data: &'data [u8], - resolver: &'res RES, - ) -> OndemandBinaryDeserializer<'data, 'res, RES, F> + fn newtype_variant_seed(self, _seed: T) -> Result where - RES: TokenResolver, + T: DeserializeSeed<'de>, { - let config = BinaryConfig { - resolver, - failed_resolve_strategy: self.failed_resolve_strategy, - flavor: self.flavor, - }; + Err(Error::from(DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from( + "unsupported enum deserialization. Please file issue", + )), + })) + } - OndemandBinaryDeserializer { - parser: OndemandParser { - data, - original_length: data.len(), - }, - config, - } + fn tuple_variant(self, _len: usize, _visitor: V) -> Result + where + V: Visitor<'de>, + { + Err(Error::from(DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from( + "unsupported enum deserialization. 
Please file issue", + )), + })) } - /// Convenience method for parsing and deserializing binary data - pub fn deserialize_slice<'b, 'data, 'res: 'data, RES, T>( + fn struct_variant( self, - data: &'data [u8], - resolver: &'res RES, - ) -> Result + _fields: &'static [&'static str], + _visitor: V, + ) -> Result where - T: Deserialize<'data>, - RES: TokenResolver, + V: Visitor<'de>, { - self.from_slice(data, resolver).deserialize() + Err(Error::from(DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from( + "unsupported enum deserialization. Please file issue", + )), + })) } } +/// On-demand binary deserializer +pub struct OndemandBinaryDeserializer<'data, 'res: 'data, RES, F> { + parser: Lexer<'data>, + config: BinaryConfig<'res, RES, F>, +} + impl<'de, 'res, RES: TokenResolver, E: BinaryFlavor> OndemandBinaryDeserializer<'de, 'res, RES, E> { /// Deserialize into provided type pub fn deserialize(&mut self) -> Result @@ -357,17 +653,18 @@ impl<'de, 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> MapAccess<'de> where K: DeserializeSeed<'de>, { - let token = self.de.parser.next(); - match token { - Some(END) => Ok(None), - None if self.root => Ok(None), - None => Err(Error::eof()), - Some(token) => seed + match self.de.parser.read_id() { + Ok(LexemeId::CLOSE) => Ok(None), + Ok(token) => seed .deserialize(OndemandTokenDeserializer { de: &mut *self.de, token, }) .map(Some), + Err(e) => match e.kind() { + LexError::Eof if self.root => Ok(None), + _ => Err(e.into()), + }, } } @@ -375,9 +672,9 @@ impl<'de, 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> MapAccess<'de> where V: DeserializeSeed<'de>, { - let mut token = self.de.parser.read()?; - if token == EQUAL { - token = self.de.parser.read()?; + let mut token = self.de.parser.read_id()?; + if token == LexemeId::EQUAL { + token = self.de.parser.read_id()?; } seed.deserialize(OndemandTokenDeserializer { @@ -389,7 +686,7 @@ impl<'de, 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> MapAccess<'de> struct OndemandTokenDeserializer<'a, 'de: 'a, 'res: 'de, RES: 'a, F> { de: &'a mut OndemandBinaryDeserializer<'de, 'res, RES, F>, - token: u16, + token: LexemeId, } impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> @@ -404,32 +701,36 @@ where let mut tok = self.token; // Skip empty objects masquerading as keys - while tok == OPEN && matches!(self.de.parser.peek(), Some(END)) { - self.de.parser.read()?; - tok = self.de.parser.read()?; + while tok == LexemeId::OPEN && matches!(self.de.parser.peek_id(), Some(LexemeId::CLOSE)) { + self.de.parser.read_id()?; + tok = self.de.parser.read_id()?; } match tok { - QUOTED_STRING | UNQUOTED_STRING => { + LexemeId::QUOTED | LexemeId::UNQUOTED => { let data = self.de.parser.read_string()?; - match self.de.config.flavor.decode(data) { + match self.de.config.flavor.decode(data.as_bytes()) { Cow::Borrowed(x) => visitor.visit_borrowed_str(x), Cow::Owned(x) => visitor.visit_string(x), } } - U32 => visitor.visit_u32(self.de.parser.read_u32()?), - I32 => visitor.visit_i32(self.de.parser.read_i32()?), - U64 => visitor.visit_u64(self.de.parser.read_u64()?), - I64 => visitor.visit_i64(self.de.parser.read_i64()?), - BOOL => visitor.visit_bool(self.de.parser.read_bool()?), - F32 => visitor.visit_f32(self.de.config.flavor.visit_f32(self.de.parser.read_f32()?)), - F64 => visitor.visit_f64(self.de.config.flavor.visit_f64(self.de.parser.read_f64()?)), - OPEN => visitor.visit_seq(OndemandSeq::new(self.de)), - END | EQUAL => Err(self - .de - .parser - .invalid_syntax("unexpected token 
encountered")), - s => match self.de.config.resolver.resolve(s) { + LexemeId::U32 => visitor.visit_u32(self.de.parser.read_u32()?), + LexemeId::I32 => visitor.visit_i32(self.de.parser.read_i32()?), + LexemeId::U64 => visitor.visit_u64(self.de.parser.read_u64()?), + LexemeId::I64 => visitor.visit_i64(self.de.parser.read_i64()?), + LexemeId::BOOL => visitor.visit_bool(self.de.parser.read_bool()?), + LexemeId::F32 => { + visitor.visit_f32(self.de.config.flavor.visit_f32(self.de.parser.read_f32()?)) + } + LexemeId::F64 => { + visitor.visit_f64(self.de.config.flavor.visit_f64(self.de.parser.read_f64()?)) + } + LexemeId::OPEN => visitor.visit_seq(OndemandSeq::new(self.de)), + LexemeId::CLOSE | LexemeId::EQUAL => Err(Error::invalid_syntax( + "unexpected token encountered", + self.de.parser.position(), + )), + LexemeId(s) => match self.de.config.resolver.resolve(s) { Some(id) => visitor.visit_borrowed_str(id), None => match self.de.config.failed_resolve_strategy { FailedResolveStrategy::Error => Err(Error::from(DeserializeError { @@ -474,10 +775,10 @@ impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::Deserializ where V: Visitor<'de>, { - if self.token == BOOL { + if self.token == LexemeId::BOOL { visitor.visit_bool(self.de.parser.read_bool()?) } else { - Ok(self.deser(visitor)?) + self.deser(visitor) } } @@ -485,10 +786,11 @@ impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::Deserializ where V: Visitor<'de>, { - match self.token { - QUOTED_STRING | UNQUOTED_STRING | U32 | I32 | U64 | I64 | BOOL | F32 | F64 | OPEN - | END | EQUAL => self.deser(visitor), - x => visitor.visit_u16(x), + if self.token.is_id() { + let LexemeId(x) = self.token; + visitor.visit_u16(x) + } else { + self.deser(visitor) } } @@ -496,10 +798,10 @@ impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::Deserializ where V: Visitor<'de>, { - if self.token == I32 { + if self.token == LexemeId::I32 { visitor.visit_i32(self.de.parser.read_i32()?) } else { - Ok(self.deser(visitor)?) + self.deser(visitor) } } @@ -507,10 +809,10 @@ impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::Deserializ where V: Visitor<'de>, { - if self.token == U32 { + if self.token == LexemeId::U32 { visitor.visit_u32(self.de.parser.read_u32()?) } else { - Ok(self.deser(visitor)?) + self.deser(visitor) } } @@ -518,10 +820,10 @@ impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::Deserializ where V: Visitor<'de>, { - if self.token == U64 { + if self.token == LexemeId::U64 { visitor.visit_u64(self.de.parser.read_u64()?) } else { - Ok(self.deser(visitor)?) + self.deser(visitor) } } @@ -529,10 +831,10 @@ impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::Deserializ where V: Visitor<'de>, { - if self.token == I64 { + if self.token == LexemeId::I64 { visitor.visit_i64(self.de.parser.read_i64()?) } else { - Ok(self.deser(visitor)?) + self.deser(visitor) } } @@ -540,10 +842,10 @@ impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::Deserializ where V: Visitor<'de>, { - if self.token == F32 { + if self.token == LexemeId::F32 { visitor.visit_f32(self.de.config.flavor.visit_f32(self.de.parser.read_f32()?)) } else { - Ok(self.deser(visitor)?) 
+ self.deser(visitor) } } @@ -551,10 +853,10 @@ impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::Deserializ where V: Visitor<'de>, { - if self.token == F64 { + if self.token == LexemeId::F64 { visitor.visit_f64(self.de.config.flavor.visit_f64(self.de.parser.read_f64()?)) } else { - Ok(self.deser(visitor)?) + self.deser(visitor) } } @@ -569,14 +871,14 @@ impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::Deserializ where V: Visitor<'de>, { - if self.token == QUOTED_STRING || self.token == UNQUOTED_STRING { + if self.token == LexemeId::QUOTED || self.token == LexemeId::UNQUOTED { let data = self.de.parser.read_string()?; - match self.de.config.flavor.decode(data) { + match self.de.config.flavor.decode(data.as_bytes()) { Cow::Borrowed(x) => visitor.visit_borrowed_str(x), Cow::Owned(x) => visitor.visit_string(x), } } else { - Ok(self.deser(visitor)?) + self.deser(visitor) } } @@ -620,26 +922,26 @@ impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::Deserializ where V: Visitor<'de>, { - if self.token == OPEN { + if self.token == LexemeId::OPEN { let mut seq = OndemandSeq::new(self.de); let result = visitor.visit_seq(&mut seq)?; if !seq.hit_end { // For when we are deserializing an array that doesn't read // the closing token - let ender = self.de.parser.read()?; - if ender != END { - return Err(self - .de - .parser - .invalid_syntax("Expected sequence to be terminated with an end token")); + let ender = self.de.parser.read_id()?; + if ender != LexemeId::CLOSE { + return Err(Error::invalid_syntax( + "Expected sequence to be terminated with an end token", + self.de.parser.position(), + )); } } Ok(result) - } else if self.token == RGB { + } else if self.token == LexemeId::RGB { let rgb = self.de.parser.read_rgb()?; visitor.visit_seq(ColorSequence::new(rgb)) } else { - Ok(self.deser(visitor)?) + self.deser(visitor) } } @@ -666,10 +968,10 @@ impl<'a, 'de: 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::Deserializ where V: Visitor<'de>, { - if self.token == OPEN { + if self.token == LexemeId::OPEN { visitor.visit_map(OndemandMap::new(self.de, false)) } else { - Ok(self.deser(visitor)?) 
+ self.deser(visitor) } } @@ -726,8 +1028,8 @@ impl<'de, 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> SeqAccess<'de> where T: DeserializeSeed<'de>, { - let token = self.de.parser.read()?; - if token == END { + let token = self.de.parser.read_id()?; + if token == LexemeId::CLOSE { self.hit_end = true; Ok(None) } else { @@ -742,11 +1044,11 @@ impl<'de, 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> SeqAccess<'de> struct OndemandEnum<'a, 'de: 'a, 'res: 'de, RES: 'a, F> { de: &'a mut OndemandBinaryDeserializer<'de, 'res, RES, F>, - token: u16, + token: LexemeId, } impl<'a, 'de: 'a, 'res: 'de, RES: 'a, F> OndemandEnum<'a, 'de, 'res, RES, F> { - fn new(de: &'a mut OndemandBinaryDeserializer<'de, 'res, RES, F>, token: u16) -> Self { + fn new(de: &'a mut OndemandBinaryDeserializer<'de, 'res, RES, F>, token: LexemeId) -> Self { OndemandEnum { de, token } } } @@ -864,7 +1166,7 @@ impl<'de, 'a, 'res: 'de, RES: TokenResolver, F: BinaryFlavor> de::VariantAccess< /// map.insert(0x2d83, String::from("field2")); /// /// let builder = BinaryDeserializer::builder_flavor(BinaryTestFlavor); -/// let mut deserializer = builder.from_slice(&data[..], &map)?; +/// let mut deserializer = builder.from_slice(&data[..], &map); /// let a: StructA = deserializer.deserialize()?; /// assert_eq!(a, StructA { /// field1: "ENG".to_string(), @@ -886,7 +1188,6 @@ pub struct BinaryDeserializer<'b, 'data: 'b, 'res: 'data, RES, F> { } enum BinaryDeserializerKind<'data, 'b> { - Owned(BinaryTape<'data>), Borrowed(&'b BinaryTape<'data>), } @@ -895,6 +1196,7 @@ enum BinaryDeserializerKind<'data, 'b> { pub struct BinaryDeserializerBuilder { failed_resolve_strategy: FailedResolveStrategy, flavor: F, + reader_config: TokenReaderBuilder, } impl BinaryDeserializerBuilder @@ -906,6 +1208,7 @@ where BinaryDeserializerBuilder { failed_resolve_strategy: FailedResolveStrategy::Ignore, flavor, + reader_config: TokenReaderBuilder::default(), } } @@ -915,30 +1218,63 @@ where self } - /// Convenience method for parsing and building a deserializer - pub fn from_slice<'b, 'a, 'res: 'a, RES>( + /// Set the reader buffer config (unused for slice deserializations) + pub fn reader_config(&mut self, val: TokenReaderBuilder) -> &mut Self { + self.reader_config = val; + self + } + + /// Create binary deserializer from reader + pub fn from_reader( + self, + reader: R, + resolver: &RES, + ) -> BinaryReaderDeserializer + where + RES: TokenResolver, + { + let reader = self.reader_config.build(reader); + let config = BinaryConfig { + resolver, + failed_resolve_strategy: self.failed_resolve_strategy, + flavor: self.flavor, + }; + + BinaryReaderDeserializer { reader, config } + } + + /// Deserialize value from reader + pub fn deserialize_reader(self, reader: R, resolver: &RES) -> Result + where + T: DeserializeOwned, + RES: TokenResolver, + { + self.from_reader(reader, resolver).deserialize() + } + + /// Create a binary deserializer from a slice + pub fn from_slice<'a, 'res: 'a, RES>( self, data: &'a [u8], resolver: &'res RES, - ) -> Result, Error> + ) -> OndemandBinaryDeserializer<'a, 'res, RES, F> where RES: TokenResolver, { - let tape = BinaryTape::from_slice(data)?; let config = BinaryConfig { resolver, failed_resolve_strategy: self.failed_resolve_strategy, flavor: self.flavor, }; - Ok(BinaryDeserializer { - tape: BinaryDeserializerKind::Owned(tape), + OndemandBinaryDeserializer { + parser: Lexer::new(data), config, - }) + } } - /// Convenience method for parsing and deserializing binary data - pub fn deserialize_slice<'b, 'data, 'res: 'data, 
RES, T>( + /// Deserialize value from slice + pub fn deserialize_slice<'data, 'res: 'data, RES, T>( self, data: &'data [u8], resolver: &'res RES, @@ -947,8 +1283,7 @@ where T: Deserialize<'data>, RES: TokenResolver, { - let deser = self.from_slice(data, resolver)?; - deser.deserialize() + self.from_slice(data, resolver).deserialize() } /// Deserialize the given binary tape @@ -971,6 +1306,19 @@ where config, } } + + /// Deserialize the given binary tape + pub fn deserialize_tape<'data, 'b, 'res: 'data, RES, T>( + self, + tape: &'b BinaryTape<'data>, + resolver: &'res RES, + ) -> Result + where + T: Deserialize<'data>, + RES: TokenResolver, + { + self.from_tape(tape, resolver).deserialize() + } } impl<'b, 'de, 'res, RES: TokenResolver, E: BinaryFlavor> BinaryDeserializer<'b, 'de, 'res, RES, E> { @@ -1025,13 +1373,12 @@ impl<'a, 'b, 'de, 'res, RES: TokenResolver, F: BinaryFlavor> de::Deserializer<'d V: Visitor<'de>, { match &self.tape { - BinaryDeserializerKind::Owned(x) | &BinaryDeserializerKind::Borrowed(x) => visitor - .visit_map(BinaryMap::new( - &self.config, - x.tokens(), - 0, - x.tokens().len(), - )), + &BinaryDeserializerKind::Borrowed(x) => visitor.visit_map(BinaryMap::new( + &self.config, + x.tokens(), + 0, + x.tokens().len(), + )), } } @@ -1556,13 +1903,24 @@ mod tests { T: Deserialize<'a> + PartialEq + std::fmt::Debug, RES: TokenResolver, { - let result = eu4_builder().from_slice(data, resolver)?.deserialize()?; - let ondemand = OndemandBinaryDeserializerBuilder::with_flavor(Eu4Flavor::new()) - .deserialize_slice(data, resolver)?; + let tape = BinaryTape::from_slice(data).unwrap(); + let result = eu4_builder().deserialize_tape(&tape, resolver)?; + let ondemand = eu4_builder().deserialize_slice(data, resolver)?; assert_eq!(result, ondemand); Ok(result) } + fn from_owned<'a, 'res: 'a, RES, T>(data: &'a [u8], resolver: &'res RES) -> Result + where + T: DeserializeOwned + PartialEq + std::fmt::Debug, + RES: TokenResolver, + { + let res = from_slice(data, resolver).unwrap(); + let reader: T = eu4_builder().deserialize_reader(data, resolver).unwrap(); + assert_eq!(reader, res); + Ok(res) + } + #[test] fn test_single_field() { let data = [ @@ -1577,7 +1935,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -1640,7 +1998,7 @@ mod tests { map.insert(0x2d82, String::from("field1")); map.insert(0x284c, String::from("no")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -1661,7 +2019,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!(actual, MyStruct { field1: 89 }); } @@ -1677,7 +2035,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!(actual, MyStruct { field1: 89 }); } @@ -1695,7 +2053,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x326b, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!(actual, MyStruct { field1: 
128 }); } @@ -1713,7 +2071,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x326b, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!(actual, MyStruct { field1: -1 }); } @@ -1729,7 +2087,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!(actual, MyStruct { field1: 0.023 }); } @@ -1747,7 +2105,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!(actual, MyStruct { field1: 1.78732 }); } @@ -1765,7 +2123,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -1788,7 +2146,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -1809,7 +2167,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -1830,7 +2188,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -1858,7 +2216,7 @@ mod tests { } } - let actual: MyStruct = from_slice(&data[..], &NullResolver).unwrap(); + let actual: MyStruct = from_owned(&data[..], &NullResolver).unwrap(); assert_eq!( actual, MyStruct { @@ -1885,7 +2243,7 @@ mod tests { map.insert(0x284c, String::from("yes")); map.insert(0x284b, String::from("no")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -1914,7 +2272,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2ee1, String::from("dlc_enabled")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -1950,7 +2308,7 @@ mod tests { map.insert(0x2ee1, String::from("dlc_enabled")); map.insert(0x2d82, String::from("field1")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -1994,7 +2352,7 @@ mod tests { map.insert(0x2ec7, String::from("third")); map.insert(0x2ec8, String::from("fourth")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2037,7 +2395,7 @@ mod tests { map.insert(0x2ec7, String::from("third")); map.insert(0x2ec8, String::from("fourth")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2058,7 +2416,7 @@ mod tests { ]; 
let map: HashMap = HashMap::new(); - let actual: HashMap = from_slice(&data[..], &map).unwrap(); + let actual: HashMap = from_owned(&data[..], &map).unwrap(); assert_eq!(actual.len(), 1); assert_eq!(actual.get(&89), Some(&30)); } @@ -2085,7 +2443,7 @@ mod tests { String::from("1444.11.11"), ); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2116,7 +2474,7 @@ mod tests { String::from(r#"Joe "Captain" Rogers"#), ); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2147,7 +2505,7 @@ mod tests { map.insert(0x00e1, String::from("type")); map.insert(0x28be, String::from("general")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2171,7 +2529,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x00e1, String::from("type")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!(actual, MyStruct { _type: vec![] }); } @@ -2198,7 +2556,7 @@ mod tests { map.insert(0x284c, String::from("yes")); map.insert(0x284b, String::from("no")); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2234,7 +2592,7 @@ mod tests { field1: u64, } - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!(actual, MyStruct { field1: 128 }); } @@ -2257,7 +2615,7 @@ mod tests { map.insert(0x2d82, "field1"); map.insert(0x28e3, "second"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2281,8 +2639,7 @@ mod tests { let map: HashMap = HashMap::new(); let mut builder = eu4_builder(); builder.on_failed_resolve(FailedResolveStrategy::Error); - let actual: Result = - builder.from_slice(&data[..], &map).unwrap().deserialize(); + let actual: Result = builder.from_slice(&data[..], &map).deserialize(); assert!(actual.is_err()); } @@ -2316,7 +2673,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, "field1"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2347,7 +2704,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, "field1"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2444,7 +2801,7 @@ mod tests { map.insert(0x1b, "name"); map.insert(0x165, "none"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2471,7 +2828,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, "field1"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2495,7 +2852,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, "field1"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], 
&map).unwrap(); assert_eq!( actual, MyStruct { @@ -2521,7 +2878,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x2d82, "field1"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2556,7 +2913,7 @@ mod tests { map.insert(0x2ec9, "savegame_version"); map.insert(0x28e2, "first"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2597,7 +2954,7 @@ mod tests { map.insert(0x2ec9, "savegame_version"); map.insert(0x28e2, "field"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2643,7 +3000,7 @@ mod tests { map.insert(0x2ec9, "savegame_version"); map.insert(0x28e2, "field"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2676,7 +3033,7 @@ mod tests { map.insert(0x2d82, "field1"); map.insert(0x28e3, "second"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2698,7 +3055,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x337f, "campaign_id"); - let actual: Meta = from_slice(&data[..], &map).unwrap(); + let actual: Meta = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, Meta { @@ -2717,7 +3074,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x053a, "color"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { @@ -2787,7 +3144,7 @@ mod tests { let mut map = HashMap::new(); map.insert(0x053a, "color"); - let actual: MyStruct = from_slice(&data[..], &map).unwrap(); + let actual: MyStruct = from_owned(&data[..], &map).unwrap(); assert_eq!( actual, MyStruct { diff --git a/src/binary/flavor.rs b/src/binary/flavor.rs index 79298a0..6f6199f 100644 --- a/src/binary/flavor.rs +++ b/src/binary/flavor.rs @@ -1,3 +1,9 @@ +#[cfg(feature = "derive")] +use crate::{ + binary::{de::BinaryDeserializerBuilder, TokenResolver}, + BinaryDeserializer, Error, +}; + /// Trait customizing decoding values from binary data /// /// How binary data is encoded differs between games and even @@ -8,6 +14,37 @@ pub trait BinaryFlavor: crate::Encoding { /// Decode a f64 from 8 bytes of data fn visit_f64(&self, data: [u8; 8]) -> f64; + + /// Create binary deserializer from this binary flavor + #[cfg(feature = "derive")] + fn deserializer(&self) -> BinaryDeserializerBuilder<&Self> { + BinaryDeserializer::builder_flavor(self) + } + + /// Deserialize value from slice of data with this binary flavor + #[cfg(feature = "derive")] + fn deserialize_slice<'de, 'res: 'de, T, RES>( + &self, + data: &'de [u8], + resolver: &'res RES, + ) -> Result + where + T: serde::de::Deserialize<'de>, + RES: TokenResolver, + { + self.deserializer().deserialize_slice(data, resolver) + } + + /// Deserialize value from stream of data with this binary flavor + #[cfg(feature = "derive")] + fn deserialize_reader(&self, reader: R, resolver: &RES) -> Result + where + T: serde::de::DeserializeOwned, + RES: TokenResolver, + R: std::io::Read, + { + self.deserializer().deserialize_reader(reader, resolver) + } } impl BinaryFlavor for &'_ T { diff --git a/src/binary/lexer.rs 
b/src/binary/lexer.rs
new file mode 100644
index 0000000..bab54a0
--- /dev/null
+++ b/src/binary/lexer.rs
@@ -0,0 +1,778 @@
+use super::Rgb;
+use crate::{util::get_split, Scalar};
+use std::fmt;
+
+/// The ID of the current lexeme
+#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[repr(transparent)]
+pub struct LexemeId(pub u16);
+
+impl LexemeId {
+    /// A binary '{' (open bracket)
+    pub const OPEN: LexemeId = LexemeId::new(0x0003);
+
+    /// A binary '}' (close bracket)
+    pub const CLOSE: LexemeId = LexemeId::new(0x0004);
+
+    /// A binary '='
+    pub const EQUAL: LexemeId = LexemeId::new(0x0001);
+
+    /// A binary 32 bit unsigned integer
+    pub const U32: LexemeId = LexemeId::new(0x0014);
+
+    /// A binary 64 bit unsigned integer
+    pub const U64: LexemeId = LexemeId::new(0x029c);
+
+    /// A binary 32 bit signed integer
+    pub const I32: LexemeId = LexemeId::new(0x000c);
+
+    /// A binary boolean
+    pub const BOOL: LexemeId = LexemeId::new(0x000e);
+
+    /// A binary string that is typically quoted
+    pub const QUOTED: LexemeId = LexemeId::new(0x000f);
+
+    /// A binary string that is typically without quotes
+    pub const UNQUOTED: LexemeId = LexemeId::new(0x0017);
+
+    /// A binary 32 bit floating point
+    pub const F32: LexemeId = LexemeId::new(0x000d);
+
+    /// A binary 64 bit floating point
+    pub const F64: LexemeId = LexemeId::new(0x0167);
+
+    /// A binary RGB value
+    pub const RGB: LexemeId = LexemeId::new(0x0243);
+
+    /// A binary 64 bit signed integer
+    pub const I64: LexemeId = LexemeId::new(0x0317);
+
+    /// Construct a new [LexemeId] from a 16-bit value
+    #[inline]
+    pub const fn new(x: u16) -> Self {
+        LexemeId(x)
+    }
+
+    /// Identifies if the given ID does not match any of the predefined [LexemeId]
+    /// constants, and thus can be considered an ID token.
+ /// + /// ```rust + /// use jomini::binary::LexemeId; + /// let lid = LexemeId::new(0x1000); + /// assert!(lid.is_id()); + /// ``` + #[inline] + pub const fn is_id(&self) -> bool { + !matches!( + *self, + LexemeId::OPEN + | LexemeId::CLOSE + | LexemeId::EQUAL + | LexemeId::U32 + | LexemeId::U64 + | LexemeId::I32 + | LexemeId::BOOL + | LexemeId::QUOTED + | LexemeId::UNQUOTED + | LexemeId::F32 + | LexemeId::F64 + | LexemeId::RGB + | LexemeId::I64 + ) + } +} + +#[inline] +pub(crate) fn read_id(data: &[u8]) -> Result<(LexemeId, &[u8]), LexError> { + let (head, rest) = get_split::<2>(data).ok_or(LexError::Eof)?; + Ok((LexemeId::new(u16::from_le_bytes(head)), rest)) +} + +#[inline] +pub(crate) fn read_string(data: &[u8]) -> Result<(Scalar, &[u8]), LexError> { + let (head, rest) = get_split::<2>(data).ok_or(LexError::Eof)?; + let text_len = usize::from(u16::from_le_bytes(head)); + if text_len <= rest.len() { + let (text, rest) = rest.split_at(text_len); + Ok((Scalar::new(text), rest)) + } else { + Err(LexError::Eof) + } +} + +#[inline] +pub(crate) fn read_bool(data: &[u8]) -> Result<(bool, &[u8]), LexError> { + let (&first, rest) = data.split_first().ok_or(LexError::Eof)?; + Ok((first != 0, rest)) +} + +#[inline] +pub(crate) fn read_u32(data: &[u8]) -> Result<(u32, &[u8]), LexError> { + let (head, rest) = get_split::<4>(data).ok_or(LexError::Eof)?; + Ok((u32::from_le_bytes(head), rest)) +} + +#[inline] +pub(crate) fn read_u64(data: &[u8]) -> Result<(u64, &[u8]), LexError> { + let (head, rest) = get_split::<8>(data).ok_or(LexError::Eof)?; + Ok((u64::from_le_bytes(head), rest)) +} + +#[inline] +pub(crate) fn read_i64(data: &[u8]) -> Result<(i64, &[u8]), LexError> { + let (head, rest) = get_split::<8>(data).ok_or(LexError::Eof)?; + Ok((i64::from_le_bytes(head), rest)) +} + +#[inline] +pub(crate) fn read_i32(data: &[u8]) -> Result<(i32, &[u8]), LexError> { + let (head, rest) = get_split::<4>(data).ok_or(LexError::Eof)?; + Ok((i32::from_le_bytes(head), rest)) +} + +#[inline] +pub(crate) fn read_f32(data: &[u8]) -> Result<([u8; 4], &[u8]), LexError> { + get_split::<4>(data).ok_or(LexError::Eof) +} + +#[inline] +pub(crate) fn read_f64(data: &[u8]) -> Result<([u8; 8], &[u8]), LexError> { + get_split::<8>(data).ok_or(LexError::Eof) +} + +#[inline] +pub(crate) fn read_rgb(data: &[u8]) -> Result<(Rgb, &[u8]), LexError> { + let (start, data) = read_id(data)?; + let (rtoken, data) = read_id(data)?; + let (r, data) = read_u32(data)?; + let (gtoken, data) = read_id(data)?; + let (g, data) = read_u32(data)?; + let (btoken, data) = read_id(data)?; + let (b, data) = read_u32(data)?; + let (next_tok, data) = read_id(data)?; + match (start, rtoken, gtoken, btoken, next_tok) { + (LexemeId::OPEN, LexemeId::U32, LexemeId::U32, LexemeId::U32, LexemeId::CLOSE) => { + Ok((Rgb { r, g, b, a: None }, data)) + } + (LexemeId::OPEN, LexemeId::U32, LexemeId::U32, LexemeId::U32, LexemeId::U32) => { + let (a, data) = read_u32(data)?; + let (end, data) = read_id(data)?; + if end == LexemeId::CLOSE { + let a = Some(a); + Ok((Rgb { r, g, b, a }, data)) + } else { + Err(LexError::InvalidRgb) + } + } + _ => Err(LexError::InvalidRgb), + } +} + +/// Binary token, the raw form of [BinaryToken](crate::binary::BinaryToken) +/// +/// This binary token contains the yielded raw tokens, and won't match open and +/// close tokens together, nor does it make a determination if open and close +/// represents an array, object, or both. 
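+/// For example (a minimal sketch over an empty `{}` container), the open and
+/// close tokens are yielded as-is rather than being paired:
+///
+/// ```rust
+/// use jomini::binary::{Lexer, Token};
+/// let mut lexer = Lexer::new(&[0x03, 0x00, 0x04, 0x00]);
+/// assert_eq!(lexer.next_token(), Ok(Some(Token::Open)));
+/// assert_eq!(lexer.next_token(), Ok(Some(Token::Close)));
+/// assert_eq!(lexer.next_token(), Ok(None));
+/// ```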
+#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub enum Token<'a> { + /// '{' + Open, + + /// '}' + Close, + + /// '=' + Equal, + + /// 32bit unsigned integer + U32(u32), + + /// 64bit unsigned integer + U64(u64), + + /// 32bit signed integer + I32(i32), + + /// boolean + Bool(bool), + + /// quoted text + Quoted(Scalar<'a>), + + /// text that is not quoted + Unquoted(Scalar<'a>), + + /// 32bits of floating point data + F32([u8; 4]), + + /// 64bits of floating point data + F64([u8; 8]), + + /// Rgb data + Rgb(Rgb), + + /// 64bit signed integer + I64(i64), + + /// token id that can be resolved to a string via a + /// [TokenResolver](crate::binary::TokenResolver) + Id(u16), +} + +#[inline] +pub(crate) fn read_token(data: &[u8]) -> Result<(Token, &[u8]), LexError> { + let (id, data) = read_id(data)?; + match id { + LexemeId::OPEN => Ok((Token::Open, data)), + LexemeId::CLOSE => Ok((Token::Close, data)), + LexemeId::EQUAL => Ok((Token::Equal, data)), + LexemeId::U32 => read_u32(data).map(|(x, d)| (Token::U32(x), d)), + LexemeId::U64 => read_u64(data).map(|(x, d)| (Token::U64(x), d)), + LexemeId::I32 => read_i32(data).map(|(x, d)| (Token::I32(x), d)), + LexemeId::BOOL => read_bool(data).map(|(x, d)| (Token::Bool(x), d)), + LexemeId::QUOTED => read_string(data).map(|(x, d)| (Token::Quoted(x), d)), + LexemeId::UNQUOTED => read_string(data).map(|(x, d)| (Token::Unquoted(x), d)), + LexemeId::F32 => read_f32(data).map(|(x, d)| (Token::F32(x), d)), + LexemeId::F64 => read_f64(data).map(|(x, d)| (Token::F64(x), d)), + LexemeId::RGB => read_rgb(data).map(|(x, d)| (Token::Rgb(x), d)), + LexemeId::I64 => read_i64(data).map(|(x, d)| (Token::I64(x), d)), + LexemeId(id) => Ok((Token::Id(id), data)), + } +} + +/// Lexical error type without positional information +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum LexError { + /// Data ended too soon + Eof, + + /// An invalid RGB block encountered + InvalidRgb, +} + +impl std::error::Error for LexError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + None + } +} + +impl std::fmt::Display for LexError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + LexError::Eof => write!(f, "unexpected end of file"), + LexError::InvalidRgb => write!(f, "invalid rgb data encountered",), + } + } +} + +impl LexError { + #[inline] + #[must_use] + pub(crate) fn at(self, position: usize) -> LexerError { + LexerError { + position, + kind: self, + } + } +} + +/// Lexical error type with positional information +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct LexerError { + position: usize, + kind: LexError, +} + +impl LexerError { + /// Return the byte position where the error occurred + pub fn position(&self) -> usize { + self.position + } + + /// Return a reference the error kind + pub fn kind(&self) -> &LexError { + &self.kind + } + + /// Consume self and return the error kind + #[must_use] + pub fn into_kind(self) -> LexError { + self.kind + } +} + +impl std::error::Error for LexerError { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + None + } +} + +impl std::fmt::Display for LexerError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.kind { + LexError::Eof => write!(f, "not enough data to read at {}", self.position), + LexError::InvalidRgb => write!(f, "invalid rgb data encountered at {}", self.position), + } + } +} + +/// Zero cost binary data scanner. +/// +/// There are two main ways to drive the lexer. 
To see them in action, imagine
/// we want to count the maximum depth of nesting.
+///
+/// ```rust
+/// use jomini::binary::{Lexer, Token};
+/// let mut lexer = Lexer::new(&[0x2d, 0x28, 0x01, 0x00, 0x03, 0x00, 0x03, 0x00, 0x04, 0x00, 0x04, 0x00]);
+/// let mut max_depth = 0;
+/// let mut current_depth = 0;
+/// while let Some(token) = lexer.next_token()? {
+///     match token {
+///         Token::Open => {
+///             current_depth += 1;
+///             max_depth = max_depth.max(current_depth);
+///         }
+///         Token::Close => current_depth -= 1,
+///         _ => {}
+///     }
+/// }
+/// assert_eq!(max_depth, 2);
+/// # Ok::<(), jomini::binary::LexerError>(())
+/// ```
+///
+/// [Lexer::next_token] is an ergonomic way to scan through binary tokens.
+/// Functions prefixed with `read_` denote that more data is expected, while
+/// `next_` allows for the data to finish.
+///
+/// To scan through the binary data with zero overhead, one needs to drive the
+/// lexer more thoroughly.
+///
+/// ```rust
+/// use jomini::binary::{Lexer, LexemeId};
+/// let mut lexer = Lexer::new(&[0x2d, 0x28, 0x01, 0x00, 0x03, 0x00, 0x03, 0x00, 0x04, 0x00, 0x04, 0x00]);
+/// let mut max_depth = 0;
+/// let mut current_depth = 0;
+/// while let Some(id) = lexer.next_id()? {
+///     match id {
+///         LexemeId::OPEN => {
+///             current_depth += 1;
+///             max_depth = max_depth.max(current_depth);
+///         }
+///         LexemeId::CLOSE => current_depth -= 1,
+///         LexemeId::U32 => { lexer.read_u32()?; }
+///         LexemeId::I32 => { lexer.read_i32()?; }
+///         LexemeId::BOOL => { lexer.read_bool()?; }
+///         LexemeId::QUOTED | LexemeId::UNQUOTED => { lexer.read_string()?; }
+///         LexemeId::F32 => { lexer.read_f32()?; }
+///         LexemeId::F64 => { lexer.read_f64()?; }
+///         LexemeId::I64 => { lexer.read_i64()?; }
+///         _ => {}
+///     }
+/// }
+/// assert_eq!(max_depth, 2);
+/// # Ok::<(), jomini::binary::LexerError>(())
+/// ```
+///
+/// Only at token boundaries can the `token` functions be intertwined with the
+/// individual lexeme functions.
+///
+/// Errors reported will contain positional information.
+pub struct Lexer<'a> {
+    data: &'a [u8],
+    original_length: usize,
+}
+
+impl<'a> Lexer<'a> {
+    /// Creates a new lexer over the given data
+    #[inline]
+    pub fn new(data: &'a [u8]) -> Self {
+        Self {
+            data,
+            original_length: data.len(),
+        }
+    }
+
+    /// Returns the remaining data that has not yet been processed.
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexemeId};
+    /// let mut lexer = Lexer::new(&[0xd2, 0x28, 0xff]);
+    /// assert_eq!(lexer.read_id().unwrap(), LexemeId::new(0x28d2));
+    /// assert_eq!(lexer.remainder(), &[0xff]);
+    /// ```
+    #[inline]
+    pub fn remainder(&self) -> &'a [u8] {
+        self.data
+    }
+
+    /// Returns how many bytes have been processed by the lexer
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexemeId};
+    /// let mut lexer = Lexer::new(&[0xd2, 0x28, 0xff]);
+    /// assert_eq!(lexer.read_id().unwrap(), LexemeId::new(0x28d2));
+    /// assert_eq!(lexer.position(), 2);
+    /// ```
+    #[inline]
+    pub fn position(&self) -> usize {
+        self.original_length - self.data.len()
+    }
+
+    #[inline]
+    fn err_position(&self, err: LexError) -> LexerError {
+        err.at(self.position())
+    }
+
+    /// Advance the lexer through the next lexeme id, and return it
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexemeId, LexError};
+    /// let mut lexer = Lexer::new(&[0x2d, 0x28]);
+    /// assert_eq!(lexer.read_id(), Ok(LexemeId::new(0x282d)));
+    /// assert_eq!(lexer.read_id().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn read_id(&mut self) -> Result<LexemeId, LexerError> {
+        let (result, rest) = read_id(self.data).map_err(|e| self.err_position(e))?;
+        self.data = rest;
+        Ok(result)
+    }
+
+    /// Attempt to advance through the [LexemeId]
+    ///
+    /// An EOF error can still be thrown if data is present but not enough
+    /// exists to decode the next [LexemeId]
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexemeId, LexError};
+    /// let mut lexer = Lexer::new(&[0x2d, 0x28]);
+    /// assert_eq!(lexer.next_id(), Ok(Some(LexemeId::new(0x282d))));
+    /// assert_eq!(lexer.next_id(), Ok(None));
+    ///
+    /// let mut lexer = Lexer::new(&[0x2d]);
+    /// assert_eq!(lexer.next_id().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn next_id(&mut self) -> Result<Option<LexemeId>, LexerError> {
+        match read_id(self.data) {
+            Ok((result, rest)) => {
+                self.data = rest;
+                Ok(Some(result))
+            }
+            Err(LexError::Eof) if self.remainder().is_empty() => Ok(None),
+            Err(e) => Err(self.err_position(e)),
+        }
+    }
+
+    /// Assume more tokens exist in the data and read the next one.
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError, Token};
+    /// let mut lexer = Lexer::new(&[0x2d, 0x28]);
+    /// assert_eq!(lexer.read_token(), Ok(Token::Id(0x282d)));
+    /// assert_eq!(lexer.read_token().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn read_token(&mut self) -> Result<Token<'a>, LexerError> {
+        let (result, rest) = read_token(self.data).map_err(|e| self.err_position(e))?;
+        self.data = rest;
+        Ok(result)
+    }
+
+    /// Attempt to advance through the next token or return `None` if no data remains
+    ///
+    /// An EOF error can still be thrown if data is present but not enough
+    /// exists to decode the next token.
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, Token, LexError};
+    /// let mut lexer = Lexer::new(&[0x2d, 0x28]);
+    /// assert_eq!(lexer.next_token(), Ok(Some(Token::Id(0x282d))));
+    /// assert_eq!(lexer.next_token(), Ok(None));
+    ///
+    /// let mut lexer = Lexer::new(&[0x2d]);
+    /// assert_eq!(lexer.next_token().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn next_token(&mut self) -> Result<Option<Token<'a>>, LexerError> {
+        match read_token(self.data) {
+            Ok((result, rest)) => {
+                self.data = rest;
+                Ok(Some(result))
+            }
+            Err(LexError::Eof) if self.remainder().is_empty() => Ok(None),
+            Err(e) => Err(self.err_position(e)),
+        }
+    }
+
+    /// Peek at the next [LexemeId] without advancing the lexer
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError, LexemeId};
+    /// let mut lexer = Lexer::new(&[0x01, 0x00][..]);
+    /// assert_eq!(lexer.peek_id(), Some(LexemeId::EQUAL));
+    /// assert_eq!(lexer.read_id(), Ok(LexemeId::EQUAL));
+    /// assert_eq!(lexer.peek_id(), None);
+    /// ```
+    #[inline]
+    pub fn peek_id(&mut self) -> Option<LexemeId> {
+        self.data
+            .get(..2)
+            .map(|head| LexemeId::new(u16::from_le_bytes([head[0], head[1]])))
+    }
+
+    /// Peek at the next [Token] without advancing the lexer
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError, Token};
+    /// let mut lexer = Lexer::new(&[0x01, 0x00][..]);
+    /// assert_eq!(lexer.peek_token(), Some(Token::Equal));
+    /// assert_eq!(lexer.read_token(), Ok(Token::Equal));
+    /// assert_eq!(lexer.peek_token(), None);
+    /// ```
+    #[inline]
+    pub fn peek_token(&mut self) -> Option<Token<'a>> {
+        read_token(self.data).ok().map(|(t, _)| t)
+    }
+
+    /// Advance the lexer through a length prefixed string
+    ///
+    /// ```rust
+    /// use jomini::{Scalar, binary::{Lexer, LexError}};
+    /// let mut lexer = Lexer::new(&[0x03, 0x00, 0x45, 0x4e, 0x47][..]);
+    /// assert_eq!(lexer.read_string(), Ok(Scalar::new(b"ENG")));
+    /// assert_eq!(lexer.read_string().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn read_string(&mut self) -> Result<Scalar<'a>, LexerError> {
+        let (result, rest) = read_string(self.data).map_err(|e| self.err_position(e))?;
+        self.data = rest;
+        Ok(result)
+    }
+
+    /// Advance the lexer through a boolean
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError};
+    /// let mut lexer = Lexer::new(&[0x01, 0x00][..]);
+    /// assert_eq!(lexer.read_bool(), Ok(true));
+    /// assert_eq!(lexer.read_bool(), Ok(false));
+    /// assert_eq!(lexer.read_bool().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn read_bool(&mut self) -> Result<bool, LexerError> {
+        let (result, rest) = read_bool(self.data).map_err(|e| self.err_position(e))?;
+        self.data = rest;
+        Ok(result)
+    }
+
+    /// Advance the lexer through an unsigned little endian 32 bit integer
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError};
+    /// let mut lexer = Lexer::new(&[0x59, 0x00, 0x00, 0x00][..]);
+    /// assert_eq!(lexer.read_u32(), Ok(89));
+    /// assert_eq!(lexer.read_u32().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn read_u32(&mut self) -> Result<u32, LexerError> {
+        let (result, rest) = read_u32(self.data).map_err(|e| self.err_position(e))?;
+        self.data = rest;
+        Ok(result)
+    }
+
+    /// Advance the lexer through an unsigned little endian 64 bit integer
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError};
+    /// let mut lexer = Lexer::new(&[0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00][..]);
+    /// assert_eq!(lexer.read_u64(), Ok(128));
+    /// assert_eq!(lexer.read_u64().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn read_u64(&mut
self) -> Result<u64, LexerError> {
+        let (result, rest) = read_u64(self.data).map_err(|e| self.err_position(e))?;
+        self.data = rest;
+        Ok(result)
+    }
+
+    /// Advance the lexer through a signed little endian 64 bit integer
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError};
+    /// let mut lexer = Lexer::new(&[0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff][..]);
+    /// assert_eq!(lexer.read_i64(), Ok(-1));
+    /// assert_eq!(lexer.read_i64().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn read_i64(&mut self) -> Result<i64, LexerError> {
+        let (result, rest) = read_i64(self.data).map_err(|e| self.err_position(e))?;
+        self.data = rest;
+        Ok(result)
+    }
+
+    /// Advance the lexer through a signed little endian 32 bit integer
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError};
+    /// let mut lexer = Lexer::new(&[0x59, 0x00, 0x00, 0x00][..]);
+    /// assert_eq!(lexer.read_i32(), Ok(89));
+    /// assert_eq!(lexer.read_i32().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn read_i32(&mut self) -> Result<i32, LexerError> {
+        let (result, rest) = read_i32(self.data).map_err(|e| self.err_position(e))?;
+        self.data = rest;
+        Ok(result)
+    }
+
+    /// Advance the lexer through 32 bits of floating point data and return the bytes
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError};
+    /// let data = [0x17, 0x00, 0x00, 0x00];
+    /// let mut lexer = Lexer::new(&data[..]);
+    /// assert_eq!(lexer.read_f32(), Ok(data));
+    /// assert_eq!(lexer.read_f32().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn read_f32(&mut self) -> Result<[u8; 4], LexerError> {
+        let (result, rest) = read_f32(self.data).map_err(|e| self.err_position(e))?;
+        self.data = rest;
+        Ok(result)
+    }
+
+    /// Advance the lexer through 64 bits of floating point data and return the bytes
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError};
+    /// let data = [0xc7, 0xe4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00];
+    /// let mut lexer = Lexer::new(&data[..]);
+    /// assert_eq!(lexer.read_f64(), Ok(data));
+    /// assert_eq!(lexer.read_f64().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn read_f64(&mut self) -> Result<[u8; 8], LexerError> {
+        let (result, rest) = read_f64(self.data).map_err(|e| self.err_position(e))?;
+        self.data = rest;
+        Ok(result)
+    }
+
+    /// Advance the lexer through an rgb value (with optional alpha channel)
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError, Rgb};
+    /// let data = [0x03, 0x00, 0x14, 0x00, 0x6e, 0x00, 0x00, 0x00,
+    ///             0x14, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x14, 0x00,
+    ///             0x1b, 0x00, 0x00, 0x00, 0x04, 0x00];
+    /// let mut lexer = Lexer::new(&data[..]);
+    /// assert_eq!(lexer.read_rgb(), Ok(Rgb { r: 110, g: 27, b: 27, a: None }));
+    /// assert_eq!(lexer.read_rgb().unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    pub fn read_rgb(&mut self) -> Result<Rgb, LexerError> {
+        let (result, rest) = read_rgb(self.data).map_err(|e| self.err_position(e))?;
+        self.data = rest;
+        Ok(result)
+    }
+
+    /// Advance a given number of bytes and return them
+    ///
+    /// ```rust
+    /// use jomini::binary::{Lexer, LexError};
+    /// let mut lexer = Lexer::new(b"EU4bin");
+    /// assert_eq!(lexer.read_bytes(6), Ok(&b"EU4bin"[..]));
+    /// assert_eq!(lexer.read_bytes(1).unwrap_err().kind(), &LexError::Eof);
+    /// ```
+    #[inline]
+    pub fn read_bytes(&mut self, bytes: usize) -> Result<&'a [u8], LexerError> {
+        if self.data.len() >= bytes {
+            let (head, rest) = self.data.split_at(bytes);
+            self.data = rest;
+            Ok(head)
+        } else {
+            Err(self.err_position(LexError::Eof))
+        }
+    }
+
+    /// Skip the value
denoted by the [LexemeId]. Will skip entire containers.
+    #[inline]
+    pub fn skip_value(&mut self, id: LexemeId) -> Result<(), LexerError> {
+        match id {
+            LexemeId::QUOTED | LexemeId::UNQUOTED => {
+                self.read_string()?;
+                Ok(())
+            }
+            LexemeId::U32 => {
+                self.read_u32()?;
+                Ok(())
+            }
+            LexemeId::I32 => {
+                self.read_i32()?;
+                Ok(())
+            }
+            LexemeId::U64 => {
+                self.read_u64()?;
+                Ok(())
+            }
+            LexemeId::I64 => {
+                self.read_i64()?;
+                Ok(())
+            }
+            LexemeId::BOOL => {
+                self.read_bool()?;
+                Ok(())
+            }
+            LexemeId::F32 => {
+                self.read_f32()?;
+                Ok(())
+            }
+            LexemeId::F64 => {
+                self.read_f64()?;
+                Ok(())
+            }
+            LexemeId::OPEN => self.skip_container(),
+            _ => Ok(()),
+        }
+    }
+
+    #[inline]
+    fn skip_container(&mut self) -> Result<(), LexerError> {
+        let mut depth = 1;
+        loop {
+            match self.read_id()? {
+                LexemeId::QUOTED | LexemeId::UNQUOTED => {
+                    self.read_string()?;
+                }
+                LexemeId::U32 => {
+                    self.read_u32()?;
+                }
+                LexemeId::I32 => {
+                    self.read_i32()?;
+                }
+                LexemeId::U64 => {
+                    self.read_u64()?;
+                }
+                LexemeId::I64 => {
+                    self.read_i64()?;
+                }
+                LexemeId::BOOL => {
+                    self.read_bool()?;
+                }
+                LexemeId::F32 => {
+                    self.read_f32()?;
+                }
+                LexemeId::F64 => {
+                    self.read_f64()?;
+                }
+                LexemeId::CLOSE => {
+                    depth -= 1;
+                    if depth == 0 {
+                        return Ok(());
+                    }
+                }
+                LexemeId::OPEN => depth += 1,
+                _ => {}
+            }
+        }
+    }
+}
diff --git a/src/binary/mod.rs b/src/binary/mod.rs
index 4c8eb3e..74b59c3 100644
--- a/src/binary/mod.rs
+++ b/src/binary/mod.rs
@@ -1,18 +1,85 @@
 //! Types for parsing clausewitz binary input
 //!
-//! See the top level module documentation for an overview that includes parsing
-//! and deserializing binary data.
+//! Main binary deserialization APIs:
+//! - [BinaryFlavor::deserialize_slice]
+//! - [BinaryFlavor::deserialize_reader]
+//!
+//! If the serde deserialization API is too high level, one can build
+//! abstractions on top of the following:
+//! - [BinaryTape::from_slice]: Realizes a pseudo AST onto a linear tape.
+//!   Cleans up and normalizes data.
+//! - [TokenReader]: An incremental binary lexer designed for handling large
+//!   saves in a memory efficient manner.
+//! - [Lexer]: The lowest level, a zero cost binary data scanner over a byte
+//!   slice.
+//!
+//! ## Direct identifier deserialization with `token` attribute
+//!
+//! There may be some performance loss during binary deserialization as
+//! tokens are resolved to strings via a `TokenResolver` and then matched against the
+//! string representations of a struct's fields.
+//!
+//! We can fix this issue by directly encoding the expected token value into the struct:
+//!
+//! ```rust
+//! # #[cfg(feature = "derive")] {
+//! # use jomini::{Encoding, JominiDeserialize, Windows1252Encoding, binary::BinaryFlavor};
+//! # use std::{borrow::Cow, collections::HashMap};
+//! #
+//! # #[derive(Debug, Default)]
+//! # pub struct BinaryTestFlavor;
+//! #
+//! # impl BinaryFlavor for BinaryTestFlavor {
+//! #     fn visit_f32(&self, data: [u8; 4]) -> f32 {
+//! #         f32::from_le_bytes(data)
+//! #     }
+//! #
+//! #     fn visit_f64(&self, data: [u8; 8]) -> f64 {
+//! #         f64::from_le_bytes(data)
+//! #     }
+//! # }
+//! #
+//! # impl Encoding for BinaryTestFlavor {
+//! #     fn decode<'a>(&self, data: &'a [u8]) -> Cow<'a, str> {
+//! #         Windows1252Encoding::decode(data)
+//! #     }
+//! # }
+//! #
+//! # let data = [ 0x82, 0x2d, 0x01, 0x00, 0x0f, 0x00, 0x03, 0x00, 0x45, 0x4e, 0x47 ];
+//! #
+//! #[derive(JominiDeserialize, PartialEq, Debug)]
+//! struct MyStruct {
+//!     #[jomini(token = 0x2d82)]
+//!     field1: String,
+//! }
+//!
+//! // Empty token to string resolver
+//! let map = HashMap::<u16, String>::new();
+//!
+//! let actual: MyStruct = BinaryTestFlavor.deserialize_slice(&data[..], &map)?;
+//! assert_eq!(actual, MyStruct { field1: "ENG".to_string() });
+//! # }
+//! # Ok::<(), Box<dyn std::error::Error>>(())
+//! ```
+//!
+//! A couple of notes:
+//!
+//! - This does not obviate the need for the token to string resolver, as tokens may be used as values.
+//! - If the `token` attribute is specified on one field of a struct, it must be specified on all fields of that struct.
 
 /// binary deserialization
 #[cfg(feature = "derive")]
 pub mod de;
 mod flavor;
+mod lexer;
+mod reader;
 mod resolver;
 mod rgb;
 mod tape;
-mod tokens;
 
 pub use self::flavor::BinaryFlavor;
+pub use self::lexer::{LexError, LexemeId, Lexer, LexerError, Token};
+pub use self::reader::{ReaderError, ReaderErrorKind, TokenReader, TokenReaderBuilder};
 pub use self::resolver::{FailedResolveStrategy, TokenResolver};
 pub use self::rgb::*;
 pub use self::tape::{BinaryTape, BinaryTapeParser, BinaryToken};
diff --git a/src/binary/reader.rs b/src/binary/reader.rs
new file mode 100644
index 0000000..8185184
--- /dev/null
+++ b/src/binary/reader.rs
@@ -0,0 +1,430 @@
+use super::{
+    lexer::{read_id, read_string, read_token},
+    LexError, LexemeId, LexerError, Token,
+};
+use crate::buffer::{BufferError, BufferWindow, BufferWindowBuilder, SliceReader};
+use std::{fmt, io::Read};
+
+/// [Lexer](crate::binary::Lexer) that works over a [Read] implementation
+///
+/// Example of computing the max nesting depth using a [TokenReader].
+///
+/// ```rust
+/// use jomini::binary::{TokenReader, Token};
+/// let data = [0x2d, 0x28, 0x01, 0x00, 0x03, 0x00, 0x03, 0x00, 0x04, 0x00, 0x04, 0x00];
+/// let mut reader = TokenReader::new(&data[..]);
+/// let mut max_depth = 0;
+/// let mut current_depth = 0;
+/// while let Some(token) = reader.next()? {
+///     match token {
+///         Token::Open => {
+///             current_depth += 1;
+///             max_depth = max_depth.max(current_depth);
+///         }
+///         Token::Close => current_depth -= 1,
+///         _ => {}
+///     }
+/// }
+/// assert_eq!(max_depth, 2);
+/// # Ok::<(), jomini::binary::ReaderError>(())
+/// ```
+///
+/// Unlike a [BinaryTape](crate::BinaryTape), which will skip ghost objects,
+/// pair open and close tokens together, and recognize if a container is an
+/// object, array, or mixed -- the tokens yielded from a [TokenReader] are not
+/// fully formed. This is a much more raw view of the data that can be used to
+/// construct higher level parsers, melters, and deserializers that operate over
+/// a stream of data.
+///
+/// [TokenReader] operates over a fixed size buffer, so using a
+/// [BufRead](std::io::BufRead) affords no benefits. An error will be returned
+/// for tokens that are impossible to fit within the buffer (eg: if provided
+/// with a 100 byte buffer but there is a binary string that is 101 bytes long).
+#[derive(Debug)]
+pub struct TokenReader<R> {
+    reader: R,
+    buf: BufferWindow,
+}
+
+impl TokenReader<()> {
+    /// Read from a byte slice without memcpy's
+    #[inline]
+    pub fn from_slice(data: &[u8]) -> TokenReader<SliceReader> {
+        TokenReader {
+            reader: SliceReader::new(data),
+            buf: BufferWindow::from_slice(data),
+        }
+    }
+}
+
+impl<R> TokenReader<R>
+where
+    R: Read,
+{
+    /// Convenience method for constructing the default token reader
+    #[inline]
+    pub fn new(reader: R) -> Self {
+        TokenReader::builder().build(reader)
+    }
+
+    /// Returns the byte position of the data stream that has been processed.
+    ///
+    /// ```rust
+    /// use jomini::binary::{TokenReader, Token};
+    /// let mut reader = TokenReader::new(&[0xd2, 0x28, 0xff][..]);
+    /// assert_eq!(reader.read().unwrap(), Token::Id(0x28d2));
+    /// assert_eq!(reader.position(), 2);
+    /// ```
+    #[inline]
+    pub fn position(&self) -> usize {
+        self.buf.position()
+    }
+
+    #[inline]
+    fn next_opt(&mut self) -> (Option<Token>, Option<ReaderError>) {
+        loop {
+            let window =
+                unsafe { std::slice::from_raw_parts(self.buf.start, self.buf.window_len()) };
+            match read_token(window) {
+                Ok((tok, new_data)) => {
+                    self.buf.advance_to(new_data.as_ptr());
+                    return (Some(tok), None);
+                }
+                Err(LexError::Eof) => {}
+                Err(e) => return (None, Some(self.lex_error(e))),
+            }
+
+            match self.buf.fill_buf(&mut self.reader) {
+                Ok(0) if self.buf.window_len() == 0 => return (None, None),
+                Ok(0) => return (None, Some(self.lex_error(LexError::Eof))),
+                Ok(_) => {}
+                Err(e) => return (None, Some(self.buffer_error(e))),
+            }
+        }
+    }
+
+    /// Advance a given number of bytes and return them.
+    ///
+    /// The internal buffer must be large enough to accommodate all bytes.
+    ///
+    /// ```rust
+    /// use jomini::binary::{TokenReader, LexError, ReaderErrorKind};
+    /// let mut reader = TokenReader::new(&b"EU4bin"[..]);
+    /// assert_eq!(reader.read_bytes(6).unwrap(), &b"EU4bin"[..]);
+    /// assert!(matches!(reader.read_bytes(1).unwrap_err().kind(), ReaderErrorKind::Lexer(LexError::Eof)));
+    /// ```
+    #[inline]
+    pub fn read_bytes(&mut self, bytes: usize) -> Result<&[u8], ReaderError> {
+        while self.buf.window_len() < bytes {
+            match self.buf.fill_buf(&mut self.reader) {
+                Ok(0) => return Err(self.lex_error(LexError::Eof)),
+                Ok(_) => {}
+                Err(e) => return Err(self.buffer_error(e)),
+            }
+        }
+
+        let input = unsafe { std::slice::from_raw_parts(self.buf.start, bytes) };
+        self.buf.advance(bytes);
+        Ok(input)
+    }
+
+    /// Advance through the containing block until the closing token is consumed
+    ///
+    /// ```rust
+    /// use jomini::binary::{TokenReader, Token};
+    /// let mut reader = TokenReader::new(&[
+    ///     0xd2, 0x28, 0x01, 0x00, 0x03, 0x00, 0x03, 0x00,
+    ///     0x04, 0x00, 0x04, 0x00, 0xff, 0xff
+    /// ][..]);
+    /// assert_eq!(reader.read().unwrap(), Token::Id(0x28d2));
+    /// assert_eq!(reader.read().unwrap(), Token::Equal);
+    /// assert_eq!(reader.read().unwrap(), Token::Open);
+    /// assert!(reader.skip_container().is_ok());
+    /// assert_eq!(reader.read().unwrap(), Token::Id(0xffff));
+    /// ```
+    #[inline]
+    pub fn skip_container(&mut self) -> Result<(), ReaderError> {
+        let mut depth = 1;
+        loop {
+            while let Ok((id, data)) = read_id(self.buf.window()) {
+                match id {
+                    LexemeId::CLOSE => {
+                        self.buf.advance_to(data.as_ptr());
+                        depth -= 1;
+                        if depth == 0 {
+                            return Ok(());
+                        }
+                    }
+                    LexemeId::OPEN => {
+                        self.buf.advance_to(data.as_ptr());
+                        depth += 1
+                    }
+                    LexemeId::BOOL => match data.get(1..) {
+                        Some(d) => self.buf.advance_to(d.as_ptr()),
+                        None => break,
+                    },
+                    LexemeId::F32 | LexemeId::U32 | LexemeId::I32 => match data.get(4..) {
+                        Some(d) => self.buf.advance_to(d.as_ptr()),
+                        None => break,
+                    },
+                    LexemeId::F64 | LexemeId::I64 | LexemeId::U64 => match data.get(8..)
{
+                        Some(d) => self.buf.advance_to(d.as_ptr()),
+                        None => break,
+                    },
+                    LexemeId::QUOTED | LexemeId::UNQUOTED => match read_string(data) {
+                        Ok((_, d)) => self.buf.advance_to(d.as_ptr()),
+                        Err(_) => break,
+                    },
+                    _ => self.buf.advance_to(data.as_ptr()),
+                }
+            }
+
+            match self.buf.fill_buf(&mut self.reader) {
+                Ok(0) => return Err(self.lex_error(LexError::Eof)),
+                Ok(_) => {}
+                Err(e) => return Err(self.buffer_error(e)),
+            }
+        }
+    }
+
+    /// Consume the token reader and return the internal buffer and reader. This
+    /// allows the buffer to be reused.
+    ///
+    /// ```rust
+    /// use jomini::binary::TokenReader;
+    /// let data = b"EU4bin";
+    /// let mut reader = TokenReader::new(&data[..]);
+    /// assert_eq!(reader.read_bytes(6).unwrap(), &data[..]);
+    ///
+    /// let (buf, _) = reader.into_parts();
+    /// let data = b"HOI4bin";
+    /// let mut reader = TokenReader::builder().buffer(buf).build(&data[..]);
+    /// assert_eq!(reader.read_bytes(7).unwrap(), &data[..]);
+    /// ```
+    #[inline]
+    pub fn into_parts(self) -> (Box<[u8]>, R) {
+        (self.buf.buf, self.reader)
+    }
+
+    /// Read the next token in the stream. Will error if not enough data remains
+    /// to decode a token.
+    ///
+    /// ```rust
+    /// use jomini::binary::{TokenReader, Token, ReaderErrorKind, LexError};
+    /// let mut reader = TokenReader::new(&[
+    ///     0xd2, 0x28, 0x01, 0x00, 0x03, 0x00, 0x04, 0x00
+    /// ][..]);
+    /// assert_eq!(reader.read().unwrap(), Token::Id(0x28d2));
+    /// assert_eq!(reader.read().unwrap(), Token::Equal);
+    /// assert_eq!(reader.read().unwrap(), Token::Open);
+    /// assert_eq!(reader.read().unwrap(), Token::Close);
+    /// assert!(matches!(reader.read().unwrap_err().kind(), ReaderErrorKind::Lexer(LexError::Eof)));
+    /// ```
+    #[inline]
+    pub fn read(&mut self) -> Result<Token, ReaderError> {
+        // Workaround for borrow checker :(
+        let s = unsafe { &mut *(self as *mut TokenReader<R>) };
+        match self.next_opt() {
+            (Some(x), _) => Ok(x),
+            (None, None) => Err(s.lex_error(LexError::Eof)),
+            (None, Some(e)) => Err(e),
+        }
+    }
+
+    /// Read a token, returning none when all the data has been consumed
+    ///
+    /// ```rust
+    /// use jomini::binary::{TokenReader, Token};
+    /// let mut reader = TokenReader::new(&[
+    ///     0xd2, 0x28, 0x01, 0x00, 0x03, 0x00, 0x04, 0x00
+    /// ][..]);
+    /// assert_eq!(reader.next().unwrap(), Some(Token::Id(0x28d2)));
+    /// assert_eq!(reader.next().unwrap(), Some(Token::Equal));
+    /// assert_eq!(reader.next().unwrap(), Some(Token::Open));
+    /// assert_eq!(reader.next().unwrap(), Some(Token::Close));
+    /// assert_eq!(reader.next().unwrap(), None);
+    /// ```
+    #[inline]
+    pub fn next(&mut self) -> Result<Option<Token>, ReaderError> {
+        match self.next_opt() {
+            (Some(x), _) => Ok(Some(x)),
+            (None, None) => Ok(None),
+            (None, Some(e)) => Err(e),
+        }
+    }
+
+    #[cold]
+    #[inline(never)]
+    fn buffer_error(&self, e: BufferError) -> ReaderError {
+        ReaderError {
+            position: self.position(),
+            kind: ReaderErrorKind::from(e),
+        }
+    }
+
+    #[cold]
+    #[inline(never)]
+    fn lex_error(&self, e: LexError) -> ReaderError {
+        ReaderError::from(e.at(self.position()))
+    }
+}
+
+impl TokenReader<()> {
+    /// Initializes a default [TokenReaderBuilder]
+    pub fn builder() -> TokenReaderBuilder {
+        TokenReaderBuilder::default()
+    }
+}
+
+/// Creates a binary token reader
+#[derive(Debug, Default)]
+pub struct TokenReaderBuilder {
+    buffer: BufferWindowBuilder,
+}
+
+impl TokenReaderBuilder {
+    /// Set the fixed size buffer to the given buffer
+    #[inline]
+    pub fn buffer(mut self, val: Box<[u8]>) -> TokenReaderBuilder {
+        self.buffer = self.buffer.buffer(val);
+        self
+    }
+
+    /// Set the length of the buffer if no buffer is provided
+    #[inline]
+    pub fn buffer_len(mut self, val: usize) -> TokenReaderBuilder {
+        self.buffer = self.buffer.buffer_len(val);
+        self
+    }
+
+    /// Create a binary token reader around a given reader.
+    #[inline]
+    pub fn build<R>(self, reader: R) -> TokenReader<R> {
+        let buf = self.buffer.build();
+        TokenReader { reader, buf }
+    }
+}
+
+/// The specific binary reader error type.
+#[derive(Debug)]
+pub enum ReaderErrorKind {
+    /// An underlying error from a [Read]er
+    Read(std::io::Error),
+
+    /// The internal buffer does not have enough room to store data for the next
+    /// token
+    BufferFull,
+
+    /// The data is corrupted
+    Lexer(LexError),
+}
+
+/// A binary lexing error over a `Read` implementation
+#[derive(Debug)]
+pub struct ReaderError {
+    position: usize,
+    kind: ReaderErrorKind,
+}
+
+impl ReaderError {
+    /// Return the byte position where the error occurred
+    pub fn position(&self) -> usize {
+        self.position
+    }
+
+    /// Return a reference to the error kind
+    pub fn kind(&self) -> &ReaderErrorKind {
+        &self.kind
+    }
+
+    /// Consume self and return the error kind
+    #[must_use]
+    pub fn into_kind(self) -> ReaderErrorKind {
+        self.kind
+    }
+}
+
+impl std::error::Error for ReaderError {
+    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
+        match &self.kind {
+            ReaderErrorKind::Read(cause) => Some(cause),
+            _ => None,
+        }
+    }
+}
+
+impl std::fmt::Display for ReaderError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match &self.kind {
+            ReaderErrorKind::Read { .. } => {
+                write!(f, "failed to read past position: {}", self.position)
+            }
+            ReaderErrorKind::BufferFull => {
+                write!(f, "max buffer size exceeded at position: {}", self.position)
+            }
+            ReaderErrorKind::Lexer(cause) => {
+                write!(f, "{} at position: {}", cause, self.position)
+            }
+        }
+    }
+}
+
+impl From<LexerError> for ReaderError {
+    fn from(value: LexerError) -> Self {
+        ReaderError {
+            position: value.position(),
+            kind: ReaderErrorKind::Lexer(value.into_kind()),
+        }
+    }
+}
+
+impl From<BufferError> for ReaderErrorKind {
+    fn from(value: BufferError) -> Self {
+        match value {
+            BufferError::Io(x) => ReaderErrorKind::Read(x),
+            BufferError::BufferFull => ReaderErrorKind::BufferFull,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn test_reader(data: &[u8], expected: &[Token]) {
+        fn eq<R>(mut reader: TokenReader<R>, expected: &[Token])
+        where
+            R: Read,
+        {
+            for token in expected {
+                assert_eq!(reader.next().unwrap(), Some(*token));
+            }
+            assert_eq!(reader.next().unwrap(), None);
+        }
+
+        eq(TokenReader::new(data), expected);
+
+        let data_with_header: Vec<_> = b"EU4bin".iter().chain(data).copied().collect();
+        let mut reader = TokenReader::new(data_with_header.as_slice());
+        assert_eq!(reader.read_bytes(6).unwrap(), &b"EU4bin"[..]);
+        eq(reader, expected);
+    }
+
+    #[test]
+    fn test_binary_token_reader() {
+        let data = [0xe1, 0x00, 0x01, 0x00, 0x03, 0x00, 0x04, 0x00];
+        test_reader(
+            &data,
+            &[Token::Id(0x00e1), Token::Equal, Token::Open, Token::Close],
+        );
+    }
+
+    #[test]
+    fn test_not_enough_data() {
+        let mut reader = TokenReader::new(&[0x43][..]);
+        assert!(matches!(
+            reader.read().unwrap_err().kind(),
+            &ReaderErrorKind::Lexer(LexError::Eof)
+        ));
+    }
+}
diff --git a/src/binary/tape.rs b/src/binary/tape.rs
index a34b819..f0d8160 100644
--- a/src/binary/tape.rs
+++ b/src/binary/tape.rs
@@ -1,4 +1,10 @@
-use super::tokens::*;
+use super::{
+    lexer::{
+        read_bool, read_f32, read_f64, read_i32, read_i64, read_id, read_rgb, read_string,
+        read_u32, read_u64,
+    },
+ LexError, LexemeId, +}; use crate::{binary::Rgb, copyless::VecHelper, util::get_split, Error, ErrorKind, Scalar}; /// Represents any valid binary value @@ -151,125 +157,77 @@ impl<'a, 'b> ParserState<'a, 'b> { } #[inline] - fn parse_next_id(&mut self, data: &'a [u8]) -> Result<(&'a [u8], u16), Error> { - self.parse_next_id_opt(data).ok_or_else(Error::eof) + fn parse_next_id(&mut self, data: &'a [u8]) -> Result<(&'a [u8], LexemeId), Error> { + read_id(data) + .map(|(id, rest)| (rest, id)) + .map_err(|e| self.err_position(e, data)) } #[inline] fn parse_u32(&mut self, data: &'a [u8]) -> Result<&'a [u8], Error> { - let (head, rest) = get_split::<4>(data).ok_or_else(Error::eof)?; - let val = u32::from_le_bytes(head); - self.token_tape.alloc().init(BinaryToken::U32(val)); + let (result, rest) = read_u32(data).map_err(|e| self.err_position(e, data))?; + self.token_tape.alloc().init(BinaryToken::U32(result)); Ok(rest) } #[inline] fn parse_u64(&mut self, data: &'a [u8]) -> Result<&'a [u8], Error> { - let (head, rest) = get_split::<8>(data).ok_or_else(Error::eof)?; - let val = u64::from_le_bytes(head); - self.token_tape.alloc().init(BinaryToken::U64(val)); + let (result, rest) = read_u64(data).map_err(|e| self.err_position(e, data))?; + self.token_tape.alloc().init(BinaryToken::U64(result)); Ok(rest) } #[inline] fn parse_i64(&mut self, data: &'a [u8]) -> Result<&'a [u8], Error> { - let (head, rest) = get_split::<8>(data).ok_or_else(Error::eof)?; - let val = i64::from_le_bytes(head); - self.token_tape.alloc().init(BinaryToken::I64(val)); + let (result, rest) = read_i64(data).map_err(|e| self.err_position(e, data))?; + self.token_tape.alloc().init(BinaryToken::I64(result)); Ok(rest) } #[inline] fn parse_i32(&mut self, data: &'a [u8]) -> Result<&'a [u8], Error> { - let (head, rest) = get_split::<4>(data).ok_or_else(Error::eof)?; - let val = i32::from_le_bytes(head); - self.token_tape.alloc().init(BinaryToken::I32(val)); + let (result, rest) = read_i32(data).map_err(|e| self.err_position(e, data))?; + self.token_tape.alloc().init(BinaryToken::I32(result)); Ok(rest) } #[inline] fn parse_f32(&mut self, data: &'a [u8]) -> Result<&'a [u8], Error> { - let (head, rest) = get_split::<4>(data).ok_or_else(Error::eof)?; - self.token_tape.alloc().init(BinaryToken::F32(head)); + let (result, rest) = read_f32(data).map_err(|e| self.err_position(e, data))?; + self.token_tape.alloc().init(BinaryToken::F32(result)); Ok(rest) } #[inline] fn parse_f64(&mut self, data: &'a [u8]) -> Result<&'a [u8], Error> { - let (head, rest) = get_split::<8>(data).ok_or_else(Error::eof)?; - self.token_tape.alloc().init(BinaryToken::F64(head)); + let (result, rest) = read_f64(data).map_err(|e| self.err_position(e, data))?; + self.token_tape.alloc().init(BinaryToken::F64(result)); Ok(rest) } #[inline] fn parse_bool(&mut self, data: &'a [u8]) -> Result<&'a [u8], Error> { - let val = data.first().map(|&x| x != 0).ok_or_else(Error::eof)?; - self.token_tape.alloc().init(BinaryToken::Bool(val)); - Ok(&data[1..]) + let (result, rest) = read_bool(data).map_err(|e| self.err_position(e, data))?; + self.token_tape.alloc().init(BinaryToken::Bool(result)); + Ok(rest) } fn parse_rgb(&mut self, data: &'a [u8]) -> Result<&'a [u8], Error> { - let data = &data[2..]; - let (data, r_tok) = self.parse_next_id(data)?; - let (r_data, data) = get_split::<4>(data).ok_or_else(Error::eof)?; - let r = u32::from_le_bytes(r_data); - - let (data, g_tok) = self.parse_next_id(data)?; - let (g_data, data) = get_split::<4>(data).ok_or_else(Error::eof)?; - let g = 
u32::from_le_bytes(g_data); - - let (data, b_tok) = self.parse_next_id(data)?; - let (b_data, data) = get_split::<4>(data).ok_or_else(Error::eof)?; - let b = u32::from_le_bytes(b_data); - - if r_tok != U32 && g_tok != U32 && b_tok != U32 { - return Err(self.invalid_syntax("invalid rgb tokens", data)); - } - - let (data, next_tok) = self.parse_next_id(data)?; - - let (data, a) = match next_tok { - U32 => { - let (a_data, data) = get_split::<4>(data).ok_or_else(Error::eof)?; - let a = u32::from_le_bytes(a_data); - let (data, end_tok) = self.parse_next_id(data)?; - if end_tok != END { - return Err(self.invalid_syntax("expected end to follow rgb alpha", data)); - } - (data, Some(a)) - } - END => (data, None), - _ => return Err(self.invalid_syntax("invalid rgb end token", data)), - }; - - let val = Rgb { r, g, b, a }; - self.token_tape.alloc().init(BinaryToken::Rgb(val)); - Ok(data) - } - - #[inline(always)] - fn parse_string_inner(&mut self, data: &'a [u8]) -> Result<(Scalar<'a>, &'a [u8]), Error> { - let (head, rest) = get_split::<2>(data).ok_or_else(Error::eof)?; - let text_len = usize::from(u16::from_le_bytes(head)); - if text_len <= rest.len() { - let (text, rest) = rest.split_at(text_len); - let scalar = Scalar::new(text); - Ok((scalar, rest)) - } else { - Err(Error::eof()) - } + let (result, rest) = read_rgb(data).map_err(|e| self.err_position(e, data))?; + self.token_tape.alloc().init(BinaryToken::Rgb(result)); + Ok(rest) } #[inline(always)] fn parse_quoted_string(&mut self, data: &'a [u8]) -> Result<&'a [u8], Error> { - let (scalar, rest) = self.parse_string_inner(data)?; + let (scalar, rest) = read_string(data).map_err(|e| self.err_position(e, data))?; self.token_tape.alloc().init(BinaryToken::Quoted(scalar)); Ok(rest) } #[inline(always)] fn parse_unquoted_string(&mut self, data: &'a [u8]) -> Result<&'a [u8], Error> { - let (scalar, rest) = self.parse_string_inner(data)?; + let (scalar, rest) = read_string(data).map_err(|e| self.err_position(e, data))?; self.token_tape.alloc().init(BinaryToken::Unquoted(scalar)); Ok(rest) } @@ -294,6 +252,8 @@ impl<'a, 'b> ParserState<'a, 'b> { } fn parse(&mut self) -> Result<(), Error> { + use super::LexemeId as L; + let mut data = self.data; let mut state = ParseState::Key; @@ -320,25 +280,27 @@ impl<'a, 'b> ParserState<'a, 'b> { }; } - 'outer: while let Some((mut d, mut token_id)) = self.parse_next_id_opt(data) { + 'outer: while let Some((mut d, token_id)) = self.parse_next_id_opt(data) { + let mut token_id = LexemeId(token_id); + // This conditional is purely an optimization to parse an entire // = in one iteration of the loop, and can be removed // or ignored to ease understanding. See PR #111 for a breakdown on // field and value frequency. 
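        // For illustration (hypothetical key token 0x2d82): the lexeme
        // sequence `id(0x2d82) equal i32(1)` -- one full `key=value` pair --
        // is consumed entirely by the branch below, without returning to the
        // top of the outer loop between lexemes.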
if ENABLE_OPTIMIZATION && state == ParseState::Key { - if token_id > UNQUOTED_STRING || token_id == 0xb { + if token_id > L::UNQUOTED || token_id == L(0xb) { // 65-90% of keys are tokens // 5% of these keys are id (0xb) - if token_id != F64 && token_id != U64 { - self.token_tape.alloc().init(BinaryToken::Token(token_id)); + if token_id != L::F64 && token_id != L::U64 { + self.token_tape.alloc().init(BinaryToken::Token(token_id.0)); let (d2, token_id2) = self.parse_next_id(d)?; - if token_id2 == EQUAL { + if token_id2 == L::EQUAL { let (d3, token_id3) = self.parse_next_id(d2)?; - if token_id3 == I32 { + if token_id3 == L::I32 { data = self.parse_i32(d3)?; continue; - } else if token_id3 == OPEN { + } else if token_id3 == L::OPEN { // We could be looking at a primitive array // so we should attempt to parse it in one go let ind = self.token_tape.len(); @@ -357,7 +319,7 @@ impl<'a, 'b> ParserState<'a, 'b> { let (nd2, x) = self.parse_next_id(nd)?; if x == $token { nd = self.$fn(nd2)?; - } else if x == END { + } else if x == L::CLOSE { data = nd2; let end_idx = self.token_tape.len(); match unsafe { @@ -392,20 +354,22 @@ impl<'a, 'b> ParserState<'a, 'b> { } // These three array types cover 99.6% of EU4 arrays - if token_id4 == I32 { - parse_array_field!(parse_i32, I32); - } else if token_id4 == QUOTED_STRING { - parse_array_field!(parse_quoted_string, QUOTED_STRING); - } else if token_id4 == F32 { - parse_array_field!(parse_f32, F32); - } else if (token_id4 > UNQUOTED_STRING - && token_id4 != F64 - && token_id4 != U64) - || token_id4 == 0xb + if token_id4 == L::I32 { + parse_array_field!(parse_i32, L::I32); + } else if token_id4 == L::QUOTED { + parse_array_field!(parse_quoted_string, L::QUOTED); + } else if token_id4 == L::F32 { + parse_array_field!(parse_f32, L::F32); + } else if (token_id4 > L::UNQUOTED + && token_id4 != L::F64 + && token_id4 != L::U64) + || token_id4 == L(0xb) { - self.token_tape.alloc().init(BinaryToken::Token(token_id4)); + self.token_tape + .alloc() + .init(BinaryToken::Token(token_id4.0)); let (d4, token_id4) = self.parse_next_id(d4)?; - if token_id4 == EQUAL { + if token_id4 == L::EQUAL { unsafe { self.set_parent_to_object(parent_ind) }; state = ParseState::ObjectValue; (d, token_id) = self.parse_next_id(d4)?; @@ -419,10 +383,10 @@ impl<'a, 'b> ParserState<'a, 'b> { token_id = token_id4; state = ParseState::OpenFirst; } - } else if token_id3 == QUOTED_STRING { + } else if token_id3 == L::QUOTED { data = self.parse_quoted_string(d3)?; continue; - } else if token_id3 == F32 { + } else if token_id3 == L::F32 { data = self.parse_f32(d3)?; continue; } else { @@ -436,19 +400,19 @@ impl<'a, 'b> ParserState<'a, 'b> { state = ParseState::KeyValueSeparator; } } - } else if token_id == END { + } else if token_id == L::CLOSE { push_end!(); data = d; continue; - } else if token_id == QUOTED_STRING { + } else if token_id == L::QUOTED { // over 20% of EU4 object keys are quoted strings and they // nearly always are objects let d2 = self.parse_quoted_string(d)?; let (d3, token_id2) = self.parse_next_id(d2)?; - if token_id2 == EQUAL { + if token_id2 == L::EQUAL { let (d4, token_id3) = self.parse_next_id(d3)?; - if token_id3 == OPEN { + if token_id3 == L::OPEN { let ind = self.token_tape.len(); self.token_tape.alloc().init(BinaryToken::Array(parent_ind)); parent_ind = ind; @@ -456,18 +420,18 @@ impl<'a, 'b> ParserState<'a, 'b> { (d, token_id) = self.parse_next_id(d4)?; // Expect an object that follows a quoted string to start with a token - if token_id > UNQUOTED_STRING && token_id != 
F64 && token_id != U64 { - self.token_tape.alloc().init(BinaryToken::Token(token_id)); + if token_id > L::UNQUOTED && token_id != L::F64 && token_id != L::U64 { + self.token_tape.alloc().init(BinaryToken::Token(token_id.0)); (d, token_id) = self.parse_next_id(d)?; - if token_id == EQUAL { + if token_id == L::EQUAL { unsafe { self.set_parent_to_object(parent_ind) }; state = ParseState::ObjectValue; (d, token_id) = self.parse_next_id(d)?; - if token_id == BOOL { + if token_id == L::BOOL { data = self.parse_bool(d)?; state = ParseState::Key; continue; - } else if token_id == QUOTED_STRING { + } else if token_id == L::QUOTED { data = self.parse_quoted_string(d)?; state = ParseState::Key; continue; @@ -486,15 +450,15 @@ impl<'a, 'b> ParserState<'a, 'b> { token_id = token_id2; state = ParseState::KeyValueSeparator; } - } else if token_id == I32 { + } else if token_id == L::I32 { // 8% of Vic3 and EU4 object keys are i32 // 96% of i32 keys have an i32 value let d2 = self.parse_i32(d)?; let (d3, token_id2) = self.parse_next_id(d2)?; - if token_id2 == EQUAL { + if token_id2 == L::EQUAL { let (d4, token_id3) = self.parse_next_id(d3)?; - if token_id3 == I32 { + if token_id3 == L::I32 { data = self.parse_i32(d4)?; continue; } else { @@ -520,15 +484,15 @@ impl<'a, 'b> ParserState<'a, 'b> { } match token_id { - U32 => { + L::U32 => { data = self.parse_u32(d)?; state = Self::next_state(state); } - U64 => { + L::U64 => { data = self.parse_u64(d)?; state = Self::next_state(state); } - I32 => { + L::I32 => { data = self.parse_i32(d)?; state = Self::next_state(state); @@ -536,9 +500,9 @@ impl<'a, 'b> ParserState<'a, 'b> { let mut nd = data; loop { let (nd2, x) = self.parse_next_id(nd)?; - if x == I32 { + if x == L::I32 { nd = self.parse_i32(nd2)?; - } else if x == END { + } else if x == L::CLOSE { push_end!(); data = nd2; break; @@ -550,28 +514,28 @@ impl<'a, 'b> ParserState<'a, 'b> { } } } - BOOL => { + L::BOOL => { data = self.parse_bool(d)?; state = Self::next_state(state); } - QUOTED_STRING => { + L::QUOTED => { data = self.parse_quoted_string(d)?; state = Self::next_state(state); } - UNQUOTED_STRING => { + L::UNQUOTED => { data = self.parse_unquoted_string(d)?; state = Self::next_state(state); } - F32 => { + L::F32 => { data = self.parse_f32(d)?; state = Self::next_state(state); } - F64 => { + L::F64 => { data = self.parse_f64(d)?; state = Self::next_state(state); } - OPEN => { + L::OPEN => { if state != ParseState::Key { let ind = self.token_tape.len(); self.token_tape.alloc().init(BinaryToken::Array(parent_ind)); @@ -585,12 +549,12 @@ impl<'a, 'b> ParserState<'a, 'b> { // position eg: `a={b=c {} d=1}`. These occur in every // EU4 save, even in 1.34. match self.parse_next_id(d)? 
{ - (nd, END) => data = nd, + (nd, L::CLOSE) => data = nd, _ => return Err(self.empty_object_err(data)), } } } - END => { + L::CLOSE => { match state { ParseState::KeyValueSeparator => { // `a={b=c 10}` @@ -608,7 +572,7 @@ impl<'a, 'b> ParserState<'a, 'b> { push_end!(); data = d; } - EQUAL => { + L::EQUAL => { data = d; if state == ParseState::KeyValueSeparator { state = ParseState::ObjectValue; @@ -657,17 +621,17 @@ impl<'a, 'b> ParserState<'a, 'b> { return Err(self.equal_key_error(data)); } } - RGB if state == ParseState::ObjectValue => { + L::RGB if state == ParseState::ObjectValue => { data = self.parse_rgb(d)?; state = ParseState::Key; } - I64 => { + L::I64 => { data = self.parse_i64(d)?; state = Self::next_state(state); } x => { data = d; - self.token_tape.alloc().init(BinaryToken::Token(x)); + self.token_tape.alloc().init(BinaryToken::Token(x.0)); state = Self::next_state(state); } } @@ -718,6 +682,14 @@ impl<'a, 'b> ParserState<'a, 'b> { self.token_tape.alloc().init(stashed1); } + #[inline] + fn err_position(&self, err: LexError, data: &[u8]) -> Error { + match err { + LexError::Eof => Error::eof(), + LexError::InvalidRgb => Error::invalid_syntax("invalid rgb", self.offset(data)), + } + } + #[inline(never)] #[cold] fn equal_key_error(&mut self, data: &[u8]) -> Error { @@ -771,15 +743,6 @@ impl<'a, 'b> ParserState<'a, 'b> { offset: self.offset(data), }) } - - #[inline(never)] - #[cold] - fn invalid_syntax>(&self, msg: T, data: &[u8]) -> Error { - Error::new(ErrorKind::InvalidSyntax { - msg: msg.into(), - offset: self.offset(data), - }) - } } /// Houses the tape of tokens that is extracted from binary data @@ -1074,7 +1037,7 @@ mod tests { data.extend_from_slice(b"schools_initiated"); data.extend_from_slice(&[0x01, 0x00, 0x0f, 0x00, 0x0b, 0x00]); data.extend_from_slice(b"1444.11.11\n"); - data.extend_from_slice(&END.to_le_bytes()); + data.extend_from_slice(&LexemeId::CLOSE.0.to_le_bytes()); let tape = parse(&data[..]).unwrap(); assert_eq!( tape.token_tape, diff --git a/src/binary/tokens.rs b/src/binary/tokens.rs deleted file mode 100644 index ebb37c6..0000000 --- a/src/binary/tokens.rs +++ /dev/null @@ -1,13 +0,0 @@ -pub(crate) const END: u16 = 0x0004; -pub(crate) const OPEN: u16 = 0x0003; -pub(crate) const EQUAL: u16 = 0x0001; -pub(crate) const U32: u16 = 0x0014; -pub(crate) const U64: u16 = 0x029c; -pub(crate) const I32: u16 = 0x000c; -pub(crate) const BOOL: u16 = 0x000e; -pub(crate) const QUOTED_STRING: u16 = 0x000f; -pub(crate) const UNQUOTED_STRING: u16 = 0x0017; -pub(crate) const F32: u16 = 0x000d; -pub(crate) const F64: u16 = 0x0167; -pub(crate) const RGB: u16 = 0x0243; -pub(crate) const I64: u16 = 0x0317; diff --git a/src/buffer.rs b/src/buffer.rs new file mode 100644 index 0000000..45aebea --- /dev/null +++ b/src/buffer.rs @@ -0,0 +1,167 @@ +use crate::Scalar; +use std::{io::Read, marker::PhantomData, ops::Range}; + +#[derive(Debug)] +pub struct BufferWindow { + pub buf: Box<[u8]>, + + // start of window into buffer + pub start: *const u8, + + // end of window into buffer + pub end: *const u8, + + // number of consumed bytes from prior reads + pub prior_reads: usize, +} + +pub enum BufferError { + Io(std::io::Error), + BufferFull, +} + +impl BufferWindow { + #[inline] + pub fn from_slice(data: &[u8]) -> Self { + Self { + buf: Box::new([]), + start: data.as_ptr(), + end: data.as_ptr_range().end, + prior_reads: 0, + } + } + + #[inline] + pub fn advance_to(&mut self, ptr: *const u8) { + debug_assert!((self.start..=self.end).contains(&ptr)); + self.start = ptr; + } + + #[inline] 
+ pub fn advance(&mut self, amt: usize) { + let ptr = unsafe { self.start.add(amt) }; + debug_assert!((self.start..=self.end).contains(&ptr)); + self.start = ptr; + } + + #[inline] + pub fn window(&self) -> &[u8] { + unsafe { std::slice::from_raw_parts(self.start, self.window_len()) } + } + + #[inline] + pub fn window_len(&self) -> usize { + unsafe { self.end.offset_from(self.start) as usize } + } + + #[inline] + pub fn position(&self) -> usize { + self.prior_reads + self.consumed_data() + } + + #[inline] + pub fn consumed_data(&self) -> usize { + unsafe { self.start.offset_from(self.buf.as_ptr()) as usize } + } + + #[inline] + pub fn get(&self, range: Range<*const u8>) -> Scalar { + debug_assert!(range.start >= self.buf.as_ptr_range().start); + debug_assert!(range.end <= self.buf.as_ptr_range().end); + let len = unsafe { range.end.offset_from(range.start) as usize }; + let sl = unsafe { std::slice::from_raw_parts(range.start, len) }; + Scalar::new(sl) + } + + /// This seems similar to `BufRead::fill_buf`, but whereas the `BufRead` + /// will only call the underlying read if the buffer is currently empty, + /// this function will copy over the bytes that haven't been consumed to the + /// start. + #[inline] + pub fn fill_buf(&mut self, mut reader: impl Read) -> Result { + // Copy over the unconsumed bytes to the start of the buffer + let carry_over = self.window_len(); + if carry_over != 0 { + if carry_over >= self.buf.len() { + return Err(BufferError::BufferFull); + } + unsafe { self.start.copy_to(self.buf.as_mut_ptr(), carry_over) }; + } + + self.prior_reads += self.consumed_data(); + self.start = self.buf.as_ptr(); + self.end = unsafe { self.buf.as_ptr().add(carry_over) }; + + // Have the reader start filling in bytes after unconsumed bytes + match reader.read(&mut self.buf[carry_over..]) { + Ok(r) => { + self.end = unsafe { self.end.add(r) }; + Ok(r) + } + Err(e) => Err(BufferError::Io(e)), + } + } +} + +#[derive(Debug)] +pub struct BufferWindowBuilder { + buffer: Option>, + buffer_len: usize, +} + +impl Default for BufferWindowBuilder { + fn default() -> Self { + // Default buffer size of 32 KiB, same size that flate2 uses. 
+        let buffer_len = 32 * 1024;
+        Self {
+            buffer: None,
+            buffer_len,
+        }
+    }
+}
+
+impl BufferWindowBuilder {
+    #[inline]
+    pub fn buffer(mut self, val: Box<[u8]>) -> BufferWindowBuilder {
+        self.buffer = Some(val);
+        self
+    }
+
+    #[inline]
+    pub fn buffer_len(mut self, val: usize) -> BufferWindowBuilder {
+        self.buffer_len = val;
+        self
+    }
+
+    #[inline]
+    pub fn build(self) -> BufferWindow {
+        let init_len = self.buffer_len;
+        let buf = self
+            .buffer
+            .unwrap_or_else(|| vec![0; init_len].into_boxed_slice());
+        let start = buf.as_ptr_range().start;
+        let end = buf.as_ptr_range().start;
+        BufferWindow {
+            buf,
+            start,
+            end,
+            prior_reads: 0,
+        }
+    }
+}
+
+/// A no-op read implementation used for TokenReaders
+#[derive(Debug)]
+pub struct SliceReader<'a>(PhantomData<&'a [u8]>);
+
+impl<'a> SliceReader<'a> {
+    pub(crate) fn new(_data: &'a [u8]) -> Self {
+        SliceReader(PhantomData)
+    }
+}
+
+impl<'a> Read for SliceReader<'a> {
+    fn read(&mut self, _buf: &mut [u8]) -> std::io::Result<usize> {
+        Ok(0)
+    }
+}
diff --git a/src/errors.rs b/src/errors.rs
index 167c132..fd4f00b 100644
--- a/src/errors.rs
+++ b/src/errors.rs
@@ -1,4 +1,8 @@
-use crate::ScalarError;
+use crate::{
+    binary::{LexError, LexerError, ReaderError as BinReaderError},
+    text::ReaderError as TextReaderError,
+    ScalarError,
+};
 use std::fmt;
 
 /// An error that can occur when processing data
@@ -16,6 +20,17 @@ impl Error {
         Self::new(ErrorKind::Eof)
     }
 
+    #[cold]
+    pub(crate) fn invalid_syntax<T>(msg: T, position: usize) -> Error
+    where
+        T: Into<String>,
+    {
+        Self::new(ErrorKind::InvalidSyntax {
+            msg: msg.into(),
+            offset: position,
+        })
+    }
+
     /// Return the specific type of error
     pub fn kind(&self) -> &ErrorKind {
         &self.0
@@ -64,6 +79,10 @@ pub enum ErrorKind {
 
     /// An error occurred when performing IO.
     Io(std::io::Error),
+
+    /// The internal buffer does not have enough room to store data for the next
+    /// token
+    BufferFull,
 }
 
 impl ErrorKind {
@@ -103,6 +122,9 @@ impl std::fmt::Display for Error {
             ),
             ErrorKind::Deserialize(ref err) => write!(f, "deserialize error: {}", err),
             ErrorKind::Io(ref err) => write!(f, "io error: {}", err),
+            ErrorKind::BufferFull => {
+                write!(f, "max buffer size exceeded")
+            },
         }
     }
 }
@@ -113,6 +135,42 @@ impl From<ScalarError> for Error {
     }
 }
 
+impl From<LexerError> for Error {
+    fn from(value: LexerError) -> Self {
+        match value.kind() {
+            LexError::Eof => Error::eof(),
+            _ => Error::new(ErrorKind::InvalidSyntax {
+                msg: format!("{}", value.kind()),
+                offset: value.position(),
+            }),
+        }
+    }
+}
+
+impl From<BinReaderError> for Error {
+    fn from(value: BinReaderError) -> Self {
+        let pos = value.position();
+        match value.into_kind() {
+            crate::binary::ReaderErrorKind::Read(x) => Error::new(ErrorKind::Io(x)),
+            crate::binary::ReaderErrorKind::BufferFull => Error::new(ErrorKind::BufferFull),
+            crate::binary::ReaderErrorKind::Lexer(LexError::Eof) => Error::eof(),
+            crate::binary::ReaderErrorKind::Lexer(LexError::InvalidRgb) => {
+                Error::invalid_syntax("invalid rgb", pos)
+            }
+        }
+    }
+}
+
+impl From<TextReaderError> for Error {
+    fn from(value: TextReaderError) -> Self {
+        match value.into_kind() {
+            crate::text::ReaderErrorKind::Read(x) => Error::new(ErrorKind::Io(x)),
+            crate::text::ReaderErrorKind::BufferFull => Error::new(ErrorKind::BufferFull),
+            crate::text::ReaderErrorKind::Eof => Error::eof(),
+        }
+    }
+}
+
 /// A Serde deserialization error.
#[derive(Debug, PartialEq)] pub struct DeserializeError { diff --git a/src/lib.rs b/src/lib.rs index 9e81656..258f3d8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -14,7 +14,7 @@ Converters](https://github.com/ParadoxGameConverters) and ## Features - ✔ Versatile: Handle both plaintext and binary encoded data -- ✔ Fast: Parse data at 1 GB/s +- ✔ Fast: Parse data at over 1 GB/s - ✔ Small: Compile with zero dependencies - ✔ Safe: Extensively fuzzed against potential malicious input - ✔ Ergonomic: Use [serde](https://serde.rs/derive.html)-like macros to have parsing logic automatically implemented @@ -22,7 +22,9 @@ Converters](https://github.com/ParadoxGameConverters) and ## Quick Start -Below is a demonstration on parsing plaintext data using jomini tools. +Below is a demonstration of deserializing plaintext data using serde. +Several additional serde-like attributes are used to reconcile the serde +data model with structure of these files. ```rust # #[cfg(feature = "derive")] { @@ -72,9 +74,9 @@ assert_eq!(actual, expected); # Ok::<(), Box>(()) ``` -## Binary Parsing +## Binary Deserialization -Parsing data encoded in the binary format is done in a similar fashion but with a couple extra steps for the caller to supply: +Deserializing data encoded in the binary format is done in a similar fashion but with a couple extra steps for the caller to supply: - How text should be decoded (typically Windows-1252 or UTF-8) - How rational (floating point) numbers are decoded @@ -86,7 +88,7 @@ Below is an example that defines a sample binary format and uses a hashmap token ```rust # #[cfg(feature = "derive")] { -use jomini::{BinaryDeserializer, Encoding, JominiDeserialize, Windows1252Encoding}; +use jomini::{Encoding, JominiDeserialize, Windows1252Encoding, binary::BinaryFlavor}; use std::{borrow::Cow, collections::HashMap}; #[derive(JominiDeserialize, PartialEq, Debug)] @@ -97,7 +99,7 @@ struct MyStruct { #[derive(Debug, Default)] pub struct BinaryTestFlavor; -impl jomini::binary::BinaryFlavor for BinaryTestFlavor { +impl BinaryFlavor for BinaryTestFlavor { fn visit_f32(&self, data: [u8; 4]) -> f32 { f32::from_le_bytes(data) } @@ -118,8 +120,7 @@ let data = [ 0x82, 0x2d, 0x01, 0x00, 0x0f, 0x00, 0x03, 0x00, 0x45, 0x4e, 0x47 ]; let mut map = HashMap::new(); map.insert(0x2d82, "field1"); -let actual: MyStruct = BinaryDeserializer::builder_flavor(BinaryTestFlavor) - .deserialize_slice(&data[..], &map)?; +let actual: MyStruct = BinaryTestFlavor.deserialize_slice(&data[..], &map)?; assert_eq!(actual, MyStruct { field1: "ENG".to_string() }); # } # Ok::<(), Box>(()) @@ -130,120 +131,14 @@ without any duplication. One can configure the behavior when a token is unknown (ie: fail immediately or try to continue). -### Ondemand Deserialization - -The ondemand deserializer is a one-shot deserialization mode is often faster -and more memory efficient as it does not parse the input into an intermediate -tape, and instead deserializes right from the input. 
- -It is instantiated and used similarly to `BinaryDeserializer` - -```rust -# #[cfg(feature = "derive")] { -use jomini::OndemandBinaryDeserializer; -# use jomini::{Encoding, JominiDeserialize, Windows1252Encoding}; -# use std::{borrow::Cow, collections::HashMap}; -# -# #[derive(JominiDeserialize, PartialEq, Debug)] -# struct MyStruct { -# field1: String, -# } -# -# #[derive(Debug, Default)] -# pub struct BinaryTestFlavor; -# -# impl jomini::binary::BinaryFlavor for BinaryTestFlavor { -# fn visit_f32(&self, data: [u8; 4]) -> f32 { -# f32::from_le_bytes(data) -# } -# -# fn visit_f64(&self, data: [u8; 8]) -> f64 { -# f64::from_le_bytes(data) -# } -# } -# -# impl Encoding for BinaryTestFlavor { -# fn decode<'a>(&self, data: &'a [u8]) -> Cow<'a, str> { -# Windows1252Encoding::decode(data) -# } -# } -# -# let data = [ 0x82, 0x2d, 0x01, 0x00, 0x0f, 0x00, 0x03, 0x00, 0x45, 0x4e, 0x47 ]; -# -# let mut map = HashMap::new(); -# map.insert(0x2d82, "field1"); -// [...snip code from previous example...] - -let actual: MyStruct = OndemandBinaryDeserializer::builder_flavor(BinaryTestFlavor) - .deserialize_slice(&data[..], &map)?; -assert_eq!(actual, MyStruct { field1: "ENG".to_string() }); -# } -# Ok::<(), Box>(()) -``` - -### Direct identifier deserialization with `token` attribute - -There may be some performance loss during binary deserialization as -tokens are resolved to strings via a `TokenResolver` and then matched against the -string representations of a struct's fields. - -We can fix this issue by directly encoding the expected token value into the struct: - -```rust -# #[cfg(feature = "derive")] { -# use jomini::{Encoding, JominiDeserialize, Windows1252Encoding, BinaryDeserializer}; -# use std::{borrow::Cow, collections::HashMap}; -# -# #[derive(Debug, Default)] -# pub struct BinaryTestFlavor; -# -# impl jomini::binary::BinaryFlavor for BinaryTestFlavor { -# fn visit_f32(&self, data: [u8; 4]) -> f32 { -# f32::from_le_bytes(data) -# } -# -# fn visit_f64(&self, data: [u8; 8]) -> f64 { -# f64::from_le_bytes(data) -# } -# } -# -# impl Encoding for BinaryTestFlavor { -# fn decode<'a>(&self, data: &'a [u8]) -> Cow<'a, str> { -# Windows1252Encoding::decode(data) -# } -# } -# -# let data = [ 0x82, 0x2d, 0x01, 0x00, 0x0f, 0x00, 0x03, 0x00, 0x45, 0x4e, 0x47 ]; -# -#[derive(JominiDeserialize, PartialEq, Debug)] -struct MyStruct { - #[jomini(token = 0x2d82)] - field1: String, -} - -// Empty token to string resolver -let map = HashMap::::new(); - -let actual: MyStruct = BinaryDeserializer::builder_flavor(BinaryTestFlavor) - .deserialize_slice(&data[..], &map)?; -assert_eq!(actual, MyStruct { field1: "ENG".to_string() }); -# } -# Ok::<(), Box>(()) -``` - -Couple notes: - -- This does not obviate need for the token to string resolver as tokens may be used as values. -- If the `token` attribute is specified on one field on a struct, it must be specified on all fields of that struct. - ## Caveats -Caller is responsible for: +Before calling any Jomini API, callers are expected to: -- Determining the correct format (text or binary) ahead of time -- Stripping off any header that may be present (eg: `EU4txt` / `EU4bin`) -- Providing the token resolver for the binary format -- Providing the conversion to reconcile how, for example, a date may be encoded as an integer in +- Determine the correct format (text or binary) ahead of time. 
+- Strip off any header that may be present (eg: `EU4txt` / `EU4bin`)
+- Provide the token resolver for the binary format
+- Provide the conversion to reconcile how, for example, a date may be encoded as an integer in
 the binary format, but as a string when in plaintext.
 
 ## The Mid-level API
@@ -264,6 +159,8 @@
 for (key, _op, value) in reader.fields() {
 }
 ```
 
+For an even lower level of parsing, see the respective [binary] and [text] module documentation.
+
 */
 #![cfg_attr(
     feature = "json",
@@ -287,28 +184,6 @@
 assert_eq!(actual, r#"{"foo":"bar"}"#);
 "##
 )]
 /*!
-## One Level Lower
-
-At the lowest layer, one can interact with the raw data directly via `TextTape`
-and `BinaryTape`.
-
-```rust
-use jomini::{TextTape, TextToken, Scalar};
-
-let data = b"foo=bar";
-
-assert_eq!(
-    TextTape::from_slice(&data[..])?.tokens(),
-    &[
-        TextToken::Unquoted(Scalar::new(b"foo")),
-        TextToken::Unquoted(Scalar::new(b"bar")),
-    ]
-);
-# Ok::<(), Box<dyn std::error::Error>>(())
-```
-
-If one will only use `TextTape` and `BinaryTape` then `jomini` can be compiled without default
-features, resulting in a build without dependencies.
 
 ## Write API
 
@@ -349,6 +224,7 @@
 */
 #![warn(missing_docs)]
 pub mod binary;
+pub(crate) mod buffer;
 pub mod common;
 mod copyless;
 mod data;
@@ -362,17 +238,17 @@
 mod scalar;
 pub mod text;
 pub(crate) mod util;
 
+#[doc(inline)]
 pub use self::binary::{BinaryTape, BinaryToken};
+pub use self::buffer::SliceReader;
 pub use self::encoding::*;
 pub use self::errors::*;
 pub use self::scalar::{Scalar, ScalarError};
+#[doc(inline)]
 pub use self::text::{TextTape, TextToken, TextWriter, TextWriterBuilder};
 
 #[cfg(feature = "derive")]
 #[doc(inline)]
-pub use self::{
-    binary::de::{BinaryDeserializer, OndemandBinaryDeserializer},
-    text::de::TextDeserializer,
-};
+pub use self::{binary::de::BinaryDeserializer, text::de::TextDeserializer};
 
 #[cfg(feature = "derive")]
 pub use jomini_derive::*;
diff --git a/src/text/de.rs b/src/text/de.rs
index 1365495..6ffdea3 100644
--- a/src/text/de.rs
+++ b/src/text/de.rs
@@ -1,13 +1,14 @@
-use super::reader::ValuesIter;
+use super::{dom::ValuesIter, reader::Token, TokenReader};
 use crate::{
     text::{ArrayReader, FieldsIter, ObjectReader, Operator, Reader, ScalarReader, ValueReader},
     DeserializeError, DeserializeErrorKind, Encoding, Error, TextTape, TextToken, Utf8Encoding,
     Windows1252Encoding,
 };
-use serde::de::{self, Deserialize, DeserializeSeed, Visitor};
+use serde::de::{self, Deserialize, DeserializeOwned, DeserializeSeed, Visitor};
 use std::{
     borrow::Cow,
     fmt::{self, Debug},
+    io::Read,
 };
 
 /// Represents the field value that contains an operator
@@ -122,12 +123,600 @@
     TextDeserializer::from_windows1252_slice(data)?.deserialize()
 }
 
+/// (**Experimental**) Convenience method for deserializing streaming windows1252 data into a Rust value
+///
+/// Considered experimental as it uses a [TokenReader] under the hood, which
+/// uses a different parsing routine geared toward save files.
+pub fn from_windows1252_reader<T, R>(reader: R) -> Result<T, Error>
+where
+    T: DeserializeOwned,
+    R: Read,
+{
+    TextDeserializer::from_windows1252_reader(TokenReader::new(reader)).deserialize()
+}
+
+/// Convenience method for deserializing streaming utf8 data into a Rust value
+pub fn from_utf8_reader<T, R>(reader: R) -> Result<T, Error>
+where
+    T: DeserializeOwned,
+    R: Read,
+{
+    TextDeserializer::from_utf8_reader(TokenReader::new(reader)).deserialize()
+}
+
 /// Convenience method for parsing the given text data and deserializing as utf8 encoded.
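+///
+/// A minimal illustrative example (any type implementing `Deserialize` works):
+///
+/// ```
+/// use std::collections::HashMap;
+///
+/// let data = b"mana=10";
+/// let actual: HashMap<String, i32> = jomini::text::de::from_utf8_slice(data)?;
+/// assert_eq!(actual.get("mana"), Some(&10));
+/// # Ok::<(), jomini::Error>(())
+/// ```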
pub fn from_utf8_slice<'a, T>(data: &'a [u8]) -> Result where - T: Deserialize<'a>, + T: Deserialize<'a>, +{ + TextDeserializer::from_utf8_slice(data)?.deserialize() +} + +/// A serde deserializer over streaming data +pub struct TextReaderDeserializer { + reader: TokenReader, + encoding: E, +} + +impl TextReaderDeserializer { + /// Deserialize into provided type + pub fn deserialize(&mut self) -> Result + where + T: DeserializeOwned, + { + T::deserialize(self) + } +} + +impl<'de, R: Read, E: Encoding> de::Deserializer<'de> for &'_ mut TextReaderDeserializer { + type Error = Error; + + fn deserialize_any(self, _visitor: V) -> Result + where + V: Visitor<'de>, + { + Err(Error::from(DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from( + "root deserializer can only work with key value pairs", + )), + })) + } + + fn deserialize_map(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + visitor.visit_map(TextReaderMap::new(self, true)) + } + + fn deserialize_struct( + self, + _name: &'static str, + _fields: &'static [&'static str], + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + self.deserialize_map(visitor) + } + + serde::forward_to_deserialize_any! { + bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string + bytes byte_buf option unit unit_struct newtype_struct seq tuple + tuple_struct enum ignored_any identifier + } +} + +struct TextReaderMap<'a, R, E> { + de: &'a mut TextReaderDeserializer, + root: bool, +} + +impl<'a, R, E> TextReaderMap<'a, R, E> { + fn new(de: &'a mut TextReaderDeserializer, root: bool) -> Self { + TextReaderMap { de, root } + } +} + +impl<'de, 'a, R: Read, E: Encoding> de::MapAccess<'de> for TextReaderMap<'a, R, E> { + type Error = Error; + + #[inline] + fn next_key_seed(&mut self, seed: K) -> Result, Self::Error> + where + K: DeserializeSeed<'de>, + { + let de = unsafe { &mut *(self.de as *mut _) }; + loop { + match self.de.reader.next() { + Ok(Some(Token::Close)) => return Ok(None), + Ok(Some(Token::Open)) => { + let _ = self.de.reader.read()?; + } + Ok(Some(token)) => { + return seed + .deserialize(TextReaderTokenDeserializer::new(de, token)) + .map(Some) + } + Ok(None) if self.root => return Ok(None), + Ok(None) => return Err(self.de.reader.eof_error().into()), + Err(e) => return Err(e.into()), + } + } + } + + #[inline] + fn next_value_seed(&mut self, seed: V) -> Result + where + V: DeserializeSeed<'de>, + { + let de = unsafe { &mut *(self.de as *mut _) }; + let token = self.de.reader.read()?; + let deser = if let Token::Operator(op) = token { + let new_token = self.de.reader.read()?; + let mut deser = TextReaderTokenDeserializer::new(de, new_token); + deser.op = op; + deser + } else { + TextReaderTokenDeserializer::new(de, token) + }; + + seed.deserialize(deser) + } +} + +struct TextReaderTokenDeserializer<'a, R, E> { + de: &'a mut TextReaderDeserializer, + token: Token<'a>, + op: Operator, +} + +impl<'a, R, E> TextReaderTokenDeserializer<'a, R, E> { + fn new(de: &'a mut TextReaderDeserializer, token: Token<'a>) -> Self { + Self { + de, + token, + op: Operator::Equal, + } + } +} + +impl<'a, 'de: 'a, R: Read, E: Encoding> de::Deserializer<'de> + for TextReaderTokenDeserializer<'a, R, E> +{ + type Error = Error; + + fn deserialize_any(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + match self.token { + Token::Open => visitor.visit_seq(TextReaderSeq::new(self.de)), + Token::Close => Err(Error::invalid_syntax( + "did not expect end", + self.de.reader.position(), + )), + Token::Operator(x) => 
visitor.visit_str(x.symbol()), + Token::Unquoted(s) | Token::Quoted(s) => match self.de.encoding.decode(s.as_bytes()) { + Cow::Borrowed(x) => visitor.visit_str(x), + Cow::Owned(x) => visitor.visit_string(x), + }, + } + } + + fn deserialize_bool(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + match self.token.as_scalar().and_then(|x| x.to_bool().ok()) { + Some(x) => visitor.visit_bool(x), + None => self.deserialize_any(visitor), + } + } + + fn deserialize_i8(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_i64(visitor) + } + + fn deserialize_i16(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_i64(visitor) + } + + fn deserialize_i32(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_i64(visitor) + } + + fn deserialize_i64(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + match self.token.as_scalar().and_then(|x| x.to_i64().ok()) { + Some(x) => visitor.visit_i64(x), + None => self.deserialize_any(visitor), + } + } + + fn deserialize_u8(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_u64(visitor) + } + + fn deserialize_u16(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_u64(visitor) + } + + fn deserialize_u32(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_u64(visitor) + } + + fn deserialize_u64(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + match self.token.as_scalar().and_then(|x| x.to_u64().ok()) { + Some(x) => visitor.visit_u64(x), + None => self.deserialize_any(visitor), + } + } + + fn deserialize_f32(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_f64(visitor) + } + + fn deserialize_f64(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + match self.token.as_scalar().and_then(|x| x.to_f64().ok()) { + Some(x) => visitor.visit_f64(x), + None => self.deserialize_any(visitor), + } + } + + fn deserialize_char(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_any(visitor) + } + + fn deserialize_str(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if let Some(s) = self.token.as_scalar() { + match self.de.encoding.decode(s.as_bytes()) { + Cow::Borrowed(x) => visitor.visit_str(x), + Cow::Owned(x) => visitor.visit_string(x), + } + } else { + self.deserialize_any(visitor) + } + } + + fn deserialize_string(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_str(visitor) + } + + fn deserialize_bytes(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + match self.token.as_scalar() { + Some(s) => visitor.visit_bytes(s.as_bytes()), + None => self.deserialize_any(visitor), + } + } + + fn deserialize_byte_buf(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_bytes(visitor) + } + + fn deserialize_option(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + visitor.visit_some(self) + } + + fn deserialize_unit(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_ignored_any(visitor) + } + + fn deserialize_unit_struct( + self, + _name: &'static str, + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + self.deserialize_ignored_any(visitor) + } + + fn deserialize_newtype_struct( + self, + _name: &'static str, + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + visitor.visit_newtype_struct(self) + } + + fn deserialize_seq(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + let mut seq = 
TextReaderSeq::new(self.de); + let result = visitor.visit_seq(&mut seq)?; + if !seq.hit_end { + // For when we are deserializing an array that doesn't read + // the closing token + if !matches!(self.de.reader.read()?, Token::Close) { + return Err(Error::invalid_syntax( + "Expected sequence to be terminated with an end token", + self.de.reader.position(), + )); + } + } + Ok(result) + } + + fn deserialize_tuple(self, _len: usize, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_seq(visitor) + } + + fn deserialize_tuple_struct( + self, + _name: &'static str, + _len: usize, + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + self.deserialize_seq(visitor) + } + + fn deserialize_map(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + if matches!(self.token, Token::Open) { + visitor.visit_map(TextReaderMap::new(self.de, false)) + } else { + self.deserialize_any(visitor) + } + } + + fn deserialize_struct( + self, + name: &'static str, + _fields: &'static [&'static str], + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + if name == "_internal_jomini_property" { + visitor.visit_map(PropertyReaderMap { + de: self.de, + token: self.token, + op: self.op, + state: 0, + }) + } else { + self.deserialize_map(visitor) + } + } + + fn deserialize_enum( + self, + _name: &'static str, + _variants: &'static [&'static str], + visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + visitor.visit_enum(TextReaderEnum::new(self.de, self.token)) + } + + fn deserialize_identifier(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + self.deserialize_str(visitor) + } + + fn deserialize_ignored_any(self, visitor: V) -> Result + where + V: Visitor<'de>, + { + match self.token { + Token::Open => self.de.reader.skip_container()?, + Token::Unquoted(_) => self.de.reader.skip_unquoted_value()?, + _ => {} + } + visitor.visit_unit() + } +} + +struct TextReaderSeq<'a, R, E> { + de: &'a mut TextReaderDeserializer, + hit_end: bool, +} + +impl<'a, R, E> TextReaderSeq<'a, R, E> { + fn new(de: &'a mut TextReaderDeserializer) -> Self { + TextReaderSeq { de, hit_end: false } + } +} + +impl<'de, 'a, R, E> de::SeqAccess<'de> for TextReaderSeq<'a, R, E> +where + R: Read, + E: Encoding, { - TextDeserializer::from_utf8_slice(data)?.deserialize() + type Error = Error; + + fn next_element_seed(&mut self, seed: T) -> Result, Self::Error> + where + T: DeserializeSeed<'de>, + { + let de = unsafe { &mut *(self.de as *mut _) }; + match self.de.reader.read()? 
{ + Token::Close => { + self.hit_end = true; + Ok(None) + } + token => seed + .deserialize(TextReaderTokenDeserializer::new(de, token)) + .map(Some), + } + } +} + +struct TextReaderEnum<'a, R, E> { + de: &'a mut TextReaderDeserializer, + token: Token<'a>, +} + +impl<'a, R, E> TextReaderEnum<'a, R, E> { + fn new(de: &'a mut TextReaderDeserializer, token: Token<'a>) -> Self { + TextReaderEnum { de, token } + } +} + +impl<'de, 'a, R: Read, E: Encoding> de::EnumAccess<'de> for TextReaderEnum<'a, R, E> { + type Error = Error; + type Variant = Self; + + fn variant_seed(self, seed: V) -> Result<(V::Value, Self), Self::Error> + where + V: de::DeserializeSeed<'de>, + { + let variant = seed.deserialize(TextReaderTokenDeserializer::new(self.de, self.token))?; + Ok((variant, self)) + } +} + +impl<'de, 'a, R: Read, E: Encoding> de::VariantAccess<'de> for TextReaderEnum<'a, R, E> { + type Error = Error; + + fn unit_variant(self) -> Result<(), Self::Error> { + Ok(()) + } + + fn newtype_variant_seed(self, _seed: T) -> Result + where + T: DeserializeSeed<'de>, + { + Err(Error::from(DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from( + "unsupported enum deserialization. Please file issue", + )), + })) + } + + fn tuple_variant(self, _len: usize, _visitor: V) -> Result + where + V: Visitor<'de>, + { + Err(Error::from(DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from( + "unsupported enum deserialization. Please file issue", + )), + })) + } + + fn struct_variant( + self, + _fields: &'static [&'static str], + _visitor: V, + ) -> Result + where + V: Visitor<'de>, + { + Err(Error::from(DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from( + "unsupported enum deserialization. Please file issue", + )), + })) + } +} + +struct PropertyReaderMap<'a, R, E> { + de: &'a mut TextReaderDeserializer, + op: Operator, + token: Token<'a>, + state: usize, +} + +impl<'a, 'de, R, E> de::MapAccess<'de> for PropertyReaderMap<'a, R, E> +where + E: Encoding, + R: Read, +{ + type Error = Error; + + fn next_key_seed(&mut self, seed: K) -> Result, Self::Error> + where + K: DeserializeSeed<'de>, + { + match self.state { + 0 => seed.deserialize(StaticDeserializer("operator")).map(Some), + 1 => seed.deserialize(StaticDeserializer("value")).map(Some), + _ => Ok(None), + } + } + + fn next_value_seed(&mut self, seed: V) -> Result + where + V: DeserializeSeed<'de>, + { + self.state += 1; + if self.state == 1 { + seed.deserialize(OperatorDeserializer(self.op)) + } else { + seed.deserialize(TextReaderTokenDeserializer::new(self.de, self.token)) + } + } } /// A structure to deserialize text data into Rust values. @@ -192,6 +781,37 @@ enum TextDeserializerKind<'a, 'b, E> { Reader { reader: &'b ObjectReader<'a, 'b, E> }, } +impl TextDeserializer<'_, '_, Windows1252Encoding> { + /// (**Experimental**) Create a Windows1252 text deserializer over a reader + /// + /// Considered experimental as it uses a [TokenReader] under the hood, which + /// uses a different parsing routine geared toward save files. 
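+    ///
+    /// A sketch of intended usage (`MyModel` stands in for any
+    /// `DeserializeOwned` type):
+    ///
+    /// ```ignore
+    /// let file = std::fs::File::open("gamestate")?;
+    /// let reader = jomini::text::TokenReader::new(file);
+    /// let model: MyModel = jomini::TextDeserializer::from_windows1252_reader(reader)
+    ///     .deserialize()?;
+    /// ```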
+ pub fn from_windows1252_reader( + reader: TokenReader, + ) -> TextReaderDeserializer + where + R: Read, + { + TextReaderDeserializer { + reader, + encoding: Windows1252Encoding, + } + } +} + +impl TextDeserializer<'_, '_, Utf8Encoding> { + /// Create a UTF8 text deserializer over a reader + pub fn from_utf8_reader(reader: TokenReader) -> TextReaderDeserializer + where + R: Read, + { + TextReaderDeserializer { + reader, + encoding: Utf8Encoding, + } + } +} + impl<'a, 'b> TextDeserializer<'a, 'b, Windows1252Encoding> { /// Convenience method for parsing the given text data and deserializing as windows1252 encoded. pub fn from_windows1252_slice( @@ -1074,12 +1694,12 @@ impl<'de> de::Deserializer<'de> for OperatorDeserializer { #[cfg(test)] mod tests { - use crate::common::{Date, DateHour, UniformDate}; - use super::*; + use crate::common::{Date, DateHour, UniformDate}; use jomini_derive::JominiDeserialize; + use rstest::rstest; use serde::{ - de::{self, Deserializer}, + de::{self, DeserializeOwned, Deserializer}, Deserialize, }; use std::fmt; @@ -1092,6 +1712,24 @@ mod tests { Ok(super::from_windows1252_slice(data)?) } + fn from_owned(data: &[u8]) -> T + where + T: DeserializeOwned + PartialEq + Debug, + { + let tape = TextTape::from_slice(data).unwrap(); + let x1: T = TextDeserializer::from_windows1252_tape(&tape) + .deserialize() + .unwrap(); + let reader = TokenReader::new(data); + let mut des = TextReaderDeserializer { + reader, + encoding: Windows1252Encoding, + }; + let x2 = T::deserialize(&mut des).unwrap(); + assert_eq!(x1, x2); + x1 + } + #[test] fn test_single_field() { let data = b"field1=ENG"; @@ -1101,7 +1739,7 @@ mod tests { field1: String, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1150,7 +1788,7 @@ mod tests { name: String, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1168,7 +1806,7 @@ mod tests { field1: bool, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!(actual, MyStruct { field1: false }); } @@ -1181,7 +1819,7 @@ mod tests { field1: bool, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!(actual, MyStruct { field1: true }); } @@ -1194,7 +1832,7 @@ mod tests { field1: u64, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!(actual, MyStruct { field1: 1000 }); } @@ -1207,7 +1845,7 @@ mod tests { field1: u32, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!(actual, MyStruct { field1: 1000 }); } @@ -1220,7 +1858,7 @@ mod tests { field1: u8, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!(actual, MyStruct { field1: 100 }); } @@ -1233,7 +1871,7 @@ mod tests { field1: u16, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!(actual, MyStruct { field1: 1000 }); } @@ -1246,7 +1884,7 @@ mod tests { field1: i8, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!(actual, MyStruct { field1: -100 }); } @@ -1259,7 +1897,7 @@ mod tests { field1: i16, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: 
MyStruct = from_owned(&data[..]); assert_eq!(actual, MyStruct { field1: -1000 }); } @@ -1272,7 +1910,7 @@ mod tests { field1: i32, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!(actual, MyStruct { field1: -1000 }); } @@ -1285,7 +1923,7 @@ mod tests { field1: i64, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!(actual, MyStruct { field1: -1000 }); } @@ -1298,7 +1936,7 @@ mod tests { field1: f32, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!(actual, MyStruct { field1: -100.535 }); } @@ -1311,7 +1949,7 @@ mod tests { field1: f64, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!(actual, MyStruct { field1: -100.535 }); } @@ -1325,7 +1963,7 @@ mod tests { field2: bool, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1343,7 +1981,7 @@ mod tests { dlc_enabled: Vec, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1376,7 +2014,7 @@ mod tests { name: Option, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1400,7 +2038,7 @@ mod tests { discovered_by: Vec, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1423,7 +2061,7 @@ mod tests { id: u32, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1441,7 +2079,25 @@ mod tests { c: String, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); + assert_eq!( + actual, + MyStruct { + c: String::from("d"), + } + ); + } + + #[test] + fn test_skip_unwanted2() { + let data = b"a={ \"hello\" \"goodbye\" } \r\nc = d\r\ne = f"; + + #[derive(Deserialize, PartialEq, Debug)] + struct MyStruct { + c: String, + } + + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1462,7 +2118,7 @@ mod tests { e: String, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1485,7 +2141,7 @@ mod tests { e: String, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1511,7 +2167,7 @@ mod tests { c: Vec, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1529,7 +2185,7 @@ mod tests { field1: Option, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1542,7 +2198,7 @@ mod tests { fn test_deserialize_hashmap() { let data = b"-1=a\r\n-2=b"; - let actual: HashMap = from_slice(&data[..]).unwrap(); + let actual: HashMap = from_owned(&data[..]); let mut expected = HashMap::new(); expected.insert(-1, String::from("a")); expected.insert(-2, String::from("b")); @@ -1563,7 +2219,7 @@ mod tests { name: String, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); let mut expected 
= HashMap::new(); expected.insert( -1, @@ -1599,7 +2255,7 @@ mod tests { bar: String, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1628,7 +2284,7 @@ mod tests { bar: String, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1655,7 +2311,7 @@ mod tests { name: String, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1671,17 +2327,22 @@ mod tests { ); } - #[test] - fn test_deserialize_ignore_operator() { - let data = b"val > 3 a = b"; - + #[rstest] + #[case(b"val < 3 a = b")] + #[case(b"val <= 3 a = b")] + #[case(b"val > 3 a = b")] + #[case(b"val >= 3 a = b")] + #[case(b"val == 3 a = b")] + #[case(b"val != 3 a = b")] + #[case(b"val ?= 3 a = b")] + fn test_deserialize_ignore_operator(#[case] data: &[u8]) { #[derive(Deserialize, PartialEq, Debug)] struct MyStruct { val: i32, a: String, } - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1719,7 +2380,7 @@ mod tests { fn test_deserialize_enum_scalar() { let data = b"kind = infantry"; - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1927,34 +2588,17 @@ mod tests { }) } + #[rstest] + #[case(b"active_idea_groups = { a = 10 }", vec![(String::from("a"), 10)])] + #[case(b"active_idea_groups = { }", vec![])] + #[case(b"active_idea_groups = { ]=0 defensive_ideas=2 }", vec![(String::from("]"), 0), (String::from("defensive_ideas"), 2)])] #[test] - fn test_deserialize_vec_pair() { - let data = b"active_idea_groups = { a = 10 }"; - - let actual: MyStruct = from_slice(&data[..]).unwrap(); - assert_eq!( - actual, - MyStruct { - active_idea_groups: vec![(String::from("a"), 10)] - } - ); - - #[derive(Deserialize, Debug, PartialEq)] - struct MyStruct { - #[serde(default, deserialize_with = "deserialize_vec_pair")] - active_idea_groups: Vec<(String, u8)>, - } - } - - #[test] - fn test_deserialize_vec_pair_empty() { - let data = b"active_idea_groups = {}"; - - let actual: MyStruct = from_slice(&data[..]).unwrap(); + fn test_deserialize_vec_pair(#[case] input: &[u8], #[case] expected: Vec<(String, u8)>) { + let actual: MyStruct = from_owned(input); assert_eq!( actual, MyStruct { - active_idea_groups: Vec::new() + active_idea_groups: expected } ); @@ -1969,7 +2613,7 @@ mod tests { fn test_deserialize_date_string() { let data = b"date=\"1444.11.11\""; - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -1987,7 +2631,7 @@ mod tests { fn test_deserialize_datehour_string() { let data = b"date=\"1936.1.1.24\""; - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -2005,7 +2649,7 @@ mod tests { fn test_deserialize_uniform_date() { let data = b"date=\"2200.2.30\""; - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -2023,7 +2667,7 @@ mod tests { fn test_deserialize_positive_num() { let data = b"pop_happiness = +0.10"; - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!(actual, MyStruct { 
pop_happiness: 0.1 }); #[derive(Deserialize, Debug, PartialEq)] @@ -2036,7 +2680,7 @@ mod tests { fn test_deserialize_operator() { let data = b"num_cities < 0.10"; - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -2054,7 +2698,7 @@ mod tests { fn test_deserialize_operator2() { let data = b"modifier = { factor = 2 num_communications > 2 }"; - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -2124,7 +2768,7 @@ mod tests { } let data = br#"field1=1 field2=invalid"#; - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { @@ -2210,7 +2854,7 @@ mod tests { ), ]); - let actual: MyStruct = from_slice(&data[..]).unwrap(); + let actual: MyStruct = from_owned(&data[..]); assert_eq!( actual, MyStruct { diff --git a/src/text/dom.rs b/src/text/dom.rs new file mode 100644 index 0000000..96fa318 --- /dev/null +++ b/src/text/dom.rs @@ -0,0 +1,1502 @@ +use super::fnv::FnvBuildHasher; +use crate::{ + text::Operator, DeserializeError, DeserializeErrorKind, Encoding, Scalar, TextTape, TextToken, +}; +use std::{ + borrow::Cow, + collections::{hash_map::Entry, HashMap}, +}; + +pub type KeyValue<'data, 'tokens, E> = ( + ScalarReader<'data, E>, + Option, + ValueReader<'data, 'tokens, E>, +); + +pub type KeyValues<'data, 'tokens, E> = (ScalarReader<'data, E>, GroupEntry<'data, 'tokens, E>); + +/// Calculate what index the next value is. This assumes that a header + value +/// are two separate values +#[inline] +fn next_idx_header(tokens: &[TextToken], idx: usize) -> usize { + match tokens[idx] { + TextToken::Array { end, .. } | TextToken::Object { end, .. } => end + 1, + TextToken::Operator(_) | TextToken::MixedContainer => idx + 2, + _ => idx + 1, + } +} + +/// Calculate what index the next value is. This assumes that a header + value +/// is one value +#[inline] +fn next_idx(tokens: &[TextToken], idx: usize) -> usize { + match tokens[idx] { + TextToken::Array { end, .. } | TextToken::Object { end, .. } => end + 1, + TextToken::Operator(_) => next_idx(tokens, idx + 1), + TextToken::Header(_) => next_idx_header(tokens, idx + 1), + _ => idx + 1, + } +} + +#[inline] +fn next_idx_values(tokens: &[TextToken], idx: usize) -> usize { + match tokens[idx] { + TextToken::Array { end, .. } | TextToken::Object { end, .. 
} => end + 1, + _ => idx + 1, + } +} + +#[inline] +fn fields_len(tokens: &[TextToken], start_ind: usize, end_ind: usize) -> usize { + let mut ind = start_ind; + let mut count = 0; + while ind < end_ind { + let key_ind = ind; + if tokens[key_ind] == TextToken::MixedContainer { + return count; + } + + let value_ind = match tokens[key_ind + 1] { + TextToken::Operator(_) => key_ind + 2, + _ => key_ind + 1, + }; + ind = next_idx(tokens, value_ind); + count += 1; + } + + count +} + +#[inline] +pub fn values_len(tokens: &[TextToken], start_ind: usize, end_ind: usize) -> usize { + let mut count = 0; + let mut ind = start_ind; + while ind < end_ind { + ind = next_idx_values(tokens, ind); + count += 1; + } + + count +} + +type OpValue<'data, 'tokens, E> = (Option, ValueReader<'data, 'tokens, E>); + +/// Iterator over values grouped by duplicate keys +/// +/// See [FieldGroupsIter](crate::text::FieldGroupsIter) for a worked example +pub struct GroupEntryIter<'data, 'tokens, 'parent, E> { + index: usize, + parent: &'parent GroupEntry<'data, 'tokens, E>, +} + +impl<'data, 'tokens, 'parent, E> Iterator for GroupEntryIter<'data, 'tokens, 'parent, E> +where + E: Clone, +{ + type Item = (Option, ValueReader<'data, 'tokens, E>); + + fn next(&mut self) -> Option { + match &self.parent { + GroupEntry::One((op, val)) => { + if self.index == 0 { + self.index += 1; + Some((*op, (*val).clone())) + } else { + None + } + } + GroupEntry::Multiple(entries) => { + let result = entries.get(self.index); + self.index += 1; + result.map(|(op, val)| (*op, (*val).clone())) + } + } + } +} + +/// Represents a group of values for duplicate keys +/// +/// May contain one or many values +/// +/// ``` +/// use jomini::TextTape; +/// +/// # fn main() -> Result<(), Box> { +/// let tape = TextTape::from_slice(b"name=a core=b core=c")?; +/// let reader = tape.windows1252_reader(); +/// let mut fields = reader.field_groups(); +/// let first_group = fields.next(); +/// let first_key = first_group.as_ref().map(|(key, _)| key.read_str()); +/// assert_eq!(first_key.as_deref(), Some("name")); +/// let first_values_len = first_group.as_ref().map(|(_, group)| group.len()); +/// assert_eq!(first_values_len, Some(1)); +/// let first_values = first_group.map(|(_, group)| { +/// group.values() +/// .filter_map(|(_op, val)| val.read_string().ok()) +/// .collect() +/// }); +/// assert_eq!(first_values, Some(vec![String::from("a")])); +/// +/// let second_group = fields.next(); +/// let second_key = second_group.as_ref().map(|(key, _)| key.read_str()); +/// assert_eq!(second_key.as_deref(), Some("core")); +/// let second_values = second_group.as_ref().map(|(_, group)| group.len()); +/// assert_eq!(second_values, Some(2)); +/// let second_values = second_group.map(|(_, group)| { +/// group.values() +/// .filter_map(|(_op, val)| val.read_string().ok()) +/// .collect() +/// }); +/// assert_eq!(second_values, Some(vec![String::from("b"), String::from("c")])); +/// # Ok(()) +/// # } +/// ``` +pub enum GroupEntry<'data, 'tokens, E> { + /// Represents that the group is composed of only one value + /// + /// Most fields should only occur once, so this variant is optimized to + /// not require a memory allocation (unlike the `Multiple` variant). 
+ One(OpValue<'data, 'tokens, E>), + + /// Represents that the group is composed of several values + Multiple(Vec>), +} + +impl<'data, 'tokens, E> GroupEntry<'data, 'tokens, E> { + /// Returns an iterator that includes all the values + pub fn values<'parent>(&'parent self) -> GroupEntryIter<'data, 'tokens, 'parent, E> { + GroupEntryIter { + index: 0, + parent: self, + } + } + + /// A group can never be empty so this returns false + pub fn is_empty(&self) -> bool { + false + } + + /// Returns the number of values in the group + pub fn len(&self) -> usize { + match &self { + GroupEntry::One(_) => 1, + GroupEntry::Multiple(x) => x.len(), + } + } +} + +/// All possible text reader variants +#[derive(Debug, Clone)] +pub enum Reader<'data, 'tokens, E> { + /// object reader + Object(ObjectReader<'data, 'tokens, E>), + + /// array reader + Array(ArrayReader<'data, 'tokens, E>), + + /// scalar reader + Scalar(ScalarReader<'data, E>), + + /// value reader + Value(ValueReader<'data, 'tokens, E>), +} + +impl<'data, 'tokens, E> Reader<'data, 'tokens, E> +where + E: Encoding + Clone, +{ + /// Interpret value as a string + #[inline] + pub fn read_str(&self) -> Result, DeserializeError> { + match &self { + Reader::Scalar(x) => Ok(x.read_str()), + Reader::Value(x) => x.read_str(), + _ => Err(DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from("not a scalar")), + }), + } + } + + /// Interpret value as a string + #[inline] + pub fn read_string(&self) -> Result { + match &self { + Reader::Scalar(x) => Ok(x.read_string()), + Reader::Value(x) => x.read_string(), + _ => Err(DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from("not a scalar")), + }), + } + } + + /// Interpret value as a scalar + #[inline] + pub fn read_scalar(&self) -> Result, DeserializeError> { + match &self { + Reader::Scalar(x) => Ok(x.read_scalar()), + Reader::Value(x) => x.read_scalar(), + _ => Err(DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from("not a scalar")), + }), + } + } +} + +/// Iterator over fields of an object grouped by key +/// +/// Since objects can have duplicated keys across fields, this iterator +/// consolidates them such that all values with the same key are grouped +/// together in the order that they appear in the object. Key order is +/// also equivalent, except that already seen keys will be skipped, as +/// those values have already been seen in an earlier group. +/// +/// The process of grouping values together is more expensive than simply +/// iterating the keys in order, so when possible prefer +/// [`ObjectReader::fields()`](crate::text::ObjectReader::fields) over +/// [`ObjectReader::field_groups()`](crate::text::ObjectReader::field_groups). 
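+///
+/// (The extra cost comes from an up-front pass that tallies every key
+/// before iteration begins.)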
+/// +/// These groups can be easily iterated: +/// +/// ``` +/// use jomini::TextTape; +/// +/// # fn main() -> Result<(), Box> { +/// let tape = TextTape::from_slice(b"name=a core=b core=c")?; +/// let reader = tape.windows1252_reader(); +/// for (key, group) in reader.field_groups() { +/// match key.read_str().as_ref() { +/// "name" => assert_eq!(group.len(), 1), +/// "core" => assert_eq!(group.len(), 2), +/// x => panic!("unexpected key: {}", x), +/// } +/// } +/// # Ok(()) +/// # } +/// ``` +/// +/// And picked apart: +/// +/// ``` +/// use jomini::TextTape; +/// +/// # fn main() -> Result<(), Box> { +/// let tape = TextTape::from_slice(b"name=a core=b core=c")?; +/// let reader = tape.windows1252_reader(); +/// let mut fields = reader.field_groups(); +/// let first_group = fields.next(); +/// let first_key = first_group.as_ref().map(|(key, _)| key.read_str()); +/// assert_eq!(first_key.as_deref(), Some("name")); +/// let first_values_len = first_group.as_ref().map(|(_, group)| group.len()); +/// assert_eq!(first_values_len, Some(1)); +/// let first_values = first_group.map(|(_, group)| { +/// group.values() +/// .filter_map(|(_op, val)| val.read_string().ok()) +/// .collect() +/// }); +/// assert_eq!(first_values, Some(vec![String::from("a")])); +/// +/// let second_group = fields.next(); +/// let second_key = second_group.as_ref().map(|(key, _)| key.read_str()); +/// assert_eq!(second_key.as_deref(), Some("core")); +/// let second_values = second_group.as_ref().map(|(_, group)| group.len()); +/// assert_eq!(second_values, Some(2)); +/// let second_values = second_group.map(|(_, group)| { +/// group.values() +/// .filter_map(|(_op, val)| val.read_string().ok()) +/// .collect() +/// }); +/// assert_eq!(second_values, Some(vec![String::from("b"), String::from("c")])); +/// # Ok(()) +/// # } +/// ``` +pub struct FieldGroupsIter<'data, 'tokens, E> { + key_indices: HashMap<&'data [u8], Vec>, FnvBuildHasher>, + fields: FieldsIter<'data, 'tokens, E>, +} + +impl<'data, 'tokens, E> FieldGroupsIter<'data, 'tokens, E> +where + E: Encoding + Clone, +{ + fn new(reader: &ObjectReader<'data, 'tokens, E>) -> Self { + // Using the fnv hasher improved throughput of the eu4 json benchmark + // by over 15%. 
+ let mut key_indices = + HashMap::with_capacity_and_hasher(reader.fields_len(), FnvBuildHasher::default()); + for (key, op, val) in reader.fields() { + let entry = key_indices.entry(key.read_scalar().as_bytes()); + + match entry { + Entry::Vacant(x) => { + x.insert(Vec::with_capacity(0)); + } + Entry::Occupied(mut x) => { + x.get_mut().push((op, val)); + } + } + } + + let fields = reader.fields(); + + FieldGroupsIter { + key_indices, + fields, + } + } + + /// See [the other `remainder` documentation](crate::text::FieldsIter::remainder) + pub fn remainder(&self) -> ArrayReader<'data, 'tokens, E> { + self.fields.remainder() + } +} + +impl<'data, 'tokens, E> Iterator for FieldGroupsIter<'data, 'tokens, E> +where + E: Encoding + Clone, +{ + type Item = KeyValues<'data, 'tokens, E>; + + fn next(&mut self) -> Option { + loop { + let (key, op, value) = self.fields.next()?; + + if let Some((_key, mut entries)) = + self.key_indices.remove_entry(key.read_scalar().as_bytes()) + { + if entries.is_empty() { + return Some((key, GroupEntry::One((op, value)))); + } else { + entries.insert(0, (op, value)); + return Some((key, GroupEntry::Multiple(entries))); + } + } + } + } + + fn size_hint(&self) -> (usize, Option) { + (self.key_indices.len(), None) + } +} + +/// Iterator over fields of an object in the order that they appear +/// +/// Since objects can have duplicated keys across fields, this iterator +/// may yield items that have duplicate keys. +/// +/// Fields can be easily iterated: +/// +/// ``` +/// use jomini::TextTape; +/// +/// # fn main() -> Result<(), Box> { +/// let tape = TextTape::from_slice(b"name=a core=b core=c")?; +/// let reader = tape.windows1252_reader(); +/// let (names, cores) = reader +/// .fields() +/// .fold((0, 0), |(names, cores), (key, _op, _value)| { +/// match key.read_str().as_ref() { +/// "name" => (names + 1, cores), +/// "core" => (names, cores + 1), +/// x => panic!("unexpected key: {}", x), +/// } +/// }); +/// assert_eq!((1, 2), (names, cores)); +/// # Ok(()) +/// # } +/// ``` +/// +/// And picked apart: +/// +/// ``` +/// use jomini::TextTape; +/// +/// # fn main() -> Result<(), Box> { +/// let tape = TextTape::from_slice(b"name=a core=b core=c")?; +/// let reader = tape.windows1252_reader(); +/// let mut fields = reader.fields(); +/// let (first_key, _op, first_val) = fields.next().unwrap(); +/// assert_eq!(first_key.read_str(), "name"); +/// assert_eq!(first_val.read_str().ok().as_deref(), Some("a")); +/// # Ok(()) +/// # } +/// ``` +pub struct FieldsIter<'data, 'tokens, E> { + token_ind: usize, + end_ind: usize, + tokens: &'tokens [TextToken<'data>], + encoding: E, +} + +impl<'data, 'tokens, E> FieldsIter<'data, 'tokens, E> +where + E: Encoding + Clone, +{ + fn new(reader: &ObjectReader<'data, 'tokens, E>) -> Self { + FieldsIter { + token_ind: reader.start_ind, + end_ind: reader.end_ind, + tokens: reader.tokens, + encoding: reader.encoding.clone(), + } + } + + /// Returns the remaining values from an object if the container is an + /// object that transitions into an array. + pub fn remainder(&self) -> ArrayReader<'data, 'tokens, E> { + let start = self + .tokens + .get(self.token_ind) + .map(|x| match x { + TextToken::MixedContainer => self.token_ind + 1, + TextToken::End(y) => { + if let Some(TextToken::Array { .. 
}) = self.tokens.get(*y) { + *y + 1 + } else { + self.token_ind + } + } + _ => self.token_ind, + }) + .unwrap_or(self.end_ind); + + ArrayReader { + start_ind: start, + end_ind: self.end_ind, + encoding: self.encoding.clone(), + tokens: self.tokens, + } + } +} + +impl<'data, 'tokens, E> Iterator for FieldsIter<'data, 'tokens, E> +where + E: Encoding + Clone, +{ + type Item = KeyValue<'data, 'tokens, E>; + + fn next(&mut self) -> Option { + if self.token_ind >= self.end_ind { + return None; + } + + let key_ind = self.token_ind; + let token = self.tokens[key_ind].clone(); + let key_scalar = match token { + TextToken::Quoted(x) + | TextToken::Unquoted(x) + | TextToken::Parameter(x) + | TextToken::UndefinedParameter(x) => x, + TextToken::MixedContainer => { + return None; + } + _ => { + // this is a broken invariant, so we safely recover by saying the object + // has no more fields + debug_assert!(false, "All keys should be scalars, not {:?}", &token); + return None; + } + }; + + let key_reader = ScalarReader { + scalar: key_scalar, + token, + encoding: self.encoding.clone(), + }; + + let (op, value_ind) = match self.tokens[key_ind + 1] { + TextToken::Operator(x) => (Some(x), key_ind + 2), + _ => (None, key_ind + 1), + }; + + let value_reader = ValueReader { + value_ind, + tokens: self.tokens, + encoding: self.encoding.clone(), + }; + self.token_ind = next_idx(self.tokens, value_ind); + Some((key_reader, op, value_reader)) + } + + fn size_hint(&self) -> (usize, Option) { + let len = fields_len(self.tokens, self.token_ind, self.end_ind); + (len, None) + } +} + +/// A reader for objects +#[derive(Debug, Clone)] +pub struct ObjectReader<'data, 'tokens, E> { + start_ind: usize, + end_ind: usize, + tokens: &'tokens [TextToken<'data>], + encoding: E, +} + +impl<'data, 'tokens, E> ObjectReader<'data, 'tokens, E> +where + E: Encoding + Clone, +{ + /// Create a new object reader from parsed data with encoded strings + pub fn new(tape: &'tokens TextTape<'data>, encoding: E) -> Self { + let tokens = tape.tokens(); + ObjectReader { + tokens, + end_ind: tokens.len(), + start_ind: 0, + encoding, + } + } + + /// Return the number of tokens contained within the object + /// + /// ``` + /// use jomini::TextTape; + /// + /// # fn main() -> Result<(), Box> { + /// let tape = TextTape::from_slice(b"obj={1} foo=bar")?; + /// let reader = tape.windows1252_reader(); + /// assert_eq!(reader.tokens_len(), 6); + /// # Ok(()) + /// # } + /// ``` + pub fn tokens_len(&self) -> usize { + self.end_ind - self.start_ind + } + + /// Deserialize from the object reader + /// + /// ``` + /// use jomini::TextTape; + /// use serde::Deserialize; + /// + /// # fn main() -> Result<(), Box> { + /// #[derive(Debug, Clone, Deserialize, PartialEq)] + /// pub struct Obj { + /// foo: String, + /// } + /// + /// let tape = TextTape::from_slice(b"obj={foo=bar}")?; + /// let reader = tape.windows1252_reader(); + /// let mut fields = reader.fields(); + /// let (_, _, obj_value) = fields.next().unwrap(); + /// let obj_reader = obj_value.read_object().unwrap(); + /// let result: Obj = obj_reader.deserialize().unwrap(); + /// assert_eq!(result, Obj { foo: "bar".to_string() }); + /// # Ok(()) + /// # } + /// ``` + #[cfg(feature = "derive")] + pub fn deserialize(&self) -> Result + where + T: serde::Deserialize<'data>, + { + T::deserialize(&crate::TextDeserializer::from_reader(self)) + } + + /// Return the number of key value pairs that the object contains. 
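+    ///
+    /// For example:
+    ///
+    /// ```
+    /// use jomini::TextTape;
+    ///
+    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
+    /// let tape = TextTape::from_slice(b"a=b c=d")?;
+    /// assert_eq!(tape.windows1252_reader().fields_len(), 2);
+    /// # Ok(())
+    /// # }
+    /// ```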
+ pub fn fields_len(&self) -> usize { + fields_len(self.tokens, self.start_ind, self.end_ind) + } + + /// Iterator over fields as they appear in the object + /// + /// See [FieldsIter](crate::text::FieldsIter) for a worked example + #[inline] + pub fn fields(&self) -> FieldsIter<'data, 'tokens, E> { + FieldsIter::new(self) + } + + /// Iterator over fields that are grouped by key + /// + /// See [FieldGroupsIter](crate::text::FieldGroupsIter) for a worked example + #[inline] + pub fn field_groups(&self) -> FieldGroupsIter<'data, 'tokens, E> { + FieldGroupsIter::new(self) + } +} + +/// A text reader that wraps an underlying scalar value +#[derive(Debug, Clone)] +pub struct ScalarReader<'data, E> { + scalar: Scalar<'data>, + token: TextToken<'data>, + encoding: E, +} + +impl<'data, E> ScalarReader<'data, E> +where + E: Encoding, +{ + /// Decode the data with a given string encoding + #[inline] + pub fn read_str(&self) -> Cow<'data, str> { + self.encoding.decode(self.scalar.as_bytes()) + } + + /// Decode the data with a given string encoding + #[inline] + pub fn read_string(&self) -> String { + self.encoding.decode(self.scalar.as_bytes()).into_owned() + } + + /// Return the underlying scalar + #[inline] + pub fn read_scalar(&self) -> Scalar<'data> { + self.scalar + } + + /// Return the token that the reader is abstracting + #[inline] + pub fn token(&self) -> &TextToken<'data> { + &self.token + } +} + +/// A text reader for a text value +#[derive(Debug, Clone)] +pub struct ValueReader<'data, 'tokens, E> { + value_ind: usize, + tokens: &'tokens [TextToken<'data>], + encoding: E, +} + +impl<'data, 'tokens, E> ValueReader<'data, 'tokens, E> { + /// Return the token that the reader is abstracting + #[inline] + pub fn token(&self) -> &TextToken<'data> { + &self.tokens[self.value_ind] + } + + #[cfg(feature = "derive")] + pub(crate) fn next(&mut self) -> Option<&TextToken<'data>> { + self.value_ind += 1; + self.tokens.get(self.value_ind) + } +} + +impl<'data, 'tokens, E> Encoding for ValueReader<'data, 'tokens, E> +where + E: Encoding, +{ + #[inline] + fn decode<'a>(&self, data: &'a [u8]) -> Cow<'a, str> { + self.encoding.decode(data) + } +} + +impl<'data, 'tokens, E> ValueReader<'data, 'tokens, E> +where + E: Encoding + Clone, +{ + fn raw_str(&self) -> Option> { + match self.tokens[self.value_ind] { + TextToken::Header(s) + | TextToken::Unquoted(s) + | TextToken::Quoted(s) + | TextToken::Parameter(s) + | TextToken::UndefinedParameter(s) => Some(self.encoding.decode(s.as_bytes())), + TextToken::Operator(s) => Some(Cow::Borrowed(s.symbol())), + _ => None, + } + } + + /// Interpret the current value as string + #[inline] + pub fn read_str(&self) -> Result, DeserializeError> { + self.raw_str().ok_or_else(|| DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from("not a string")), + }) + } + + /// Interpret the current value as string + #[inline] + pub fn read_string(&self) -> Result { + self.raw_str() + .map(String::from) + .ok_or_else(|| DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from("not a string")), + }) + } + + /// Interpret the current value as a scalar + #[inline] + pub fn read_scalar(&self) -> Result, DeserializeError> { + self.tokens[self.value_ind] + .as_scalar() + .ok_or_else(|| DeserializeError { + kind: DeserializeErrorKind::Unsupported(String::from("not a scalar")), + }) + } + + /// Interpret the current value as an object + #[inline] + pub fn read_object(&self) -> Result, DeserializeError> { + match self.tokens[self.value_ind] { + 
TextToken::Object { end, .. } => Ok(ObjectReader {
+                tokens: self.tokens,
+                start_ind: self.value_ind + 1,
+                end_ind: end,
+                encoding: self.encoding.clone(),
+            }),
+
+            TextToken::Array { end, .. } => Ok(ObjectReader {
+                tokens: self.tokens,
+                start_ind: end,
+                end_ind: end,
+                encoding: self.encoding.clone(),
+            }),
+
+            _ => Err(DeserializeError {
+                kind: DeserializeErrorKind::Unsupported(String::from("not an object")),
+            }),
+        }
+    }
+
+    /// Interpret the current value as an array
+    #[inline]
+    pub fn read_array(&self) -> Result<ArrayReader<'data, 'tokens, E>, DeserializeError> {
+        match self.tokens[self.value_ind] {
+            TextToken::Object { end, mixed: true } => {
+                let mut start_ind = self.value_ind + 1;
+                while self.tokens.get(start_ind) != Some(&TextToken::MixedContainer) {
+                    start_ind = next_idx(self.tokens, start_ind);
+                }
+
+                Ok(ArrayReader {
+                    tokens: self.tokens,
+                    start_ind: start_ind + 1,
+                    end_ind: end,
+                    encoding: self.encoding.clone(),
+                })
+            }
+            TextToken::Array { end, .. } | TextToken::Object { end, .. } => Ok(ArrayReader {
+                tokens: self.tokens,
+                start_ind: self.value_ind + 1,
+                end_ind: end,
+                encoding: self.encoding.clone(),
+            }),
+
+            // A header can be seen as a two element array
+            TextToken::Header(_) => Ok(ArrayReader {
+                tokens: self.tokens,
+                start_ind: self.value_ind,
+                end_ind: next_idx(self.tokens, self.value_ind + 1),
+                encoding: self.encoding.clone(),
+            }),
+
+            _ => Err(DeserializeError {
+                kind: DeserializeErrorKind::Unsupported(String::from("not an array")),
+            }),
+        }
+    }
+
+    /// Return the number of tokens the value encompasses
+    ///
+    /// ```
+    /// use jomini::TextTape;
+    ///
+    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
+    /// let tape = TextTape::from_slice(b"obj={1 {foo=bar} 3}")?;
+    /// let reader = tape.windows1252_reader();
+    /// let mut fields = reader.fields();
+    /// let (_, _, first_value) = fields.next().unwrap();
+    /// assert_eq!(first_value.tokens_len(), 6);
+    /// # Ok(())
+    /// # }
+    /// ```
+    #[inline]
+    pub fn tokens_len(&self) -> usize {
+        match self.tokens[self.value_ind] {
+            TextToken::Array { end, .. } | TextToken::Object { end, .. 
} => { + end - self.value_ind - 1 + } + _ => 1, + } + } +} + +/// An iterator over the values of an array +/// +/// ``` +/// use jomini::TextTape; +/// +/// # fn main() -> Result<(), Box> { +/// let tape = TextTape::from_slice(b"cores={a b}")?; +/// let reader = tape.windows1252_reader(); +/// +/// let mut all_cores = Vec::new(); +/// for (key, _op, value) in reader.fields() { +/// assert_eq!(key.read_str(), "cores"); +/// let cores = value.read_array()?; +/// assert_eq!(cores.len(), 2); +/// for value in cores.values() { +/// all_cores.push(value.read_string()?); +/// } +/// } +/// assert_eq!(all_cores, vec![String::from("a"), String::from("b")]); +/// # Ok(()) +/// # } +/// ``` +pub struct ValuesIter<'data, 'tokens, E> { + token_ind: usize, + end_ind: usize, + tokens: &'tokens [TextToken<'data>], + encoding: E, +} + +impl<'data, 'tokens, E> ValuesIter<'data, 'tokens, E> +where + E: Encoding + Clone, +{ + fn new(reader: &ArrayReader<'data, 'tokens, E>) -> Self { + ValuesIter { + token_ind: reader.start_ind, + end_ind: reader.end_ind, + tokens: reader.tokens, + encoding: reader.encoding.clone(), + } + } +} + +impl<'data, 'tokens, E> Iterator for ValuesIter<'data, 'tokens, E> +where + E: Encoding + Clone, +{ + type Item = ValueReader<'data, 'tokens, E>; + + fn next(&mut self) -> Option { + if self.token_ind < self.end_ind { + let value_ind = self.token_ind; + self.token_ind = next_idx_values(self.tokens, self.token_ind); + Some(ValueReader { + value_ind, + tokens: self.tokens, + encoding: self.encoding.clone(), + }) + } else { + None + } + } + + fn size_hint(&self) -> (usize, Option) { + let len = values_len(self.tokens, self.token_ind, self.end_ind); + (len, Some(len)) + } +} + +/// A text reader for sequences of values +#[derive(Debug, Clone)] +pub struct ArrayReader<'data, 'tokens, E> { + start_ind: usize, + end_ind: usize, + tokens: &'tokens [TextToken<'data>], + encoding: E, +} + +impl<'data, 'tokens, E> ArrayReader<'data, 'tokens, E> +where + E: Encoding + Clone, +{ + /// Iterator over values of an array + /// + /// See [ValuesIter](crate::text::ValuesIter) for a worked example + #[inline] + pub fn values(&self) -> ValuesIter<'data, 'tokens, E> { + ValuesIter::new(self) + } + + /// Returns if the array is empty + pub fn is_empty(&self) -> bool { + self.len() == 0 + } + + /// Return the number of values in the array + #[inline] + pub fn len(&self) -> usize { + values_len(self.tokens, self.start_ind, self.end_ind) + } + + /// Return the number of tokens contained within the object + /// + /// ``` + /// use jomini::TextTape; + /// + /// # fn main() -> Result<(), Box> { + /// let tape = TextTape::from_slice(b"obj={1 {foo=bar} 3}")?; + /// let reader = tape.windows1252_reader(); + /// let mut fields = reader.fields(); + /// let (_, _, first_value) = fields.next().unwrap(); + /// let array = first_value.read_array()?; + /// assert_eq!(array.tokens_len(), 6); + /// # Ok(()) + /// # } + /// ``` + #[inline] + pub fn tokens_len(&self) -> usize { + self.end_ind - self.start_ind + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn read_value(value: ValueReader) + where + E: crate::Encoding + Clone, + { + match value.token() { + TextToken::Object { .. } => { + iterate_object(value.read_object().unwrap()); + iterate_array(value.read_array().unwrap()); + } + TextToken::Array { .. 
} => { + iterate_object(value.read_object().unwrap()); + iterate_array(value.read_array().unwrap()); + } + TextToken::End(_) => panic!("end!?"), + TextToken::Operator(_) => {} + TextToken::MixedContainer => {} + TextToken::Unquoted(_) + | TextToken::Quoted(_) + | TextToken::Header(_) + | TextToken::Parameter(_) + | TextToken::UndefinedParameter(_) => { + let _ = value.read_str().unwrap(); + } + } + } + + fn iterate_array(reader: ArrayReader) + where + E: crate::Encoding + Clone, + { + for value in reader.values() { + read_value(value); + } + } + + fn iterate_object(reader: ObjectReader) + where + E: crate::Encoding + Clone, + { + for (_key, group) in reader.field_groups() { + for (_op, value) in group.values() { + read_value(value); + } + } + + let mut fields = reader.fields(); + for (key, _op, value) in fields.by_ref() { + let _ = key.read_str(); + read_value(value); + } + } + + #[test] + fn simple_text_reader_text() { + let data = b"foo=bar"; + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + assert_eq!(reader.fields_len(), 1); + + let mut iter = reader.fields(); + let (key, _op, value) = iter.next().unwrap(); + assert_eq!(key.read_string(), String::from("foo")); + assert_eq!(value.read_string().unwrap(), String::from("bar")); + + assert!(iter.next().is_none()); + } + + #[test] + fn simple_text_reader_obj() { + let data = b"foo={bar=qux}"; + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + + let mut iter = reader.fields(); + let (key, _op, value) = iter.next().unwrap(); + assert_eq!(key.read_string(), String::from("foo")); + + let nested = value.read_object().unwrap(); + let mut nested_iter = nested.fields(); + let (key2, _op, value2) = nested_iter.next().unwrap(); + assert_eq!(key2.read_string(), String::from("bar")); + assert_eq!(value2.read_string().unwrap(), String::from("qux")); + assert!(nested_iter.next().is_none()); + assert!(iter.next().is_none()); + } + + #[test] + fn simple_text_reader_array() { + let data = b"foo={bar qux}"; + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + + let mut iter = reader.fields(); + let (key, _op, value) = iter.next().unwrap(); + assert_eq!(key.read_string(), String::from("foo")); + + let nested = value.read_array().unwrap(); + let mut values = nested.values(); + assert_eq!(nested.len(), 2); + let value1 = values.next().unwrap().read_string().unwrap(); + let value2 = values.next().unwrap().read_string().unwrap(); + + assert!(values.next().is_none()); + assert_eq!(value1, String::from("bar")); + assert_eq!(value2, String::from("qux")); + } + + #[test] + fn text_reader_read_fields() { + let data = b"name=aaa name=bbb core=123 core=456 name=ccc name=ddd"; + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + + let mut field_groups = reader.field_groups(); + let (key, values) = field_groups.next().unwrap(); + assert_eq!(key.read_string(), String::from("name")); + + let values = values.values().collect::>(); + assert_eq!(values.len(), 4); + assert_eq!(values[0].1.read_string().unwrap(), String::from("aaa")); + assert_eq!(values[1].1.read_string().unwrap(), String::from("bbb")); + assert_eq!(values[2].1.read_string().unwrap(), String::from("ccc")); + assert_eq!(values[3].1.read_string().unwrap(), String::from("ddd")); + + let (key, values) = field_groups.next().unwrap(); + assert_eq!(key.read_string(), String::from("core")); + + let values = values.values().collect::>(); + 
assert_eq!(values.len(), 2); + assert_eq!(values[0].1.read_string().unwrap(), String::from("123")); + assert_eq!(values[1].1.read_string().unwrap(), String::from("456")); + } + + #[test] + fn text_reader_read_fields_nested() { + let data = + b"army={name=aaa unit={name=bbb} unit={name=ccc}} army={name=ddd unit={name=eee}}"; + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + let mut field_groups = reader.field_groups(); + + let (key, army_values) = field_groups.next().unwrap(); + assert_eq!(key.read_string(), String::from("army")); + assert_eq!(army_values.len(), 2); + + let army_values = army_values.values().collect::>(); + let aaa = army_values[0].1.read_object().unwrap(); + let mut aaa_groups = aaa.field_groups(); + assert_eq!(aaa.fields_len(), 3); + + let (key, values) = aaa_groups.next().unwrap(); + assert_eq!(key.read_string(), String::from("name")); + assert_eq!(values.len(), 1); + assert_eq!( + values.values().nth(0).unwrap().1.read_string().unwrap(), + String::from("aaa") + ); + + let (key, values) = aaa_groups.next().unwrap(); + assert_eq!(key.read_string(), String::from("unit")); + assert_eq!(values.len(), 2); + + let bbb = values.values().nth(0).unwrap().1.read_object().unwrap(); + let mut bbb_fields = bbb.fields(); + let (key, _, value) = bbb_fields.next().unwrap(); + assert_eq!(key.read_string(), String::from("name")); + assert_eq!(value.read_string().unwrap(), String::from("bbb")); + + let ccc = values.values().nth(1).unwrap().1.read_object().unwrap(); + let mut ccc_fields = ccc.fields(); + let (key, _, value) = ccc_fields.next().unwrap(); + assert_eq!(key.read_string(), String::from("name")); + assert_eq!(value.read_string().unwrap(), String::from("ccc")); + + let ddd = army_values[1].1.read_object().unwrap(); + assert_eq!(ddd.fields_len(), 2); + + let mut ddd_groups = ddd.field_groups(); + let (key, values) = ddd_groups.next().unwrap(); + assert_eq!(key.read_string(), String::from("name")); + assert_eq!(values.len(), 1); + assert_eq!( + values.values().nth(0).unwrap().1.read_string().unwrap(), + String::from("ddd") + ); + + let (key, values) = ddd_groups.next().unwrap(); + assert_eq!(key.read_string(), String::from("unit")); + assert_eq!(values.len(), 1); + + let eee = values.values().nth(0).unwrap().1.read_object().unwrap(); + let mut eee_fields = eee.fields(); + let (key, _, value) = eee_fields.next().unwrap(); + assert_eq!(key.read_string(), String::from("name")); + assert_eq!(value.read_string().unwrap(), String::from("eee")); + } + + #[test] + fn text_reader_read_fields_consume() { + let data = b"name=aaa name=bbb core=123 name=ccc name=ddd"; + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + let mut count = 0; + for (_key, entries) in reader.field_groups() { + for (_i, (_op, value)) in entries.values().enumerate() { + count += value.read_scalar().map(|_| 1).unwrap_or(0); + } + } + + assert_eq!(count, 5); + } + + #[test] + fn text_reader_mixed_object_1() { + let data = b"levels={10 0=1 0=2}"; + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + + assert_eq!(reader.fields_len(), 1); + let mut iter = reader.fields(); + let (key, _op, value) = iter.next().unwrap(); + assert_eq!(key.read_string(), String::from("levels")); + + let nested = value.read_array().unwrap(); + assert_eq!(nested.len(), 8); + + assert_eq!( + nested.values().nth(3).unwrap().token(), + &TextToken::Operator(Operator::Equal) + ); + assert_eq!( + 
nested.values().nth(6).unwrap().token(), + &TextToken::Operator(Operator::Equal) + ); + + let values = nested + .values() + .filter(|x| x.token() != &TextToken::MixedContainer) + .map(|x| x.read_string().unwrap()) + .collect::>(); + + assert_eq!( + values.as_slice(), + &[ + String::from("10"), + String::from("0"), + String::from("="), + String::from("1"), + String::from("0"), + String::from("="), + String::from("2"), + ] + ); + } + + #[test] + fn text_reader_mixed_object_2() { + let data = br#"brittany_area = { #5 + color = { 118 99 151 } + 169 170 171 172 4384 + }"#; + + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + let mut iter = reader.fields(); + let (key, _op, value) = iter.next().unwrap(); + assert_eq!(key.read_str(), "brittany_area"); + + let mut keys = vec![]; + let brittany = value.read_object().unwrap(); + let mut fields = brittany.fields(); + while let Some((key, _op, _value)) = fields.next() { + keys.push(key.read_str()) + } + + assert_eq!(keys, vec![String::from("color")]); + let trailer = fields.remainder(); + assert_eq!(trailer.len(), 5); + assert_eq!(trailer.values().next().unwrap().read_str().unwrap(), "169"); + + let nested = value.read_array().unwrap(); + assert_eq!(nested.len(), 5); + + let mut values = nested.values(); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"169")) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"170")) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"171")) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"172")) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"4384")) + ); + assert!(values.next().is_none()); + } + + #[test] + fn text_reader_mixed_object_3() { + let data = br#"brittany_area = { #5 + color = { 118 99 151 } + color = { 118 99 151 } + 169 170 171 172 4384 + }"#; + + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + let (_key, _op, brittany) = reader.fields().next().unwrap(); + let brittany_reader = brittany.read_object().unwrap(); + + let mut fields = brittany_reader.fields(); + let (lower_bound, upper_bound) = fields.size_hint(); + assert_eq!(lower_bound, brittany_reader.fields_len()); + assert_eq!(lower_bound, 2); + assert!(upper_bound.is_none() || upper_bound == Some(7)); + + let _ = fields.next(); + let (lower_bound, upper_bound) = fields.size_hint(); + assert_eq!(lower_bound, 1); + assert!(upper_bound.is_none() || upper_bound == Some(6)); + + let mut groups = brittany_reader.field_groups(); + let (lower_bound, upper_bound) = groups.size_hint(); + assert_eq!(lower_bound, 1); + assert!(upper_bound.is_none() || upper_bound == Some(6)); + + let _ = groups.next(); + let (lower_bound, upper_bound) = groups.size_hint(); + assert_eq!(lower_bound, 0); + assert!(upper_bound.is_none() || upper_bound == Some(5)); + } + + #[test] + fn text_reader_mixed_object_4() { + let data = br#"levels={a=b 10 c=d 20}"#; + + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + + assert_eq!(reader.fields_len(), 1); + let mut iter = reader.fields(); + let (key, _op, value) = iter.next().unwrap(); + assert_eq!(key.read_string(), String::from("levels")); + + let nested = value.read_array().unwrap(); + assert_eq!(nested.len(), 5); + + let mut values = nested.values(); + assert_eq!( + values.next().unwrap().token(), + 
&TextToken::Unquoted(Scalar::new(b"10")) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"c")) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Operator(Operator::Equal) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"d")) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"20")) + ); + assert!(values.next().is_none()); + } + + #[test] + fn text_reader_mixed_object_5() { + let data = br#"brittany_area = { #5 + color = { 118 99 151 } + 169 170 171 172 4384 + }"#; + + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + let mut iter = reader.fields(); + let (key, _op, value) = iter.next().unwrap(); + assert_eq!(key.read_str(), "brittany_area"); + + let brittany = value.read_object().unwrap(); + let mut field_groups = brittany.field_groups(); + field_groups.next().unwrap(); + assert!(field_groups.next().is_none()); + + let trailer = field_groups.remainder(); + + let mut values = trailer.values(); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"169")) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"170")) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"171")) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"172")) + ); + assert_eq!( + values.next().unwrap().token(), + &TextToken::Unquoted(Scalar::new(b"4384")) + ); + assert!(values.next().is_none()); + } + + #[test] + fn text_reader_empty_container() { + let data = b"active_idea_groups={ }"; + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + let mut iter = reader.fields(); + let (key, _op, value) = iter.next().unwrap(); + assert_eq!(key.read_str(), "active_idea_groups"); + + let empty_array = value.read_array().unwrap(); + assert_eq!(0, empty_array.len()); + assert!(empty_array.values().next().is_none()); + + let empty_object = value.read_object().unwrap(); + let mut empty_object_iter = empty_object.fields(); + assert_eq!(0, empty_object.fields_len()); + assert!(empty_object_iter.next().is_none()); + } + + #[test] + fn text_reader_header() { + let data = b"color = rgb { 10 20 30 }"; + let tape = TextTape::from_slice(data).unwrap(); + let reader = tape.windows1252_reader(); + let mut iter = reader.fields(); + let (key, _op, value) = iter.next().unwrap(); + assert_eq!(key.read_str(), "color"); + + let header_array = value.read_array().unwrap(); + let mut values = header_array.values(); + let rgb = values.next().unwrap(); + assert_eq!(rgb.read_str().unwrap(), "rgb"); + + let vals = values.next().unwrap(); + let s = vals.read_array().unwrap(); + let svals = s.values(); + + let colors = svals + .map(|x| x.read_scalar().unwrap()) + .map(|x| x.to_u64().unwrap()) + .collect::>(); + + assert_eq!(colors, vec![10, 20, 30]); + } + + #[test] + fn reader_crash1() { + let data = b"a=r{}"; + let tape = TextTape::from_slice(data).unwrap(); + iterate_object(tape.windows1252_reader()); + } + + #[test] + fn text_reader_object_fields() { + let data = b"a{b=}"; + if let Ok(tape) = TextTape::from_slice(data) { + let reader = tape.windows1252_reader(); + iterate_object(reader); + } + } + + #[test] + fn text_reader_object_fields_op2() { + let data = b"a{}b>{}"; + if let Ok(tape) = TextTape::from_slice(data) { + let reader = tape.windows1252_reader(); + iterate_object(reader); + } + } + + 
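#[test]
+    fn text_reader_arbitrary_input_sketch() {
+        // Editor-added sketch (not from the original patch): arbitrary bytes
+        // can be walked with the same defensive pattern as the neighboring
+        // fuzz regression tests, ignoring inputs that fail to parse.
+        let data = b"a=b c={d e=f}";
+        if let Ok(tape) = TextTape::from_slice(data) {
+            iterate_object(tape.windows1252_reader());
+        }
+    }
+
+    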
#[test]
+    fn text_reader_object_fields_dupe() {
+        let data = b"a{b=c d=E d}";
+        if let Ok(tape) = TextTape::from_slice(data) {
+            let reader = tape.windows1252_reader();
+            iterate_object(reader);
+        }
+    }
+
+    #[test]
+    fn text_reader_object_fields_header() {
+        let data = b"a{}b>r{}";
+        if let Ok(tape) = TextTape::from_slice(data) {
+            let reader = tape.windows1252_reader();
+            iterate_object(reader);
+        }
+    }
+
+    #[test]
+    fn text_reader_object_fields_dupe2() {
+        let data = b"a{b=c d b}";
+        if let Ok(tape) = TextTape::from_slice(data) {
+            let reader = tape.windows1252_reader();
+            iterate_object(reader);
+        }
+    }
+
+    #[test]
+    fn text_reader_regression() {
+        let data = b"a={b{}=2}";
+        if let Ok(tape) = TextTape::from_slice(data) {
+            let reader = tape.windows1252_reader();
+            iterate_object(reader);
+        }
+    }
+
+    #[test]
+    fn text_reader_regression2() {
+        let data = b"r={c=d=@{y=u}";
+        if let Ok(tape) = TextTape::from_slice(data) {
+            let reader = tape.windows1252_reader();
+            iterate_object(reader);
+        }
+    }
+
+    #[test]
+    fn text_reader_regression3() {
+        let data = b"a={{t c=d = b}}";
+        if let Ok(tape) = TextTape::from_slice(data) {
+            let reader = tape.windows1252_reader();
+            iterate_object(reader);
+        }
+    }
+
+    // #[test]
+    // fn text_reader_regression4() {
+    //     let data = include_bytes!("/home/nick/projects/jomini/fuzz/artifacts/fuzz_text/crash-a14643c9a89c0f4ab665815c99a07b15de3544a5");
+    //     // let data = b"a={{ b c == == = d e=f}}";
+    //     if let Ok(tape) = TextTape::from_slice(data) {
+    //         let reader = tape.windows1252_reader();
+    //         iterate_object(reader);
+    //     }
+    // }
+}
diff --git a/src/text/mod.rs b/src/text/mod.rs
index 4f1f1b9..f17c392 100644
--- a/src/text/mod.rs
+++ b/src/text/mod.rs
@@ -1,14 +1,29 @@
 //! Types for parsing clausewitz plaintext input
 //!
-//! See the top level module documentation for an overview that includes parsing
-//! and deserializing text.
+//! Main text deserialization APIs:
+//! - [TextDeserializer::from_utf8_slice](crate::text::de::TextDeserializer::from_utf8_slice):
+//!   Deserialize game and save files from a slice of data.
+//! - [TextDeserializer::from_utf8_reader](crate::text::de::TextDeserializer::from_utf8_reader):
+//!   (**experimental**) much more memory efficient deserializer that is geared
+//!   towards deserializing large models like those found in save files.
 //!
-//! For more examples of the mid-level DOM-like API, see [FieldGroupsIter],
-//! [FieldsIter], and [ValuesIter]
+//! If the serde deserialization API is too high level, one can build
+//! abstractions on top of:
+//! - [TextTape::from_slice]: Realizes a pseudo AST onto
+//!   a linear tape. Cleans up and normalizes data.
+//! - [TokenReader]: (**experimental**) an incremental text lexer
+//!   designed for handling large saves in a memory efficient manner.
+//!
+//! Some additional APIs are available to make working with a [TextTape] more
+//! ergonomic for DOM-like use cases.
+//! - [FieldGroupsIter]
+//! - [FieldsIter]
+//! - [ValuesIter]
 
 /// text deserialization
 #[cfg(feature = "derive")]
 pub mod de;
+mod dom;
 mod fnv;
 mod operator;
 mod reader;
@@ -18,10 +33,11 @@ mod writer;
 #[cfg(feature = "derive")]
 #[doc(inline)]
 pub use self::de::Property;
-pub use self::operator::*;
-pub use self::reader::{
+pub use self::dom::{
     ArrayReader, FieldGroupsIter, FieldsIter, GroupEntry, GroupEntryIter, ObjectReader, Reader,
     ScalarReader, ValueReader, ValuesIter,
 };
+pub use self::operator::*;
 pub use self::tape::{TextTape, TextTapeParser, TextToken};
 pub use self::writer::*;
+pub use reader::{ReaderError, ReaderErrorKind, Token, TokenReader, TokenReaderBuilder};
diff --git a/src/text/reader.rs b/src/text/reader.rs
index 1db0514..3e24ef8 100644
--- a/src/text/reader.rs
+++ b/src/text/reader.rs
@@ -1,1503 +1,1078 @@
-use super::fnv::FnvBuildHasher;
+use super::Operator;
 use crate::{
-    text::Operator, DeserializeError, DeserializeErrorKind, Encoding, Error, Scalar, TextTape,
-    TextToken,
+    buffer::{BufferError, BufferWindow, BufferWindowBuilder, SliceReader},
+    data::is_boundary,
+    util::{contains_zero_byte, count_chunk, repeat_byte},
+    Scalar,
 };
-use std::{
-    borrow::Cow,
-    collections::{hash_map::Entry, HashMap},
-};
-
-pub type KeyValue<'data, 'tokens, E> = (
-    ScalarReader<'data, E>,
-    Option<Operator>,
-    ValueReader<'data, 'tokens, E>,
-);
-
-pub type KeyValues<'data, 'tokens, E> = (ScalarReader<'data, E>, GroupEntry<'data, 'tokens, E>);
-
-/// Calculate what index the next value is. This assumes that a header + value
-/// are two separate values
-#[inline]
-fn next_idx_header(tokens: &[TextToken], idx: usize) -> usize {
-    match tokens[idx] {
-        TextToken::Array { end, .. } | TextToken::Object { end, .. } => end + 1,
-        TextToken::Operator(_) | TextToken::MixedContainer => idx + 2,
-        _ => idx + 1,
-    }
-}
+use std::io::Read;
 
-/// Calculate what index the next value is. This assumes that a header + value
-/// is one value
-#[inline]
-fn next_idx(tokens: &[TextToken], idx: usize) -> usize {
-    match tokens[idx] {
-        TextToken::Array { end, .. } | TextToken::Object { end, .. } => end + 1,
-        TextToken::Operator(_) => next_idx(tokens, idx + 1),
-        TextToken::Header(_) => next_idx_header(tokens, idx + 1),
-        _ => idx + 1,
-    }
-}
+/// Text token, the raw form of [TextToken](crate::text::TextToken)
+///
+/// These tokens are the raw yielded tokens: the reader won't match open and
+/// close tokens, nor does it make a determination of whether an open and close
+/// pair represents an array, object, or both.
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+pub enum Token<'a> {
+    /// '{' or '['
+    Open,
 
-#[inline]
-fn next_idx_values(tokens: &[TextToken], idx: usize) -> usize {
-    match tokens[idx] {
-        TextToken::Array { end, .. } | TextToken::Object { end, .. } => end + 1,
-        _ => idx + 1,
-    }
-}
+
+    /// '}' or ']'
+    Close,
 
-#[inline]
-fn fields_len(tokens: &[TextToken], start_ind: usize, end_ind: usize) -> usize {
-    let mut ind = start_ind;
-    let mut count = 0;
-    while ind < end_ind {
-        let key_ind = ind;
-        if tokens[key_ind] == TextToken::MixedContainer {
-            return count;
-        }
+    /// An operator (eg: `foo=bar`)
+    Operator(Operator),
 
-        let value_ind = match tokens[key_ind + 1] {
-            TextToken::Operator(_) => key_ind + 2,
-            _ => key_ind + 1,
-        };
-        ind = next_idx(tokens, value_ind);
-        count += 1;
-    }
+    /// value that is not surrounded by quotes
+    Unquoted(Scalar<'a>),
 
-    count
+    /// value that is quoted
+    Quoted(Scalar<'a>),
 }
 
-#[inline]
-pub fn values_len(tokens: &[TextToken], start_ind: usize, end_ind: usize) -> usize {
-    let mut count = 0;
-    let mut ind = start_ind;
-    while ind < end_ind {
-        ind = next_idx_values(tokens, ind);
-        count += 1
+impl<'a> Token<'a> {
+    /// Return the token as a scalar
+    #[inline]
+    pub fn as_scalar(&self) -> Option<Scalar<'a>> {
+        match self {
+            Token::Quoted(s) | Token::Unquoted(s) => Some(*s),
+            _ => None,
+        }
     }
-
-    count
 }
 
-type OpValue<'data, 'tokens, E> = (Option<Operator>, ValueReader<'data, 'tokens, E>);
-
-/// Iterator over values grouped by duplicate keys
-///
-/// See [FieldGroupsIter](crate::text::FieldGroupsIter) for a worked example
-pub struct GroupEntryIter<'data, 'tokens, 'parent, E> {
-    index: usize,
-    parent: &'parent GroupEntry<'data, 'tokens, E>,
+#[derive(Debug)]
+enum Utf8Bom {
+    Unknown,
+    NotPresent,
+    Present,
 }
 
-impl<'data, 'tokens, 'parent, E> Iterator for GroupEntryIter<'data, 'tokens, 'parent, E>
-where
-    E: Clone,
-{
-    type Item = (Option<Operator>, ValueReader<'data, 'tokens, E>);
-
-    fn next(&mut self) -> Option<Self::Item> {
-        match &self.parent {
-            GroupEntry::One((op, val)) => {
-                if self.index == 0 {
-                    self.index += 1;
-                    Some((*op, (*val).clone()))
-                } else {
-                    None
-                }
-            }
-            GroupEntry::Multiple(entries) => {
-                let result = entries.get(self.index);
-                self.index += 1;
-                result.map(|(op, val)| (*op, (*val).clone()))
-            }
-        }
-    }
-}
-
-/// Represents a group of values for duplicate keys
+/// Scan a [Read] implementation for text [Token]s
 ///
-/// May contain one or many values
+/// Example of computing the max nesting depth using a [TokenReader].
 ///
+/// ```rust
+/// use jomini::text::{TokenReader, Token};
+/// let data = b"foo={{ id=3 } {} { id = 4 }}";
+/// let mut reader = TokenReader::new(&data[..]);
+/// let mut max_depth = 0;
+/// let mut current_depth = 0;
+/// while let Some(token) = reader.next()? {
+///     match token {
+///         Token::Open => {
+///             current_depth += 1;
+///             max_depth = max_depth.max(current_depth);
+///         }
+///         Token::Close => current_depth -= 1,
+///         _ => {}
+///     }
+/// }
+/// assert_eq!(max_depth, 2);
+/// # Ok::<(), jomini::text::ReaderError>(())
 /// ```
 ///
-/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
-/// let tape = TextTape::from_slice(b"name=a core=b core=c")?;
-/// let reader = tape.windows1252_reader();
-/// let mut fields = reader.field_groups();
-/// let first_group = fields.next();
-/// let first_key = first_group.as_ref().map(|(key, _)| key.read_str());
-/// assert_eq!(first_key.as_deref(), Some("name"));
-/// let first_values_len = first_group.as_ref().map(|(_, group)| group.len());
-/// assert_eq!(first_values_len, Some(1));
-/// let first_values = first_group.map(|(_, group)| {
-///     group.values()
-///         .filter_map(|(_op, val)| val.read_string().ok())
-///         .collect()
-/// });
-/// assert_eq!(first_values, Some(vec![String::from("a")]));
+/// Unlike a [TextTape](crate::TextTape), which will skip ghost objects, pair
+/// open and close tokens together, and recognize if a container is an object,
+/// array, or mixed -- the tokens yielded from a [TokenReader] are not fully
+/// formed. This is a much more raw view of the data that can be used to
+/// construct higher level parsers and deserializers that operate over a stream
+/// of data.
 ///
-/// let second_group = fields.next();
-/// let second_key = second_group.as_ref().map(|(key, _)| key.read_str());
-/// assert_eq!(second_key.as_deref(), Some("core"));
-/// let second_values = second_group.as_ref().map(|(_, group)| group.len());
-/// assert_eq!(second_values, Some(2));
-/// let second_values = second_group.map(|(_, group)| {
-///     group.values()
-///         .filter_map(|(_op, val)| val.read_string().ok())
-///         .collect()
-/// });
-/// assert_eq!(second_values, Some(vec![String::from("b"), String::from("c")]));
-/// # Ok(())
-/// # }
-/// ```
-pub enum GroupEntry<'data, 'tokens, E> {
-    /// Represents that the group is composed of only one value
-    ///
-    /// Most fields should only occur once, so this variant is optimized to
-    /// not require a memory allocation (unlike the `Multiple` variant).
-    One(OpValue<'data, 'tokens, E>),
-
-    /// Represents that the group is composed of several values
-    Multiple(Vec<OpValue<'data, 'tokens, E>>),
+/// The [TokenReader] is considered **experimental**, as it uses a different
+/// parsing algorithm geared towards parsing large save files. Ergonomic
+/// equivalents for more esoteric game syntax (like parameter definitions) have
+/// not yet been finalized. Game files can still be parsed with the experimental
+/// APIs, but these APIs may change in the future based on feedback. Since the
+/// binary format is not used for game files, the
+/// [binary::TokenReader](crate::binary::TokenReader) is not considered
+/// experimental.
+///
+/// [TokenReader] operates over a fixed size buffer, so using a
+/// [BufRead](std::io::BufRead) affords no benefits. An error will be returned
+/// for tokens that are impossible to fit within the buffer (eg: if provided a
+/// 100 byte buffer but the data contains a string that is 101 bytes long).
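+///
+/// As an editor-added sketch (not part of the original patch), the fixed
+/// buffer can be sized up front through the builder when unusually long
+/// quoted values are expected; the 64 KiB figure here is purely illustrative:
+///
+/// ```rust
+/// use jomini::text::TokenReader;
+/// let data = b"name=\"a fairly long quoted value\"";
+/// let mut reader = TokenReader::builder()
+///     .buffer_len(64 * 1024)
+///     .build(&data[..]);
+/// assert!(reader.read().is_ok());
+/// ```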
+#[derive(Debug)] +pub struct TokenReader { + reader: R, + buf: BufferWindow, + utf8: Utf8Bom, } -impl<'data, 'tokens, E> GroupEntry<'data, 'tokens, E> { - /// Returns an iterator that includes all the values - pub fn values<'parent>(&'parent self) -> GroupEntryIter<'data, 'tokens, 'parent, E> { - GroupEntryIter { - index: 0, - parent: self, - } - } - - /// A group can never be empty so this returns false - pub fn is_empty(&self) -> bool { - false - } - - /// Returns the number of values in the group - pub fn len(&self) -> usize { - match &self { - GroupEntry::One(_) => 1, - GroupEntry::Multiple(x) => x.len(), +impl TokenReader<()> { + /// Read from a byte slice without memcpy's + #[inline] + pub fn from_slice(data: &[u8]) -> TokenReader> { + TokenReader { + reader: SliceReader::new(data), + buf: BufferWindow::from_slice(data), + utf8: Utf8Bom::Unknown, } } } -/// All possible text reader variants -#[derive(Debug, Clone)] -pub enum Reader<'data, 'tokens, E> { - /// object reader - Object(ObjectReader<'data, 'tokens, E>), - - /// array reader - Array(ArrayReader<'data, 'tokens, E>), - - /// scalar reader - Scalar(ScalarReader<'data, E>), - - /// value reader - Value(ValueReader<'data, 'tokens, E>), -} - -impl<'data, 'tokens, E> Reader<'data, 'tokens, E> +impl TokenReader where - E: Encoding + Clone, + R: Read, { - /// Interpret value as a string + /// Create a new text reader #[inline] - pub fn read_str(&self) -> Result, DeserializeError> { - match &self { - Reader::Scalar(x) => Ok(x.read_str()), - Reader::Value(x) => x.read_str(), - _ => Err(DeserializeError { - kind: DeserializeErrorKind::Unsupported(String::from("not a scalar")), - }), - } + pub fn new(reader: R) -> Self { + TokenReader::builder().build(reader) } - /// Interpret value as a string + /// Returns the byte position of the data stream that has been processed. + /// + /// ```rust + /// use jomini::{Scalar, text::{TokenReader, Token}}; + /// let mut reader = TokenReader::new(&b"date=1444.11.11"[..]); + /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"date"))); + /// assert_eq!(reader.position(), 4); + /// ``` #[inline] - pub fn read_string(&self) -> Result { - match &self { - Reader::Scalar(x) => Ok(x.read_string()), - Reader::Value(x) => x.read_string(), - _ => Err(DeserializeError { - kind: DeserializeErrorKind::Unsupported(String::from("not a scalar")), - }), - } + pub fn position(&self) -> usize { + self.buf.position() } - /// Interpret value as a scalar #[inline] - pub fn read_scalar(&self) -> Result, DeserializeError> { - match &self { - Reader::Scalar(x) => Ok(x.read_scalar()), - Reader::Value(x) => x.read_scalar(), - _ => Err(DeserializeError { - kind: DeserializeErrorKind::Unsupported(String::from("not a scalar")), - }), + unsafe fn next_opt(&mut self) -> (Option, Option) { + #[derive(Debug)] + enum ParseState { + None, + Quote, + Unquoted, } - } -} -/// Iterator over fields of an object grouped by key -/// -/// Since objects can have duplicated keys across fields, this iterator -/// consolidates them such that all values with the same key are grouped -/// together in the order that they appear in the object. Key order is -/// also equivalent, except that already seen keys will be skipped, as -/// those values have already been seen in an earlier group. 
-/// -/// The process of grouping values together is more expensive than simply -/// iterating the keys in order, so when possible prefer -/// [`ObjectReader::fields()`](crate::text::ObjectReader::fields) over -/// [`ObjectReader::field_groups()`](crate::text::ObjectReader::field_groups). -/// -/// These groups can be easily iterated: -/// -/// ``` -/// use jomini::TextTape; -/// -/// # fn main() -> Result<(), Box> { -/// let tape = TextTape::from_slice(b"name=a core=b core=c")?; -/// let reader = tape.windows1252_reader(); -/// for (key, group) in reader.field_groups() { -/// match key.read_str().as_ref() { -/// "name" => assert_eq!(group.len(), 1), -/// "core" => assert_eq!(group.len(), 2), -/// x => panic!("unexpected key: {}", x), -/// } -/// } -/// # Ok(()) -/// # } -/// ``` -/// -/// And picked apart: -/// -/// ``` -/// use jomini::TextTape; -/// -/// # fn main() -> Result<(), Box> { -/// let tape = TextTape::from_slice(b"name=a core=b core=c")?; -/// let reader = tape.windows1252_reader(); -/// let mut fields = reader.field_groups(); -/// let first_group = fields.next(); -/// let first_key = first_group.as_ref().map(|(key, _)| key.read_str()); -/// assert_eq!(first_key.as_deref(), Some("name")); -/// let first_values_len = first_group.as_ref().map(|(_, group)| group.len()); -/// assert_eq!(first_values_len, Some(1)); -/// let first_values = first_group.map(|(_, group)| { -/// group.values() -/// .filter_map(|(_op, val)| val.read_string().ok()) -/// .collect() -/// }); -/// assert_eq!(first_values, Some(vec![String::from("a")])); -/// -/// let second_group = fields.next(); -/// let second_key = second_group.as_ref().map(|(key, _)| key.read_str()); -/// assert_eq!(second_key.as_deref(), Some("core")); -/// let second_values = second_group.as_ref().map(|(_, group)| group.len()); -/// assert_eq!(second_values, Some(2)); -/// let second_values = second_group.map(|(_, group)| { -/// group.values() -/// .filter_map(|(_op, val)| val.read_string().ok()) -/// .collect() -/// }); -/// assert_eq!(second_values, Some(vec![String::from("b"), String::from("c")])); -/// # Ok(()) -/// # } -/// ``` -pub struct FieldGroupsIter<'data, 'tokens, E> { - key_indices: HashMap<&'data [u8], Vec>, FnvBuildHasher>, - fields: FieldsIter<'data, 'tokens, E>, -} + let mut state = ParseState::None; + let mut ptr = self.buf.start; + loop { + let end = self.buf.end; + let (carry_over, offset) = match state { + ParseState::None => 'eof: loop { + if ptr == end { + break (0, 0); + } -impl<'data, 'tokens, E> FieldGroupsIter<'data, 'tokens, E> -where - E: Encoding + Clone, -{ - fn new(reader: &ObjectReader<'data, 'tokens, E>) -> Self { - // Using the fnv hasher improved throughput of the eu4 json benchmark - // by over 15%. 
- let mut key_indices = - HashMap::with_capacity_and_hasher(reader.fields_len(), FnvBuildHasher::default()); - for (key, op, val) in reader.fields() { - let entry = key_indices.entry(key.read_scalar().as_bytes()); - - match entry { - Entry::Vacant(x) => { - x.insert(Vec::with_capacity(0)); + 'inner: loop { + match *ptr { + c @ b' ' | c @ b'\t' => { + ptr = ptr.add(1); + loop { + if ptr == end { + break 'eof (0, 0); + } + + if *ptr != c { + break; + } + + ptr = ptr.add(1) + } + } + b'\n' | b'\r' | b';' => { + ptr = ptr.add(1); + break 'inner; + } + b'#' => { + let start_ptr = ptr; + ptr = ptr.add(1); + loop { + if ptr == end { + let carry_over = end.offset_from(start_ptr) as usize; + break 'eof (carry_over, 0); + } + + if *ptr == b'\n' { + break; + } + + ptr = ptr.add(1) + } + } + b'{' => { + self.buf.advance_to(ptr.add(1)); + return (Some(Token::Open), None); + } + b'}' => { + self.buf.advance_to(ptr.add(1)); + return (Some(Token::Close), None); + } + b'"' => { + ptr = ptr.add(1); + let start_ptr = ptr; + loop { + if ptr == end { + state = ParseState::Quote; + let carry_over = end.offset_from(start_ptr) as usize; + break 'eof (carry_over, carry_over); + } + + if *ptr == b'\\' { + let advance = end.offset_from(ptr).min(2); + ptr = ptr.offset(advance); + if ptr == end { + state = ParseState::Quote; + let carry_over = end.offset_from(start_ptr) as usize; + break 'eof (carry_over, carry_over.max(2) - 2); + } + } else if *ptr != b'"' { + ptr = ptr.add(1); + } else { + self.buf.advance_to(ptr.add(1)); + let scalar = self.buf.get(start_ptr..ptr); + return (Some(Token::Quoted(scalar)), None); + } + } + } + b'@' => { + let start_ptr = ptr; + ptr = ptr.add(1); + if ptr == end { + break 'eof (1, 0); + } + + if *ptr == b'[' { + ptr = ptr.add(1); + loop { + if ptr == end { + let carry_over = end.offset_from(start_ptr) as usize; + break 'eof (carry_over, 0); + } else if *ptr == b']' { + ptr = ptr.add(1); + self.buf.advance_to(ptr); + let scalar = self.buf.get(start_ptr..ptr); + return (Some(Token::Unquoted(scalar)), None); + } else { + ptr = ptr.add(1); + } + } + } else { + loop { + if ptr == end { + let carry_over = end.offset_from(start_ptr) as usize; + state = ParseState::Unquoted; + break 'eof (carry_over, carry_over); + } else if !is_boundary(*ptr) { + ptr = ptr.add(1); + } else { + self.buf.advance_to(ptr); + let scalar = self.buf.get(start_ptr..ptr); + return (Some(Token::Unquoted(scalar)), None); + } + } + } + } + b'=' => { + ptr = ptr.add(1); + if ptr == end { + break 'eof (1, 0); + } + + if *ptr != b'=' { + self.buf.advance_to(ptr); + return (Some(Token::Operator(Operator::Equal)), None); + } else { + self.buf.advance_to(ptr.add(1)); + return (Some(Token::Operator(Operator::Exact)), None); + } + } + b'<' => { + ptr = ptr.add(1); + if ptr == end { + break 'eof (1, 0); + } + + if *ptr != b'=' { + self.buf.advance_to(ptr); + return (Some(Token::Operator(Operator::LessThan)), None); + } else { + self.buf.advance_to(ptr.add(1)); + return (Some(Token::Operator(Operator::LessThanEqual)), None); + } + } + b'!' => { + ptr = ptr.add(1); + if ptr == end { + break 'eof (1, 0); + } + + if *ptr == b'=' { + ptr = ptr.add(1); + } + + self.buf.advance_to(ptr); + return (Some(Token::Operator(Operator::NotEqual)), None); + } + b'?' 
=> { + ptr = ptr.add(1); + if ptr == end { + break 'eof (1, 0); + } + + if *ptr == b'=' { + ptr = ptr.add(1); + } + + self.buf.advance_to(ptr); + return (Some(Token::Operator(Operator::Exists)), None); + } + b'>' => { + ptr = ptr.add(1); + if ptr == end { + break 'eof (1, 0); + } + + if *ptr != b'=' { + self.buf.advance_to(ptr); + return (Some(Token::Operator(Operator::GreaterThan)), None); + } else { + self.buf.advance_to(ptr.add(1)); + return ( + Some(Token::Operator(Operator::GreaterThanEqual)), + None, + ); + } + } + b'\xef' if matches!(self.utf8, Utf8Bom::Unknown) => { + match self.buf.window().get(..3) { + Some([0xef, 0xbb, 0xbf]) => { + self.utf8 = Utf8Bom::Present; + ptr = ptr.add(3); + break 'inner; + } + Some(_) => self.utf8 = Utf8Bom::NotPresent, + None => break 'eof (self.buf.window_len(), 0), + } + } + _ => { + let start_ptr = ptr; + ptr = ptr.add(1); + loop { + if ptr == end { + state = ParseState::Unquoted; + let carry_over = end.offset_from(start_ptr) as usize; + break 'eof (carry_over, carry_over); + } else if !is_boundary(*ptr) { + ptr = ptr.add(1); + } else { + self.buf.advance_to(ptr); + let scalar = self.buf.get(start_ptr..ptr); + return (Some(Token::Unquoted(scalar)), None); + } + } + } + } + } + }, + ParseState::Quote { .. } => { + while ptr < end { + if *ptr == b'\\' { + let advance = end.offset_from(ptr).min(2); + ptr = ptr.offset(advance); + } else if *ptr != b'"' { + ptr = ptr.add(1); + } else { + self.buf.advance_to(ptr.add(1)); + let scalar = self.buf.get(self.buf.buf.as_ptr()..ptr); + return (Some(Token::Quoted(scalar)), None); + } + } + + // buffer or prior read too small + (self.buf.window_len(), self.buf.window_len()) } - Entry::Occupied(mut x) => { - x.get_mut().push((op, val)); + ParseState::Unquoted { .. } => { + while ptr < end { + if !is_boundary(*ptr) { + ptr = ptr.add(1); + } else { + self.buf.advance_to(ptr); + let scalar = self.buf.get(self.buf.buf.as_ptr()..ptr); + return (Some(Token::Unquoted(scalar)), None); + } + } + + // buffer or prior read too small + (self.buf.window_len(), self.buf.window_len()) } + }; + + self.buf.advance_to(self.buf.end.sub(carry_over)); + match self.buf.fill_buf(&mut self.reader) { + Ok(0) => match state { + ParseState::None => { + // if we carried over data that isn't a comment, we + // should have made forward progress. + if carry_over == 0 || *self.buf.start == b'#' { + return (None, None); + } else { + return (None, Some(self.eof_error())); + } + } + ParseState::Quote { .. } => return (None, Some(self.eof_error())), + ParseState::Unquoted { .. 
} => { + let scalar = std::slice::from_raw_parts(self.buf.start, carry_over); + self.buf.advance_to(self.buf.end); + return (Some(Token::Unquoted(Scalar::new(scalar))), None); + } + }, + Ok(_) => ptr = self.buf.start.add(offset), + Err(e) => return (None, Some(self.buffer_error(e))), } } - - let fields = reader.fields(); - - FieldGroupsIter { - key_indices, - fields, - } } - /// See [the other `remainder` documentation](crate::text::FieldsIter::remainder) - pub fn remainder(&self) -> ArrayReader<'data, 'tokens, E> { - self.fields.remainder() - } -} - -impl<'data, 'tokens, E> Iterator for FieldGroupsIter<'data, 'tokens, E> -where - E: Encoding + Clone, -{ - type Item = KeyValues<'data, 'tokens, E>; - - fn next(&mut self) -> Option { - loop { - let (key, op, value) = self.fields.next()?; - - if let Some((_key, mut entries)) = - self.key_indices.remove_entry(key.read_scalar().as_bytes()) - { - if entries.is_empty() { - return Some((key, GroupEntry::One((op, value)))); - } else { - entries.insert(0, (op, value)); - return Some((key, GroupEntry::Multiple(entries))); - } + /// Advance a given number of bytes and return them. + /// + /// The internal buffer must be large enough to accomodate all bytes. + /// + /// ```rust + /// use jomini::text::{TokenReader, ReaderErrorKind}; + /// let mut reader = TokenReader::new(&b"EU4txt"[..]); + /// assert_eq!(reader.read_bytes(6).unwrap(), &b"EU4txt"[..]); + /// assert!(matches!(reader.read_bytes(1).unwrap_err().kind(), ReaderErrorKind::Eof)); + /// ``` + #[inline] + pub fn read_bytes(&mut self, bytes: usize) -> Result<&[u8], ReaderError> { + while self.buf.window_len() < bytes { + match self.buf.fill_buf(&mut self.reader) { + Ok(0) => return Err(self.eof_error()), + Ok(_) => {} + Err(e) => return Err(self.buffer_error(e)), } } - } - fn size_hint(&self) -> (usize, Option) { - (self.key_indices.len(), None) + let input = unsafe { std::slice::from_raw_parts(self.buf.start, bytes) }; + self.buf.advance(bytes); + Ok(input) } -} -/// Iterator over fields of an object in the order that they appear -/// -/// Since objects can have duplicated keys across fields, this iterator -/// may yield items that have duplicate keys. 
-/// -/// Fields can be easily iterated: -/// -/// ``` -/// use jomini::TextTape; -/// -/// # fn main() -> Result<(), Box> { -/// let tape = TextTape::from_slice(b"name=a core=b core=c")?; -/// let reader = tape.windows1252_reader(); -/// let (names, cores) = reader -/// .fields() -/// .fold((0, 0), |(names, cores), (key, _op, _value)| { -/// match key.read_str().as_ref() { -/// "name" => (names + 1, cores), -/// "core" => (names, cores + 1), -/// x => panic!("unexpected key: {}", x), -/// } -/// }); -/// assert_eq!((1, 2), (names, cores)); -/// # Ok(()) -/// # } -/// ``` -/// -/// And picked apart: -/// -/// ``` -/// use jomini::TextTape; -/// -/// # fn main() -> Result<(), Box> { -/// let tape = TextTape::from_slice(b"name=a core=b core=c")?; -/// let reader = tape.windows1252_reader(); -/// let mut fields = reader.fields(); -/// let (first_key, _op, first_val) = fields.next().unwrap(); -/// assert_eq!(first_key.read_str(), "name"); -/// assert_eq!(first_val.read_str().ok().as_deref(), Some("a")); -/// # Ok(()) -/// # } -/// ``` -pub struct FieldsIter<'data, 'tokens, E> { - token_ind: usize, - end_ind: usize, - tokens: &'tokens [TextToken<'data>], - encoding: E, -} - -impl<'data, 'tokens, E> FieldsIter<'data, 'tokens, E> -where - E: Encoding + Clone, -{ - fn new(reader: &ObjectReader<'data, 'tokens, E>) -> Self { - FieldsIter { - token_ind: reader.start_ind, - end_ind: reader.end_ind, - tokens: reader.tokens, - encoding: reader.encoding.clone(), + /// Advance through the containing block until the closing token is consumed + /// + /// ```rust + /// use jomini::{Scalar, text::{TokenReader, Token, Operator}}; + /// let mut reader = TokenReader::new(&b"foo={{bar={}}} qux=1"[..]); + /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"foo"))); + /// assert_eq!(reader.read().unwrap(), Token::Operator(Operator::Equal)); + /// assert_eq!(reader.read().unwrap(), Token::Open); + /// assert!(reader.skip_container().is_ok()); + /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"qux"))); + /// assert_eq!(reader.read().unwrap(), Token::Operator(Operator::Equal)); + /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"1"))); + /// ``` + #[inline] + pub fn skip_container(&mut self) -> Result<(), ReaderError> { + enum SkipState { + None, + Quote, + Comment, } - } - /// Returns the remaining values from an object if the container is an - /// object that transitions into an array. - pub fn remainder(&self) -> ArrayReader<'data, 'tokens, E> { - let start = self - .tokens - .get(self.token_ind) - .map(|x| match x { - TextToken::MixedContainer => self.token_ind + 1, - TextToken::End(y) => { - if let Some(TextToken::Array { .. }) = self.tokens.get(*y) { - *y + 1 - } else { - self.token_ind + let mut state = SkipState::None; + let mut depth = 1; + let mut ptr = self.buf.start; + loop { + let end = self.buf.end; + unsafe { + 'refill: loop { + match state { + SkipState::None => 'new_state: loop { + while end.offset_from(ptr) > 8 { + // process 8 bytes at a time, which reduced + // latency of this function in EU4 saves by 50% + // (a 7% reduction overall). 
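+                            // (editor's note) `data ^ repeat_byte(b)` zeroes
+                            // exactly the bytes equal to `b`, so the classic
+                            // SWAR `contains_zero_byte` check below detects a
+                            // quote, comment, or brace byte in the chunk
+                            // without inspecting each byte individually.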
+ let data = ptr.cast::().read_unaligned(); + let has_quote = contains_zero_byte(data ^ repeat_byte(b'"')); + let has_comment = contains_zero_byte(data ^ repeat_byte(b'#')); + if has_quote || has_comment { + break; + } + + let has_close = contains_zero_byte(data ^ repeat_byte(b'}')); + let closes = if has_close { + count_chunk(data, b'}') as i32 + } else { + 0 + }; + + let new_depth = depth - closes; + if new_depth < 1 { + break; + } + depth = new_depth; + + let has_open = contains_zero_byte(data ^ repeat_byte(b'{')); + let opens = if has_open { + count_chunk(data, b'{') as i32 + } else { + 0 + }; + + depth += opens; + ptr = ptr.add(8); + } + + if ptr == end { + break 'refill; + } + + let val = *ptr; + ptr = ptr.add(1); + match val { + b'{' => depth += 1, + b'}' => { + depth -= 1; + if depth == 0 { + self.buf.advance_to(ptr); + return Ok(()); + } + } + b'"' => { + state = SkipState::Quote; + break 'new_state; + } + b'#' => { + state = SkipState::Comment; + break 'new_state; + } + _ => {} + } + }, + SkipState::Quote => loop { + if ptr == end { + break 'refill; + } + + if *ptr == b'\\' { + if end.offset_from(ptr) <= 2 { + break 'refill; + } + ptr = ptr.add(2); + } else if *ptr != b'"' { + ptr = ptr.add(1); + } else { + ptr = ptr.add(1); + state = SkipState::None; + break; + } + }, + SkipState::Comment => loop { + if ptr == end { + break 'refill; + } + + if *ptr == b'\n' { + ptr = ptr.add(1); + state = SkipState::None; + break; + } + + ptr = ptr.add(1) + }, } } - _ => self.token_ind, - }) - .unwrap_or(self.end_ind); + } - ArrayReader { - start_ind: start, - end_ind: self.end_ind, - encoding: self.encoding.clone(), - tokens: self.tokens, + self.buf.advance_to(ptr); + match self.buf.fill_buf(&mut self.reader) { + Ok(0) => return Err(self.eof_error()), + Err(e) => return Err(self.buffer_error(e)), + Ok(_) => ptr = self.buf.start, + } } } -} -impl<'data, 'tokens, E> Iterator for FieldsIter<'data, 'tokens, E> -where - E: Encoding + Clone, -{ - type Item = KeyValue<'data, 'tokens, E>; - - fn next(&mut self) -> Option { - if self.token_ind >= self.end_ind { - return None; - } + /// Skip any trailing data associated with the unquoted value. Useful for + /// skipping an unquoted value that may be serving as a header. + /// + /// In the below example the `rgb { 1 2 3 }` will first be parsed as + /// unquoted `rgb`, but the `{ 1 2 3 }` needs to be skipped as well as it is + /// tied to `rgb`. 
+ /// + /// ```rust + /// use jomini::{Scalar, text::{TokenReader, Token, Operator}}; + /// let mut reader = TokenReader::new(&b"color = rgb { 1 2 3 } foo=bar"[..]); + /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"color"))); + /// assert_eq!(reader.read().unwrap(), Token::Operator(Operator::Equal)); + /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"rgb"))); + /// assert!(reader.skip_unquoted_value().is_ok()); + /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"foo"))); + /// assert_eq!(reader.read().unwrap(), Token::Operator(Operator::Equal)); + /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"bar"))); + /// ``` + #[inline] + pub fn skip_unquoted_value(&mut self) -> Result<(), ReaderError> { + loop { + unsafe { + let mut ptr = self.buf.start; + let end = self.buf.end; - let key_ind = self.token_ind; - let token = self.tokens[key_ind].clone(); - let key_scalar = match token { - TextToken::Quoted(x) - | TextToken::Unquoted(x) - | TextToken::Parameter(x) - | TextToken::UndefinedParameter(x) => x, - TextToken::MixedContainer => { - return None; - } - _ => { - // this is a broken invariant, so we safely recover by saying the object - // has no more fields - debug_assert!(false, "All keys should be scalars, not {:?}", &token); - return None; - } - }; - - let key_reader = ScalarReader { - scalar: key_scalar, - token, - encoding: self.encoding.clone(), - }; - - let (op, value_ind) = match self.tokens[key_ind + 1] { - TextToken::Operator(x) => (Some(x), key_ind + 2), - _ => (None, key_ind + 1), - }; - - let value_reader = ValueReader { - value_ind, - tokens: self.tokens, - encoding: self.encoding.clone(), - }; - self.token_ind = next_idx(self.tokens, value_ind); - Some((key_reader, op, value_reader)) - } + if end.offset_from(ptr) >= 4 { + let word = ptr.cast::().read_unaligned().to_le(); - fn size_hint(&self) -> (usize, Option) { - let len = fields_len(self.tokens, self.token_ind, self.end_ind); - (len, None) - } -} + // 50% of EU4 values followed by this whitespace sequence + if word == 0x0909090A { + // \n\t\t\t + ptr = ptr.add(4); + } + } -/// A reader for objects -#[derive(Debug, Clone)] -pub struct ObjectReader<'data, 'tokens, E> { - start_ind: usize, - end_ind: usize, - tokens: &'tokens [TextToken<'data>], - encoding: E, -} + while ptr < end { + match *ptr { + b'{' => { + self.buf.advance_to(ptr.add(1)); + return self.skip_container(); + } + b' ' | b'\t' | b'\n' | b'\r' | b';' => { + ptr = ptr.add(1); + } + _ => return Ok(()), + } + } -impl<'data, 'tokens, E> ObjectReader<'data, 'tokens, E> -where - E: Encoding + Clone, -{ - /// Create a new object reader from parsed data with encoded strings - pub fn new(tape: &'tokens TextTape<'data>, encoding: E) -> Self { - let tokens = tape.tokens(); - ObjectReader { - tokens, - end_ind: tokens.len(), - start_ind: 0, - encoding, + self.buf.advance_to(end); + match self.buf.fill_buf(&mut self.reader) { + Ok(0) => return Ok(()), + Err(e) => return Err(self.buffer_error(e)), + Ok(_) => {} + } + } } } - /// Return the number of tokens contained within the object + /// Consume the token reader and return the internal buffer and reader. This + /// allows the buffer to be reused. 
/// - /// ``` - /// use jomini::TextTape; + /// ```rust + /// use jomini::text::{TokenReader}; + /// let data = b"EU4txt"; + /// let mut reader = TokenReader::new(&data[..]); + /// assert_eq!(reader.read_bytes(6).unwrap(), &data[..]); /// - /// # fn main() -> Result<(), Box> { - /// let tape = TextTape::from_slice(b"obj={1} foo=bar")?; - /// let reader = tape.windows1252_reader(); - /// assert_eq!(reader.tokens_len(), 6); - /// # Ok(()) - /// # } + /// let (buf, _) = reader.into_parts(); + /// let data = b"HOI4txt"; + /// let mut reader = TokenReader::builder().buffer(buf).build(&data[..]); + /// assert_eq!(reader.read_bytes(7).unwrap(), &data[..]); /// ``` - pub fn tokens_len(&self) -> usize { - self.end_ind - self.start_ind + #[inline] + pub fn into_parts(self) -> (Box<[u8]>, R) { + (self.buf.buf, self.reader) } - /// Deserialize from the object reader + /// Read the next token in the stream. Will error if not enough data remains + /// to decode a token. /// + /// ```rust + /// use jomini::{Scalar, text::{TokenReader, Token, ReaderErrorKind, Operator}}; + /// let mut reader = TokenReader::new(&b"date=1444.11.11"[..]); + /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"date"))); + /// assert_eq!(reader.read().unwrap(), Token::Operator(Operator::Equal)); + /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"1444.11.11"))); + /// assert!(matches!(reader.read().unwrap_err().kind(), ReaderErrorKind::Eof)); /// ``` - /// use jomini::TextTape; - /// use serde::Deserialize; - /// - /// # fn main() -> Result<(), Box> { - /// #[derive(Debug, Clone, Deserialize, PartialEq)] - /// pub struct Obj { - /// foo: String, - /// } - /// - /// let tape = TextTape::from_slice(b"obj={foo=bar}")?; - /// let reader = tape.windows1252_reader(); - /// let mut fields = reader.fields(); - /// let (_, _, obj_value) = fields.next().unwrap(); - /// let obj_reader = obj_value.read_object().unwrap(); - /// let result: Obj = obj_reader.deserialize().unwrap(); - /// assert_eq!(result, Obj { foo: "bar".to_string() }); - /// # Ok(()) - /// # } - /// ``` - #[cfg(feature = "derive")] - pub fn deserialize(&self) -> Result - where - T: serde::Deserialize<'data>, - { - T::deserialize(&crate::TextDeserializer::from_reader(self)) - } - - /// Return the number of key value pairs that the object contains. 
- pub fn fields_len(&self) -> usize { - fields_len(self.tokens, self.start_ind, self.end_ind) - } - - /// Iterator over fields as they appear in the object - /// - /// See [FieldsIter](crate::text::FieldsIter) for a worked example #[inline] - pub fn fields(&self) -> FieldsIter<'data, 'tokens, E> { - FieldsIter::new(self) + pub fn read(&mut self) -> Result { + // Workaround for borrow checker :( + let s = unsafe { &mut *(self as *mut TokenReader) }; + match unsafe { self.next_opt() } { + (Some(x), _) => Ok(x), + (None, None) => Err(s.eof_error()), + (None, Some(e)) => Err(e), + } } - /// Iterator over fields that are grouped by key + /// Read a token, returning none when all the data has been consumed /// - /// See [FieldGroupsIter](crate::text::FieldGroupsIter) for a worked example - #[inline] - pub fn field_groups(&self) -> FieldGroupsIter<'data, 'tokens, E> { - FieldGroupsIter::new(self) - } -} - -/// A text reader that wraps an underlying scalar value -#[derive(Debug, Clone)] -pub struct ScalarReader<'data, E> { - scalar: Scalar<'data>, - token: TextToken<'data>, - encoding: E, -} - -impl<'data, E> ScalarReader<'data, E> -where - E: Encoding, -{ - /// Decode the data with a given string encoding - #[inline] - pub fn read_str(&self) -> Cow<'data, str> { - self.encoding.decode(self.scalar.as_bytes()) - } - - /// Decode the data with a given string encoding + /// ```rust + /// use jomini::{Scalar, text::{TokenReader, Token, Operator}}; + /// let mut reader = TokenReader::new(&b"date=1444.11.11"[..]); + /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"date"))); + /// assert_eq!(reader.read().unwrap(), Token::Operator(Operator::Equal)); + /// assert_eq!(reader.read().unwrap(), Token::Unquoted(Scalar::new(b"1444.11.11"))); + /// assert_eq!(reader.next().unwrap(), None); + /// ``` #[inline] - pub fn read_string(&self) -> String { - self.encoding.decode(self.scalar.as_bytes()).into_owned() + pub fn next(&mut self) -> Result, ReaderError> { + match unsafe { self.next_opt() } { + (Some(x), _) => Ok(Some(x)), + (None, None) => Ok(None), + (None, Some(e)) => Err(e), + } } - /// Return the underlying scalar - #[inline] - pub fn read_scalar(&self) -> Scalar<'data> { - self.scalar + #[cold] + #[inline(never)] + pub(crate) fn eof_error(&self) -> ReaderError { + ReaderError { + position: self.position(), + kind: ReaderErrorKind::Eof, + } } - /// Return the token that the reader is abstracting - #[inline] - pub fn token(&self) -> &TextToken<'data> { - &self.token + #[cold] + #[inline(always)] + fn buffer_error(&self, e: BufferError) -> ReaderError { + ReaderError { + position: self.position(), + kind: ReaderErrorKind::from(e), + } } } -/// A text reader for a text value -#[derive(Debug, Clone)] -pub struct ValueReader<'data, 'tokens, E> { - value_ind: usize, - tokens: &'tokens [TextToken<'data>], - encoding: E, -} - -impl<'data, 'tokens, E> ValueReader<'data, 'tokens, E> { - /// Return the token that the reader is abstracting - #[inline] - pub fn token(&self) -> &TextToken<'data> { - &self.tokens[self.value_ind] - } - - #[cfg(feature = "derive")] - pub(crate) fn next(&mut self) -> Option<&TextToken<'data>> { - self.value_ind += 1; - self.tokens.get(self.value_ind) +impl TokenReader<()> { + /// Initializes a default [TokenReaderBuilder] + pub fn builder() -> TokenReaderBuilder { + TokenReaderBuilder::default() } } -impl<'data, 'tokens, E> Encoding for ValueReader<'data, 'tokens, E> -where - E: Encoding, -{ - #[inline] - fn decode<'a>(&self, data: &'a [u8]) -> Cow<'a, str> { - 
self.encoding.decode(data) - } +/// Creates a text token reader +#[derive(Debug, Default)] +pub struct TokenReaderBuilder { + buffer: BufferWindowBuilder, } -impl<'data, 'tokens, E> ValueReader<'data, 'tokens, E> -where - E: Encoding + Clone, -{ - fn raw_str(&self) -> Option> { - match self.tokens[self.value_ind] { - TextToken::Header(s) - | TextToken::Unquoted(s) - | TextToken::Quoted(s) - | TextToken::Parameter(s) - | TextToken::UndefinedParameter(s) => Some(self.encoding.decode(s.as_bytes())), - TextToken::Operator(s) => Some(Cow::Borrowed(s.symbol())), - _ => None, - } - } - - /// Interpret the current value as string - #[inline] - pub fn read_str(&self) -> Result, DeserializeError> { - self.raw_str().ok_or_else(|| DeserializeError { - kind: DeserializeErrorKind::Unsupported(String::from("not a string")), - }) - } - - /// Interpret the current value as string +impl TokenReaderBuilder { + /// Set the fixed size buffer to the given buffer #[inline] - pub fn read_string(&self) -> Result { - self.raw_str() - .map(String::from) - .ok_or_else(|| DeserializeError { - kind: DeserializeErrorKind::Unsupported(String::from("not a string")), - }) + pub fn buffer(mut self, val: Box<[u8]>) -> TokenReaderBuilder { + self.buffer = self.buffer.buffer(val); + self } - /// Interpret the current value as a scalar + /// Set the length of the buffer if no buffer is provided #[inline] - pub fn read_scalar(&self) -> Result, DeserializeError> { - self.tokens[self.value_ind] - .as_scalar() - .ok_or_else(|| DeserializeError { - kind: DeserializeErrorKind::Unsupported(String::from("not a scalar")), - }) + pub fn buffer_len(mut self, val: usize) -> TokenReaderBuilder { + self.buffer = self.buffer.buffer_len(val); + self } - /// Interpret the current value as an object + /// Create a text token reader around a given reader. #[inline] - pub fn read_object(&self) -> Result, DeserializeError> { - match self.tokens[self.value_ind] { - TextToken::Object { end, .. } => Ok(ObjectReader { - tokens: self.tokens, - start_ind: self.value_ind + 1, - end_ind: end, - encoding: self.encoding.clone(), - }), - - TextToken::Array { end, .. } => Ok(ObjectReader { - tokens: self.tokens, - start_ind: end, - end_ind: end, - encoding: self.encoding.clone(), - }), - - _ => Err(DeserializeError { - kind: DeserializeErrorKind::Unsupported(String::from("not an object")), - }), - } - } - - /// Interpret the current value as an array - #[inline] - pub fn read_array(&self) -> Result, DeserializeError> { - match self.tokens[self.value_ind] { - TextToken::Object { end, mixed: true } => { - let mut start_ind = self.value_ind + 1; - while self.tokens.get(start_ind) != Some(&TextToken::MixedContainer) { - start_ind = next_idx(self.tokens, start_ind); - } - - Ok(ArrayReader { - tokens: self.tokens, - start_ind: start_ind + 1, - end_ind: end, - encoding: self.encoding.clone(), - }) - } - TextToken::Array { end, .. } | TextToken::Object { end, .. 
-    /// Interpret the current value as an object
+    /// Create a text token reader around a given reader.
     #[inline]
-    pub fn read_object(&self) -> Result<ObjectReader<'data, 'tokens, E>, DeserializeError> {
-        match self.tokens[self.value_ind] {
-            TextToken::Object { end, .. } => Ok(ObjectReader {
-                tokens: self.tokens,
-                start_ind: self.value_ind + 1,
-                end_ind: end,
-                encoding: self.encoding.clone(),
-            }),
-
-            TextToken::Array { end, .. } => Ok(ObjectReader {
-                tokens: self.tokens,
-                start_ind: end,
-                end_ind: end,
-                encoding: self.encoding.clone(),
-            }),
-
-            _ => Err(DeserializeError {
-                kind: DeserializeErrorKind::Unsupported(String::from("not an object")),
-            }),
-        }
-    }
-
-    /// Interpret the current value as an array
-    #[inline]
-    pub fn read_array(&self) -> Result<ArrayReader<'data, 'tokens, E>, DeserializeError> {
-        match self.tokens[self.value_ind] {
-            TextToken::Object { end, mixed: true } => {
-                let mut start_ind = self.value_ind + 1;
-                while self.tokens.get(start_ind) != Some(&TextToken::MixedContainer) {
-                    start_ind = next_idx(self.tokens, start_ind);
-                }
-
-                Ok(ArrayReader {
-                    tokens: self.tokens,
-                    start_ind: start_ind + 1,
-                    end_ind: end,
-                    encoding: self.encoding.clone(),
-                })
-            }
-            TextToken::Array { end, .. } | TextToken::Object { end, .. } => Ok(ArrayReader {
-                tokens: self.tokens,
-                start_ind: self.value_ind + 1,
-                end_ind: end,
-                encoding: self.encoding.clone(),
-            }),
-
-            // A header can be seen as a two element array
-            TextToken::Header(_) => Ok(ArrayReader {
-                tokens: self.tokens,
-                start_ind: self.value_ind,
-                end_ind: next_idx(self.tokens, self.value_ind + 1),
-                encoding: self.encoding.clone(),
-            }),
-
-            _ => Err(DeserializeError {
-                kind: DeserializeErrorKind::Unsupported(String::from("not an array")),
-            }),
-        }
-    }
-
-    /// Return the number of tokens the value encompasses
-    ///
-    /// ```
-    /// use jomini::TextTape;
-    ///
-    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
-    /// let tape = TextTape::from_slice(b"obj={1 {foo=bar} 3}")?;
-    /// let reader = tape.windows1252_reader();
-    /// let mut fields = reader.fields();
-    /// let (_, _, first_value) = fields.next().unwrap();
-    /// assert_eq!(first_value.tokens_len(), 6);
-    /// # Ok(())
-    /// # }
-    /// ```
-    #[inline]
-    pub fn tokens_len(&self) -> usize {
-        match self.tokens[self.value_ind] {
-            TextToken::Array { end, .. } | TextToken::Object { end, .. } => {
-                end - self.value_ind - 1
-            }
-            _ => 1,
+    pub fn build<R>(self, reader: R) -> TokenReader<R> {
+        let buf = self.buffer.build();
+        TokenReader {
+            reader,
+            buf,
+            utf8: Utf8Bom::Unknown,
         }
     }
 }

-/// An iterator over the values of an array
-///
-/// ```
-/// use jomini::TextTape;
-///
-/// # fn main() -> Result<(), Box<dyn std::error::Error>> {
-/// let tape = TextTape::from_slice(b"cores={a b}")?;
-/// let reader = tape.windows1252_reader();
-///
-/// let mut all_cores = Vec::new();
-/// for (key, _op, value) in reader.fields() {
-///     assert_eq!(key.read_str(), "cores");
-///     let cores = value.read_array()?;
-///     assert_eq!(cores.len(), 2);
-///     for value in cores.values() {
-///         all_cores.push(value.read_string()?);
-///     }
-/// }
-/// assert_eq!(all_cores, vec![String::from("a"), String::from("b")]);
-/// # Ok(())
-/// # }
-/// ```
-pub struct ValuesIter<'data, 'tokens, E> {
-    token_ind: usize,
-    end_ind: usize,
-    tokens: &'tokens [TextToken<'data>],
-    encoding: E,
-}
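+// A minimal usage sketch of the builder: wrap any `Read` implementation and
+// pull tokens until exhaustion. The file name below is only illustrative.
+//
+//     let file = std::fs::File::open("gamestate")?;
+//     let mut reader = TokenReader::builder().buffer_len(64 * 1024).build(file);
+//     while let Some(_token) = reader.next()? {
+//         // process each token here
+//     }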
+/// The specific text reader error type.
+#[derive(Debug)]
+pub enum ReaderErrorKind {
+    /// An underlying error from a [Read]er
+    Read(std::io::Error),

-impl<'data, 'tokens, E> ValuesIter<'data, 'tokens, E>
-where
-    E: Encoding + Clone,
-{
-    fn new(reader: &ArrayReader<'data, 'tokens, E>) -> Self {
-        ValuesIter {
-            token_ind: reader.start_ind,
-            end_ind: reader.end_ind,
-            tokens: reader.tokens,
-            encoding: reader.encoding.clone(),
-        }
-    }
+    /// The internal buffer does not have enough room to store data for the
+    /// next token
+    BufferFull,
+
+    /// An early end of the data was encountered
+    Eof,
 }

-impl<'data, 'tokens, E> Iterator for ValuesIter<'data, 'tokens, E>
-where
-    E: Encoding + Clone,
-{
-    type Item = ValueReader<'data, 'tokens, E>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        if self.token_ind < self.end_ind {
-            let value_ind = self.token_ind;
-            self.token_ind = next_idx_values(self.tokens, self.token_ind);
-            Some(ValueReader {
-                value_ind,
-                tokens: self.tokens,
-                encoding: self.encoding.clone(),
-            })
-        } else {
-            None
+impl From<BufferError> for ReaderErrorKind {
+    #[inline]
+    fn from(value: BufferError) -> Self {
+        match value {
+            BufferError::Io(x) => ReaderErrorKind::Read(x),
+            BufferError::BufferFull => ReaderErrorKind::BufferFull,
         }
     }
-
-    fn size_hint(&self) -> (usize, Option<usize>) {
-        let len = values_len(self.tokens, self.token_ind, self.end_ind);
-        (len, Some(len))
-    }
 }

-/// A text reader for sequences of values
-#[derive(Debug, Clone)]
-pub struct ArrayReader<'data, 'tokens, E> {
-    start_ind: usize,
-    end_ind: usize,
-    tokens: &'tokens [TextToken<'data>],
-    encoding: E,
+/// A text lexing error over a `Read` implementation
+#[derive(Debug)]
+pub struct ReaderError {
+    position: usize,
+    kind: ReaderErrorKind,
 }

-impl<'data, 'tokens, E> ArrayReader<'data, 'tokens, E>
-where
-    E: Encoding + Clone,
-{
-    /// Iterator over values of an array
-    ///
-    /// See [ValuesIter](crate::text::ValuesIter) for a worked example
-    #[inline]
-    pub fn values(&self) -> ValuesIter<'data, 'tokens, E> {
-        ValuesIter::new(self)
-    }
-
-    /// Returns whether the array is empty
-    pub fn is_empty(&self) -> bool {
-        self.len() == 0
+impl ReaderError {
+    /// Return the byte position where the error occurred
+    pub fn position(&self) -> usize {
+        self.position
     }

-    /// Return the number of values in the array
-    #[inline]
-    pub fn len(&self) -> usize {
-        values_len(self.tokens, self.start_ind, self.end_ind)
+    /// Return a reference to the error kind
+    pub fn kind(&self) -> &ReaderErrorKind {
+        &self.kind
     }

-    /// Return the number of tokens contained within the object
-    ///
-    /// ```
-    /// use jomini::TextTape;
-    ///
-    /// # fn main() -> Result<(), Box<dyn std::error::Error>> {
-    /// let tape = TextTape::from_slice(b"obj={1 {foo=bar} 3}")?;
-    /// let reader = tape.windows1252_reader();
-    /// let mut fields = reader.fields();
-    /// let (_, _, first_value) = fields.next().unwrap();
-    /// let array = first_value.read_array()?;
-    /// assert_eq!(array.tokens_len(), 6);
-    /// # Ok(())
-    /// # }
-    /// ```
-    #[inline]
-    pub fn tokens_len(&self) -> usize {
-        self.end_ind - self.start_ind
+    /// Consume self and return the error kind
+    #[must_use]
+    pub fn into_kind(self) -> ReaderErrorKind {
+        self.kind
     }
 }

 #[cfg(test)]
-mod tests {
+mod test {
     use super::*;
-
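+    // Error-reporting sketch: a `ReaderError` pairs the failure kind with the
+    // byte offset where lexing stopped, and `BufferError::Io` surfaces as
+    // `ReaderErrorKind::Read` through the `From` impl above.
+    //
+    //     match reader.read() {
+    //         Ok(token) => { /* use token */ }
+    //         Err(e) => eprintln!("lex error at byte {}: {:?}", e.position(), e.kind()),
+    //     }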
-    fn read_value<E>(value: ValueReader<E>)
-    where
-        E: crate::Encoding + Clone,
-    {
-        match value.token() {
-            TextToken::Object { .. } => {
-                iterate_object(value.read_object().unwrap());
-                iterate_array(value.read_array().unwrap());
-            }
-            TextToken::Array { .. } => {
-                iterate_object(value.read_object().unwrap());
-                iterate_array(value.read_array().unwrap());
-            }
-            TextToken::End(_) => panic!("end!?"),
-            TextToken::Operator(_) => {}
-            TextToken::MixedContainer => {}
-            TextToken::Unquoted(_)
-            | TextToken::Quoted(_)
-            | TextToken::Header(_)
-            | TextToken::Parameter(_)
-            | TextToken::UndefinedParameter(_) => {
-                let _ = value.read_str().unwrap();
-            }
-        }
-    }
-
-    fn iterate_array<E>(reader: ArrayReader<E>)
-    where
-        E: crate::Encoding + Clone,
-    {
-        for value in reader.values() {
-            read_value(value);
-        }
-    }
-
-    fn iterate_object<E>(reader: ObjectReader<E>)
-    where
-        E: crate::Encoding + Clone,
-    {
-        for (_key, group) in reader.field_groups() {
-            for (_op, value) in group.values() {
-                read_value(value);
-            }
-        }
-
-        let mut fields = reader.fields();
-        for (key, _op, value) in fields.by_ref() {
-            let _ = key.read_str();
-            read_value(value);
-        }
-    }
-
-    #[test]
-    fn simple_text_reader_text() {
-        let data = b"foo=bar";
-        let tape = TextTape::from_slice(data).unwrap();
-        let reader = tape.windows1252_reader();
-        assert_eq!(reader.fields_len(), 1);
-
-        let mut iter = reader.fields();
-        let (key, _op, value) = iter.next().unwrap();
-        assert_eq!(key.read_string(), String::from("foo"));
-        assert_eq!(value.read_string().unwrap(), String::from("bar"));
-
-        assert!(iter.next().is_none());
-    }
-
-    #[test]
-    fn simple_text_reader_obj() {
-        let data = b"foo={bar=qux}";
-        let tape = TextTape::from_slice(data).unwrap();
-        let reader = tape.windows1252_reader();
-
-        let mut iter = reader.fields();
-        let (key, _op, value) = iter.next().unwrap();
-        assert_eq!(key.read_string(), String::from("foo"));
-
-        let nested = value.read_object().unwrap();
-        let mut nested_iter = nested.fields();
-        let (key2, _op, value2) = nested_iter.next().unwrap();
-        assert_eq!(key2.read_string(), String::from("bar"));
-        assert_eq!(value2.read_string().unwrap(), String::from("qux"));
-        assert!(nested_iter.next().is_none());
-        assert!(iter.next().is_none());
-    }
-
-    #[test]
-    fn simple_text_reader_array() {
-        let data = b"foo={bar qux}";
-        let tape = TextTape::from_slice(data).unwrap();
-        let reader = tape.windows1252_reader();
-
-        let mut iter = reader.fields();
-        let (key, _op, value) = iter.next().unwrap();
-        assert_eq!(key.read_string(), String::from("foo"));
-
-        let nested = value.read_array().unwrap();
-        let mut values = nested.values();
-        assert_eq!(nested.len(), 2);
-        let value1 = values.next().unwrap().read_string().unwrap();
-        let value2 = values.next().unwrap().read_string().unwrap();
-
-        assert!(values.next().is_none());
-        assert_eq!(value1, String::from("bar"));
-        assert_eq!(value2, String::from("qux"));
-    }
-
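+    // A streaming sketch of the same shape the removed tape-based tests cover:
+    // matching on `Token` variants directly and converting scalars with
+    // `Scalar::to_u64` (the byte input here is made up for illustration).
+    //
+    //     let mut reader = TokenReader::new(&b"core=123 core=456"[..]);
+    //     let mut sum = 0;
+    //     while let Some(token) = reader.next().unwrap() {
+    //         if let Token::Unquoted(s) = token {
+    //             sum += s.to_u64().unwrap_or(0);
+    //         }
+    //     }
+    //     assert_eq!(sum, 579);
+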
-    #[test]
-    fn text_reader_read_fields() {
-        let data = b"name=aaa name=bbb core=123 core=456 name=ccc name=ddd";
-        let tape = TextTape::from_slice(data).unwrap();
-        let reader = tape.windows1252_reader();
-
-        let mut field_groups = reader.field_groups();
-        let (key, values) = field_groups.next().unwrap();
-        assert_eq!(key.read_string(), String::from("name"));
-
-        let values = values.values().collect::<Vec<_>>();
-        assert_eq!(values.len(), 4);
-        assert_eq!(values[0].1.read_string().unwrap(), String::from("aaa"));
-        assert_eq!(values[1].1.read_string().unwrap(), String::from("bbb"));
-        assert_eq!(values[2].1.read_string().unwrap(), String::from("ccc"));
-        assert_eq!(values[3].1.read_string().unwrap(), String::from("ddd"));
-
-        let (key, values) = field_groups.next().unwrap();
-        assert_eq!(key.read_string(), String::from("core"));
-
-        let values = values.values().collect::<Vec<_>>();
-        assert_eq!(values.len(), 2);
-        assert_eq!(values[0].1.read_string().unwrap(), String::from("123"));
-        assert_eq!(values[1].1.read_string().unwrap(), String::from("456"));
-    }
-
-    #[test]
-    fn text_reader_read_fields_nested() {
-        let data =
-            b"army={name=aaa unit={name=bbb} unit={name=ccc}} army={name=ddd unit={name=eee}}";
-        let tape = TextTape::from_slice(data).unwrap();
-        let reader = tape.windows1252_reader();
-        let mut field_groups = reader.field_groups();
-
-        let (key, army_values) = field_groups.next().unwrap();
-        assert_eq!(key.read_string(), String::from("army"));
-        assert_eq!(army_values.len(), 2);
-
-        let army_values = army_values.values().collect::<Vec<_>>();
-        let aaa = army_values[0].1.read_object().unwrap();
-        let mut aaa_groups = aaa.field_groups();
-        assert_eq!(aaa.fields_len(), 3);
-
-        let (key, values) = aaa_groups.next().unwrap();
-        assert_eq!(key.read_string(), String::from("name"));
-        assert_eq!(values.len(), 1);
-        assert_eq!(
-            values.values().nth(0).unwrap().1.read_string().unwrap(),
-            String::from("aaa")
-        );
-
-        let (key, values) = aaa_groups.next().unwrap();
-        assert_eq!(key.read_string(), String::from("unit"));
-        assert_eq!(values.len(), 2);
-
-        let bbb = values.values().nth(0).unwrap().1.read_object().unwrap();
-        let mut bbb_fields = bbb.fields();
-        let (key, _, value) = bbb_fields.next().unwrap();
-        assert_eq!(key.read_string(), String::from("name"));
-        assert_eq!(value.read_string().unwrap(), String::from("bbb"));
-
-        let ccc = values.values().nth(1).unwrap().1.read_object().unwrap();
-        let mut ccc_fields = ccc.fields();
-        let (key, _, value) = ccc_fields.next().unwrap();
-        assert_eq!(key.read_string(), String::from("name"));
-        assert_eq!(value.read_string().unwrap(), String::from("ccc"));
-
-        let ddd = army_values[1].1.read_object().unwrap();
-        assert_eq!(ddd.fields_len(), 2);
-
-        let mut ddd_groups = ddd.field_groups();
-        let (key, values) = ddd_groups.next().unwrap();
-        assert_eq!(key.read_string(), String::from("name"));
-        assert_eq!(values.len(), 1);
+    use rstest::*;
+
+    #[rstest]
+    #[case(b"\"hello world\"")]
+    #[case(b" \"hello world\"")]
+    #[case(b"  \"hello world\"")]
+    #[case(b"\t\"hello world\"")]
+    #[case(b"\t\t\"hello world\"")]
+    #[case(b"\r\n\"hello world\"")]
+    #[case(b"\r\n\r\n\"hello world\"")]
+    #[case(b"\n\"hello world\"")]
+    #[case(b"\n\n\"hello world\"")]
+    #[case(b" ; \"hello world\"")]
+    #[case(b" # good morning\n \"hello world\"")]
+    #[case(b" # good morning\r\n \"hello world\"")]
+    fn test_whitespace_quoted_scalar(#[case] input: &[u8]) {
+        let mut reader = TokenReader::new(input);
         assert_eq!(
-            values.values().nth(0).unwrap().1.read_string().unwrap(),
-            String::from("ddd")
+            reader.read().unwrap(),
+            Token::Quoted(Scalar::new(b"hello world"))
         );
-
-        let (key, values) = ddd_groups.next().unwrap();
-        assert_eq!(key.read_string(), String::from("unit"));
-        assert_eq!(values.len(), 1);
-
-        let eee = values.values().nth(0).unwrap().1.read_object().unwrap();
-        let mut eee_fields = eee.fields();
-        let (key, _, value) = eee_fields.next().unwrap();
-        assert_eq!(key.read_string(), String::from("name"));
-        assert_eq!(value.read_string().unwrap(), String::from("eee"));
-    }
-
-    #[test]
-    fn text_reader_read_fields_consume() {
-        let data = b"name=aaa name=bbb core=123 name=ccc name=ddd";
-        let tape = TextTape::from_slice(data).unwrap();
-        let reader = tape.windows1252_reader();
-        let mut count = 0;
-        for (_key, entries) in reader.field_groups() {
-            for (_i, (_op, value)) in entries.values().enumerate() {
-                count += value.read_scalar().map(|_|
1).unwrap_or(0); - } + assert!(reader.read().is_err()); + } + + #[rstest] + #[case(b" a=b ", &[ + Token::Unquoted(Scalar::new(b"a")), + Token::Operator(Operator::Equal), + Token::Unquoted(Scalar::new(b"b")), + ])] + #[case(b" open={1 2}", &[ + Token::Unquoted(Scalar::new(b"open")), + Token::Operator(Operator::Equal), + Token::Open, + Token::Unquoted(Scalar::new(b"1")), + Token::Unquoted(Scalar::new(b"2")), + Token::Close, + ])] + #[case(b"field1=-100.535 ", &[ + Token::Unquoted(Scalar::new(b"field1")), + Token::Operator(Operator::Equal), + Token::Unquoted(Scalar::new(b"-100.535")), + ])] + #[case(b"field1=-100.535", &[ + Token::Unquoted(Scalar::new(b"field1")), + Token::Operator(Operator::Equal), + Token::Unquoted(Scalar::new(b"-100.535")), + ])] + #[case(b"dlc_enabled={\n\t\"Cop\"\n\t\"WoN\"\n\t\"RP\"\n\t\"AoW\"\n\t\"ED\"\n}", &[ + Token::Unquoted(Scalar::new(b"dlc_enabled")), + Token::Operator(Operator::Equal), + Token::Open, + Token::Quoted(Scalar::new(b"Cop")), + Token::Quoted(Scalar::new(b"WoN")), + Token::Quoted(Scalar::new(b"RP")), + Token::Quoted(Scalar::new(b"AoW")), + Token::Quoted(Scalar::new(b"ED")), + Token::Close, + ])] + #[case(br#""foo"="bar" "3"="1444.11.11""#, &[ + Token::Quoted(Scalar::new(b"foo")), + Token::Operator(Operator::Equal), + Token::Quoted(Scalar::new(b"bar")), + Token::Quoted(Scalar::new(b"3")), + Token::Operator(Operator::Equal), + Token::Quoted(Scalar::new(b"1444.11.11")), + ])] + #[case(br#""foo"="bar"3="1444.11.11""#, &[ + Token::Quoted(Scalar::new(b"foo")), + Token::Operator(Operator::Equal), + Token::Quoted(Scalar::new(b"bar")), + Token::Unquoted(Scalar::new(b"3")), + Token::Operator(Operator::Equal), + Token::Quoted(Scalar::new(b"1444.11.11")), + ])] + #[case(br#"custom_name="THE !@#$%^&*( '\"LEGION\"')""#, &[ + Token::Unquoted(Scalar::new(b"custom_name")), + Token::Operator(Operator::Equal), + Token::Quoted(Scalar::new(br#"THE !@#$%^&*( '\"LEGION\"')"#)), + ])] + // Preventative measures to ensure we don't regress on imperator color codes + #[case(b"custom_name=\"ab \x15D ( ID: 691 )\x15!\"", &[ + Token::Unquoted(Scalar::new(b"custom_name")), + Token::Operator(Operator::Equal), + Token::Quoted(Scalar::new(b"ab \x15D ( ID: 691 )\x15!")), + ])] + // test_no_equal_object_event + #[case(b"foo{bar=qux}", &[ + Token::Unquoted(Scalar::new(b"foo")), + Token::Open, + Token::Unquoted(Scalar::new(b"bar")), + Token::Operator(Operator::Equal), + Token::Unquoted(Scalar::new(b"qux")), + Token::Close, + ])] + // test_array_of_objects + #[case(b"stats={{id=0 type=general} {id=1 type=admiral}}", &[ + Token::Unquoted(Scalar::new(b"stats")), + Token::Operator(Operator::Equal), + Token::Open, + Token::Open, + Token::Unquoted(Scalar::new(b"id")), + Token::Operator(Operator::Equal), + Token::Unquoted(Scalar::new(b"0")), + Token::Unquoted(Scalar::new(b"type")), + Token::Operator(Operator::Equal), + Token::Unquoted(Scalar::new(b"general")), + Token::Close, + Token::Open, + Token::Unquoted(Scalar::new(b"id")), + Token::Operator(Operator::Equal), + Token::Unquoted(Scalar::new(b"1")), + Token::Unquoted(Scalar::new(b"type")), + Token::Operator(Operator::Equal), + Token::Unquoted(Scalar::new(b"admiral")), + Token::Close, + Token::Close, + ])] + // test_no_ws_comment + #[case(b"foo=abc#def\nbar=qux", &[ + Token::Unquoted(Scalar::new(b"foo")), + Token::Operator(Operator::Equal), + Token::Unquoted(Scalar::new(b"abc")), + Token::Unquoted(Scalar::new(b"bar")), + Token::Operator(Operator::Equal), + Token::Unquoted(Scalar::new(b"qux")), + ])] + // test_bom + #[case(b"\xef\xbb\xbf#hello", 
&[])]
+    // test_period_in_identifiers
+    #[case(b"flavor_tur.8=yes", &[
+        Token::Unquoted(Scalar::new(b"flavor_tur.8")),
+        Token::Operator(Operator::Equal),
+        Token::Unquoted(Scalar::new(b"yes")),
+    ])]
+    // test_dashed_identifiers From stellaris saves
+    #[case(b"dashed-identifier=yes", &[
+        Token::Unquoted(Scalar::new(b"dashed-identifier")),
+        Token::Operator(Operator::Equal),
+        Token::Unquoted(Scalar::new(b"yes")),
+    ])]
+    // test_colon_values
+    #[case(b"province_id = event_target:agenda_province", &[
+        Token::Unquoted(Scalar::new(b"province_id")),
+        Token::Operator(Operator::Equal),
+        Token::Unquoted(Scalar::new(b"event_target:agenda_province")),
+    ])]
+    // test_parameter_syntax_with_values
+    // the new syntax to pass parameters to script values is explained in
+    // stellaris: common/script_values/00_script_values.txt
+    #[case(b"mult = value:job_weights_research_modifier|JOB|head_researcher|", &[
+        Token::Unquoted(Scalar::new(b"mult")),
+        Token::Operator(Operator::Equal),
+        Token::Unquoted(Scalar::new(
+            b"value:job_weights_research_modifier|JOB|head_researcher|"
+        )),
+    ])]
+    // test_variables
+    #[case(b"@planet_standard_scale = 11", &[
+        Token::Unquoted(Scalar::new(b"@planet_standard_scale")),
+        Token::Operator(Operator::Equal),
+        Token::Unquoted(Scalar::new(b"11")),
+    ])]
+    // test_variables_value
+    #[case(b"window_name = @default_window_name", &[
+        Token::Unquoted(Scalar::new(b"window_name")),
+        Token::Operator(Operator::Equal),
+        Token::Unquoted(Scalar::new(b"@default_window_name")),
+    ])]
+    // test_interpolated_variable
+    #[case(b"position = { @[1-leopard_x] @leopard_y }", &[
+        Token::Unquoted(Scalar::new(b"position")),
+        Token::Operator(Operator::Equal),
+        Token::Open,
+        Token::Unquoted(Scalar::new(b"@[1-leopard_x]")),
+        Token::Unquoted(Scalar::new(b"@leopard_y")),
+        Token::Close,
+    ])]
+    // test_unquoted_non_ascii More vic2 shenanigans
+    #[case(b"jean_jaur\xe8s = bar ", &[
+        Token::Unquoted(Scalar::new(b"jean_jaur\xe8s")),
+        Token::Operator(Operator::Equal),
+        Token::Unquoted(Scalar::new(b"bar")),
+    ])]
+    // test_skip_semicolon
+    #[case(b"value=\"win\"; a=b", &[
+        Token::Unquoted(Scalar::new(b"value")),
+        Token::Operator(Operator::Equal),
+        Token::Quoted(Scalar::new(b"win")),
+        Token::Unquoted(Scalar::new(b"a")),
+        Token::Operator(Operator::Equal),
+        Token::Unquoted(Scalar::new(b"b")),
+    ])]
+    fn test_input(#[case] input: &[u8], #[case] expected: &[Token]) {
+        let mut reader = TokenReader::new(input);
+        for (i, e) in expected.iter().enumerate() {
+            assert_eq!(*e, reader.read().unwrap(), "failure at token idx: {}", i);
        }
-        assert_eq!(count, 5);
-    }
-
-    #[test]
-    fn text_reader_mixed_object_1() {
-        let data = b"levels={10 0=1 0=2}";
-        let tape = TextTape::from_slice(data).unwrap();
-        let reader = tape.windows1252_reader();
-
-        assert_eq!(reader.fields_len(), 1);
-        let mut iter = reader.fields();
-        let (key, _op, value) = iter.next().unwrap();
-        assert_eq!(key.read_string(), String::from("levels"));
-
-        let nested = value.read_array().unwrap();
-        assert_eq!(nested.len(), 8);
-
-        assert_eq!(
-            nested.values().nth(3).unwrap().token(),
-            &TextToken::Operator(Operator::Equal)
-        );
-        assert_eq!(
-            nested.values().nth(6).unwrap().token(),
-            &TextToken::Operator(Operator::Equal)
-        );
-
-        let values = nested
-            .values()
-            .filter(|x| x.token() != &TextToken::MixedContainer)
-            .map(|x| x.read_string().unwrap())
-            .collect::<Vec<_>>();
-
-        assert_eq!(
-            values.as_slice(),
-            &[
-                String::from("10"),
-                String::from("0"),
-                String::from("="),
-                String::from("1"),
-                String::from("0"),
-                
String::from("="), - String::from("2"), - ] - ); - } - - #[test] - fn text_reader_mixed_object_2() { - let data = br#"brittany_area = { #5 - color = { 118 99 151 } - 169 170 171 172 4384 - }"#; - - let tape = TextTape::from_slice(data).unwrap(); - let reader = tape.windows1252_reader(); - let mut iter = reader.fields(); - let (key, _op, value) = iter.next().unwrap(); - assert_eq!(key.read_str(), "brittany_area"); - - let mut keys = vec![]; - let brittany = value.read_object().unwrap(); - let mut fields = brittany.fields(); - while let Some((key, _op, _value)) = fields.next() { - keys.push(key.read_str()) - } - - assert_eq!(keys, vec![String::from("color")]); - let trailer = fields.remainder(); - assert_eq!(trailer.len(), 5); - assert_eq!(trailer.values().next().unwrap().read_str().unwrap(), "169"); - - let nested = value.read_array().unwrap(); - assert_eq!(nested.len(), 5); - - let mut values = nested.values(); - assert_eq!( - values.next().unwrap().token(), - &TextToken::Unquoted(Scalar::new(b"169")) - ); - assert_eq!( - values.next().unwrap().token(), - &TextToken::Unquoted(Scalar::new(b"170")) - ); - assert_eq!( - values.next().unwrap().token(), - &TextToken::Unquoted(Scalar::new(b"171")) - ); - assert_eq!( - values.next().unwrap().token(), - &TextToken::Unquoted(Scalar::new(b"172")) - ); - assert_eq!( - values.next().unwrap().token(), - &TextToken::Unquoted(Scalar::new(b"4384")) - ); - assert!(values.next().is_none()); - } - - #[test] - fn text_reader_mixed_object_3() { - let data = br#"brittany_area = { #5 - color = { 118 99 151 } - color = { 118 99 151 } - 169 170 171 172 4384 - }"#; - - let tape = TextTape::from_slice(data).unwrap(); - let reader = tape.windows1252_reader(); - let (_key, _op, brittany) = reader.fields().next().unwrap(); - let brittany_reader = brittany.read_object().unwrap(); - - let mut fields = brittany_reader.fields(); - let (lower_bound, upper_bound) = fields.size_hint(); - assert_eq!(lower_bound, brittany_reader.fields_len()); - assert_eq!(lower_bound, 2); - assert!(upper_bound.is_none() || upper_bound == Some(7)); - - let _ = fields.next(); - let (lower_bound, upper_bound) = fields.size_hint(); - assert_eq!(lower_bound, 1); - assert!(upper_bound.is_none() || upper_bound == Some(6)); - - let mut groups = brittany_reader.field_groups(); - let (lower_bound, upper_bound) = groups.size_hint(); - assert_eq!(lower_bound, 1); - assert!(upper_bound.is_none() || upper_bound == Some(6)); - - let _ = groups.next(); - let (lower_bound, upper_bound) = groups.size_hint(); - assert_eq!(lower_bound, 0); - assert!(upper_bound.is_none() || upper_bound == Some(5)); - } - - #[test] - fn text_reader_mixed_object_4() { - let data = br#"levels={a=b 10 c=d 20}"#; - - let tape = TextTape::from_slice(data).unwrap(); - let reader = tape.windows1252_reader(); - - assert_eq!(reader.fields_len(), 1); - let mut iter = reader.fields(); - let (key, _op, value) = iter.next().unwrap(); - assert_eq!(key.read_string(), String::from("levels")); - - let nested = value.read_array().unwrap(); - assert_eq!(nested.len(), 5); - - let mut values = nested.values(); - assert_eq!( - values.next().unwrap().token(), - &TextToken::Unquoted(Scalar::new(b"10")) - ); - assert_eq!( - values.next().unwrap().token(), - &TextToken::Unquoted(Scalar::new(b"c")) - ); - assert_eq!( - values.next().unwrap().token(), - &TextToken::Operator(Operator::Equal) - ); - assert_eq!( - values.next().unwrap().token(), - &TextToken::Unquoted(Scalar::new(b"d")) - ); - assert_eq!( - values.next().unwrap().token(), - 
&TextToken::Unquoted(Scalar::new(b"20"))
-        );
-        assert!(values.next().is_none());
-    }
-
-    #[test]
-    fn text_reader_mixed_object_5() {
-        let data = br#"brittany_area = { #5
-            color = { 118 99 151 }
-            169 170 171 172 4384
-        }"#;
-
-        let tape = TextTape::from_slice(data).unwrap();
-        let reader = tape.windows1252_reader();
-        let mut iter = reader.fields();
-        let (key, _op, value) = iter.next().unwrap();
-        assert_eq!(key.read_str(), "brittany_area");
-
-        let brittany = value.read_object().unwrap();
-        let mut field_groups = brittany.field_groups();
-        field_groups.next().unwrap();
-        assert!(field_groups.next().is_none());
-
-        let trailer = field_groups.remainder();
-
-        let mut values = trailer.values();
-        assert_eq!(
-            values.next().unwrap().token(),
-            &TextToken::Unquoted(Scalar::new(b"169"))
-        );
-        assert_eq!(
-            values.next().unwrap().token(),
-            &TextToken::Unquoted(Scalar::new(b"170"))
-        );
-        assert_eq!(
-            values.next().unwrap().token(),
-            &TextToken::Unquoted(Scalar::new(b"171"))
-        );
-        assert_eq!(
-            values.next().unwrap().token(),
-            &TextToken::Unquoted(Scalar::new(b"172"))
-        );
-        assert_eq!(
-            values.next().unwrap().token(),
-            &TextToken::Unquoted(Scalar::new(b"4384"))
-        );
-        assert!(values.next().is_none());
-    }
-
-    #[test]
-    fn text_reader_empty_container() {
-        let data = b"active_idea_groups={ }";
-        let tape = TextTape::from_slice(data).unwrap();
-        let reader = tape.windows1252_reader();
-        let mut iter = reader.fields();
-        let (key, _op, value) = iter.next().unwrap();
-        assert_eq!(key.read_str(), "active_idea_groups");
-
-        let empty_array = value.read_array().unwrap();
-        assert_eq!(0, empty_array.len());
-        assert!(empty_array.values().next().is_none());
-
-        let empty_object = value.read_object().unwrap();
-        let mut empty_object_iter = empty_object.fields();
-        assert_eq!(0, empty_object.fields_len());
-        assert!(empty_object_iter.next().is_none());
-    }
-
-    #[test]
-    fn text_reader_header() {
-        let data = b"color = rgb { 10 20 30 }";
-        let tape = TextTape::from_slice(data).unwrap();
-        let reader = tape.windows1252_reader();
-        let mut iter = reader.fields();
-        let (key, _op, value) = iter.next().unwrap();
-        assert_eq!(key.read_str(), "color");
-
-        let header_array = value.read_array().unwrap();
-        let mut values = header_array.values();
-        let rgb = values.next().unwrap();
-        assert_eq!(rgb.read_str().unwrap(), "rgb");
-
-        let vals = values.next().unwrap();
-        let s = vals.read_array().unwrap();
-        let svals = s.values();
-
-        let colors = svals
-            .map(|x| x.read_scalar().unwrap())
-            .map(|x| x.to_u64().unwrap())
-            .collect::<Vec<_>>();
-
-        assert_eq!(colors, vec![10, 20, 30]);
-    }
-
-    #[test]
-    fn reader_crash1() {
-        let data = b"a=r{}";
-        let tape = TextTape::from_slice(data).unwrap();
-        iterate_object(tape.windows1252_reader());
-    }
-
-    #[test]
-    fn text_reader_object_fields() {
-        let data = b"a{b=}";
-        if let Ok(tape) = TextTape::from_slice(data) {
-            let reader = tape.windows1252_reader();
-            iterate_object(reader);
-        }
-    }
-
-    #[test]
-    fn text_reader_object_fields_op2() {
-        let data = b"a{}b>{}";
-        if let Ok(tape) = TextTape::from_slice(data) {
-            let reader = tape.windows1252_reader();
-            iterate_object(reader);
-        }
-    }
-
-    #[test]
-    fn text_reader_object_fields_dupe() {
-        let data = b"a{b=c d=E d}";
-        if let Ok(tape) = TextTape::from_slice(data) {
-            let reader = tape.windows1252_reader();
-            iterate_object(reader);
-        }
-    }
-
-    #[test]
-    fn text_reader_object_fields_header() {
-        let data = b"a{}b>r{}";
-        if let Ok(tape) = TextTape::from_slice(data) {
-            let reader = tape.windows1252_reader();
-            
iterate_object(reader); - } - } - - #[test] - fn text_reader_object_fields_dupe2() { - let data = b"a{b=c d b}"; - if let Ok(tape) = TextTape::from_slice(data) { - let reader = tape.windows1252_reader(); - iterate_object(reader); - } - } + reader.read().unwrap_err(); + } + + #[rstest] + #[case(b" hello= butIsaytoYou", &[ + Token::Unquoted(Scalar::new(b"hello")), + Token::Operator(Operator::Equal), + Token::Unquoted(Scalar::new(b"butIsaytoYou")), + ])] + #[case(b" \"lovely\"= \"who is it\"", &[ + Token::Quoted(Scalar::new(b"lovely")), + Token::Operator(Operator::Equal), + Token::Quoted(Scalar::new(b"who is it")), + ])] + #[case(br#" "name"= "\"jolly\" john""#, &[ + Token::Quoted(Scalar::new(b"name")), + Token::Operator(Operator::Equal), + Token::Quoted(Scalar::new(br#"\"jolly\" john"#)), + ])] + fn test_refill(#[case] input: &[u8], #[case] expected: &[Token]) { + let min_buffer_size = expected + .iter() + .filter_map(|x| match x { + Token::Unquoted(s) => Some(s.as_bytes().len()), + Token::Quoted(s) => Some(s.as_bytes().len()), + _ => None, + }) + .max() + .unwrap() + + 1; + + for i in min_buffer_size..min_buffer_size + 10 { + let mut reader = TokenReader::builder().buffer_len(i).build(input); + for e in expected.iter() { + assert_eq!(*e, reader.read().unwrap()); + } - #[test] - fn text_reader_regression() { - let data = b"a={b{}=2}"; - if let Ok(tape) = TextTape::from_slice(data) { - let reader = tape.windows1252_reader(); - iterate_object(reader); + assert!(reader.read().is_err()); } } - #[test] - fn text_reader_regression2() { - let data = b"r={c=d=@{y=u}"; - if let Ok(tape) = TextTape::from_slice(data) { - let reader = tape.windows1252_reader(); - iterate_object(reader); + #[rstest] + #[case(b"a=b c=d } done")] + #[case(br#"a=alongervalue c=d } done"#)] + #[case(br#"a="a long quoted value" c=d } done"#)] + #[case(br#"a="a long \"quoted value\" with escapes" c=d } done"#)] + #[case(br#"a={"an object" { "nested array" }} c=d } done"#)] + fn test_skip_container(#[case] input: &[u8]) { + for i in 8..16 { + let mut reader = TokenReader::builder().buffer_len(i).build(input); + reader.skip_container().unwrap(); + + assert_eq!( + reader.read().unwrap(), + Token::Unquoted(Scalar::new(b"done")) + ); } } - #[test] - fn text_reader_regression3() { - let data = b"a={{t c=d = b}}"; - if let Ok(tape) = TextTape::from_slice(data) { - let reader = tape.windows1252_reader(); - iterate_object(reader); - } + #[rstest] + #[case(b"\"\\")] + fn test_crash_regression(#[case] input: &[u8]) { + let mut reader = TokenReader::new(input); + while let Ok(Some(_)) = reader.next() {} } - - // #[test] - // fn text_reader_regression4() { - // let data = include_bytes!("/home/nick/projects/jomini/fuzz/artifacts/fuzz_text/crash-a14643c9a89c0f4ab665815c99a07b15de3544a5"); - // // let data = b"a={{ b c == == = d e=f}}"; - // if let Ok(tape) = TextTape::from_slice(data) { - // let reader = tape.windows1252_reader(); - // iterate_object(reader); - // } - // } } diff --git a/src/util.rs b/src/util.rs index b112b30..f7f1d94 100644 --- a/src/util.rs +++ b/src/util.rs @@ -45,11 +45,50 @@ pub(crate) fn contains_zero_byte(x: u64) -> bool { x.wrapping_sub(LO_U64) & !x & HI_U64 != 0 } +/// https://github.com/llogiq/bytecount/blob/934ea0ef4338f00c797500b10c39f03b3cfc1692/src/integer_simd.rs#L21-L27 +#[inline] +const fn bytewise_equal(lhs: u64, rhs: u64) -> u64 { + let lo = u64::MAX / 0xFF; + let hi = lo << 7; + + let x = lhs ^ rhs; + !((((x & !hi) + !hi) | x) >> 7) & lo +} + +#[inline] +const fn sum_usize(values: u64) -> u64 { + let 
every_other_byte_lo = u64::MAX / 0xFFFF;
+    let every_other_byte = every_other_byte_lo * 0xFF;
+
+    // Pairwise reduction to avoid overflow on next step.
+    let pair_sum: u64 = (values & every_other_byte) + ((values >> 8) & every_other_byte);
+
+    // Multiplication results in top two bytes holding sum.
+    pair_sum.wrapping_mul(every_other_byte_lo) >> ((core::mem::size_of::<u64>() - 2) * 8)
+}
+
+#[inline]
+pub(crate) const fn count_chunk(value: u64, byte: u8) -> u64 {
+    sum_usize(bytewise_equal(value, repeat_byte(byte)))
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
     use rstest::*;
 
+    #[rstest]
+    #[case(*b"        ", 0)]
+    #[case(*b"   {    ", 1)]
+    #[case(*b"  {   { ", 2)]
+    #[case(*b"{  {   {", 3)]
+    #[case(*b"{{{{{{{{", 8)]
+    fn test_count_chunk(#[case] input: [u8; 8], #[case] expected: u64) {
+        let lhs = u64::from_le_bytes(input);
+        let rhs = repeat_byte(b'{');
+        assert_eq!(sum_usize(bytewise_equal(lhs, rhs)), expected);
+    }
+
     #[rstest]
     #[case(*b"14441111", Some(14441111))]
     #[case(*b"14440101", Some(14440101))]