diff --git a/fuzz/fuzz_targets/compare_to_serde.rs b/fuzz/fuzz_targets/compare_to_serde.rs index 779c1929..03e8608e 100644 --- a/fuzz/fuzz_targets/compare_to_serde.rs +++ b/fuzz/fuzz_targets/compare_to_serde.rs @@ -84,10 +84,11 @@ fn remove_suffix(s: &str) -> &str { fn errors_equal(jiter_error: &JiterError, serde_error: &SerdeError) -> bool { let jiter_error_str = jiter_error.to_string(); let serde_error_str = serde_error.to_string(); - if jiter_error_str.starts_with("invalid escape at") { - // strings like `"\"\\u\\"` give a EOF error for serde and invalid escape for jiter - true - } else if serde_error_str.starts_with("number out of range") { + // if jiter_error_str.starts_with("invalid escape at") { + // // strings like `"\"\\u\\"` give a EOF error for serde and invalid escape for jiter + // true + // } else if serde_error_str.starts_with("number out of range") { + if serde_error_str.starts_with("number out of range") { // ignore this case as serde is stricter so fails on this before jiter does true } else if serde_error_str.starts_with("recursion limit exceeded") { diff --git a/src/errors.rs b/src/errors.rs index dc3e50f6..df66666d 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -68,7 +68,7 @@ pub enum JsonErrorType { FloatKeyMustBeFinite, /// Lone leading surrogate in hex escape. - LoneLeadingSurrogateInHexEscape, + LoneLeadingSurrogateInHexEscape(usize), /// JSON has a comma after the last value in an array or map. TrailingComma, @@ -77,7 +77,7 @@ pub enum JsonErrorType { TrailingCharacters, /// Unexpected end of hex escape. - UnexpectedEndOfHexEscape, + UnexpectedEndOfHexEscape(usize), /// Encountered nesting of JSON maps and arrays more than 128 layers deep. RecursionLimitExceeded, @@ -109,10 +109,10 @@ impl std::fmt::Display for JsonErrorType { Self::KeyMustBeAString => f.write_str("key must be a string"), Self::ExpectedNumericKey => f.write_str("invalid value: expected key to be a number in quotes"), Self::FloatKeyMustBeFinite => f.write_str("float key must be finite (got NaN or +/-inf)"), - Self::LoneLeadingSurrogateInHexEscape => f.write_str("lone leading surrogate in hex escape"), + Self::LoneLeadingSurrogateInHexEscape(_) => f.write_str("lone leading surrogate in hex escape"), Self::TrailingComma => f.write_str("trailing comma"), Self::TrailingCharacters => f.write_str("trailing characters"), - Self::UnexpectedEndOfHexEscape => f.write_str("unexpected end of hex escape"), + Self::UnexpectedEndOfHexEscape(_) => f.write_str("unexpected end of hex escape"), Self::RecursionLimitExceeded => f.write_str("recursion limit exceeded"), } } diff --git a/src/string_decoder.rs b/src/string_decoder.rs index 0a57895d..e4134351 100644 --- a/src/string_decoder.rs +++ b/src/string_decoder.rs @@ -94,16 +94,16 @@ fn to_str(bytes: &[u8], ascii_only: bool, start: usize) -> JsonResult<&str> { } } -/// Taken from https://github.com/serde-rs/json/blob/45f10ec816e3f2765ac08f7ca73752326b0475d7/src/read.rs#L873-L928 +/// Taken approximately from https://github.com/serde-rs/json/blob/v1.0.107/src/read.rs#L872-L945 fn parse_escape(data: &[u8], index: usize, start: usize) -> JsonResult<(char, usize)> { let (n, index) = parse_u4(data, index, start)?; match n { - 0xDC00..=0xDFFF => json_err!(InvalidEscape, index - start, start - 1), - 0xD800..=0xDBFF => match (data.get(index + 1), data.get(index + 2)) { - (Some(b'\\'), Some(b'u')) => { + 0xDC00..=0xDFFF => json_err!(LoneLeadingSurrogateInHexEscape, index - start, start - 1), + 0xD800..=0xDBFF => match data.get(index + 1..index + 3) { + Some(slice) if slice == b"\\u" => { let (n2, index) = parse_u4(data, index + 2, start)?; if !(0xDC00..=0xDFFF).contains(&n2) { - return json_err!(InvalidEscape, index - start, start - 1); + return json_err!(LoneLeadingSurrogateInHexEscape, index - start, start - 1); } let n2 = (((n - 0xD800) as u32) << 10 | (n2 - 0xDC00) as u32) + 0x1_0000; @@ -112,23 +112,27 @@ fn parse_escape(data: &[u8], index: usize, start: usize) -> JsonResult<(char, us None => json_err!(EofWhileParsingString, index), } } - _ => json_err!(InvalidEscape, index - start, start - 1), + Some(_) => json_err!(UnexpectedEndOfHexEscape, index - start, start - 1), + None => match data.get(index + 1) { + Some(b'\\') | None => json_err!(EofWhileParsingString, data.len()), + Some(_) => json_err!(UnexpectedEndOfHexEscape, index - start, start - 1), + }, }, _ => match char::from_u32(n as u32) { Some(c) => Ok((c, index)), - None => json_err!(EofWhileParsingString, index), + None => json_err!(InvalidEscape, index - start, index), }, } } fn parse_u4(data: &[u8], mut index: usize, start: usize) -> JsonResult<(u16, usize)> { let mut n = 0; - for _ in 0..4 { + let u4 = data + .get(index + 1..index + 5) + .ok_or_else(|| json_error!(EofWhileParsingString, data.len()))?; + + for c in u4.iter() { index += 1; - let c = match data.get(index) { - Some(c) => *c, - None => return json_err!(EofWhileParsingString, index), - }; let hex = match c { b'0'..=b'9' => (c & 0x0f) as u16, b'a'..=b'f' => (c - b'a' + 10) as u16, diff --git a/tests/main.rs b/tests/main.rs index d897bdc5..177ebe19 100644 --- a/tests/main.rs +++ b/tests/main.rs @@ -280,6 +280,38 @@ string_tests! { controls_python: "\"\\b\\f\\n\\r\\t\"" => "\x08\x0c\n\r\t"; // python notation for the same thing } +macro_rules! string_test_errors { + ($($name:ident: $json:literal => $expected_error:literal;)*) => { + $( + paste::item! { + #[test] + fn [< string_parsing_errors_ $name >]() { + let data = $json.as_bytes(); + let mut tape: Vec = Vec::new(); + let mut parser = Parser::new(data); + let peak = parser.peak().unwrap(); + assert_eq!(peak, Peak::String); + match parser.consume_string::(&mut tape) { + Ok(t) => panic!("unexpectedly valid: {:?} -> {:?}", $json, t), + Err(e) => { + let actual_error = format!("{:?} @ {}", e.error_type, e.index); + assert_eq!(actual_error, $expected_error); + } + } + } + } + )* + } +} + +string_test_errors! { + u4_unclosed: r#""\uxx"# => "EofWhileParsingString @ 5"; + u4_unclosed2: r#""\udBdd"# => "EofWhileParsingString @ 7"; + line_leading_surrogate: r#""\uddBd""# => "LoneLeadingSurrogateInHexEscape(5) @ 0"; + unexpected_hex_escape1: r#""\udBd8x"# => "UnexpectedEndOfHexEscape(5) @ 0"; + unexpected_hex_escape2: r#""\udBd8xx"# => "UnexpectedEndOfHexEscape(5) @ 0"; +} + #[test] fn test_key_str() { let json = r#"{"foo": "bar"}"#;