Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes in unescape routine #771

Merged
merged 10 commits into from
Jun 29, 2024
4 changes: 3 additions & 1 deletion Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,11 @@
- [#771]: `EscapeError::UnrecognizedSymbol` renamed to `EscapeError::UnrecognizedEntity`.
- [#771]: Implemented `PartialEq` for `EscapeError`.
- [#771]: Replace the following variants of `EscapeError` by `InvalidCharRef` variant
with a standard `ParseIntError` inside:
with a new `ParseCharRefError` inside:
- `EntityWithNull`
- `InvalidDecimal`
- `InvalidHexadecimal`
- `InvalidCodepoint`

[#771]: https://github.com/tafia/quick-xml/pull/771
[#772]: https://github.com/tafia/quick-xml/pull/772
Expand Down
74 changes: 54 additions & 20 deletions src/escape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,30 +5,56 @@ use std::borrow::Cow;
use std::num::ParseIntError;
use std::ops::Range;

/// Error of parsing character reference (`&#<dec-number>;` or `&#x<hex-number>;`).
#[derive(Clone, Debug, PartialEq)]
pub enum ParseCharRefError {
/// Number contains sign character (`+` or `-`) which is not allowed.
UnexpectedSign,
/// Number cannot be parsed due to non-number characters or a numeric overflow.
InvalidNumber(ParseIntError),
/// Character reference represents not a valid unicode codepoint.
InvalidCodepoint(u32),
/// Character reference expanded to a not permitted character for an XML.
///
/// Currently, only `0x0` character produces this error.
IllegalCharacter(u32),
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍


impl std::fmt::Display for ParseCharRefError {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
Self::UnexpectedSign => f.write_str("unexpected number sign"),
Self::InvalidNumber(e) => e.fmt(f),
Self::InvalidCodepoint(n) => write!(f, "`{}` is not a valid codepoint", n),
Self::IllegalCharacter(n) => write!(f, "0x{:x} character is not permitted in XML", n),
}
}
}

impl std::error::Error for ParseCharRefError {
fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
match self {
Self::InvalidNumber(e) => Some(e),
_ => None,
}
}
}

/// Error for XML escape / unescape.
#[derive(Clone, Debug, PartialEq)]
pub enum EscapeError {
/// Entity with Null character
EntityWithNull(Range<usize>),
/// Referenced entity in unknown to the parser.
UnrecognizedEntity(Range<usize>, String),
/// Cannot find `;` after `&`
UnterminatedEntity(Range<usize>),
/// Attempt to parse character reference (`&#<dec-number>;` or `&#x<hex-number>;`)
/// was unsuccessful, not all characters are decimal or hexadecimal numbers.
InvalidCharRef(ParseIntError),
/// Not a valid unicode codepoint
InvalidCodepoint(u32),
InvalidCharRef(ParseCharRefError),
}

impl std::fmt::Display for EscapeError {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
match self {
EscapeError::EntityWithNull(e) => write!(
f,
"Error while escaping character at range {:?}: Null character entity not allowed",
e
),
EscapeError::UnrecognizedEntity(rge, res) => {
write!(f, "at {:?}: unrecognized entity `{}`", rge, res)
}
Expand All @@ -40,7 +66,6 @@ impl std::fmt::Display for EscapeError {
EscapeError::InvalidCharRef(e) => {
write!(f, "invalid character reference: {}", e)
}
EscapeError::InvalidCodepoint(n) => write!(f, "'{}' is not a valid codepoint", n),
}
}
}
Expand Down Expand Up @@ -246,7 +271,7 @@ where
// search for character correctness
let pat = &raw[start + 1..end];
if let Some(entity) = pat.strip_prefix('#') {
let codepoint = parse_number(entity, start..end)?;
let codepoint = parse_number(entity).map_err(EscapeError::InvalidCharRef)?;
unescaped.push_str(codepoint.encode_utf8(&mut [0u8; 4]));
} else if let Some(value) = resolve_entity(pat) {
unescaped.push_str(value);
Expand Down Expand Up @@ -1791,18 +1816,27 @@ pub const fn resolve_html5_entity(entity: &str) -> Option<&'static str> {
Some(s)
}

fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
let code = if let Some(hex_digits) = bytes.strip_prefix('x') {
u32::from_str_radix(hex_digits, 16)
fn parse_number(num: &str) -> Result<char, ParseCharRefError> {
let code = if let Some(hex) = num.strip_prefix('x') {
from_str_radix(hex, 16)?
} else {
u32::from_str_radix(bytes, 10)
}
.map_err(EscapeError::InvalidCharRef)?;
from_str_radix(num, 10)?
};
if code == 0 {
return Err(EscapeError::EntityWithNull(range));
return Err(ParseCharRefError::IllegalCharacter(code));
}
match std::char::from_u32(code) {
Some(c) => Ok(c),
None => Err(EscapeError::InvalidCodepoint(code)),
None => Err(ParseCharRefError::InvalidCodepoint(code)),
}
}

#[inline]
fn from_str_radix(src: &str, radix: u32) -> Result<u32, ParseCharRefError> {
match src.as_bytes().first().copied() {
// We should not allow sign numbers, but u32::from_str_radix will accept `+`.
// We also handle `-` to be consistent in returned errors
Some(b'+') | Some(b'-') => Err(ParseCharRefError::UnexpectedSign),
_ => u32::from_str_radix(src, radix).map_err(ParseCharRefError::InvalidNumber),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

Much better, thanks

}
}
96 changes: 87 additions & 9 deletions tests/escape.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use pretty_assertions::assert_eq;
use quick_xml::escape::{self, EscapeError};
use quick_xml::escape::{self, EscapeError, ParseCharRefError};
use std::borrow::Cow;
use std::num::IntErrorKind;

Expand Down Expand Up @@ -93,15 +93,54 @@ fn unescape_long() {

// Too big numbers for u32 should produce errors
match escape::unescape(&format!("&#{};", u32::MAX as u64 + 1)) {
Err(EscapeError::InvalidCharRef(err)) => assert_eq!(err.kind(), &IntErrorKind::PosOverflow),
x => panic!("expected Err(InvalidCharRef(PosOverflow)), bug got {:?}", x),
Err(EscapeError::InvalidCharRef(ParseCharRefError::InvalidNumber(err))) => {
assert_eq!(err.kind(), &IntErrorKind::PosOverflow)
}
x => panic!(
"expected Err(InvalidCharRef(InvalidNumber(PosOverflow))), bug got {:?}",
x
),
}
match escape::unescape(&format!("&#x{:x};", u32::MAX as u64 + 1)) {
Err(EscapeError::InvalidCharRef(err)) => assert_eq!(err.kind(), &IntErrorKind::PosOverflow),
x => panic!("expected Err(InvalidCharRef(PosOverflow)), bug got {:?}", x),
Err(EscapeError::InvalidCharRef(ParseCharRefError::InvalidNumber(err))) => {
assert_eq!(err.kind(), &IntErrorKind::PosOverflow)
}
x => panic!(
"expected Err(InvalidCharRef(InvalidNumber(PosOverflow))), bug got {:?}",
x
),
}
}

#[test]
fn unescape_sign() {
assert_eq!(
escape::unescape("&#+48;"),
Err(EscapeError::InvalidCharRef(
ParseCharRefError::UnexpectedSign
)),
);
assert_eq!(
escape::unescape("&#x+30;"),
Err(EscapeError::InvalidCharRef(
ParseCharRefError::UnexpectedSign
)),
);

assert_eq!(
escape::unescape("&#-48;"),
Err(EscapeError::InvalidCharRef(
ParseCharRefError::UnexpectedSign
)),
);
assert_eq!(
escape::unescape("&#x-30;"),
Err(EscapeError::InvalidCharRef(
ParseCharRefError::UnexpectedSign
)),
);
}

#[test]
fn unescape_with() {
let custom_entities = |ent: &str| match ent {
Expand Down Expand Up @@ -156,11 +195,50 @@ fn unescape_with_long() {

// Too big numbers for u32 should produce errors
match escape::unescape_with(&format!("&#{};", u32::MAX as u64 + 1), |_| None) {
Err(EscapeError::InvalidCharRef(err)) => assert_eq!(err.kind(), &IntErrorKind::PosOverflow),
x => panic!("expected Err(InvalidCharRef(PosOverflow)), bug got {:?}", x),
Err(EscapeError::InvalidCharRef(ParseCharRefError::InvalidNumber(err))) => {
assert_eq!(err.kind(), &IntErrorKind::PosOverflow)
}
x => panic!(
"expected Err(InvalidCharRef(InvalidNumber(PosOverflow))), bug got {:?}",
x
),
}
match escape::unescape_with(&format!("&#x{:x};", u32::MAX as u64 + 1), |_| None) {
Err(EscapeError::InvalidCharRef(err)) => assert_eq!(err.kind(), &IntErrorKind::PosOverflow),
x => panic!("expected Err(InvalidCharRef(PosOverflow)), bug got {:?}", x),
Err(EscapeError::InvalidCharRef(ParseCharRefError::InvalidNumber(err))) => {
assert_eq!(err.kind(), &IntErrorKind::PosOverflow)
}
x => panic!(
"expected Err(InvalidCharRef(InvalidNumber(PosOverflow))), bug got {:?}",
x
),
}
}

#[test]
fn unescape_with_sign() {
assert_eq!(
escape::unescape_with("&#+48;", |_| None),
Err(EscapeError::InvalidCharRef(
ParseCharRefError::UnexpectedSign
)),
);
assert_eq!(
escape::unescape_with("&#x+30;", |_| None),
Err(EscapeError::InvalidCharRef(
ParseCharRefError::UnexpectedSign
)),
);

assert_eq!(
escape::unescape_with("&#-48;", |_| None),
Err(EscapeError::InvalidCharRef(
ParseCharRefError::UnexpectedSign
)),
);
assert_eq!(
escape::unescape_with("&#x-30;", |_| None),
Err(EscapeError::InvalidCharRef(
ParseCharRefError::UnexpectedSign
)),
);
}