-
-
Notifications
You must be signed in to change notification settings - Fork 61
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Decouple serialization and JSON log format more clearly
Add a custom serde_json Formatter with support for byte arrays. SpecialFormatter adds a write_byte_array method that outputs URI-encoded strings. Move percent encoding from QuotedString type to URIEscapeWriter which works as a filter for any type that implements the std::io::Write trait. Where possible, use serialize_bytes(), either directly or via a Bytes helper type.
- Loading branch information
Showing
7 changed files
with
277 additions
and
143 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
use std::io::{Result, Write}; | ||
|
||
use crate::quote::*; | ||
|
||
/// A Formatter for serde_josn that outputs byte buffers as | ||
/// URI-encodeed strings. | ||
#[derive(Clone, Debug)] | ||
pub struct SpecialFormatter; | ||
|
||
impl serde_json::ser::Formatter for SpecialFormatter { | ||
fn write_byte_array<W>(&mut self, writer: &mut W, value: &[u8]) -> Result<()> | ||
where | ||
W: ?Sized + Write, | ||
{ | ||
self.begin_string(writer)?; | ||
URIEscapeWriter(&mut BackslashEscapeWriter(writer)) | ||
.write(value) | ||
.map(|_| ())?; | ||
self.end_string(writer) | ||
} | ||
} | ||
|
||
pub fn to_writer<W, T>(writer: W, value: &T) -> serde_json::Result<()> | ||
where | ||
W: Write, | ||
T: ?Sized + serde::Serialize, | ||
{ | ||
let mut ser = serde_json::Serializer::with_formatter(writer, SpecialFormatter); | ||
value.serialize(&mut ser) | ||
} | ||
|
||
#[cfg(test)] | ||
mod test { | ||
use super::to_writer; | ||
use crate::types::Bytes; | ||
|
||
fn serialized(value: &[u8]) -> String { | ||
let mut buf = vec![]; | ||
to_writer(&mut buf, &Bytes(&value)).unwrap(); | ||
String::from_utf8(buf).unwrap() | ||
} | ||
|
||
#[test] | ||
fn json_serialize() { | ||
for (buf, expected) in &[ | ||
(&b" "[..], r#"" ""#), | ||
(&b"asdf"[..], r#""asdf""#), | ||
(&b"+"[..], r#""%2b""#), | ||
(&b"%"[..], r#""%25""#), | ||
(&b"+++"[..], r#""%2b%2b%2b""#), | ||
(&b"%%%"[..], r#""%25%25%25""#), | ||
(&b"%+%"[..], r#""%25%2b%25""#), | ||
(&b"\xc3\xa4"[..], r#""ä""#), | ||
(&b"\xe2\x82\xac"[..], r#""€""#), | ||
(&b"\xf0\x9f\x92\x96"[..], r#""💖""#), | ||
(&b"\xc3\xa4\xc3\xb6\xc3\xbc"[..], r#""äöü""#), | ||
(&b"abcd\xc3\xa4\xc3\xb6\xc3\xbcefgh"[..], r#""abcdäöüefgh""#), | ||
(&b"\xf0\x9f\x84\xbb\xf0\x9f\x84\xb0\xf0\x9f\x85\x84\xf0\x9f\x85\x81\xf0\x9f\x84\xb4\xf0\x9f\x84\xbb"[..], r#""🄻🄰🅄🅁🄴🄻""#), | ||
(&b"\xc3\xc3\xa4"[..], r#""%c3ä""#), | ||
(&b"\xf0\xf0\x9f\x92\x96"[..], r#""%f0💖""#), | ||
(&b"\xf0\x9f\xf0\x9f\x92\x96"[..], r#""%f0%9f💖""#), | ||
(&b"\xf0\x9f\x92\xf0\x9f\x92\x96"[..], r#""%f0%9f%92💖""#), | ||
|
||
(&b"\xed\xa0\x80"[..], r#""%ed%a0%80""#), // illegal surrogate codepoint 0xd800 | ||
(&b"\xed\xa3\xbf"[..], r#""%ed%a3%bf""#), // illegal surrogate codepoint 0xd8ff | ||
(&b"\xed\xbf\xbf"[..], r#""%ed%bf%bf""#), // illegal surrogate codepoint 0xdfff | ||
] { | ||
assert_eq!(serialized(buf), *expected); | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,179 @@ | ||
use std::io::{Result, Write}; | ||
|
||
const HEXDIGITS: &[u8; 16] = b"0123456789abcdef"; | ||
|
||
/// Adapter that applies backslash-coding according to JSON rules to | ||
/// the bytes written. | ||
pub(crate) struct BackslashEscapeWriter<'a, W>(pub &'a mut W) | ||
where | ||
W: ?Sized + Write; | ||
|
||
impl<'a, W> Write for BackslashEscapeWriter<'a, W> | ||
where | ||
W: ?Sized + Write, | ||
{ | ||
fn write(&mut self, buf: &[u8]) -> Result<usize> { | ||
let mut quoted = [b'\\', b'u', b'0', b'0', b'0', b'0']; | ||
let mut start_unquoted = 0; | ||
for (n, c) in buf.iter().enumerate() { | ||
let quoted = match c { | ||
b'"' => &br#"\""#[..], | ||
b'\\' => &br#"\\"#[..], | ||
b'\x08' => &br#"\b"#[..], | ||
b'\x0c' => &br#"\f"#[..], | ||
b'\n' => &br#"\n"#[..], | ||
b'\r' => &br#"\r"#[..], | ||
b'\t' => &br#"\t"#[..], | ||
c if *c < 32 => { | ||
quoted[4] = HEXDIGITS[((*c & 0xf0) >> 4) as usize]; | ||
quoted[5] = HEXDIGITS[(*c & 0x0f) as usize]; | ||
"ed | ||
} | ||
_ => continue, | ||
}; | ||
self.0.write_all(&buf[start_unquoted..n])?; | ||
self.0.write_all(quoted)?; | ||
start_unquoted = n + 1; | ||
} | ||
self.0.write_all(&buf[start_unquoted..])?; | ||
Ok(buf.len()) | ||
} | ||
fn flush(&mut self) -> Result<()> { | ||
self.0.flush() | ||
} | ||
} | ||
|
||
fn write_quoted_byte<W>(writer: &mut W, value: u8) -> Result<()> | ||
where | ||
W: ?Sized + Write, | ||
{ | ||
let value = value as usize; | ||
writer.write_all(&[b'%', HEXDIGITS[value >> 4], HEXDIGITS[value & 0x0f]]) | ||
} | ||
|
||
/// Adapter that applies URI-escaping (except ' ' -> '+') to the bytes writen. | ||
/// | ||
/// Printable ASCII characters except `%`, `+`, and `\b`, `\f`, `\n`, | ||
/// `\r`, `\t` are left as-is. | ||
/// | ||
/// This is the "inner" encoding of the JSON strings produced by Laurel. | ||
pub(crate) struct URIEscapeWriter<'a, W>(pub &'a mut W) | ||
where | ||
W: ?Sized + Write; | ||
|
||
impl<'a, W> Write for URIEscapeWriter<'a, W> | ||
where | ||
W: ?Sized + Write, | ||
{ | ||
fn write(&mut self, buf: &[u8]) -> Result<usize> { | ||
let mut utf8state: Option<u8> = None; | ||
let mut stash = tinyvec::array_vec!([u8; 4]); | ||
let mut start_unquoted = 0; | ||
for (n, c) in buf.iter().enumerate() { | ||
loop { | ||
match utf8state { | ||
None => { | ||
if *c >= 32 | ||
&& *c < 127 | ||
&& ![b'%', b'+', b'\x08', b'\x0c', b'\n', b'\r', b'\t'].contains(c) | ||
{ | ||
// simple byte, collect to be output as-is. | ||
break; | ||
} | ||
self.0.write_all(&buf[start_unquoted..n])?; | ||
start_unquoted = n + 1; | ||
let len = match *c { | ||
n if n & 0b11100000 == 0b11000000 => 1, | ||
n if n & 0b11110000 == 0b11100000 => 2, | ||
n if n & 0b11111000 == 0b11110000 => 3, | ||
_ => { | ||
// simple non-representable byte | ||
write_quoted_byte(self.0, *c)?; | ||
break; | ||
} | ||
}; | ||
stash.clear(); | ||
stash.push(*c); | ||
utf8state = Some(len); | ||
break; | ||
} | ||
Some(ref mut len) => { | ||
if *c & 0b11000000 == 0b10000000 { | ||
start_unquoted = n + 1; | ||
stash.push(*c); | ||
*len -= 1; | ||
// Complete UTF-8 multi-byte-sequence. Write. | ||
if *len == 0 { | ||
match std::str::from_utf8(&stash) { | ||
Ok(_) => self.0.write_all(&stash)?, | ||
_ => stash | ||
.iter() | ||
.try_for_each(|c| write_quoted_byte(self.0, *c))?, | ||
} | ||
utf8state = None; | ||
} | ||
break; | ||
} else { | ||
// Incomplete UTF-8 multi-byte sequence. | ||
// Write and re-evaluate current byte. | ||
stash | ||
.iter() | ||
.try_for_each(|c| write_quoted_byte(self.0, *c))?; | ||
utf8state = None; | ||
} | ||
} | ||
} | ||
} | ||
} | ||
// invalid UTF-8 multi-byte-sequence at end of input. | ||
match utf8state { | ||
Some(_) => stash | ||
.iter() | ||
.try_for_each(|c| write_quoted_byte(self.0, *c))?, | ||
None => self.0.write_all(&buf[start_unquoted..])?, | ||
}; | ||
Ok(buf.len()) | ||
} | ||
fn flush(&mut self) -> Result<()> { | ||
self.0.flush() | ||
} | ||
} | ||
|
||
#[cfg(test)] | ||
mod test { | ||
use super::URIEscapeWriter; | ||
use std::io::Write; | ||
|
||
fn uri_escaped(value: &[u8]) -> String { | ||
let mut buf = Vec::with_capacity(value.len()); | ||
URIEscapeWriter(&mut buf).write(&value).unwrap(); | ||
String::from_utf8(buf).unwrap() | ||
} | ||
|
||
#[test] | ||
fn uri_escape() { | ||
assert_eq!(" ", uri_escaped(b" ")); | ||
assert_eq!("asdf", uri_escaped(b"asdf")); | ||
assert_eq!("%2b", uri_escaped(b"+")); | ||
assert_eq!("%25", uri_escaped(b"%")); | ||
assert_eq!("%2b%2b%2b", uri_escaped(b"+++")); | ||
assert_eq!("%25%25%25", uri_escaped(b"%%%")); | ||
assert_eq!("%25%2b%25", uri_escaped(b"%+%")); | ||
assert_eq!("ä", uri_escaped(b"\xc3\xa4")); | ||
assert_eq!("€", uri_escaped(b"\xe2\x82\xac")); | ||
assert_eq!("💖", uri_escaped(b"\xf0\x9f\x92\x96")); | ||
assert_eq!("äöü", uri_escaped(b"\xc3\xa4\xc3\xb6\xc3\xbc")); | ||
assert_eq!( | ||
"abcdäöüefgh", | ||
uri_escaped(b"abcd\xc3\xa4\xc3\xb6\xc3\xbcefgh") | ||
); | ||
assert_eq!("🄻🄰🅄🅁🄴🄻", uri_escaped(b"\xf0\x9f\x84\xbb\xf0\x9f\x84\xb0\xf0\x9f\x85\x84\xf0\x9f\x85\x81\xf0\x9f\x84\xb4\xf0\x9f\x84\xbb")); | ||
assert_eq!("%c3ä", uri_escaped(b"\xc3\xc3\xa4")); | ||
assert_eq!("%f0💖", uri_escaped(b"\xf0\xf0\x9f\x92\x96")); | ||
assert_eq!("%f0💖%f0", uri_escaped(b"\xf0\xf0\x9f\x92\x96\xf0")); | ||
assert_eq!("%f0💖asdf", uri_escaped(b"\xf0\xf0\x9f\x92\x96asdf")); | ||
assert_eq!("%f0%9f💖", uri_escaped(b"\xf0\x9f\xf0\x9f\x92\x96")); | ||
assert_eq!("%f0%9f%92💖", uri_escaped(b"\xf0\x9f\x92\xf0\x9f\x92\x96")); | ||
// This will probably need some corner cases. | ||
} | ||
} |
Oops, something went wrong.