Skip to content

Commit

Permalink
Decouple serialization and JSON log format more clearly
Browse files Browse the repository at this point in the history
Add a custom serde_json Formatter with support for byte arrays.
SpecialFormatter adds a write_byte_array method that outputs
URI-encoded strings.

Move percent encoding from QuotedString type to URIEscapeWriter which
works as a filter for any type that implements the std::io::Write
trait.

Where possible, use serialize_bytes(), either directly or via a Bytes
helper type.
  • Loading branch information
hillu committed Feb 26, 2024
1 parent 67bac75 commit 2776abc
Show file tree
Hide file tree
Showing 7 changed files with 277 additions and 143 deletions.
2 changes: 1 addition & 1 deletion src/bin/laurel/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ impl Logger {
if let Some(prefix) = &self.prefix {
self.output.write_all(prefix.as_bytes()).unwrap();
}
serde_json::to_writer(&mut self.output, &message).unwrap();
laurel::json::to_writer(&mut self.output, &message).unwrap();
self.output.write_all(b"\n").unwrap();
self.output.flush().unwrap();
}
Expand Down
3 changes: 1 addition & 2 deletions src/coalesce.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1018,14 +1018,13 @@ impl Drop for Coalesce<'_, '_> {
#[cfg(test)]
mod test {
use super::*;
use serde_json;
use std::cell::RefCell;
use std::io::{BufRead, BufReader};
use std::rc::Rc;

fn event_to_json(e: &Event) -> String {
let mut out = vec![];
serde_json::to_writer(&mut out, e).unwrap();
crate::json::to_writer(&mut out, e).unwrap();
String::from_utf8_lossy(&out).to_string()
}

Expand Down
71 changes: 71 additions & 0 deletions src/json.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
use std::io::{Result, Write};

use crate::quote::*;

/// A Formatter for serde_json that outputs byte buffers as
/// URI-encoded strings.
#[derive(Clone, Debug)]
pub struct SpecialFormatter;

impl serde_json::ser::Formatter for SpecialFormatter {
    /// Writes `value` as a single JSON string instead of an array of numbers.
    ///
    /// The bytes are first URI-escaped by [`URIEscapeWriter`]; its output is
    /// then passed through [`BackslashEscapeWriter`] before it reaches
    /// `writer`. The string delimiters come from the formatter's own
    /// `begin_string` / `end_string`.
    ///
    /// # Errors
    ///
    /// Returns any I/O error reported by `writer`.
    fn write_byte_array<W>(&mut self, writer: &mut W, value: &[u8]) -> Result<()>
    where
        W: ?Sized + Write,
    {
        self.begin_string(writer)?;
        // `write_all` (rather than a bare `write` whose byte count is thrown
        // away) guarantees the whole buffer is consumed even if an adapter
        // ever performs a short write.
        URIEscapeWriter(&mut BackslashEscapeWriter(writer)).write_all(value)?;
        self.end_string(writer)
    }
}

/// Serializes `value` as JSON into `writer` using [`SpecialFormatter`].
///
/// This mirrors `serde_json::to_writer`, except that the serializer is
/// built with `SpecialFormatter`, so byte arrays go through its
/// `write_byte_array` implementation.
///
/// # Errors
///
/// Returns the `serde_json` error produced while serializing `value`.
pub fn to_writer<W, T>(writer: W, value: &T) -> serde_json::Result<()>
where
    W: Write,
    T: ?Sized + serde::Serialize,
{
    let mut serializer = serde_json::Serializer::with_formatter(writer, SpecialFormatter);
    serde::Serialize::serialize(value, &mut serializer)
}

#[cfg(test)]
mod test {
    use super::to_writer;
    use crate::types::Bytes;

    /// Serializes `value` (wrapped in `Bytes`) with `to_writer` and returns
    /// the resulting JSON text.
    fn serialized(value: &[u8]) -> String {
        let mut out = Vec::new();
        to_writer(&mut out, &Bytes(&value)).unwrap();
        String::from_utf8(out).unwrap()
    }

    /// Checks the JSON text produced for a range of byte buffers: plain
    /// ASCII, `%` / `+`, well-formed UTF-8, and broken UTF-8 sequences.
    #[test]
    fn json_serialize() {
        // (input bytes, expected JSON text including the surrounding quotes)
        let cases: &[(&[u8], &str)] = &[
            (b" ", r#"" ""#),
            (b"asdf", r#""asdf""#),
            (b"+", r#""%2b""#),
            (b"%", r#""%25""#),
            (b"+++", r#""%2b%2b%2b""#),
            (b"%%%", r#""%25%25%25""#),
            (b"%+%", r#""%25%2b%25""#),
            (b"\xc3\xa4", r#""ä""#),
            (b"\xe2\x82\xac", r#""€""#),
            (b"\xf0\x9f\x92\x96", r#""💖""#),
            (b"\xc3\xa4\xc3\xb6\xc3\xbc", r#""äöü""#),
            (b"abcd\xc3\xa4\xc3\xb6\xc3\xbcefgh", r#""abcdäöüefgh""#),
            (b"\xf0\x9f\x84\xbb\xf0\x9f\x84\xb0\xf0\x9f\x85\x84\xf0\x9f\x85\x81\xf0\x9f\x84\xb4\xf0\x9f\x84\xbb", r#""🄻🄰🅄🅁🄴🄻""#),
            (b"\xc3\xc3\xa4", r#""%c3ä""#),
            (b"\xf0\xf0\x9f\x92\x96", r#""%f0💖""#),
            (b"\xf0\x9f\xf0\x9f\x92\x96", r#""%f0%9f💖""#),
            (b"\xf0\x9f\x92\xf0\x9f\x92\x96", r#""%f0%9f%92💖""#),
            // illegal surrogate codepoint 0xd800
            (b"\xed\xa0\x80", r#""%ed%a0%80""#),
            // illegal surrogate codepoint 0xd8ff
            (b"\xed\xa3\xbf", r#""%ed%a3%bf""#),
            // illegal surrogate codepoint 0xdfff
            (b"\xed\xbf\xbf", r#""%ed%bf%bf""#),
        ];
        for &(input, expected) in cases {
            assert_eq!(serialized(input), expected);
        }
    }
}
3 changes: 2 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@ pub const VERSION: &str = env!("CARGO_PKG_VERSION");
pub mod coalesce;
pub mod config;
pub mod constants;
pub mod json;
pub mod label_matcher;
pub mod logger;
pub mod parser;
pub mod proc;
#[cfg(all(feature = "procfs", target_os = "linux"))]
pub mod procfs;
pub mod quoted_string;
pub(crate) mod quote;
pub mod rotate;
#[cfg(target_os = "linux")]
pub mod sockaddr;
Expand Down
179 changes: 179 additions & 0 deletions src/quote.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
use std::io::{Result, Write};

/// Lowercase hexadecimal digits, indexed by nibble value.
const HEXDIGITS: &[u8; 16] = b"0123456789abcdef";

/// Adapter that applies backslash-coding according to JSON rules to
/// the bytes written.
///
/// `"` and `\` get a backslash prefix, the control characters with a
/// short form are written as `\b`, `\f`, `\n`, `\r`, `\t`, and every
/// other byte below 32 is written as `\u00XX`. All remaining bytes are
/// forwarded to the wrapped writer unchanged.
pub(crate) struct BackslashEscapeWriter<'a, W>(pub &'a mut W)
where
    W: ?Sized + Write;

impl<'a, W> Write for BackslashEscapeWriter<'a, W>
where
    W: ?Sized + Write,
{
    /// Escapes `buf` and forwards it to the wrapped writer.
    ///
    /// On success the whole buffer has been consumed, so the returned
    /// count is always `buf.len()` (not the number of bytes emitted).
    fn write(&mut self, buf: &[u8]) -> Result<usize> {
        // Scratch buffer for `\u00XX`; only the last two digits ever change.
        let mut unicode_escape = *b"\\u0000";
        // Start of the run of bytes that need no escaping and are still unwritten.
        let mut pending_from = 0;
        for (idx, &byte) in buf.iter().enumerate() {
            let escape: &[u8] = match byte {
                b'"' => b"\\\"",
                b'\\' => b"\\\\",
                0x08 => b"\\b",
                0x0c => b"\\f",
                b'\n' => b"\\n",
                b'\r' => b"\\r",
                b'\t' => b"\\t",
                0x00..=0x1f => {
                    unicode_escape[4] = HEXDIGITS[usize::from(byte >> 4)];
                    unicode_escape[5] = HEXDIGITS[usize::from(byte & 0x0f)];
                    &unicode_escape
                }
                _ => continue,
            };
            // Flush the untouched run preceding this byte, then its escape.
            self.0.write_all(&buf[pending_from..idx])?;
            self.0.write_all(escape)?;
            pending_from = idx + 1;
        }
        self.0.write_all(&buf[pending_from..])?;
        Ok(buf.len())
    }

    /// Flushes the wrapped writer.
    fn flush(&mut self) -> Result<()> {
        self.0.flush()
    }
}

/// Writes `value` to `writer` as `%` followed by its two lowercase
/// hexadecimal digits (high nibble first).
fn write_quoted_byte<W>(writer: &mut W, value: u8) -> Result<()>
where
    W: ?Sized + Write,
{
    let high = HEXDIGITS[usize::from(value >> 4)];
    let low = HEXDIGITS[usize::from(value & 0x0f)];
    writer.write_all(&[b'%', high, low])
}

/// Adapter that applies URI-escaping (except ' ' -> '+') to the bytes written.
///
/// Printable ASCII characters (bytes 32 to 126) other than `%` and `+`
/// are left as-is, and so are well-formed multi-byte UTF-8 sequences.
/// Every other byte, including control characters such as `\b`, `\f`,
/// `\n`, `\r`, `\t`, is written as `%` followed by two lowercase hex
/// digits.
///
/// This is the "inner" encoding of the JSON strings produced by Laurel.
pub(crate) struct URIEscapeWriter<'a, W>(pub &'a mut W)
where
W: ?Sized + Write;

impl<'a, W> Write for URIEscapeWriter<'a, W>
where
W: ?Sized + Write,
{
/// URI-escapes `buf` and forwards the result to the wrapped writer.
///
/// On success the whole buffer has been consumed and `buf.len()` is
/// returned. The decoding state (`utf8state`, `stash`) is local to this
/// call, so a multi-byte UTF-8 sequence split across two calls is treated
/// as incomplete at the end of the first call and percent-encoded.
fn write(&mut self, buf: &[u8]) -> Result<usize> {
// `Some(k)`: inside a multi-byte sequence with `k` continuation bytes
// still expected; `None`: not inside a sequence.
let mut utf8state: Option<u8> = None;
// Bytes of the multi-byte sequence currently being collected
// (lead byte plus at most three continuation bytes).
let mut stash = tinyvec::array_vec!([u8; 4]);
// Start of the run of plain bytes that has not been written yet.
let mut start_unquoted = 0;
for (n, c) in buf.iter().enumerate() {
// The loop lets the `else` branch below fall through and look at the
// same byte again after an aborted sequence has been flushed.
loop {
match utf8state {
None => {
// The control characters in the list are already excluded by
// `*c >= 32`; effectively only `%` and `+` are filtered here.
if *c >= 32
&& *c < 127
&& ![b'%', b'+', b'\x08', b'\x0c', b'\n', b'\r', b'\t'].contains(c)
{
// simple byte, collect to be output as-is.
break;
}
// Flush the pending run of plain bytes before handling this byte.
self.0.write_all(&buf[start_unquoted..n])?;
start_unquoted = n + 1;
// Number of continuation bytes announced by a UTF-8 lead byte
// (110xxxxx, 1110xxxx, 11110xxx). Note that `n` in the guards is
// the byte value and shadows the loop index.
let len = match *c {
n if n & 0b11100000 == 0b11000000 => 1,
n if n & 0b11110000 == 0b11100000 => 2,
n if n & 0b11111000 == 0b11110000 => 3,
_ => {
// simple non-representable byte
write_quoted_byte(self.0, *c)?;
break;
}
};
stash.clear();
stash.push(*c);
utf8state = Some(len);
break;
}
Some(ref mut len) => {
// Continuation bytes have the form 10xxxxxx.
if *c & 0b11000000 == 0b10000000 {
start_unquoted = n + 1;
stash.push(*c);
*len -= 1;
// Complete UTF-8 multi-byte-sequence. Write.
if *len == 0 {
// Sequences of the right shape that `from_utf8` still rejects
// are percent-encoded byte by byte.
match std::str::from_utf8(&stash) {
Ok(_) => self.0.write_all(&stash)?,
_ => stash
.iter()
.try_for_each(|c| write_quoted_byte(self.0, *c))?,
}
utf8state = None;
}
break;
} else {
// Incomplete UTF-8 multi-byte sequence.
// Write and re-evaluate current byte.
stash
.iter()
.try_for_each(|c| write_quoted_byte(self.0, *c))?;
utf8state = None;
}
}
}
}
}
// invalid UTF-8 multi-byte-sequence at end of input.
// Otherwise write out the trailing run of plain bytes.
match utf8state {
Some(_) => stash
.iter()
.try_for_each(|c| write_quoted_byte(self.0, *c))?,
None => self.0.write_all(&buf[start_unquoted..])?,
};
Ok(buf.len())
}
/// Flushes the wrapped writer.
fn flush(&mut self) -> Result<()> {
self.0.flush()
}
}

#[cfg(test)]
mod test {
    use super::URIEscapeWriter;
    use std::io::Write;

    /// Runs `value` through a `URIEscapeWriter` backed by a `Vec` and
    /// returns the escaped output as a `String`.
    fn uri_escaped(value: &[u8]) -> String {
        let mut out = Vec::with_capacity(value.len());
        URIEscapeWriter(&mut out).write(value).unwrap();
        String::from_utf8(out).unwrap()
    }

    /// Checks the escaped output for plain ASCII, `%` / `+`, well-formed
    /// UTF-8 and truncated or interrupted UTF-8 sequences.
    #[test]
    fn uri_escape() {
        // (expected output, input bytes)
        let cases: &[(&str, &[u8])] = &[
            (" ", b" "),
            ("asdf", b"asdf"),
            ("%2b", b"+"),
            ("%25", b"%"),
            ("%2b%2b%2b", b"+++"),
            ("%25%25%25", b"%%%"),
            ("%25%2b%25", b"%+%"),
            ("ä", b"\xc3\xa4"),
            ("€", b"\xe2\x82\xac"),
            ("💖", b"\xf0\x9f\x92\x96"),
            ("äöü", b"\xc3\xa4\xc3\xb6\xc3\xbc"),
            ("abcdäöüefgh", b"abcd\xc3\xa4\xc3\xb6\xc3\xbcefgh"),
            ("🄻🄰🅄🅁🄴🄻", b"\xf0\x9f\x84\xbb\xf0\x9f\x84\xb0\xf0\x9f\x85\x84\xf0\x9f\x85\x81\xf0\x9f\x84\xb4\xf0\x9f\x84\xbb"),
            ("%c3ä", b"\xc3\xc3\xa4"),
            ("%f0💖", b"\xf0\xf0\x9f\x92\x96"),
            ("%f0💖%f0", b"\xf0\xf0\x9f\x92\x96\xf0"),
            ("%f0💖asdf", b"\xf0\xf0\x9f\x92\x96asdf"),
            ("%f0%9f💖", b"\xf0\x9f\xf0\x9f\x92\x96"),
            ("%f0%9f%92💖", b"\xf0\x9f\x92\xf0\x9f\x92\x96"),
            // This will probably need some corner cases.
        ];
        for &(expected, input) in cases {
            assert_eq!(expected, uri_escaped(input));
        }
    }
}
Loading

0 comments on commit 2776abc

Please sign in to comment.