Skip to content

Decouple serialization and JSON log format more clearly #202

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/bin/laurel/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ impl Logger {
if let Some(prefix) = &self.prefix {
self.output.write_all(prefix.as_bytes()).unwrap();
}
serde_json::to_writer(&mut self.output, &message).unwrap();
laurel::json::to_writer(&mut self.output, &message).unwrap();
self.output.write_all(b"\n").unwrap();
self.output.flush().unwrap();
}
Expand Down
3 changes: 1 addition & 2 deletions src/coalesce.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1018,14 +1018,13 @@ impl Drop for Coalesce<'_, '_> {
#[cfg(test)]
mod test {
use super::*;
use serde_json;
use std::cell::RefCell;
use std::io::{BufRead, BufReader};
use std::rc::Rc;

fn event_to_json(e: &Event) -> String {
let mut out = vec![];
serde_json::to_writer(&mut out, e).unwrap();
crate::json::to_writer(&mut out, e).unwrap();
String::from_utf8_lossy(&out).to_string()
}

Expand Down
71 changes: 71 additions & 0 deletions src/json.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
use std::io::{Result, Write};

use crate::quote::*;

/// A Formatter for serde_json that outputs byte buffers as
/// URI-encoded strings.
///
/// Only the byte-array output is overridden; the escaped bytes are
/// additionally backslash-escaped so that they form a JSON string.
#[derive(Clone, Debug)]
pub struct SpecialFormatter;

impl serde_json::ser::Formatter for SpecialFormatter {
    /// Writes `value` to `writer` as a single JSON string.
    ///
    /// The bytes are first URI-escaped by [`URIEscapeWriter`], and the
    /// result is then backslash-escaped by [`BackslashEscapeWriter`], all
    /// wrapped between the formatter's string delimiters.
    ///
    /// # Errors
    ///
    /// Returns any I/O error reported by `writer`.
    fn write_byte_array<W>(&mut self, writer: &mut W, value: &[u8]) -> Result<()>
    where
        W: ?Sized + Write,
    {
        self.begin_string(writer)?;
        // `write_all` rather than `write`: a bare `write` is allowed to
        // consume only part of the buffer, and the returned count was
        // previously discarded.
        URIEscapeWriter(&mut BackslashEscapeWriter(writer)).write_all(value)?;
        self.end_string(writer)
    }
}

/// Serializes `value` as JSON into `writer`, using [`SpecialFormatter`]
/// as the output formatter.
///
/// # Errors
///
/// Returns the `serde_json` error produced if serialization or the
/// underlying write fails.
pub fn to_writer<W, T>(writer: W, value: &T) -> serde_json::Result<()>
where
    W: Write,
    T: ?Sized + serde::Serialize,
{
    let mut serializer = serde_json::Serializer::with_formatter(writer, SpecialFormatter);
    serde::Serialize::serialize(value, &mut serializer)
}

#[cfg(test)]
mod test {
    use super::to_writer;
    use crate::types::Bytes;

    /// Serializes `raw` wrapped in [`Bytes`] and returns the JSON text.
    fn serialized(raw: &[u8]) -> String {
        let mut json = Vec::new();
        to_writer(&mut json, &Bytes(&raw)).unwrap();
        String::from_utf8(json).unwrap()
    }

    #[test]
    fn json_serialize() {
        // (input bytes, expected JSON string including the surrounding quotes)
        let cases: &[(&[u8], &str)] = &[
            // Plain ASCII and the two ASCII characters that get percent-encoded.
            (b" ", r#"" ""#),
            (b"asdf", r#""asdf""#),
            (b"+", r#""%2b""#),
            (b"%", r#""%25""#),
            (b"+++", r#""%2b%2b%2b""#),
            (b"%%%", r#""%25%25%25""#),
            (b"%+%", r#""%25%2b%25""#),
            // Valid UTF-8 multi-byte sequences are passed through.
            (b"\xc3\xa4", r#""ä""#),
            (b"\xe2\x82\xac", r#""€""#),
            (b"\xf0\x9f\x92\x96", r#""💖""#),
            (b"\xc3\xa4\xc3\xb6\xc3\xbc", r#""äöü""#),
            (b"abcd\xc3\xa4\xc3\xb6\xc3\xbcefgh", r#""abcdäöüefgh""#),
            (b"\xf0\x9f\x84\xbb\xf0\x9f\x84\xb0\xf0\x9f\x85\x84\xf0\x9f\x85\x81\xf0\x9f\x84\xb4\xf0\x9f\x84\xbb", r#""🄻🄰🅄🅁🄴🄻""#),
            // Truncated sequences followed by a valid one.
            (b"\xc3\xc3\xa4", r#""%c3ä""#),
            (b"\xf0\xf0\x9f\x92\x96", r#""%f0💖""#),
            (b"\xf0\x9f\xf0\x9f\x92\x96", r#""%f0%9f💖""#),
            (b"\xf0\x9f\x92\xf0\x9f\x92\x96", r#""%f0%9f%92💖""#),
            (b"\xed\xa0\x80", r#""%ed%a0%80""#), // illegal surrogate codepoint 0xd800
            (b"\xed\xa3\xbf", r#""%ed%a3%bf""#), // illegal surrogate codepoint 0xd8ff
            (b"\xed\xbf\xbf", r#""%ed%bf%bf""#), // illegal surrogate codepoint 0xdfff
        ];
        for (input, expected) in cases {
            assert_eq!(serialized(input), *expected);
        }
    }
}
3 changes: 2 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@ pub const VERSION: &str = env!("CARGO_PKG_VERSION");
pub mod coalesce;
pub mod config;
pub mod constants;
pub mod json;
pub mod label_matcher;
pub mod logger;
pub mod parser;
pub mod proc;
#[cfg(all(feature = "procfs", target_os = "linux"))]
pub mod procfs;
pub mod quoted_string;
pub(crate) mod quote;
pub mod rotate;
#[cfg(target_os = "linux")]
pub mod sockaddr;
Expand Down
179 changes: 179 additions & 0 deletions src/quote.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
use std::io::{Result, Write};

/// Lowercase hexadecimal digits, indexed by nibble value.
const HEXDIGITS: &[u8; 16] = b"0123456789abcdef";

/// Adapter that applies backslash-coding according to JSON rules to
/// the bytes written.
///
/// `"` and `\` get a backslash prefix, the control characters that have
/// a short form are written as `\b`, `\f`, `\n`, `\r`, `\t`, and every
/// other byte below 0x20 is written as `\u00XX`. All remaining bytes are
/// forwarded unchanged.
pub(crate) struct BackslashEscapeWriter<'a, W>(pub &'a mut W)
where
    W: ?Sized + Write;

impl<'a, W> Write for BackslashEscapeWriter<'a, W>
where
    W: ?Sized + Write,
{
    /// Escapes all of `buf` and forwards it to the wrapped writer.
    ///
    /// On success the whole buffer has been consumed, so the returned
    /// count is always `buf.len()`.
    fn write(&mut self, buf: &[u8]) -> Result<usize> {
        // Index of the first byte that has not been forwarded yet.
        let mut pending_from = 0;
        for (idx, &byte) in buf.iter().enumerate() {
            // Scratch space for the generic `\u00XX` form.
            let unicode_escape: [u8; 6];
            let escape: &[u8] = match byte {
                b'"' => br#"\""#,
                b'\\' => br#"\\"#,
                0x08 => br#"\b"#,
                0x0c => br#"\f"#,
                b'\n' => br#"\n"#,
                b'\r' => br#"\r"#,
                b'\t' => br#"\t"#,
                0x00..=0x1f => {
                    unicode_escape = [
                        b'\\',
                        b'u',
                        b'0',
                        b'0',
                        HEXDIGITS[usize::from(byte >> 4)],
                        HEXDIGITS[usize::from(byte & 0x0f)],
                    ];
                    &unicode_escape
                }
                // Nothing to escape; the byte stays in the pending run.
                _ => continue,
            };
            // Flush the untouched run before the escape sequence itself.
            self.0.write_all(&buf[pending_from..idx])?;
            self.0.write_all(escape)?;
            pending_from = idx + 1;
        }
        self.0.write_all(&buf[pending_from..])?;
        Ok(buf.len())
    }

    /// Flushes the wrapped writer.
    fn flush(&mut self) -> Result<()> {
        self.0.flush()
    }
}

/// Writes `value` to `writer` as a percent sign followed by its two
/// lowercase hexadecimal digits, e.g. `%2b`.
fn write_quoted_byte<W>(writer: &mut W, value: u8) -> Result<()>
where
    W: ?Sized + Write,
{
    let high = HEXDIGITS[usize::from(value >> 4)];
    let low = HEXDIGITS[usize::from(value & 0x0f)];
    writer.write_all(&[b'%', high, low])
}

/// Adapter that applies URI-escaping to the bytes written. Unlike
/// form-style URI encoding, a space is kept as-is and not turned into `+`.
///
/// Bytes in the printable ASCII range (0x20 to 0x7e) other than `%` and
/// `+` are left as-is, and so are valid UTF-8 multi-byte sequences.
/// Every other byte, including control characters such as `\b`, `\f`,
/// `\n`, `\r`, `\t`, is written as `%xx` with lowercase hex digits.
///
/// This is the "inner" encoding of the JSON strings produced by Laurel.
pub(crate) struct URIEscapeWriter<'a, W>(pub &'a mut W)
where
W: ?Sized + Write;

impl<'a, W> Write for URIEscapeWriter<'a, W>
where
W: ?Sized + Write,
{
/// Escapes all of `buf` and forwards the result to the wrapped writer.
///
/// On success the returned count is always `buf.len()`. The decoding
/// state is local to one call: a multi-byte sequence that is split
/// across two calls is not recognized and its bytes are percent-encoded.
fn write(&mut self, buf: &[u8]) -> Result<usize> {
// `None`: not inside a multi-byte sequence.
// `Some(k)`: a lead byte has been seen and `k` continuation bytes are
// still expected.
let mut utf8state: Option<u8> = None;
// Bytes of the multi-byte sequence currently being collected
// (one lead byte plus at most three continuation bytes).
let mut stash = tinyvec::array_vec!([u8; 4]);
// Start of the run of plain bytes that has not been forwarded yet.
let mut start_unquoted = 0;
for (n, c) in buf.iter().enumerate() {
// The loop lets the current byte be examined a second time after an
// incomplete sequence has been flushed.
loop {
match utf8state {
None => {
if *c >= 32
&& *c < 127
&& ![b'%', b'+', b'\x08', b'\x0c', b'\n', b'\r', b'\t'].contains(c)
{
// simple byte, collect to be output as-is.
break;
}
// The current byte needs special treatment: flush the plain run
// before it and restart the run after it.
self.0.write_all(&buf[start_unquoted..n])?;
start_unquoted = n + 1;
// Number of continuation bytes announced by a UTF-8 lead byte.
// Note that `n` in the arms below shadows the loop index and
// holds the byte value.
let len = match *c {
n if n & 0b11100000 == 0b11000000 => 1,
n if n & 0b11110000 == 0b11100000 => 2,
n if n & 0b11111000 == 0b11110000 => 3,
_ => {
// simple non-representable byte
write_quoted_byte(self.0, *c)?;
break;
}
};
stash.clear();
stash.push(*c);
utf8state = Some(len);
break;
}
Some(ref mut len) => {
if *c & 0b11000000 == 0b10000000 {
// Continuation byte: it belongs to the stash, not to a plain run.
start_unquoted = n + 1;
stash.push(*c);
*len -= 1;
// Complete UTF-8 multi-byte-sequence. Write.
if *len == 0 {
// The bit patterns alone do not guarantee validity, so the
// collected bytes are validated; invalid ones are
// percent-encoded byte by byte.
match std::str::from_utf8(&stash) {
Ok(_) => self.0.write_all(&stash)?,
_ => stash
.iter()
.try_for_each(|c| write_quoted_byte(self.0, *c))?,
}
utf8state = None;
}
break;
} else {
// Incomplete UTF-8 multi-byte sequence.
// Write and re-evaluate current byte.
stash
.iter()
.try_for_each(|c| write_quoted_byte(self.0, *c))?;
utf8state = None;
}
}
}
}
}
// invalid UTF-8 multi-byte-sequence at end of input.
// Otherwise, forward the trailing run of plain bytes.
match utf8state {
Some(_) => stash
.iter()
.try_for_each(|c| write_quoted_byte(self.0, *c))?,
None => self.0.write_all(&buf[start_unquoted..])?,
};
Ok(buf.len())
}
/// Flushes the wrapped writer.
fn flush(&mut self) -> Result<()> {
self.0.flush()
}
}

#[cfg(test)]
mod test {
    use super::URIEscapeWriter;
    use std::io::Write;

    /// Runs `value` through a [`URIEscapeWriter`] and returns the output.
    fn uri_escaped(value: &[u8]) -> String {
        let mut escaped = Vec::with_capacity(value.len());
        URIEscapeWriter(&mut escaped).write(&value).unwrap();
        String::from_utf8(escaped).unwrap()
    }

    #[test]
    fn uri_escape() {
        // (expected output, input bytes)
        let cases: &[(&str, &[u8])] = &[
            (" ", b" "),
            ("asdf", b"asdf"),
            ("%2b", b"+"),
            ("%25", b"%"),
            ("%2b%2b%2b", b"+++"),
            ("%25%25%25", b"%%%"),
            ("%25%2b%25", b"%+%"),
            ("ä", b"\xc3\xa4"),
            ("€", b"\xe2\x82\xac"),
            ("💖", b"\xf0\x9f\x92\x96"),
            ("äöü", b"\xc3\xa4\xc3\xb6\xc3\xbc"),
            ("abcdäöüefgh", b"abcd\xc3\xa4\xc3\xb6\xc3\xbcefgh"),
            ("🄻🄰🅄🅁🄴🄻", b"\xf0\x9f\x84\xbb\xf0\x9f\x84\xb0\xf0\x9f\x85\x84\xf0\x9f\x85\x81\xf0\x9f\x84\xb4\xf0\x9f\x84\xbb"),
            ("%c3ä", b"\xc3\xc3\xa4"),
            ("%f0💖", b"\xf0\xf0\x9f\x92\x96"),
            ("%f0💖%f0", b"\xf0\xf0\x9f\x92\x96\xf0"),
            ("%f0💖asdf", b"\xf0\xf0\x9f\x92\x96asdf"),
            ("%f0%9f💖", b"\xf0\x9f\xf0\x9f\x92\x96"),
            ("%f0%9f%92💖", b"\xf0\x9f\x92\xf0\x9f\x92\x96"),
            // This will probably need some corner cases.
        ];
        for (expected, input) in cases {
            assert_eq!(*expected, uri_escaped(input));
        }
    }
}
Loading