diff --git a/src/bin/laurel/main.rs b/src/bin/laurel/main.rs index 7320652..a416861 100644 --- a/src/bin/laurel/main.rs +++ b/src/bin/laurel/main.rs @@ -92,7 +92,7 @@ impl Logger { if let Some(prefix) = &self.prefix { self.output.write_all(prefix.as_bytes()).unwrap(); } - serde_json::to_writer(&mut self.output, &message).unwrap(); + laurel::json::to_writer(&mut self.output, &message).unwrap(); self.output.write_all(b"\n").unwrap(); self.output.flush().unwrap(); } diff --git a/src/coalesce.rs b/src/coalesce.rs index 6c30ed8..8a14164 100644 --- a/src/coalesce.rs +++ b/src/coalesce.rs @@ -1018,14 +1018,13 @@ impl Drop for Coalesce<'_, '_> { #[cfg(test)] mod test { use super::*; - use serde_json; use std::cell::RefCell; use std::io::{BufRead, BufReader}; use std::rc::Rc; fn event_to_json(e: &Event) -> String { let mut out = vec![]; - serde_json::to_writer(&mut out, e).unwrap(); + crate::json::to_writer(&mut out, e).unwrap(); String::from_utf8_lossy(&out).to_string() } diff --git a/src/json.rs b/src/json.rs new file mode 100644 index 0000000..99b68c8 --- /dev/null +++ b/src/json.rs @@ -0,0 +1,71 @@ +use std::io::{Result, Write}; + +use crate::quote::*; + +/// A Formatter for serde_json that outputs byte buffers as +/// URI-encoded strings. 
+#[derive(Clone, Debug)] +pub struct SpecialFormatter; + +impl serde_json::ser::Formatter for SpecialFormatter { + fn write_byte_array(&mut self, writer: &mut W, value: &[u8]) -> Result<()> + where + W: ?Sized + Write, + { + self.begin_string(writer)?; + URIEscapeWriter(&mut BackslashEscapeWriter(writer)) + .write(value) + .map(|_| ())?; + self.end_string(writer) + } +} + +pub fn to_writer(writer: W, value: &T) -> serde_json::Result<()> +where + W: Write, + T: ?Sized + serde::Serialize, +{ + let mut ser = serde_json::Serializer::with_formatter(writer, SpecialFormatter); + value.serialize(&mut ser) +} + +#[cfg(test)] +mod test { + use super::to_writer; + use crate::types::Bytes; + + fn serialized(value: &[u8]) -> String { + let mut buf = vec![]; + to_writer(&mut buf, &Bytes(&value)).unwrap(); + String::from_utf8(buf).unwrap() + } + + #[test] + fn json_serialize() { + for (buf, expected) in &[ + (&b" "[..], r#"" ""#), + (&b"asdf"[..], r#""asdf""#), + (&b"+"[..], r#""%2b""#), + (&b"%"[..], r#""%25""#), + (&b"+++"[..], r#""%2b%2b%2b""#), + (&b"%%%"[..], r#""%25%25%25""#), + (&b"%+%"[..], r#""%25%2b%25""#), + (&b"\xc3\xa4"[..], r#""ä""#), + (&b"\xe2\x82\xac"[..], r#""€""#), + (&b"\xf0\x9f\x92\x96"[..], r#""💖""#), + (&b"\xc3\xa4\xc3\xb6\xc3\xbc"[..], r#""äöü""#), + (&b"abcd\xc3\xa4\xc3\xb6\xc3\xbcefgh"[..], r#""abcdäöüefgh""#), + (&b"\xf0\x9f\x84\xbb\xf0\x9f\x84\xb0\xf0\x9f\x85\x84\xf0\x9f\x85\x81\xf0\x9f\x84\xb4\xf0\x9f\x84\xbb"[..], r#""🄻🄰🅄🅁🄴🄻""#), + (&b"\xc3\xc3\xa4"[..], r#""%c3ä""#), + (&b"\xf0\xf0\x9f\x92\x96"[..], r#""%f0💖""#), + (&b"\xf0\x9f\xf0\x9f\x92\x96"[..], r#""%f0%9f💖""#), + (&b"\xf0\x9f\x92\xf0\x9f\x92\x96"[..], r#""%f0%9f%92💖""#), + + (&b"\xed\xa0\x80"[..], r#""%ed%a0%80""#), // illegal surrogate codepoint 0xd800 + (&b"\xed\xa3\xbf"[..], r#""%ed%a3%bf""#), // illegal surrogate codepoint 0xd8ff + (&b"\xed\xbf\xbf"[..], r#""%ed%bf%bf""#), // illegal surrogate codepoint 0xdfff + ] { + assert_eq!(serialized(buf), *expected); + } + } +} diff --git 
a/src/lib.rs b/src/lib.rs index 8731d5b..8ef2057 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3,13 +3,14 @@ pub const VERSION: &str = env!("CARGO_PKG_VERSION"); pub mod coalesce; pub mod config; pub mod constants; +pub mod json; pub mod label_matcher; pub mod logger; pub mod parser; pub mod proc; #[cfg(all(feature = "procfs", target_os = "linux"))] pub mod procfs; -pub mod quoted_string; +pub(crate) mod quote; pub mod rotate; #[cfg(target_os = "linux")] pub mod sockaddr; diff --git a/src/quote.rs b/src/quote.rs new file mode 100644 index 0000000..8f05f50 --- /dev/null +++ b/src/quote.rs @@ -0,0 +1,179 @@ +use std::io::{Result, Write}; + +const HEXDIGITS: &[u8; 16] = b"0123456789abcdef"; + +/// Adapter that applies backslash-coding according to JSON rules to +/// the bytes written. +pub(crate) struct BackslashEscapeWriter<'a, W>(pub &'a mut W) +where + W: ?Sized + Write; + +impl<'a, W> Write for BackslashEscapeWriter<'a, W> +where + W: ?Sized + Write, +{ + fn write(&mut self, buf: &[u8]) -> Result { + let mut quoted = [b'\\', b'u', b'0', b'0', b'0', b'0']; + let mut start_unquoted = 0; + for (n, c) in buf.iter().enumerate() { + let quoted = match c { + b'"' => &br#"\""#[..], + b'\\' => &br#"\\"#[..], + b'\x08' => &br#"\b"#[..], + b'\x0c' => &br#"\f"#[..], + b'\n' => &br#"\n"#[..], + b'\r' => &br#"\r"#[..], + b'\t' => &br#"\t"#[..], + c if *c < 32 => { + quoted[4] = HEXDIGITS[((*c & 0xf0) >> 4) as usize]; + quoted[5] = HEXDIGITS[(*c & 0x0f) as usize]; + &quoted + } + _ => continue, + }; + self.0.write_all(&buf[start_unquoted..n])?; + self.0.write_all(quoted)?; + start_unquoted = n + 1; + } + self.0.write_all(&buf[start_unquoted..])?; + Ok(buf.len()) + } + fn flush(&mut self) -> Result<()> { + self.0.flush() + } +} + +fn write_quoted_byte(writer: &mut W, value: u8) -> Result<()> +where + W: ?Sized + Write, +{ + let value = value as usize; + writer.write_all(&[b'%', HEXDIGITS[value >> 4], HEXDIGITS[value & 0x0f]]) +} + +/// Adapter that applies URI-escaping (except ' ' -> 
'+') to the bytes written. +/// +/// Printable ASCII characters except `%`, `+`, and `\b`, `\f`, `\n`, +/// `\r`, `\t` are left as-is. +/// +/// This is the "inner" encoding of the JSON strings produced by Laurel. +pub(crate) struct URIEscapeWriter<'a, W>(pub &'a mut W) +where + W: ?Sized + Write; + +impl<'a, W> Write for URIEscapeWriter<'a, W> +where + W: ?Sized + Write, +{ + fn write(&mut self, buf: &[u8]) -> Result { + let mut utf8state: Option = None; + let mut stash = tinyvec::array_vec!([u8; 4]); + let mut start_unquoted = 0; + for (n, c) in buf.iter().enumerate() { + loop { + match utf8state { + None => { + if *c >= 32 + && *c < 127 + && ![b'%', b'+', b'\x08', b'\x0c', b'\n', b'\r', b'\t'].contains(c) + { + // simple byte, collect to be output as-is. + break; + } + self.0.write_all(&buf[start_unquoted..n])?; + start_unquoted = n + 1; + let len = match *c { + n if n & 0b11100000 == 0b11000000 => 1, + n if n & 0b11110000 == 0b11100000 => 2, + n if n & 0b11111000 == 0b11110000 => 3, + _ => { + // simple non-representable byte + write_quoted_byte(self.0, *c)?; + break; + } + }; + stash.clear(); + stash.push(*c); + utf8state = Some(len); + break; + } + Some(ref mut len) => { + if *c & 0b11000000 == 0b10000000 { + start_unquoted = n + 1; + stash.push(*c); + *len -= 1; + // Complete UTF-8 multi-byte-sequence. Write. + if *len == 0 { + match std::str::from_utf8(&stash) { + Ok(_) => self.0.write_all(&stash)?, + _ => stash + .iter() + .try_for_each(|c| write_quoted_byte(self.0, *c))?, + } + utf8state = None; + } + break; + } else { + // Incomplete UTF-8 multi-byte sequence. + // Write and re-evaluate current byte. + stash + .iter() + .try_for_each(|c| write_quoted_byte(self.0, *c))?; + utf8state = None; + } + } + } + } + } + // invalid UTF-8 multi-byte-sequence at end of input. 
+ match utf8state { + Some(_) => stash + .iter() + .try_for_each(|c| write_quoted_byte(self.0, *c))?, + None => self.0.write_all(&buf[start_unquoted..])?, + }; + Ok(buf.len()) + } + fn flush(&mut self) -> Result<()> { + self.0.flush() + } +} + +#[cfg(test)] +mod test { + use super::URIEscapeWriter; + use std::io::Write; + + fn uri_escaped(value: &[u8]) -> String { + let mut buf = Vec::with_capacity(value.len()); + URIEscapeWriter(&mut buf).write(&value).unwrap(); + String::from_utf8(buf).unwrap() + } + + #[test] + fn uri_escape() { + assert_eq!(" ", uri_escaped(b" ")); + assert_eq!("asdf", uri_escaped(b"asdf")); + assert_eq!("%2b", uri_escaped(b"+")); + assert_eq!("%25", uri_escaped(b"%")); + assert_eq!("%2b%2b%2b", uri_escaped(b"+++")); + assert_eq!("%25%25%25", uri_escaped(b"%%%")); + assert_eq!("%25%2b%25", uri_escaped(b"%+%")); + assert_eq!("ä", uri_escaped(b"\xc3\xa4")); + assert_eq!("€", uri_escaped(b"\xe2\x82\xac")); + assert_eq!("💖", uri_escaped(b"\xf0\x9f\x92\x96")); + assert_eq!("äöü", uri_escaped(b"\xc3\xa4\xc3\xb6\xc3\xbc")); + assert_eq!( + "abcdäöüefgh", + uri_escaped(b"abcd\xc3\xa4\xc3\xb6\xc3\xbcefgh") + ); + assert_eq!("🄻🄰🅄🅁🄴🄻", uri_escaped(b"\xf0\x9f\x84\xbb\xf0\x9f\x84\xb0\xf0\x9f\x85\x84\xf0\x9f\x85\x81\xf0\x9f\x84\xb4\xf0\x9f\x84\xbb")); + assert_eq!("%c3ä", uri_escaped(b"\xc3\xc3\xa4")); + assert_eq!("%f0💖", uri_escaped(b"\xf0\xf0\x9f\x92\x96")); + assert_eq!("%f0💖%f0", uri_escaped(b"\xf0\xf0\x9f\x92\x96\xf0")); + assert_eq!("%f0💖asdf", uri_escaped(b"\xf0\xf0\x9f\x92\x96asdf")); + assert_eq!("%f0%9f💖", uri_escaped(b"\xf0\x9f\xf0\x9f\x92\x96")); + assert_eq!("%f0%9f%92💖", uri_escaped(b"\xf0\x9f\x92\xf0\x9f\x92\x96")); + // This will probably need some corner cases. 
+ } +} diff --git a/src/quoted_string.rs b/src/quoted_string.rs deleted file mode 100644 index 075f46c..0000000 --- a/src/quoted_string.rs +++ /dev/null @@ -1,109 +0,0 @@ -use std::str; - -/// Format byte sequence as a string that is suitable for serializing -/// to the audit log -pub(crate) trait ToQuotedString { - fn to_quoted_string(&self) -> String; -} - -const HEXDIGITS: &[u8; 16] = b"0123456789abcdef"; - -fn push_byte_quoted(sb: &mut Vec, byte: u8) { - let byte = byte as usize; - // safety: We have created a 3 byte ASCII string, i.e. valid Unicode. - sb.extend(&[b'%', HEXDIGITS[byte >> 4], HEXDIGITS[byte & 15]]); -} - -impl ToQuotedString for [u8] { - fn to_quoted_string(self: &[u8]) -> String { - let mut sb: Vec = Vec::with_capacity(self.len()); - // Are we currently inside a UTF-8 multibyte sequence? - let mut utf8state: Option = None; - let mut bytes = Vec::with_capacity(3); - for c in self { - loop { - match utf8state { - None => { - let len: u8 = if *c >= 32 && *c < 127 && *c != b'%' && *c != b'+' { - // simple byte, psuh as-is. - sb.push(*c); - break; - } else if *c & 0b11100000 == 0b11000000 { - 1 - } else if *c & 0b11110000 == 0b11100000 { - 2 - } else if *c & 0b11111000 == 0b11110000 { - 3 - } else { - // simple non-representable byte - push_byte_quoted(&mut sb, *c); - break; - }; - bytes.clear(); - bytes.push(*c); - utf8state = Some(len); - break; - } - Some(ref mut len) => { - if *c & 0b11000000 == 0b10000000 { - bytes.push(*c); - *len -= 1; - if *len == 0 { - match str::from_utf8(&bytes) { - Ok(s) => sb.extend(s.bytes()), - _ => bytes.iter().for_each(|c| push_byte_quoted(&mut sb, *c)), - } - utf8state = None; - } - break; - } else { - // incomplete UTF-8 multi-byte sequence, - // output collected bytes. 
- bytes.iter().for_each(|c| push_byte_quoted(&mut sb, *c)); - utf8state = None; - } - } - } - } - } - if utf8state.is_some() { - bytes.iter().for_each(|c| push_byte_quoted(&mut sb, *c)); - } - // safety: We have verified that individual bytes and byte - // sequences that were added were valid UTF-8 characters or - // character sequences. - unsafe { String::from_utf8_unchecked(sb) } - } -} - -#[cfg(test)] -mod test { - use super::ToQuotedString; - #[test] - fn to_quoted_string() { - assert_eq!(" ", b" ".to_quoted_string()); - assert_eq!("asdf", b"asdf".to_quoted_string()); - assert_eq!("%2b", b"+".to_quoted_string()); - assert_eq!("%25", b"%".to_quoted_string()); - assert_eq!("%2b%2b%2b", b"+++".to_quoted_string()); - assert_eq!("%25%25%25", b"%%%".to_quoted_string()); - assert_eq!("%25%2b%25", b"%+%".to_quoted_string()); - assert_eq!("ä", b"\xc3\xa4".to_quoted_string()); - assert_eq!("€", b"\xe2\x82\xac".to_quoted_string()); - assert_eq!("💖", b"\xf0\x9f\x92\x96".to_quoted_string()); - assert_eq!("äöü", b"\xc3\xa4\xc3\xb6\xc3\xbc".to_quoted_string()); - assert_eq!( - "abcdäöüefgh", - b"abcd\xc3\xa4\xc3\xb6\xc3\xbcefgh".to_quoted_string() - ); - assert_eq!("🄻🄰🅄🅁🄴🄻", b"\xf0\x9f\x84\xbb\xf0\x9f\x84\xb0\xf0\x9f\x85\x84\xf0\x9f\x85\x81\xf0\x9f\x84\xb4\xf0\x9f\x84\xbb".to_quoted_string()); - assert_eq!("%c3ä", b"\xc3\xc3\xa4".to_quoted_string()); - assert_eq!("%f0💖", b"\xf0\xf0\x9f\x92\x96".to_quoted_string()); - assert_eq!("%f0%9f💖", b"\xf0\x9f\xf0\x9f\x92\x96".to_quoted_string()); - assert_eq!( - "%f0%9f%92💖", - b"\xf0\x9f\x92\xf0\x9f\x92\x96".to_quoted_string() - ); - // This will probably need some corner cases. - } -} diff --git a/src/types.rs b/src/types.rs index a0eb3c1..463eceb 100644 --- a/src/types.rs +++ b/src/types.rs @@ -11,7 +11,6 @@ use serde::ser::SerializeMap; use serde::{Serialize, Serializer}; use crate::constants::*; -use crate::quoted_string::ToQuotedString; /// Collect records in [`EventBody`] context as single or multiple /// instances. 
@@ -71,7 +70,7 @@ impl Serialize for Event<'_> { map.serialize_value(&self.id)?; if let Some(node) = &self.node { map.serialize_key("NODE")?; - map.serialize_value(&node.as_slice().to_quoted_string())?; + map.serialize_value(&node)?; } for (k, v) in &self.body { map.serialize_entry(&k, &v)?; @@ -775,21 +774,21 @@ impl Serialize for Value<'_> { fn serialize(&self, s: S) -> Result { match self { Value::Empty => s.serialize_none(), - Value::Str(r, q) => { - let (q1, q2) = if let Quote::Braces = q { - ("{", "}") - } else { - ("", "") - }; - s.collect_str(&format_args!("{}{}{}", q1, r.to_quoted_string(), q2)) + Value::Str(r, Quote::Braces) => { + let mut buf = Vec::with_capacity(r.len() + 2); + buf.push(b'{'); + buf.extend(*r); + buf.push(b'}'); + s.serialize_bytes(&buf) } + Value::Str(r, _) => s.serialize_bytes(r), Value::Segments(segs) => { let l = segs.iter().map(|r| r.len()).sum(); - let mut sb = String::with_capacity(l); + let mut buf = Vec::with_capacity(l); for seg in segs { - sb.push_str(&seg.to_quoted_string()); + buf.extend(*seg); } - s.collect_str(&sb) + s.serialize_bytes(&buf) } Value::List(vs) => s.collect_seq(vs.iter()), Value::StringifiedList(vs) => { @@ -809,25 +808,10 @@ impl Serialize for Value<'_> { buf.extend(v.clone().try_into().unwrap_or_else(|_| vec![b'x'])); } } - s.serialize_str(&buf.to_quoted_string()) + s.serialize_bytes(&buf) } Value::Number(n) => n.serialize(s), - Value::Map(vs) => { - let mut map = s.serialize_map(Some(vs.len()))?; - for (k, v) in vs { - match k { - Key::Name(n) => map.serialize_key(&n.as_slice().to_quoted_string())?, - Key::Literal(n) => map.serialize_key(n)?, - _ => todo!(), - } - match v { - Value::Str(r, _q) => map.serialize_value(&r.to_quoted_string())?, - Value::Number(n) => map.serialize_value(&n)?, - _ => todo!(), - } - } - map.end() - } + Value::Map(vs) => s.collect_map(vs.iter().cloned()), Value::Skipped((args, bytes)) => { let mut map = s.serialize_map(Some(2))?; map.serialize_entry("skipped_args", args)?; @@ 
-835,7 +819,7 @@ impl Serialize for Value<'_> { map.end() } Value::Literal(v) => s.collect_str(v), - Value::Owned(v) => s.collect_str(&v.to_quoted_string()), + Value::Owned(v) => Bytes(v).serialize(s), } } } @@ -913,3 +897,12 @@ impl Offset for Range { } } } + +/// Helper type to enforce that serialize_bytes() is used in serialization. +pub(crate) struct Bytes<'a>(pub &'a [u8]); + +impl<'a> Serialize for Bytes<'a> { + fn serialize(&self, s: S) -> Result { + s.serialize_bytes(self.0) + } +}