Decouple serialization and JSON log format more clearly

Add a custom serde_json Formatter with support for byte arrays. SpecialFormatter adds a write_byte_array method that outputs URI-encoded strings. Move percent encoding from QuotedString type to URIEscapeWriter which works as a filter for any type that implements the std::io::Write trait. Where possible, use serialize_bytes(), either directly or via a Bytes helper type.
threathunters-io · Feb 26, 2024 · 2776abc · 2776abc
1 parent 67bac75
commit 2776abc
Show file tree

Hide file tree

Showing 7 changed files with 277 additions and 143 deletions.
diff --git a/src/bin/laurel/main.rs b/src/bin/laurel/main.rs
@@ -92,7 +92,7 @@ impl Logger {
         if let Some(prefix) = &self.prefix {
             self.output.write_all(prefix.as_bytes()).unwrap();
         }
-        serde_json::to_writer(&mut self.output, &message).unwrap();
+        laurel::json::to_writer(&mut self.output, &message).unwrap();
         self.output.write_all(b"\n").unwrap();
         self.output.flush().unwrap();
     }

diff --git a/src/coalesce.rs b/src/coalesce.rs
@@ -1018,14 +1018,13 @@ impl Drop for Coalesce<'_, '_> {
 #[cfg(test)]
 mod test {
     use super::*;
-    use serde_json;
     use std::cell::RefCell;
     use std::io::{BufRead, BufReader};
     use std::rc::Rc;
 
     fn event_to_json(e: &Event) -> String {
         let mut out = vec![];
-        serde_json::to_writer(&mut out, e).unwrap();
+        crate::json::to_writer(&mut out, e).unwrap();
         String::from_utf8_lossy(&out).to_string()
     }
 

diff --git a/src/json.rs b/src/json.rs
@@ -0,0 +1,71 @@
+use std::io::{Result, Write};
+
+use crate::quote::*;
+
+/// A `serde_json::ser::Formatter` that outputs byte buffers as
+/// URI-encoded strings.
+///
+/// Only `write_byte_array` is overridden here; every other formatting
+/// method is left to the trait's own definition.
+#[derive(Clone, Debug)]
+pub struct SpecialFormatter;
+
+impl serde_json::ser::Formatter for SpecialFormatter {
+    /// Writes `value` as a single JSON string.
+    ///
+    /// The bytes are passed through `URIEscapeWriter` and then through
+    /// `BackslashEscapeWriter` before reaching `writer`, and the result is
+    /// enclosed by `begin_string` / `end_string`.
+    ///
+    /// # Errors
+    ///
+    /// Returns any I/O error reported while writing to `writer`.
+    fn write_byte_array<W>(&mut self, writer: &mut W, value: &[u8]) -> Result<()>
+    where
+        W: ?Sized + Write,
+    {
+        self.begin_string(writer)?;
+        // `write_all` rather than `write`: `Write::write` is allowed to accept
+        // only part of the buffer, and the returned count was being discarded.
+        URIEscapeWriter(&mut BackslashEscapeWriter(writer)).write_all(value)?;
+        self.end_string(writer)
+    }
+}
+
+/// Serializes `value` as JSON into `writer`, using [`SpecialFormatter`]
+/// in place of serde_json's default formatter.
+///
+/// # Errors
+///
+/// Returns the `serde_json` error produced by serializing `value`.
+pub fn to_writer<W, T>(writer: W, value: &T) -> serde_json::Result<()>
+where
+    W: Write,
+    T: ?Sized + serde::Serialize,
+{
+    let mut serializer = serde_json::Serializer::with_formatter(writer, SpecialFormatter);
+    serde::Serialize::serialize(value, &mut serializer)
+}
+
+#[cfg(test)]
+mod test {
+    use super::to_writer;
+    use crate::types::Bytes;
+
+    /// Serializes `value`, wrapped in `Bytes`, with `to_writer` and returns
+    /// the output as a `String`.
+    fn serialized(value: &[u8]) -> String {
+        let mut buf = vec![];
+        to_writer(&mut buf, &Bytes(&value)).unwrap();
+        String::from_utf8(buf).unwrap()
+    }
+
+    /// Checks a table of `(input bytes, expected JSON text)` pairs. The
+    /// expected values include the surrounding double quotes.
+    #[test]
+    fn json_serialize() {
+        for (buf, expected) in &[
+            // Plain ASCII, plus the two printable characters that get percent-encoded.
+            (&b" "[..], r#"" ""#),
+            (&b"asdf"[..], r#""asdf""#),
+            (&b"+"[..], r#""%2b""#),
+            (&b"%"[..], r#""%25""#),
+            (&b"+++"[..], r#""%2b%2b%2b""#),
+            (&b"%%%"[..], r#""%25%25%25""#),
+            (&b"%+%"[..], r#""%25%2b%25""#),
+            // Valid UTF-8 multi-byte sequences are expected to pass through unchanged.
+            (&b"\xc3\xa4"[..], r#""ä""#),
+            (&b"\xe2\x82\xac"[..], r#""€""#),
+            (&b"\xf0\x9f\x92\x96"[..], r#""💖""#),
+            (&b"\xc3\xa4\xc3\xb6\xc3\xbc"[..], r#""äöü""#),
+            (&b"abcd\xc3\xa4\xc3\xb6\xc3\xbcefgh"[..], r#""abcdäöüefgh""#),
+            (&b"\xf0\x9f\x84\xbb\xf0\x9f\x84\xb0\xf0\x9f\x85\x84\xf0\x9f\x85\x81\xf0\x9f\x84\xb4\xf0\x9f\x84\xbb"[..], r#""🄻🄰🅄🅁🄴🄻""#),
+            // Truncated sequences followed by a valid one: only the stray
+            // bytes are expected to be percent-encoded.
+            (&b"\xc3\xc3\xa4"[..], r#""%c3ä""#),
+            (&b"\xf0\xf0\x9f\x92\x96"[..], r#""%f0💖""#),
+            (&b"\xf0\x9f\xf0\x9f\x92\x96"[..], r#""%f0%9f💖""#),
+            (&b"\xf0\x9f\x92\xf0\x9f\x92\x96"[..], r#""%f0%9f%92💖""#),
+
+            (&b"\xed\xa0\x80"[..], r#""%ed%a0%80""#), // illegal surrogate codepoint 0xd800
+            (&b"\xed\xa3\xbf"[..], r#""%ed%a3%bf""#), // illegal surrogate codepoint 0xd8ff
+            (&b"\xed\xbf\xbf"[..], r#""%ed%bf%bf""#), // illegal surrogate codepoint 0xdfff
+        ] {
+            assert_eq!(serialized(buf), *expected);
+        }
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
@@ -3,13 +3,14 @@ pub const VERSION: &str = env!("CARGO_PKG_VERSION");
 pub mod coalesce;
 pub mod config;
 pub mod constants;
+pub mod json;
 pub mod label_matcher;
 pub mod logger;
 pub mod parser;
 pub mod proc;
 #[cfg(all(feature = "procfs", target_os = "linux"))]
 pub mod procfs;
-pub mod quoted_string;
+pub(crate) mod quote;
 pub mod rotate;
 #[cfg(target_os = "linux")]
 pub mod sockaddr;

diff --git a/src/quote.rs b/src/quote.rs
@@ -0,0 +1,179 @@
+use std::io::{Result, Write};
+
+/// Lowercase hexadecimal digits, indexed by nibble value (0 to 15).
+const HEXDIGITS: &[u8; 16] = b"0123456789abcdef";
+
+/// Adapter that applies backslash-coding according to JSON rules to
+/// the bytes written.
+///
+/// `"` and `\` get a backslash prefix, the control characters with short
+/// forms become `\b`, `\f`, `\n`, `\r`, `\t`, and every other byte below
+/// 0x20 becomes `\u00XX`. All remaining bytes are forwarded unchanged.
+pub(crate) struct BackslashEscapeWriter<'a, W>(pub &'a mut W)
+where
+    W: ?Sized + Write;
+
+impl<W> Write for BackslashEscapeWriter<'_, W>
+where
+    W: ?Sized + Write,
+{
+    /// Writes the escaped form of `buf` to the wrapped writer.
+    ///
+    /// The whole buffer is always consumed, so on success the return value
+    /// is `buf.len()` (the input length, not the number of bytes emitted).
+    fn write(&mut self, buf: &[u8]) -> Result<usize> {
+        // Scratch space for `\u00XX`; only the two trailing digits change.
+        let mut unicode_escape = *b"\\u0000";
+        // Start of the run of bytes that can be copied verbatim.
+        let mut pending = 0;
+        for (idx, &byte) in buf.iter().enumerate() {
+            let escape: &[u8] = match byte {
+                b'"' => b"\\\"",
+                b'\\' => b"\\\\",
+                0x08 => b"\\b",
+                0x0c => b"\\f",
+                b'\n' => b"\\n",
+                b'\r' => b"\\r",
+                b'\t' => b"\\t",
+                0x00..=0x1f => {
+                    unicode_escape[4] = HEXDIGITS[usize::from(byte >> 4)];
+                    unicode_escape[5] = HEXDIGITS[usize::from(byte & 0x0f)];
+                    &unicode_escape
+                }
+                _ => continue,
+            };
+            // Flush the verbatim run preceding this byte, then its escape.
+            self.0.write_all(&buf[pending..idx])?;
+            self.0.write_all(escape)?;
+            pending = idx + 1;
+        }
+        self.0.write_all(&buf[pending..])?;
+        Ok(buf.len())
+    }
+
+    /// Flushes the wrapped writer.
+    fn flush(&mut self) -> Result<()> {
+        self.0.flush()
+    }
+}
+
+/// Writes `value` to `writer` as `%` followed by two lowercase hex digits.
+fn write_quoted_byte<W>(writer: &mut W, value: u8) -> Result<()>
+where
+    W: ?Sized + Write,
+{
+    let high = HEXDIGITS[usize::from(value >> 4)];
+    let low = HEXDIGITS[usize::from(value & 0x0f)];
+    writer.write_all(&[b'%', high, low])
+}
+
+/// Adapter that applies URI-escaping (except ' ' -> '+') to the bytes written.
+///
+/// Printable ASCII characters except `%`, `+`, and `\b`, `\f`, `\n`,
+/// `\r`, `\t` are left as-is. Multi-byte sequences that form valid UTF-8
+/// are also passed through unchanged; every other byte is written as
+/// `%xx` with lowercase hex digits.
+///
+/// This is the "inner" encoding of the JSON strings produced by Laurel.
+///
+/// Decoder state is local to each `write` call: a multi-byte sequence that
+/// is split across two calls is percent-encoded rather than passed through.
+pub(crate) struct URIEscapeWriter<'a, W>(pub &'a mut W)
+where
+    W: ?Sized + Write;
+
+impl<'a, W> Write for URIEscapeWriter<'a, W>
+where
+    W: ?Sized + Write,
+{
+    /// Writes the escaped form of `buf` to the wrapped writer.
+    ///
+    /// The whole buffer is always consumed, so on success the return value
+    /// is `buf.len()` (the input length, not the number of bytes emitted).
+    fn write(&mut self, buf: &[u8]) -> Result<usize> {
+        // `Some(n)` while inside a multi-byte sequence that still expects
+        // `n` continuation bytes; `None` otherwise.
+        let mut utf8state: Option<u8> = None;
+        // Bytes of the multi-byte sequence collected so far (at most 4).
+        let mut stash = tinyvec::array_vec!([u8; 4]);
+        // Start of the run of simple bytes not yet copied to the output.
+        let mut start_unquoted = 0;
+        for (n, c) in buf.iter().enumerate() {
+            // The loop lets the current byte be evaluated a second time after
+            // an incomplete sequence has been flushed.
+            loop {
+                match utf8state {
+                    None => {
+                        if *c >= 32
+                            && *c < 127
+                            && ![b'%', b'+', b'\x08', b'\x0c', b'\n', b'\r', b'\t'].contains(c)
+                        {
+                            // simple byte, collect to be output as-is.
+                            break;
+                        }
+                        // Flush the pending run of simple bytes before
+                        // handling this byte specially.
+                        self.0.write_all(&buf[start_unquoted..n])?;
+                        start_unquoted = n + 1;
+                        // Number of continuation bytes announced by a UTF-8
+                        // lead byte (110xxxxx, 1110xxxx, 11110xxx).
+                        let len = match *c {
+                            n if n & 0b11100000 == 0b11000000 => 1,
+                            n if n & 0b11110000 == 0b11100000 => 2,
+                            n if n & 0b11111000 == 0b11110000 => 3,
+                            _ => {
+                                // simple non-representable byte
+                                write_quoted_byte(self.0, *c)?;
+                                break;
+                            }
+                        };
+                        stash.clear();
+                        stash.push(*c);
+                        utf8state = Some(len);
+                        break;
+                    }
+                    Some(ref mut len) => {
+                        // Continuation bytes have the form 10xxxxxx.
+                        if *c & 0b11000000 == 0b10000000 {
+                            start_unquoted = n + 1;
+                            stash.push(*c);
+                            *len -= 1;
+                            // Complete UTF-8 multi-byte-sequence. Write.
+                            if *len == 0 {
+                                // A sequence of the right length can still be
+                                // rejected by `from_utf8`; it is then
+                                // percent-encoded byte by byte.
+                                match std::str::from_utf8(&stash) {
+                                    Ok(_) => self.0.write_all(&stash)?,
+                                    _ => stash
+                                        .iter()
+                                        .try_for_each(|c| write_quoted_byte(self.0, *c))?,
+                                }
+                                utf8state = None;
+                            }
+                            break;
+                        } else {
+                            // Incomplete UTF-8 multi-byte sequence.
+                            // Write and re-evaluate current byte.
+                            stash
+                                .iter()
+                                .try_for_each(|c| write_quoted_byte(self.0, *c))?;
+                            utf8state = None;
+                        }
+                    }
+                }
+            }
+        }
+        // Incomplete UTF-8 multi-byte sequence at end of input is
+        // percent-encoded; otherwise the trailing run of simple bytes is
+        // copied as-is.
+        match utf8state {
+            Some(_) => stash
+                .iter()
+                .try_for_each(|c| write_quoted_byte(self.0, *c))?,
+            None => self.0.write_all(&buf[start_unquoted..])?,
+        };
+        Ok(buf.len())
+    }
+    /// Flushes the wrapped writer.
+    fn flush(&mut self) -> Result<()> {
+        self.0.flush()
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::URIEscapeWriter;
+    use std::io::Write;
+
+    /// Runs `value` through `URIEscapeWriter` into a `Vec` and returns the
+    /// result as a `String`.
+    fn uri_escaped(value: &[u8]) -> String {
+        let mut buf = Vec::with_capacity(value.len());
+        URIEscapeWriter(&mut buf).write(&value).unwrap();
+        String::from_utf8(buf).unwrap()
+    }
+
+    #[test]
+    fn uri_escape() {
+        // Plain ASCII, plus the two printable characters that get percent-encoded.
+        assert_eq!(" ", uri_escaped(b" "));
+        assert_eq!("asdf", uri_escaped(b"asdf"));
+        assert_eq!("%2b", uri_escaped(b"+"));
+        assert_eq!("%25", uri_escaped(b"%"));
+        assert_eq!("%2b%2b%2b", uri_escaped(b"+++"));
+        assert_eq!("%25%25%25", uri_escaped(b"%%%"));
+        assert_eq!("%25%2b%25", uri_escaped(b"%+%"));
+        // Valid UTF-8 multi-byte sequences are expected to pass through unchanged.
+        assert_eq!("ä", uri_escaped(b"\xc3\xa4"));
+        assert_eq!("€", uri_escaped(b"\xe2\x82\xac"));
+        assert_eq!("💖", uri_escaped(b"\xf0\x9f\x92\x96"));
+        assert_eq!("äöü", uri_escaped(b"\xc3\xa4\xc3\xb6\xc3\xbc"));
+        assert_eq!(
+            "abcdäöüefgh",
+            uri_escaped(b"abcd\xc3\xa4\xc3\xb6\xc3\xbcefgh")
+        );
+        assert_eq!("🄻🄰🅄🅁🄴🄻", uri_escaped(b"\xf0\x9f\x84\xbb\xf0\x9f\x84\xb0\xf0\x9f\x85\x84\xf0\x9f\x85\x81\xf0\x9f\x84\xb4\xf0\x9f\x84\xbb"));
+        // Truncated sequences before, after, or next to valid data: only the
+        // stray bytes are expected to be percent-encoded.
+        assert_eq!("%c3ä", uri_escaped(b"\xc3\xc3\xa4"));
+        assert_eq!("%f0💖", uri_escaped(b"\xf0\xf0\x9f\x92\x96"));
+        assert_eq!("%f0💖%f0", uri_escaped(b"\xf0\xf0\x9f\x92\x96\xf0"));
+        assert_eq!("%f0💖asdf", uri_escaped(b"\xf0\xf0\x9f\x92\x96asdf"));
+        assert_eq!("%f0%9f💖", uri_escaped(b"\xf0\x9f\xf0\x9f\x92\x96"));
+        assert_eq!("%f0%9f%92💖", uri_escaped(b"\xf0\x9f\x92\xf0\x9f\x92\x96"));
+        // This will probably need some corner cases.
+    }
+}