From 15c27df21b84247dee69633b2255fc0fbdc2c949 Mon Sep 17 00:00:00 2001
From: Julien Cretin <cretin@google.com>
Date: Mon, 13 Nov 2023 22:41:49 +0100
Subject: [PATCH] Add Encoder for concatenated inputs (#89)

Fixes #81
---
 lib/CHANGELOG.md                    |  1 +
 lib/fuzz/Cargo.toml                 |  6 ++
 lib/fuzz/fuzz_targets/encoder.rs    | 21 +++++++
 lib/fuzz/fuzz_targets/round_trip.rs |  5 +-
 lib/fuzz/src/lib.rs                 | 49 ++++++++++++++-
 lib/src/lib.rs                      | 93 ++++++++++++++++++++++++++++-
 lib/tests/lib.rs                    | 25 ++++++++
 7 files changed, 193 insertions(+), 7 deletions(-)
 create mode 100644 lib/fuzz/fuzz_targets/encoder.rs
diff --git a/lib/CHANGELOG.md b/lib/CHANGELOG.md
index e1a915f..263eb41 100644
--- a/lib/CHANGELOG.md
+++ b/lib/CHANGELOG.md
@@ -4,6 +4,7 @@
 
 ### Minor
 
+- Add `Encoder` and `Encoding::new_encoder()` for fragmented inputs (fixes #81)
 - Make some functions `must_use`
 - Bump MSRV from 1.47 to 1.48
 
diff --git a/lib/fuzz/Cargo.toml b/lib/fuzz/Cargo.toml
index 5e1edf0..077ecdc 100644
--- a/lib/fuzz/Cargo.toml
+++ b/lib/fuzz/Cargo.toml
@@ -17,3 +17,9 @@ name = "round_trip"
 path = "fuzz_targets/round_trip.rs"
 test = false
 doc = false
+
+[[bin]]
+name = "encoder"
+path = "fuzz_targets/encoder.rs"
+test = false
+doc = false
diff --git a/lib/fuzz/fuzz_targets/encoder.rs b/lib/fuzz/fuzz_targets/encoder.rs
new file mode 100644
index 0000000..e832680
--- /dev/null
+++ b/lib/fuzz/fuzz_targets/encoder.rs
@@ -0,0 +1,21 @@
+#![no_main]
+
+use data_encoding_fuzz::{generate_bytes, generate_encoding, generate_usize};
+use libfuzzer_sys::fuzz_target;
+
+fuzz_target!(|data: &[u8]| {
+    let mut data = data;
+    let encoding = generate_encoding(&mut data);
+    let mut output = String::new();
+    let mut input = Vec::new(); // concatenation of all chunks, used as the reference input
+    let mut encoder = encoding.new_encoder(&mut output);
+    while !data.is_empty() {
+        let len = generate_usize(&mut data, 0, 3 * 256 - 1); // requested chunk length
+        let chunk = generate_bytes(&mut data, len); // shorter than `len` if data runs out
+        input.extend_from_slice(chunk);
+        encoder.append(chunk);
+    }
+    encoder.finalize(); // flushes buffered bytes and ends the borrow of `output`
+    let expected = encoding.encode(&input);
+    assert_eq!(output, expected); // fragmented encoding must match one-shot encoding
+});
diff --git a/lib/fuzz/fuzz_targets/round_trip.rs b/lib/fuzz/fuzz_targets/round_trip.rs
index bbc8887..4c7e5b2 100644
--- a/lib/fuzz/fuzz_targets/round_trip.rs
+++ b/lib/fuzz/fuzz_targets/round_trip.rs
@@ -1,10 +1,7 @@
 #![no_main]
 
-#[macro_use]
-extern crate libfuzzer_sys;
-extern crate data_encoding_fuzz;
-
 use data_encoding_fuzz::{decode_prefix, generate_encoding};
+use libfuzzer_sys::fuzz_target;
 
 fuzz_target!(|data: &[u8]| {
     let mut data = data;
diff --git a/lib/fuzz/src/lib.rs b/lib/fuzz/src/lib.rs
index ef50f96..d196554 100644
--- a/lib/fuzz/src/lib.rs
+++ b/lib/fuzz/src/lib.rs
@@ -80,7 +80,29 @@ pub fn generate_specification(data: &mut &[u8]) -> Specification {
     spec
 }
 
-fn generate(data: &mut &[u8], min: u8, max: u8) -> u8 {
+pub fn generate_bytes<'a>(data: &'_ mut &'a [u8], len: usize) -> &'a [u8] {
+    let len = std::cmp::min(len, data.len()); // never take more than what is left
+    let res = &data[.. len];
+    *data = &data[len ..]; // consume the returned prefix from the caller's slice
+    res
+}
+
+pub fn generate_usize(data: &mut &[u8], min: usize, max: usize) -> usize {
+    let log = match (max - min).checked_ilog2() { // None when max == min
+        None => return min,
+        Some(x) => x,
+    };
+    let mut res = 0;
+    for _ in 0 .. log / 8 + 1 { // enough bytes to cover the bits of max - min
+        res = res << 8 | generate(data, 0, 255) as usize;
+    }
+    if usize::MIN < min || max < usize::MAX { // full range: max - min + 1 would overflow
+        res = min + res % (max - min + 1);
+    }
+    res
+}
+
+pub fn generate(data: &mut &[u8], min: u8, max: u8) -> u8 {
     if data.is_empty() {
         return min;
     }
@@ -106,3 +128,28 @@ pub fn decode_prefix(encoding: &Encoding, input: &mut &[u8]) -> Vec<u8> {
     }
     output
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn generate_usize_ok() {
+        #[track_caller]
+        fn test(mut data: &[u8], min: usize, max: usize, expected: usize) {
+            assert_eq!(generate_usize(&mut data, min, max), expected);
+            assert_eq!(data, &[]); // every provided byte must have been consumed
+        }
+        test(&[], 0, 0, 0); // empty range: returns min without reading data
+        test(&[], 0, 0xffff, 0);
+        test(&[0], 0, 0xffff, 0);
+        test(&[0x23], 0, 0xffff, 0x2300); // the missing second byte reads as zero
+        test(&[0x23, 0x58], 0, 0xffff, 0x2358);
+        test(&[0x23, 0x58], 0x10000, 0x1ffff, 0x12358); // result is offset by min
+        test(&[0], 0, 1, 0);
+        test(&[1], 0, 1, 1);
+        test(&[2], 0, 1, 0); // value is reduced modulo max - min + 1
+        test(&[128], 0, 255, 128);
+        test(&[1, 0], 0, 256, 256); // a 9-bit range needs two bytes
+    }
+}
diff --git a/lib/src/lib.rs b/lib/src/lib.rs
index 891dc88..b185ec3 100644
--- a/lib/src/lib.rs
+++ b/lib/src/lib.rs
@@ -77,8 +77,8 @@
 //! - They are deterministic: their output only depends on their input
 //! - They have no side-effects: they do not modify any hidden mutable state
 //! - They are correct: encoding followed by decoding gives the initial data
-//! - They are canonical (unless [`is_canonical`] returns false): decoding followed by encoding gives the
-//!   initial data
+//! - They are canonical (unless [`is_canonical`] returns false): decoding followed by encoding
+//!   gives the initial data
 //!
 //! This last property is usually not satisfied by base64 implementations. This is a matter of
 //! choice and this crate has made the choice to let the user choose. Support for canonical encoding
@@ -1315,6 +1315,14 @@ impl Encoding {
         self.encode_mut(input, &mut output[output_len ..]);
     }
 
+    /// Returns an [`Encoder`] that encodes fragmented input and appends the result to `output`
+    ///
+    /// See the documentation of [`Encoder`] for more details and examples.
+    #[cfg(feature = "alloc")]
+    pub fn new_encoder<'a>(&'a self, output: &'a mut String) -> Encoder<'a> {
+        Encoder::new(self, output)
+    }
+
     /// Returns encoded `input`
     ///
     /// # Examples
@@ -1538,6 +1546,87 @@ impl Encoding {
     }
 }
 
+/// Encodes fragmented input to an output
+///
+/// Using an [`Encoder`] with multiple calls to [`Encoder::append()`] is equivalent to first
+/// concatenating all the inputs and then calling [`Encoding::encode_append()`]. In particular,
+/// this type will not introduce padding or wrapping between inputs.
+///
+/// # Examples
+///
+/// ```rust
+/// // This is a bit inconvenient but we can't take a long-term reference to data_encoding::BASE64
+/// // because it's a constant. We need to use a static which has an address instead. This will be
+/// // fixed in version 3 of the library.
+/// static BASE64: data_encoding::Encoding = data_encoding::BASE64;
+/// let mut output = String::new();
+/// let mut encoder = BASE64.new_encoder(&mut output);
+/// encoder.append(b"hello");
+/// encoder.append(b"world");
+/// encoder.finalize();
+/// assert_eq!(output, BASE64.encode(b"helloworld"));
+/// ```
+#[derive(Debug)]
+#[cfg(feature = "alloc")]
+pub struct Encoder<'a> {
+    encoding: &'a Encoding,
+    output: &'a mut String,
+    buffer: [u8; 255], // input bytes received but not yet encoded
+    length: u8, // number of valid bytes at the start of `buffer`
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> Drop for Encoder<'a> {
+    fn drop(&mut self) { // encodes whatever is still buffered (possibly nothing)
+        self.encoding.encode_append(&self.buffer[.. self.length as usize], self.output);
+    }
+}
+
+#[cfg(feature = "alloc")]
+impl<'a> Encoder<'a> {
+    fn new(encoding: &'a Encoding, output: &'a mut String) -> Self {
+        Encoder { encoding, output, buffer: [0; 255], length: 0 } // nothing buffered yet
+    }
+
+    /// Encodes the provided input fragment and appends the result to the output
+    pub fn append(&mut self, mut input: &[u8]) {
+        let bit = self.encoding.bit();
+        #[allow(clippy::cast_possible_truncation)] // no truncation
+        let max = match self.encoding.wrap() { // buffer fill level that triggers encoding
+            Some((x, _)) => (x / dec(bit) * enc(bit)) as u8,
+            None => enc(bit) as u8,
+        };
+        if self.length != 0 { // top up the partially filled buffer first
+            let len = self.length;
+            #[allow(clippy::cast_possible_truncation)] // no truncation
+            let add = core::cmp::min((max - len) as usize, input.len()) as u8;
+            self.buffer[len as usize ..][.. add as usize].copy_from_slice(&input[.. add as usize]);
+            self.length += add;
+            input = &input[add as usize ..];
+            if self.length != max { // buffer still not full, so all of input was consumed
+                debug_assert!(self.length < max);
+                debug_assert!(input.is_empty());
+                return;
+            }
+            self.encoding.encode_append(&self.buffer[.. max as usize], self.output);
+            self.length = 0; // the full buffer has just been encoded
+        }
+        let len = floor(input.len(), max as usize); // NOTE(review): presumably a multiple of max
+        self.encoding.encode_append(&input[.. len], self.output);
+        input = &input[len ..];
+        #[allow(clippy::cast_possible_truncation)] // no truncation
+        let len = input.len() as u8;
+        self.buffer[.. len as usize].copy_from_slice(input); // stash the tail for a later call
+        self.length = len;
+    }
+
+    /// Makes sure all inputs have been encoded and appended to the output
+    ///
+    /// This is equivalent to dropping the encoder and required for correctness, otherwise some
+    /// encoded data may be missing at the end.
+    pub fn finalize(self) {} // consuming `self` drops it; `Drop` encodes the buffered bytes
+}
+
 #[derive(Debug, Copy, Clone)]
 #[cfg(feature = "alloc")]
 enum SpecificationErrorImpl {
diff --git a/lib/tests/lib.rs b/lib/tests/lib.rs
index 4c4a1b5..deb708c 100644
--- a/lib/tests/lib.rs
+++ b/lib/tests/lib.rs
@@ -670,3 +670,28 @@ fn encode_append() {
     test(b"fo", "", "Zm8=");
     test(b"fo", "ba", "baZm8=");
 }
+
+#[test]
+fn encoder() {
+    #[track_caller]
+    fn test(inputs: &[&[u8]], expected: &str) {
+        let mut output = String::new();
+        static BASE: Encoding = data_encoding::BASE64; // a static can be borrowed long-term
+        let mut encoder = BASE.new_encoder(&mut output);
+        for input in inputs {
+            encoder.append(input);
+        }
+        encoder.finalize(); // must happen before `output` is read
+        assert_eq!(output, expected);
+    }
+    test(&[], "");
+    test(&[b""], "");
+    test(&[b"", b""], "");
+    test(&[b"f", b""], "Zg==");
+    test(&[b"", b"f"], "Zg==");
+    test(&[b"f", b"o"], "Zm8="); // no padding is emitted between the two fragments
+    test(&[b"fo", b"o"], "Zm9v");
+    test(&[b"fo", b"ob"], "Zm9vYg==");
+    test(&[b"foob", b"a"], "Zm9vYmE=");
+    test(&[b"foob", b"ar"], "Zm9vYmFy");
+}