From 15c27df21b84247dee69633b2255fc0fbdc2c949 Mon Sep 17 00:00:00 2001 From: Julien Cretin Date: Mon, 13 Nov 2023 22:41:49 +0100 Subject: [PATCH] Add Encoder for concatenated inputs (#89) Fixes #81 --- lib/CHANGELOG.md | 1 + lib/fuzz/Cargo.toml | 6 ++ lib/fuzz/fuzz_targets/encoder.rs | 21 +++++++ lib/fuzz/fuzz_targets/round_trip.rs | 5 +- lib/fuzz/src/lib.rs | 49 ++++++++++++++- lib/src/lib.rs | 93 ++++++++++++++++++++++++++++- lib/tests/lib.rs | 25 ++++++++ 7 files changed, 193 insertions(+), 7 deletions(-) create mode 100644 lib/fuzz/fuzz_targets/encoder.rs diff --git a/lib/CHANGELOG.md b/lib/CHANGELOG.md index e1a915f..263eb41 100644 --- a/lib/CHANGELOG.md +++ b/lib/CHANGELOG.md @@ -4,6 +4,7 @@ ### Minor +- Add `Encoder` and `Encoding::new_encoder()` for fragmented inputs (fixes #81) - Make some functions `must_use` - Bump MSRV from 1.47 to 1.48 diff --git a/lib/fuzz/Cargo.toml b/lib/fuzz/Cargo.toml index 5e1edf0..077ecdc 100644 --- a/lib/fuzz/Cargo.toml +++ b/lib/fuzz/Cargo.toml @@ -17,3 +17,9 @@ name = "round_trip" path = "fuzz_targets/round_trip.rs" test = false doc = false + +[[bin]] +name = "encoder" +path = "fuzz_targets/encoder.rs" +test = false +doc = false diff --git a/lib/fuzz/fuzz_targets/encoder.rs b/lib/fuzz/fuzz_targets/encoder.rs new file mode 100644 index 0000000..e832680 --- /dev/null +++ b/lib/fuzz/fuzz_targets/encoder.rs @@ -0,0 +1,21 @@ +#![no_main] + +use data_encoding_fuzz::{generate_bytes, generate_encoding, generate_usize}; +use libfuzzer_sys::fuzz_target; + +fuzz_target!(|data: &[u8]| { + let mut data = data; + let encoding = generate_encoding(&mut data); + let mut output = String::new(); + let mut input = Vec::new(); + let mut encoder = encoding.new_encoder(&mut output); + while !data.is_empty() { + let len = generate_usize(&mut data, 0, 3 * 256 - 1); + let chunk = generate_bytes(&mut data, len); + input.extend_from_slice(chunk); + encoder.append(chunk); + } + encoder.finalize(); + let expected = encoding.encode(&input); + 
assert_eq!(output, expected); +}); diff --git a/lib/fuzz/fuzz_targets/round_trip.rs b/lib/fuzz/fuzz_targets/round_trip.rs index bbc8887..4c7e5b2 100644 --- a/lib/fuzz/fuzz_targets/round_trip.rs +++ b/lib/fuzz/fuzz_targets/round_trip.rs @@ -1,10 +1,7 @@ #![no_main] -#[macro_use] -extern crate libfuzzer_sys; -extern crate data_encoding_fuzz; - use data_encoding_fuzz::{decode_prefix, generate_encoding}; +use libfuzzer_sys::fuzz_target; fuzz_target!(|data: &[u8]| { let mut data = data; diff --git a/lib/fuzz/src/lib.rs b/lib/fuzz/src/lib.rs index ef50f96..d196554 100644 --- a/lib/fuzz/src/lib.rs +++ b/lib/fuzz/src/lib.rs @@ -80,7 +80,29 @@ pub fn generate_specification(data: &mut &[u8]) -> Specification { spec } -fn generate(data: &mut &[u8], min: u8, max: u8) -> u8 { +pub fn generate_bytes<'a>(data: &'_ mut &'a [u8], len: usize) -> &'a [u8] { + let len = std::cmp::min(len, data.len()); + let res = &data[.. len]; + *data = &data[len ..]; + res +} + +pub fn generate_usize(data: &mut &[u8], min: usize, max: usize) -> usize { + let log = match (max - min).checked_ilog2() { + None => return min, + Some(x) => x, + }; + let mut res = 0; + for _ in 0 .. 
log / 8 + 1 { + res = res << 8 | generate(data, 0, 255) as usize; + } + if usize::MIN < min || max < usize::MAX { + res = min + res % (max - min + 1); + } + res +} + +pub fn generate(data: &mut &[u8], min: u8, max: u8) -> u8 { if data.is_empty() { return min; } @@ -106,3 +128,28 @@ pub fn decode_prefix(encoding: &Encoding, input: &mut &[u8]) -> Vec { } output } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn generate_usize_ok() { + #[track_caller] + fn test(mut data: &[u8], min: usize, max: usize, expected: usize) { + assert_eq!(generate_usize(&mut data, min, max), expected); + assert_eq!(data, &[]); + } + test(&[], 0, 0, 0); + test(&[], 0, 0xffff, 0); + test(&[0], 0, 0xffff, 0); + test(&[0x23], 0, 0xffff, 0x2300); + test(&[0x23, 0x58], 0, 0xffff, 0x2358); + test(&[0x23, 0x58], 0x10000, 0x1ffff, 0x12358); + test(&[0], 0, 1, 0); + test(&[1], 0, 1, 1); + test(&[2], 0, 1, 0); + test(&[128], 0, 255, 128); + test(&[1, 0], 0, 256, 256); + } +} diff --git a/lib/src/lib.rs b/lib/src/lib.rs index 891dc88..b185ec3 100644 --- a/lib/src/lib.rs +++ b/lib/src/lib.rs @@ -77,8 +77,8 @@ //! - They are deterministic: their output only depends on their input //! - They have no side-effects: they do not modify any hidden mutable state //! - They are correct: encoding followed by decoding gives the initial data -//! - They are canonical (unless [`is_canonical`] returns false): decoding followed by encoding gives the -//! initial data +//! - They are canonical (unless [`is_canonical`] returns false): decoding followed by encoding +//! gives the initial data //! //! This last property is usually not satisfied by base64 implementations. This is a matter of //! choice and this crate has made the choice to let the user choose. 
Support for canonical encoding @@ -1315,6 +1315,14 @@ impl Encoding { self.encode_mut(input, &mut output[output_len ..]); } + /// Returns an object to encode a fragmented input and append it to `output` + /// + /// See the documentation of [`Encoder`] for more details and examples. + #[cfg(feature = "alloc")] + pub fn new_encoder<'a>(&'a self, output: &'a mut String) -> Encoder<'a> { + Encoder::new(self, output) + } + /// Returns encoded `input` /// /// # Examples @@ -1538,6 +1546,87 @@ impl Encoding { } } +/// Encodes fragmented input to an output +/// +/// Using an [`Encoder`] with multiple calls to [`Encoder::append()`] is equivalent to first +/// concatenating all the input and then using [`Encoding::encode_append()`]. In particular, the +/// encoder will not introduce padding or wrapping between inputs. +/// +/// # Examples +/// +/// ```rust +/// // This is a bit inconvenient but we can't take a long-term reference to data_encoding::BASE64 +/// // because it's a constant. We need to use a static which has an address instead. This will be +/// // fixed in version 3 of the library. +/// static BASE64: data_encoding::Encoding = data_encoding::BASE64; +/// let mut output = String::new(); +/// let mut encoder = BASE64.new_encoder(&mut output); +/// encoder.append(b"hello"); +/// encoder.append(b"world"); +/// encoder.finalize(); +/// assert_eq!(output, BASE64.encode(b"helloworld")); +/// ``` +#[derive(Debug)] +#[cfg(feature = "alloc")] +pub struct Encoder<'a> { + encoding: &'a Encoding, + output: &'a mut String, + buffer: [u8; 255], + length: u8, +} + +#[cfg(feature = "alloc")] +impl<'a> Drop for Encoder<'a> { + fn drop(&mut self) { + self.encoding.encode_append(&self.buffer[.. 
self.length as usize], self.output); + } +} + +#[cfg(feature = "alloc")] +impl<'a> Encoder<'a> { + fn new(encoding: &'a Encoding, output: &'a mut String) -> Self { + Encoder { encoding, output, buffer: [0; 255], length: 0 } + } + + /// Encodes the provided input fragment and appends the result to the output + pub fn append(&mut self, mut input: &[u8]) { + let bit = self.encoding.bit(); + #[allow(clippy::cast_possible_truncation)] // no truncation + let max = match self.encoding.wrap() { + Some((x, _)) => (x / dec(bit) * enc(bit)) as u8, + None => enc(bit) as u8, + }; + if self.length != 0 { + let len = self.length; + #[allow(clippy::cast_possible_truncation)] // no truncation + let add = core::cmp::min((max - len) as usize, input.len()) as u8; + self.buffer[len as usize ..][.. add as usize].copy_from_slice(&input[.. add as usize]); + self.length += add; + input = &input[add as usize ..]; + if self.length != max { + debug_assert!(self.length < max); + debug_assert!(input.is_empty()); + return; + } + self.encoding.encode_append(&self.buffer[.. max as usize], self.output); + self.length = 0; + } + let len = floor(input.len(), max as usize); + self.encoding.encode_append(&input[.. len], self.output); + input = &input[len ..]; + #[allow(clippy::cast_possible_truncation)] // no truncation + let len = input.len() as u8; + self.buffer[.. len as usize].copy_from_slice(input); + self.length = len; + } + + /// Makes sure all inputs have been encoded and appended to the output + /// + /// This is equivalent to dropping the encoder and required for correctness, otherwise some + /// encoded data may be missing at the end. 
+ pub fn finalize(self) {} +} + #[derive(Debug, Copy, Clone)] #[cfg(feature = "alloc")] enum SpecificationErrorImpl { diff --git a/lib/tests/lib.rs b/lib/tests/lib.rs index 4c4a1b5..deb708c 100644 --- a/lib/tests/lib.rs +++ b/lib/tests/lib.rs @@ -670,3 +670,28 @@ fn encode_append() { test(b"fo", "", "Zm8="); test(b"fo", "ba", "baZm8="); } + +#[test] +fn encoder() { + #[track_caller] + fn test(inputs: &[&[u8]], expected: &str) { + let mut output = String::new(); + static BASE: Encoding = data_encoding::BASE64; + let mut encoder = BASE.new_encoder(&mut output); + for input in inputs { + encoder.append(input); + } + encoder.finalize(); + assert_eq!(output, expected); + } + test(&[], ""); + test(&[b""], ""); + test(&[b"", b""], ""); + test(&[b"f", b""], "Zg=="); + test(&[b"", b"f"], "Zg=="); + test(&[b"f", b"o"], "Zm8="); + test(&[b"fo", b"o"], "Zm9v"); + test(&[b"fo", b"ob"], "Zm9vYg=="); + test(&[b"foob", b"a"], "Zm9vYmE="); + test(&[b"foob", b"ar"], "Zm9vYmFy"); +}