From 5ce9fe0e85d636d7245724a4d260a15955cdcc98 Mon Sep 17 00:00:00 2001
From: Kornel
Date: Thu, 31 Oct 2024 16:50:38 +0000
Subject: [PATCH] Avoid heap allocations in text encoding

---
 src/base/bytes.rs                         |  75 -----------
 src/html/mod.rs                           |  75 +++++++++++
 src/lib.rs                                |   5 +
 src/rewritable_units/document_end.rs      |   7 +-
 src/rewritable_units/mod.rs               |   2 +
 src/rewritable_units/mutations.rs         |  30 +----
 src/rewritable_units/text_encoder.rs      | 166 ++++++++++++++++++++++
 src/rewritable_units/tokens/attributes.rs |   3 +-
 src/rewritable_units/tokens/mod.rs        |  14 ++-
 9 files changed, 266 insertions(+), 111 deletions(-)
 create mode 100644 src/rewritable_units/text_encoder.rs

diff --git a/src/base/bytes.rs b/src/base/bytes.rs
index d5ea253f..b2591145 100644
--- a/src/base/bytes.rs
+++ b/src/base/bytes.rs
@@ -1,6 +1,5 @@
 use super::Range;
 use encoding_rs::{Encoding, WINDOWS_1252};
-use memchr::{memchr, memchr3};
 use std::borrow::Cow;
 use std::fmt::{self, Debug};
 use std::ops::Deref;
@@ -84,80 +83,6 @@ impl<'b> Bytes<'b> {
     }
 }
 
-macro_rules! impl_replace_byte {
-    ($self:tt, $output_handler:ident, $impls:ident) => {
-        let mut tail: &[u8] = $self;
-
-        loop {
-            match $impls!(@find tail) {
-                Some(pos) => {
-                    let replacement = $impls!(@get_replacement tail, pos);
-                    let chunk = &tail[..pos];
-
-                    if !chunk.is_empty() {
-                        $output_handler(chunk);
-                    }
-
-                    $output_handler(&replacement);
-                    tail = &tail[pos + 1..];
-                }
-                None => {
-                    if !tail.is_empty() {
-                        $output_handler(&tail);
-                    }
-                    break;
-                }
-            }
-        }
-    };
-}
-
-impl<'b> Bytes<'b> {
-    #[inline]
-    pub fn replace_byte(&self, (needle, repl): (u8, &[u8]), output_handler: &mut dyn FnMut(&[u8])) {
-        macro_rules! impls {
-            (@find $tail:ident) => {
-                memchr(needle, $tail)
-            };
-
-            (@get_replacement $tail:ident, $pos:ident) => {
-                repl
-            };
-        }
-
-        impl_replace_byte!(self, output_handler, impls);
-    }
-
-    #[inline]
-    pub fn replace_byte3(
-        &self,
-        (needle1, repl1): (u8, &[u8]),
-        (needle2, repl2): (u8, &[u8]),
-        (needle3, repl3): (u8, &[u8]),
-        output_handler: &mut dyn FnMut(&[u8]),
-    ) {
-        macro_rules! impls {
-            (@find $tail:ident) => {
-                memchr3(needle1, needle2, needle3, $tail)
-            };
-
-            (@get_replacement $tail:ident, $pos:ident) => {{
-                let matched = $tail[$pos];
-
-                if matched == needle1 {
-                    repl1
-                } else if matched == needle2 {
-                    repl2
-                } else {
-                    repl3
-                }
-            }};
-        }
-
-        impl_replace_byte!(self, output_handler, impls);
-    }
-}
-
 impl<'b> From<Cow<'b, [u8]>> for Bytes<'b> {
     #[inline]
     fn from(bytes: Cow<'b, [u8]>) -> Self {
diff --git a/src/html/mod.rs b/src/html/mod.rs
index 0257c9cb..ee0f0c0e 100644
--- a/src/html/mod.rs
+++ b/src/html/mod.rs
@@ -1,3 +1,6 @@
+use crate::base::Bytes;
+use memchr::{memchr, memchr3};
+
 #[macro_use]
 mod tag;
 
@@ -9,3 +12,75 @@ pub use self::local_name::{LocalName, LocalNameHash};
 pub use self::namespace::Namespace;
 pub use self::tag::*;
 pub use self::text_type::TextType;
+
+/// Convert text to HTML
+///
+/// Replaces `&`, `<` and `>` with `&amp;`, `&lt;` and `&gt;`. The result is passed to
+/// `output_handler` piece by piece: runs without special characters are sub-slices of
+/// `content`, and each entity is a separate call. The handler is never given an empty string.
+///
+/// Quotes are not escaped, so the output is not suitable for attribute values.
+#[inline]
+pub(crate) fn escape_body_text(mut content: &str, output_handler: &mut impl FnMut(&str)) {
+    loop {
+        if let Some(pos) = memchr3(b'&', b'<', b'>', content.as_bytes()) {
+            // The match is a single ASCII byte, so both split points are char boundaries
+            // and the checked splits are not expected to fail.
+            let Some((chunk_before, (matched, rest))) = content
+                .split_at_checked(pos)
+                .and_then(|(before, rest)| Some((before, rest.split_at_checked(1)?)))
+            else {
+                return;
+            };
+            content = rest;
+            let matched = matched.as_bytes()[0];
+
+            if !chunk_before.is_empty() {
+                (output_handler)(chunk_before);
+            }
+            (output_handler)(match matched {
+                b'<' => "&lt;",
+                b'>' => "&gt;",
+                _ => "&amp;",
+            });
+        } else {
+            if !content.is_empty() {
+                (output_handler)(content);
+            }
+            return;
+        }
+    }
+}
+
+/// Replace `"` with `&quot;` ONLY, leaving `&` unescaped
+///
+/// Works on raw bytes: every `"` (`0x22`) byte in `content` becomes `&quot;`, and the bytes
+/// in between are passed to `output_handler` unchanged. The handler is never given an
+/// empty slice.
+pub(crate) fn escape_double_quotes_only(
+    content: &Bytes<'_>,
+    output_handler: &mut dyn FnMut(&[u8]),
+) {
+    let mut content = &**content;
+    loop {
+        if let Some(pos) = memchr(b'"', content) {
+            let Some((chunk_before, rest)) = content
+                .split_at_checked(pos)
+                .and_then(|(before, rest)| Some((before, rest.get(1..)?)))
+            else {
+                return;
+            };
+            content = rest;
+
+            if !chunk_before.is_empty() {
+                (output_handler)(chunk_before);
+            }
+            (output_handler)(b"&quot;");
+        } else {
+            if !content.is_empty() {
+                (output_handler)(content);
+            }
+            return;
+        }
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index fa9ca074..398a829b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -155,6 +155,7 @@ pub mod test_utils {
 
     impl Output {
         #[must_use]
+        #[inline]
         pub fn new(encoding: &'static Encoding) -> Self {
             Self {
                 bytes: Vec::default(),
@@ -163,6 +164,8 @@ pub mod test_utils {
             }
         }
 
+        #[inline]
+        #[track_caller]
         pub fn push(&mut self, chunk: &[u8]) {
             if chunk.is_empty() {
                 self.finalizing_chunk_received = true;
@@ -178,6 +181,8 @@ pub mod test_utils {
     }
 
     impl From<Output> for String {
+        #[inline]
+        #[track_caller]
         fn from(output: Output) -> Self {
             assert!(
                 output.finalizing_chunk_received,
diff --git a/src/rewritable_units/document_end.rs b/src/rewritable_units/document_end.rs
index 70952739..7075a41c 100644
--- a/src/rewritable_units/document_end.rs
+++ b/src/rewritable_units/document_end.rs
@@ -1,4 +1,4 @@
-use super::mutations::content_to_bytes;
+use super::text_encoder::StreamingHandlerSink;
 use super::ContentType;
 use encoding_rs::Encoding;
 
@@ -50,9 +50,10 @@ impl<'a> DocumentEnd<'a> {
     /// ```
     #[inline]
     pub fn append(&mut self, content: &str, content_type: ContentType) {
-        content_to_bytes(content, content_type, self.encoding, &mut |c: &[u8]| {
+        StreamingHandlerSink::new(self.encoding, &mut |c| {
             self.output_sink.handle_chunk(c);
-        });
+        })
+        .write_str_chunk(content, content_type);
     }
 }
 
diff --git a/src/rewritable_units/mod.rs b/src/rewritable_units/mod.rs
index f1b411a3..6843ab69 100644
--- a/src/rewritable_units/mod.rs
+++ b/src/rewritable_units/mod.rs
@@ -4,6 +4,7 @@ pub use self::document_end::*;
 pub use self::element::*;
 pub use self::mutations::ContentType;
 pub(crate) use self::mutations::{Mutations, StringChunk};
+pub use self::text_encoder::StreamingHandlerSink;
 pub use self::tokens::*;
 
 /// Data that can be attached to a rewritable unit by a user and shared between content handler
@@ -84,6 +85,7 @@ mod mutations;
 
 mod document_end;
 mod element;
+mod text_encoder;
 mod tokens;
 
 #[cfg(test)]
diff --git a/src/rewritable_units/mutations.rs b/src/rewritable_units/mutations.rs
index 82cdec43..1d668ef7 100644
--- a/src/rewritable_units/mutations.rs
+++ b/src/rewritable_units/mutations.rs
@@ -1,4 +1,4 @@
-use crate::base::Bytes;
+use super::text_encoder::StreamingHandlerSink;
 use encoding_rs::Encoding;
 use std::error::Error as StdError;
 
@@ -16,26 +16,6 @@ pub enum ContentType {
     Text,
 }
 
-#[inline]
-pub(super) fn content_to_bytes(
-    content: &str,
-    content_type: ContentType,
-    encoding: &'static Encoding,
-    output_handler: &mut dyn FnMut(&[u8]),
-) {
-    let bytes = Bytes::from_str(content, encoding);
-
-    match content_type {
-        ContentType::Html => output_handler(&bytes),
-        ContentType::Text => bytes.replace_byte3(
-            (b'<', b"&lt;"),
-            (b'>', b"&gt;"),
-            (b'&', b"&amp;"),
-            &mut *output_handler,
-        ),
-    }
-}
-
 pub(crate) struct Mutations {
     pub content_before: DynamicString,
     pub replacement: DynamicString,
@@ -112,15 +92,11 @@ impl DynamicString {
         self.chunks.push(chunk);
     }
 
-    pub fn into_bytes(
-        self,
-        encoding: &'static Encoding,
-        output_handler: &mut dyn FnMut(&[u8]),
-    ) -> BoxResult {
+    pub fn encode(self, sink: &mut StreamingHandlerSink<'_>) -> BoxResult {
         for chunk in self.chunks {
             match chunk {
                 StringChunk::Buffer(content, content_type) => {
-                    content_to_bytes(&content, content_type, encoding, output_handler);
+                    sink.write_str_chunk(&content, content_type);
                 }
             };
         }
diff --git a/src/rewritable_units/text_encoder.rs b/src/rewritable_units/text_encoder.rs
new file mode 100644
index 00000000..234e9068
--- /dev/null
+++ b/src/rewritable_units/text_encoder.rs
@@ -0,0 +1,166 @@
+use super::ContentType;
+use crate::html::escape_body_text;
+use encoding_rs::{CoderResult, Encoder, Encoding, UTF_8};
+
+/// Used to write chunks of text or markup in streaming mutation handlers.
+///
+/// Argument to [`StreamingHandler::write_all`]
+pub struct StreamingHandlerSink<'output_handler> {
+    /// `None` when the output encoding is UTF-8, in which case text is passed through as-is
+    non_utf8_encoder: Option<TextEncoder>,
+
+    /// ```compile_fail
+    /// use lol_html::html_content::StreamingHandlerSink;
+    /// struct IsSend<T: Send>(T);
+    /// let x: IsSend<StreamingHandlerSink<'static>>;
+    /// ```
+    ///
+    /// ```compile_fail
+    /// use lol_html::html_content::StreamingHandlerSink;
+    /// struct IsSync<T: Sync>(T);
+    /// let x: IsSync<StreamingHandlerSink<'static>>;
+    /// ```
+    output_handler: &'output_handler mut dyn FnMut(&[u8]),
+}
+
+impl<'output_handler> StreamingHandlerSink<'output_handler> {
+    /// Creates a sink that passes bytes in the given `encoding` to `output_handler`.
+    #[inline(always)]
+    pub(crate) fn new(
+        encoding: &'static Encoding,
+        output_handler: &'output_handler mut dyn FnMut(&[u8]),
+    ) -> Self {
+        Self {
+            non_utf8_encoder: (encoding != UTF_8).then(|| TextEncoder::new(encoding)),
+            output_handler,
+        }
+    }
+
+    /// Writes the given UTF-8 string to the output, converting the encoding and [escaping](ContentType) if necessary.
+    ///
+    /// It may be called multiple times. The strings will be concatenated together.
+    #[inline]
+    pub fn write_str_chunk(&mut self, content: &str, content_type: ContentType) {
+        match content_type {
+            ContentType::Html => self.write_html(content),
+            ContentType::Text => self.write_body_text(content),
+        }
+    }
+
+    /// Writes markup without escaping it. An empty string produces no output.
+    pub(crate) fn write_html(&mut self, html: &str) {
+        if let Some(encoder) = &mut self.non_utf8_encoder {
+            encoder.encode(html, self.output_handler);
+        } else if !html.is_empty() {
+            (self.output_handler)(html.as_bytes());
+        }
+    }
+
+    /// For text content, not attributes
+    pub(crate) fn write_body_text(&mut self, plaintext: &str) {
+        if let Some(encoder) = &mut self.non_utf8_encoder {
+            escape_body_text(plaintext, &mut |chunk| {
+                debug_assert!(!chunk.is_empty());
+                encoder.encode(chunk, self.output_handler);
+            });
+        } else {
+            escape_body_text(plaintext, &mut |chunk| {
+                debug_assert!(!chunk.is_empty());
+                (self.output_handler)(chunk.as_bytes());
+            });
+        }
+    }
+
+    /// Gives direct access to the byte handler, bypassing encoding conversion and escaping.
+    pub(crate) fn output_handler(&mut self) -> &mut dyn FnMut(&[u8]) {
+        &mut self.output_handler
+    }
+}
+
+/// Scratch space for the encoder's output.
+///
+/// Starts on the stack; [`TextEncoder::encode`] replaces it with a heap allocation when the
+/// input is very long or the stack buffer turns out to be too small.
+enum Buffer {
+    Heap(Vec<u8>),
+    Stack([u8; 63]), // leave a byte for the tag
+}
+
+/// Converts UTF-8 text to a non-UTF-8, ASCII-compatible encoding.
+struct TextEncoder {
+    encoder: Encoder,
+    buffer: Buffer,
+}
+
+impl TextEncoder {
+    #[inline]
+    pub fn new(encoding: &'static Encoding) -> Self {
+        debug_assert!(encoding != UTF_8);
+        debug_assert!(encoding.is_ascii_compatible());
+        Self {
+            encoder: encoding.new_encoder(),
+            buffer: Buffer::Stack([0; 63]),
+        }
+    }
+
+    /// Converts `content` and passes the resulting bytes to `output_handler`, possibly in
+    /// several calls. The handler is never given an empty slice.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the encoder reports a full output buffer without having written anything,
+    /// even though the buffer is at least 1024 bytes long.
+    #[inline(never)]
+    fn encode(&mut self, mut content: &str, output_handler: &mut dyn FnMut(&[u8])) {
+        loop {
+            debug_assert!(!self.encoder.has_pending_state()); // ASCII-compatible encodings are not supposed to have it
+            // ASCII needs no conversion: forward the ASCII-only prefix untouched
+            let ascii_len = Encoding::ascii_valid_up_to(content.as_bytes());
+            if let Some((ascii, remainder)) = content.split_at_checked(ascii_len) {
+                if !ascii.is_empty() {
+                    (output_handler)(ascii.as_bytes());
+                }
+                if remainder.is_empty() {
+                    return;
+                }
+                content = remainder;
+            }
+
+            let buffer = match &mut self.buffer {
+                Buffer::Heap(buf) => buf.as_mut_slice(),
+                // Long non-ASCII content could take lots of roundtrips through the encoder
+                buf if content.len() >= 1 << 20 => {
+                    *buf = Buffer::Heap(vec![0; 4096]);
+                    match buf {
+                        Buffer::Heap(buf) => buf.as_mut(),
+                        _ => unreachable!(),
+                    }
+                }
+                Buffer::Stack(buf) => buf.as_mut_slice(),
+            };
+
+            let (result, read, written, _) = self.encoder.encode_from_utf8(content, buffer, false);
+            if written > 0 && written <= buffer.len() {
+                (output_handler)(&buffer[..written]);
+            }
+            if read >= content.len() {
+                return;
+            }
+            // Carry on with the part of the input that has not been consumed yet
+            content = &content[read..];
+            match result {
+                CoderResult::InputEmpty => return,
+                CoderResult::OutputFull => {
+                    match &mut self.buffer {
+                        // The buffer has been flushed above and is big enough to be reused,
+                        // so a full buffer is only an error if nothing fit into it
+                        Buffer::Heap(buf) if buf.len() >= 1024 => {
+                            assert!(written > 0, "encoding_rs infinite loop"); // encoding_rs only needs a dozen bytes
+                        }
+                        buf => *buf = Buffer::Heap(vec![0; 1024]),
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/src/rewritable_units/tokens/attributes.rs b/src/rewritable_units/tokens/attributes.rs
index 61b31fdf..d358787b 100644
--- a/src/rewritable_units/tokens/attributes.rs
+++ b/src/rewritable_units/tokens/attributes.rs
@@ -1,5 +1,6 @@
 use crate::base::Bytes;
 use crate::errors::RewritingError;
+use crate::html::escape_double_quotes_only;
 use crate::parser::AttributeBuffer;
 use crate::rewritable_units::Serialize;
 use encoding_rs::Encoding;
@@ -128,7 +129,7 @@ impl Serialize for &Attribute<'_> {
             None => {
                 output_handler(&self.name);
                 output_handler(b"=\"");
-                self.value.replace_byte((b'"', b"&quot;"), output_handler);
+                escape_double_quotes_only(&self.value, output_handler);
                 output_handler(b"\"");
             }
         }
diff --git a/src/rewritable_units/tokens/mod.rs b/src/rewritable_units/tokens/mod.rs
index 91956cf9..e2b04f35 100644
--- a/src/rewritable_units/tokens/mod.rs
+++ b/src/rewritable_units/tokens/mod.rs
@@ -20,26 +20,30 @@ macro_rules! impl_serialize {
                 mut self,
                 output_handler: &mut dyn FnMut(&[u8]),
             ) -> Result<(), crate::errors::RewritingError> {
+                let mut encoder = crate::rewritable_units::text_encoder::StreamingHandlerSink::new(
+                    self.mutations.encoding,
+                    output_handler,
+                );
                 let content_before = ::std::mem::take(&mut self.mutations.content_before);
                 content_before
-                    .into_bytes(self.mutations.encoding, output_handler)
+                    .encode(&mut encoder)
                     .map_err(crate::errors::RewritingError::ContentHandlerError)?;
 
                 if !self.mutations.removed {
                     match self.raw() {
-                        Some(raw) => output_handler(raw),
-                        None => self.serialize_from_parts(output_handler)?,
+                        Some(raw) => (encoder.output_handler())(&raw),
+                        None => self.serialize_from_parts(encoder.output_handler())?,
                     }
                 } else {
                     self.mutations
                         .replacement
-                        .into_bytes(self.mutations.encoding, output_handler)
+                        .encode(&mut encoder)
                         .map_err(crate::errors::RewritingError::ContentHandlerError)?;
                 }
 
                 self.mutations
                     .content_after
-                    .into_bytes(self.mutations.encoding, output_handler)
+                    .encode(&mut encoder)
                     .map_err(crate::errors::RewritingError::ContentHandlerError)
             }
         }