Skip to content

Commit

Permalink
Avoid heap allocations in text encoding
Browse files Browse the repository at this point in the history
  • Loading branch information
kornelski committed Nov 1, 2024
1 parent b6d7b40 commit 5ce9fe0
Show file tree
Hide file tree
Showing 9 changed files with 234 additions and 111 deletions.
75 changes: 0 additions & 75 deletions src/base/bytes.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use super::Range;
use encoding_rs::{Encoding, WINDOWS_1252};
use memchr::{memchr, memchr3};
use std::borrow::Cow;
use std::fmt::{self, Debug};
use std::ops::Deref;
Expand Down Expand Up @@ -84,80 +83,6 @@ impl<'b> Bytes<'b> {
}
}

// Shared body for the byte-replacement methods below.
//
// Arguments:
// * `$self`           - expression assigned to a `&[u8]` binding; the bytes to scan.
// * `$output_handler` - callable invoked with each output chunk, in order.
// * `$impls`          - name of a macro providing two rules:
//     `@find tail`                 -> `Option` holding the position of the next match in `tail`
//     `@get_replacement tail, pos` -> the replacement for the byte at `tail[pos]`
//
// Nothing is buffered: unchanged runs and replacements are handed to the
// handler as separate chunks.
macro_rules! impl_replace_byte {
($self:tt, $output_handler:ident, $impls:ident) => {
// The part of the input that has not been emitted yet.
let mut tail: &[u8] = $self;

loop {
match $impls!(@find tail) {
Some(pos) => {
let replacement = $impls!(@get_replacement tail, pos);
// Unchanged bytes preceding the match.
let chunk = &tail[..pos];

// Skip the handler call when the match is at the very start of `tail`.
if !chunk.is_empty() {
$output_handler(chunk);
}

$output_handler(&replacement);
// Step over the single matched byte.
tail = &tail[pos + 1..];
}
None => {
// No more matches: flush whatever is left and stop.
if !tail.is_empty() {
$output_handler(&tail);
}
break;
}
}
}
};
}

impl<'b> Bytes<'b> {
    /// Streams the bytes to `output_handler`, emitting `repl` in place of every
    /// occurrence of the `needle` byte.
    ///
    /// Unchanged runs and replacements are delivered as separate chunks, in order;
    /// empty unchanged runs are not reported.
    #[inline]
    pub fn replace_byte(
        &self,
        (needle, repl): (u8, &[u8]),
        output_handler: &mut dyn FnMut(&[u8]),
    ) {
        // Search/replacement strategy plugged into `impl_replace_byte!`.
        macro_rules! single_needle {
            (@find $haystack:ident) => {
                memchr(needle, $haystack)
            };

            (@get_replacement $haystack:ident, $at:ident) => {
                repl
            };
        }

        impl_replace_byte!(self, output_handler, single_needle);
    }

    /// Same as [`replace_byte`](Self::replace_byte), but with three
    /// needle/replacement pairs handled in a single pass.
    ///
    /// Each matched byte is replaced with the replacement paired with it.
    #[inline]
    pub fn replace_byte3(
        &self,
        (needle1, repl1): (u8, &[u8]),
        (needle2, repl2): (u8, &[u8]),
        (needle3, repl3): (u8, &[u8]),
        output_handler: &mut dyn FnMut(&[u8]),
    ) {
        // Search/replacement strategy plugged into `impl_replace_byte!`.
        macro_rules! triple_needle {
            (@find $haystack:ident) => {
                memchr3(needle1, needle2, needle3, $haystack)
            };

            (@get_replacement $haystack:ident, $at:ident) => {
                // The search only stops on one of the three needles, so anything
                // that is neither the first nor the second is treated as the third.
                match $haystack[$at] {
                    hit if hit == needle1 => repl1,
                    hit if hit == needle2 => repl2,
                    _ => repl3,
                }
            };
        }

        impl_replace_byte!(self, output_handler, triple_needle);
    }
}

impl<'b> From<Cow<'b, [u8]>> for Bytes<'b> {
#[inline]
fn from(bytes: Cow<'b, [u8]>) -> Self {
Expand Down
63 changes: 63 additions & 0 deletions src/html/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
use crate::base::Bytes;
use memchr::{memchr, memchr3};

#[macro_use]
mod tag;

Expand All @@ -9,3 +12,63 @@ pub use self::local_name::{LocalName, LocalNameHash};
pub use self::namespace::Namespace;
pub use self::tag::*;
pub use self::text_type::TextType;

/// Escapes text for use as HTML body content.
///
/// `&`, `<` and `>` are replaced with `&amp;`, `&lt;` and `&gt;`. The result is
/// streamed to `output_handler` piece by piece: each non-empty run of unchanged
/// text and each entity is a separate call, in input order.
#[inline]
pub(crate) fn escape_body_text(mut content: &str, output_handler: &mut impl FnMut(&str)) {
    while let Some(pos) = memchr3(b'&', b'<', b'>', content.as_bytes()) {
        // All three needles are ASCII, so `pos` and `pos + 1` are char boundaries
        // and neither split is expected to fail.
        let Some((plain, from_special)) = content.split_at_checked(pos) else {
            return;
        };
        let Some((special, tail)) = from_special.split_at_checked(1) else {
            return;
        };
        content = tail;

        if !plain.is_empty() {
            output_handler(plain);
        }

        let entity = match special.as_bytes()[0] {
            b'<' => "&lt;",
            b'>' => "&gt;",
            _ => "&amp;",
        };
        output_handler(entity);
    }

    // Whatever follows the last special character (or the whole input if none).
    if !content.is_empty() {
        output_handler(content);
    }
}

/// Replaces every `"` byte with `&quot;` and nothing else (`&` is left as is).
///
/// Output is streamed to `output_handler`: each non-empty run of unchanged bytes
/// and each `&quot;` is a separate call, in input order.
pub(crate) fn escape_double_quotes_only(
    content: &Bytes<'_>,
    output_handler: &mut dyn FnMut(&[u8]),
) {
    let mut remaining = &**content;

    while let Some(quote_at) = memchr(b'"', remaining) {
        // `quote_at` comes from searching `remaining`, so neither lookup is
        // expected to fail.
        let Some((prefix, from_quote)) = remaining.split_at_checked(quote_at) else {
            return;
        };
        let Some(after_quote) = from_quote.get(1..) else {
            return;
        };
        remaining = after_quote;

        if !prefix.is_empty() {
            output_handler(prefix);
        }
        output_handler(b"&quot;");
    }

    // Bytes after the last quote (or the whole input if there was none).
    if !remaining.is_empty() {
        output_handler(remaining);
    }
}
5 changes: 5 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ pub mod test_utils {

impl Output {
#[must_use]
#[inline]
pub fn new(encoding: &'static Encoding) -> Self {
Self {
bytes: Vec::default(),
Expand All @@ -163,6 +164,8 @@ pub mod test_utils {
}
}

#[inline]
#[track_caller]
pub fn push(&mut self, chunk: &[u8]) {
if chunk.is_empty() {
self.finalizing_chunk_received = true;
Expand All @@ -178,6 +181,8 @@ pub mod test_utils {
}

impl From<Output> for String {
#[inline]
#[track_caller]
fn from(output: Output) -> Self {
assert!(
output.finalizing_chunk_received,
Expand Down
7 changes: 4 additions & 3 deletions src/rewritable_units/document_end.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use super::mutations::content_to_bytes;
use super::text_encoder::StreamingHandlerSink;
use super::ContentType;
use encoding_rs::Encoding;

Expand Down Expand Up @@ -50,9 +50,10 @@ impl<'a> DocumentEnd<'a> {
/// ```
#[inline]
pub fn append(&mut self, content: &str, content_type: ContentType) {
content_to_bytes(content, content_type, self.encoding, &mut |c: &[u8]| {
StreamingHandlerSink::new(self.encoding, &mut |c| {
self.output_sink.handle_chunk(c);
});
})
.write_str_chunk(content, content_type);
}
}

Expand Down
2 changes: 2 additions & 0 deletions src/rewritable_units/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ pub use self::document_end::*;
pub use self::element::*;
pub use self::mutations::ContentType;
pub(crate) use self::mutations::{Mutations, StringChunk};
pub use self::text_encoder::StreamingHandlerSink;
pub use self::tokens::*;

/// Data that can be attached to a rewritable unit by a user and shared between content handler
Expand Down Expand Up @@ -84,6 +85,7 @@ mod mutations;

mod document_end;
mod element;
mod text_encoder;
mod tokens;

#[cfg(test)]
Expand Down
30 changes: 3 additions & 27 deletions src/rewritable_units/mutations.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::base::Bytes;
use super::text_encoder::StreamingHandlerSink;
use encoding_rs::Encoding;
use std::error::Error as StdError;

Expand All @@ -16,26 +16,6 @@ pub enum ContentType {
Text,
}

/// Converts `content` to bytes in `encoding` and passes the result to `output_handler`.
///
/// `ContentType::Html` content is forwarded unchanged in a single call.
/// `ContentType::Text` content has `<`, `>` and `&` replaced with `&lt;`, `&gt;`
/// and `&amp;` on the way out.
#[inline]
pub(super) fn content_to_bytes(
    content: &str,
    content_type: ContentType,
    encoding: &'static Encoding,
    output_handler: &mut dyn FnMut(&[u8]),
) {
    let encoded = Bytes::from_str(content, encoding);

    match content_type {
        ContentType::Text => {
            // The replacement runs on the already-encoded bytes.
            encoded.replace_byte3(
                (b'<', b"&lt;"),
                (b'>', b"&gt;"),
                (b'&', b"&amp;"),
                output_handler,
            );
        }
        ContentType::Html => output_handler(&encoded),
    }
}

pub(crate) struct Mutations {
pub content_before: DynamicString,
pub replacement: DynamicString,
Expand Down Expand Up @@ -112,15 +92,11 @@ impl DynamicString {
self.chunks.push(chunk);
}

pub fn into_bytes(
self,
encoding: &'static Encoding,
output_handler: &mut dyn FnMut(&[u8]),
) -> BoxResult {
pub fn encode(self, sink: &mut StreamingHandlerSink<'_>) -> BoxResult {
for chunk in self.chunks {
match chunk {
StringChunk::Buffer(content, content_type) => {
content_to_bytes(&content, content_type, encoding, output_handler);
sink.write_str_chunk(&content, content_type);
}
};
}
Expand Down
Loading

0 comments on commit 5ce9fe0

Please sign in to comment.