From df1d4ac6fb4764e2f9f2792c909290d1e44c36da Mon Sep 17 00:00:00 2001 From: Diogo Sousa Date: Tue, 6 Aug 2024 17:50:03 +0100 Subject: [PATCH] Send-parameterizable rewriter. --- Cargo.toml | 6 + README.md | 2 +- benches/cases/parsing.rs | 6 +- benches/cases/rewriting.rs | 4 +- benches/cases/selector_matching.rs | 10 +- c-api/src/rewriter_builder.rs | 18 +- examples/defer_scripts/main.rs | 2 +- examples/mixed_content_rewriter/main.rs | 2 +- fuzz/test_case/src/lib.rs | 8 +- src/base/encoding.rs | 276 ++++++++- src/lib.rs | 43 +- src/memory/limiter.rs | 19 +- src/parser/tree_builder_simulator/mod.rs | 6 +- src/rewritable_units/document_end.rs | 2 +- src/rewritable_units/element.rs | 62 +- src/rewritable_units/mod.rs | 10 +- src/rewritable_units/tokens/comment.rs | 6 +- src/rewritable_units/tokens/doctype.rs | 2 +- src/rewritable_units/tokens/text_chunk.rs | 10 +- src/rewriter/handlers_dispatcher.rs | 36 +- src/rewriter/mod.rs | 141 +++-- src/rewriter/rewrite_controller.rs | 14 +- src/rewriter/settings.rs | 538 +++++++++++++++--- src/selectors_vm/compiler.rs | 8 +- src/selectors_vm/mod.rs | 16 +- src/transform_stream/dispatcher.rs | 5 +- src/transform_stream/mod.rs | 7 +- tests/fixtures/element_content_replacement.rs | 2 +- tests/fixtures/selector_matching.rs | 2 +- .../harness/suites/html5lib_tests/decoder.rs | 6 +- 30 files changed, 993 insertions(+), 276 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9944c11e..42b3074c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,11 @@ include = [ autotests = false edition = "2021" +[lib] +# Disable libtest to make sure criterion can parse the command line flags. +# See https://bheisler.github.io/criterion.rs/book/faq.html and https://github.com/rust-lang/rust/issues/47241. 
+bench = false + [features] debug_trace = [] integration_test = [] @@ -55,6 +60,7 @@ hashbrown = { version = "0.13.1", features = ["serde"] } serde = "1.0.126" serde_derive = "1.0.19" serde_json = "1.0.65" +static_assertions = "1.1.0" rand = "0.8.5" rustc-test = "0.3.1" itertools = "0.10.1" diff --git a/README.md b/README.md index e1350db1..3d064b19 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ fn main() -> Result<(), Box> { Ok(()) }) ], - ..Settings::default() + ..Settings::new() }, |c: &[u8]| output.extend_from_slice(c) ); diff --git a/benches/cases/parsing.rs b/benches/cases/parsing.rs index d89fa19e..a6229a09 100644 --- a/benches/cases/parsing.rs +++ b/benches/cases/parsing.rs @@ -3,7 +3,7 @@ use lol_html::*; define_group!( "Parsing", [ - ("Tag scanner", Settings::default()), + ("Tag scanner", Settings::new()), ( "Lexer", // NOTE: this switches parser to the lexer mode and doesn't @@ -11,7 +11,7 @@ define_group!( // we can get relatively fair comparison. Settings { document_content_handlers: vec![doctype!(noop_handler!())], - ..Settings::default() + ..Settings::new() } ), ( @@ -23,7 +23,7 @@ define_group!( // incoming chunks to produce correct text chunk rewritable units. 
Settings { document_content_handlers: vec![doc_text!(noop_handler!())], - ..Settings::default() + ..Settings::new() } ) ] diff --git a/benches/cases/rewriting.rs b/benches/cases/rewriting.rs index 7e1c1a50..3010f0b8 100644 --- a/benches/cases/rewriting.rs +++ b/benches/cases/rewriting.rs @@ -13,7 +13,7 @@ define_group!( Ok(()) })], - ..Settings::default() + ..Settings::new() } ), ( @@ -24,7 +24,7 @@ define_group!( Ok(()) })], - ..Settings::default() + ..Settings::new() } ) ] diff --git a/benches/cases/selector_matching.rs b/benches/cases/selector_matching.rs index ecacf875..e9a3ad29 100644 --- a/benches/cases/selector_matching.rs +++ b/benches/cases/selector_matching.rs @@ -7,28 +7,28 @@ define_group!( "Match-all selector", Settings { element_content_handlers: vec![element!("*", noop_handler!())], - ..Settings::default() + ..Settings::new() } ), ( "Tag name selector", Settings { element_content_handlers: vec![element!("div", noop_handler!())], - ..Settings::default() + ..Settings::new() } ), ( "Class selector", Settings { element_content_handlers: vec![element!(".note", noop_handler!())], - ..Settings::default() + ..Settings::new() } ), ( "Attribute selector", Settings { element_content_handlers: vec![element!("[href]", noop_handler!())], - ..Settings::default() + ..Settings::new() } ), ( @@ -43,7 +43,7 @@ define_group!( element!("div img", noop_handler!()), element!("div.note span", noop_handler!()) ], - ..Settings::default() + ..Settings::new() } ) ] diff --git a/c-api/src/rewriter_builder.rs b/c-api/src/rewriter_builder.rs index 23568fc0..c19eaa36 100644 --- a/c-api/src/rewriter_builder.rs +++ b/c-api/src/rewriter_builder.rs @@ -26,7 +26,7 @@ impl ExternHandler { } macro_rules! 
add_handler { - ($handlers:ident, $self:ident.$ty:ident) => {{ + ($handlers:ident, $el_ty:ident, $self:ident.$ty:ident) => {{ if let Some(handler) = $self.$ty.func { // NOTE: the closure actually holds a reference to the content // handler object, but since we pass the object to the C side this @@ -41,7 +41,7 @@ macro_rules! add_handler { $handlers = $handlers.$ty( - move |arg: &mut _| match unsafe { handler(arg, user_data) } { + move |arg: &mut $el_ty| match unsafe { handler(arg, user_data) } { RewriterDirective::Continue => Ok(()), RewriterDirective::Stop => Err("The rewriter has been stopped.".into()), }, @@ -61,10 +61,10 @@ impl ExternDocumentContentHandlers { pub fn as_safe_document_content_handlers(&self) -> DocumentContentHandlers { let mut handlers = DocumentContentHandlers::default(); - add_handler!(handlers, self.doctype); - add_handler!(handlers, self.comments); - add_handler!(handlers, self.text); - add_handler!(handlers, self.end); + add_handler!(handlers, Doctype, self.doctype); + add_handler!(handlers, Comment, self.comments); + add_handler!(handlers, TextChunk, self.text); + add_handler!(handlers, DocumentEnd, self.end); handlers } @@ -80,9 +80,9 @@ impl ExternElementContentHandlers { pub fn as_safe_element_content_handlers(&self) -> ElementContentHandlers { let mut handlers = ElementContentHandlers::default(); - add_handler!(handlers, self.element); - add_handler!(handlers, self.comments); - add_handler!(handlers, self.text); + add_handler!(handlers, Element, self.element); + add_handler!(handlers, Comment, self.comments); + add_handler!(handlers, TextChunk, self.text); handlers } diff --git a/examples/defer_scripts/main.rs b/examples/defer_scripts/main.rs index 1f5a7277..db52762c 100644 --- a/examples/defer_scripts/main.rs +++ b/examples/defer_scripts/main.rs @@ -21,7 +21,7 @@ fn main() { Ok(()) } )], - ..Settings::default() + ..Settings::new() }, output_sink, ); diff --git a/examples/mixed_content_rewriter/main.rs 
b/examples/mixed_content_rewriter/main.rs index f18c72e7..858d414a 100644 --- a/examples/mixed_content_rewriter/main.rs +++ b/examples/mixed_content_rewriter/main.rs @@ -37,7 +37,7 @@ fn main() { } ), ], - ..Settings::default() + ..Settings::new() }, output_sink, ); diff --git a/fuzz/test_case/src/lib.rs b/fuzz/test_case/src/lib.rs index 41de2371..f89c1f09 100644 --- a/fuzz/test_case/src/lib.rs +++ b/fuzz/test_case/src/lib.rs @@ -14,9 +14,7 @@ use std::ffi::{CStr, CString}; use encoding_rs::*; use lol_html::html_content::ContentType; -use lol_html::{ - comments, doc_comments, doc_text, element, text, HtmlRewriter, MemorySettings, Settings, -}; +use lol_html::{comments, doc_comments, doc_text, element, text, HtmlRewriter, MemorySettings, Settings}; include!(concat!(env!("OUT_DIR"), "/bindings.rs")); @@ -103,7 +101,7 @@ fn get_random_selector() -> &'static str { } fn run_rewriter_iter(data: &[u8], selector: &str, encoding: &'static Encoding) -> () { - let mut rewriter = HtmlRewriter::new( + let mut rewriter: HtmlRewriter<_> = HtmlRewriter::new( Settings { enable_esi_tags: true, element_content_handlers: vec![ @@ -178,7 +176,7 @@ fn run_rewriter_iter(data: &[u8], selector: &str, encoding: &'static Encoding) - }), ], encoding: encoding.try_into().unwrap(), - memory_settings: MemorySettings::default(), + memory_settings: MemorySettings::new(), strict: false, adjust_charset_on_meta_tag: false, }, diff --git a/src/base/encoding.rs b/src/base/encoding.rs index 34878f0f..543eca09 100644 --- a/src/base/encoding.rs +++ b/src/base/encoding.rs @@ -1,8 +1,249 @@ use crate::rewriter::AsciiCompatibleEncoding; use encoding_rs::Encoding; -use std::cell::Cell; -use std::ops::Deref; -use std::rc::Rc; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; + +/// This serves as a map from integer to [`Encoding`], which allows more efficient +/// sets/gets of the [SharedEncoding]. 
+static ALL_ENCODINGS: [&Encoding; 228] = [ + &encoding_rs::WINDOWS_1252_INIT, + &encoding_rs::ISO_8859_2_INIT, + &encoding_rs::ISO_8859_3_INIT, + &encoding_rs::ISO_8859_4_INIT, + &encoding_rs::WINDOWS_1254_INIT, + &encoding_rs::ISO_8859_10_INIT, + &encoding_rs::ISO_8859_15_INIT, + &encoding_rs::IBM866_INIT, + &encoding_rs::MACINTOSH_INIT, + &encoding_rs::KOI8_R_INIT, + &encoding_rs::GBK_INIT, + &encoding_rs::BIG5_INIT, + &encoding_rs::UTF_8_INIT, + &encoding_rs::KOI8_R_INIT, + &encoding_rs::SHIFT_JIS_INIT, + &encoding_rs::UTF_16LE_INIT, + &encoding_rs::SHIFT_JIS_INIT, + &encoding_rs::IBM866_INIT, + &encoding_rs::UTF_8_INIT, + &encoding_rs::WINDOWS_1252_INIT, + &encoding_rs::WINDOWS_1252_INIT, + &encoding_rs::GBK_INIT, + &encoding_rs::ISO_8859_7_INIT, + &encoding_rs::WINDOWS_1250_INIT, + &encoding_rs::WINDOWS_1251_INIT, + &encoding_rs::WINDOWS_1252_INIT, + &encoding_rs::GBK_INIT, + &encoding_rs::WINDOWS_1252_INIT, + &encoding_rs::ISO_8859_2_INIT, + &encoding_rs::WINDOWS_1253_INIT, + &encoding_rs::ISO_8859_3_INIT, + &encoding_rs::WINDOWS_1254_INIT, + &encoding_rs::ISO_8859_4_INIT, + &encoding_rs::WINDOWS_1255_INIT, + &encoding_rs::BIG5_INIT, + &encoding_rs::WINDOWS_1254_INIT, + &encoding_rs::UTF_16LE_INIT, + &encoding_rs::WINDOWS_1256_INIT, + &encoding_rs::IBM866_INIT, + &encoding_rs::ISO_8859_10_INIT, + &encoding_rs::WINDOWS_1257_INIT, + &encoding_rs::WINDOWS_1258_INIT, + &encoding_rs::ISO_8859_7_INIT, + &encoding_rs::WINDOWS_1252_INIT, + &encoding_rs::ISO_8859_6_INIT, + &encoding_rs::ISO_8859_8_INIT, + &encoding_rs::EUC_KR_INIT, + &encoding_rs::EUC_JP_INIT, + &encoding_rs::KOI8_R_INIT, + &encoding_rs::KOI8_R_INIT, + &encoding_rs::EUC_KR_INIT, + &encoding_rs::SHIFT_JIS_INIT, + &encoding_rs::KOI8_U_INIT, + &encoding_rs::ISO_8859_8_INIT, + &encoding_rs::WINDOWS_874_INIT, + &encoding_rs::GB18030_INIT, + &encoding_rs::EUC_KR_INIT, + &encoding_rs::GBK_INIT, + &encoding_rs::WINDOWS_874_INIT, + &encoding_rs::BIG5_INIT, + &encoding_rs::UTF_16LE_INIT, + 
&encoding_rs::GBK_INIT, + &encoding_rs::ISO_8859_8_I_INIT, + &encoding_rs::KOI8_R_INIT, + &encoding_rs::EUC_KR_INIT, + &encoding_rs::KOI8_U_INIT, + &encoding_rs::WINDOWS_1250_INIT, + &encoding_rs::EUC_KR_INIT, + &encoding_rs::WINDOWS_1251_INIT, + &encoding_rs::WINDOWS_1252_INIT, + &encoding_rs::GBK_INIT, + &encoding_rs::WINDOWS_1252_INIT, + &encoding_rs::ISO_8859_2_INIT, + &encoding_rs::WINDOWS_1253_INIT, + &encoding_rs::ISO_8859_3_INIT, + &encoding_rs::ISO_8859_6_INIT, + &encoding_rs::WINDOWS_1254_INIT, + &encoding_rs::ISO_8859_4_INIT, + &encoding_rs::WINDOWS_1255_INIT, + &encoding_rs::ISO_8859_5_INIT, + &encoding_rs::BIG5_INIT, + &encoding_rs::WINDOWS_1256_INIT, + &encoding_rs::IBM866_INIT, + &encoding_rs::ISO_8859_6_INIT, + &encoding_rs::WINDOWS_1257_INIT, + &encoding_rs::ISO_8859_7_INIT, + &encoding_rs::ISO_8859_6_INIT, + &encoding_rs::ISO_8859_7_INIT, + &encoding_rs::ISO_8859_7_INIT, + &encoding_rs::WINDOWS_1258_INIT, + &encoding_rs::ISO_8859_8_INIT, + &encoding_rs::WINDOWS_1254_INIT, + &encoding_rs::ISO_8859_5_INIT, + &encoding_rs::UTF_16BE_INIT, + &encoding_rs::UTF_16LE_INIT, + &encoding_rs::WINDOWS_1252_INIT, + &encoding_rs::SHIFT_JIS_INIT, + &encoding_rs::EUC_JP_INIT, + &encoding_rs::ISO_8859_10_INIT, + &encoding_rs::WINDOWS_1252_INIT, + &encoding_rs::WINDOWS_874_INIT, + &encoding_rs::ISO_8859_2_INIT, + &encoding_rs::ISO_8859_3_INIT, + &encoding_rs::ISO_8859_13_INIT, + &encoding_rs::ISO_8859_4_INIT, + &encoding_rs::ISO_8859_14_INIT, + &encoding_rs::ISO_8859_5_INIT, + &encoding_rs::ISO_8859_15_INIT, + &encoding_rs::ISO_8859_6_INIT, + &encoding_rs::ISO_8859_7_INIT, + &encoding_rs::ISO_8859_8_INIT, + &encoding_rs::GBK_INIT, + &encoding_rs::WINDOWS_1254_INIT, + &encoding_rs::UTF_16LE_INIT, + &encoding_rs::MACINTOSH_INIT, + &encoding_rs::SHIFT_JIS_INIT, + &encoding_rs::SHIFT_JIS_INIT, + &encoding_rs::WINDOWS_1252_INIT, + &encoding_rs::ISO_8859_10_INIT, + &encoding_rs::ISO_8859_4_INIT, + &encoding_rs::GBK_INIT, + &encoding_rs::WINDOWS_1252_INIT, + 
&encoding_rs::WINDOWS_1252_INIT, + &encoding_rs::ISO_8859_2_INIT, + &encoding_rs::WINDOWS_874_INIT, + &encoding_rs::ISO_8859_2_INIT, + &encoding_rs::ISO_8859_2_INIT, + &encoding_rs::REPLACEMENT_INIT, + &encoding_rs::ISO_8859_3_INIT, + &encoding_rs::ISO_8859_3_INIT, + &encoding_rs::ISO_8859_13_INIT, + &encoding_rs::ISO_8859_4_INIT, + &encoding_rs::ISO_8859_4_INIT, + &encoding_rs::ISO_8859_14_INIT, + &encoding_rs::ISO_8859_5_INIT, + &encoding_rs::ISO_8859_5_INIT, + &encoding_rs::ISO_8859_5_INIT, + &encoding_rs::ISO_8859_15_INIT, + &encoding_rs::ISO_8859_6_INIT, + &encoding_rs::ISO_8859_6_INIT, + &encoding_rs::ISO_8859_7_INIT, + &encoding_rs::ISO_8859_7_INIT, + &encoding_rs::ISO_8859_7_INIT, + &encoding_rs::ISO_8859_6_INIT, + &encoding_rs::ISO_8859_10_INIT, + &encoding_rs::ISO_8859_8_INIT, + &encoding_rs::ISO_8859_8_INIT, + &encoding_rs::ISO_8859_8_INIT, + &encoding_rs::WINDOWS_1254_INIT, + &encoding_rs::WINDOWS_1254_INIT, + &encoding_rs::WINDOWS_1254_INIT, + &encoding_rs::ISO_8859_3_INIT, + &encoding_rs::EUC_KR_INIT, + &encoding_rs::BIG5_INIT, + &encoding_rs::SHIFT_JIS_INIT, + &encoding_rs::ISO_8859_10_INIT, + &encoding_rs::WINDOWS_874_INIT, + &encoding_rs::WINDOWS_1252_INIT, + &encoding_rs::ISO_8859_2_INIT, + &encoding_rs::ISO_8859_13_INIT, + &encoding_rs::ISO_8859_3_INIT, + &encoding_rs::ISO_8859_14_INIT, + &encoding_rs::WINDOWS_874_INIT, + &encoding_rs::ISO_8859_4_INIT, + &encoding_rs::ISO_8859_15_INIT, + &encoding_rs::ISO_8859_15_INIT, + &encoding_rs::WINDOWS_1254_INIT, + &encoding_rs::ISO_8859_16_INIT, + &encoding_rs::ISO_8859_10_INIT, + &encoding_rs::EUC_KR_INIT, + &encoding_rs::ISO_8859_15_INIT, + &encoding_rs::ISO_8859_6_INIT, + &encoding_rs::ISO_8859_8_INIT, + &encoding_rs::UTF_16BE_INIT, + &encoding_rs::UTF_16LE_INIT, + &encoding_rs::MACINTOSH_INIT, + &encoding_rs::ISO_8859_6_INIT, + &encoding_rs::ISO_8859_8_I_INIT, + &encoding_rs::SHIFT_JIS_INIT, + &encoding_rs::MACINTOSH_INIT, + &encoding_rs::REPLACEMENT_INIT, + &encoding_rs::ISO_2022_JP_INIT, + 
&encoding_rs::ISO_2022_JP_INIT, + &encoding_rs::REPLACEMENT_INIT, + &encoding_rs::REPLACEMENT_INIT, + &encoding_rs::REPLACEMENT_INIT, + &encoding_rs::WINDOWS_1250_INIT, + &encoding_rs::WINDOWS_1251_INIT, + &encoding_rs::WINDOWS_1252_INIT, + &encoding_rs::WINDOWS_1253_INIT, + &encoding_rs::WINDOWS_1254_INIT, + &encoding_rs::WINDOWS_1255_INIT, + &encoding_rs::WINDOWS_1256_INIT, + &encoding_rs::WINDOWS_1257_INIT, + &encoding_rs::WINDOWS_1258_INIT, + &encoding_rs::ISO_8859_6_INIT, + &encoding_rs::ISO_8859_8_INIT, + &encoding_rs::ISO_8859_6_INIT, + &encoding_rs::ISO_8859_8_I_INIT, + &encoding_rs::ISO_8859_7_INIT, + &encoding_rs::EUC_KR_INIT, + &encoding_rs::UTF_8_INIT, + &encoding_rs::UTF_8_INIT, + &encoding_rs::EUC_KR_INIT, + &encoding_rs::WINDOWS_1252_INIT, + &encoding_rs::EUC_KR_INIT, + &encoding_rs::X_MAC_CYRILLIC_INIT, + &encoding_rs::X_USER_DEFINED_INIT, + &encoding_rs::GBK_INIT, + &encoding_rs::UTF_16LE_INIT, + &encoding_rs::WINDOWS_1252_INIT, + &encoding_rs::ISO_8859_2_INIT, + &encoding_rs::ISO_8859_6_INIT, + &encoding_rs::ISO_8859_7_INIT, + &encoding_rs::ISO_8859_3_INIT, + &encoding_rs::ISO_8859_4_INIT, + &encoding_rs::ISO_8859_5_INIT, + &encoding_rs::ISO_8859_8_INIT, + &encoding_rs::UTF_8_INIT, + &encoding_rs::WINDOWS_1254_INIT, + &encoding_rs::ISO_8859_7_INIT, + &encoding_rs::X_MAC_CYRILLIC_INIT, + &encoding_rs::REPLACEMENT_INIT, + &encoding_rs::ISO_8859_6_INIT, + &encoding_rs::ISO_8859_8_INIT, + &encoding_rs::UTF_8_INIT, + &encoding_rs::ISO_8859_5_INIT, + &encoding_rs::EUC_JP_INIT, +]; + +fn encoding_to_index(encoding: AsciiCompatibleEncoding) -> usize { + let encoding: &'static Encoding = encoding.into(); + + ALL_ENCODINGS + .iter() + .position(|&e| e == encoding) + .expect("the ALL_ENCODINGS is not complete and needs to be updated") +} /// A charset encoding that can be shared and modified. /// @@ -11,29 +252,42 @@ use std::rc::Rc; /// [crate::Settings::adjust_charset_on_meta_tag]). 
#[derive(Clone)] pub struct SharedEncoding { - encoding: Rc>, + encoding: Arc, } impl SharedEncoding { pub fn new(encoding: AsciiCompatibleEncoding) -> SharedEncoding { SharedEncoding { - encoding: Rc::new(Cell::new(encoding)), + encoding: Arc::new(AtomicUsize::new(encoding_to_index(encoding))), } } pub fn get(&self) -> &'static Encoding { - self.encoding.get().into() + let encoding = self.encoding.load(Ordering::Relaxed); + ALL_ENCODINGS[encoding] } pub fn set(&self, encoding: AsciiCompatibleEncoding) { - self.encoding.set(encoding); + self.encoding + .store(encoding_to_index(encoding), Ordering::Relaxed); } } -impl Deref for SharedEncoding { - type Target = Encoding; +#[cfg(test)] +mod tests { + use crate::base::encoding::ALL_ENCODINGS; + use crate::base::SharedEncoding; + use crate::AsciiCompatibleEncoding; + + #[test] + fn test_encoding_round_trip() { + let shared_encoding = SharedEncoding::new(AsciiCompatibleEncoding::utf_8()); - fn deref(&self) -> &'static Encoding { - self.get() + for encoding in ALL_ENCODINGS { + if let Some(ascii_compat_encoding) = AsciiCompatibleEncoding::new(encoding) { + shared_encoding.set(ascii_compat_encoding); + assert_eq!(shared_encoding.get(), encoding); + } + } } } diff --git a/src/lib.rs b/src/lib.rs index a7e3dc33..f31432ef 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -37,12 +37,51 @@ use cfg_if::cfg_if; pub use self::rewriter::{ rewrite_str, AsciiCompatibleEncoding, CommentHandler, DoctypeHandler, DocumentContentHandlers, - ElementContentHandlers, ElementHandler, EndHandler, EndTagHandler, HandlerResult, HtmlRewriter, - MemorySettings, RewriteStrSettings, Settings, TextHandler, + ElementContentHandlers, ElementHandler, EndHandler, EndTagHandler, HandlerNormalTypes, + HandlerResult, HandlerTypes, HtmlRewriter, MemorySettings, RewriteStrSettings, Settings, + TextHandler, }; pub use self::selectors_vm::Selector; pub use self::transform_stream::OutputSink; +// WIP! also instruction for `Send` in the readme. 
+ +/// This module contains types to work with [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s. +pub mod send { + use crate::rewriter::{ + CommentHandlerSend, DoctypeHandlerSend, ElementHandlerSend, EndHandlerSend, + EndTagHandlerSend, HandlerSendTypes, TextHandlerSend, + }; + + /// An [`HtmlRewriter`](crate::HtmlRewriter) that implements [`Send`]. + pub type HtmlRewriter<'h, O> = crate::HtmlRewriter<'h, O, HandlerSendTypes>; + /// [`Settings`](crate::Settings) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s. + pub type Settings<'h, 's> = crate::Settings<'h, 's, HandlerSendTypes>; + /// [`RewriteStrSettings`](crate::RewriteStrSettings) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s. + pub type RewriteStrSettings<'h, 's> = crate::RewriteStrSettings<'h, 's, HandlerSendTypes>; + + /// [`ElementContentHandlers`](crate::ElementContentHandlers) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s. + pub type ElementContentHandlers<'h> = crate::ElementContentHandlers<'h, HandlerSendTypes>; + /// [`DocumentContentHandlers`](crate::DocumentContentHandlers) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s. + pub type DocumentContentHandlers<'h> = crate::DocumentContentHandlers<'h, HandlerSendTypes>; + + /// [`CommentHandler`](crate::CommentHandler) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s. + pub type CommentHandler<'h> = CommentHandlerSend<'h>; + /// [`DoctypeHandler`](crate::DoctypeHandler) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s. + pub type DoctypeHandler<'h> = DoctypeHandlerSend<'h>; + /// [`ElementHandler`](crate::ElementHandler) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s. + pub type ElementHandler<'h> = ElementHandlerSend<'h>; + /// [`EndHandler`](crate::EndHandler) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s. + pub type EndHandler<'h> = EndHandlerSend<'h>; + /// [`EndTagHandler`](crate::EndTagHandler) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s. 
+ pub type EndTagHandler<'h> = EndTagHandlerSend<'h>; + /// [`TextHandler`](crate::TextHandler) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s. + pub type TextHandler<'h> = TextHandlerSend<'h>; + + /// [`Element`](crate::Element) for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s. + pub type Element<'r, 't> = crate::rewritable_units::Element<'r, 't, HandlerSendTypes>; +} + /// The errors that can be produced by the crate's API. pub mod errors { pub use super::memory::MemoryLimitExceededError; diff --git a/src/memory/limiter.rs b/src/memory/limiter.rs index e3706501..7d367cb5 100644 --- a/src/memory/limiter.rs +++ b/src/memory/limiter.rs @@ -1,5 +1,5 @@ -use std::cell::Cell; -use std::rc::Rc; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Arc; use thiserror::Error; /// An error that occures when rewriter exceedes the memory limit specified in the @@ -12,30 +12,28 @@ pub struct MemoryLimitExceededError; #[derive(Debug, Clone)] pub struct SharedMemoryLimiter { - current_usage: Rc>, + current_usage: Arc, max: usize, } impl SharedMemoryLimiter { pub fn new(max: usize) -> SharedMemoryLimiter { SharedMemoryLimiter { - current_usage: Rc::new(Cell::new(0)), + current_usage: Arc::new(AtomicUsize::new(0)), max, } } #[cfg(test)] pub fn current_usage(&self) -> usize { - self.current_usage.get() + self.current_usage.load(Ordering::Relaxed) } #[inline] pub fn increase_usage(&self, byte_count: usize) -> Result<(), MemoryLimitExceededError> { - let previous_usage = self.current_usage.get(); + let previous_usage = self.current_usage.fetch_add(byte_count, Ordering::Relaxed); let current_usage = previous_usage + byte_count; - self.current_usage.set(current_usage); - if current_usage > self.max { Err(MemoryLimitExceededError) } else { @@ -52,10 +50,7 @@ impl SharedMemoryLimiter { #[inline] pub fn decrease_usage(&self, byte_count: usize) { - let previous_usage = self.current_usage.get(); - let current_usage = previous_usage - byte_count; - - 
self.current_usage.set(current_usage); + self.current_usage.fetch_sub(byte_count, Ordering::Relaxed); } } diff --git a/src/parser/tree_builder_simulator/mod.rs b/src/parser/tree_builder_simulator/mod.rs index 24fa183d..d5a908af 100644 --- a/src/parser/tree_builder_simulator/mod.rs +++ b/src/parser/tree_builder_simulator/mod.rs @@ -28,7 +28,9 @@ pub enum TreeBuilderFeedback { SwitchTextType(TextType), SetAllowCdata(bool), #[allow(clippy::type_complexity)] - RequestLexeme(Box TreeBuilderFeedback>), + RequestLexeme( + Box TreeBuilderFeedback + Send>, + ), None, } @@ -41,7 +43,7 @@ impl From for TreeBuilderFeedback { #[inline] fn request_lexeme( - callback: impl FnMut(&mut TreeBuilderSimulator, &TagLexeme) -> TreeBuilderFeedback + 'static, + callback: impl FnMut(&mut TreeBuilderSimulator, &TagLexeme) -> TreeBuilderFeedback + 'static + Send, ) -> TreeBuilderFeedback { TreeBuilderFeedback::RequestLexeme(Box::new(callback)) } diff --git a/src/rewritable_units/document_end.rs b/src/rewritable_units/document_end.rs index daa5af2c..a853f869 100644 --- a/src/rewritable_units/document_end.rs +++ b/src/rewritable_units/document_end.rs @@ -40,7 +40,7 @@ impl<'a> DocumentEnd<'a> { /// end.append("", ContentType::Text); /// Ok(()) /// })], - /// ..RewriteStrSettings::default() + /// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// diff --git a/src/rewritable_units/element.rs b/src/rewritable_units/element.rs index ed3bb295..3a725838 100644 --- a/src/rewritable_units/element.rs +++ b/src/rewritable_units/element.rs @@ -1,6 +1,6 @@ use super::{Attribute, AttributeNameError, ContentType, EndTag, Mutations, StartTag}; use crate::base::Bytes; -use crate::rewriter::EndTagHandler; +use crate::rewriter::{HandlerNormalTypes, HandlerTypes}; use encoding_rs::Encoding; use std::any::Any; use std::fmt::{self, Debug}; @@ -33,18 +33,18 @@ pub enum TagNameError { /// An HTML element rewritable unit. /// /// Exposes API for examination and modification of a parsed HTML element. 
-pub struct Element<'r, 't> { +pub struct Element<'r, 't, H: HandlerTypes = HandlerNormalTypes> { start_tag: &'r mut StartTag<'t>, end_tag_mutations: Option, modified_end_tag_name: Option>, - end_tag_handlers: Vec>, + end_tag_handlers: Vec>, can_have_content: bool, should_remove_content: bool, encoding: &'static Encoding, user_data: Box, } -impl<'r, 't> Element<'r, 't> { +impl<'r, 't, H: HandlerTypes> Element<'r, 't, H> { pub(crate) fn new(start_tag: &'r mut StartTag<'t>, can_have_content: bool) -> Self { let encoding = start_tag.encoding(); @@ -214,7 +214,7 @@ impl<'r, 't> Element<'r, 't> { /// Ok(()) /// }) /// ], - /// ..RewriteStrSettings::default() + /// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// @@ -247,7 +247,7 @@ impl<'r, 't> Element<'r, 't> { /// Ok(()) /// }) /// ], - /// ..RewriteStrSettings::default() + /// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// @@ -291,7 +291,7 @@ impl<'r, 't> Element<'r, 't> { /// element!("#foo", handler), /// element!("img", handler), /// ], - /// ..RewriteStrSettings::default() + /// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// @@ -333,7 +333,7 @@ impl<'r, 't> Element<'r, 't> { /// element!("#foo", handler), /// element!("img", handler), /// ], - /// ..RewriteStrSettings::default() + /// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// @@ -374,7 +374,7 @@ impl<'r, 't> Element<'r, 't> { /// element!("#foo", handler), /// element!("img", handler), /// ], - /// ..RewriteStrSettings::default() + /// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// @@ -409,7 +409,7 @@ impl<'r, 't> Element<'r, 't> { /// Ok(()) /// }) /// ], - /// ..RewriteStrSettings::default() + /// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// @@ -453,7 +453,7 @@ impl<'r, 't> Element<'r, 't> { /// Ok(()) /// }) /// ], - /// ..RewriteStrSettings::default() + /// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// @@ -493,14 +493,14 @@ impl<'r, 't> Element<'r, 't> { /// # Example /// /// ``` - /// use 
lol_html::html_content::ContentType; + /// use lol_html::html_content::{ContentType, Element}; /// use lol_html::{element, rewrite_str, text, RewriteStrSettings}; /// let buffer = std::rc::Rc::new(std::cell::RefCell::new(String::new())); /// let html = rewrite_str( /// "Short13 characters", /// RewriteStrSettings { /// element_content_handlers: vec![ - /// element!("span", |el| { + /// element!("span", |el: &mut Element| { /// // Truncate string for each new span. /// buffer.borrow_mut().clear(); /// let buffer = buffer.clone(); @@ -527,14 +527,14 @@ impl<'r, 't> Element<'r, 't> { /// Ok(()) /// }), /// ], - /// ..RewriteStrSettings::default() + /// ..RewriteStrSettings::new() /// }, /// ) /// .unwrap(); /// /// assert_eq!(html, "Short13 characters!"); /// ``` - pub fn end_tag_handlers(&mut self) -> Option<&mut Vec>> { + pub fn end_tag_handlers(&mut self) -> Option<&mut Vec>> { if self.can_have_content { Some(&mut self.end_tag_handlers) } else { @@ -542,30 +542,32 @@ impl<'r, 't> Element<'r, 't> { } } - pub(crate) fn into_end_tag_handler(self) -> Option> { + pub(crate) fn into_end_tag_handler(self) -> Option> { let end_tag_mutations = self.end_tag_mutations; let modified_end_tag_name = self.modified_end_tag_name; - let end_tag_handlers = self.end_tag_handlers; + let mut end_tag_handlers = self.end_tag_handlers; if end_tag_mutations.is_some() || modified_end_tag_name.is_some() || !end_tag_handlers.is_empty() { - Some(Box::new(move |end_tag: &mut EndTag| { - if let Some(name) = modified_end_tag_name { - end_tag.set_name(name); - } + end_tag_handlers.insert( + 0, + H::new_end_tag_handler(|end_tag: &mut EndTag| { + if let Some(name) = modified_end_tag_name { + end_tag.set_name(name); + } - if let Some(mutations) = end_tag_mutations { - end_tag.mutations = mutations; - } + if let Some(mutations) = end_tag_mutations { + end_tag.mutations = mutations; + } - for handler in end_tag_handlers.into_iter() { - handler(end_tag)?; - } + Ok(()) + }), + ); - Ok(()) - })) + // WIP! 
there must be a better way! + Some(H::combine_handlers(end_tag_handlers)) } else { None } @@ -574,7 +576,7 @@ impl<'r, 't> Element<'r, 't> { impl_user_data!(Element<'_, '_>); -impl Debug for Element<'_, '_> { +impl Debug for Element<'_, '_, H> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.debug_struct("Element") .field("tag_name", &self.tag_name()) diff --git a/src/rewritable_units/mod.rs b/src/rewritable_units/mod.rs index 145ae42b..922e964f 100644 --- a/src/rewritable_units/mod.rs +++ b/src/rewritable_units/mod.rs @@ -44,7 +44,7 @@ pub use self::tokens::*; /// Ok(()) /// }) /// ], -/// ..RewriteStrSettings::default() +/// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// ``` @@ -115,11 +115,11 @@ mod test_utils { .collect() } - pub fn rewrite_html( + pub fn rewrite_html<'h>( html: &[u8], encoding: &'static Encoding, - element_content_handlers: Vec<(Cow<'_, Selector>, ElementContentHandlers)>, - document_content_handlers: Vec, + element_content_handlers: Vec<(Cow<'_, Selector>, ElementContentHandlers<'h>)>, + document_content_handlers: Vec>, ) -> String { let mut output = Output::new(encoding); @@ -129,7 +129,7 @@ mod test_utils { element_content_handlers, document_content_handlers, encoding: AsciiCompatibleEncoding::new(encoding).unwrap(), - ..Settings::default() + ..Settings::new() }, |c: &[u8]| output.push(c), ); diff --git a/src/rewritable_units/tokens/comment.rs b/src/rewritable_units/tokens/comment.rs index 4af85822..eec9400a 100644 --- a/src/rewritable_units/tokens/comment.rs +++ b/src/rewritable_units/tokens/comment.rs @@ -94,7 +94,7 @@ impl<'i> Comment<'i> { /// Ok(()) /// }) /// ], - /// ..RewriteStrSettings::default() + /// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// @@ -126,7 +126,7 @@ impl<'i> Comment<'i> { /// Ok(()) /// }) /// ], - /// ..RewriteStrSettings::default() + /// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// @@ -158,7 +158,7 @@ impl<'i> Comment<'i> { /// Ok(()) /// }) /// ], - /// 
..RewriteStrSettings::default() + /// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// diff --git a/src/rewritable_units/tokens/doctype.rs b/src/rewritable_units/tokens/doctype.rs index 3e3fb27c..12a59371 100644 --- a/src/rewritable_units/tokens/doctype.rs +++ b/src/rewritable_units/tokens/doctype.rs @@ -25,7 +25,7 @@ use std::fmt::{self, Debug}; /// Ok(()) /// }) /// ], -/// ..RewriteStrSettings::default() +/// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// ``` diff --git a/src/rewritable_units/tokens/text_chunk.rs b/src/rewritable_units/tokens/text_chunk.rs index 7d24f332..58bc3e85 100644 --- a/src/rewritable_units/tokens/text_chunk.rs +++ b/src/rewritable_units/tokens/text_chunk.rs @@ -42,7 +42,7 @@ use std::fmt::{self, Debug}; /// Ok(()) /// }) /// ], -/// ..Settings::default() +/// ..Settings::new() /// }, /// |_:&[u8]| {} /// ); @@ -131,7 +131,7 @@ impl<'i> TextChunk<'i> { /// Ok(()) /// }) /// ], - /// ..RewriteStrSettings::default() + /// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// ``` @@ -171,7 +171,7 @@ impl<'i> TextChunk<'i> { /// Ok(()) /// }) /// ], - /// ..RewriteStrSettings::default() + /// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// @@ -205,7 +205,7 @@ impl<'i> TextChunk<'i> { /// Ok(()) /// }) /// ], - /// ..RewriteStrSettings::default() + /// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// @@ -239,7 +239,7 @@ impl<'i> TextChunk<'i> { /// Ok(()) /// }) /// ], - /// ..RewriteStrSettings::default() + /// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// diff --git a/src/rewriter/handlers_dispatcher.rs b/src/rewriter/handlers_dispatcher.rs index 92965176..eb1ffee9 100644 --- a/src/rewriter/handlers_dispatcher.rs +++ b/src/rewriter/handlers_dispatcher.rs @@ -112,21 +112,35 @@ impl HandlerVec { } } -#[derive(Default)] -pub struct ContentHandlersDispatcher<'h> { - doctype_handlers: HandlerVec>, - comment_handlers: HandlerVec>, - text_handlers: HandlerVec>, - end_tag_handlers: HandlerVec>, - element_handlers: 
HandlerVec>, - end_handlers: HandlerVec>, +pub struct ContentHandlersDispatcher<'h, H: HandlerTypes> { + doctype_handlers: HandlerVec>, + comment_handlers: HandlerVec>, + text_handlers: HandlerVec>, + end_tag_handlers: HandlerVec>, + element_handlers: HandlerVec>, + end_handlers: HandlerVec>, next_element_can_have_content: bool, matched_elements_with_removed_content: usize, } -impl<'h> ContentHandlersDispatcher<'h> { +impl<'h, H: HandlerTypes> Default for ContentHandlersDispatcher<'h, H> { + fn default() -> Self { + ContentHandlersDispatcher { + doctype_handlers: Default::default(), + comment_handlers: Default::default(), + text_handlers: Default::default(), + end_tag_handlers: Default::default(), + element_handlers: Default::default(), + end_handlers: Default::default(), + next_element_can_have_content: false, + matched_elements_with_removed_content: 0, + } + } +} + +impl<'h, H: HandlerTypes> ContentHandlersDispatcher<'h, H> { #[inline] - pub fn add_document_content_handlers(&mut self, handlers: DocumentContentHandlers<'h>) { + pub fn add_document_content_handlers(&mut self, handlers: DocumentContentHandlers<'h, H>) { if let Some(handler) = handlers.doctype { self.doctype_handlers.push(handler, true); } @@ -147,7 +161,7 @@ impl<'h> ContentHandlersDispatcher<'h> { #[inline] pub fn add_selector_associated_handlers( &mut self, - handlers: ElementContentHandlers<'h>, + handlers: ElementContentHandlers<'h, H>, ) -> SelectorHandlersLocator { SelectorHandlersLocator { element_handler_idx: handlers.element.map(|h| { diff --git a/src/rewriter/mod.rs b/src/rewriter/mod.rs index 780cb2ed..6a8450db 100644 --- a/src/rewriter/mod.rs +++ b/src/rewriter/mod.rs @@ -2,13 +2,15 @@ mod handlers_dispatcher; mod rewrite_controller; #[macro_use] -mod settings; +pub(crate) mod settings; use self::handlers_dispatcher::ContentHandlersDispatcher; use self::rewrite_controller::*; +pub use self::settings::*; use crate::base::SharedEncoding; use crate::memory::{MemoryLimitExceededError, 
SharedMemoryLimiter}; use crate::parser::ParsingAmbiguityError; +use crate::rewritable_units::Element; use crate::selectors_vm::{self, SelectorMatchingVm}; use crate::transform_stream::*; use encoding_rs::Encoding; @@ -18,8 +20,6 @@ use std::error::Error as StdError; use std::fmt::{self, Debug}; use thiserror::Error; -pub use self::settings::*; - /// This is an encoding known to be ASCII-compatible. /// /// Non-ASCII-compatible encodings (`UTF-16LE`, `UTF-16BE`, `ISO-2022-JP` and @@ -55,7 +55,7 @@ impl From for &'static Encoding { } } -impl std::convert::TryFrom<&'static Encoding> for AsciiCompatibleEncoding { +impl TryFrom<&'static Encoding> for AsciiCompatibleEncoding { type Error = (); fn try_from(enc: &'static Encoding) -> Result { @@ -114,7 +114,7 @@ pub enum RewritingError { /// Ok(()) /// }) /// ], -/// ..Settings::default() +/// ..Settings::new() /// }, /// |c: &[u8]| output.extend_from_slice(c) /// ); @@ -130,8 +130,8 @@ pub enum RewritingError { /// r#"
"# /// ); /// ``` -pub struct HtmlRewriter<'h, O: OutputSink> { - stream: TransformStream, O>, +pub struct HtmlRewriter<'h, O: OutputSink, H: HandlerTypes = HandlerNormalTypes> { + stream: TransformStream, O>, poisoned: bool, } @@ -152,7 +152,7 @@ macro_rules! guarded { }}; } -impl<'h, O: OutputSink> HtmlRewriter<'h, O> { +impl<'h, O: OutputSink, H: HandlerTypes> HtmlRewriter<'h, O, H> { /// Constructs a new rewriter with the provided `settings` that writes /// the output to the `output_sink`. /// @@ -161,10 +161,10 @@ impl<'h, O: OutputSink> HtmlRewriter<'h, O> { /// For the convenience the [`OutputSink`] trait is implemented for closures. /// /// [`OutputSink`]: trait.OutputSink.html - pub fn new<'s>(settings: Settings<'h, 's>, output_sink: O) -> Self { + pub fn new<'s>(settings: Settings<'h, 's, H>, output_sink: O) -> Self { let encoding = SharedEncoding::new(settings.encoding); let mut selectors_ast = selectors_vm::Ast::default(); - let mut dispatcher = ContentHandlersDispatcher::default(); + let mut dispatcher = ContentHandlersDispatcher::::default(); let has_selectors = !settings.element_content_handlers.is_empty() || settings.adjust_charset_on_meta_tag; @@ -254,19 +254,16 @@ impl<'h, O: OutputSink> HtmlRewriter<'h, O> { // NOTE: this opaque Debug implementation is required to make // `.unwrap()` and `.expect()` methods available on Result // returned by the `HtmlRewriterBuilder.build()` method. 
-impl Debug for HtmlRewriter<'_, O> { +impl<'h, O: OutputSink, H: HandlerTypes> Debug for HtmlRewriter<'h, O, H> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "HtmlRewriter") } } -fn handler_adjust_charset_on_meta_tag( +fn handler_adjust_charset_on_meta_tag<'h, H: HandlerTypes>( encoding: SharedEncoding, -) -> ( - Cow<'static, crate::Selector>, - ElementContentHandlers<'static>, -) { - element!("meta", move |el| { +) -> (Cow<'h, crate::Selector>, ElementContentHandlers<'h, H>) { + let handler = move |el: &mut Element<'_, '_, H>| { let attr_charset = el .get_attribute("charset") .and_then(|cs| Encoding::for_label_no_replacement(cs.as_bytes())) @@ -285,7 +282,19 @@ fn handler_adjust_charset_on_meta_tag( } Ok(()) - }) + }; + + // WIP! can we make this less awful? + let content_handlers = ElementContentHandlers { + element: Some(H::new_element_handler(handler)), + comments: None, + text: None, + }; + + ( + Cow::Owned("meta".parse::().unwrap()), + content_handlers, + ) } /// Rewrites given `html` string with the provided `settings`. @@ -312,15 +321,15 @@ fn handler_adjust_charset_on_meta_tag( /// r#"
"#, /// RewriteStrSettings { /// element_content_handlers, -/// ..RewriteStrSettings::default() +/// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// /// assert_eq!(output, r#"
"#); /// ``` -pub fn rewrite_str<'h, 's>( +pub fn rewrite_str<'h, 's, H: HandlerTypes>( html: &str, - settings: impl Into>, + settings: impl Into>, ) -> Result { let mut output = vec![]; @@ -342,9 +351,13 @@ mod tests { use crate::test_utils::{Output, ASCII_COMPATIBLE_ENCODINGS, NON_ASCII_COMPATIBLE_ENCODINGS}; use encoding_rs::Encoding; use itertools::Itertools; - use std::cell::RefCell; + use static_assertions::assert_impl_all; use std::convert::TryInto; - use std::rc::Rc; + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::{Arc, Mutex}; + + // Assert that HtmlRewriter with `HandlerSendTypes` is `Send`. + assert_impl_all!(crate::send::HtmlRewriter<'_, Box>: Send); fn write_chunks( mut rewriter: HtmlRewriter, @@ -371,9 +384,41 @@ mod tests { out } + #[allow(clippy::drop_non_drop)] + #[test] + fn handlers_covariance() { + let x = AtomicUsize::new(0); + + let el_handler_static = element!("foo", |_| Ok(())); + let el_handler_local = element!("foo", |_| { + x.fetch_add(1, Ordering::Relaxed); + Ok(()) + }); + + let doc_handler_static = end!(|_| Ok(())); + let doc_handler_local = end!(|_| { + x.fetch_add(1, Ordering::Relaxed); + Ok(()) + }); + + let settings = Settings { + document_content_handlers: vec![doc_handler_static, doc_handler_local], + element_content_handlers: vec![el_handler_static, el_handler_local], + encoding: AsciiCompatibleEncoding::utf_8(), + strict: false, + adjust_charset_on_meta_tag: false, + ..Settings::new() + }; + let rewriter = HtmlRewriter::new(settings, |_: &[u8]| ()); + + drop(rewriter); + + drop(x); + } + #[test] fn rewrite_html_str() { - let res = rewrite_str( + let res = rewrite_str::( "
", RewriteStrSettings { element_content_handlers: vec![ @@ -386,7 +431,7 @@ mod tests { Ok(()) }), ], - ..RewriteStrSettings::default() + ..RewriteStrSettings::new() }, ) .unwrap(); @@ -396,7 +441,7 @@ mod tests { #[test] fn rewrite_arbitrary_settings() { - let res = rewrite_str("Some text", Settings::default()).unwrap(); + let res = rewrite_str("Some text", Settings::new()).unwrap(); assert_eq!(res, "Some text"); } @@ -421,7 +466,7 @@ mod tests { })], // NOTE: unwrap() here is intentional; it also tests `Ascii::new`. encoding: enc.try_into().unwrap(), - ..Settings::default() + ..Settings::new() }, |_: &[u8]| {}, ); @@ -468,7 +513,7 @@ mod tests { Ok(()) })], encoding: enc.try_into().unwrap(), - ..Settings::default() + ..Settings::new() }, |c: &[u8]| output.push(c), ); @@ -528,7 +573,7 @@ mod tests { }), ], encoding: enc.try_into().unwrap(), - ..Settings::default() + ..Settings::new() }, |c: &[u8]| output.push(c), ); @@ -571,15 +616,15 @@ mod tests { #[test] fn handler_invocation_order() { - let handlers_executed = Rc::new(RefCell::new(Vec::default())); + let handlers_executed = Arc::new(Mutex::new(Vec::default())); macro_rules! 
create_handlers { ($sel:expr, $idx:expr) => { element!($sel, { - let handlers_executed = ::std::rc::Rc::clone(&handlers_executed); + let handlers_executed = ::std::sync::Arc::clone(&handlers_executed); move |_| { - handlers_executed.borrow_mut().push($idx); + handlers_executed.lock().unwrap().push($idx); Ok(()) } }) @@ -596,12 +641,12 @@ mod tests { create_handlers!("[foo]", 3), create_handlers!("div span[foo]", 4), ], - ..RewriteStrSettings::default() + ..RewriteStrSettings::new() }, ) .unwrap(); - assert_eq!(*handlers_executed.borrow(), vec![0, 1, 2, 3, 4]); + assert_eq!(*handlers_executed.lock().unwrap(), vec![0, 1, 2, 3, 4]); } #[test] @@ -614,7 +659,7 @@ mod tests { Ok(()) })], enable_esi_tags: true, - ..RewriteStrSettings::default() + ..RewriteStrSettings::new() }, ) .unwrap(); @@ -657,7 +702,7 @@ mod tests { &html, Settings { document_content_handlers: vec![enthusiastic_text_handler()], - ..Settings::default() + ..Settings::new() }, ); @@ -669,7 +714,7 @@ mod tests { Settings { document_content_handlers: vec![enthusiastic_text_handler()], adjust_charset_on_meta_tag: true, - ..Settings::default() + ..Settings::new() }, ); @@ -709,7 +754,7 @@ mod tests { &html, Settings { document_content_handlers: vec![enthusiastic_text_handler()], - ..Settings::default() + ..Settings::new() }, ); @@ -721,7 +766,7 @@ mod tests { Settings { document_content_handlers: vec![enthusiastic_text_handler()], adjust_charset_on_meta_tag: true, - ..Settings::default() + ..Settings::new() }, ); @@ -733,6 +778,8 @@ mod tests { mod fatal_errors { use super::*; use crate::errors::MemoryLimitExceededError; + use crate::html_content::Comment; + use crate::rewritable_units::{Element, TextChunk}; fn create_rewriter( max_allowed_memory_usage: usize, @@ -745,7 +792,7 @@ mod tests { max_allowed_memory_usage, preallocated_parsing_buffer_size: 0, }, - ..Settings::default() + ..Settings::new() }, output_sink, ) @@ -786,9 +833,9 @@ mod tests { #[test] fn content_handler_error_propagation() { - fn 
assert_err( - element_handlers: ElementContentHandlers, - document_handlers: DocumentContentHandlers, + fn assert_err<'h>( + element_handlers: ElementContentHandlers<'h>, + document_handlers: DocumentContentHandlers<'h>, expected_err: &'static str, ) { use std::borrow::Cow; @@ -800,7 +847,7 @@ mod tests { element_handlers, )], document_content_handlers: vec![document_handlers], - ..Settings::default() + ..Settings::new() }, |_: &[u8]| {}, ); @@ -854,21 +901,21 @@ mod tests { assert_err( ElementContentHandlers::default() - .element(|_| Err("Error in element handler".into())), + .element(|_: &mut Element<_>| Err("Error in element handler".into())), DocumentContentHandlers::default(), "Error in element handler", ); assert_err( ElementContentHandlers::default() - .comments(|_| Err("Error in element comment handler".into())), + .comments(|_: &mut Comment| Err("Error in element comment handler".into())), DocumentContentHandlers::default(), "Error in element comment handler", ); assert_err( ElementContentHandlers::default() - .text(|_| Err("Error in element text handler".into())), + .text(|_: &mut TextChunk| Err("Error in element text handler".into())), DocumentContentHandlers::default(), "Error in element text handler", ); diff --git a/src/rewriter/rewrite_controller.rs b/src/rewriter/rewrite_controller.rs index 7b96c419..928b8bd4 100644 --- a/src/rewriter/rewrite_controller.rs +++ b/src/rewriter/rewrite_controller.rs @@ -1,5 +1,5 @@ use super::handlers_dispatcher::{ContentHandlersDispatcher, SelectorHandlersLocator}; -use super::RewritingError; +use super::{HandlerTypes, RewritingError}; use crate::html::{LocalName, Namespace}; use crate::rewritable_units::{DocumentEnd, Token, TokenCaptureFlags}; use crate::selectors_vm::{AuxStartTagInfoRequest, ElementData, SelectorMatchingVm, VmError}; @@ -22,15 +22,15 @@ impl ElementData for ElementDescriptor { } } -pub struct HtmlRewriteController<'h> { - handlers_dispatcher: ContentHandlersDispatcher<'h>, +pub struct 
HtmlRewriteController<'h, H: HandlerTypes> { + handlers_dispatcher: ContentHandlersDispatcher<'h, H>, selector_matching_vm: Option>, } -impl<'h> HtmlRewriteController<'h> { +impl<'h, H: HandlerTypes> HtmlRewriteController<'h, H> { #[inline] pub fn new( - handlers_dispatcher: ContentHandlersDispatcher<'h>, + handlers_dispatcher: ContentHandlersDispatcher<'h, H>, selector_matching_vm: Option>, ) -> Self { HtmlRewriteController { @@ -40,7 +40,7 @@ impl<'h> HtmlRewriteController<'h> { } } -impl<'h> HtmlRewriteController<'h> { +impl<'h, H: HandlerTypes> HtmlRewriteController<'h, H> { #[inline] fn respond_to_aux_info_request( aux_info_req: AuxStartTagInfoRequest, @@ -65,7 +65,7 @@ impl<'h> HtmlRewriteController<'h> { } } -impl TransformController for HtmlRewriteController<'_> { +impl<'h, H: HandlerTypes> TransformController for HtmlRewriteController<'h, H> { #[inline] fn initial_capture_flags(&self) -> TokenCaptureFlags { self.get_capture_flags() diff --git a/src/rewriter/settings.rs b/src/rewriter/settings.rs index 9dd0ba9e..61f2afda 100644 --- a/src/rewriter/settings.rs +++ b/src/rewriter/settings.rs @@ -5,8 +5,119 @@ use super::AsciiCompatibleEncoding; use std::borrow::Cow; use std::error::Error; +/// Trait used to parameterize the type of handlers used in the rewriter. +/// +/// This is used to select between [`Send`]able and +/// non-[`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s. +pub trait HandlerTypes: Sized { + /// Handler type for [`Doctype`]. + type DoctypeHandler<'h>: FnMut(&mut Doctype) -> HandlerResult + 'h; + /// Handler type for [`Comment`]. + type CommentHandler<'h>: FnMut(&mut Comment) -> HandlerResult + 'h; + /// Handler type for [`TextChunk`]. + type TextHandler<'h>: FnMut(&mut TextChunk) -> HandlerResult + 'h; + /// Handler type for [`Element`]. + type ElementHandler<'h>: FnMut(&mut Element<'_, '_, Self>) -> HandlerResult + 'h; + /// Handler type for [`EndTag`]. 
+ type EndTagHandler<'h>: FnOnce(&mut EndTag) -> HandlerResult + 'h; + /// Handler type for [`DocumentEnd`]. + type EndHandler<'h>: FnOnce(&mut DocumentEnd) -> HandlerResult + 'h; + + // WIP! is there no better way? + // Inside the HTML rewriter we need to create handlers, and they need to be the most constrained + // possible version of a handler (i.e. if we have `Send` and non-`Send` handlers we need to + // create a `Send` handler since that's compatible with both classes of handlers), so that's + // what we offer below. + // + // Note that in the HTML rewriter all we have is an abstract `H` that implements `HandlerTypes`. + // Therefore, there is no direct way of creating a handler that is compatible with *all* possible + // implementations of `HandlerTypes`, so each implementation of `HandlerTypes` needs to prove + // that a handler compatible with itself is creatable. + + /// WIP! + fn new_end_tag_handler<'h>( + handler: impl IntoHandler>, + ) -> Self::EndTagHandler<'h>; + + /// WIP! + fn new_element_handler<'h>( + handler: impl IntoHandler>, + ) -> Self::ElementHandler<'h>; + + /// WIP! + fn combine_handlers(handlers: Vec>) -> Self::EndTagHandler<'_>; +} + +/// Handler type for non-[`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s.
+pub struct HandlerNormalTypes {} + +impl HandlerTypes for HandlerNormalTypes { + type DoctypeHandler<'h> = DoctypeHandler<'h>; + type CommentHandler<'h> = CommentHandler<'h>; + type TextHandler<'h> = TextHandler<'h>; + type ElementHandler<'h> = ElementHandler<'h>; + type EndTagHandler<'h> = EndTagHandler<'h>; + type EndHandler<'h> = EndHandler<'h>; + + fn new_end_tag_handler<'h>( + handler: impl IntoHandler>, + ) -> Self::EndTagHandler<'h> { + handler.into_handler() + } + + fn new_element_handler<'h>( + handler: impl IntoHandler>, + ) -> Self::ElementHandler<'h> { + handler.into_handler() + } + + fn combine_handlers(handlers: Vec>) -> Self::EndTagHandler<'_> { + Box::new(move |end_tag: &mut EndTag| { + for handler in handlers { + handler(end_tag)?; + } + + Ok(()) + }) + } +} + +pub struct HandlerSendTypes {} + +impl HandlerTypes for HandlerSendTypes { + type DoctypeHandler<'h> = DoctypeHandlerSend<'h>; + type CommentHandler<'h> = CommentHandlerSend<'h>; + type TextHandler<'h> = TextHandlerSend<'h>; + type ElementHandler<'h> = ElementHandlerSend<'h, HandlerSendTypes>; + type EndTagHandler<'h> = EndTagHandlerSend<'h>; + type EndHandler<'h> = EndHandlerSend<'h>; + + fn new_end_tag_handler<'h>( + handler: impl IntoHandler>, + ) -> Self::EndTagHandler<'h> { + handler.into_handler() + } + + fn new_element_handler<'h>( + handler: impl IntoHandler>, + ) -> Self::ElementHandler<'h> { + handler.into_handler() + } + + fn combine_handlers(handlers: Vec>) -> Self::EndTagHandler<'_> { + Box::new(move |end_tag: &mut EndTag| { + for handler in handlers { + handler(end_tag)?; + } + + Ok(()) + }) + } +} + /// The result of a handler. pub type HandlerResult = Result<(), Box>; + /// Handler for the [document type declaration]. /// /// [document type declaration]: https://developer.mozilla.org/en-US/docs/Glossary/Doctype @@ -16,44 +127,160 @@ pub type CommentHandler<'h> = Box HandlerResult + 'h> /// Handler for text chunks present the HTML. 
pub type TextHandler<'h> = Box HandlerResult + 'h>; /// Handler for elements matched by a selector. -pub type ElementHandler<'h> = Box HandlerResult + 'h>; -/// Handler for an end tag. +pub type ElementHandler<'h> = + Box) -> HandlerResult + 'h>; +/// Handler for end tags. pub type EndTagHandler<'h> = Box HandlerResult + 'h>; -/// Handler for the document end, which is called after the last chunk is processed. +/// Handler for the document end. This is called after the last chunk is processed. pub type EndHandler<'h> = Box HandlerResult + 'h>; +/// Handler for the [document type declaration] that is [`Send`]able. +/// +/// [document type declaration]: https://developer.mozilla.org/en-US/docs/Glossary/Doctype +pub type DoctypeHandlerSend<'h> = Box HandlerResult + Send + 'h>; +/// Handler for HTML comments that are [`Send`]able. +pub type CommentHandlerSend<'h> = Box HandlerResult + Send + 'h>; +/// Handler for text chunks present in the HTML that are [`Send`]able. +pub type TextHandlerSend<'h> = Box HandlerResult + Send + 'h>; +/// Handler for elements matched by a selector that are [`Send`]able. +pub type ElementHandlerSend<'h, H = HandlerSendTypes> = + Box) -> HandlerResult + Send + 'h>; +/// Handler for end tags that are [`Send`]able. +pub type EndTagHandlerSend<'h> = Box HandlerResult + Send + 'h>; +/// Handler for the document end that is [`Send`]able. This is called after the last chunk is processed.
+pub type EndHandlerSend<'h> = Box HandlerResult + Send + 'h>; + +pub trait IntoHandler { + fn into_handler(self) -> T; +} + +impl<'h, F: FnMut(&mut Doctype) -> HandlerResult + 'h> IntoHandler> for F { + fn into_handler(self) -> DoctypeHandler<'h> { + Box::new(self) + } +} + +impl<'h, F: FnMut(&mut Comment) -> HandlerResult + 'h> IntoHandler> for F { + fn into_handler(self) -> CommentHandler<'h> { + Box::new(self) + } +} + +impl<'h, F: FnMut(&mut TextChunk) -> HandlerResult + 'h> IntoHandler> for F { + fn into_handler(self) -> TextHandler<'h> { + Box::new(self) + } +} + +impl<'h, F: FnMut(&mut Element<'_, '_, HandlerNormalTypes>) -> HandlerResult + 'h> + IntoHandler> for F +{ + fn into_handler(self) -> ElementHandler<'h> { + Box::new(self) + } +} + +impl<'h, F: FnOnce(&mut EndTag) -> HandlerResult + 'h> IntoHandler> for F { + fn into_handler(self) -> EndTagHandler<'h> { + Box::new(self) + } +} + +impl<'h, F: FnOnce(&mut DocumentEnd) -> HandlerResult + 'h> IntoHandler> for F { + fn into_handler(self) -> EndHandler<'h> { + Box::new(self) + } +} + +impl<'h, F: FnMut(&mut Doctype) -> HandlerResult + Send + 'h> IntoHandler> + for F +{ + fn into_handler(self) -> DoctypeHandlerSend<'h> { + Box::new(self) + } +} + +impl<'h, F: FnMut(&mut Comment) -> HandlerResult + Send + 'h> IntoHandler> + for F +{ + fn into_handler(self) -> CommentHandlerSend<'h> { + Box::new(self) + } +} + +impl<'h, F: FnMut(&mut TextChunk) -> HandlerResult + Send + 'h> IntoHandler> + for F +{ + fn into_handler(self) -> TextHandlerSend<'h> { + Box::new(self) + } +} + +impl<'h, H: HandlerTypes, F: FnMut(&mut Element<'_, '_, H>) -> HandlerResult + Send + 'h> + IntoHandler> for F +{ + fn into_handler(self) -> ElementHandlerSend<'h, H> { + Box::new(self) + } +} + +impl<'h, F: FnOnce(&mut EndTag) -> HandlerResult + Send + 'h> IntoHandler> + for F +{ + fn into_handler(self) -> EndTagHandlerSend<'h> { + Box::new(self) + } +} + +impl<'h, F: FnOnce(&mut DocumentEnd) -> HandlerResult + Send + 'h> IntoHandler> + 
for F +{ + fn into_handler(self) -> EndHandlerSend<'h> { + Box::new(self) + } +} + /// Specifies element content handlers associated with a selector. -#[derive(Default)] -pub struct ElementContentHandlers<'h> { - /// Element handler. See [ElementHandler]. - pub element: Option>, - /// Comment handler. See [CommentHandler]. - pub comments: Option>, - /// Text handler. See [TextHandler]. - pub text: Option>, +pub struct ElementContentHandlers<'h, H: HandlerTypes = HandlerNormalTypes> { + /// Element handler. See [H::ElementHandler]. + pub element: Option>, + /// Comment handler. See [H::CommentHandler]. + pub comments: Option>, + /// Text handler. See [H::TextHandler]. + pub text: Option>, +} + +impl<'h, H: HandlerTypes> Default for ElementContentHandlers<'h, H> { + fn default() -> Self { + ElementContentHandlers { + element: None, + comments: None, + text: None, + } + } } -impl<'h> ElementContentHandlers<'h> { +impl<'h, H: HandlerTypes> ElementContentHandlers<'h, H> { /// Sets a handler for elements matched by a selector. #[inline] - pub fn element(mut self, handler: impl FnMut(&mut Element) -> HandlerResult + 'h) -> Self { - self.element = Some(Box::new(handler)); + pub fn element(mut self, handler: impl IntoHandler>) -> Self { + self.element = Some(handler.into_handler()); self } /// Sets a handler for HTML comments in the inner content of elements matched by a selector. #[inline] - pub fn comments(mut self, handler: impl FnMut(&mut Comment) -> HandlerResult + 'h) -> Self { - self.comments = Some(Box::new(handler)); + pub fn comments(mut self, handler: impl IntoHandler>) -> Self { + self.comments = Some(handler.into_handler()); self } /// Sets a handler for text chunks in the inner content of elements matched by a selector. 
#[inline] - pub fn text(mut self, handler: impl FnMut(&mut TextChunk) -> HandlerResult + 'h) -> Self { - self.text = Some(Box::new(handler)); + pub fn text(mut self, handler: impl IntoHandler>) -> Self { + self.text = Some(handler.into_handler()); self } @@ -74,49 +301,59 @@ impl<'h> ElementContentHandlers<'h> { /// /// /// ``` -#[derive(Default)] -pub struct DocumentContentHandlers<'h> { - /// Doctype handler. See [DoctypeHandler]. - pub doctype: Option>, - /// Comment handler. See [CommentHandler]. - pub comments: Option>, - /// Text handler. See [TextHandler]. - pub text: Option>, - /// End handler. See [EndHandler]. - pub end: Option>, -} - -impl<'h> DocumentContentHandlers<'h> { +pub struct DocumentContentHandlers<'h, H: HandlerTypes = HandlerNormalTypes> { + /// Doctype handler. See [H::DoctypeHandler]. + pub doctype: Option>, + /// Comment handler. See [H::CommentHandler]. + pub comments: Option>, + /// Text handler. See [H::TextHandler]. + pub text: Option>, + /// End handler. See [H::EndHandler]. + pub end: Option>, +} + +impl<'h, H: HandlerTypes> Default for DocumentContentHandlers<'h, H> { + fn default() -> Self { + DocumentContentHandlers { + doctype: None, + comments: None, + text: None, + end: None, + } + } +} + +impl<'h, H: HandlerTypes> DocumentContentHandlers<'h, H> { /// Sets a handler for the [document type declaration]. /// /// [document type declaration]: https://developer.mozilla.org/en-US/docs/Glossary/Doctype #[inline] - pub fn doctype(mut self, handler: impl FnMut(&mut Doctype) -> HandlerResult + 'h) -> Self { - self.doctype = Some(Box::new(handler)); + pub fn doctype(mut self, handler: impl IntoHandler>) -> Self { + self.doctype = Some(handler.into_handler()); self } /// Sets a handler for all HTML comments present in the input HTML markup. 
#[inline] - pub fn comments(mut self, handler: impl FnMut(&mut Comment) -> HandlerResult + 'h) -> Self { - self.comments = Some(Box::new(handler)); + pub fn comments(mut self, handler: impl IntoHandler>) -> Self { + self.comments = Some(handler.into_handler()); self } /// Sets a handler for all text chunks present in the input HTML markup. #[inline] - pub fn text(mut self, handler: impl FnMut(&mut TextChunk) -> HandlerResult + 'h) -> Self { - self.text = Some(Box::new(handler)); + pub fn text(mut self, handler: impl IntoHandler>) -> Self { + self.text = Some(handler.into_handler()); self } /// Sets a handler for the document end, which is called after the last chunk is processed. #[inline] - pub fn end(mut self, handler: impl FnMut(&mut DocumentEnd) -> HandlerResult + 'h) -> Self { - self.end = Some(Box::new(handler)); + pub fn end(mut self, handler: impl IntoHandler>) -> Self { + self.end = Some(handler.into_handler()); self } @@ -151,7 +388,7 @@ macro_rules! __element_content_handler { /// Ok(()) /// }) /// ], -/// ..RewriteStrSettings::default() +/// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// @@ -159,9 +396,19 @@ macro_rules! __element_content_handler { /// ``` #[macro_export(local_inner_macros)] macro_rules! element { - ($selector:expr, $handler:expr) => { - __element_content_handler!($selector, element, $handler) - }; + ($selector:expr, $handler:expr) => {{ + // Without this rust won't be able to always infer the type of the handler. + // WIP! do we still need type hints? + #[inline(always)] + fn type_hint<'h, T, H: $crate::HandlerTypes>(h: T) -> T + where + T: FnMut(&mut $crate::html_content::Element<'_, '_, H>) -> $crate::HandlerResult + 'h, + { + h + } + + __element_content_handler!($selector, element, type_hint($handler)) + }}; } /// A convenience macro to construct a rewriting handler for text chunks in the inner content of an @@ -184,7 +431,7 @@ macro_rules! 
element { /// Ok(()) /// }) /// ], -/// ..RewriteStrSettings::default() +/// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// @@ -192,9 +439,18 @@ macro_rules! element { /// ``` #[macro_export(local_inner_macros)] macro_rules! text { - ($selector:expr, $handler:expr) => { - __element_content_handler!($selector, text, $handler) - }; + ($selector:expr, $handler:expr) => {{ + // Without this rust won't be able to always infer the type of the handler. + #[inline(always)] + fn type_hint(h: T) -> T + where + T: FnMut(&mut $crate::html_content::TextChunk) -> $crate::HandlerResult, + { + h + } + + __element_content_handler!($selector, text, type_hint($handler)) + }}; } /// A convenience macro to construct a rewriting handler for HTML comments in the inner content of @@ -215,7 +471,7 @@ macro_rules! text { /// Ok(()) /// }) /// ], -/// ..RewriteStrSettings::default() +/// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// @@ -223,9 +479,18 @@ macro_rules! text { /// ``` #[macro_export(local_inner_macros)] macro_rules! comments { - ($selector:expr, $handler:expr) => { - __element_content_handler!($selector, comments, $handler) - }; + ($selector:expr, $handler:expr) => {{ + // Without this rust won't be able to always infer the type of the handler. + #[inline(always)] + fn type_hint(h: T) -> T + where + T: FnMut(&mut $crate::html_content::Comment) -> $crate::HandlerResult, + { + h + } + + __element_content_handler!($selector, comments, type_hint($handler)) + }}; } #[doc(hidden)] @@ -253,7 +518,7 @@ macro_rules! __document_content_handler { /// Ok(()) /// }) /// ], -/// ..RewriteStrSettings::default() +/// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// ``` @@ -261,9 +526,18 @@ macro_rules! __document_content_handler { /// [document type declarations]: https://developer.mozilla.org/en-US/docs/Glossary/Doctype #[macro_export(local_inner_macros)] macro_rules! 
doctype { - ($handler:expr) => { - __document_content_handler!(doctype, $handler) - }; + ($handler:expr) => {{ + // Without this rust won't be able to always infer the type of the handler. + #[inline(always)] + fn type_hint(h: T) -> T + where + T: FnMut(&mut $crate::html_content::Doctype) -> $crate::HandlerResult, + { + h + } + + __document_content_handler!(doctype, type_hint($handler)) + }}; } /// A convenience macro to construct a rewriting handler for all text chunks in the HTML document. @@ -285,7 +559,7 @@ macro_rules! doctype { /// Ok(()) /// }) /// ], -/// ..RewriteStrSettings::default() +/// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// @@ -293,9 +567,18 @@ macro_rules! doctype { /// ``` #[macro_export(local_inner_macros)] macro_rules! doc_text { - ($handler:expr) => { - __document_content_handler!(text, $handler) - }; + ($handler:expr) => {{ + // Without this rust won't be able to always infer the type of the handler. + #[inline(always)] + fn type_hint(h: T) -> T + where + T: FnMut(&mut $crate::html_content::TextChunk) -> $crate::HandlerResult, + { + h + } + + __document_content_handler!(text, type_hint($handler)) + }}; } /// A convenience macro to construct a rewriting handler for all HTML comments in the HTML document. @@ -315,7 +598,7 @@ macro_rules! doc_text { /// Ok(()) /// }) /// ], -/// ..RewriteStrSettings::default() +/// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// @@ -323,9 +606,18 @@ macro_rules! doc_text { /// ``` #[macro_export(local_inner_macros)] macro_rules! doc_comments { - ($handler:expr) => { - __document_content_handler!(comments, $handler) - }; + ($handler:expr) => {{ + // Without this rust won't be able to always infer the type of the handler. 
+ #[inline(always)] + fn type_hint(h: T) -> T + where + T: FnMut(&mut $crate::html_content::Comment) -> $crate::HandlerResult, + { + h + } + + __document_content_handler!(comments, type_hint($handler)) + }}; } /// A convenience macro to construct a rewriting handler for the end of the document. @@ -354,7 +646,7 @@ macro_rules! doc_comments { /// Ok(()) /// }) /// ], -/// ..RewriteStrSettings::default() +/// ..RewriteStrSettings::new() /// } /// ).unwrap(); /// @@ -362,9 +654,18 @@ macro_rules! doc_comments { /// ``` #[macro_export(local_inner_macros)] macro_rules! end { - ($handler:expr) => { - __document_content_handler!(end, $handler) - }; + ($handler:expr) => {{ + // Without this rust won't be able to always infer the type of the handler. + #[inline(always)] + fn type_hint(h: T) -> T + where + T: FnOnce(&mut $crate::html_content::DocumentEnd) -> $crate::HandlerResult, + { + h + } + + __document_content_handler!(end, type_hint($handler)) + }}; } /// Specifies the memory settings for [`HtmlRewriter`]. @@ -388,7 +689,7 @@ pub struct MemorySettings { /// /// ### Default /// - /// `1024` bytes when constructed with `MemorySettings::default()`. + /// `1024` bytes when constructed with `MemorySettings::new()`. /// /// [`HtmlRewriter`]: struct.HtmlRewriter.html pub preallocated_parsing_buffer_size: usize, @@ -404,7 +705,7 @@ pub struct MemorySettings { /// /// ### Default /// - /// [`std::usize::MAX`] when constructed with `MemorySettings::default()`. + /// [`std::usize::MAX`] when constructed with `MemorySettings::new()`. /// /// [`HtmlRewriter`]: struct.HtmlRewriter.html /// [`std::usize::MAX`]: https://doc.rust-lang.org/std/usize/constant.MAX.html @@ -423,10 +724,17 @@ impl Default for MemorySettings { } } +impl MemorySettings { + /// Create a new [`MemorySettings`] with default values. + pub fn new() -> MemorySettings { + MemorySettings::default() + } +} + /// Specifies settings for [`HtmlRewriter`]. 
/// /// [`HtmlRewriter`]: struct.HtmlRewriter.html -pub struct Settings<'h, 's> { +pub struct Settings<'h, 's, H: HandlerTypes = HandlerNormalTypes> { /// Specifies CSS selectors and rewriting handlers for elements and their inner content. /// /// ### Hint @@ -438,12 +746,13 @@ pub struct Settings<'h, 's> { /// ``` /// use std::borrow::Cow; /// use lol_html::{ElementContentHandlers, Settings}; + /// use lol_html::html_content::{Comment, Element}; /// /// let settings = Settings { /// element_content_handlers: vec! [ /// ( /// Cow::Owned("div[foo]".parse().unwrap()), - /// ElementContentHandlers::default().element(|el| { + /// ElementContentHandlers::default().element(|el: &mut Element| { /// // ... /// /// Ok(()) @@ -451,21 +760,21 @@ pub struct Settings<'h, 's> { /// ), /// ( /// Cow::Owned("body".parse().unwrap()), - /// ElementContentHandlers::default().comments(|c| { + /// ElementContentHandlers::default().comments(|c: &mut Comment| { /// // ... /// /// Ok(()) /// }) /// ) /// ], - /// ..Settings::default() + /// ..Settings::new() /// }; /// ``` /// /// [`element`]: macro.element.html /// [`comments`]: macro.comments.html /// [`text`]: macro.text.html - pub element_content_handlers: Vec<(Cow<'s, Selector>, ElementContentHandlers<'h>)>, + pub element_content_handlers: Vec<(Cow<'s, Selector>, ElementContentHandlers<'h, H>)>, /// Specifies rewriting handlers for the content without associating it to a particular /// CSS selector. @@ -480,7 +789,7 @@ pub struct Settings<'h, 's> { /// [`doctype`]: macro.doctype.html /// [`doc_comments`]: macro.doc_comments.html /// [`doc_text`]: macro.doc_text.html - pub document_content_handlers: Vec>, + pub document_content_handlers: Vec>, /// Specifies the [character encoding] for the input and the output of the rewriter. /// @@ -493,7 +802,7 @@ pub struct Settings<'h, 's> { /// /// ### Default /// - /// `"utf-8"` when constructed with `Settings::default()`. + /// `"utf-8"` when constructed with `Settings::new()`. 
pub encoding: AsciiCompatibleEncoding, /// Specifies the memory settings. @@ -530,7 +839,7 @@ pub struct Settings<'h, 's> { /// /// ### Default /// - /// `true` when constructed with `Settings::default()`. + /// `true` when constructed with `Settings::new()`. pub strict: bool, /// If enabled the rewriter enables support for [Edge Side Includes] tags, treating them as @@ -563,13 +872,37 @@ pub struct Settings<'h, 's> { /// /// ### Default /// - /// `false` when constructed with `Settings::default()`. + /// `false` when constructed with `Settings::new()`. pub adjust_charset_on_meta_tag: bool, } -impl Default for Settings<'_, '_> { +impl Default for Settings<'_, '_, HandlerNormalTypes> { #[inline] fn default() -> Self { + Self::new() + } +} + +impl Settings<'_, '_, HandlerNormalTypes> { + /// Creates [`Settings`] for non-[`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s. + #[inline] + pub fn new() -> Self { + Self::new_for_handler_types() + } +} + +impl Settings<'_, '_, HandlerSendTypes> { + /// Creates [`Settings`] for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s. + #[inline] + pub fn new_send() -> Self { + Self::new_for_handler_types() + } +} + +impl Settings<'_, '_, H> { + /// Creates [`Settings`]. 
+ #[inline] + pub fn new_for_handler_types() -> Self { Settings { element_content_handlers: vec![], document_content_handlers: vec![], @@ -582,15 +915,15 @@ impl Default for Settings<'_, '_> { } } -impl<'h, 's> From> for Settings<'h, 's> { +impl<'h, 's, H: HandlerTypes> From> for Settings<'h, 's, H> { #[inline] - fn from(settings: RewriteStrSettings<'h, 's>) -> Self { + fn from(settings: RewriteStrSettings<'h, 's, H>) -> Self { Settings { element_content_handlers: settings.element_content_handlers, document_content_handlers: settings.document_content_handlers, strict: settings.strict, enable_esi_tags: settings.enable_esi_tags, - ..Settings::default() + ..Settings::new_for_handler_types() } } } @@ -598,7 +931,7 @@ impl<'h, 's> From> for Settings<'h, 's> { /// Specifies settings for the [`rewrite_str`] function. /// /// [`rewrite_str`]: fn.rewrite_str.html -pub struct RewriteStrSettings<'h, 's> { +pub struct RewriteStrSettings<'h, 's, H: HandlerTypes = HandlerNormalTypes> { /// Specifies CSS selectors and rewriting handlers for elements and their inner content. /// /// ### Hint @@ -610,12 +943,13 @@ pub struct RewriteStrSettings<'h, 's> { /// ``` /// use std::borrow::Cow; /// use lol_html::{ElementContentHandlers, RewriteStrSettings}; + /// use lol_html::html_content::{Comment, Element}; /// /// let settings = RewriteStrSettings { /// element_content_handlers: vec! [ /// ( /// Cow::Owned("div[foo]".parse().unwrap()), - /// ElementContentHandlers::default().element(|el| { + /// ElementContentHandlers::default().element(|el: &mut Element| { /// // ... /// /// Ok(()) @@ -623,21 +957,21 @@ pub struct RewriteStrSettings<'h, 's> { /// ), /// ( /// Cow::Owned("div[foo]".parse().unwrap()), - /// ElementContentHandlers::default().comments(|c| { + /// ElementContentHandlers::default().comments(|c: &mut Comment| { /// // ... 
/// /// Ok(()) /// }) /// ) /// ], - /// ..RewriteStrSettings::default() + /// ..RewriteStrSettings::new() /// }; /// ``` /// /// [`element`]: macro.element.html /// [`comments`]: macro.comments.html /// [`text`]: macro.text.html - pub element_content_handlers: Vec<(Cow<'s, Selector>, ElementContentHandlers<'h>)>, + pub element_content_handlers: Vec<(Cow<'s, Selector>, ElementContentHandlers<'h, H>)>, /// Specifies rewriting handlers for the content without associating it to a particular /// CSS selector. @@ -652,7 +986,7 @@ pub struct RewriteStrSettings<'h, 's> { /// [`doctype`]: macro.doctype.html /// [`doc_comments`]: macro.doc_comments.html /// [`doc_text`]: macro.doc_text.html - pub document_content_handlers: Vec>, + pub document_content_handlers: Vec>, /// If set to `true` the rewriter bails out if it encounters markup that drives the HTML parser /// into ambigious state. @@ -685,7 +1019,7 @@ pub struct RewriteStrSettings<'h, 's> { /// /// ### Default /// - /// `true` when constructed with `Settings::default()`. + /// `true` when constructed with `Settings::new()`. pub strict: bool, /// If enabled the rewriter enables support for [Edge Side Includes] tags, treating them as @@ -696,9 +1030,33 @@ pub struct RewriteStrSettings<'h, 's> { pub enable_esi_tags: bool, } -impl Default for RewriteStrSettings<'_, '_> { +impl Default for RewriteStrSettings<'_, '_, HandlerNormalTypes> { #[inline] fn default() -> Self { + Self::new() + } +} + +impl RewriteStrSettings<'_, '_, HandlerNormalTypes> { + /// Creates [`Settings`] for non-[`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s. + #[inline] + pub fn new() -> Self { + Self::new_for_handler_types() + } +} + +impl RewriteStrSettings<'_, '_, HandlerSendTypes> { + /// Creates [`Settings`] for [`Send`]able [`HtmlRewriter`](crate::HtmlRewriter)s. + #[inline] + pub fn new_send() -> Self { + Self::new_for_handler_types() + } +} + +impl RewriteStrSettings<'_, '_, H> { + /// Creates [`RewriteStrSettings`]. 
+ #[inline] + pub fn new_for_handler_types() -> Self { RewriteStrSettings { element_content_handlers: vec![], document_content_handlers: vec![], diff --git a/src/selectors_vm/compiler.rs b/src/selectors_vm/compiler.rs index c1558a2b..de8190dc 100644 --- a/src/selectors_vm/compiler.rs +++ b/src/selectors_vm/compiler.rs @@ -13,9 +13,9 @@ use std::hash::Hash; use std::iter; /// An expression using only the tag name of an element. -pub type CompiledLocalNameExpr = Box bool>; +pub type CompiledLocalNameExpr = Box bool + Send>; /// An expression using the attributes of an element. -pub type CompiledAttributeExpr = Box bool>; +pub type CompiledAttributeExpr = Box bool + Send>; #[derive(Default)] struct ExprSet { @@ -31,7 +31,7 @@ pub struct AttrExprOperands { impl Expr { #[inline] - pub fn compile_expr bool + 'static>( + pub fn compile_expr bool + Send + 'static>( &self, f: F, ) -> CompiledLocalNameExpr { @@ -92,7 +92,7 @@ impl Compilable for Expr { impl Expr { #[inline] - pub fn compile_expr bool + 'static>( + pub fn compile_expr bool + Send + 'static>( &self, f: F, ) -> CompiledAttributeExpr { diff --git a/src/selectors_vm/mod.rs b/src/selectors_vm/mod.rs index 953423b2..74cfda4d 100644 --- a/src/selectors_vm/mod.rs +++ b/src/selectors_vm/mod.rs @@ -28,10 +28,11 @@ pub struct MatchInfo

{ pub type AuxStartTagInfoRequest = Box< dyn FnOnce( - &mut SelectorMatchingVm, - AuxStartTagInfo, - &mut dyn FnMut(MatchInfo

), - ) -> Result<(), MemoryLimitExceededError>, + &mut SelectorMatchingVm, + AuxStartTagInfo, + &mut dyn FnMut(MatchInfo

), + ) -> Result<(), MemoryLimitExceededError> + + Send, >; pub enum VmError { @@ -143,7 +144,10 @@ pub struct SelectorMatchingVm { enable_esi_tags: bool, } -impl SelectorMatchingVm { +impl SelectorMatchingVm +where + E: Send, +{ #[inline] pub fn new( ast: Ast, @@ -238,7 +242,7 @@ impl SelectorMatchingVm { Ok(()) } - fn bailout( + fn bailout( ctx: ExecutionCtx, bailout: Bailout, recovery_point_handler: RecoveryPointHandler, diff --git a/src/transform_stream/dispatcher.rs b/src/transform_stream/dispatcher.rs index f0d62d03..1ff99e83 100644 --- a/src/transform_stream/dispatcher.rs +++ b/src/transform_stream/dispatcher.rs @@ -17,8 +17,9 @@ pub struct AuxStartTagInfo<'i> { pub self_closing: bool, } -type AuxStartTagInfoRequest = - Box) -> Result>; +type AuxStartTagInfoRequest = Box< + dyn FnOnce(&mut C, AuxStartTagInfo<'_>) -> Result + Send, +>; pub enum DispatcherError { InfoRequest(AuxStartTagInfoRequest), diff --git a/src/transform_stream/mod.rs b/src/transform_stream/mod.rs index 7298be8c..c59750c3 100644 --- a/src/transform_stream/mod.rs +++ b/src/transform_stream/mod.rs @@ -1,15 +1,14 @@ mod dispatcher; use self::dispatcher::Dispatcher; +pub use self::dispatcher::{ + AuxStartTagInfo, DispatcherError, OutputSink, StartTagHandlingResult, TransformController, +}; use crate::base::SharedEncoding; use crate::memory::{Arena, SharedMemoryLimiter}; use crate::parser::{Parser, ParserDirective}; use crate::rewriter::RewritingError; -pub use self::dispatcher::{ - AuxStartTagInfo, DispatcherError, OutputSink, StartTagHandlingResult, TransformController, -}; - pub struct TransformStreamSettings where C: TransformController, diff --git a/tests/fixtures/element_content_replacement.rs b/tests/fixtures/element_content_replacement.rs index 0d4b3846..b7ba5d9d 100644 --- a/tests/fixtures/element_content_replacement.rs +++ b/tests/fixtures/element_content_replacement.rs @@ -31,7 +31,7 @@ impl TestFixture for ElementContentReplacementTests { }) ], encoding, - ..Settings::default() + 
..Settings::new() }, |c: &[u8]| output.push(c) ); diff --git a/tests/fixtures/selector_matching.rs b/tests/fixtures/selector_matching.rs index 1411a123..8400faf4 100644 --- a/tests/fixtures/selector_matching.rs +++ b/tests/fixtures/selector_matching.rs @@ -68,7 +68,7 @@ impl TestFixture for SelectorMatchingTests { }) ], encoding, - ..Settings::default() + ..Settings::new() }, |c: &[u8]| output.push(c) ); diff --git a/tests/harness/suites/html5lib_tests/decoder.rs b/tests/harness/suites/html5lib_tests/decoder.rs index 848203dd..52df4c97 100644 --- a/tests/harness/suites/html5lib_tests/decoder.rs +++ b/tests/harness/suites/html5lib_tests/decoder.rs @@ -91,10 +91,8 @@ impl<'a> Decoder<'a> { self.chars.next(); if m.0 != 0 { if c != ';' && self.entities == Entities::Attribute { - if let Some(&c) = self.chars.peek() { - if matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '=') { - continue; - } + if let Some('A'..='Z' | 'a'..='z' | '0'..='9' | '=') = self.chars.peek() { + continue; } } name_match = (m.0, m.1, name_buf.len());