diff --git a/Cargo.lock b/Cargo.lock index 35e0667..b70716d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -391,6 +391,15 @@ version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +[[package]] +name = "html5gum" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c4e556171a058ba117bbe88b059fb37b6289023e007d2903ea6dca3a3cbff14" +dependencies = [ + "jetscii", +] + [[package]] name = "humantime" version = "2.1.0" @@ -464,6 +473,12 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +[[package]] +name = "jetscii" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47f142fe24a9c9944451e8349de0a56af5f3e7226dc46f3ed4d4ecc0b85af75e" + [[package]] name = "js-sys" version = "0.3.69" @@ -547,6 +562,7 @@ dependencies = [ "anyhow", "env_logger", "genawaiter", + "html5gum", "insta", "log", "mdbook", diff --git a/Cargo.toml b/Cargo.toml index 70164e5..3cd1e1f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,7 @@ aho-corasick = "1.0.0" anyhow = "1.0.47" env_logger = "0.11.0" genawaiter = { version = "0.99.1", default-features = false } +html5gum = "0.5.7" log = "0.4.0" mdbook = { version = "0.4.35", default-features = false } normpath = "1.0.0" diff --git a/src/lib.rs b/src/lib.rs index 832799d..351d175 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1216,6 +1216,63 @@ fn main() {} "###); } + #[test] + fn matched_html_tags() { + let output = MDBook::init() + .config(Config::pandoc()) + .chapter(Chapter::new( + "Chapter", + " +
+ + +## Heading + +text + + +

+ +more **markdown** + +

+
+ +outside divs + ", + "chapter.md", + )) + .build(); + insta::assert_snapshot!(output, @r###" + ├─ log output + │ INFO mdbook::book: Running the pandoc backend + │ INFO mdbook_pandoc::pandoc::renderer: Wrote output to book/markdown/pandoc-ir + ├─ markdown/pandoc-ir + │ [ RawBlock (Format "html") "
\n\n" + │ , Div + │ ( "" , [ "details" ] , [] ) + │ [ Div + │ ( "" , [ "summary" ] , [] ) + │ [ Header + │ 2 + │ ( "book__markdown__src__chaptermd__heading" + │ , [ "unnumbered" , "unlisted" ] + │ , [] + │ ) + │ [ Str "Heading" ] + │ , Para [ Str "text" ] + │ ] + │ , RawBlock (Format "html") "\n

\n" + │ , Div + │ ( "" , [ "p" ] , [] ) + │ [ Para [ Str "more" , Space , Strong [ Str "markdown" ] ] ] + │ ] + │ , RawBlock (Format "html") "

\n
\n" + │ , Para [ Str "outside" , Space , Str "divs" ] + │ ] + "###); + } + #[test] /// Respect enabled/disabled extensions in Pandoc's `from` option fn extension_overrides() { diff --git a/src/pandoc/extension.rs b/src/pandoc/extension.rs index bf2a941..b7d6024 100644 --- a/src/pandoc/extension.rs +++ b/src/pandoc/extension.rs @@ -9,6 +9,7 @@ pub enum Extension { Attributes, GfmAutoIdentifiers, RawAttribute, + FencedDivs, // TODO: pandoc's `rebase_relative_paths` extension works for Markdown links and images, // but not for raw HTML links and images. Switch if/when pandoc supports HTML as well. /// Treat paths as relative to the chapter containing them @@ -27,6 +28,7 @@ impl Extension { Extension::Attributes => "attributes", Extension::GfmAutoIdentifiers => "gfm_auto_identifiers", Extension::RawAttribute => "raw_attribute", + Extension::FencedDivs => "fenced_divs", Extension::RebaseRelativePaths => "rebase_relative_paths", } } @@ -41,6 +43,7 @@ impl Extension { Extension::Attributes => (2, 10, 1), Extension::GfmAutoIdentifiers => (2, 0, 0), Extension::RawAttribute => (2, 10, 1), + Extension::FencedDivs => (2, 0, 0), Extension::RebaseRelativePaths => (2, 14, 0), }; Version { diff --git a/src/preprocess.rs b/src/preprocess.rs index 2c8d04d..ff9902c 100644 --- a/src/preprocess.rs +++ b/src/preprocess.rs @@ -672,6 +672,7 @@ struct PreprocessChapter<'book, 'preprocessor> { parser: Peekable>, matching_tags: Vec, encountered_h1: bool, + open_html_tags: Vec, } impl<'book, 'preprocessor> PreprocessChapter<'book, 'preprocessor> { @@ -697,6 +698,7 @@ impl<'book, 'preprocessor> PreprocessChapter<'book, 'preprocessor> { .peekable(), matching_tags: Default::default(), encountered_h1: false, + open_html_tags: Vec::new(), } } @@ -1036,8 +1038,10 @@ impl<'book, 'preprocessor> PreprocessChapter<'book, 'preprocessor> { // Actually consume the item from the iterator self.parser.next(); } - html = self.preprocess_contiguous_html(html); - Event::Html(html) + for event in self.preprocess_contiguous_html(html, Event::Html) { + co.yield_((event, None)).await + } + continue 'events; } Event::InlineHtml(mut html) => { while let Some((Event::InlineHtml(more), _)) = self.parser.peek() { @@ -1047,8 +1051,10 @@ impl<'book, 'preprocessor> PreprocessChapter<'book, 'preprocessor> { // Actually consume the item from the iterator self.parser.next(); } - html = self.preprocess_contiguous_html(html); - Event::InlineHtml(html) + for event in self.preprocess_contiguous_html(html, Event::InlineHtml) { + co.yield_((event, None)).await + } + continue 'events; } Event::TaskListMarker(checked) => { (self.preprocessor.ctx.pandoc).enable_extension(pandoc::Extension::TaskLists); @@ -1060,7 +1066,11 @@ impl<'book, 'preprocessor> PreprocessChapter<'book, 'preprocessor> { } } - fn preprocess_contiguous_html(&mut self, mut html: CowStr<'book>) -> CowStr<'book> { + fn preprocess_contiguous_html( + &mut self, + mut html: CowStr<'book>, + wrap_html: impl FnOnce(CowStr<'book>) -> pulldown_cmark::Event, + ) -> impl Iterator> + '_ { if let OutputFormat::Latex { packages } = &mut self.preprocessor.ctx.output { static FONT_AWESOME_ICON: Lazy = Lazy::new(|| { Regex::new(r#".*?)"(>\s*|/>)"#).unwrap() @@ -1078,7 +1088,60 @@ impl<'book, 'preprocessor> PreprocessChapter<'book, 'preprocessor> { }; } } - html + let already_open_tags = self.open_html_tags.len(); + let mut still_open_tags = self.open_html_tags.len(); + for node in html5gum::Tokenizer::new(html.as_ref()).infallible() { + match node { + html5gum::Token::StartTag(start) => { + self.open_html_tags.push(start.name); + } + html5gum::Token::EndTag(end) => match self.open_html_tags.last() { + Some(tag) if *tag == end.name => { + self.open_html_tags.pop(); + still_open_tags = still_open_tags.min(self.open_html_tags.len()); + } + _ => {} + }, + _ => {} + } + } + use pulldown_cmark::Event; + let mut fenced_divs_available = || { + self.preprocessor + .ctx + .pandoc + .enable_extension(pandoc::Extension::FencedDivs) + .is_available() + }; + let close_divs = { + let closed_tags = already_open_tags - still_open_tags; + (closed_tags > 0 && fenced_divs_available()) + .then(|| { + iter::once(Event::Text("\n\n".into())) + .chain((0..closed_tags).map(|_| Event::Text(":::\n\n".into()))) + .chain(iter::once(Event::Text("\n\n".into()))) + }) + .into_iter() + .flatten() + }; + let open_divs = { + let opened_tags = &self.open_html_tags[still_open_tags..]; + (!opened_tags.is_empty() && fenced_divs_available()) + .then(|| { + iter::once(Event::Text("\n\n".into())) + .chain(opened_tags.iter().map(|tag| { + Event::Text( + format!("::: {}\n\n", String::from_utf8_lossy(&tag.0)).into(), + ) + })) + .chain(iter::once(Event::Text("\n\n".into()))) + }) + .into_iter() + .flatten() + }; + close_divs + .chain(iter::once(wrap_html(html))) + .chain(open_divs) } }